def main():
    """Command-line entry point: evaluate a saved model on a TriviaQA corpus.

    Parses the command line, loads the requested question set, selects
    paragraphs for each question (or question-document pair for the web
    corpora), runs the model through ``trainer.test`` and prints a table of
    EM/F1 scores as more paragraphs are used.  Optionally writes the highest
    scoring answer per ``question--document`` key to ``--official_output`` and
    the per-paragraph results to ``--paragraph_output`` (json, pkl or csv,
    chosen by file suffix).

    Raises:
        NotImplementedError: ``--official_output`` was requested for a corpus
            other than ``web-dev`` or ``web-test``.
        ValueError: ``--paragraph_output`` has an unrecognized suffix.
        RuntimeError: the evaluation returned a per-sample column whose length
            differs from the number of evaluated paragraphs.
    """

    def checkpoint_step(value):
        """argparse type: the literal 'latest' or an integer checkpoint step."""
        return value if value == "latest" else int(value)

    parser = argparse.ArgumentParser(description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument('-p', '--paragraph_output', type=str,
                        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o', '--official_output', type=str,
                        help="Build an offical output file with the model's"
                             " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument('--n_processes', type=int, default=None,
                        help="Number of processes to do the preprocessing (selecting paragraphs+loading context) with")
    # The type accepts 'latest' as well as an int so that the explicit
    # "latest" branch below is actually reachable.
    parser.add_argument('-i', '--step', type=checkpoint_step, default=None,
                        help="checkpoint to load, default to latest")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="Number of questions to evaluate on")
    # `async` is a reserved keyword since Python 3.7, so `args.async` cannot be
    # written; the flag keeps its name but is stored under a legal attribute.
    parser.add_argument('-a', '--async', dest='async_encoding', type=int, default=10)
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes might be faster but wll take more memory")
    parser.add_argument('--max_answer_len', type=int, default=8,
                        help="Max answer span to select")
    parser.add_argument('-c', '--corpus',
                        choices=["web-dev", "web-test", "web-verified-dev", "web-train",
                                 "open-dev", "open-train"],
                        default="web-verified-dev")
    args = parser.parse_args()

    # Fail before the (expensive) evaluation rather than after it: these are the
    # same errors the output stages would raise once the model has already run.
    if args.official_output is not None and args.corpus not in ("web-test", "web-dev"):
        raise NotImplementedError(
            "Official output is only supported for the web-dev and web-test corpora")
    if (args.paragraph_output is not None
            and not args.paragraph_output.endswith(("json", "pkl", "csv"))):
        raise ValueError("Unrecognized file format")

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        corpus = dataset.evidence
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()
    else:
        dataset = TriviaQaOpenDataset()
        corpus = dataset.evidence
        if args.corpus == "open-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()

    splitter = MergeParagraphs(args.tokens)

    # Web corpora are scored per (question, document) pair, open ones per question
    per_document = not args.corpus.startswith("open")

    filter_name = args.filter
    if filter_name is None:
        filter_name = "linear" if args.corpus.startswith("open") else "tfidf"

    print("Selecting %d paragraphs using %s method per %s" %
          (args.n_paragraphs, filter_name,
           ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    if n_questions is not None:
        # Sort first so the seeded shuffle yields the same sample on every run
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs,
    # and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, model.preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(DocumentParagraphQuestion(q.question_id, p.doc_id,
                                                  (p.start, p.end), q.question, p.text,
                                                  ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(
        model,
        [RecordParagraphSpanPrediction(args.max_answer_len, True)],
        {args.corpus: test_questions}, ResourceLoader(), checkpoint,
        not args.no_ema, args.async_encoding)[args.corpus]

    # Every per-sample column must have exactly one entry per evaluated paragraph
    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        # I didn't store the unormalized filenames exactly, so unfortunately we have to
        # reload the source data to get exact filename to output an official test script
        fns = {}
        print("Loading proper filenames")
        if args.corpus == 'web-test':
            source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
        elif args.corpus == "web-dev":
            source = join(TRIVIA_QA, "qa", "web-dev.json")
        else:
            raise NotImplementedError()

        with open(source) as f:
            source_data = json.load(f)["Data"]
        for point in source_data:
            for doc in point["EntityPages"]:
                filename = doc["Filename"]
                fn = join("wikipedia", filename[:filename.rfind(".")])
                fn = normalize_wiki_filename(fn)
                fns[(point["QuestionId"], fn)] = filename

        # Keep only the highest scoring answer for each question--document key
        answers = {}
        scores = {}
        columns = ["question_id", "doc_id", "para_start", "para_end",
                   "text_answer", "predicted_score"]
        for q_id, doc_id, _start, _end, txt, score in df[columns].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if filename.startswith("web"):
                # Drop the leading "web/" and restore the ".txt" suffix
                true_name = filename[4:] + ".txt"
            else:
                true_name = fns[(q_id, filename)]

            key = q_id + "--" + true_name
            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    f1 = compute_model_scores(df, "predicted_score", "text_f1", group_by)
    em = compute_model_scores(df, "predicted_score", "text_em", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += [[str(i + 1), "%.4f" % e, "%.4f" % f]
              for i, (e, f) in enumerate(zip(em, f1))]
    print_table(table)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        if output_file.endswith("json"):
            with open(output_file, "w") as f:
                json.dump(evaluation.per_sample, f)
        elif output_file.endswith("pkl"):
            with open(output_file, "wb") as f:
                pickle.dump(evaluation.per_sample, f)
        elif output_file.endswith("csv"):
            df.to_csv(output_file, index=False)
        else:
            raise ValueError("Unrecognized file format")
def main():
    """Command-line entry point: evaluate a saved model on SQuAD-style data.

    Loads the question set selected by ``--corpus`` (``dev``, ``train``,
    ``ja_test`` or a prediction file via ``pred``), runs the model through
    ``trainer.test``, prints the scalar metrics as a two column table and,
    when ``--official_output`` is given, writes a ``question_id -> answer
    text`` JSON file.
    """
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str,
                        help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    # Add ja_test choice to test Multilingual QA dataset.
    parser.add_argument('-c', '--corpus',
                        choices=["dev", "train", "ja_test", "pred"], default="dev")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    # Input file used when --corpus is "pred".
    parser.add_argument('-p', '--pred_filepath', default=None,
                        help="The csv file path if you try pred mode")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    elif args.corpus == "ja_test":
        # Multilingual QA test set
        questions = corpus.get_ja_test()
    elif args.corpus == "pred":
        # Prediction mode for the MLQA pipeline
        questions = create_pred_dataset(args.pred_filepath)
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Sort, then shuffle with a fixed seed, so the sample is reproducible.
        # The sorted list must be bound to a name first: shuffle() works in
        # place, so shuffling an anonymous `sorted(...)` result has no effect
        # and the "sample" degenerates to the first N questions.
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    # Largest contexts first
    questions.sort(key=lambda x: x.n_context_words, reverse=True)
    dataset = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")]
    if args.official_output is not None:
        evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()

    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint,
                              not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = sorted(scalars.keys())
    table = [cols, [str(scalars[x]) for x in cols]]
    header = ["Metric", ""]
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        quid_to_para = {x.question_id: x.paragraph for x in questions}

        q_ids = evaluation.per_sample["question_id"]
        spans = evaluation.per_sample["predicted_span"]
        q_id_to_answers = {
            q_id: quid_to_para[q_id].get_original_text(start, end)
            for q_id, (start, end) in zip(q_ids, spans)
        }

        with open(args.official_output, "w") as f:
            json.dump(q_id_to_answers, f)
def main():
    """Command-line entry point: evaluate a saved model on a TriviaQA corpus.

    Parses the command line, loads the requested question set (web, wiki or
    open), selects paragraphs, runs the model through ``trainer.test`` and
    prints a table of EM/F1 scores as more paragraphs are used.  Start and
    result events are reported through ``ElasticLogger``.  Optionally writes
    the highest scoring answer per key to ``--official_output`` and the
    per-paragraph results to ``--paragraph_output`` (csv); a flattened
    per-span table is always written to ``results.csv``.

    Raises:
        AssertionError: ``--official_output`` was requested for a web corpus
            other than ``web-dev`` or ``web-test``.
        RuntimeError: the evaluation returned a per-sample column whose length
            differs from the number of evaluated paragraphs.
    """

    def checkpoint_step(value):
        """argparse type: the literal 'latest' or an integer checkpoint step."""
        return value if value == "latest" else int(value)

    parser = argparse.ArgumentParser(
        description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument(
        '-p', '--paragraph_output', type=str,
        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o', '--official_output', type=str,
                        help="Build an offical output file with the model's"
                             " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument(
        '--n_processes', type=int, default=None,
        help="Number of processes to do the preprocessing (selecting paragraphs+loading context) with")
    # The type accepts 'latest' as well as an int so that the explicit
    # "latest" branch below is actually reachable.
    parser.add_argument('-i', '--step', type=checkpoint_step, default=None,
                        help="checkpoint to load, default to latest")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="Number of questions to evaluate on")
    # `async` is a reserved keyword since Python 3.7, so `args.async` cannot be
    # written; the flag keeps its name but is stored under a legal attribute.
    parser.add_argument('-a', '--async', dest='async_encoding', type=int, default=10)
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument(
        '-b', '--batch_size', type=int, default=200,
        help="Batch size, larger sizes might be faster but wll take more memory")
    parser.add_argument('--max_answer_len', type=int, default=8,
                        help="Max answer span to select")
    parser.add_argument('-c', '--corpus',
                        choices=["web-dev", "web-test", "web-verified-dev", "web-train",
                                 "open-dev", "open-train", "wiki-dev", "wiki-test"],
                        default="web-verified-dev")
    parser.add_argument("-s", "--source_dir", type=str, default=None,
                        help="where to take input files")
    parser.add_argument("--n_span_per_q", type=int, default=1,
                        help="Value assigned to docqa.config.SPANS_PER_QUESTION")
    args = parser.parse_args()

    # Web corpora are handled per (question, document) pair.
    # NOTE(review): an earlier comment claimed wiki is multi-document as well,
    # but only "web" corpora are treated per-document here; confirm intent.
    per_document = args.corpus.startswith("web")

    # Fail before the (expensive) evaluation rather than after it: this is the
    # same error the official-output stage would raise once the model has run.
    if (args.official_output is not None and per_document
            and args.corpus not in ("web-test", "web-dev")):
        raise AssertionError(
            "Official output for web corpora is only supported for web-dev and web-test")

    # --source_dir is optional; fall back to the corpus name for logging
    # instead of crashing on None.split().
    if args.source_dir:
        dataset_name = args.source_dir.split('/')[-1]
    else:
        dataset_name = args.corpus
    model_name = args.model.split('/')[-1]
    ElasticLogger().write_log('INFO', 'Start Evaluation',
                              context_dict={'model': model_name,
                                            'dataset': dataset_name})

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()
    elif args.corpus.startswith("wiki"):
        dataset = TriviaQaWikiDataset()
        if args.corpus == "wiki-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "wiki-test":
            test_questions = dataset.get_test()
        else:
            raise AssertionError()
    else:
        dataset = TriviaQaOpenDataset(args.source_dir)
        if args.corpus == "open-dev":
            # just loading the pkl that was saved in build_span_corpus
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()

    corpus = dataset.evidence
    splitter = MergeParagraphs(args.tokens)

    filter_name = args.filter
    if filter_name is None:
        # Pick default depending on the kind of data we are using
        filter_name = "tfidf" if per_document else "linear"

    print("Selecting %d paragraphs using method \"%s\" per %s" %
          (args.n_paragraphs, filter_name,
           ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    docqa.config.SPANS_PER_QUESTION = args.n_span_per_q
    if n_questions is not None:
        # Sort first so the seeded shuffle yields the same sample on every run
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs,
    # and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter,
                                                 model.preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(
                DocumentParagraphQuestion(q.question_id, p.doc_id,
                                          (p.start, p.end), q.question, p.text,
                                          ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.max_answer_len, True)],
        {args.corpus: test_questions}, ResourceLoader(), checkpoint,
        not args.no_ema, args.async_encoding)[args.corpus]

    # Every per-sample column must have exactly one entry per evaluated paragraph
    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        fns = {}
        if per_document:
            # I didn't store the unormalized filenames exactly, so unfortunately we
            # have to reload the source data to get exact filename to output an
            # official test script
            print("Loading proper filenames")
            if args.corpus == 'web-test':
                source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
            elif args.corpus == "web-dev":
                source = join(TRIVIA_QA, "qa", "web-dev.json")
            else:
                raise AssertionError()

            with open(source) as f:
                source_data = json.load(f)["Data"]
            for point in source_data:
                for doc in point["EntityPages"]:
                    filename = doc["Filename"]
                    fn = join("wikipedia", filename[:filename.rfind(".")])
                    fn = normalize_wiki_filename(fn)
                    fns[(point["QuestionId"], fn)] = filename

        # Keep only the highest scoring answer for each key
        answers = {}
        scores = {}
        columns = ["question_id", "doc_id", "para_start", "para_end",
                   "text_answer", "predicted_score"]
        for q_id, doc_id, _start, _end, txt, score in df[columns].itertuples(index=False):
            if per_document:
                filename = dataset.evidence.file_id_map[doc_id]
                if filename.startswith("web"):
                    # Drop the leading "web/" and restore the ".txt" suffix
                    true_name = filename[4:] + ".txt"
                else:
                    true_name = fns[(q_id, filename)]
                # Alon Patch for triviaqa test results
                true_name = true_name.replace('TriviaQA_Org/', '')
                key = q_id + "--" + true_name
            else:
                key = q_id

            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        df.to_csv(output_file, index=False)

    print("Computing scores")

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    # Each row's predicted_score is reduced to its maximum before ranking
    df_scores = df.copy(deep=True)
    df_scores['predicted_score'] = df_scores['predicted_score'].apply(
        lambda x: pd.Series(x).max())

    em = compute_ranked_scores(df_scores, "predicted_score", "text_em", group_by)
    f1 = compute_ranked_scores(df_scores, "predicted_score", "text_f1", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += [[str(i + 1), "%.4f" % e, "%.4f" % f]
              for i, (e, f) in enumerate(zip(em, f1))]

    table_df = pd.DataFrame(table[1:], columns=table[0]).drop(['N Paragraphs'], axis=1)
    # `.ix` was removed from pandas; `.loc` is the label-based equivalent
    best = table_df.max()
    ElasticLogger().write_log('INFO', 'Results',
                              context_dict={'model': model_name,
                                            'dataset': dataset_name,
                                            'max_EM': best.loc['EM'],
                                            'max_F1': best.loc['F1'],
                                            'result_table': str(table_df)})

    # Flatten: one output row per (paragraph row, predicted span)
    df_flat = []
    for _, question in df.iterrows():
        for text_answer, predicted_span, predicted_score in zip(
                question['text_answer'], question['predicted_span'],
                question['predicted_score']):
            new_question = dict(question.copy())
            new_question.update({
                'text_answer': text_answer,
                'predicted_span': predicted_span,
                'predicted_score': predicted_score
            })
            df_flat.append(new_question)

    results_df = pd.DataFrame(df_flat)
    results_df = results_df.sort_values(by=['question_id', 'predicted_score'],
                                        ascending=False)
    results_df = results_df.set_index(['question_id', 'text_answer'])
    results_df[['question', 'predicted_score', 'text_em']].to_csv('results.csv')

    print_table(table)
def main():
    """Command-line entry point: evaluate a saved model on SQuAD.

    Loads the ``dev`` or ``train`` questions, runs the model through
    ``trainer.test`` and prints the scalar metrics as a two column table.
    Optionally writes a ``question_id -> answer text`` JSON file
    (``--official_output``) and a per-question JSON file
    (``--per_question_loss_file``) holding either f1/em/loss per question or,
    with ``--none_prob``, only the none-probability per question.

    Unknown command-line arguments are ignored (``parse_known_args``).
    """
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str,
                        help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    parser.add_argument('-c', '--corpus', choices=["dev", "train"], default="dev")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument('--none_prob', action="store_true",
                        help="Output none probability for samples")
    parser.add_argument('--elmo', action="store_true", help="Use elmo model")
    parser.add_argument('--per_question_loss_file', type=str, default=None,
                        help="Run question by question and output a question_id -> loss output to this file")
    args = parser.parse_known_args()[0]

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Sort, then shuffle with a fixed seed, so the sample is reproducible.
        # The sorted list must be bound to a name first: shuffle() works in
        # place, so shuffling an anonymous `sorted(...)` result has no effect
        # and the "sample" degenerates to the first N questions.
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    # Largest contexts first
    questions.sort(key=lambda x: x.n_context_words, reverse=True)
    dataset = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")]
    if args.official_output is not None:
        evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))
    if args.per_question_loss_file is not None:
        evaluators.append(RecordSpanPredictionScore(args.answer_bounds[0],
                                                    args.batch_size, args.none_prob))

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()
    if args.elmo:
        # Point the language model at the local ELMo parameter files
        lm_model = model.lm_model
        lm_model.lm_vocab_file = './elmo-params/squad_train_dev_all_unique_tokens.txt'
        lm_model.options_file = './elmo-params/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json'
        lm_model.weight_file = './elmo-params/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5'
        lm_model.embed_weights_file = None

    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint,
                              not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = sorted(scalars.keys())
    table = [cols, [str(scalars[x]) for x in cols]]
    header = ["Metric", ""]
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        quid_to_para = {x.question_id: x.paragraph for x in questions}

        q_ids = evaluation.per_sample["question_id"]
        spans = evaluation.per_sample["predicted_span"]
        q_id_to_answers = {
            q_id: quid_to_para[q_id].get_original_text(start, end)
            for q_id, (start, end) in zip(q_ids, spans)
        }

        with open(args.official_output, "w") as f:
            json.dump(q_id_to_answers, f)

    if args.per_question_loss_file is not None:
        print("Saving result")
        output_file = args.per_question_loss_file

        # NOTE(review): this reads "question_ids" (plural) while the official
        # output above reads "question_id"; presumably the key emitted by
        # RecordSpanPredictionScore - confirm against that evaluator.
        ids = evaluation.per_sample["question_ids"]
        f1s = evaluation.per_sample["text_f1"]
        ems = evaluation.per_sample["text_em"]
        losses = evaluation.per_sample["loss"]

        if args.none_prob:
            # In this mode only the none-probability is written per question
            none_probs = evaluation.per_sample["none_probs"]
            results = {question_id: float(none_prob)
                       for question_id, none_prob in zip(ids, none_probs)}
        else:
            results = {question_id: {'f1': float(f1), 'em': float(em), 'loss': float(loss)}
                       for question_id, f1, em, loss in zip(ids, f1s, ems, losses)}

        with open(output_file, 'w') as f:
            json.dump(results, f)
def main():
    """Command-line entry point: evaluate a saved multiple-choice model.

    Loads the ``dev`` or ``train`` questions from ``SquadCorpus``, evaluates
    the model with a ``MultiChoiceEvaluator`` over four choices, prints the
    scalar metrics as a two column table and, when ``--official_output`` is
    given, writes a JSON file mapping each question id to its predicted and
    correct answer (text and choice letter) plus a correctness flag.
    """
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str,
                        help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=45,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    parser.add_argument('-c', '--corpus', choices=["dev", "train"], default="dev")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    args = parser.parse_args()

    # Number of answer choices per question; must match list_of_choices below
    num_choices = 4
    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Sort, then shuffle with a fixed seed, so the sample is reproducible.
        # The sorted list must be bound to a name first: shuffle() works in
        # place, so shuffling an anonymous `sorted(...)` result has no effect
        # and the "sample" degenerates to the first N questions.
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    # Largest contexts first
    questions.sort(key=lambda x: x.n_context_words, reverse=True)

    # Honour --batch_size (default 45, the value that used to be hard-coded)
    batcher = ClusteredBatcher(args.batch_size, ContextLenKey(), False, False)
    dataset = ParagraphAndQuestionDataset(questions, batcher, None, num_choices)

    evaluators = [MultiChoiceEvaluator(num_choices)]

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()

    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint,
                              not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = sorted(scalars.keys())
    table = [cols, [str(scalars[x]) for x in cols]]
    header = ["Metric", ""]
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        list_of_choices = ['A', 'B', 'C', 'D']
        per_sample = evaluation.per_sample
        q_ids = per_sample["question_id"]
        correct_ans = per_sample["correct answer"]
        correct_ids = per_sample["correct index"]
        # The misspelled keys below are the ones read from the evaluation
        # results and written to the output file; they are kept verbatim.
        pred_ids = per_sample["predictied index"]
        pred_ans = per_sample["predictied answer"]
        is_correct = per_sample["is correct"]

        data_to_dump = {}
        for ix, q_id in enumerate(q_ids):
            data_to_dump[q_id] = {
                'Is Correct': 'True' if is_correct[ix] else 'False',
                'predictied': [' '.join(pred_ans[ix]), list_of_choices[pred_ids[ix]]],
                'correct': [' '.join(correct_ans[ix]), list_of_choices[correct_ids[ix]]],
            }

        with open(args.official_output, "w") as f:
            json.dump(data_to_dump, f)