def show_unk(corpus: SquadCorpus, vec_name: str,
             context: bool = True, question: bool = True):
    """Print the training-set words that have no pruned word vector.

    Every word is lower-cased before the lookup; words whose lower-cased
    form is missing from the vectors are tallied and printed one per line
    as ``word: count``, most frequent first.

    :param corpus: corpus providing ``get_pruned_word_vecs`` and ``get_train``
    :param vec_name: name passed to ``corpus.get_pruned_word_vecs``
    :param context: if true, count words from each paragraph's ``text``
    :param question: if true, count words from each paragraph's questions
    """
    vecs = corpus.get_pruned_word_vecs(vec_name)
    docs = corpus.get_train()

    lower_unk = Counter()

    def count_unknown(words):
        # Tally every word whose lower-cased form has no vector.
        for word in words:
            lowered = word.lower()
            if lowered not in vecs:
                lower_unk[lowered] += 1

    for doc in docs:
        for para in doc.paragraphs:
            if context:
                for sent in para.text:
                    count_unknown(sent)
            if question:
                # Use a distinct loop name: rebinding `question` here would
                # replace the caller's flag with a question object for every
                # following paragraph.
                for para_question in para.questions:
                    count_unknown(para_question.words)

    print("\n".join("%s: %d" % (k, v) for k, v in lower_unk.most_common()))
def main():
    """Print two mean F1 scores over the first answer of every training question.

    The first score compares the answer's own text with the space-joined
    paragraph tokens of its span; the second compares it with the result of
    ``para.get_original_text`` for the same span.
    """
    corpus = SquadCorpus()
    train_docs = corpus.get_train()

    question_count = 0
    token_f1_total = 0
    original_f1_total = 0

    for document in tqdm(train_docs):
        for paragraph in document.paragraphs:
            # Flatten once per paragraph; answer spans index into this list.
            tokens = flatten_iterable(paragraph.text)
            for q in paragraph.questions:
                question_count += 1
                first_answer = q.answer[0]

                # The end index is inclusive, hence the +1 in the slice.
                span_tokens = tokens[
                    first_answer.para_word_start:first_answer.para_word_end + 1
                ]
                token_text = " ".join(span_tokens)
                gold_text = first_answer.text
                original_text = paragraph.get_original_text(
                    first_answer.para_word_start, first_answer.para_word_end)

                token_f1_total += f1_score(gold_text, token_text)
                original_f1_total += f1_score(gold_text, original_text)

    print(token_f1_total / question_count)
    print(original_f1_total / question_count)
def show_nums(corpus: SquadCorpus):
    """Print every training-set token that contains a digit, in random order.

    Documents, paragraphs, and sentences/questions are each shuffled with
    ``np.random.shuffle`` before being scanned. Note that the list returned
    by ``corpus.get_train()`` is shuffled in place.

    :param corpus: corpus providing ``get_train``
    """
    digit_pattern = re.compile(".*[0-9].*")

    train_docs = corpus.get_train()
    np.random.shuffle(train_docs)

    for document in train_docs:
        shuffled_paragraphs = list(document.paragraphs)
        np.random.shuffle(shuffled_paragraphs)

        for paragraph in shuffled_paragraphs:
            # NOTE(review): this reads `paragraph.context`; confirm the
            # attribute name against the paragraph class.
            token_lists = list(paragraph.context)
            token_lists.extend([q.words for q in paragraph.questions])
            np.random.shuffle(token_lists)

            for tokens in token_lists:
                for token in tokens:
                    if digit_pattern.match(token) is None:
                        continue
                    print(token)
def show_in_context_unks(corpus: SquadCorpus, vec_name):
    """Print each out-of-vocabulary training token with its surrounding words.

    A token is reported when its lower-cased form is not in the pruned word
    vectors. It is wrapped in ``{{{ }}}`` and printed with up to ten tokens
    before it and up to nine after it. Documents, paragraphs, and
    sentences/questions are visited in shuffled order; the list returned by
    ``corpus.get_train()`` is shuffled in place.

    :param corpus: corpus providing ``get_train`` and ``get_pruned_word_vecs``
    :param vec_name: name passed to ``corpus.get_pruned_word_vecs``
    """
    train_docs = corpus.get_train()
    np.random.shuffle(train_docs)
    vecs = corpus.get_pruned_word_vecs(vec_name)

    for document in train_docs:
        shuffled_paragraphs = list(document.paragraphs)
        np.random.shuffle(shuffled_paragraphs)

        for paragraph in shuffled_paragraphs:
            token_lists = list(paragraph.text)
            token_lists += [q.words for q in paragraph.questions]
            np.random.shuffle(token_lists)

            for tokens in token_lists:
                for idx, token in enumerate(tokens):
                    if token.lower() in vecs:
                        continue
                    window_start = max(0, idx - 10)
                    window_end = min(len(tokens), idx + 10)
                    # Mark the token in place just for printing, then put the
                    # original back so the corpus data is left untouched.
                    tokens[idx] = "{{{" + token + "}}}"
                    print(" ".join(tokens[window_start:window_end]))
                    tokens[idx] = token
def main():
    """Run ``preprocess_par`` with a ``SquadTfIdfRanker`` and print length stats.

    The split is chosen by ``OPTS.split`` ('train', otherwise dev). The
    result is passed to ``write_output`` when ``OPTS.out_file`` is set.
    """
    corpus = SquadCorpus()
    prepro = SquadTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True)

    if OPTS.split == 'train':
        orig_data = corpus.get_train()
    else:
        orig_data = corpus.get_dev()

    # One entry per question: the length of the first element of the
    # paragraph's text.
    # NOTE(review): the "new" lengths below use len(p.text) rather than
    # len(p.text[0]) -- confirm the two are comparable.
    orig_lens = []
    for doc in orig_data:
        for para in doc.paragraphs:
            for _ in para.questions:
                orig_lens.append(len(para.text[0]))

    new_data = preprocess_par(orig_data, corpus.evidence, prepro, n_processes=1)

    new_lens = []
    for ranked_question in new_data:
        for para in ranked_question.paragraphs:
            new_lens.append(len(para.text))

    orig_summary = (len(orig_lens), np.mean(orig_lens))
    new_summary = (len(new_lens), np.mean(new_lens))
    print('%d original, mean %.2f words' % orig_summary)
    print('%d new, mean %.2f words' % new_summary)

    if OPTS.out_file:
        write_output(OPTS.split, new_data, OPTS.out_file)
def main():
    """Run ``preprocess_par`` with a TF-IDF ranker and print length stats.

    ``OPTS.use_vec_dist`` selects ``SquadVectorTfIdfRanker`` (built with the
    'glove.840B.300d' pruned vectors) instead of ``SquadTfIdfRanker``.
    ``OPTS.normalize_before_ranking`` supplies a ``WordNormalizer`` to either
    ranker. The split is chosen by ``OPTS.split`` ('train', otherwise dev),
    and the result is passed to ``write_output`` when ``OPTS.out_file`` is set.
    """
    corpus = SquadCorpus()

    normalizer = WordNormalizer() if OPTS.normalize_before_ranking else None

    if not OPTS.use_vec_dist:
        prepro = SquadTfIdfRanker(
            NltkPlusStopWords(True), OPTS.num_per_orig, True,
            word_normalizer=normalizer)
    else:
        word_vecs = corpus.get_pruned_word_vecs('glove.840B.300d')
        prepro = SquadVectorTfIdfRanker(
            NltkPlusStopWords(True), OPTS.num_per_orig, True, word_vecs,
            word_normalizer=normalizer)

    if OPTS.split == 'train':
        orig_data = corpus.get_train()
    else:
        orig_data = corpus.get_dev()

    # One entry per question: the length of the first element of the
    # paragraph's text.
    # NOTE(review): the "new" lengths below use len(p.text) rather than
    # len(p.text[0]) -- confirm the two are comparable.
    orig_lens = []
    for doc in orig_data:
        for para in doc.paragraphs:
            for _ in para.questions:
                orig_lens.append(len(para.text[0]))

    new_data = preprocess_par(orig_data, corpus.evidence, prepro, n_processes=1)

    new_lens = []
    for ranked_question in new_data:
        for para in ranked_question.paragraphs:
            new_lens.append(len(para.text))

    orig_summary = (len(orig_lens), np.mean(orig_lens))
    new_summary = (len(new_lens), np.mean(new_lens))
    print('%d original, mean %.2f words' % orig_summary)
    print('%d new, mean %.2f words' % new_summary)

    if OPTS.out_file:
        write_output(OPTS.split, new_data, OPTS.out_file)
def show_features(corpus: SquadCorpus, vec_name):
    """Print, per word feature, the most common out-of-vocabulary words having it.

    Takes the first 100 training documents after a shuffle, collects every
    token whose lower-cased form is missing from the pruned word vectors,
    and groups those tokens by each ``BasicWordFeatures`` feature whose
    value is positive. For every feature except "Len", prints a header with
    the number of distinct words and total occurrences, followed by the 30
    most common words. The list returned by ``corpus.get_train()`` is
    shuffled in place.

    :param corpus: corpus providing ``get_train`` and ``get_pruned_word_vecs``
    :param vec_name: name passed to ``corpus.get_pruned_word_vecs``
    """
    print("Loading train docs")
    train_docs = corpus.get_train()
    np.random.shuffle(train_docs)
    train_docs = train_docs[:100]

    print("Loading vectors")
    vecs = corpus.get_pruned_word_vecs(vec_name)
    featurizer = BasicWordFeatures()

    # feature index -> Counter of the unknown words that have that feature
    words_by_feature = defaultdict(Counter)

    print("start")

    for document in train_docs:
        shuffled_paragraphs = list(document.paragraphs)
        np.random.shuffle(shuffled_paragraphs)
        for paragraph in shuffled_paragraphs:
            token_lists = list(paragraph.text)
            token_lists += [q.words for q in paragraph.questions]
            np.random.shuffle(token_lists)
            for tokens in token_lists:
                for token in tokens:
                    if token.lower() in vecs:
                        continue
                    feature_values = featurizer.get_word_features(token)
                    for feature_ix, value in enumerate(feature_values):
                        if value > 0:
                            words_by_feature[feature_ix][token] += 1

    for feature_ix in sorted(words_by_feature.keys()):
        feature_name = BasicWordFeatures.features_names[feature_ix]
        if feature_name == "Len":
            continue
        counts = words_by_feature[feature_ix]
        print()
        print("*" * 30)
        print("%s-%d %d (%d)" % (feature_name, feature_ix, len(counts),
                                 sum(counts.values())))
        for word, count in counts.most_common(30):
            print("%s: %d" % (word, count))
def main():
    """Command-line entry point: evaluate a trained model on a SQuAD-style corpus.

    Parses the command line, loads the questions selected by ``--corpus``,
    optionally subsamples them, runs ``trainer.test`` with a span evaluator,
    prints the scalar metrics as a two-column table and, when
    ``--official_output`` is given, writes a ``question_id -> answer text``
    JSON file.
    """
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str,
                        help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    # "ja_test" evaluates on the multilingual QA test set; "pred" runs on a
    # dataset built from the file given by --pred_filepath.
    parser.add_argument('-c', '--corpus',
                        choices=["dev", "train", "ja_test", "pred"], default="dev")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    # Input file for the "pred" corpus mode.
    parser.add_argument('-p', '--pred_filepath', default=None,
                        help="The csv file path if you try pred mode")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    elif args.corpus == "ja_test":
        questions = corpus.get_ja_test()
    elif args.corpus == "pred":
        # NOTE(review): --pred_filepath defaults to None; confirm that
        # create_pred_dataset handles a missing path.
        questions = create_pred_dataset(args.pred_filepath)
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Sort by id first so the seeded shuffle yields the same sample no
        # matter what order the questions were loaded in. The shuffle must be
        # applied to the list we keep: shuffling the temporary returned by
        # sorted() (as before) discards the result and leaves the sample as
        # simply the first N questions.
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    # Order by descending context length before fixed-order batching.
    questions.sort(key=lambda x: x.n_context_words, reverse=True)
    dataset = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")]
    if args.official_output is not None:
        evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        # No explicit step: prefer the saved best weights, otherwise fall
        # back to the latest checkpoint.
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()

    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint,
                              not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = list(sorted(scalars.keys()))
    table = [cols]
    header = ["Metric", ""]
    table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        quid_to_para = {}
        for x in questions:
            quid_to_para[x.question_id] = x.paragraph

        q_id_to_answers = {}
        q_ids = evaluation.per_sample["question_id"]
        spans = evaluation.per_sample["predicted_span"]
        for q_id, (start, end) in zip(q_ids, spans):
            text = quid_to_para[q_id].get_original_text(start, end)
            q_id_to_answers[q_id] = text

        with open(args.official_output, "w") as f:
            json.dump(q_id_to_answers, f)
def main():
    """Command-line entry point: evaluate a trained model on SQuAD dev or train.

    Parses the command line (unknown arguments are ignored), loads and
    optionally subsamples the questions, runs ``trainer.test`` with a span
    evaluator and prints the scalar metrics as a two-column table.
    Optional outputs:

    * ``--official_output``: ``question_id -> answer text`` JSON file.
    * ``--per_question_loss_file``: per-question JSON file holding either
      f1/em/loss, or only the none-probability when ``--none_prob`` is set.
    """
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str,
                        help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    parser.add_argument('-c', '--corpus', choices=["dev", "train"], default="dev")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument('--none_prob', action="store_true",
                        help="Output none probability for samples")
    parser.add_argument('--elmo', action="store_true", help="Use elmo model")
    parser.add_argument('--per_question_loss_file', type=str, default=None,
                        help="Run question by question and output a question_id -> loss output to this file")
    args = parser.parse_known_args()[0]

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Sort by id first so the seeded shuffle yields the same sample no
        # matter what order the questions were loaded in. The shuffle must be
        # applied to the list we keep: shuffling the temporary returned by
        # sorted() (as before) discards the result and leaves the sample as
        # simply the first N questions.
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    # Order by descending context length before fixed-order batching.
    questions.sort(key=lambda x: x.n_context_words, reverse=True)
    dataset = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")]
    if args.official_output is not None:
        evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))
    if args.per_question_loss_file is not None:
        evaluators.append(RecordSpanPredictionScore(
            args.answer_bounds[0], args.batch_size, args.none_prob))

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        # No explicit step: prefer the saved best weights, otherwise fall
        # back to the latest checkpoint.
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()
    if args.elmo:
        # Point the language model at the local ELMo parameter files.
        model.lm_model.lm_vocab_file = './elmo-params/squad_train_dev_all_unique_tokens.txt'
        model.lm_model.options_file = './elmo-params/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json'
        model.lm_model.weight_file = './elmo-params/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5'
        model.lm_model.embed_weights_file = None

    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint,
                              not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = list(sorted(scalars.keys()))
    table = [cols]
    header = ["Metric", ""]
    table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        quid_to_para = {}
        for x in questions:
            quid_to_para[x.question_id] = x.paragraph

        q_id_to_answers = {}
        q_ids = evaluation.per_sample["question_id"]
        spans = evaluation.per_sample["predicted_span"]
        for q_id, (start, end) in zip(q_ids, spans):
            text = quid_to_para[q_id].get_original_text(start, end)
            q_id_to_answers[q_id] = text

        with open(args.official_output, "w") as f:
            json.dump(q_id_to_answers, f)

    if args.per_question_loss_file is not None:
        print("Saving result")
        output_file = args.per_question_loss_file
        # NOTE(review): this reads "question_ids" while the official-output
        # branch above reads "question_id" -- confirm which key
        # RecordSpanPredictionScore emits.
        ids = evaluation.per_sample["question_ids"]
        f1s = evaluation.per_sample["text_f1"]
        ems = evaluation.per_sample["text_em"]
        losses = evaluation.per_sample["loss"]

        if args.none_prob:
            # In this mode only the none-probability is written per question;
            # f1/em/loss are deliberately left out of the file.
            none_probs = evaluation.per_sample["none_probs"]
            results = {question_id: float(none_prob)
                       for question_id, none_prob in zip(ids, none_probs)}
        else:
            results = {question_id: {'f1': float(f1), 'em': float(em), 'loss': float(loss)}
                       for question_id, f1, em, loss in zip(ids, f1s, ems, losses)}

        with open(output_file, 'w') as f:
            json.dump(results, f)