def _init(self, loader: ResourceLoader, voc: Iterable[str]):
    """
    Build the word-to-index table and the embedding matrix tensor.

    Row 0 of the matrix is an all-zero embedding and row 1 is the `unk_embed` variable, so the
    indices handed out to special tokens and words start at 2.

    :param loader: used to load the vectors named by `self.vec_name`
    :param voc: words to load vectors for; if None, every word the loader returns is used
    """
    # TODO we should not be building variables here
    if voc is None:
        word_to_vec = loader.load_word_vec(self.vec_name)
        voc = set(word_to_vec.keys())
    else:
        word_to_vec = loader.load_word_vec(self.vec_name, voc)

    word_to_ix = {}
    self._word_to_ix = word_to_ix

    # The embedding size is read off an arbitrary loaded vector.
    dim = next(iter(word_to_vec.values())).shape[0]

    null_embed = tf.zeros((1, dim), dtype=tf.float32)
    unk_embed = tf.get_variable(
        shape=(1, dim),
        name="unk_embed",
        dtype=np.float32,
        trainable=self.learn_unk,
        initializer=tf.random_uniform_initializer(
            -self.word_vec_init_scale, self.word_vec_init_scale))
    next_ix = 2
    matrix_list = [null_embed, unk_embed]

    special_tokens = self._special_tokens
    if special_tokens is not None and len(special_tokens) > 0:
        print("Building embeddings for %d special_tokens" % (len(special_tokens)))
        tok_embed = tf.get_variable(
            shape=(len(special_tokens), dim),
            name="token_embed",
            dtype=np.float32,
            trainable=True,
            initializer=tf.random_uniform_initializer(
                -self.word_vec_init_scale, self.word_vec_init_scale))
        matrix_list.append(tok_embed)
        for token in special_tokens:
            word_to_ix[token] = next_ix
            next_ix += 1

    rows = []
    for word in voc:
        if word in word_to_ix:
            # Already indexed (a special token, or the lower-cased form of an earlier word).
            continue
        if word in word_to_vec:
            key = word
        else:
            # Fall back to the lower-case version, unless it is unknown or already indexed.
            key = word.lower()
            if key not in word_to_vec or key in word_to_ix:
                continue
        rows.append(word_to_vec[key])
        word_to_ix[key] = next_ix
        next_ix += 1

    print("Had pre-trained word embeddings for %d of %d words" % (len(rows), len(voc)))

    matrix_list.append(tf.constant(value=np.vstack(rows)))

    self._word_emb_mat = tf.concat(matrix_list, axis=0)
def squad_build_drqa_doc_encodings(out_dir, encoder_model, num_workers, all_squad=False):
    """
    Encode the paragraphs of the SQuAD documents found in the DrQA document DB.

    Writes `docs.json` (the `documents` mapping collected from `get_document_paragraphs`) and
    `encodings.npz` (one array per paragraph, keyed `"{title}_{paragraph index}"`) to `out_dir`.

    :param out_dir: directory the two output files are written to
    :param encoder_model: model directory handed to `SentenceEncoderSingleContext`
    :param num_workers: number of tokenization worker processes
    :param all_squad: if True, use the train questions in addition to the dev questions
    """
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    # Copy, so that extending below cannot mutate a list the corpus may still hold on to.
    questions = list(corpus.get_dev())
    if all_squad:
        questions.extend(corpus.get_train())

    # Map every title as it is stored in the DB back to the title SQuAD uses. Building the
    # mapping in one pass avoids relying on two separate sets iterating in the same order.
    title_to_doc_id = {}
    for squad_title in {q.paragraph.doc_title for q in questions}:
        # Had to manually resolve this (due to changes in Wikipedia?)
        db_title = "Sky UK" if squad_title == "Sky (United Kingdom)" else squad_title
        title_to_doc_id[db_title] = squad_title

    conn = sqlite3.connect(DRQA_DOC_DB)
    try:
        c = conn.cursor()
        c.execute("CREATE TEMPORARY TABLE squad_docs(id)")
        c.executemany("INSERT INTO squad_docs VALUES (?)",
                      [(db_title,) for db_title in title_to_doc_id])
        c.execute("SELECT id, text FROM documents WHERE id IN squad_docs")
        out = c.fetchall()
    finally:
        # Release the connection even if one of the statements fails.
        conn.close()

    out = [(title_to_doc_id[title], text) for title, text in out]

    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=ResourceLoader())

    documents = {}
    tokenized_documents = {}

    print("Tokenizing...")
    # Setup worker pool
    workers = ProcessPool(num_workers, initializer=init, initargs=[])
    try:
        for doc, tok_doc in tqdm(workers.imap_unordered(get_document_paragraphs, out),
                                 total=len(out)):
            documents.update(doc)
            tokenized_documents.update(tok_doc)
    finally:
        # The pool is only needed for tokenization; do not leave its processes behind.
        workers.close()
        workers.join()

    encodings = {}
    # The encoder input type requires a question; only the paragraph encodings are used.
    dummy_question = "Hello Hello".split()
    print("Encoding...")
    for title, paragraphs in tqdm(tokenized_documents.items()):
        model_paragraphs = [
            BinaryQuestionAndParagraphs(question=dummy_question,
                                        paragraphs=[x],
                                        label=1,
                                        num_distractors=0,
                                        question_id='dummy')
            for x in paragraphs
        ]
        paragraph_reps = encoder.encode_paragraphs(model_paragraphs)
        encodings.update({f"{title}_{i}": rep for i, rep in enumerate(paragraph_reps)})

    with open(join(out_dir, 'docs.json'), 'w') as f:
        json.dump(documents, f)
    np.savez_compressed(join(out_dir, 'encodings.npz'), **encodings)
def build_openqa_top_titles(out_file, questions_file, docs_file, encodings_dir, encoder_model, k,
                            use_ema: bool, checkpoint: str, safety_mult: int, n_titles: int):
    """
    Attach to every question the `k` paragraphs, among its top titles, nearest to its encoding.

    The questions are read from `questions_file`, get a `'paragraphs'` entry holding the
    retrieved paragraph texts, and are dumped as json to `out_file`.

    :param out_file: path of the json file to write
    :param questions_file: json list of questions with `'qid'`, `'question'`, `'top_titles'`
    :param docs_file: json file indexed as `documents[title][paragraph number]`
    :param encodings_dir: passed to `init_encoding_handler` in every encodings worker
    :param encoder_model: model directory handed to `SentenceEncoderSingleContext`
    :param k: number of distinct paragraphs kept per question
    :param use_ema: forwarded to the encoder
    :param checkpoint: forwarded to the encoder
    :param safety_mult: `k * safety_mult` neighbours are requested, because several hits can
        resolve to the same paragraph name and are collapsed afterwards
    :param n_titles: if not None, only the first `n_titles` titles of each question are searched
    """
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')

    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])
    try:
        for tok_q in tqdm(workers.imap_unordered(tokenize_question, tupled_questions),
                          total=len(tupled_questions)):
            qid2tokenized.update(tok_q)
    finally:
        workers.close()
        workers.join()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print("Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(32, initializer=init_encoding_handler, initargs=[encodings_dir])
    try:
        print("Loading encoder...")
        spec = QuestionAndParagraphsSpec(batch_size=None,
                                         max_num_contexts=1,
                                         max_num_question_words=None,
                                         max_num_context_words=None)
        encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                               vocabulary=voc,
                                               spec=spec,
                                               loader=ResourceLoader(),
                                               use_char_inputs=False,
                                               use_ema=use_ema,
                                               checkpoint=checkpoint)

        print("Encoding questions...")
        q_encodings = encoder.encode_text_questions(
            [qid2tokenized[q['qid']] for q in questions],
            return_search_vectors=True,
            show_progress=True)

        print("Calculating similarities...")
        for idx, question in tqdm(enumerate(questions), total=len(questions), ncols=80):
            q_titles = question['top_titles']
            if n_titles is not None:
                q_titles = q_titles[:n_titles]

            # Encodings are fetched per question instead of for all titles up front.
            title2encs = {}
            title2idx2par_name = {}
            for t2enc, t2id2p in workers.imap_unordered(get_title_mappings_from_saver,
                                                        q_titles):
                title2encs.update(t2enc)
                title2idx2par_name.update(t2id2p)
            q_enc = q_encodings[idx]

            # Stack the encodings of all titles, remembering which rows belong to which title.
            title2ids = {}
            reps = []
            total_sentences = 0
            titles_offset_dict = {}
            for title in q_titles:
                titles_offset_dict[title] = total_sentences
                rep = title2encs[title]
                title2ids[title] = list(range(total_sentences, total_sentences + len(rep)))
                reps.append(rep)
                total_sentences += len(rep)
            id2title = {i: title for title, ids in title2ids.items() for i in ids}
            reps = np.concatenate(reps, axis=0)

            top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), reps, k * safety_mult)[0]

            # Resolve each hit once and keep the first occurrence of every paragraph name.
            seen = set()
            p_names = []
            for rep_id in top_k:
                title = id2title[rep_id]
                p_name = title2idx2par_name[title][rep_id - titles_offset_dict[title]]
                if p_name not in seen:
                    seen.add(p_name)
                    p_names.append(p_name)
            p_names = p_names[:k]

            question['paragraphs'] = [parname_to_text(p_name) for p_name in p_names]
    finally:
        # The encodings pool was previously never shut down.
        workers.close()
        workers.join()

    with open(out_file, 'w') as f:
        json.dump(questions, f)
def build_openqa_iterative_top_titles(out_file, questions_file, docs_file, encodings_dir,
                                      encoder_model, k1, k2, n1, n2, evaluate: bool,
                                      reformulate_from_text: bool, use_ema: bool,
                                      checkpoint: str):
    """
    Two-step retrieval of paragraph pairs from the top titles of every question.

    Step one keeps the `k1` paragraphs, among the first `n1` titles, nearest to the question.
    The question is then reformulated once per kept paragraph, and step two keeps the `k2`
    best (first paragraph, second paragraph) pairs among the first `n2` titles. The pairs are
    stored under `'paragraph_pairs'` and the questions are dumped as json to `out_file`.

    :param out_file: path of the json file to write
    :param questions_file: json list of questions with `'qid'`, `'question'`, `'top_titles'`
    :param docs_file: json file indexed as `documents[title][paragraph number]`
    :param encodings_dir: passed to `init_encoding_handler` in every encodings worker
    :param encoder_model: model directory handed to `SentenceEncoderIterativeModel`
    :param k1: number of first-step paragraphs per question
    :param k2: number of paragraph pairs per question
    :param n1: number of titles searched in the first step (None slices to all titles)
    :param n2: number of titles searched in the second step (None slices to all titles)
    :param evaluate: if True, `eval_questions` is called on the result
    :param reformulate_from_text: reformulate from tokenized texts instead of from encodings
    :param use_ema: forwarded to the encoder
    :param checkpoint: forwarded to the encoder
    """
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')

    if n1 is not None and n2 is not None:
        # No step looks past max(n1, n2) titles, so the rest need not be loaded.
        max_titles = max(n1, n2)
        for q in questions:
            q['top_titles'] = q['top_titles'][:max_titles]

    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])
    try:
        for tok_q in tqdm(workers.imap_unordered(tokenize_question, tupled_questions),
                          total=len(tupled_questions)):
            qid2tokenized.update(tok_q)
    finally:
        # Previously this pool was leaked when `workers` was rebound below.
        workers.close()
        workers.join()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    all_titles = list(set([title for q in questions for title in q['top_titles']]))

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print("Gathering documents...")
    title2encs = {}
    title2idx2par_name = {}
    # Setup worker pool
    workers = ProcessPool(32, initializer=init_encoding_handler, initargs=[encodings_dir])
    try:
        for t2enc, t2id2p in tqdm(workers.imap_unordered(get_title_mappings_from_saver,
                                                         all_titles),
                                  total=len(all_titles)):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
    finally:
        # All encodings are in memory now; the pool is not used again.
        workers.close()
        workers.join()

    # For every title: paragraph name -> sorted row indices of that paragraph's encodings.
    title2par_name2idxs = {}
    for title, id2par in title2idx2par_name.items():
        par2idxs = {}
        for idx, parname in id2par.items():
            par2idxs.setdefault(parname, []).append(idx)
        title2par_name2idxs[title] = {par: sorted(idxs) for par, idxs in par2idxs.items()}

    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=2,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model,
                                            vocabulary=voc,
                                            spec=spec,
                                            loader=ResourceLoader(),
                                            use_char_inputs=False,
                                            use_ema=use_ema,
                                            checkpoint=checkpoint)

    print("Encoding questions...")
    q_original_encodings = encoder.encode_text_questions(
        [qid2tokenized[q['qid']] for q in questions],
        return_search_vectors=False,
        show_progress=True)
    q_search_encodings = encoder.question_rep_to_search_vector(
        question_encodings=q_original_encodings)

    init()  # for initializing the tokenizer

    def stack_title_reps(titles):
        """Stack the encodings of `titles`; also return row -> title and title -> first row."""
        title2ids = {}
        stacked = []
        total_sentences = 0
        offsets = {}
        for title in titles:
            offsets[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(range(total_sentences, total_sentences + len(rep)))
            stacked.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        return np.concatenate(stacked, axis=0), id2title, offsets

    def id_to_par_name(rep_id, id2title, offsets):
        title = id2title[rep_id]
        return title2idx2par_name[title][rep_id - offsets[title]]

    print("Calculating similarities...")
    for idx, question in tqdm(enumerate(questions), total=len(questions)):
        # First step: paragraphs nearest to the question itself.
        all_par_reps, id2title, offsets = stack_title_reps(question['top_titles'][:n1])

        q_enc = q_search_encodings[idx]
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps, k1 * 2)[0]

        # Several rows can belong to one paragraph; keep the first occurrence of each name.
        seen = set()
        p_names = []
        for rep_id in top_k:
            p_name = id_to_par_name(rep_id, id2title, offsets)
            if p_name not in seen:
                seen.add(p_name)
                p_names.append(p_name)
        p_names = p_names[:k1]

        iteration1_paragraphs = \
            [title2encs[par_name_to_title(pname)][title2par_name2idxs[par_name_to_title(pname)][pname], :]
             for pname in p_names]

        if not reformulate_from_text:
            reformulations = encoder.reformulate_questions(
                questions_rep=np.tile(q_original_encodings[idx], reps=(len(p_names), 1)),
                paragraphs_rep=iteration1_paragraphs,
                return_search_vectors=True)
        else:
            tok_q = tokenize(question['question']).words()
            par_texts = [tokenize(parname_to_text(pname)).words() for pname in p_names]
            reformulations = encoder.reformulate_questions_from_texts(
                tokenized_questions=[tok_q for _ in range(len(par_texts))],
                tokenized_pars=par_texts,
                return_search_vectors=True)

        # Second step: row ids now index the stack built from the first n2 titles, so they
        # must be resolved against that stack's mappings, not the first step's. The original
        # nested function got this through rebinding of its closure variables.
        all_par_reps, id2title, offsets = stack_title_reps(question['top_titles'][:n2])

        top_k_second = numpy_global_knn(reformulations, all_par_reps, k2 * k1)

        seen = set()
        final_p_name_pairs = []
        for x1, x2 in top_k_second:
            pair = (p_names[x1], id_to_par_name(x2, id2title, offsets))
            if pair not in seen:
                seen.add(pair)
                final_p_name_pairs.append(pair)
        final_p_name_pairs = final_p_name_pairs[:k2]

        # important to note that in the iterative dataset the paragraphs of each question are in pairs
        question['paragraph_pairs'] = final_p_name_pairs

    with open(out_file, 'w') as f:
        json.dump(questions, f)

    if evaluate:
        eval_questions(questions)
def encode_from_file(docs_file, questions_file, encodings_dir, encoder_model, num_workers,
                     hotpot: bool, long_batch: int, short_batch: int, use_chars: bool,
                     use_ema: bool, checkpoint: str, document_chunk_size=1000, samples=None,
                     encode_all_db=False):
    """
    Tokenize documents, encode their paragraphs and save the encodings chunk by chunk.

    Titles already listed in the encoding handler's `titles2filenames` are skipped.

    :param docs_file: path to a json file mapping each title to its paragraphs; when None the
        documents are tokenized from the DB instead
    :param questions_file: only used when `docs_file` is None: json list of questions whose
        `'top_titles'` select the DB titles to encode; when also None, all DB titles are used
    :param encodings_dir: directory handed to `DocumentEncodingHandler`
    :param encoder_model: model directory handed to the sentence encoder
    :param num_workers: number of tokenization worker processes
    :param hotpot: use the iterative model instead of the single-context model
    :param long_batch: batch size for paragraphs of at least 900 tokens
    :param short_batch: batch size for paragraphs shorter than 900 tokens
    :param use_chars: forwarded to the encoder as `use_char_inputs`
    :param use_ema: forwarded to the encoder
    :param checkpoint: forwarded to the encoder
    :param document_chunk_size: number of documents encoded and saved together
    :param samples: if not None, only the first `samples` documents/titles are considered
    :param encode_all_db: not read by this function
    :return: None
    """
    doc_encs_handler = DocumentEncodingHandler(encodings_dir)
    # Setup worker pool
    workers = ProcessPool(num_workers, initializer=init, initargs=[])

    # Pick the tokenization job: what to tokenize, with which worker function.
    if docs_file is not None:
        with open(docs_file, 'r') as f:
            documents = json.load(f)
        documents = {
            title: pars for title, pars in documents.items()
            if title not in doc_encs_handler.titles2filenames
        }
        work_items = [(title, pars) for title, pars in documents.items()]
        if samples is not None:
            print(f"sampling {samples} samples")
            work_items = work_items[:samples]
        tokenize_fn = tokenize_document
        announcement = "Tokenizing from file..."
    else:
        if questions_file is not None:
            with open(questions_file, 'r') as f:
                questions = json.load(f)
            all_titles = list(set([title for q in questions for title in q['top_titles']]))
        else:
            print("encoding all DB!")
            all_titles = DocDB().get_doc_titles()
        if samples is not None:
            print(f"sampling {samples} samples")
            all_titles = all_titles[:samples]
        work_items = [t for t in all_titles if t not in doc_encs_handler.titles2filenames]
        tokenize_fn = tokenize_from_db
        announcement = "Tokenizing from DB..."

    tokenized_documents = {}
    print(announcement)
    with tqdm(total=len(work_items), ncols=80) as pbar:
        for tok_doc in tqdm(workers.imap_unordered(tokenize_fn, work_items)):
            tokenized_documents.update(tok_doc)
            pbar.update()

    workers.close()
    workers.join()

    voc = set()
    for paragraphs in tokenized_documents.values():
        for par in paragraphs:
            voc.update(par)

    # The two models differ only in the encoder class and the number of contexts.
    if hotpot:
        num_contexts, encoder_cls = 2, SentenceEncoderIterativeModel
    else:
        num_contexts, encoder_cls = 1, SentenceEncoderSingleContext
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=num_contexts,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    encoder = encoder_cls(model_dir_path=encoder_model,
                          vocabulary=voc,
                          spec=spec,
                          loader=ResourceLoader(),
                          use_char_inputs=use_chars,
                          use_ema=use_ema,
                          checkpoint=checkpoint)

    doc_items = list(tokenized_documents.items())
    chunks = [
        doc_items[start:start + document_chunk_size]
        for start in range(0, len(doc_items), document_chunk_size)
    ]
    dummy_question = "Hello Hello".split()

    for chunk in tqdm(chunks, ncols=80):
        # Empty paragraphs (probably short strings the tokenization removed) get no encoding,
        # but they still count in the paragraph numbering so names stay aligned with the docs.
        named_pars = []
        for title, pars in chunk:
            for par_num, par in enumerate(pars):
                if len(par) > 0:
                    named_pars.append((f"{title}_{par_num}", par))
        # Longest paragraphs first, so that out-of-memory failures show up early.
        named_pars = sorted(named_pars, key=lambda named: len(named[1]), reverse=True)

        long_ids = []
        short_ids = []
        for pos, (_, par) in enumerate(named_pars):
            if len(par) >= 900:
                long_ids.append(pos)
            else:
                short_ids.append(pos)

        if not hotpot:
            model_paragraphs = [
                BinaryQuestionAndParagraphs(question=dummy_question,
                                            paragraphs=[par],
                                            label=1,
                                            num_distractors=0,
                                            question_id='dummy')
                for _, par in named_pars
            ]
        else:
            # todo allow precomputed sentence segments
            model_paragraphs = [
                IterativeQuestionAndParagraphs(question=dummy_question,
                                               paragraphs=[par, dummy_question],
                                               first_label=1,
                                               second_label=1,
                                               question_id='dummy',
                                               sentence_segments=None)
                for _, par in named_pars
            ]

        # Long paragraphs are encoded before short ones, each group with its own batch size.
        name2enc = {}
        for group_ids, group_batch in ((long_ids, long_batch), (short_ids, short_batch)):
            group = [model_paragraphs[pos] for pos in group_ids]
            if not hotpot:
                group_encs = encoder.encode_paragraphs(group,
                                                       batch_size=group_batch,
                                                       show_progress=True)
            else:
                group_encs = encoder.encode_first_paragraphs(group,
                                                             batch_size=group_batch,
                                                             show_progress=True)
            name2enc.update({
                named_pars[group_ids[i]][0]: enc for i, enc in enumerate(group_encs)
            })

        doc_encs_handler.save_multiple_documents(name2enc)
def get_resource_loader(self) -> ResourceLoader:
    """Return a new `ResourceLoader` constructed with no arguments."""
    loader = ResourceLoader()
    return loader
def get_resource_loader(self):
    """
    Return a new `ResourceLoader` built around `self.get_pruned_word_vecs`.

    The attribute is handed over as-is (it is not called here) -- presumably the loader
    invokes it later to obtain the word vectors; verify against `ResourceLoader`.
    """
    pruned_vecs_source = self.get_pruned_word_vecs
    return ResourceLoader(pruned_vecs_source)
def tokenize_words(text):
    """Tokenize `text` with the global `PROCESS_TOK` tokenizer and return its words."""
    global PROCESS_TOK
    tokens = PROCESS_TOK.tokenize(text)
    return tokens.words()


def tokenize_sentences(sentences):
    """Tokenize every sentence into its words; an empty string yields an empty list."""
    global PROCESS_TOK
    tokenized = []
    for sentence in sentences:
        if sentence != '':
            tokenized.append(PROCESS_TOK.tokenize(sentence).words())
        else:
            tokenized.append([])
    return tokenized


print("Loading TF-IDF...")
tfidf_ranker = TfidfDocRanker()
db = DocDB()
loader = ResourceLoader()
# loader = HotpotQuestions().get_resource_loader()

# The encoder vocabulary is every word with a count in the text or in the titles.
word_counts = load_counts(
    join(LOCAL_DATA_DIR, 'hotpot', 'wiki_word_counts.txt'))
title_counts = load_counts(
    join(LOCAL_DATA_DIR, 'hotpot', 'wiki_title_word_counts.txt'))
word_counts.update(title_counts)
voc = set(word_counts.keys())

print("Loading encoder...")
spec = QuestionAndParagraphsSpec(
    batch_size=None,
    max_num_contexts=2,
    max_num_question_words=None,
    max_num_context_words=None,
)
encoder = SentenceEncoderIterativeModel(
    model_dir_path=args.encoder_model,
    vocabulary=voc,
    spec=spec,
    loader=loader,
    use_char_inputs=False,
    use_ema=not args.no_ema,
    checkpoint=args.checkpoint,
)
def build_openqa_iterative_top_titles(
        base_dir, questions_file, docs_file, encodings_dir, encoder_model,
        k1_list: List[int], k2_list: List[int], n1_list: List[int], n2_list: List[int],
        evaluate: bool, reformulate_from_text: bool, use_ema: bool, checkpoint: str,
        safety_mult: int):
    """
    Run iterative retrieval for every combination of the given n1/n2/k1/k2 values.

    Each combination's questions (without their `'top_titles'`) are dumped as json to
    `{base_dir}/n2-{n2}/n1-{n1}/n1-{n1}_n2-{n2}_k1-{k1}_k2-{k2}.json`.

    :param base_dir: root directory of the output tree
    :param questions_file: json list of questions with `'qid'`, `'question'`, `'top_titles'`
    :param docs_file: json file indexed as `documents[title][paragraph number]`; when None,
        paragraph texts are read from `DocDB` instead
    :param encodings_dir: passed to `init_encoding_handler` in every encodings worker
    :param encoder_model: model directory handed to `SentenceEncoderIterativeModel`
    :param k1_list: values of k1 to try
    :param k2_list: values of k2 to try
    :param n1_list: values of n1 to try
    :param n2_list: values of n2 to try
    :param evaluate: if True, `eval_questions` is called on every combination's output
    :param reformulate_from_text: forwarded to `iterative_retrieval`
    :param use_ema: forwarded to the encoder
    :param checkpoint: forwarded to the encoder
    :param safety_mult: forwarded to `iterative_retrieval`
    """
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    if docs_file is not None:
        with open(docs_file, 'r') as f:
            documents = json.load(f)
    else:
        docs_db = DocDB()
    print(f'Done, took {time.time()-s} seconds.')

    if n1_list is not None and n2_list is not None:
        # No combination looks past the largest n, so the remaining titles are dropped.
        max_titles = max(max(n1_list), max(n2_list))
        for q in questions:
            q['top_titles'] = q['top_titles'][:max_titles]

    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])
    try:
        for tok_q in tqdm(workers.imap_unordered(tokenize_question, tupled_questions),
                          total=len(tupled_questions)):
            qid2tokenized.update(tok_q)
    finally:
        workers.close()
        workers.join()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        if docs_file is not None:
            # The paragraph number is only meaningful for the json documents.
            par_num = int(par_name.split('_')[-1])
            return documents[par_title][par_num]
        return ' '.join(docs_db.get_doc_sentences(par_title))

    # Setup worker pool
    workers = ProcessPool(16, initializer=init_encoding_handler, initargs=[encodings_dir])
    try:
        print("Loading encoder...")
        spec = QuestionAndParagraphsSpec(batch_size=None,
                                         max_num_contexts=2,
                                         max_num_question_words=None,
                                         max_num_context_words=None)
        encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model,
                                                vocabulary=voc,
                                                spec=spec,
                                                loader=ResourceLoader(),
                                                use_char_inputs=False,
                                                use_ema=use_ema,
                                                checkpoint=checkpoint)

        print("Encoding questions...")
        q_original_encodings = encoder.encode_text_questions(
            [qid2tokenized[q['qid']] for q in questions],
            return_search_vectors=False,
            show_progress=True)
        q_search_encodings = encoder.question_rep_to_search_vector(
            question_encodings=q_original_encodings)

        init()  # for initializing the tokenizer

        total_num = len(n1_list) * len(n2_list) * len(k1_list) * len(k2_list)
        print("Calculating similarities...")
        for n1, n2, k1, k2 in tqdm(itertools.product(n1_list, n2_list, k1_list, k2_list),
                                   total=total_num,
                                   ncols=80):
            questions = iterative_retrieval(encoder, questions, qid2tokenized,
                                            q_search_encodings, workers, parname_to_text,
                                            reformulate_from_text, n1, n2, k1, k2, safety_mult)
            dir_path = os.path.join(base_dir, f"n2-{n2}", f"n1-{n1}")
            os.makedirs(dir_path, exist_ok=True)
            out_file = os.path.join(dir_path, f"n1-{n1}_n2-{n2}_k1-{k1}_k2-{k2}.json")

            # Strip the titles from a copy only: the next combination still needs them.
            questions_copy = deepcopy(questions)
            for question in questions_copy:
                question.pop('top_titles')
            with open(out_file, 'w') as f:
                json.dump(questions_copy, f)

            if evaluate:
                eval_questions(questions_copy)
    finally:
        # The encodings pool was previously never shut down.
        workers.close()
        workers.join()
def _sample_deterministically(items, key, n):
    """
    Return `n` items drawn by a seed-0 shuffle of `items` sorted by `key`.

    Sorting first makes the draw independent of the incoming order. `RandomState.shuffle`
    works in place and returns None, so the sorted copy has to be kept in a variable for the
    shuffle to have any effect on the sample.
    """
    ordered = sorted(items, key=key)
    np.random.RandomState(0).shuffle(ordered)
    return ordered[:n]


def main():
    """
    Evaluate a model on ranked Hotpot paragraph pairs and save per-sample results.

    Depending on `--corpus`, the ranked pairs come from the precomputed distractors/gold
    dataset, from a Hotpot-format json file ranked here with tf-idf, or from a retrieval
    output file. Outside test mode a score table is printed and a csv is written to `output`;
    in test mode a prediction file is written there instead.
    """
    parser = argparse.ArgumentParser(description='Full ranking evaluation on Hotpot')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument(
        'output',
        type=str,
        help="Store the per-paragraph results in csv format in this file, "
        "or the json prediction if in test mode")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument(
        '-b', '--batch_size', type=int, default=64,
        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument(
        '-s', '--step', default=None,
        help="Weights to load, can be a checkpoint step or 'latest'")
    parser.add_argument('-a', '--answer_bound', type=int, default=8,
                        help="Max answer span length")
    parser.add_argument('-c', '--corpus',
                        choices=[
                            "distractors", "gold", "hotpot_file", "retrieval_file",
                            "top_titles"
                        ],
                        default="distractors")
    parser.add_argument('-t', '--tokens', type=int, default=None,
                        help="Max tokens per a paragraph")
    parser.add_argument('--input_file', type=str, default=None)
    parser.add_argument('--docs_file', type=str, default=None)
    parser.add_argument('--num_workers', type=int, default=16,
                        help='Number of workers for tokenizing')
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument('--no_sp', action="store_true",
                        help="Don't predict supporting facts")
    parser.add_argument('--test_mode', action='store_true',
                        help="produce a prediction file, no answers given")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)
    batcher = ClusteredBatcher(args.batch_size, multiple_contexts_len, truncate_batches=True)
    loader = ResourceLoader()

    if args.corpus not in {"distractors", "gold"} and args.input_file is None:
        raise ValueError("Must pass an input file if not using precomputed dataset")

    if args.corpus in {"distractors", "gold"} and args.test_mode:
        raise ValueError("Test mode not available in 'distractors' or 'gold' mode")

    if args.corpus in {"distractors", "gold"}:
        corpus = HotpotQuestions()
        loader = corpus.get_resource_loader()
        questions = corpus.get_dev()

        question_preprocessor = HotpotTextLengthPreprocessorWithSpans(args.tokens)
        # Preprocess every question once and drop the ones the preprocessor rejects.
        preprocessed = (question_preprocessor.preprocess(x) for x in questions)
        questions = [q for q in preprocessed if q is not None]

        if args.sample_questions:
            questions = _sample_deterministically(questions,
                                                  key=lambda x: x.question_id,
                                                  n=args.sample_questions)

        data = HotpotFullQADistractorsDataset(questions, batcher)
        gold_idxs = set(data.gold_idxs)
        if args.corpus == 'gold':
            data.samples = [data.samples[i] for i in data.gold_idxs]
            # NOTE(review): `gold_idxs` was taken before this filtering, while the positions
            # compared against it below index the filtered list -- confirm the gold-rank
            # statistics are meaningful in 'gold' mode.

        qid2samples = {}
        qid2idx = {}
        for i, sample in enumerate(data.samples):
            key = sample.question_id
            if key in qid2samples:
                qid2samples[key].append(sample)
                qid2idx[key].append(i)
            else:
                qid2samples[key] = [sample]
                qid2idx[key] = [i]

        questions = []
        print("Ranking pairs...")
        gold_ranks = []
        for qid, samples in tqdm(qid2samples.items()):
            question = " ".join(samples[0].question)
            pars = [" ".join(x.paragraphs[0]) for x in samples]
            ranks = get_paragraph_ranks(question, pars)
            for sample, rank, idx in zip(samples, ranks, qid2idx[qid]):
                questions.append(
                    RankedQAPair(question=sample.question,
                                 paragraphs=sample.paragraphs,
                                 spans=np.zeros((0, 2), dtype=np.int32),
                                 question_id=sample.question_id,
                                 answer=sample.answer,
                                 rank=rank,
                                 q_type=sample.q_type,
                                 sentence_segments=sample.sentence_segments))
                if idx in gold_idxs:
                    gold_ranks.append(rank + 1)
        print(f"Mean rank: {np.mean(gold_ranks)}")
        ranks_counter = Counter(gold_ranks)
        for i in range(1, 11):
            print(f"Hits at {i}: {ranks_counter[i]}")

    elif args.corpus == 'hotpot_file':
        # a hotpot json format input file. We rank the pairs with tf-idf
        with open(args.input_file, 'r') as f:
            hotpot_data = json.load(f)
        if args.sample_questions:
            hotpot_data = _sample_deterministically(hotpot_data,
                                                    key=lambda x: x['_id'],
                                                    n=args.sample_questions)
        title2sentences = {
            context[0]: context[1]
            for q in hotpot_data for context in q['context']
        }
        question_tok_texts = tokenize_texts([q['question'] for q in hotpot_data],
                                            num_workers=args.num_workers)
        sentences_tok = tokenize_texts(list(title2sentences.values()),
                                       num_workers=args.num_workers,
                                       sentences=True)
        if args.tokens is not None:
            sentences_tok = [truncate_paragraph(p, args.tokens) for p in sentences_tok]
        title2tok_sents = dict(zip(title2sentences.keys(), sentences_tok))

        questions = []
        for idx, question in enumerate(tqdm(hotpot_data, desc='tf-idf ranking')):
            q_titles = [title for title, _ in question['context']]
            # Every unordered pair of the question's context paragraphs is a candidate.
            par_pairs = [(title1, title2)
                         for i, title1 in enumerate(q_titles)
                         for title2 in q_titles[i + 1:]]
            if len(par_pairs) == 0:
                continue
            ranks = get_paragraph_ranks(question['question'], [
                ' '.join(title2sentences[t1] + title2sentences[t2])
                for t1, t2 in par_pairs
            ])
            for rank, par_pair in zip(ranks, par_pairs):
                sent_tok_pair = title2tok_sents[par_pair[0]] + title2tok_sents[par_pair[1]]
                sentence_segments, _ = get_segments_from_sentences_fix_sup(
                    sent_tok_pair, np.zeros(0))
                missing_sent_idx = [[
                    i for i, sent in enumerate(title2tok_sents[title]) if len(sent) == 0
                ] for title in par_pair]
                questions.append(
                    RankedQAPair(
                        question=question_tok_texts[idx],
                        paragraphs=[flatten_iterable(sent_tok_pair)],
                        spans=np.zeros((0, 2), dtype=np.int32),
                        question_id=question['_id'],
                        answer='noanswer' if args.test_mode else question['answer'],
                        rank=rank,
                        q_type='null' if args.test_mode else question['type'],
                        sentence_segments=[sentence_segments],
                        par_titles_num_sents=[
                            (title,
                             sum(1 for sent in title2tok_sents[title] if len(sent) > 0))
                            for title in par_pair
                        ],
                        missing_sent_idxs=missing_sent_idx,
                        true_sp=[] if args.test_mode else question['supporting_facts']))

    elif args.corpus == 'retrieval_file' or args.corpus == 'top_titles':
        if args.docs_file is None:
            print("Using DB documents")
            doc_db = DocDB(config.DOC_DB, full_docs=False)
        else:
            with open(args.docs_file, 'r') as f:
                docs = json.load(f)
        with open(args.input_file, 'r') as f:
            retrieval_data = json.load(f)
        if args.sample_questions:
            retrieval_data = _sample_deterministically(retrieval_data,
                                                       key=lambda x: x['qid'],
                                                       n=args.sample_questions)

        def parname_to_text(par_name):
            par_title = par_name_to_title(par_name)
            par_num = int(par_name.split('_')[-1])
            if args.docs_file is None:
                return doc_db.get_doc_sentences(par_title)
            return docs[par_title][par_num]

        if args.corpus == 'top_titles':
            print("Top TF-IDF!")
            for q in retrieval_data:
                top_titles = q['top_titles'][:10]
                q['paragraph_pairs'] = [(title1 + '_0', title2 + '_0')
                                        for i, title1 in enumerate(top_titles)
                                        for title2 in top_titles[i + 1:]]

        question_tok_texts = tokenize_texts([q['question'] for q in retrieval_data],
                                            num_workers=args.num_workers)
        all_parnames = list(
            set([
                parname for q in retrieval_data for pair in q['paragraph_pairs']
                for parname in pair
            ]))
        texts_tok = tokenize_texts([parname_to_text(x) for x in all_parnames],
                                   num_workers=args.num_workers,
                                   sentences=True)
        if args.tokens is not None:
            texts_tok = [truncate_paragraph(p, args.tokens) for p in texts_tok]
        parname2tok_text = dict(zip(all_parnames, texts_tok))

        questions = []
        for idx, question in enumerate(retrieval_data):
            # The position of a pair in the retrieval output is its rank.
            for rank, par_pair in enumerate(question['paragraph_pairs']):
                tok_pair = parname2tok_text[par_pair[0]] + parname2tok_text[par_pair[1]]
                sentence_segments, _ = get_segments_from_sentences_fix_sup(
                    tok_pair, np.zeros(0))
                missing_sent_idx = [[
                    i for i, sent in enumerate(parname2tok_text[parname]) if len(sent) == 0
                ] for parname in par_pair]
                questions.append(
                    RankedQAPair(
                        question=question_tok_texts[idx],
                        paragraphs=[flatten_iterable(tok_pair)],
                        spans=np.zeros((0, 2), dtype=np.int32),
                        question_id=question['qid'],
                        answer='noanswer' if args.test_mode else question['answers'][0],
                        rank=rank,
                        q_type='null' if args.test_mode else question['type'],
                        sentence_segments=[sentence_segments],
                        par_titles_num_sents=[
                            (par_name_to_title(parname),
                             sum(1 for sent in parname2tok_text[parname] if len(sent) > 0))
                            for parname in par_pair
                        ],
                        missing_sent_idxs=missing_sent_idx,
                        true_sp=[] if args.test_mode else question['supporting_facts']))
    else:
        raise NotImplementedError()

    data = DummyDataset(questions, batcher)
    evaluators = [
        RecordHotpotQAPrediction(args.answer_bound, True, sp_prediction=not args.no_sp)
    ]

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        # Without an explicit step, prefer the best weights over the latest checkpoint.
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()

    evaluation = trainer.test(model, evaluators, {args.corpus: data}, loader, checkpoint,
                              not args.no_ema, 10)[args.corpus]

    print("Saving result")
    output_file = args.output

    df = pd.DataFrame(evaluation.per_sample)

    df.sort_values(["question_id", "rank"], inplace=True, ascending=True)
    group_by = ["question_id"]

    def get_ranked_scores(score_name):
        # "Cp"/"Br" in the score name restrict the rows to comparison/bridge questions.
        if "Cp" in score_name:
            filtered_df = df[df.type == 'comparison']
        elif "Br" in score_name:
            filtered_df = df[df.type == 'bridge']
        else:
            filtered_df = df
        if 'joint' in score_name:
            target_prefix = 'joint'
        elif 'sp' in score_name:
            target_prefix = 'sp'
        else:
            target_prefix = 'text'
        target_score = f"{target_prefix}_{'em' if 'EM' in score_name else 'f1'}"
        return compute_ranked_scores_with_yes_no(
            filtered_df,
            span_q_col="span_question_scores",
            yes_no_q_col="yes_no_question_scores",
            yes_no_scores_col="yes_no_confidence_scores",
            span_scores_col="predicted_score",
            span_target_score=target_score,
            group_cols=group_by)

    if not args.test_mode:
        score_names = ["EM", "F1", "Br EM", "Br F1", "Cp EM", "Cp F1"]
        if not args.no_sp:
            score_names.extend([
                f"{prefix} {name}" for prefix in ['sp', 'joint'] for name in score_names
            ])

        table = [["N Paragraphs"] + score_names]
        scores = [get_ranked_scores(score_name) for score_name in score_names]
        # One table row per number of paragraphs, one column per score.
        table += list([str(i + 1), *["%.4f" % x for x in score_vals]]
                      for i, score_vals in enumerate(zip(*scores)))
        print_table(table)

        df.to_csv(output_file, index=False)
    else:
        df_to_pred(df, output_file)