def get_spec(self):
    batch_size = self.batcher.get_fixed_batch_size()
    num_contexts = 1
    max_q_words = max(len(q.question) for q in self.questions)
    max_c_words = max(max(c.num_tokens for c in (q.distractors + [q.paragraph])) for q in self.questions)
    return QuestionAndParagraphsSpec(batch_size=batch_size, max_num_contexts=num_contexts,
                                     max_num_question_words=max_q_words, max_num_context_words=max_c_words)
def get_spec(self):
    batch_size = self.batcher.get_fixed_batch_size()
    num_contexts = 1
    max_q_words = max(len(q.question) for q in self.questions)
    max_c_words = max(max(c.num_tokens for c in d.paragraphs) for d in self.title2doc.values())
    return QuestionAndParagraphsSpec(batch_size=batch_size, max_num_contexts=num_contexts,
                                     max_num_question_words=max_q_words, max_num_context_words=max_c_words)
def get_spec(self):
    batch_size = self.batcher.get_fixed_batch_size()
    num_contexts = 1
    max_q_words = max(len(q.question) for q in self.questions)
    max_c_words = max(multiple_contexts_len(q) for q in self.questions)
    return QuestionAndParagraphsSpec(batch_size=batch_size, max_num_contexts=num_contexts,
                                     max_num_question_words=max_q_words, max_num_context_words=max_c_words)
def encode_all_squad(encoder_model: str):
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    train = corpus.get_train()
    dev = corpus.get_dev()

    spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=1,
                                     max_num_question_words=None, max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model, vocabulary=voc, spec=spec,
                                           loader=corpus.get_resource_loader())

    for questions, title2doc in [(train, corpus.train_title_to_document), (dev, corpus.dev_title_to_document)]:
        print(f"Starting encoding of {'train' if questions == train else 'dev'}")
        # eliminating distractors not from original squad
        title2max = {key: max(x.paragraph.par_id for x in group)
                     for key, group in itertools.groupby(sorted(questions, key=lambda x: x.paragraph.doc_title),
                                                         key=lambda x: x.paragraph.doc_title)}
        for title in title2max:
            title2doc[title].paragraphs = title2doc[title].paragraphs[:title2max[title] + 1]
        for title in tqdm(title2max):
            np.savez_compressed(get_filename(questions == train, title),
                                **{str(k): v for k, v in encode_document(encoder, title2doc[title]).items()})
def squad_build_drqa_doc_encodings(out_dir, encoder_model, num_workers, all_squad=False):
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()
    if all_squad:
        questions.extend(corpus.get_train())
    # docs = corpus.dev_title_to_document.values() if dev else corpus.train_title_to_document.values()
    relevant_titles = list(set([q.paragraph.doc_title for q in questions]))

    conn = sqlite3.connect(DRQA_DOC_DB)
    c = conn.cursor()
    titles = list(set([q.paragraph.doc_title for q in questions]))
    for i, t in enumerate(titles):
        # Had to manually resolve this (due to changes in Wikipedia?)
        if t == "Sky (United Kingdom)":
            titles[i] = "Sky UK"
    title_to_doc_id = {t1: t2 for t1, t2 in zip(titles, relevant_titles)}
    c.execute("CREATE TEMPORARY TABLE squad_docs(id)")
    c.executemany("INSERT INTO squad_docs VALUES (?)", [(x,) for x in titles])
    c.execute("SELECT id, text FROM documents WHERE id IN squad_docs")
    out = c.fetchall()
    conn.close()

    out = [(title_to_doc_id[title], text) for title, text in out]

    spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=1,
                                     max_num_question_words=None, max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model, vocabulary=voc, spec=spec,
                                           loader=ResourceLoader())

    # Setup worker pool
    workers = ProcessPool(num_workers, initializer=init, initargs=[])

    documents = {}
    tokenized_documents = {}
    print("Tokenizing...")
    with tqdm(total=len(out)) as pbar:
        for doc, tok_doc in tqdm(workers.imap_unordered(get_document_paragraphs, out)):
            documents.update(doc)
            tokenized_documents.update(tok_doc)
            pbar.update()

    encodings = {}
    print("Encoding...")
    for title, paragraphs in tqdm(tokenized_documents.items()):
        dummy_question = "Hello Hello".split()
        model_paragraphs = [BinaryQuestionAndParagraphs(question=dummy_question, paragraphs=[x],
                                                        label=1, num_distractors=0, question_id='dummy')
                            for x in paragraphs]
        encodings.update({f"{title}_{i}": rep
                          for i, rep in enumerate(encoder.encode_paragraphs(model_paragraphs))})

    with open(join(out_dir, 'docs.json'), 'w') as f:
        json.dump(documents, f)

    np.savez_compressed(join(out_dir, 'encodings.npz'), **encodings)
def build_doc_eval_file(out_file, encodings_dir, encoder_model, k, per_doc=True):
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()

    spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=1,
                                     max_num_question_words=None, max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model, vocabulary=voc, spec=spec,
                                           loader=corpus.get_resource_loader())

    par_encs = np.load(join(encodings_dir, 'encodings.npz'))
    with open(join(encodings_dir, 'docs.json'), 'r') as f:
        documents = json.load(f)

    questions_eval_format = []
    questions = sorted(questions, key=lambda x: x.paragraph.doc_title)
    if per_doc:
        title2par_encs = {}
        for p_name, rep in par_encs.items():
            title = '_'.join(p_name.split('_')[:-1])
            if title in title2par_encs:
                title2par_encs[title].update({p_name: rep})
            else:
                title2par_encs[title] = {p_name: rep}
        for title, doc_qs in tqdm(itertools.groupby(questions, key=lambda x: x.paragraph.doc_title)):
            doc_qs = list(doc_qs)
            q_encodings = encode_squad.encode_questions(encoder, doc_qs)
            par2ids = {}
            reps = []
            total_sentences = 0
            for p_name, rep in title2par_encs[title].items():
                par2ids[p_name] = list(range(total_sentences, total_sentences + len(rep)))
                reps.append(rep)
                total_sentences += len(rep)
            id2par = {i: p for p, ids in par2ids.items() for i in ids}
            reps = np.concatenate(reps, axis=0)
            top_k = simple_numpy_knn(q_encodings, reps, k * 2)
            for idx, question in enumerate(doc_qs):
                seen = set()
                p_names = [id2par[x] for x in top_k[idx]
                           if not (id2par[x] in seen or seen.add(id2par[x]))][:k]
                questions_eval_format.append({
                    'qid': question.question_id,
                    'question': ' '.join(question.question),
                    'answers': list(question.answers),
                    'paragraphs': [documents['_'.join(p_name.split('_')[:-1])][int(p_name.split('_')[-1])]
                                   for p_name in p_names]})
    else:
        print("encoding questions")
        q_encodings = encode_squad.encode_questions(encoder, questions)
        par2ids = {}
        reps = []
        total_sentences = 0
        for p_name, rep in par_encs.items():
            par2ids[p_name] = list(range(total_sentences, total_sentences + len(rep)))
            reps.append(rep)
            total_sentences += len(rep)
        id2par = {i: p for p, ids in par2ids.items() for i in ids}
        reps = np.concatenate(reps, axis=0)
        print("scoring")
        top_k = simple_numpy_knn(q_encodings, reps, k * 2)
        for idx, question in enumerate(questions):
            seen = set()
            p_names = [id2par[x] for x in top_k[idx]
                       if not (id2par[x] in seen or seen.add(id2par[x]))][:k]
            questions_eval_format.append({
                'qid': question.question_id,
                'question': ' '.join(question.question),
                'answers': list(question.answers),
                'paragraphs': [documents['_'.join(p_name.split('_')[:-1])][int(p_name.split('_')[-1])]
                               for p_name in p_names]})

    with open(out_file, 'w') as f:
        json.dump(questions_eval_format, f)
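# The ranked-deduplication idiom used above -- `x in seen or seen.add(x)` -- keeps the first
# occurrence of each paragraph id while preserving the kNN ranking order, because set.add()
# returns None (falsy). A minimal standalone sketch of the same idiom; the helper name is
# hypothetical and not used elsewhere in this module:
def _dedup_preserving_order(ranked_items):
    seen = set()
    return [x for x in ranked_items if not (x in seen or seen.add(x))]

# e.g. _dedup_preserving_order(['p2', 'p1', 'p2', 'p3']) -> ['p2', 'p1', 'p3']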
def build_openqa_top_titles(out_file, questions_file, docs_file, encodings_dir, encoder_model, k,
                            use_ema: bool, checkpoint: str, safety_mult: int, n_titles: int):
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')

    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])
    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in tqdm(workers.imap_unordered(tokenize_question, tupled_questions)):
            qid2tokenized.update(tok_q)
            pbar.update()
    workers.close()
    workers.join()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    all_titles = list(set([title for q in questions for title in q['top_titles']]))

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print(f"Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(32, initializer=init_encoding_handler, initargs=[encodings_dir])
    # title2encs = {}
    # title2idx2par_name = {}
    # with tqdm(total=len(all_titles)) as pbar:
    #     for t2enc, t2id2p in tqdm(workers.imap_unordered(get_title_mappings_from_saver, all_titles)):
    #         title2encs.update(t2enc)
    #         title2idx2par_name.update(t2id2p)
    #         pbar.update()

    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=1,
                                     max_num_question_words=None, max_num_context_words=None)
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model, vocabulary=voc, spec=spec,
                                           loader=ResourceLoader(), use_char_inputs=False,
                                           use_ema=use_ema, checkpoint=checkpoint)

    print("Encoding questions...")
    q_encodings = encoder.encode_text_questions([qid2tokenized[q['qid']] for q in questions],
                                                return_search_vectors=True, show_progress=True)

    print("Calculating similarities...")
    for idx, question in tqdm(enumerate(questions), total=len(questions), ncols=80):
        q_titles = question['top_titles']
        if n_titles is not None:
            q_titles = q_titles[:n_titles]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        q_enc = q_encodings[idx]
        title2ids = {}
        reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in q_titles:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(range(total_sentences, total_sentences + len(rep)))
            reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        reps = np.concatenate(reps, axis=0)
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), reps, k * safety_mult)[0]

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][rep_id - titles_offset_dict[id2title[rep_id]]]

        seen = set()
        p_names = [id_to_par_name(x) for x in top_k
                   if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))][:k]
        question['paragraphs'] = [parname_to_text(p_name) for p_name in p_names]

    with open(out_file, 'w') as f:
        json.dump(questions, f)
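# `simple_numpy_knn(queries, keys, k)` is used above to pick the k highest-scoring paragraph
# vectors per question vector. The sketch below shows the assumed behaviour (dot-product
# scores, indices returned best-first); it is an illustration under that assumption, not the
# project's actual implementation.
def _dot_product_knn_sketch(queries, keys, k):
    scores = queries @ keys.T  # (num_queries, num_keys) similarity matrix
    k = min(k, scores.shape[1])
    # argpartition gives an unordered top-k slice; sort it by descending score afterwards
    top = np.argpartition(-scores, kth=k - 1, axis=1)[:, :k]
    rows = np.arange(scores.shape[0])[:, None]
    order = np.argsort(-scores[rows, top], axis=1)
    return top[rows, order]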
def build_openqa_iterative_top_titles(out_file, questions_file, docs_file, encodings_dir, encoder_model,
                                      k1, k2, n1, n2, evaluate: bool, reformulate_from_text: bool,
                                      use_ema: bool, checkpoint: str):
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')

    if n1 is not None and n2 is not None:
        for q in questions:
            q['top_titles'] = q['top_titles'][:max(n1, n2)]

    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])
    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in tqdm(workers.imap_unordered(tokenize_question, tupled_questions)):
            qid2tokenized.update(tok_q)
            pbar.update()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    all_titles = list(set([title for q in questions for title in q['top_titles']]))

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print(f"Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(32, initializer=init_encoding_handler, initargs=[encodings_dir])
    title2encs = {}
    title2idx2par_name = {}
    with tqdm(total=len(all_titles)) as pbar:
        for t2enc, t2id2p in tqdm(workers.imap_unordered(get_title_mappings_from_saver, all_titles)):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
            pbar.update()
    title2par_name2idxs = {}
    for title, id2par in title2idx2par_name.items():
        par2idxs = {}
        for idx, parname in id2par.items():
            if parname in par2idxs:
                par2idxs[parname].append(idx)
            else:
                par2idxs[parname] = [idx]
        title2par_name2idxs[title] = {par: sorted(idxs) for par, idxs in par2idxs.items()}

    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=2,
                                     max_num_question_words=None, max_num_context_words=None)
    encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model, vocabulary=voc, spec=spec,
                                            loader=ResourceLoader(), use_char_inputs=False,
                                            use_ema=use_ema, checkpoint=checkpoint)

    print("Encoding questions...")
    q_original_encodings = encoder.encode_text_questions([qid2tokenized[q['qid']] for q in questions],
                                                         return_search_vectors=False, show_progress=True)
    q_search_encodings = encoder.question_rep_to_search_vector(question_encodings=q_original_encodings)

    init()  # for initializing the tokenizer

    print("Calculating similarities...")
    for idx, question in tqdm(enumerate(questions), total=len(questions)):
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n1]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)
        q_enc = q_search_encodings[idx]
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps, k1 * 2)[0]

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][rep_id - titles_offset_dict[id2title[rep_id]]]

        seen = set()
        p_names = [id_to_par_name(x) for x in top_k
                   if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))][:k1]

        iteration1_paragraphs = [
            title2encs[par_name_to_title(pname)][title2par_name2idxs[par_name_to_title(pname)][pname], :]
            for pname in p_names]
        if not reformulate_from_text:
            reformulations = encoder.reformulate_questions(
                questions_rep=np.tile(q_original_encodings[idx], reps=(len(p_names), 1)),
                paragraphs_rep=iteration1_paragraphs,
                return_search_vectors=True)
        else:
            tok_q = tokenize(question['question']).words()
            par_texts = [tokenize(parname_to_text(pname)).words() for pname in p_names]
            reformulations = encoder.reformulate_questions_from_texts(
                tokenized_questions=[tok_q for _ in range(len(par_texts))],
                tokenized_pars=par_texts,
                return_search_vectors=True)

        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n2]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        top_k_second = numpy_global_knn(reformulations, all_par_reps, k2 * k1)

        seen = set()
        final_p_name_pairs = [(p_names[x1], id_to_par_name(x2)) for x1, x2 in top_k_second
                              if not ((p_names[x1], id_to_par_name(x2)) in seen
                                      or seen.add((p_names[x1], id_to_par_name(x2))))][:k2]
        # important to note that in the iterative dataset the paragraphs of each question are in pairs
        question['paragraph_pairs'] = final_p_name_pairs

    with open(out_file, 'w') as f:
        json.dump(questions, f)

    if evaluate:
        eval_questions(questions)
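# `numpy_global_knn(reformulations, all_par_reps, k)` is assumed to rank (reformulation, paragraph)
# pairs globally rather than per reformulation, so one strong first-iteration paragraph can
# contribute several second-iteration candidates. A rough sketch of that assumed contract
# (illustrative only, not the project's actual implementation):
def _global_knn_sketch(queries, keys, k):
    scores = (queries @ keys.T).ravel()  # flatten the (num_queries, num_keys) score matrix
    best = np.argsort(-scores)[:k]       # indices of the k best (query, key) pairs overall
    num_keys = keys.shape[0]
    return [(i // num_keys, i % num_keys) for i in best]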
def encode_from_file(docs_file, questions_file, encodings_dir, encoder_model, num_workers, hotpot: bool,
                     long_batch: int, short_batch: int, use_chars: bool, use_ema: bool, checkpoint: str,
                     document_chunk_size=1000, samples=None, encode_all_db=False):
    """
    :param docs_file: path to a json file whose structure is [{title: list of paragraphs}, ...]
    :param encodings_dir: directory in which to dump the per-document encodings
    :return:
    """
    doc_encs_handler = DocumentEncodingHandler(encodings_dir)
    # Setup worker pool
    workers = ProcessPool(num_workers, initializer=init, initargs=[])

    if docs_file is not None:
        with open(docs_file, 'r') as f:
            documents = json.load(f)
        documents = {k: v for k, v in documents.items() if k not in doc_encs_handler.titles2filenames}

        tokenized_documents = {}
        tupled_doc_list = [(title, pars) for title, pars in documents.items()]

        if samples is not None:
            print(f"sampling {samples} samples")
            tupled_doc_list = tupled_doc_list[:samples]

        print("Tokenizing from file...")
        with tqdm(total=len(tupled_doc_list), ncols=80) as pbar:
            for tok_doc in tqdm(workers.imap_unordered(tokenize_document, tupled_doc_list)):
                tokenized_documents.update(tok_doc)
                pbar.update()
    else:
        if questions_file is not None:
            with open(questions_file, 'r') as f:
                questions = json.load(f)
            all_titles = list(set([title for q in questions for title in q['top_titles']]))
        else:
            print("encoding all DB!")
            all_titles = DocDB().get_doc_titles()

        if samples is not None:
            print(f"sampling {samples} samples")
            all_titles = all_titles[:samples]

        all_titles = [t for t in all_titles if t not in doc_encs_handler.titles2filenames]
        tokenized_documents = {}

        print("Tokenizing from DB...")
        with tqdm(total=len(all_titles), ncols=80) as pbar:
            for tok_doc in tqdm(workers.imap_unordered(tokenize_from_db, all_titles)):
                tokenized_documents.update(tok_doc)
                pbar.update()

    workers.close()
    workers.join()

    voc = set()
    for paragraphs in tokenized_documents.values():
        for par in paragraphs:
            voc.update(par)

    if not hotpot:
        spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=1,
                                         max_num_question_words=None, max_num_context_words=None)
        encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model, vocabulary=voc, spec=spec,
                                               loader=ResourceLoader(), use_char_inputs=use_chars,
                                               use_ema=use_ema, checkpoint=checkpoint)
    else:
        spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=2,
                                         max_num_question_words=None, max_num_context_words=None)
        encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model, vocabulary=voc, spec=spec,
                                                loader=ResourceLoader(), use_char_inputs=use_chars,
                                                use_ema=use_ema, checkpoint=checkpoint)

    tokenized_documents_items = list(tokenized_documents.items())
    for tokenized_doc_chunk in tqdm([tokenized_documents_items[i:i + document_chunk_size]
                                     for i in range(0, len(tokenized_documents_items), document_chunk_size)],
                                    ncols=80):
        flattened_pars_with_names = [(f"{title}_{i}", par)
                                     for title, pars in tokenized_doc_chunk
                                     for i, par in enumerate(pars)]

        # filtering out empty paragraphs (probably had some short string the tokenization removed)
        # important to notice that the filtered paragraphs will have no representation,
        # but they still exist in the numbering of paragraphs for consistency with the docs.
        flattened_pars_with_names = [(name, par) for name, par in flattened_pars_with_names if len(par) > 0]

        # sort such that longer paragraphs are first to identify OOMs early on
        flattened_pars_with_names = sorted(flattened_pars_with_names, key=lambda x: len(x[1]), reverse=True)
        long_paragraphs_ids = [i for i, name_par in enumerate(flattened_pars_with_names)
                               if len(name_par[1]) >= 900]
        short_paragraphs_ids = [i for i, name_par in enumerate(flattened_pars_with_names)
                                if len(name_par[1]) < 900]

        # print(f"Encoding {len(flattened_pars_with_names)} paragraphs...")
        name2enc = {}
        dummy_question = "Hello Hello".split()
        if not hotpot:
            model_paragraphs = [BinaryQuestionAndParagraphs(question=dummy_question, paragraphs=[x],
                                                            label=1, num_distractors=0, question_id='dummy')
                                for _, x in flattened_pars_with_names]
        else:
            # todo: allow precomputed sentence segments
            model_paragraphs = [IterativeQuestionAndParagraphs(question=dummy_question,
                                                               paragraphs=[x, dummy_question],
                                                               first_label=1, second_label=1,
                                                               question_id='dummy',
                                                               sentence_segments=None)
                                for _, x in flattened_pars_with_names]

        # print("Encoding long paragraphs...")
        long_pars = [model_paragraphs[i] for i in long_paragraphs_ids]
        name2enc.update({flattened_pars_with_names[long_paragraphs_ids[i]][0]: enc
                         for i, enc in enumerate(encoder.encode_paragraphs(long_pars, batch_size=long_batch,
                                                                           show_progress=True)
                                                 if not hotpot else
                                                 encoder.encode_first_paragraphs(long_pars,
                                                                                 batch_size=long_batch,
                                                                                 show_progress=True))})

        # print("Encoding short paragraphs...")
        short_pars = [model_paragraphs[i] for i in short_paragraphs_ids]
        name2enc.update({flattened_pars_with_names[short_paragraphs_ids[i]][0]: enc
                         for i, enc in enumerate(encoder.encode_paragraphs(short_pars, batch_size=short_batch,
                                                                           show_progress=True)
                                                 if not hotpot else
                                                 encoder.encode_first_paragraphs(short_pars,
                                                                                 batch_size=short_batch,
                                                                                 show_progress=True))})

        doc_encs_handler.save_multiple_documents(name2enc)
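# Example invocation of encode_from_file. All paths, batch sizes, and the checkpoint value here
# are hypothetical placeholders; adjust them to the actual data layout and available GPU memory:
# encode_from_file(docs_file='data/docs.json', questions_file=None, encodings_dir='encodings/',
#                  encoder_model='models/sentence_encoder', num_workers=8, hotpot=True,
#                  long_batch=8, short_batch=64, use_chars=False, use_ema=True, checkpoint='best')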
print("Loading TF-IDF...") tfidf_ranker = TfidfDocRanker() db = DocDB() loader = ResourceLoader() # loader = HotpotQuestions().get_resource_loader() word_counts = load_counts(join(LOCAL_DATA_DIR, 'hotpot', 'wiki_word_counts.txt')) title_counts = load_counts(join(LOCAL_DATA_DIR, 'hotpot', 'wiki_title_word_counts.txt')) word_counts.update(title_counts) voc = set(word_counts.keys()) print("Loading encoder...") spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=2, max_num_question_words=None, max_num_context_words=None) encoder = SentenceEncoderIterativeModel(model_dir_path=args.encoder_model, vocabulary=voc, spec=spec, loader=loader, use_char_inputs=False, use_ema=not args.no_ema, checkpoint=args.checkpoint) print("Loading QA model...") evaluators = [RecordHotpotQAPrediction(15, True, sp_prediction=True, disable_tqdm=True)] batcher = ClusteredBatcher(64, multiple_contexts_len, truncate_batches=True) qa_model_dir = ModelDir(args.qa_model) checkpoint = None if checkpoint == 'best': checkpoint = qa_model_dir.get_best_weights() if checkpoint is not None: print("Using best weights") else:
def build_openqa_iterative_top_titles(base_dir, questions_file, docs_file, encodings_dir, encoder_model,
                                      k1_list: List[int], k2_list: List[int], n1_list: List[int],
                                      n2_list: List[int], evaluate: bool, reformulate_from_text: bool,
                                      use_ema: bool, checkpoint: str, safety_mult: int):
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    if docs_file is not None:
        with open(docs_file, 'r') as f:
            documents = json.load(f)
    else:
        docs_db = DocDB()
    print(f'Done, took {time.time()-s} seconds.')

    if n1_list is not None and n2_list is not None:
        for q in questions:
            q['top_titles'] = q['top_titles'][:max(max(n1_list), max(n2_list))]

    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])
    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in tqdm(workers.imap_unordered(tokenize_question, tupled_questions)):
            qid2tokenized.update(tok_q)
            pbar.update()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    workers.close()
    workers.join()

    # all_titles = list(set([title for q in questions for title in q['top_titles']]))

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        if docs_file is not None:
            return documents[par_title][par_num]
        return ' '.join(docs_db.get_doc_sentences(par_title))

    # print(f"Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(16, initializer=init_encoding_handler, initargs=[encodings_dir])
    # title2encs = {}
    # title2idx2par_name = {}
    # with tqdm(total=len(all_titles)) as pbar:
    #     for t2enc, t2id2p in tqdm(workers.imap_unordered(get_title_mappings_from_saver, all_titles)):
    #         title2encs.update(t2enc)
    #         title2idx2par_name.update(t2id2p)
    #         pbar.update()
    # title2par_name2idxs = {}
    # for title, id2par in title2idx2par_name.items():
    #     par2idxs = {}
    #     for idx, parname in id2par.items():
    #         if parname in par2idxs:
    #             par2idxs[parname].append(idx)
    #         else:
    #             par2idxs[parname] = [idx]
    #     title2par_name2idxs[title] = {par: sorted(idxs) for par, idxs in par2idxs.items()}

    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=2,
                                     max_num_question_words=None, max_num_context_words=None)
    encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model, vocabulary=voc, spec=spec,
                                            loader=ResourceLoader(), use_char_inputs=False,
                                            use_ema=use_ema, checkpoint=checkpoint)

    print("Encoding questions...")
    q_original_encodings = encoder.encode_text_questions([qid2tokenized[q['qid']] for q in questions],
                                                         return_search_vectors=False, show_progress=True)
    q_search_encodings = encoder.question_rep_to_search_vector(question_encodings=q_original_encodings)

    init()  # for initializing the tokenizer

    total_num = len(n1_list) * len(n2_list) * len(k1_list) * len(k2_list)
    print("Calculating similarities...")
    for n1, n2, k1, k2 in tqdm(itertools.product(n1_list, n2_list, k1_list, k2_list),
                               total=total_num, ncols=80):
        questions = iterative_retrieval(encoder, questions, qid2tokenized, q_search_encodings, workers,
                                        parname_to_text, reformulate_from_text, n1, n2, k1, k2, safety_mult)
        dir_path = os.path.join(base_dir, f"n2-{n2}", f"n1-{n1}")
        os.makedirs(dir_path, exist_ok=True)
        out_file = os.path.join(dir_path, f"n1-{n1}_n2-{n2}_k1-{k1}_k2-{k2}.json")
        questions_copy = deepcopy(questions)
        for question in questions_copy:
            question.pop('top_titles')
        with open(out_file, 'w') as f:
            json.dump(questions_copy, f)
        if evaluate:
            eval_questions(questions_copy)