def build_doc_eval_file(out_file, encodings_dir, encoder_model, k, per_doc=True):
    """Build a JSON evaluation file of top-k retrieved paragraphs for SQuAD dev questions.

    Loads precomputed paragraph sentence-encodings ('encodings.npz') and paragraph
    texts ('docs.json') from ``encodings_dir``, encodes the dev questions with a
    single-context sentence encoder, ranks sentences by kNN, deduplicates to
    paragraph level, and dumps the per-question top-k paragraphs to ``out_file``.

    :param out_file: path of the output JSON file.
    :param encodings_dir: directory holding 'encodings.npz' and 'docs.json'.
    :param encoder_model: directory of the trained encoder model.
    :param k: number of paragraphs to keep per question.
    :param per_doc: if True, each question is ranked only against paragraphs of
        its own document; otherwise against all paragraphs at once.
    """
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()
    spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=1,
                                    max_num_question_words=None,
                                    max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc, spec=spec,
                                           loader=corpus.get_resource_loader())
    # Mapping from paragraph name to a per-sentence encoding matrix
    # (one row per sentence, judging by the len()/concatenate usage below).
    par_encs = np.load(join(encodings_dir, 'encodings.npz'))
    with open(join(encodings_dir, 'docs.json'), 'r') as f:
        documents = json.load(f)
    questions_eval_format = []
    # Sort by document title so that itertools.groupby below groups correctly.
    questions = sorted(questions, key=lambda x: x.paragraph.doc_title)
    if per_doc:
        # Group paragraph encodings by document title. A paragraph name is
        # '<title>_<par_num>', so the title is everything before the last '_'.
        title2par_encs = {}
        for p_name, rep in par_encs.items():
            title = '_'.join(p_name.split('_')[:-1])
            if title in title2par_encs:
                title2par_encs[title].update({p_name: rep})
            else:
                title2par_encs[title] = {p_name: rep}
        for title, doc_qs in tqdm(
                itertools.groupby(questions, key=lambda x: x.paragraph.doc_title)):
            doc_qs = list(doc_qs)
            q_encodings = encode_squad.encode_questions(encoder, doc_qs)
            # Stack this document's paragraph reps into one matrix, remembering
            # which global row ids belong to which paragraph.
            par2ids = {}
            reps = []
            total_sentences = 0
            for p_name, rep in title2par_encs[title].items():
                par2ids[p_name] = list(
                    range(total_sentences, total_sentences + len(rep)))
                reps.append(rep)
                total_sentences += len(rep)
            id2par = {i: p for p, ids in par2ids.items() for i in ids}
            reps = np.concatenate(reps, axis=0)
            # Over-retrieve 2*k sentence hits so paragraph-level dedup below is
            # still likely to leave k distinct paragraphs.
            top_k = simple_numpy_knn(q_encodings, reps, k * 2)
            for idx, question in enumerate(doc_qs):
                seen = set()
                # Dedup trick: set.add returns None (falsy), so this condition
                # both filters already-seen paragraphs and records new ones.
                p_names = [
                    id2par[x] for x in top_k[idx]
                    if not (id2par[x] in seen or seen.add(id2par[x]))
                ][:k]
                questions_eval_format.append({
                    'qid': question.question_id,
                    'question': ' '.join(question.question),
                    'answers': list(question.answers),
                    'paragraphs': [
                        documents['_'.join(p_name.split('_')[:-1])][int(
                            p_name.split('_')[-1])] for p_name in p_names
                    ]
                })
    else:
        # Rank every question against all paragraphs in the corpus at once.
        print("encoding questions")
        q_encodings = encode_squad.encode_questions(encoder, questions)
        par2ids = {}
        reps = []
        total_sentences = 0
        for p_name, rep in par_encs.items():
            par2ids[p_name] = list(
                range(total_sentences, total_sentences + len(rep)))
            reps.append(rep)
            total_sentences += len(rep)
        id2par = {i: p for p, ids in par2ids.items() for i in ids}
        reps = np.concatenate(reps, axis=0)
        print("scoring")
        top_k = simple_numpy_knn(q_encodings, reps, k * 2)
        for idx, question in enumerate(questions):
            seen = set()
            # Same dedup-while-filtering trick as in the per_doc branch.
            p_names = [
                id2par[x] for x in top_k[idx]
                if not (id2par[x] in seen or seen.add(id2par[x]))
            ][:k]
            questions_eval_format.append({
                'qid': question.question_id,
                'question': ' '.join(question.question),
                'answers': list(question.answers),
                'paragraphs': [
                    documents['_'.join(p_name.split('_')[:-1])][int(
                        p_name.split('_')[-1])] for p_name in p_names
                ]
            })
    with open(out_file, 'w') as f:
        json.dump(questions_eval_format, f)
def build_openqa_top_titles(out_file, questions_file, docs_file, encodings_dir,
                            encoder_model, k, use_ema: bool, checkpoint: str,
                            safety_mult: int, n_titles: int):
    """Retrieve the top-k paragraph texts per open-domain question.

    Each question in ``questions_file`` carries a precomputed 'top_titles'
    list. For every question, the sentence encodings of (up to ``n_titles``)
    of those titles are gathered from ``encodings_dir``, the question's
    search vector is matched against all sentences by kNN, and the texts of
    the k best deduplicated paragraphs are stored under the question's
    'paragraphs' key. The updated question list is dumped to ``out_file``.

    :param out_file: output JSON path for the augmented questions.
    :param questions_file: JSON list of questions with 'qid', 'question',
        'top_titles'.
    :param docs_file: JSON mapping title -> list of paragraph texts.
    :param encodings_dir: directory served by ``init_encoding_handler``.
    :param encoder_model: directory of the trained encoder model.
    :param k: number of paragraphs to keep per question.
    :param use_ema: load EMA weights in the encoder.
    :param checkpoint: which checkpoint to load.
    :param safety_mult: over-retrieval factor before paragraph dedup.
    :param n_titles: limit on titles per question (None = use all).
    """
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')
    # Tokenize all questions in a worker pool, then release it.
    workers = ProcessPool(16, initializer=init, initargs=[])
    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in tqdm(
                workers.imap_unordered(tokenize_question, tupled_questions)):
            qid2tokenized.update(tok_q)
            pbar.update()
    workers.close()
    workers.join()
    # The encoder vocabulary is the union of all question tokens.
    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    def parname_to_text(par_name):
        # A paragraph name is '<title>_<par_num>'.
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print(f"Gathering documents...")
    # Pool that serves per-title encodings and paragraph-name mappings.
    workers = ProcessPool(32,
                          initializer=init_encoding_handler,
                          initargs=[encodings_dir])
    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=1,
                                    max_num_question_words=None,
                                    max_num_context_words=None)
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=ResourceLoader(),
                                           use_char_inputs=False,
                                           use_ema=use_ema,
                                           checkpoint=checkpoint)
    print("Encoding questions...")
    q_encodings = encoder.encode_text_questions(
        [qid2tokenized[q['qid']] for q in questions],
        return_search_vectors=True,
        show_progress=True)
    print("Calculating similarities...")
    for idx, question in tqdm(enumerate(questions), total=len(questions),
                              ncols=80):
        q_titles = question['top_titles']
        if n_titles is not None:
            q_titles = q_titles[:n_titles]
        # Fetch encodings and id->paragraph-name mappings for this question's
        # titles only (lazy, instead of preloading every title).
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        q_enc = q_encodings[idx]
        # Stack all sentence reps; remember each title's row offset so a global
        # row id can be mapped back to (title, local sentence index).
        title2ids = {}
        reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in q_titles:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        reps = np.concatenate(reps, axis=0)
        # Over-retrieve by safety_mult so paragraph dedup still leaves k.
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), reps,
                                 k * safety_mult)[0]

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        seen = set()
        # set.add returns None (falsy): the condition filters already-seen
        # paragraphs and records new ones in one pass.
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k]
        question['paragraphs'] = [
            parname_to_text(p_name) for p_name in p_names
        ]
    # Fix: release the encoding-handler pool as well (previously only the
    # tokenizer pool was closed, leaking 32 worker processes).
    workers.close()
    workers.join()
    with open(out_file, 'w') as f:
        json.dump(questions, f)
def build_openqa_iterative_top_titles(out_file, questions_file, docs_file,
                                      encodings_dir, encoder_model, k1, k2, n1,
                                      n2, evaluate: bool,
                                      reformulate_from_text: bool,
                                      use_ema: bool, checkpoint: str):
    """Two-hop (iterative) paragraph retrieval for open-domain questions.

    Hop 1: for each question, retrieve the k1 nearest paragraphs among its top
    n1 titles. Hop 2: reformulate the question once per retrieved paragraph
    (from paragraph encodings, or from raw text when
    ``reformulate_from_text``), then retrieve the k2 best (hop-1 paragraph,
    hop-2 paragraph) pairs among the top n2 titles. Pairs are stored under
    each question's 'paragraph_pairs' key and dumped to ``out_file``.

    :param k1: paragraphs kept after the first hop.
    :param k2: paragraph pairs kept after the second hop.
    :param n1: titles considered in the first hop.
    :param n2: titles considered in the second hop.
    :param evaluate: if True, run ``eval_questions`` on the result.
    """
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')
    if n1 is not None and n2 is not None:
        # Both hops only ever look at the first max(n1, n2) titles.
        for q in questions:
            q['top_titles'] = q['top_titles'][:max(n1, n2)]
    # Setup worker pool
    # NOTE(review): neither this pool nor the 32-worker pool below is
    # closed/joined before the function returns — confirm whether the leak
    # is intentional (e.g. process exits right after).
    workers = ProcessPool(16, initializer=init, initargs=[])
    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in tqdm(
                workers.imap_unordered(tokenize_question, tupled_questions)):
            qid2tokenized.update(tok_q)
            pbar.update()
    # Encoder vocabulary = union of all question tokens.
    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)
    all_titles = list(
        set([title for q in questions for title in q['top_titles']]))

    def parname_to_text(par_name):
        # A paragraph name is '<title>_<par_num>'.
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print(f"Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(32,
                          initializer=init_encoding_handler,
                          initargs=[encodings_dir])
    # Preload the encodings and id->paragraph-name mapping of every title
    # referenced by any question.
    title2encs = {}
    title2idx2par_name = {}
    with tqdm(total=len(all_titles)) as pbar:
        for t2enc, t2id2p in tqdm(
                workers.imap_unordered(get_title_mappings_from_saver,
                                       all_titles)):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
            pbar.update()
    # Invert id->par_name into par_name->sorted row indices, per title (used
    # to slice a paragraph's sentence rows out of a title's encoding matrix).
    title2par_name2idxs = {}
    for title, id2par in title2idx2par_name.items():
        par2idxs = {}
        for idx, parname in id2par.items():
            if parname in par2idxs:
                par2idxs[parname].append(idx)
            else:
                par2idxs[parname] = [idx]
        title2par_name2idxs[title] = {
            par: sorted(idxs) for par, idxs in par2idxs.items()
        }
    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=2,
                                    max_num_question_words=None,
                                    max_num_context_words=None)
    encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model,
                                            vocabulary=voc,
                                            spec=spec,
                                            loader=ResourceLoader(),
                                            use_char_inputs=False,
                                            use_ema=use_ema,
                                            checkpoint=checkpoint)
    print("Encoding questions...")
    q_original_encodings = encoder.encode_text_questions(
        [qid2tokenized[q['qid']] for q in questions],
        return_search_vectors=False, show_progress=True)
    q_search_encodings = encoder.question_rep_to_search_vector(
        question_encodings=q_original_encodings)
    init()  # for initializing the tokenizer
    print("Calculating similarities...")
    for idx, question in tqdm(enumerate(questions), total=len(questions)):
        # --- Hop 1: rank sentences of the top-n1 titles. ---
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n1]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)
        q_enc = q_search_encodings[idx]
        # Over-retrieve 2*k1 so paragraph dedup still leaves k1.
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps,
                                 k1 * 2)[0]

        def id_to_par_name(rep_id):
            # Closure over id2title/titles_offset_dict: note these variables
            # are REBOUND before the second hop below, so the same closure
            # resolves against hop-2 data there.
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        seen = set()
        # set.add returns None: filters seen paragraphs and records new ones.
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k1]
        # Sentence-encoding rows of each hop-1 paragraph.
        iteration1_paragraphs = \
            [title2encs[par_name_to_title(pname)][title2par_name2idxs[par_name_to_title(pname)][pname], :]
             for pname in p_names]
        if not reformulate_from_text:
            reformulations = encoder.reformulate_questions(
                questions_rep=np.tile(q_original_encodings[idx],
                                      reps=(len(p_names), 1)),
                paragraphs_rep=iteration1_paragraphs,
                return_search_vectors=True)
        else:
            tok_q = tokenize(question['question']).words()
            par_texts = [
                tokenize(parname_to_text(pname)).words() for pname in p_names
            ]
            reformulations = encoder.reformulate_questions_from_texts(
                tokenized_questions=[tok_q for _ in range(len(par_texts))],
                tokenized_pars=par_texts,
                return_search_vectors=True)
        # --- Hop 2: rank sentences of the top-n2 titles against each
        # reformulated question. ---
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n2]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)
        # Global kNN over all (reformulation, sentence) pairs; x1 indexes the
        # hop-1 paragraph, x2 the hop-2 sentence row.
        top_k_second = numpy_global_knn(reformulations, all_par_reps, k2 * k1)
        seen = set()
        final_p_name_pairs = [
            (p_names[x1], id_to_par_name(x2)) for x1, x2 in top_k_second
            if not ((p_names[x1], id_to_par_name(x2)) in seen or seen.add(
                (p_names[x1], id_to_par_name(x2))))
        ][:k2]
        # important to note that in the iterative dataset the paragraphs of each question are in pairs
        question['paragraph_pairs'] = final_p_name_pairs
    with open(out_file, 'w') as f:
        json.dump(questions, f)
    if evaluate:
        eval_questions(questions)
def iterative_retrieval(encoder, questions, qid2tokenized, q_search_encodings,
                        workers, parname_to_text, reformulate_from_text, n1,
                        n2, k1, k2, safety_mult, disable_tqdm=False):
    """Batched two-hop retrieval: hop-1 paragraphs per question, then one
    batched reformulation pass over all (question, paragraph) pairs, then
    hop-2 paragraph pairs.

    Stores the k2 best (hop-1 paragraph, hop-2 paragraph) name pairs under
    each question's 'paragraph_pairs' key and returns the mutated question
    list. Only the text-reformulation path is implemented
    (``reformulate_from_text`` must be True).

    :param encoder: iterative encoder providing
        ``reformulate_questions_from_texts``.
    :param questions: list of question dicts with 'qid' and 'top_titles'.
    :param qid2tokenized: qid -> tokenized question words.
    :param q_search_encodings: hop-1 search vectors, aligned with questions.
    :param workers: pool serving ``get_title_mappings_from_saver`` /
        ``tokenize``.
    :param parname_to_text: callable paragraph-name -> paragraph text.
    :param n1, n2: titles considered in hop 1 / hop 2.
    :param k1, k2: paragraphs kept after hop 1 / pairs kept after hop 2.
    :param safety_mult: over-retrieval factor before dedup.
    :param disable_tqdm: silence progress bars.
    :raises NotImplementedError: when ``reformulate_from_text`` is False.
    """
    # --- Hop 1: pick k1 paragraph names per question. ---
    questions_top_ks = []
    for q_idx, question in tqdm(enumerate(questions), total=len(questions),
                                ncols=80,
                                desc=f"n1: {n1}-{n2}-{k1}-{k2}",
                                disable=disable_tqdm):
        q_titles = question['top_titles'][:n1]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        # (Dead-code fix: the par_name->row-indices inversion previously built
        # here was never read in this batch version and has been removed.)
        # Stack sentence reps of all titles; remember each title's row offset
        # so a global row id maps back to (title, local sentence index).
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in q_titles:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        q_enc = q_search_encodings[q_idx]
        # Over-retrieve by safety_mult so paragraph dedup still leaves k1.
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps,
                                 k1 * safety_mult)[0]
        seen = set()
        # set.add returns None: filters seen paragraphs, records new ones.
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k1]
        questions_top_ks.append(p_names)
    if not reformulate_from_text:
        # The encoding-based path was not tested in this batch version.
        raise NotImplementedError()
    # --- Batched reformulation of every (question, hop-1 paragraph) pair. ---
    tok_qs = [qid2tokenized[q['qid']] for q in questions]
    par_texts = [
        x.words() for x in workers.imap(tokenize, [
            parname_to_text(pname) for p_names in questions_top_ks
            for pname in p_names
        ])
    ]
    # End index of each question's pair range in the flattened pair list.
    pnames_end_idxs = list(
        itertools.accumulate([len(x) for x in questions_top_ks]))
    q_with_p = list(
        zip([
            tok_qs[idx] for idx, pnames in enumerate(questions_top_ks)
            for _ in pnames
        ], par_texts))
    # Tag each pair with its original index so the order can be restored.
    q_with_p = [(x, i) for i, x in enumerate(q_with_p)]
    # Sort longest-paragraph-first so similarly-sized inputs are batched
    # together (long inputs get a small max_batch below).
    sorted_q_with_p = sorted(q_with_p,
                             key=lambda x: (len(x[0][1]), len(x[0][0]), x[1]),
                             reverse=True)
    sorted_qs, sorted_ps = zip(*[x for x, _ in sorted_q_with_p])
    # Paragraphs of >= 900 tokens are encoded with a much smaller batch size
    # to bound memory; the rest use large batches.
    last_long_index = max(
        [i for i, x in enumerate(sorted_ps) if len(x) >= 900] + [-1])
    if last_long_index != -1:
        reformulations_long = encoder.reformulate_questions_from_texts(
            tokenized_questions=sorted_qs[:last_long_index + 1],
            tokenized_pars=sorted_ps[:last_long_index + 1],
            return_search_vectors=True,
            show_progress=not disable_tqdm,
            max_batch=8)
        reformulations_short = encoder.reformulate_questions_from_texts(
            tokenized_questions=sorted_qs[last_long_index + 1:],
            tokenized_pars=sorted_ps[last_long_index + 1:],
            return_search_vectors=True,
            show_progress=not disable_tqdm,
            max_batch=128)
        reformulations = np.concatenate(
            [reformulations_long, reformulations_short], axis=0)
    else:
        reformulations = encoder.reformulate_questions_from_texts(
            tokenized_questions=sorted_qs,
            tokenized_pars=sorted_ps,
            return_search_vectors=True,
            show_progress=not disable_tqdm,
            max_batch=128)
    # Undo the length sort: argsort over the stored original indices yields
    # the permutation that restores the flattened pair order.
    reformulations = reformulations[np.argsort(
        [i for _, i in sorted_q_with_p])]
    pnames_end_idxs = [0] + pnames_end_idxs
    reformulations_per_question = [
        reformulations[pnames_end_idxs[i]:pnames_end_idxs[i + 1]]
        for i in range(len(questions))
    ]
    # --- Hop 2: pick k2 paragraph pairs per question. ---
    for q_idx, question in tqdm(enumerate(questions), total=len(questions),
                                ncols=80,
                                desc=f"n2: {n1}-{n2}-{k1}-{k2}",
                                disable=disable_tqdm):
        q_titles = question['top_titles'][:n2]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        # (Dead-code fix: unused par_name->row-indices inversion removed here
        # as well.)
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in q_titles:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        # Global kNN over all (reformulation, sentence) pairs; x1 indexes the
        # hop-1 paragraph, x2 the hop-2 sentence row.
        top_k_second = numpy_global_knn(reformulations_per_question[q_idx],
                                        all_par_reps, k2 * safety_mult)
        seen = set()
        p_names = questions_top_ks[q_idx]
        final_p_name_pairs = [
            (p_names[x1], id_to_par_name(x2)) for x1, x2 in top_k_second
            if not ((p_names[x1], id_to_par_name(x2)) in seen or seen.add(
                (p_names[x1], id_to_par_name(x2))))
        ][:k2]
        # important to note that in the iterative dataset the paragraphs of each question are in pairs
        question['paragraph_pairs'] = final_p_name_pairs
    return questions
def initial_retrieval(encoder, workers, questions: List, k1: int, n1: int,
                      safety_mult: int = 1):
    """Single-hop retrieval: attach each question's top-k1 paragraph titles.

    Tokenizes and encodes the questions, ranks the sentences of each
    question's top-n1 titles by kNN against the question's search vector,
    deduplicates to paragraph level, and stores the resulting titles as
    1-tuples under the question's 'top_pars_titles' key. Returns the mutated
    question list.

    :param encoder: encoder providing ``encode_text_questions``.
    :param workers: pool serving ``tokenize`` and
        ``get_title_mappings_from_saver``.
    :param questions: list of question dicts with 'question', 'top_titles'.
    :param k1: number of paragraphs kept per question.
    :param n1: number of top titles considered per question.
    :param safety_mult: over-retrieval factor before paragraph dedup.
    """
    tokenized_qs = [
        tok_q.words()
        for tok_q in workers.imap(tokenize, [q['question'] for q in questions])
    ]
    q_search_encodings = encoder.encode_text_questions(
        tokenized_qs, return_search_vectors=True, show_progress=False)
    for q_idx, question in enumerate(questions):
        q_titles = question['top_titles'][:n1]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        # (Dead-code fix: the par_name->row-indices inversion previously built
        # here was never read by this function and has been removed.)
        # Stack sentence reps of all titles; remember each title's row offset
        # so a global row id maps back to (title, local sentence index).
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in q_titles:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        q_enc = q_search_encodings[q_idx]
        # Over-retrieve by safety_mult so paragraph dedup still leaves k1.
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps,
                                 k1 * safety_mult)[0]
        seen = set()
        # set.add returns None: filters seen paragraphs, records new ones.
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k1]
        # Stored as 1-tuples of titles (not paragraph names).
        question['top_pars_titles'] = [(par_name_to_title(p), )
                                       for p in p_names]
    return questions
def iterative_retrieval(encoder, questions, q_original_encodings,
                        q_search_encodings, workers, parname_to_text,
                        reformulate_from_text, n1, n2, k1, k2, safety_mult):
    """Per-question two-hop retrieval with per-paragraph reformulation.

    NOTE(review): this module defines ``iterative_retrieval`` twice; this
    later per-question definition shadows the earlier batched one at import
    time — confirm which of the two is actually intended to be exported.

    For each question: (hop 1) rank sentences of the top-n1 titles against
    the question's search vector and keep k1 deduplicated paragraphs;
    reformulate the question once per kept paragraph (from paragraph
    encodings, or from raw text when ``reformulate_from_text``); (hop 2)
    rank sentences of the top-n2 titles against all reformulations and keep
    the k2 best (hop-1 paragraph, hop-2 paragraph) name pairs, stored under
    the question's 'paragraph_pairs' key. Returns the mutated question list.
    """
    for q_idx, question in tqdm(enumerate(questions), total=len(questions),
                                ncols=80, desc=f"{n1}-{n2}-{k1}-{k2}"):
        # Both hops only ever need the first max(n1, n2) titles.
        q_titles = question['top_titles'][:max(n1, n2)]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        # Invert id->par_name into par_name->sorted row indices, per title
        # (used below to slice a paragraph's sentence rows out of a title's
        # encoding matrix).
        title2par_name2idxs = {}
        for title, id2par in title2idx2par_name.items():
            par2idxs = {}
            for idx, parname in id2par.items():
                if parname in par2idxs:
                    par2idxs[parname].append(idx)
                else:
                    par2idxs[parname] = [idx]
            title2par_name2idxs[title] = {
                par: sorted(idxs) for par, idxs in par2idxs.items()
            }
        # --- Hop 1: stack sentence reps of the top-n1 titles; remember each
        # title's row offset so a global row id maps back to (title, local
        # sentence index). ---
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n1]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        def id_to_par_name(rep_id):
            # Closure over id2title/titles_offset_dict: these are REBOUND
            # before hop 2 below, so the same closure resolves against hop-2
            # data there.
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        q_enc = q_search_encodings[q_idx]
        # Over-retrieve by safety_mult so paragraph dedup still leaves k1.
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps,
                                 k1 * safety_mult)[0]
        seen = set()
        # set.add returns None: filters seen paragraphs, records new ones.
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k1]
        # Sentence-encoding rows of each hop-1 paragraph.
        iteration1_paragraphs = \
            [title2encs[par_name_to_title(pname)][title2par_name2idxs[par_name_to_title(pname)][pname], :]
             for pname in p_names]
        if not reformulate_from_text:
            reformulations = encoder.reformulate_questions(
                questions_rep=np.tile(q_original_encodings[q_idx],
                                      reps=(len(p_names), 1)),
                paragraphs_rep=iteration1_paragraphs,
                return_search_vectors=True)
        else:
            tok_q = tokenize(question['question']).words()
            par_texts = [
                tokenize(parname_to_text(pname)).words() for pname in p_names
            ]
            reformulations = encoder.reformulate_questions_from_texts(
                tokenized_questions=[tok_q for _ in range(len(par_texts))],
                tokenized_pars=par_texts,
                return_search_vectors=True
            )
        # --- Hop 2: same stacking over the top-n2 titles. ---
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n2]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)
        # Global kNN over all (reformulation, sentence) pairs; x1 indexes the
        # hop-1 paragraph, x2 the hop-2 sentence row.
        top_k_second = numpy_global_knn(reformulations, all_par_reps,
                                        k2 * safety_mult)
        seen = set()
        final_p_name_pairs = [
            (p_names[x1], id_to_par_name(x2)) for x1, x2 in top_k_second
            if not ((p_names[x1], id_to_par_name(x2)) in seen or seen.add(
                (p_names[x1], id_to_par_name(x2))))
        ][:k2]
        # important to note that in the iterative dataset the paragraphs of each question are in pairs
        question['paragraph_pairs'] = final_p_name_pairs
    return questions