Example #1
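These functions share one retrieval codebase. A plausible common import block, reconstructed from usage (the pool alias is an assumption; project-specific names such as SquadRelevanceCorpus, QuestionAndParagraphsSpec, SentenceEncoderSingleContext, SentenceEncoderIterativeModel, ResourceLoader, encode_squad, simple_numpy_knn, numpy_global_knn, par_name_to_title, get_title_mappings_from_saver, tokenize, tokenize_question, init, init_encoding_handler, and eval_questions would come from the surrounding package):

import itertools
import json
import time
from os.path import join
from typing import List

import numpy as np
from tqdm import tqdm
# The examples use a process pool named ProcessPool; the stdlib Pool is one
# plausible binding for it.
from multiprocessing import Pool as ProcessPool
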
def build_doc_eval_file(out_file,
                        encodings_dir,
                        encoder_model,
                        k,
                        per_doc=True):
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=corpus.get_resource_loader())

    par_encs = np.load(join(encodings_dir, 'encodings.npz'))
    with open(join(encodings_dir, 'docs.json'), 'r') as f:
        documents = json.load(f)

    questions_eval_format = []
    questions = sorted(questions, key=lambda x: x.paragraph.doc_title)
    if per_doc:
        title2par_encs = {}
        for p_name, rep in par_encs.items():
            title = '_'.join(p_name.split('_')[:-1])
            if title in title2par_encs:
                title2par_encs[title].update({p_name: rep})
            else:
                title2par_encs[title] = {p_name: rep}
        for title, doc_qs in tqdm(
                itertools.groupby(questions,
                                  key=lambda x: x.paragraph.doc_title)):
            doc_qs = list(doc_qs)
            q_encodings = encode_squad.encode_questions(encoder, doc_qs)
            par2ids = {}
            reps = []
            total_sentences = 0
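            # Each paragraph encodes to several sentence vectors; remember the
            # contiguous row range it occupies in the stacked matrix so a
            # nearest-neighbor hit can be mapped back to its paragraph.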
            for p_name, rep in title2par_encs[title].items():
                par2ids[p_name] = list(
                    range(total_sentences, total_sentences + len(rep)))
                reps.append(rep)
                total_sentences += len(rep)
            id2par = {i: p for p, ids in par2ids.items() for i in ids}
            reps = np.concatenate(reps, axis=0)
            top_k = simple_numpy_knn(q_encodings, reps, k * 2)
            for idx, question in enumerate(doc_qs):
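                # `seen.add` returns None, so this keeps only the first
                # (highest-ranked) hit per paragraph; k * 2 sentence hits are
                # fetched as a margin so that k distinct paragraphs usually
                # survive the de-duplication.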
                seen = set()
                p_names = [
                    id2par[x] for x in top_k[idx]
                    if not (id2par[x] in seen or seen.add(id2par[x]))
                ][:k]
                questions_eval_format.append({
                    'qid': question.question_id,
                    'question': ' '.join(question.question),
                    'answers': list(question.answers),
                    'paragraphs': [
                        documents['_'.join(p_name.split('_')[:-1])][int(
                            p_name.split('_')[-1])] for p_name in p_names
                    ]
                })
    else:
        print("encoding questions")
        q_encodings = encode_squad.encode_questions(encoder, questions)
        par2ids = {}
        reps = []
        total_sentences = 0
        for p_name, rep in par_encs.items():
            par2ids[p_name] = list(
                range(total_sentences, total_sentences + len(rep)))
            reps.append(rep)
            total_sentences += len(rep)
        id2par = {i: p for p, ids in par2ids.items() for i in ids}
        reps = np.concatenate(reps, axis=0)
        print("scoring")
        top_k = simple_numpy_knn(q_encodings, reps, k * 2)
        for idx, question in enumerate(questions):
            seen = set()
            p_names = [
                id2par[x] for x in top_k[idx]
                if not (id2par[x] in seen or seen.add(id2par[x]))
            ][:k]
            questions_eval_format.append({
                'qid': question.question_id,
                'question': ' '.join(question.question),
                'answers': list(question.answers),
                'paragraphs': [
                    documents['_'.join(p_name.split('_')[:-1])][int(
                        p_name.split('_')[-1])] for p_name in p_names
                ]
            })

    with open(out_file, 'w') as f:
        json.dump(questions_eval_format, f)
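
A hypothetical invocation (all paths and k are placeholders, not taken from the source):

build_doc_eval_file(out_file='squad_dev_doc_eval.json',
                    encodings_dir='encodings/squad_dev',
                    encoder_model='models/sentence_encoder',
                    k=10,
                    per_doc=True)
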
Example #2
def build_openqa_top_titles(out_file, questions_file, docs_file, encodings_dir,
                            encoder_model, k, use_ema: bool, checkpoint: str,
                            safety_mult: int, n_titles: int):
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')

    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])

    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in workers.imap_unordered(tokenize_question,
                                            tupled_questions):
            qid2tokenized.update(tok_q)
            pbar.update()

    workers.close()
    workers.join()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print(f"Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(32,
                          initializer=init_encoding_handler,
                          initargs=[encodings_dir])
    # Paragraph encodings are loaded lazily per question inside the loop
    # below, rather than preloading the mappings for every title up front.

    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=ResourceLoader(),
                                           use_char_inputs=False,
                                           use_ema=use_ema,
                                           checkpoint=checkpoint)

    print("Encoding questions...")
    q_encodings = encoder.encode_text_questions(
        [qid2tokenized[q['qid']] for q in questions],
        return_search_vectors=True,
        show_progress=True)

    print("Calculating similarities...")
    for idx, question in tqdm(enumerate(questions),
                              total=len(questions),
                              ncols=80):
        q_titles = question['top_titles']
        if n_titles is not None:
            q_titles = q_titles[:n_titles]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        q_enc = q_encodings[idx]
        title2ids = {}
        reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in q_titles:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        reps = np.concatenate(reps, axis=0)
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), reps,
                                 k * safety_mult)[0]

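        # Map a row of the stacked matrix back to a paragraph name: subtract
        # the owning title's offset to get a title-local sentence index, then
        # look it up in that title's idx -> par_name table.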
        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        seen = set()
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k]
        question['paragraphs'] = [
            parname_to_text(p_name) for p_name in p_names
        ]

    workers.close()
    workers.join()

    with open(out_file, 'w') as f:
        json.dump(questions, f)
Example #3
def build_openqa_iterative_top_titles(out_file, questions_file, docs_file,
                                      encodings_dir, encoder_model, k1, k2, n1,
                                      n2, evaluate: bool,
                                      reformulate_from_text: bool,
                                      use_ema: bool, checkpoint: str):
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')

    if n1 is not None and n2 is not None:
        for q in questions:
            q['top_titles'] = q['top_titles'][:max(n1, n2)]

    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])

    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in workers.imap_unordered(tokenize_question,
                                            tupled_questions):
            qid2tokenized.update(tok_q)
            pbar.update()
    workers.close()
    workers.join()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    all_titles = list(
        set([title for q in questions for title in q['top_titles']]))

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print(f"Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(32,
                          initializer=init_encoding_handler,
                          initargs=[encodings_dir])
    title2encs = {}
    title2idx2par_name = {}
    with tqdm(total=len(all_titles)) as pbar:
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, all_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
            pbar.update()
    title2par_name2idxs = {}
    for title, id2par in title2idx2par_name.items():
        par2idxs = {}
        for idx, parname in id2par.items():
            if parname in par2idxs:
                par2idxs[parname].append(idx)
            else:
                par2idxs[parname] = [idx]
        title2par_name2idxs[title] = {
            par: sorted(idxs)
            for par, idxs in par2idxs.items()
        }

    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=2,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model,
                                            vocabulary=voc,
                                            spec=spec,
                                            loader=ResourceLoader(),
                                            use_char_inputs=False,
                                            use_ema=use_ema,
                                            checkpoint=checkpoint)

    print("Encoding questions...")
    q_original_encodings = encoder.encode_text_questions(
        [qid2tokenized[q['qid']] for q in questions],
        return_search_vectors=False,
        show_progress=True)
    q_search_encodings = encoder.question_rep_to_search_vector(
        question_encodings=q_original_encodings)

    init()  # for initializing the tokenizer

    print("Calculating similarities...")
    for idx, question in tqdm(enumerate(questions), total=len(questions)):
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n1]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        q_enc = q_search_encodings[idx]
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps,
                                 k1 * 2)[0]

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        seen = set()
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k1]
        iteration1_paragraphs = [
            title2encs[par_name_to_title(pname)][
                title2par_name2idxs[par_name_to_title(pname)][pname], :]
            for pname in p_names
        ]
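        # Reformulate the question once per retrieved paragraph, either from
        # cached dense paragraph representations or from re-tokenized text.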
        if not reformulate_from_text:
            reformulations = encoder.reformulate_questions(
                questions_rep=np.tile(q_original_encodings[idx],
                                      reps=(len(p_names), 1)),
                paragraphs_rep=iteration1_paragraphs,
                return_search_vectors=True)
        else:
            tok_q = tokenize(question['question']).words()
            par_texts = [
                tokenize(parname_to_text(pname)).words() for pname in p_names
            ]
            reformulations = encoder.reformulate_questions_from_texts(
                tokenized_questions=[tok_q for _ in range(len(par_texts))],
                tokenized_pars=par_texts,
                return_search_vectors=True)

        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n2]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        top_k_second = numpy_global_knn(reformulations, all_par_reps, k2 * k1)
        seen = set()
        final_p_name_pairs = [
            (p_names[x1], id_to_par_name(x2)) for x1, x2 in top_k_second
            if not ((p_names[x1], id_to_par_name(x2)) in seen or seen.add(
                (p_names[x1], id_to_par_name(x2))))
        ][:k2]

        # Note: in the iterative dataset, each question's paragraphs come in pairs.
        question['paragraph_pairs'] = final_p_name_pairs

    workers.close()
    workers.join()

    with open(out_file, 'w') as f:
        json.dump(questions, f)

    if evaluate:
        eval_questions(questions)
Example #4
def iterative_retrieval(encoder,
                        questions,
                        qid2tokenized,
                        q_search_encodings,
                        workers,
                        parname_to_text,
                        reformulate_from_text,
                        n1,
                        n2,
                        k1,
                        k2,
                        safety_mult,
                        disable_tqdm=False):
    questions_top_ks = []
    for q_idx, question in tqdm(enumerate(questions),
                                total=len(questions),
                                ncols=80,
                                desc=f"n1: {n1}-{n2}-{k1}-{k2}",
                                disable=disable_tqdm):
        q_titles = question['top_titles'][:n1]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n1]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        q_enc = q_search_encodings[q_idx]
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps,
                                 k1 * safety_mult)[0]

        seen = set()
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k1]
        questions_top_ks.append(p_names)

    if not reformulate_from_text:  # the dense-representation path is untested in this batch version
        raise NotImplementedError()
    tok_qs = [qid2tokenized[q['qid']] for q in questions]
    par_texts = [
        x.words() for x in workers.imap(tokenize, [
            parname_to_text(pname) for p_names in questions_top_ks
            for pname in p_names
        ])
    ]
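    # Pair every question with each of its retrieved paragraphs, tag each pair
    # with its original position, and sort longest-paragraph-first so that
    # similarly sized inputs batch together during encoding.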
    pnames_end_idxs = list(
        itertools.accumulate([len(x) for x in questions_top_ks]))
    q_with_p = list(
        zip([
            tok_qs[idx] for idx, pnames in enumerate(questions_top_ks)
            for _ in pnames
        ], par_texts))
    q_with_p = [(x, i) for i, x in enumerate(q_with_p)]
    sorted_q_with_p = sorted(q_with_p,
                             key=lambda x: (len(x[0][1]), len(x[0][0]), x[1]),
                             reverse=True)
    sorted_qs, sorted_ps = zip(*[x for x, _ in sorted_q_with_p])
    last_long_index = max(
        [i for i, x in enumerate(sorted_ps) if len(x) >= 900] + [-1])
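    # sorted_ps is ordered longest-first, so everything up to last_long_index
    # has >= 900 paragraph tokens and gets a small max_batch to bound memory.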
    if last_long_index != -1:
        reformulations_long = encoder.reformulate_questions_from_texts(
            tokenized_questions=sorted_qs[:last_long_index + 1],
            tokenized_pars=sorted_ps[:last_long_index + 1],
            return_search_vectors=True,
            show_progress=not disable_tqdm,
            max_batch=8)
        reformulations_short = encoder.reformulate_questions_from_texts(
            tokenized_questions=sorted_qs[last_long_index + 1:],
            tokenized_pars=sorted_ps[last_long_index + 1:],
            return_search_vectors=True,
            show_progress=not disable_tqdm,
            max_batch=128)
        reformulations = np.concatenate(
            [reformulations_long, reformulations_short], axis=0)
    else:
        reformulations = encoder.reformulate_questions_from_texts(
            tokenized_questions=sorted_qs,
            tokenized_pars=sorted_ps,
            return_search_vectors=True,
            show_progress=not disable_tqdm,
            max_batch=128)
    reformulations = reformulations[np.argsort([i
                                                for _, i in sorted_q_with_p])]
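    # The argsort above undoes the length sort so reformulations line up with
    # questions_top_ks; pnames_end_idxs then splits them back per question.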
    pnames_end_idxs = [0] + pnames_end_idxs
    reformulations_per_question = [
        reformulations[pnames_end_idxs[i]:pnames_end_idxs[i + 1]]
        for i in range(len(questions))
    ]

    for q_idx, question in tqdm(enumerate(questions),
                                total=len(questions),
                                ncols=80,
                                desc=f"n2: {n1}-{n2}-{k1}-{k2}",
                                disable=disable_tqdm):
        q_titles = question['top_titles'][:n2]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n2]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        top_k_second = numpy_global_knn(reformulations_per_question[q_idx],
                                        all_par_reps, k2 * safety_mult)
        seen = set()
        p_names = questions_top_ks[q_idx]
        final_p_name_pairs = [
            (p_names[x1], id_to_par_name(x2)) for x1, x2 in top_k_second
            if not ((p_names[x1], id_to_par_name(x2)) in seen or seen.add(
                (p_names[x1], id_to_par_name(x2))))
        ][:k2]

        # Note: in the iterative dataset, each question's paragraphs come in pairs.
        question['paragraph_pairs'] = final_p_name_pairs
    return questions
Example #5
def initial_retrieval(encoder,
                      workers,
                      questions: List,
                      k1: int,
                      n1: int,
                      safety_mult: int = 1):
    tokenized_qs = [
        tok_q.words()
        for tok_q in workers.imap(tokenize, [q['question'] for q in questions])
    ]
    q_search_encodings = encoder.encode_text_questions(
        tokenized_qs, return_search_vectors=True, show_progress=False)
    for q_idx, question in enumerate(questions):
        q_titles = question['top_titles'][:n1]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n1]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        q_enc = q_search_encodings[q_idx]
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps,
                                 k1 * safety_mult)[0]

        seen = set()
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k1]
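        # Each retrieved paragraph's title is stored as a 1-tuple, presumably
        # to match a downstream format that expects tuples of titles.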
        question['top_pars_titles'] = [(par_name_to_title(p), )
                                       for p in p_names]
    return questions
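
The following is the per-question (non-batched) variant of iterative_retrieval; Example #4 above is the batched form, whose dense-representation path is marked untested.
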
def iterative_retrieval(encoder, questions, q_original_encodings, q_search_encodings, workers,
                        parname_to_text, reformulate_from_text, n1, n2, k1, k2, safety_mult):
    for q_idx, question in tqdm(enumerate(questions), total=len(questions), ncols=80, desc=f"{n1}-{n2}-{k1}-{k2}"):
        q_titles = question['top_titles'][:max(n1, n2)]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        title2par_name2idxs = {}
        for title, id2par in title2idx2par_name.items():
            par2idxs = {}
            for idx, parname in id2par.items():
                if parname in par2idxs:
                    par2idxs[parname].append(idx)
                else:
                    par2idxs[parname] = [idx]
            title2par_name2idxs[title] = {par: sorted(idxs) for par, idxs in par2idxs.items()}
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n1]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][rep_id - titles_offset_dict[id2title[rep_id]]]

        q_enc = q_search_encodings[q_idx]
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps, k1 * safety_mult)[0]

        seen = set()
        p_names = [id_to_par_name(x)
                   for x in top_k if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))][:k1]
        iteration1_paragraphs = [
            title2encs[par_name_to_title(pname)][
                title2par_name2idxs[par_name_to_title(pname)][pname], :]
            for pname in p_names
        ]
        if not reformulate_from_text:
            reformulations = encoder.reformulate_questions(questions_rep=np.tile(q_original_encodings[q_idx],
                                                                                 reps=(len(p_names), 1)),
                                                           paragraphs_rep=iteration1_paragraphs,
                                                           return_search_vectors=True)
        else:
            tok_q = tokenize(question['question']).words()
            par_texts = [tokenize(parname_to_text(pname)).words() for pname in p_names]
            reformulations = encoder.reformulate_questions_from_texts(
                tokenized_questions=[tok_q for _ in range(len(par_texts))],
                tokenized_pars=par_texts,
                return_search_vectors=True
            )

        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n2]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        top_k_second = numpy_global_knn(reformulations, all_par_reps, k2 * safety_mult)
        seen = set()
        final_p_name_pairs = [(p_names[x1], id_to_par_name(x2))
                              for x1, x2 in top_k_second
                              if not ((p_names[x1], id_to_par_name(x2)) in seen
                                      or seen.add((p_names[x1], id_to_par_name(x2))))][:k2]

        # Note: in the iterative dataset, each question's paragraphs come in pairs.
        question['paragraph_pairs'] = final_p_name_pairs
    return questions
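
A hypothetical end-to-end call of this variant, assuming an encoder, a worker pool, question encodings, and a questions list (each entry carrying 'qid', 'question', and 'top_titles') prepared as in the examples above; every value below is a placeholder:

questions = iterative_retrieval(encoder, questions, q_original_encodings,
                                q_search_encodings, workers, parname_to_text,
                                reformulate_from_text=True,
                                n1=32, n2=32, k1=8, k2=45, safety_mult=2)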