Example #1
def encode_questions(encoder: SentenceEncoderSingleContext,
                     questions: List[SquadQuestion]):
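    """Encode SQuAD questions into search vectors.

    A dummy paragraph is attached to each question because the model consumes
    (question, paragraphs) pairs even when only the question side is used.
    """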
    dummy_par = "Hello Hello".split()

    samples = [
        BinaryQuestionAndParagraphs(question=q.question,
                                    paragraphs=[dummy_par],
                                    label=1,
                                    num_distractors=0,
                                    question_id='dummy') for q in questions
    ]

    return encoder.encode_questions(samples, return_search_vectors=True)
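
A minimal usage sketch (hypothetical variable names; assumes an `encoder` and a list of `SquadQuestion` objects are already in hand, and reuses the `simple_numpy_knn` helper seen in the later examples):

# search vectors for each question, ranked against pre-computed paragraph encodings
question_vectors = encode_questions(encoder, dev_questions)
top_ids = simple_numpy_knn(question_vectors, paragraph_encodings, 10)
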
Example #2
def encode_document(encoder: SentenceEncoderSingleContext,
                    doc: SquadDocument) -> Dict[int, np.ndarray]:
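    """Encode every paragraph of `doc` (each paired with a dummy question) and
    return a mapping from paragraph id to its vector representation."""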
    dummy_question = "Hello Hello".split()

    paragraphs = [
        BinaryQuestionAndParagraphs(question=dummy_question,
                                    paragraphs=[x.par_text],
                                    label=1,
                                    num_distractors=0,
                                    question_id='dummy')
        for x in doc.paragraphs
    ]
    id_to_index = {x.par_id: idx for idx, x in enumerate(doc.paragraphs)}

    reps = encoder.encode_paragraphs(paragraphs)

    return {x.par_id: reps[id_to_index[x.par_id]] for x in doc.paragraphs}
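
A sketch of how the returned mapping can be persisted (hypothetical `encoder`/`doc` objects and filename; Example #3 below applies this same pattern with a per-title filename):

par_id_to_vec = encode_document(encoder, doc)
np.savez_compressed('some_title.npz', **{str(par_id): vec for par_id, vec in par_id_to_vec.items()})
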
Example #3
def encode_all_squad(encoder_model: str):
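    """Encode all SQuAD train and dev documents, saving one compressed .npz
    archive per document title (one entry per paragraph id)."""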
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    train = corpus.get_train()
    dev = corpus.get_dev()

    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()

    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=corpus.get_resource_loader())

    for questions, title2doc in [(train, corpus.train_title_to_document),
                                 (dev, corpus.dev_title_to_document)]:
        print(
            f"Starting encoding of {'train' if questions == train else 'dev'}")
        # eliminating distractors not from original squad
        title2max = {
            key: max(x.paragraph.par_id for x in group)
            for key, group in itertools.groupby(
                sorted(questions, key=lambda x: x.paragraph.doc_title),
                key=lambda x: x.paragraph.doc_title)
        }
        for title in title2max:
            doc = title2doc[title]
            doc.paragraphs = doc.paragraphs[:title2max[title] + 1]

        for title in tqdm(title2max):
            np.savez_compressed(
                get_filename(questions == train, title), **{
                    str(k): v
                    for k, v in encode_document(encoder,
                                                title2doc[title]).items()
                })
Example #4
def squad_build_drqa_doc_encodings(out_dir,
                                   encoder_model,
                                   num_workers,
                                   all_squad=False):
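    """Fetch the SQuAD-relevant articles (dev, plus train when `all_squad` is
    True) from the DrQA document DB, tokenize them in parallel, encode every
    paragraph, and dump the results to `docs.json` and `encodings.npz` inside
    `out_dir`."""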
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()
    if all_squad:
        questions.extend(corpus.get_train())
    # docs = corpus.dev_title_to_document.values() if dev else corpus.train_title_to_document.values()
    relevant_titles = list(set([q.paragraph.doc_title for q in questions]))

    conn = sqlite3.connect(DRQA_DOC_DB)
    c = conn.cursor()
    titles = list(set([q.paragraph.doc_title for q in questions]))
    for i, t in enumerate(titles):
        # Had to manually resolve this (due to changes in Wikipedia?)
        if t == "Sky (United Kingdom)":
            titles[i] = "Sky UK"

    # map each (possibly renamed) DB title back to its original SQuAD title;
    # this assumes both list(set(...)) calls above iterate in the same order
    title_to_doc_id = {t1: t2 for t1, t2 in zip(titles, relevant_titles)}

    c.execute("CREATE TEMPORARY TABLE squad_docs(id)")
    c.executemany("INSERT INTO squad_docs VALUES (?)", [(x, ) for x in titles])

    c.execute("SELECT id, text FROM documents WHERE id IN squad_docs")

    out = c.fetchall()
    conn.close()

    out = [(title_to_doc_id[title], text) for title, text in out]

    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=ResourceLoader())

    # Setup worker pool
    workers = ProcessPool(num_workers, initializer=init, initargs=[])

    documents = {}
    tokenized_documents = {}

    print("Tokenizing...")
    with tqdm(total=len(out)) as pbar:
        for doc, tok_doc in tqdm(
                workers.imap_unordered(get_document_paragraphs, out)):
            documents.update(doc)
            tokenized_documents.update(tok_doc)
            pbar.update()

    encodings = {}
    print("Encoding...")
    for title, paragraphs in tqdm(tokenized_documents.items()):
        dummy_question = "Hello Hello".split()
        model_paragraphs = [
            BinaryQuestionAndParagraphs(question=dummy_question,
                                        paragraphs=[x],
                                        label=1,
                                        num_distractors=0,
                                        question_id='dummy')
            for x in paragraphs
        ]
        encodings.update({
            f"{title}_{i}": rep
            for i, rep in enumerate(encoder.encode_paragraphs(
                model_paragraphs))
        })

    with open(join(out_dir, 'docs.json'), 'w') as f:
        json.dump(documents, f)
    np.savez_compressed(join(out_dir, 'encodings.npz'), **encodings)
Example #5
def build_doc_eval_file(out_file,
                        encodings_dir,
                        encoder_model,
                        k,
                        per_doc=True):
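    """Build a retrieval evaluation file for the SQuAD dev questions: for each
    question, keep the top-k paragraphs (ranked within its document when
    `per_doc` is True, otherwise over the whole corpus) using the pre-computed
    paragraph encodings, and dump the results as json."""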
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=corpus.get_resource_loader())

    par_encs = np.load(join(encodings_dir, 'encodings.npz'))
    with open(join(encodings_dir, 'docs.json'), 'r') as f:
        documents = json.load(f)

    questions_eval_format = []
    questions = sorted(questions, key=lambda x: x.paragraph.doc_title)
    if per_doc:
        title2par_encs = {}
        for p_name, rep in par_encs.items():
            title = '_'.join(p_name.split('_')[:-1])
            if title in title2par_encs:
                title2par_encs[title].update({p_name: rep})
            else:
                title2par_encs[title] = {p_name: rep}
        for title, doc_qs in tqdm(
                itertools.groupby(questions,
                                  key=lambda x: x.paragraph.doc_title)):
            doc_qs = list(doc_qs)
            q_encodings = encode_squad.encode_questions(encoder, doc_qs)
            par2ids = {}
            reps = []
            total_sentences = 0
            for p_name, rep in title2par_encs[title].items():
                par2ids[p_name] = list(
                    range(total_sentences, total_sentences + len(rep)))
                reps.append(rep)
                total_sentences += len(rep)
            id2par = {i: p for p, ids in par2ids.items() for i in ids}
            reps = np.concatenate(reps, axis=0)
            top_k = simple_numpy_knn(q_encodings, reps, k * 2)
            for idx, question in enumerate(doc_qs):
                seen = set()
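                # keep only the first (highest-scoring) hit per paragraph, preserving rank order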
                p_names = [
                    id2par[x] for x in top_k[idx]
                    if not (id2par[x] in seen or seen.add(id2par[x]))
                ][:k]
                questions_eval_format.append({
                    'qid': question.question_id,
                    'question': ' '.join(question.question),
                    'answers': list(question.answers),
                    'paragraphs': [
                        documents['_'.join(p_name.split('_')[:-1])][int(p_name.split('_')[-1])]
                        for p_name in p_names
                    ]
                })
    else:
        print("encoding questions")
        q_encodings = encode_squad.encode_questions(encoder, questions)
        par2ids = {}
        reps = []
        total_sentences = 0
        for p_name, rep in par_encs.items():
            par2ids[p_name] = list(
                range(total_sentences, total_sentences + len(rep)))
            reps.append(rep)
            total_sentences += len(rep)
        id2par = {i: p for p, ids in par2ids.items() for i in ids}
        reps = np.concatenate(reps, axis=0)
        print("scoring")
        top_k = simple_numpy_knn(q_encodings, reps, k * 2)
        for idx, question in enumerate(questions):
            seen = set()
            p_names = [
                id2par[x] for x in top_k[idx]
                if not (id2par[x] in seen or seen.add(id2par[x]))
            ][:k]
            questions_eval_format.append({
                'qid': question.question_id,
                'question': ' '.join(question.question),
                'answers': list(question.answers),
                'paragraphs': [
                    documents['_'.join(p_name.split('_')[:-1])][int(p_name.split('_')[-1])]
                    for p_name in p_names
                ]
            })

    with open(out_file, 'w') as f:
        json.dump(questions_eval_format, f)
Example #6
def build_openqa_top_titles(out_file, questions_file, docs_file, encodings_dir,
                            encoder_model, k, use_ema: bool, checkpoint: str,
                            safety_mult: int, n_titles: int):
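    """For each open-domain question, rank the paragraphs of its top retrieved
    titles with the sentence encoder and attach the k best paragraphs to the
    question dict, writing the augmented questions to `out_file`."""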
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')

    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])

    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in tqdm(
                workers.imap_unordered(tokenize_question, tupled_questions)):
            qid2tokenized.update(tok_q)
            pbar.update()

    workers.close()
    workers.join()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    all_titles = list(
        set([title for q in questions for title in q['top_titles']]))

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print(f"Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(32,
                          initializer=init_encoding_handler,
                          initargs=[encodings_dir])
    # title2encs = {}
    # title2idx2par_name = {}
    # with tqdm(total=len(all_titles)) as pbar:
    #     for t2enc, t2id2p in tqdm(workers.imap_unordered(get_title_mappings_from_saver, all_titles)):
    #         title2encs.update(t2enc)
    #         title2idx2par_name.update(t2id2p)
    #         pbar.update()

    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=ResourceLoader(),
                                           use_char_inputs=False,
                                           use_ema=use_ema,
                                           checkpoint=checkpoint)

    print("Encoding questions...")
    q_encodings = encoder.encode_text_questions(
        [qid2tokenized[q['qid']] for q in questions],
        return_search_vectors=True,
        show_progress=True)

    print("Calculating similarities...")
    for idx, question in tqdm(enumerate(questions),
                              total=len(questions),
                              ncols=80):
        q_titles = question['top_titles']
        if n_titles is not None:
            q_titles = q_titles[:n_titles]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        q_enc = q_encodings[idx]
        title2ids = {}
        reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in q_titles:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        reps = np.concatenate(reps, axis=0)
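        # retrieve k * safety_mult candidates so that k distinct paragraphs
        # remain after hits mapping to the same paragraph are deduplicated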
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), reps,
                                 k * safety_mult)[0]

        def id_to_par_name(rep_id):
            title = id2title[rep_id]
            return title2idx2par_name[title][rep_id - titles_offset_dict[title]]

        seen = set()
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k]
        question['paragraphs'] = [
            parname_to_text(p_name) for p_name in p_names
        ]

    with open(out_file, 'w') as f:
        json.dump(questions, f)
Example #7
def encode_from_file(docs_file,
                     questions_file,
                     encodings_dir,
                     encoder_model,
                     num_workers,
                     hotpot: bool,
                     long_batch: int,
                     short_batch: int,
                     use_chars: bool,
                     use_ema: bool,
                     checkpoint: str,
                     document_chunk_size=1000,
                     samples=None,
                     encode_all_db=False):
    """

    :param out_file: .npz file to dump the encodings
    :param docs_file: path to json file whose structure is [{title: list of paragraphs}, ...]
    :return:
    """
    doc_encs_handler = DocumentEncodingHandler(encodings_dir)
    # Setup worker pool
    workers = ProcessPool(num_workers, initializer=init, initargs=[])

    if docs_file is not None:
        with open(docs_file, 'r') as f:
            documents = json.load(f)
        documents = {
            k: v
            for k, v in documents.items()
            if k not in doc_encs_handler.titles2filenames
        }

        tokenized_documents = {}
        tupled_doc_list = [(title, pars) for title, pars in documents.items()]

        if samples is not None:
            print(f"sampling {samples} samples")
            tupled_doc_list = tupled_doc_list[:samples]

        print("Tokenizing from file...")
        with tqdm(total=len(tupled_doc_list), ncols=80) as pbar:
            for tok_doc in tqdm(
                    workers.imap_unordered(tokenize_document,
                                           tupled_doc_list)):
                tokenized_documents.update(tok_doc)
                pbar.update()
    else:
        if questions_file is not None:
            with open(questions_file, 'r') as f:
                questions = json.load(f)
            all_titles = list(
                set([title for q in questions for title in q['top_titles']]))
        else:
            print("encoding all DB!")
            all_titles = DocDB().get_doc_titles()

        if samples is not None:
            print(f"sampling {samples} samples")
            all_titles = all_titles[:samples]

        all_titles = [
            t for t in all_titles if t not in doc_encs_handler.titles2filenames
        ]
        tokenized_documents = {}

        print("Tokenizing from DB...")
        with tqdm(total=len(all_titles), ncols=80) as pbar:
            for tok_doc in tqdm(
                    workers.imap_unordered(tokenize_from_db, all_titles)):
                tokenized_documents.update(tok_doc)
                pbar.update()

    workers.close()
    workers.join()

    voc = set()
    for paragraphs in tokenized_documents.values():
        for par in paragraphs:
            voc.update(par)

    if not hotpot:
        spec = QuestionAndParagraphsSpec(batch_size=None,
                                         max_num_contexts=1,
                                         max_num_question_words=None,
                                         max_num_context_words=None)
        encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                               vocabulary=voc,
                                               spec=spec,
                                               loader=ResourceLoader(),
                                               use_char_inputs=use_chars,
                                               use_ema=use_ema,
                                               checkpoint=checkpoint)
    else:
        spec = QuestionAndParagraphsSpec(batch_size=None,
                                         max_num_contexts=2,
                                         max_num_question_words=None,
                                         max_num_context_words=None)
        encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model,
                                                vocabulary=voc,
                                                spec=spec,
                                                loader=ResourceLoader(),
                                                use_char_inputs=use_chars,
                                                use_ema=use_ema,
                                                checkpoint=checkpoint)

    tokenized_documents_items = list(tokenized_documents.items())
    doc_chunks = [
        tokenized_documents_items[i:i + document_chunk_size]
        for i in range(0, len(tokenized_documents_items), document_chunk_size)
    ]
    for tokenized_doc_chunk in tqdm(doc_chunks, ncols=80):
        flattened_pars_with_names = [(f"{title}_{i}", par)
                                     for title, pars in tokenized_doc_chunk
                                     for i, par in enumerate(pars)]

        # filtering out empty paragraphs (probably had some short string the tokenization removed)
        # important to notice that the filtered paragraphs will have no representation,
        # but they still exist in the numbering of paragraphs for consistency with the docs.
        flattened_pars_with_names = [(name, par)
                                     for name, par in flattened_pars_with_names
                                     if len(par) > 0]

        # sort such that longer paragraphs are first to identify OOMs early on
        flattened_pars_with_names = sorted(flattened_pars_with_names,
                                           key=lambda x: len(x[1]),
                                           reverse=True)
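        # split at 900 tokens so that long and short paragraphs can be encoded
        # with different batch sizes (long_batch vs. short_batch) further below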
        long_paragraphs_ids = [
            i for i, name_par in enumerate(flattened_pars_with_names)
            if len(name_par[1]) >= 900
        ]
        short_paragraphs_ids = [
            i for i, name_par in enumerate(flattened_pars_with_names)
            if len(name_par[1]) < 900
        ]

        # print(f"Encoding {len(flattened_pars_with_names)} paragraphs...")
        name2enc = {}
        dummy_question = "Hello Hello".split()
        if not hotpot:
            model_paragraphs = [
                BinaryQuestionAndParagraphs(question=dummy_question,
                                            paragraphs=[x],
                                            label=1,
                                            num_distractors=0,
                                            question_id='dummy')
                for _, x in flattened_pars_with_names
            ]
        else:
            # todo allow precomputed sentence segments
            model_paragraphs = [
                IterativeQuestionAndParagraphs(question=dummy_question,
                                               paragraphs=[x, dummy_question],
                                               first_label=1,
                                               second_label=1,
                                               question_id='dummy',
                                               sentence_segments=None)
                for _, x in flattened_pars_with_names
            ]

        # print("Encoding long paragraphs...")
        long_pars = [model_paragraphs[i] for i in long_paragraphs_ids]
        name2enc.update({
            flattened_pars_with_names[long_paragraphs_ids[i]][0]: enc
            for i, enc in enumerate(
                encoder.encode_paragraphs(
                    long_pars, batch_size=long_batch, show_progress=True
                ) if not hotpot else encoder.encode_first_paragraphs(
                    long_pars, batch_size=long_batch, show_progress=True))
        })

        # print("Encoding short paragraphs...")
        short_pars = [model_paragraphs[i] for i in short_paragraphs_ids]
        name2enc.update({
            flattened_pars_with_names[short_paragraphs_ids[i]][0]: enc
            for i, enc in enumerate(
                encoder.encode_paragraphs(
                    short_pars, batch_size=short_batch, show_progress=True
                ) if not hotpot else encoder.encode_first_paragraphs(
                    short_pars, batch_size=short_batch, show_progress=True))
        })

        doc_encs_handler.save_multiple_documents(name2enc)