Example #1
bi_quest_files = {}  # per-field output handles; initialized here so the snippet is self-contained
bi_answ_files = {}
for fn in bitext_fields:
    bi_quest_files[fn] = open(
        os.path.join(out_bitext_dir, BITEXT_QUESTION_PREFIX + fn), 'w')
    bi_answ_files[fn] = open(
        os.path.join(out_bitext_dir, BITEXT_ANSWER_PREFIX + fn), 'w')

seen_qrels = set()

for qid, json_str in tqdm.tqdm(enumerate(dpr_json_reader(inp_file))):
    query_idx = f'{args.part_type}_{qid}'
    fields = json.loads(json_str)
    query_orig = fields["question"]
    answer_list = list(fields["answers"])
    answer_list_lc = [s.lower() for s in answer_list]
    query_lemmas, query_unlemm = nlp.proc_text(query_orig)
    query_bert_tok = None

    query_toks = query_lemmas.split()
    if len(query_toks) >= min_query_tok_qty:
        doc = {
            DOCID_FIELD: query_idx,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query_orig,
            ANSWER_LIST_FIELD_NAME: answer_list
        }
        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME,
                              TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)
        if TEXT_BERT_TOKENIZED_NAME in doc:
            query_bert_tok = doc[TEXT_BERT_TOKENIZED_NAME]
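
None of these snippets defines add_retokenized_field or get_retokenized. Judging from the call sites (the destination key is present only when a tokenizer is configured, hence the membership check above), a minimal sketch of the pair could look like this; the project's real implementation may differ:

def get_retokenized(tokenizer, text):
    # Join the tokenizer's sub-word pieces into a single space-separated string.
    return ' '.join(tokenizer.tokenize(text))

def add_retokenized_field(data_entry, src_field, dst_field, tokenizer):
    # Sketch: store the re-tokenized text under dst_field, but only when a
    # tokenizer is configured, so `dst_field in doc` checks keep working.
    if tokenizer is not None and src_field in data_entry:
        data_entry[dst_field] = get_retokenized(tokenizer, data_entry[src_field])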

Example #2

ln = 0
for rec_str in SimpleXmlRecIterator(inp_file_name, 'document'):
    ln += 1
    try:
        rec = proc_yahoo_answers_record(rec_str)
        if len(rec.answer_list) == 0:  # Ignore questions without answers
            continue

        question_orig = (rec.subject + ' ' + rec.content).strip()
        question_lc = question_orig.lower()
        qid = rec.uri

        question_lemmas, question_unlemm = nlp.proc_text(question_orig)

        question_bert_tok = None
        if bert_tokenizer:
            question_bert_tok = get_retokenized(bert_tokenizer, question_lc)

        doc = {
            DOCID_FIELD: qid,
            TEXT_FIELD_NAME: question_lemmas,
            TEXT_UNLEMM_FIELD_NAME: question_unlemm,
            TEXT_RAW_FIELD_NAME: question_orig
        }

        if question_bert_tok is not None:
            doc[TEXT_BERT_TOKENIZED_NAME] = question_bert_tok
        doc_str = json.dumps(doc) + '\n'
        out_file.write(doc_str)  # assumed output handle; the original snippet is truncated here
    except Exception as e:
        # The try block opened above was left unclosed; skip records that fail to parse.
        print(f'Error parsing record #{ln}: {e}')

Example #3

print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
    BERT_BASE_MODEL)
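
# Note: pytorch_pretrained_bert is the long-deprecated predecessor of the
# transformers package. A sketch of the equivalent call on the modern stack,
# assuming BERT_BASE_MODEL names a standard checkpoint such as 'bert-base-uncased':
#
#     from transformers import BertTokenizer
#     bert_tokenizer = BertTokenizer.from_pretrained(BERT_BASE_MODEL)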

nlp = SpacyTextParser(SPACY_MODEL,
                      stop_words,
                      keep_only_alpha_num=True,
                      lower_case=True)
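
# The unpacking used below relies on SpacyTextParser.proc_text returning a pair
# of space-separated token strings: (lemmatized, non-lemmatized). A hypothetical
# call illustrating that contract (actual output depends on SPACY_MODEL and the
# stop-word list):
#
#     lemmas, unlemm = nlp.proc_text('What are the fastest animals?')
#     # lemmas might be: 'fast animal'      (lower-cased lemmas, stop words dropped)
#     # unlemm might be: 'fastest animals'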

with FileWrapper(args.output, 'w') as outf:
    qid = 0
    for query in tqdm(inp_data, desc='converting queries'):
        # Cranfield query IDs don't match the QRELs: there, a query ID is simply
        # the query's 1-based position in the input, so we renumber sequentially.
        qid += 1

        e = {
            DOCID_FIELD: str(qid),
            TEXT_RAW_FIELD_NAME: query[TEXT_RAW_FIELD_NAME]
        }

        body_lemmas, body_unlemm = nlp.proc_text(query[BODY_FIED_NAME])

        e[TEXT_FIELD_NAME] = body_lemmas
        e[BODY_FIED_NAME] = body_unlemm

        add_retokenized_field(e, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                              bert_tokenizer)

        outf.write(json.dumps(e) + '\n')
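
FileWrapper is a project helper rather than a standard-library class; the snippet only relies on it behaving like a context-managed open(). Wrappers of this kind typically add transparent gzip support, so a minimal stand-in covering just the behavior used here might be:

import gzip

class FileWrapper:
    # Minimal stand-in: one open()-like interface for plain and .gz text files.
    def __init__(self, path, mode='r'):
        self._f = gzip.open(path, mode + 't') if path.endswith('.gz') else open(path, mode)

    def __enter__(self):
        return self._f

    def __exit__(self, *exc_info):
        self._f.close()
        return False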
Example #4
# Input file is a TSV file: <query ID> <TAB> <query text>
ln = 0
for line in inp_file:
    ln += 1
    line = line.strip()
    if not line:
        continue
    fields = line.split('\t')
    if len(fields) != 2:
        print('Misformatted line %d, ignoring:' % ln)
        print(line.replace('\t', '<field delimiter>'))
        continue

    did, query_orig = fields

    query_lemmas, query_unlemm = nlp.proc_text(query_orig)

    query_toks = query_lemmas.split()
    if len(query_toks) >= min_query_tok_qty:
        doc = {
            DOCID_FIELD: did,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query_orig
        }
        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME,
                              TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

        doc_str = json.dumps(doc) + '\n'
        out_file.write(doc_str)
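
To make the data flow concrete, here is one hypothetical iteration with made-up values (the field-name constants resolve to whatever the enclosing project defines):

line = '101\twhat is the fastest animal'    # hypothetical TSV input line
did, query_orig = line.strip().split('\t')  # -> '101', 'what is the fastest animal'
query_lemmas, query_unlemm = nlp.proc_text(query_orig)
# If the lemmatized query passes the min_query_tok_qty length check, one JSON
# line containing the four fields built above is appended to out_file.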