# Open one question file and one answer file per bitext field.
for fn in bitext_fields:
    bi_quest_files[fn] = open(os.path.join(out_bitext_dir, BITEXT_QUESTION_PREFIX + fn), 'w')
    bi_answ_files[fn] = open(os.path.join(out_bitext_dir, BITEXT_ANSWER_PREFIX + fn), 'w')

seen_qrels = set()

for qid, json_str in tqdm.tqdm(enumerate(dpr_json_reader(inp_file))):
    query_idx = f'{args.part_type}_{qid}'
    fields = json.loads(json_str)
    query_orig = fields["question"]
    answer_list = list(fields["answers"])
    answer_list_lc = [s.lower() for s in answer_list]
    query_lemmas, query_unlemm = nlp.proc_text(query_orig)

    query_bert_tok = None

    # Keep only queries that are long enough after lemmatization.
    query_toks = query_lemmas.split()
    if len(query_toks) >= min_query_tok_qty:
        doc = {
            DOCID_FIELD: query_idx,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query_orig,
            ANSWER_LIST_FIELD_NAME: answer_list
        }
        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

        if TEXT_BERT_TOKENIZED_NAME in doc:
            query_bert_tok = doc[TEXT_BERT_TOKENIZED_NAME]
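# For reference, a minimal sketch of the add_retokenized_field helper used
# above. This is a hypothetical reconstruction (the real implementation lives
# elsewhere in the repo): presumably it BERT-tokenizes the source field and
# stores the space-joined word pieces under the destination key, and is a
# no-op when the tokenizer is None or the source field is absent.
def add_retokenized_field(doc, src_field, dst_field, tokenizer):
    # Hypothetical sketch, not the repo's actual code.
    if tokenizer is not None and src_field in doc:
        doc[dst_field] = ' '.join(tokenizer.tokenize(doc[src_field]))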
ln = 0
for rec_str in SimpleXmlRecIterator(inp_file_name, 'document'):
    ln += 1
    try:
        rec = proc_yahoo_answers_record(rec_str)
        if len(rec.answer_list) == 0:  # Ignore questions without answers
            continue

        # The question text is the concatenation of the subject and the body.
        question_orig = (rec.subject + ' ' + rec.content).strip()
        question_lc = question_orig.lower()
        qid = rec.uri

        question_lemmas, question_unlemm = nlp.proc_text(question_orig)

        question_bert_tok = None
        if bert_tokenizer:
            question_bert_tok = get_retokenized(bert_tokenizer, question_lc)

        doc = {
            DOCID_FIELD: qid,
            TEXT_FIELD_NAME: question_lemmas,
            TEXT_UNLEMM_FIELD_NAME: question_unlemm,
            TEXT_RAW_FIELD_NAME: question_orig
        }
        if question_bert_tok is not None:
            doc[TEXT_BERT_TOKENIZED_NAME] = question_bert_tok

        doc_str = json.dumps(doc) + '\n'
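# Similarly, a minimal sketch of get_retokenized as it appears to be used
# above: feed raw (here lower-cased) text to the BERT tokenizer and join the
# resulting word pieces with spaces. Hypothetical reconstruction.
def get_retokenized(tokenizer, text):
    # Assumed behavior: returns one space-separated string of word pieces.
    return ' '.join(tokenizer.tokenize(text))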
print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL, stop_words, keep_only_alpha_num=True, lower_case=True)

with FileWrapper(args.output, 'w') as outf:
    qid = 0
    for query in tqdm(inp_data, desc='converting queries'):
        # Cranfield query IDs are unreliable and don't match the QRELs:
        # in the QRELs, a query ID is simply the query's ordinal number,
        # so we renumber the queries sequentially.
        qid += 1
        e = {
            DOCID_FIELD: str(qid),
            TEXT_RAW_FIELD_NAME: query[TEXT_RAW_FIELD_NAME]
        }
        body_lemmas, body_unlemm = nlp.proc_text(query[BODY_FIED_NAME])
        e[TEXT_FIELD_NAME] = body_lemmas
        e[BODY_FIED_NAME] = body_unlemm
        add_retokenized_field(e, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)
        outf.write(json.dumps(e) + '\n')
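# A rough, self-contained sketch of the proc_text contract assumed above: it
# returns a lemmatized and a non-lemmatized token string, lower-cased, with
# stop words and non-alphanumeric tokens filtered out. The function below is a
# hypothetical stand-in (proc_text_sketch is not a repo function) written
# against the plain spaCy API.
import spacy

def proc_text_sketch(spacy_model, text, stop_words):
    toks = [t for t in spacy_model(text)
            if t.text.isalnum() and t.text.lower() not in stop_words]
    lemmas = ' '.join(t.lemma_.lower() for t in toks)
    unlemm = ' '.join(t.text.lower() for t in toks)
    return lemmas, unlemm

# Example (assumes the small English model is installed):
#   nlp_model = spacy.load('en_core_web_sm')
#   lemmas, unlemm = proc_text_sketch(nlp_model, 'Jets flying at high speeds', {'at'})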
# The input is a TSV file: <query ID> TAB <query text>
ln = 0
for line in inp_file:
    ln += 1
    line = line.strip()
    if not line:
        continue
    fields = line.split('\t')
    if len(fields) != 2:
        print('Misformatted line %d, ignoring:' % ln)
        print(line.replace('\t', '<field delimiter>'))
        continue

    did, query_orig = fields

    query_lemmas, query_unlemm = nlp.proc_text(query_orig)

    # Keep only queries that are long enough after lemmatization.
    query_toks = query_lemmas.split()
    if len(query_toks) >= min_query_tok_qty:
        doc = {
            DOCID_FIELD: did,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query_orig
        }
        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)
        doc_str = json.dumps(doc) + '\n'
        out_file.write(doc_str)
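# A toy, self-contained illustration of the malformed-line guard above; the
# sample data and the variable name sample_tsv are made up for the example.
import io

sample_tsv = io.StringIO('q1\twhat is glaucoma\nbad line with no tab\n')
for ln, line in enumerate(sample_tsv, start=1):
    fields = line.rstrip('\n').split('\t')
    if len(fields) != 2:
        print('Misformatted line %d, ignoring' % ln)
        continue
    did, query_orig = fields
    print(did, '->', query_orig)
# Prints: q1 -> what is glaucoma
#         Misformatted line 2, ignoring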