    if len(fields) != 2:
        print('Misformatted line %d, ignoring:' % ln)
        print(line.replace('\t', '<field delimiter>'))
        continue

    did, query = fields

    query_lemmas, query_unlemm = nlp.procText(query)

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        doc = {DOCID_FIELD: did,
               TEXT_FIELD_NAME: query_lemmas,
               TEXT_UNLEMM_FIELD_NAME: query_unlemm,
               TEXT_RAW_FIELD_NAME: query.lower()}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

        docStr = json.dumps(doc) + '\n'
        outFile.write(docStr)

    if ln % REPORT_QTY == 0:
        print('Processed %d queries' % ln)

print('Processed %d queries' % ln)

inpFile.close()
outFile.close()
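
# For context: addRetokenizedField is assumed here to re-tokenize the raw text
# with a (HuggingFace-style) BERT tokenizer and store the space-joined
# sub-word tokens under the destination key. A minimal sketch under that
# assumption; the actual helper in this code base may differ.
def addRetokenizedFieldSketch(doc, srcField, dstField, tokenizer):
    # When no tokenizer is configured, do nothing, so the field stays absent.
    if tokenizer is not None:
        doc[dstField] = ' '.join(tokenizer.tokenize(doc[srcField]))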
               'body': body_unlemm,
               TEXT_RAW_FIELD_NAME: text_raw}

        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

        doc_str = json.dumps(doc) + '\n'

        return doc_str


proc_qty = args.proc_qty
print(f'Spawning {proc_qty} processes')
pool = multiprocessing.Pool(processes=proc_qty)

ln = 0
for doc_str in pool.imap(DocParseWorker(), inp_file, IMAP_PROC_CHUNK_QTY):
    ln += 1
    if doc_str is not None:
        out_file.write(doc_str)
    else:
        # print('Misformatted line %d ignoring:' % ln)
        # print(line.replace('\t', '<field delimiter>'))
        print('Ignoring misformatted line %d' % ln)

    if ln % REPORT_QTY == 0:
        print('Processed %d docs' % ln)

print('Processed %d docs' % ln)

inp_file.close()
out_file.close()
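
# DocParseWorker is defined elsewhere in this script; the stand-in below is a
# hypothetical, simplified sketch that illustrates the pattern: pool.imap
# needs a picklable, module-level callable, so a class with __call__ is used
# instead of a closure. The three-field TSV layout and field names are
# illustrative assumptions, not the script's actual format.
import json


class DocParseWorkerSketch:
    def __call__(self, line):
        fields = line.rstrip('\n').split('\t')
        if len(fields) != 3:
            return None  # the caller above reports misformatted lines
        did, title, body = fields
        doc = {'id': did, 'title': title, 'body': body}
        return json.dumps(doc) + '\n'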
            dataAnswFile.write(docStr)

            relGrade = MAX_RELEV_GRADE - int(i != rec.bestAnswerId)
            qrelFile.write(genQrelStr(qid, aid, relGrade) + '\n')

            if biQuestFiles and biAnswFiles:
                biQuestFiles[TEXT_FIELD_NAME].write(question_lemmas + '\n')
                biQuestFiles[TEXT_UNLEMM_FIELD_NAME].write(question_unlemm + '\n')

                biAnswFiles[TEXT_FIELD_NAME].write(answ_lemmas + '\n')
                biAnswFiles[TEXT_UNLEMM_FIELD_NAME].write(answ_unlemm + '\n')

                if bertTokenizer is not None:
                    biQuestFiles[TEXT_BERT_TOKENIZED_NAME].write(question_bert_tok + '\n')
                    biAnswFiles[TEXT_BERT_TOKENIZED_NAME].write(answ_bert_tok + '\n')

        if ln % REPORT_QTY == 0:
            print('Processed %d questions' % ln)

    except Exception as e:
        print(f'Error parsing record #{ln}, error msg: ' + str(e))

dataQuestFile.close()
dataAnswFile.close()
qrelFile.close()

for _, f in biQuestFiles.items():
    f.close()
for _, f in biAnswFiles.items():
    f.close()
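
# genQrelStr is assumed to produce a standard TREC qrel line:
# <query ID> <unused iteration column, conventionally 0> <document ID> <grade>.
# Under that assumption, relGrade above gives the best answer the maximum
# grade and every other answer one grade less. A minimal sketch:
def genQrelStrSketch(queryId, docId, relGrade):
    return f'{queryId} 0 {docId} {relGrade}'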
print(f"Ignoring query, which is found in specified query files. Raw query: '{query}' lemmatized query '{query_lemmas}'") query_toks = query_lemmas.split() if len(query_toks) >= minQueryTokQty: qrelList.append(QrelEntry(queryId=qid, docId=did, relGrade=1)) # Entries are sorted by the query ID if prevQid != qid: doc = {DOCID_FIELD: qid, TEXT_FIELD_NAME: query_lemmas, TEXT_UNLEMM_FIELD_NAME: query_unlemm, TEXT_RAW_FIELD_NAME: query.lower()} addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer) docStr = json.dumps(doc) + '\n' outFileQueries.write(docStr) prevQid = qid if ln % REPORT_QTY == 0: print('Processed %d input line' % ln) print('Processed %d input lines' % ln) writeQrels(qrelList, outFileQrelsName) inpFile.close() outFileQueries.close()
                    '\n')
                bi_answ_files[TEXT_UNLEMM_FIELD_NAME].write(sent_unlemm + '\n')

                if bert_tokenizer is not None:
                    answ_bert_tok = get_retokenized(bert_tokenizer, ctx_sent_lc)
                    bi_quest_files[TEXT_BERT_TOKENIZED_NAME].write(query_bert_tok + '\n')
                    bi_answ_files[TEXT_BERT_TOKENIZED_NAME].write(answ_bert_tok + '\n')

    if use_precomputed_negatives:
        for entry in fields["negative_ctxs"]:
            psg_id = get_passage_id(entry)
            add_qrel_entry(qrel_dict=glob_qrel_dict, qid=query_idx, did=psg_id, grade=0)

inp_file.close()
out_queries.close()

write_qrels(list(glob_qrel_dict.values()), args.output_qrels)

for _, f in bi_quest_files.items():
    f.close()
for _, f in bi_answ_files.items():
    f.close()
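
# add_qrel_entry is assumed to deduplicate qrels in glob_qrel_dict, keyed by
# the (query ID, document ID) pair, so that a passage seen as both a positive
# and a precomputed negative keeps the higher grade. A minimal sketch under
# that assumption; the QrelEntry field names here are illustrative.
from collections import namedtuple

QrelEntrySketch = namedtuple('QrelEntrySketch', ['query_id', 'doc_id', 'rel_grade'])


def add_qrel_entry_sketch(qrel_dict, qid, did, grade):
    key = (qid, did)
    prev = qrel_dict.get(key)
    if prev is None or prev.rel_grade < grade:
        qrel_dict[key] = QrelEntrySketch(query_id=qid, doc_id=did, rel_grade=grade)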
                    break

            if has_answ:
                sent_lemmas, sent_unlemm = nlp.procText(ctx_sent)

                biQuestFiles[TEXT_FIELD_NAME].write(query_lemmas + '\n')
                biQuestFiles[TEXT_UNLEMM_FIELD_NAME].write(query_unlemm + '\n')

                biAnswFiles[TEXT_FIELD_NAME].write(sent_lemmas + '\n')
                biAnswFiles[TEXT_UNLEMM_FIELD_NAME].write(sent_unlemm + '\n')

                if bertTokenizer is not None:
                    answ_bert_tok = getRetokenized(bertTokenizer, ctx_sent_lc)
                    biQuestFiles[TEXT_BERT_TOKENIZED_NAME].write(query_bert_tok + '\n')
                    biAnswFiles[TEXT_BERT_TOKENIZED_NAME].write(answ_bert_tok + '\n')

    if usePrecomputedNegatives:
        for entry in fields["negative_ctxs"]:
            psgId = get_passage_id(entry)
            outQrels.write(f'{query_idx} 0 {psgId} 0\n')

inpFile.close()
outQueries.close()
outQrels.close()

for _, f in biQuestFiles.items():
    f.close()
for _, f in biAnswFiles.items():
    f.close()
print('A list of queries to ignore has %d entries' % len(ignore_queries))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

out_file_queries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')

read_qty = 0
wrote_qty = 0

for e in read_queries(os.path.join(args.input_dir, QUESTION_FILE_JSON)):
    read_qty += 1
    if TEXT_FIELD_NAME not in e:
        continue
    text = e[TEXT_FIELD_NAME]
    if text in ignore_queries:
        print(f'Ignoring query found in the specified query files: {text}')
        continue
    wrote_qty += 1
    out_file_queries.write(json.dumps(e) + '\n')

ignored_qty = read_qty - wrote_qty
print(f'Wrote {wrote_qty} queries, ignored {ignored_qty} queries')

out_file_queries.close()
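
# FileWrapper is assumed to open plain, .gz, or .bz2 files transparently in
# text mode based on the file extension. A hypothetical helper sketching that
# behavior (the real FileWrapper API in this code base may differ):
import bz2
import gzip


def open_wrapped(file_name, flags='r'):
    if file_name.endswith('.gz'):
        return gzip.open(file_name, flags + 't')
    if file_name.endswith('.bz2'):
        return bz2.open(file_name, flags + 't')
    return open(file_name, flags)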
            qrel_file.write(gen_qrel_str(qid, aid, rel_grade) + '\n')

            if bi_quest_files and bi_answ_files:
                bi_quest_files[TEXT_FIELD_NAME].write(question_lemmas + '\n')
                bi_quest_files[TEXT_UNLEMM_FIELD_NAME].write(question_unlemm + '\n')

                bi_answ_files[TEXT_FIELD_NAME].write(answ_lemmas + '\n')
                bi_answ_files[TEXT_UNLEMM_FIELD_NAME].write(answ_unlemm + '\n')

                if bert_tokenizer is not None:
                    bi_quest_files[TEXT_BERT_TOKENIZED_NAME].write(question_bert_tok + '\n')
                    bi_answ_files[TEXT_BERT_TOKENIZED_NAME].write(answ_bert_tok + '\n')

        if ln % REPORT_QTY == 0:
            print('Processed %d questions' % ln)

    except Exception as e:
        print(f'Error parsing record #{ln}, error msg: ' + str(e))

data_quest_file.close()
data_answ_file.close()
qrel_file.close()

for _, f in bi_quest_files.items():
    f.close()
for _, f in bi_answ_files.items():
    f.close()
print('A list of queries to ignore has %d entries' % len(ignoreQueries))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

outFileQueries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')

readQty = 0
wroteQty = 0

for e in readQueries(os.path.join(args.input_dir, QUESTION_FILE_JSON)):
    readQty += 1
    if TEXT_FIELD_NAME not in e:
        continue
    text = e[TEXT_FIELD_NAME]
    if text in ignoreQueries:
        print(f'Ignoring query found in the specified query files: {text}')
        continue
    wroteQty += 1
    outFileQueries.write(json.dumps(e) + '\n')

ignoredQty = readQty - wroteQty
print(f'Wrote {wroteQty} queries, ignored {ignoredQty} queries')

outFileQueries.close()