               'body': body_unlemm,
               TEXT_RAW_FIELD_NAME: text_raw}

        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

        doc_str = json.dumps(doc) + '\n'

        return doc_str


proc_qty = args.proc_qty
print(f'Spawning {proc_qty} processes')
pool = multiprocessing.Pool(processes=proc_qty)

ln = 0
for doc_str in pool.imap(DocParseWorker(), inp_file, IMAP_PROC_CHUNK_QTY):
    ln = ln + 1
    if doc_str is not None:
        out_file.write(doc_str)
    else:
        # print('Misformatted line %d ignoring:' % ln)
        # print(line.replace('\t', '<field delimiter>'))
        print('Ignoring misformatted line %d' % ln)

    if ln % REPORT_QTY == 0:
        print('Processed %d docs' % ln)

print('Processed %d docs' % ln)

inp_file.close()
out_file.close()
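The loop above hands a callable worker object to multiprocessing.Pool.imap (IMAP_PROC_CHUNK_QTY is the chunk size), but only the tail of the worker's __call__ method survives in this excerpt. Below is a minimal, self-contained sketch of that pattern; the class name, field names, and tab-separated input format are illustrative assumptions, not the actual DocParseWorker.

import json
import multiprocessing


class LineParseWorker:
    """Hypothetical sketch of a picklable, callable worker for Pool.imap.

    The real DocParseWorker / PassParseWorker build richer documents with
    lemmatized and BERT-tokenized fields; this sketch only keeps the shape:
    parse one input line, return a JSON string, or None for a bad line.
    """
    def __call__(self, line):
        fields = line.rstrip('\n').split('\t')
        if len(fields) != 2:
            # Returning None lets the main loop report a misformatted line.
            return None
        doc_id, text = fields
        return json.dumps({'doc_id': doc_id, 'text': text})


if __name__ == '__main__':
    lines = ['d1\thello world\n', 'misformatted line\n', 'd2\tanother doc\n']
    with multiprocessing.Pool(processes=2) as pool:
        for doc_str in pool.imap(LineParseWorker(), lines, 2):
            print(doc_str)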
        print(f"Ignoring a query found in the specified query files. Raw query: '{query}', lemmatized query: '{query_lemmas}'")

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        qrelList.append(QrelEntry(queryId=qid, docId=did, relGrade=1))

        # Entries are sorted by the query ID
        if prevQid != qid:
            doc = {DOCID_FIELD: qid,
                   TEXT_FIELD_NAME: query_lemmas,
                   TEXT_UNLEMM_FIELD_NAME: query_unlemm,
                   TEXT_RAW_FIELD_NAME: query.lower()}
            addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
            docStr = json.dumps(doc) + '\n'
            outFileQueries.write(docStr)

    prevQid = qid

    if ln % REPORT_QTY == 0:
        print('Processed %d input lines' % ln)

print('Processed %d input lines' % ln)

writeQrels(qrelList, outFileQrelsName)

inpFile.close()
outFileQueries.close()
    if len(fields) != 2:
        print('Misformatted line %d, ignoring:' % ln)
        print(line.replace('\t', '<field delimiter>'))
        continue

    did, query = fields

    query_lemmas, query_unlemm = nlp.procText(query)

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        doc = {DOCID_FIELD: did,
               TEXT_FIELD_NAME: query_lemmas,
               TEXT_UNLEMM_FIELD_NAME: query_unlemm,
               TEXT_RAW_FIELD_NAME: query.lower()}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
        docStr = json.dumps(doc) + '\n'
        outFile.write(docStr)

    if ln % REPORT_QTY == 0:
        print('Processed %d queries' % ln)

print('Processed %d queries' % ln)

inpFile.close()
outFile.close()
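Several fragments here call addRetokenizedField / add_retokenized_field to attach a BERT-tokenized copy of the raw text, adding the field only when a tokenizer is supplied. The helpers themselves are not shown, so the sketch below is an assumed implementation on top of a HuggingFace tokenizer; names with a _sketch suffix are hypothetical.

from transformers import AutoTokenizer


def get_retokenized_sketch(tokenizer, text):
    # Assumed behavior: sub-word tokenize the text and join the pieces with spaces.
    return ' '.join(tokenizer.tokenize(text))


def add_retokenized_field_sketch(doc, src_field, dst_field, tokenizer):
    # Only attach the extra field when a tokenizer was actually supplied,
    # which matches how the callers later check for TEXT_BERT_TOKENIZED_NAME in doc.
    if tokenizer is not None and src_field in doc:
        doc[dst_field] = get_retokenized_sketch(tokenizer, doc[src_field])


# Example (model name is only an illustration):
# tok = AutoTokenizer.from_pretrained('bert-base-uncased')
# doc = {'text_raw': 'What is the capital of France?'}
# add_retokenized_field_sketch(doc, 'text_raw', 'text_bert_tok', tok)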
        # Doing it after lower-casing
        answ_bert_tok = None
        if bertTokenizer:
            answ_bert_tok = getRetokenized(bertTokenizer, answ)

        doc = {DOCID_FIELD: aid,
               TEXT_FIELD_NAME: answ_lemmas,
               TEXT_UNLEMM_FIELD_NAME: answ_unlemm,
               TEXT_RAW_FIELD_NAME: answ}
        if answ_bert_tok is not None:
            doc[TEXT_BERT_TOKENIZED_NAME] = answ_bert_tok

        docStr = json.dumps(doc) + '\n'
        dataAnswFile.write(docStr)

        relGrade = MAX_RELEV_GRADE - int(i != rec.bestAnswerId)
        qrelFile.write(genQrelStr(qid, aid, relGrade) + '\n')

        if biQuestFiles and biAnswFiles:
            biQuestFiles[TEXT_FIELD_NAME].write(question_lemmas + '\n')
            biQuestFiles[TEXT_UNLEMM_FIELD_NAME].write(question_lemmas + '\n')
            biAnswFiles[TEXT_FIELD_NAME].write(answ_lemmas + '\n')
            biAnswFiles[TEXT_UNLEMM_FIELD_NAME].write(answ_lemmas + '\n')

            if bertTokenizer is not None:
                biQuestFiles[TEXT_BERT_TOKENIZED_NAME].write(question_bert_tok + '\n')
                biAnswFiles[TEXT_BERT_TOKENIZED_NAME].write(answ_bert_tok + '\n')
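In the fragment above, the best answer gets MAX_RELEV_GRADE and every other answer one grade less, and genQrelStr turns (query id, answer id, grade) into one qrel line. The DPR conversion fragment further below writes the same layout inline as f'{query_idx} 0 {psgId} 1', i.e. the standard TREC qrel format; a minimal sketch consistent with both (the _sketch name is hypothetical):

def gen_qrel_str_sketch(query_id, doc_id, rel_grade):
    # TREC-style qrel line: <query-id> 0 <doc-id> <relevance-grade>
    return f'{query_id} 0 {doc_id} {rel_grade}'

# gen_qrel_str_sketch('q1', 'answ_12', 4) -> 'q1 0 answ_12 4'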
        return json.dumps(doc)


inp_file = FileWrapper(args.input_file)
out_file = FileWrapper(args.out_file, 'w')

proc_qty = args.proc_qty
print(f'Spawning {proc_qty} processes')
pool = multiprocessing.Pool(processes=proc_qty)

ln = 0
ln_ign = 0
for doc_str in pool.imap(PassParseWorker(), inp_file, IMAP_PROC_CHUNK_QTY):
    ln = ln + 1

    if doc_str is not None:
        if doc_str:
            out_file.write(doc_str + '\n')
        else:
            ln_ign += 1
    else:
        print('Ignoring misformatted line %d' % ln)

    if ln % REPORT_QTY == 0:
        print('Read %d passages, processed %d passages' % (ln, ln - ln_ign))

print('Processed %d passages' % ln)

inp_file.close()
out_file.close()
    query_bert_tok = None

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        doc = {DOCID_FIELD: query_idx,
               TEXT_FIELD_NAME: query_lemmas,
               TEXT_UNLEMM_FIELD_NAME: query_unlemm,
               TEXT_RAW_FIELD_NAME: query.lower()}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
        if TEXT_BERT_TOKENIZED_NAME in doc:
            query_bert_tok = doc[TEXT_BERT_TOKENIZED_NAME]
        docStr = json.dumps(doc) + '\n'
        outQueries.write(docStr)

        for entry in fields["positive_ctxs"]:
            psgId = get_passage_id(entry)
            outQrels.write(f'{query_idx} 0 {psgId} 1\n')

            if biQuestFiles and biAnswFiles:
                title_text = entry["title"]
                if title_text:
                    _, title_unlemm = nlp.procText(title_text)
                    biQuestFiles[TITLE_UNLEMM_FIELD_NAME].write(query_unlemm + '\n')
                    biAnswFiles[TITLE_UNLEMM_FIELD_NAME].write(title_unlemm + '\n')

                for ctx_sent in sentSplit(entry["text"]):
                    ctx_sent = str(ctx_sent)
                    ctx_sent_lc = ctx_sent.lower()
                    # This can sometimes be a false positive, b/c it doesn't
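This fragment and its snake_case counterpart below walk DPR-style training records: each JSON object carries the question, its answers, and a "positive_ctxs" list whose entries provide a passage id, a "title", and a "text". Only the "positive_ctxs", "title", and "text" accesses are visible in these excerpts; the remaining field names below follow the publicly documented DPR layout, and the values are purely illustrative.

example_dpr_record = {
    "question": "who wrote the declaration of independence",
    "answers": ["Thomas Jefferson"],
    "positive_ctxs": [
        {
            # get_passage_id(entry) presumably reads an id field such as this one
            "passage_id": "18393",
            "title": "United States Declaration of Independence",
            "text": "The United States Declaration of Independence was drafted by Thomas Jefferson ..."
        }
    ]
}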
    query_toks = query_lemmas.split()
    if len(query_toks) >= min_query_tok_qty:
        doc = {DOCID_FIELD: query_idx,
               TEXT_FIELD_NAME: query_lemmas,
               TEXT_UNLEMM_FIELD_NAME: query_unlemm,
               TEXT_RAW_FIELD_NAME: query_orig,
               ANSWER_LIST_FIELD_NAME: answer_list}
        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)
        if TEXT_BERT_TOKENIZED_NAME in doc:
            query_bert_tok = doc[TEXT_BERT_TOKENIZED_NAME]
        doc_str = json.dumps(doc) + '\n'
        out_queries.write(doc_str)

        for entry in fields["positive_ctxs"]:
            psg_id = get_passage_id(entry)
            add_qrel_entry(qrel_dict=glob_qrel_dict, qid=query_idx, did=psg_id, grade=1)

            if bi_quest_files and bi_answ_files:
                title_text = entry["title"]
                if title_text:
                    _, title_unlemm = nlp.proc_text(title_text)
                    bi_quest_files[TITLE_UNLEMM_FIELD_NAME].write(query_unlemm + '\n')
                    bi_answ_files[TITLE_UNLEMM_FIELD_NAME].write(title_unlemm + '\n')
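Unlike the variant above, which writes qrel lines directly, this version routes every positive context through add_qrel_entry and a global dictionary, presumably so that repeated (query, passage) pairs collapse into a single entry written out at the end. A sketch of that idea under those assumptions (all names hypothetical):

def add_qrel_entry_sketch(qrel_dict, qid, did, grade):
    # Key by (query id, document id) so a repeated positive is stored once;
    # if the same pair arrives with different grades, keep the larger one.
    key = (qid, did)
    prev_grade = qrel_dict.get(key)
    if prev_grade is None or grade > prev_grade:
        qrel_dict[key] = grade


def write_qrels_sketch(qrel_dict, out_file_name):
    # Dump the accumulated entries in the same TREC qrel format as above.
    with open(out_file_name, 'w') as f:
        for (qid, did), grade in sorted(qrel_dict.items()):
            f.write(f'{qid} 0 {did} {grade}\n')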
                             bertTokenizer)

        return json.dumps(doc)


inpFile = FileWrapper(args.input_file)
outFile = FileWrapper(args.out_file, 'w')

proc_qty = args.proc_qty
print(f'Spawning {proc_qty} processes')
pool = multiprocessing.Pool(processes=proc_qty)

ln = 0
ln_ign = 0
for docStr in pool.imap(PassParseWorker(), inpFile, IMAP_PROC_CHUNK_QTY):
    ln = ln + 1

    if docStr is not None:
        if docStr:
            outFile.write(docStr + '\n')
        else:
            ln_ign += 1
    else:
        print('Ignoring misformatted line %d' % ln)

    if ln % REPORT_QTY == 0:
        print('Read %d passages, processed %d passages' % (ln, ln - ln_ign))

print('Processed %d passages' % ln)

inpFile.close()
outFile.close()
    doc = json.loads(line)
    textRaw = doc[TEXT_RAW_FIELD_NAME]

    docSents = []
    for oneSent in nlp(textRaw).sents:
        oneSent = replaceCharsNL(str(oneSent)).strip()
        if args.lower_case:
            oneSent = oneSent.lower()
        if oneSent:
            docSents.append(oneSent)

    # Work hard to not write empty documents, b/c it'll upset the pregenerator
    if docSents:
        for oneSent in docSents:
            outFile.write(oneSent + '\n')
        outFile.write('\n')

        docQty += 1
        setQty += 1

        if docQty % REPORT_QTY == 0:
            print('Processed %d docs' % docQty)

        if setQty >= args.max_set_size:
            setQty = 0
            setId += 1
            print('Starting set %d' % setId)
            outFile.close()
            outFile = FileWrapper(outFileName(args.output_pref, setId), 'w')

print('Processed %d docs' % docQty)
print('The list of queries to ignore has %d entries' % len(ignore_queries))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

out_file_queries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')

read_qty = 0
wrote_qty = 0

for e in read_queries(os.path.join(args.input_dir, QUESTION_FILE_JSON)):
    read_qty += 1
    if TEXT_FIELD_NAME not in e:
        continue
    text = e[TEXT_FIELD_NAME]
    if text in ignore_queries:
        print(f"Ignoring a query found in the specified query files: {text}")
        continue
    wrote_qty += 1
    out_file_queries.write(json.dumps(e) + '\n')

ignored_qty = read_qty - wrote_qty
print(f'Wrote {wrote_qty} queries, ignored {ignored_qty} queries')

out_file_queries.close()
        answ_bert_tok = None
        if bert_tokenizer:
            answ_bert_tok = get_retokenized(bert_tokenizer, answ_lc)

        doc = {DOCID_FIELD: aid,
               TEXT_FIELD_NAME: answ_lemmas,
               TEXT_UNLEMM_FIELD_NAME: answ_unlemm,
               TEXT_RAW_FIELD_NAME: answ_orig}
        if answ_bert_tok is not None:
            doc[TEXT_BERT_TOKENIZED_NAME] = answ_bert_tok

        doc_str = json.dumps(doc) + '\n'
        data_answ_file.write(doc_str)

        rel_grade = MAX_RELEV_GRADE - int(i != rec.best_answer_id)
        qrel_file.write(gen_qrel_str(qid, aid, rel_grade) + '\n')

        if bi_quest_files and bi_answ_files:
            bi_quest_files[TEXT_FIELD_NAME].write(question_lemmas + '\n')
            bi_quest_files[TEXT_UNLEMM_FIELD_NAME].write(question_lemmas + '\n')
            bi_answ_files[TEXT_FIELD_NAME].write(answ_lemmas + '\n')
            bi_answ_files[TEXT_UNLEMM_FIELD_NAME].write(answ_lemmas + '\n')

            if bert_tokenizer is not None:
                bi_quest_files[TEXT_BERT_TOKENIZED_NAME].write(question_bert_tok + '\n')
        qrel_list.append(QrelEntry(query_id=qid, doc_id=did, rel_grade=1))

        # Entries are sorted by the query ID
        if prev_qid != qid:
            doc = {DOCID_FIELD: qid,
                   TEXT_FIELD_NAME: query_lemmas,
                   TEXT_UNLEMM_FIELD_NAME: query_unlemm,
                   TEXT_RAW_FIELD_NAME: query_orig}
            add_retokenized_field(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)
            doc_str = json.dumps(doc) + '\n'
            out_file_queries.write(doc_str)

            gen_query_qty += 1
            if gen_query_qty >= max_query_qty:
                break

    prev_qid = qid

    if ln % REPORT_QTY == 0:
        print('Processed %d input lines' % ln)

print('Processed %d input lines' % ln)

write_qrels(qrel_list, out_file_qrels_name)

inp_file.close()
out_file_queries.close()
    doc = json.loads(line)
    text_raw = doc[TEXT_RAW_FIELD_NAME]

    doc_sents = []
    for one_sent in nlp(text_raw).sents:
        one_sent = replace_chars_nl(str(one_sent)).strip()
        if args.lower_case:
            one_sent = one_sent.lower()
        if one_sent:
            doc_sents.append(one_sent)

    # Work hard to not write empty documents, b/c it'll upset the pregenerator
    if doc_sents:
        for one_sent in doc_sents:
            out_file.write(one_sent + '\n')
        out_file.write('\n')

        doc_qty += 1
        set_qty += 1

        if doc_qty % REPORT_QTY == 0:
            print('Processed %d docs' % doc_qty)

        if set_qty >= args.max_set_size:
            set_qty = 0
            set_id += 1
            print('Starting set %d' % set_id)
            out_file.close()
            out_file = FileWrapper(out_file_name(args.output_pref, set_id), 'w')

print('Processed %d docs' % doc_qty)
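Both sentence-splitting fragments iterate over nlp(text_raw).sents, so nlp has to be a spaCy pipeline with sentence boundaries enabled; how the real scripts build it is not shown here. A minimal setup that makes this loop work, assuming the rule-based sentencizer is acceptable (the actual scripts may load a full model instead):

import spacy

nlp = spacy.blank('en')          # blank English pipeline
nlp.add_pipe('sentencizer')      # rule-based sentence boundary detection

for one_sent in nlp('First sentence. Second sentence.').sents:
    print(str(one_sent).strip())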
print('The list of queries to ignore has %d entries' % len(ignoreQueries))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

outFileQueries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')

readQty = 0
wroteQty = 0

for e in readQueries(os.path.join(args.input_dir, QUESTION_FILE_JSON)):
    readQty += 1
    if TEXT_FIELD_NAME not in e:
        continue
    text = e[TEXT_FIELD_NAME]
    if text in ignoreQueries:
        print(f"Ignoring a query found in the specified query files: {text}")
        continue
    wroteQty += 1
    outFileQueries.write(json.dumps(e) + '\n')

ignoredQty = readQty - wroteQty
print(f'Wrote {wroteQty} queries, ignored {ignoredQty} queries')

outFileQueries.close()
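Both filtering fragments read the questions file with readQueries / read_queries and write each kept entry back as one JSON object per line, so the file is evidently in JSONL form. A hedged sketch of such a reader, using a plain open() where the originals use FileWrapper (which presumably also handles compressed files):

import json


def read_queries_sketch(file_name):
    # Yield one parsed JSON entry per non-empty line of a JSONL file.
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)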