class DocParseWorker:
    # Parses a TSV document line: <doc id> <url> <title> <body>
    def __init__(self, stopWords, spacyModel):
        self.nlp = SpacyTextParser(spacyModel, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

    def __call__(self, line):
        if not line:
            return None
        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 4:
            return None

        did, url, title, body = fields

        title_lemmas, title_unlemm = self.nlp.procText(title)
        body_lemmas, body_unlemm = self.nlp.procText(body)

        text = title_lemmas + ' ' + body_lemmas
        text = text.strip()
        text_raw = (title.strip() + ' ' + body.strip()).lower()

        doc = {DOCID_FIELD: did,
               TEXT_FIELD_NAME: text,
               TITLE_UNLEMM_FIELD_NAME: title_unlemm,
               'body': body_unlemm,
               TEXT_RAW_FIELD_NAME: text_raw}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

        docStr = json.dumps(doc) + '\n'

        return docStr
class PassParseWorker:
    # Parses a TSV passage line: <passage id> <passage text>
    def __init__(self, stopWords, spacyModel):
        self.nlp = SpacyTextParser(spacyModel, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

    def __call__(self, line):
        if not line:
            return None
        line = line[:maxDocSize]  # cut passages that are too long!
        fields = line.split('\t')
        if len(fields) != 2:
            return None

        pid, body = fields

        text, text_unlemm = self.nlp.procText(body)

        doc = {DOCID_FIELD: pid,
               TEXT_FIELD_NAME: text,
               TEXT_UNLEMM_FIELD_NAME: text_unlemm,
               TEXT_RAW_FIELD_NAME: body.lower()}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

        return json.dumps(doc) + '\n'
class PassParseWorker:
    # Parses a TSV passage line: <passage id> <text> <title>
    def __init__(self, stopWords, spacyModel):
        # Lower cased
        self.textProcessor = SpacyTextParser(spacyModel, stopWords,
                                             keepOnlyAlphaNum=True, lowerCase=True,
                                             enablePOS=True)

    def __call__(self, line):
        if not line:
            return None
        line = line.strip()
        if not line:
            return None
        fields = line.split('\t')
        # Skip the header line
        if ' '.join(fields) == 'id text title':
            return ''

        assert len(fields) == 3, f"Wrong format line: {line}"

        passId, rawText, title = fields

        textLemmas, textUnlemm = self.textProcessor.procText(rawText)
        titleLemmas, titleUnlemm = self.textProcessor.procText(title)

        doc = {DOCID_FIELD: passId,
               TEXT_FIELD_NAME: titleLemmas + ' ' + textLemmas,
               TITLE_UNLEMM_FIELD_NAME: titleUnlemm,
               TEXT_UNLEMM_FIELD_NAME: textUnlemm,
               TEXT_RAW_FIELD_NAME: titleUnlemm + ' ' + rawText.lower()}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

        return json.dumps(doc)
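# The worker classes above are callables that map a single raw input line to a JSONL
# string (or None/'' for lines that should be skipped), which makes them easy to run in
# parallel. The driver below is only an illustrative sketch, not part of the original
# scripts: the function name, pool size, chunk size, and the use of
# multiprocessing.Pool.imap are all assumptions.
import multiprocessing

def run_parse_worker_sketch(worker, inp_path, out_path, pool_size=4):
    with open(inp_path, 'r', encoding='utf-8') as inp_file, \
         open(out_path, 'w', encoding='utf-8') as out_file, \
         multiprocessing.Pool(pool_size) as pool:
        # Feed raw input lines to the worker processes and write back non-empty results.
        for doc_str in pool.imap(worker, inp_file, chunksize=16):
            if doc_str:
                out_file.write(doc_str if doc_str.endswith('\n') else doc_str + '\n')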
# Input file is a TSV file
ln = 0
for line in inpFile:
    ln += 1
    line = line.strip()
    if not line:
        continue
    fields = line.split('\t')
    if len(fields) != 2:
        print('Misformatted line %d, ignoring:' % ln)
        print(line.replace('\t', '<field delimiter>'))
        continue

    did, query = fields

    query_lemmas, query_unlemm = nlp.procText(query)

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        doc = {DOCID_FIELD: did,
               TEXT_FIELD_NAME: query_lemmas,
               TEXT_UNLEMM_FIELD_NAME: query_unlemm,
               TEXT_RAW_FIELD_NAME: query.lower()}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
        docStr = json.dumps(doc) + '\n'
        outFile.write(docStr)
bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

with FileWrapper(args.output, 'w') as outf:
    for doc in tqdm(inp_data, desc='converting documents'):
        e = {DOCID_FIELD: doc[DOCID_FIELD],
             TEXT_RAW_FIELD_NAME: doc[TEXT_RAW_FIELD_NAME]}

        title_lemmas, _ = nlp.procText(doc[TITLE_FIELD_NAME])
        author_lemmas, _ = nlp.procText(doc[AUTHOR_FIELD_NAME])
        venue_lemmas, _ = nlp.procText(doc[VENUE_FIELD_NAME])
        body_lemmas, _ = nlp.procText(doc[BODY_FIED_NAME])

        e[TEXT_FIELD_NAME] = ' '.join([title_lemmas, author_lemmas, venue_lemmas, body_lemmas])
        e[TITLE_FIELD_NAME] = title_lemmas
        e[AUTHOR_FIELD_NAME] = author_lemmas
        e[VENUE_FIELD_NAME] = venue_lemmas
        e[BODY_FIED_NAME] = body_lemmas

        addRetokenizedField(e, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

        outf.write(json.dumps(e) + '\n')
print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

with FileWrapper(args.output, 'w') as outf:
    qid = 0
    for query in tqdm(inp_data, desc='converting queries'):
        # Cranfield query IDs are all wrong and don't match QRELs:
        # in QRELs a query ID is simply the query's ordinal number,
        # so we renumber the queries here.
        qid += 1

        e = {DOCID_FIELD: str(qid),
             TEXT_RAW_FIELD_NAME: query[TEXT_RAW_FIELD_NAME]}

        body_lemmas, body_unlemm = nlp.procText(query[BODY_FIED_NAME])

        e[TEXT_FIELD_NAME] = body_lemmas
        e[BODY_FIED_NAME] = body_unlemm

        addRetokenizedField(e, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

        outf.write(json.dumps(e) + '\n')
for fn in bitext_fields:
    biQuestFiles[fn] = open(os.path.join(outBitextDir, BITEXT_QUESTION_PREFIX + fn), 'w')
    biAnswFiles[fn] = open(os.path.join(outBitextDir, BITEXT_ANSWER_PREFIX + fn), 'w')

ln = 0
for recStr in SimpleXmlRecIterator(inpFileName, 'document'):
    ln += 1
    try:
        rec = procYahooAnswersRecord(recStr)

        if len(rec.answerList) == 0:
            # Ignore questions without answers
            continue

        question = (rec.subject + ' ' + rec.content).strip()
        qid = rec.uri

        question_lemmas, question_unlemm = nlp.procText(question)

        question = question.lower()  # after NLP

        question_bert_tok = None
        if bertTokenizer:
            question_bert_tok = getRetokenized(bertTokenizer, question)

        doc = {DOCID_FIELD: qid,
               TEXT_FIELD_NAME: question_lemmas,
               TEXT_UNLEMM_FIELD_NAME: question_unlemm,
               TEXT_RAW_FIELD_NAME: question}
        if question_bert_tok is not None:
            doc[TEXT_BERT_TOKENIZED_NAME] = question_bert_tok

        docStr = json.dumps(doc) + '\n'
        dataQuestFile.write(docStr)
biQuestFiles = {}
biAnswFiles = {}

if outBitextDir:
    if not os.path.exists(outBitextDir):
        os.makedirs(outBitextDir)

    for fn in bitext_fields:
        biQuestFiles[fn] = open(os.path.join(outBitextDir, BITEXT_QUESTION_PREFIX + fn), 'w')
        biAnswFiles[fn] = open(os.path.join(outBitextDir, BITEXT_ANSWER_PREFIX + fn), 'w')

for query_idx, fields in tqdm.tqdm(enumerate(dpr_json_reader(inpFile))):
    query = fields["question"]
    answer_list_lc = [s.lower() for s in fields["answers"]]

    query_lemmas, query_unlemm = nlp.procText(query)
    query_bert_tok = None

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        doc = {DOCID_FIELD: query_idx,
               TEXT_FIELD_NAME: query_lemmas,
               TEXT_UNLEMM_FIELD_NAME: query_unlemm,
               TEXT_RAW_FIELD_NAME: query.lower()}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

        if TEXT_BERT_TOKENIZED_NAME in doc:
            query_bert_tok = doc[TEXT_BERT_TOKENIZED_NAME]

        docStr = json.dumps(doc) + '\n'
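# biQuestFiles and biAnswFiles hold one output file per bitext field, question side and
# answer side respectively. The helpers below are only an illustrative sketch of how such
# per-field parallel files are typically used: one aligned line per side for each
# question/answer pair, followed by closing every handle. The function names and the
# field-to-text dictionaries are hypothetical and not part of the original script.
def write_bitext_pair_sketch(biQuestFiles, biAnswFiles, quest_field_text, answ_field_text):
    # quest_field_text / answ_field_text: hypothetical dicts mapping a bitext field name
    # to the processed question / answer text for a single QA pair.
    for fn, quest_text in quest_field_text.items():
        if fn in biQuestFiles and fn in biAnswFiles:
            biQuestFiles[fn].write(quest_text + '\n')
            biAnswFiles[fn].write(answ_field_text[fn] + '\n')

def close_bitext_files_sketch(biQuestFiles, biAnswFiles):
    # Flush and close all per-field bitext handles once processing is done.
    for f in list(biQuestFiles.values()) + list(biAnswFiles.values()):
        f.close()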
docid_to_preds = {}  # doc ID -> all predicted (expansion) queries, space-separated
doc_id_prev = None
predicted_queries = []

for doc_id, predicted_queries_partial in tqdm(zip(FileWrapper(args.doc_ids_path),
                                                  FileWrapper(args.predictions_path)),
                                              desc='reading predictions'):
    doc_id = doc_id.strip()
    if doc_id_prev is not None and doc_id_prev != doc_id:
        if predicted_queries:
            docid_to_preds[doc_id_prev] = ' '.join(predicted_queries).strip()
        predicted_queries = []

    doc_id_prev = doc_id
    # Strip the trailing newline so the joined string stays space-separated
    predicted_queries.append(predicted_queries_partial.strip())

# Not forgetting about the last batch
if predicted_queries and doc_id_prev is not None:
    docid_to_preds[doc_id_prev] = ' '.join(predicted_queries).strip()

with FileWrapper(args.output, 'w') as outf:
    for doce in tqdm(jsonlGen(args.input), desc='adding doc2query fields'):
        doc_id = doce[DOCID_FIELD]
        if doc_id in docid_to_preds:
            text, text_unlemm = nlp.procText(docid_to_preds[doc_id])
            doce[TEXT_FIELD_NAME] = doce[TEXT_FIELD_NAME] + ' ' + text
            doce[DOC2QUERY_FIELD_TEXT] = text
            doce[DOC2QUERY_FIELD_TEXT_UNLEMM] = text_unlemm
        else:
            print(f'WARNING: no predictions for {doc_id}')

        outf.write(json.dumps(doce) + '\n')
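# The aggregation loop above assumes that args.doc_ids_path and args.predictions_path are
# line-aligned and that all predicted queries for one document occupy consecutive lines.
# Under that same assumption, the grouping step can also be expressed with
# itertools.groupby; this is an equivalent illustrative sketch, not the original code.
import itertools

def group_predictions_sketch(doc_id_lines, prediction_lines):
    # Returns {doc_id: 'query_1 query_2 ...'} for contiguous runs of the same doc ID.
    pairs = zip((s.strip() for s in doc_id_lines),
                (s.strip() for s in prediction_lines))
    return {doc_id: ' '.join(pred for _, pred in group)
            for doc_id, group in itertools.groupby(pairs, key=lambda p: p[0])}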