def __call__(self, line):
    """Convert one TSV passage record ('id<TAB>text<TAB>title') to a JSON string.

    Returns:
        None for empty/blank input,
        '' for the header row or for passages filtered out by ``fltPassIds``,
        otherwise a JSON document string (no trailing newline).

    Raises:
        ValueError: if the line does not have exactly 3 tab-separated fields.
    """
    if not line:
        return None
    line = line.strip()
    if not line:
        return None
    fields = line.split('\t')
    # Skip the TSV header row.
    if ' '.join(fields) == 'id text title':
        return ''
    # Fix: was `assert len(fields) == 3, ...` — asserts are stripped under
    # `python -O`, which would turn a malformed line into a silent mis-unpack.
    # Raise explicitly so the format check always runs.
    if len(fields) != 3:
        raise ValueError(f"Wrong format fline: {line}")
    passId, rawText, title = fields
    # Optional whitelist of passage IDs; None means "keep everything".
    if fltPassIds is not None:
        if passId not in fltPassIds:
            return ''
    textLemmas, textUnlemm = textProcessor.procText(rawText)
    titleLemmas, titleUnlemm = textProcessor.procText(title)
    doc = {DOCID_FIELD: passId,
           TEXT_FIELD_NAME: titleLemmas + ' ' + textLemmas,
           TITLE_UNLEMM_FIELD_NAME: titleUnlemm,
           TEXT_UNLEMM_FIELD_NAME: textUnlemm,
           TEXT_RAW_FIELD_NAME: titleUnlemm + ' ' + rawText.lower()}
    addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
    return json.dumps(doc)
def __call__(self, line):
    """Turn one TSV document record ('did<TAB>url<TAB>title<TAB>body') into a
    newline-terminated JSON string ready for indexing.

    Overlong input is truncated to ``maxDocSize`` characters before parsing.
    Returns None for empty input or for lines that do not split into exactly
    four tab-separated fields.
    """
    if not line:
        return None
    # Cut documents that are too long before any further processing.
    fields = line[:maxDocSize].split('\t')
    if len(fields) != 4:
        return None
    did, url, title, body = fields

    # Lemmatized / unlemmatized variants for each textual field.
    urlLemmas, urlUnlemm = nlp.procText(pretokenizeUrl(url))
    titleLemmas, titleUnlemm = nlp.procText(title)
    bodyLemmas, bodyUnlemm = nlp.procText(body)

    # Main searchable text = lemmatized title followed by lemmatized body.
    mainText = ' '.join([titleLemmas, bodyLemmas]).strip()
    rawText = ' '.join([title.strip(), body.strip()]).lower()

    doc = {
        DOCID_FIELD: did,
        'url': urlLemmas,
        'url_unlemm': urlUnlemm,
        TEXT_FIELD_NAME: mainText,
        TITLE_FIELD_NAME: titleLemmas,
        TITLE_UNLEMM_FIELD_NAME: titleUnlemm,
        'body': bodyUnlemm,
        TEXT_RAW_FIELD_NAME: rawText,
    }
    addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
    return json.dumps(doc) + '\n'
def __call__(self, line):
    """Turn one TSV passage record ('pid<TAB>body') into a newline-terminated
    JSON string ready for indexing.

    Overlong input is truncated to ``maxDocSize`` characters before parsing.
    Returns None for empty input or for lines that do not split into exactly
    two tab-separated fields.
    """
    if not line:
        return None
    # Cut documents that are too long before any further processing.
    fields = line[:maxDocSize].split('\t')
    if len(fields) != 2:
        return None
    pid, body = fields

    lemmas, unlemm = self.nlp.procText(body)
    doc = {
        DOCID_FIELD: pid,
        TEXT_FIELD_NAME: lemmas,
        TEXT_UNLEMM_FIELD_NAME: unlemm,
        TEXT_RAW_FIELD_NAME: body.lower(),
    }
    addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
    return json.dumps(doc) + '\n'
# NOTE(review): this is the body of an enclosing read loop that is not visible
# here — `ln` (line number), `line`, `fields`, `inpFile`, and `outFile` are all
# bound by that outer scope, and `continue` targets that loop.
# Skip lines that do not have exactly two tab-separated fields, logging them.
if len(fields) != 2:
    print('Misformated line %d ignoring:' % ln)
    # Make the tab positions visible in the diagnostic output.
    print(line.replace('\t', '<field delimiter>'))
    continue
did, query = fields
# Lemmatized and unlemmatized versions of the query text.
query_lemmas, query_unlemm = nlp.procText(query)
query_toks = query_lemmas.split()
# Only keep queries with at least `minQueryTokQty` lemmatized tokens.
if len(query_toks) >= minQueryTokQty:
    doc = {DOCID_FIELD: did,
           TEXT_FIELD_NAME: query_lemmas,
           TEXT_UNLEMM_FIELD_NAME: query_unlemm,
           TEXT_RAW_FIELD_NAME: query.lower()}
    addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
    docStr = json.dumps(doc) + '\n'
    outFile.write(docStr)
# Periodic progress report (still inside the enclosing loop).
if ln % REPORT_QTY == 0:
    print('Processed %d queries' % ln)
# --- after the enclosing loop: final report and cleanup ---
print('Processed %d queries' % ln)
inpFile.close()
outFile.close()