import json


def process(line):
    """Convert one JSON-line document into an indexable record."""
    if not line:
        return None
    json_line = json.loads(line)
    pid = json_line['id']
    body = json_line['contents']
    # url = json_line['url']
    # title = json_line['title']

    # Lemmatized and unlemmatized token streams from the shared NLP helper.
    text, text_unlemm = nlp.proc_text(body)
    # _, title_unlemm = nlp.proc_text(title)

    # Analyzer tokens must be whitespace-free so they can be joined into a
    # single space-separated "contents" field.
    analyzed = analyzer.analyze(body)
    for token in analyzed:
        assert ' ' not in token
    contents = ' '.join(analyzed)

    doc = {
        "id": pid,
        "text": text,
        "text_unlemm": text_unlemm,
        "contents": contents,
        # "title_unlemm": title_unlemm,
        # "url": url,
        "raw": body,
    }
    # Truncate to the first 512 characters before BERT tokenization; slicing
    # is a no-op for shorter bodies, so the original if/else branch is unneeded.
    doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower()[:512])
    return doc
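# For context, a minimal driver sketch showing how a line-wise converter like
# `process` above is commonly run over a JSONL corpus with a worker pool. The
# `convert_collection` name, paths, pool size, and chunksize are illustrative
# assumptions, not part of the original script; `nlp`, `analyzer`, and
# `bert_tokenizer` are assumed to be module-level objects that forked workers
# inherit.
import multiprocessing


def convert_collection(inp_path, out_path, workers=4):
    # imap keeps output order aligned with the input file while spreading
    # per-line work across processes.
    with open(inp_path) as inp, open(out_path, 'w') as out, \
            multiprocessing.Pool(workers) as pool:
        for doc in pool.imap(process, inp, chunksize=64):
            if doc is not None:
                out.write(json.dumps(doc) + '\n')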
def process(line):
    """Convert one tab-separated line (id, body) into an indexable record."""
    if not line:
        return None
    line = line[:maxDocSize]  # cut documents that are too long!
    fields = line.split('\t')
    if len(fields) != 2:
        return None
    pid, body = fields

    text, text_unlemm = nlp.proc_text(body)

    # Optional spaCy entity extraction, left disabled:
    # doc = nlp_ent(body)
    # entity = {}
    # for i in range(len(doc.ents)):
    #     entity[doc.ents[i].text] = doc.ents[i].label_
    # entity = json.dumps(entity)

    analyzed = analyzer.analyze(body)
    for token in analyzed:
        assert ' ' not in token
    contents = ' '.join(analyzed)

    doc = {
        "id": pid,
        "text": text,
        "text_unlemm": text_unlemm,
        "contents": contents,
        "raw": body,
    }
    doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower())
    return doc
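# A small, hypothetical smoke test for the TSV variant above; the document ID
# and body are made up, and `maxDocSize`, `nlp`, `analyzer`, and
# `bert_tokenizer` must already be set up as in the surrounding script.
def _demo_tsv_process():
    sample = "D12345\tNeural rankers combine lexical and learned signals."
    rec = process(sample)
    assert rec["id"] == "D12345"
    assert rec["raw"].startswith("Neural rankers")
    # Malformed lines (not exactly two tab-separated fields) are dropped.
    assert process("no tab here") is None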
# Per-query conversion loop. NOTE: the loop header and the parsing of each
# input line into did, query, query_lemmas, query_unlemm, and analyzed are
# not shown in this snippet; the header below is an assumed reconstruction.
for ln, line in enumerate(inpFile, start=1):
    # ... parse `line` into did, query, query_lemmas, query_unlemm, analyzed ...
    query_toks = query_lemmas.split()

    # Named-entity extraction over the raw query text (spaCy pipeline); a
    # separate name keeps the spaCy Doc from shadowing the output dict below.
    spacy_doc = nlp_ent(query)
    entity = {ent.text: ent.label_ for ent in spacy_doc.ents}
    entity = json.dumps(entity)

    # Skip queries with too few lemmatized tokens.
    if len(query_toks) >= minQueryTokQty:
        doc = {
            "id": did,
            "text": query_lemmas,
            "text_unlemm": query_unlemm,
            "analyzed": ' '.join(analyzed),
            "entity": entity,
            "raw": query,
        }
        doc["text_bert_tok"] = get_retokenized(bert_tokenizer, query.lower())
        outFile.write(json.dumps(doc) + '\n')

        if ln % 10000 == 0:
            print('Processed %d queries' % ln)

print('Processed %d queries' % ln)
inpFile.close()
outFile.close()
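# The `.ents` / `.label_` accessors above match spaCy's NER API, so `nlp_ent`
# is presumably a spaCy pipeline. A minimal sketch of how such a pipeline and
# the entity map could be set up; the model name is an assumption, and any
# spaCy model with an NER component would do.
import spacy

nlp_ent = spacy.load("en_core_web_sm")


def extract_entities(text):
    """Serialize a map from each recognized entity span to its NER label."""
    doc = nlp_ent(text)
    return json.dumps({ent.text: ent.label_ for ent in doc.ents})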