class DocParseWorker:
    def __init__(self, stopWords, spacyModel):
        self.nlp = SpacyTextParser(spacyModel, stopWords,
                                   keepOnlyAlphaNum=True, lowerCase=True)

    def __call__(self, line):
        if not line:
            return None
        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 4:
            return None

        did, url, title, body = fields

        title_lemmas, title_unlemm = self.nlp.procText(title)
        body_lemmas, body_unlemm = self.nlp.procText(body)

        text = title_lemmas + ' ' + body_lemmas
        text = text.strip()
        text_raw = (title.strip() + ' ' + body.strip()).lower()

        doc = {DOCID_FIELD: did,
               TEXT_FIELD_NAME: text,
               TITLE_UNLEMM_FIELD_NAME: title_unlemm,
               'body': body_unlemm,
               TEXT_RAW_FIELD_NAME: text_raw}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

        docStr = json.dumps(doc) + '\n'
        return docStr
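# Hedged usage sketch (not part of the original script): one plausible way to
# drive DocParseWorker over the raw TSV collection with a process pool. The
# handle names (inp_file, out_file), the pool size, and the chunksize are
# illustrative assumptions only.
import multiprocessing

def run_doc_parse(inp_file, out_file, stopWords, spacyModel, proc_qty=4):
    worker = DocParseWorker(stopWords, spacyModel)
    with multiprocessing.Pool(processes=proc_qty) as pool:
        for doc_str in pool.imap(worker, inp_file, chunksize=16):
            if doc_str is not None:
                out_file.write(doc_str)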
def __init__(self, stopWords, spacyModel):
    # Lower cased
    self.textProcessor = SpacyTextParser(spacyModel, stopWords,
                                         keepOnlyAlphaNum=True, lowerCase=True, enablePOS=True)
class PassParseWorker:
    def __init__(self, stopWords, spacyModel):
        self.nlp = SpacyTextParser(spacyModel, stopWords,
                                   keepOnlyAlphaNum=True, lowerCase=True)

    def __call__(self, line):
        if not line:
            return None
        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 2:
            return None

        pid, body = fields

        text, text_unlemm = self.nlp.procText(body)

        doc = {DOCID_FIELD: pid,
               TEXT_FIELD_NAME: text,
               TEXT_UNLEMM_FIELD_NAME: text_unlemm,
               TEXT_RAW_FIELD_NAME: body.lower()}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

        return json.dumps(doc) + '\n'
class PassParseWorker:
    def __init__(self, stopWords, spacyModel):
        # Lower cased
        self.textProcessor = SpacyTextParser(spacyModel, stopWords,
                                             keepOnlyAlphaNum=True, lowerCase=True, enablePOS=True)

    def __call__(self, line):
        if not line:
            return None
        line = line.strip()
        if not line:
            return None
        fields = line.split('\t')
        if ' '.join(fields) == 'id text title':
            return ''
        assert len(fields) == 3, f"Wrong format, line: {line}"

        passId, rawText, title = fields

        textLemmas, textUnlemm = self.textProcessor.procText(rawText)
        titleLemmas, titleUnlemm = self.textProcessor.procText(title)

        doc = {DOCID_FIELD: passId,
               TEXT_FIELD_NAME: titleLemmas + ' ' + textLemmas,
               TITLE_UNLEMM_FIELD_NAME: titleUnlemm,
               TEXT_UNLEMM_FIELD_NAME: textUnlemm,
               TEXT_RAW_FIELD_NAME: titleUnlemm + ' ' + rawText.lower()}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

        return json.dumps(doc)
def __init__(self, stopWords, spacyModel):
    self.nlp = SpacyTextParser(spacyModel, stopWords,
                               keepOnlyAlphaNum=True, lowerCase=True)
print(args)
arg_vars = vars(args)

inp_data = read_cranfield_data(args.input)

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
#print(stopWords)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

with FileWrapper(args.output, 'w') as outf:
    qid = 0
    for query in tqdm(inp_data, desc='converting queries'):
        # Cranfield query IDs are all wrong and don't match QRELs:
        # in QRELs a query ID is simply the query's ordinal number.
        qid += 1
        e = {DOCID_FIELD: str(qid),
             TEXT_RAW_FIELD_NAME: query[TEXT_RAW_FIELD_NAME]}

        body_lemmas, body_unlemm = nlp.procText(query[BODY_FIED_NAME])
bert_tokenizer = BertTokenizer.from_pretrained(BERT_BASE_MODEL)

# Lower cased
stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
print(stop_words)

flt_pass_ids = None
if args.passage_ids is not None:
    flt_pass_ids = set(np.load(args.passage_ids))
    print(f'Restricting parsing to {len(flt_pass_ids)} passage IDs')

fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME, TEXT_RAW_FIELD_NAME]

# Lower cased
text_processor = SpacyTextParser(SPACY_MODEL, stop_words,
                                 keep_only_alpha_num=True, lower_case=True, enable_pos=True)


class PassParseWorker:
    def __call__(self, line):
        if not line:
            return None
        line = line.strip()
        if not line:
            return None
        fields = line.split('\t')
        if ' '.join(fields) == 'id text title':
            return ''
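# Hedged sketch (assumption, not shown in this excerpt): one plausible way the
# optional flt_pass_ids filter could be applied to a parsed 'id text title'
# line before any spaCy processing. The helper name is hypothetical.
def keep_passage(fields):
    # fields[0] is the passage ID column
    return flt_pass_ids is None or fields[0] in flt_pass_ids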
bitext_fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME]

if BERT_TOK_OPT in arg_vars:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)
    bitext_fields.append(TEXT_BERT_TOKENIZED_NAME)

if not os.path.exists(outMainDir):
    os.makedirs(outMainDir)

biQuestFiles = {}
biAnswFiles = {}

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True, enablePOS=False)

dataQuestFile = open(os.path.join(outMainDir, QUESTION_FILE_JSON), 'w')
# File wrapper can handle output gz files
dataAnswFile = FileWrapper(os.path.join(outMainDir, ANSWER_FILE_JSON), flags='w')
qrelFile = open(os.path.join(outMainDir, QREL_FILE), 'w')

if outBitextDir:
    if not os.path.exists(outBitextDir):
        os.makedirs(outBitextDir)

    for fn in bitext_fields:
        biQuestFiles[fn] = open(os.path.join(outBitextDir, BITEXT_QUESTION_PREFIX + fn), 'w')
        biAnswFiles[fn] = open(os.path.join(outBitextDir, BITEXT_ANSWER_PREFIX + fn), 'w')

ln = 0
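# Hedged sketch (assumption): the per-field bitext files opened above are
# presumably written as parallel question/answer lines, one pair per training
# example; the helper below is illustrative only.
def write_bitext_pair(quest_text_by_field, answ_text_by_field):
    for fn, quest_f in biQuestFiles.items():
        quest_f.write(quest_text_by_field[fn] + '\n')
        biAnswFiles[fn].write(answ_text_by_field[fn] + '\n')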
parser.add_argument('--' + BERT_TOK_OPT, action='store_true', help=BERT_TOK_OPT_HELP)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

inpFile = FileWrapper(args.input)
outFile = FileWrapper(args.output, 'w')
minQueryTokQty = args.min_query_token_qty

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

# Input file is a TSV file
ln = 0
for line in inpFile:
    ln += 1
    line = line.strip()
    if not line:
        continue
    fields = line.split('\t')
    args = parser.parse_args()

    return args


args = parse_args()
arg_vars = vars(args)

inpFile = FileWrapper(args.input)

outQueries = FileWrapper(args.output_queries, 'w')
outQrels = FileWrapper(args.output_qrels, 'w')

minQueryTokQty = args.min_query_token_qty
usePrecomputedNegatives = args.use_precomputed_negatives

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
outBitextDir = arg_vars[OUT_BITEXT_PATH_OPT]

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)
sentSplit = Sentencizer(SPACY_MODEL)

bitext_fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME]

bertTokenizer = None
if BERT_TOK_OPT in arg_vars:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)
    bitext_fields.append(TEXT_BERT_TOKENIZED_NAME)

biQuestFiles = {}
biAnswFiles = {}

if outBitextDir:
    if not os.path.exists(outBitextDir):
        os.makedirs(outBitextDir)
print(args)
arg_vars = vars(args)

inp_data = read_cranfield_data(args.input)

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
#print(stop_words)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL, stop_words, keep_only_alpha_num=True, lower_case=True)

with FileWrapper(args.output, 'w') as outf:
    qid = 0
    for query in tqdm(inp_data, desc='converting queries'):
        # Cranfield query IDs are all wrong and don't match QRELs:
        # in QRELs a query ID is simply the query's ordinal number.
        qid += 1
        e = {DOCID_FIELD: str(qid),
             TEXT_RAW_FIELD_NAME: query[TEXT_RAW_FIELD_NAME]}

        body_lemmas, body_unlemm = nlp.proc_text(query[BODY_FIED_NAME])
                    default=1000_000,
                    help='the maximum size of a set (in documents)',
                    type=int)
parser.add_argument('--lower_case',
                    help='lowercase text',
                    action='store_true', default=False)

args = parser.parse_args()
print(args)

docQty = 0
setQty = 0
setId = 0

inpFile = FileWrapper(args.input)

nlp = SpacyTextParser(SPACY_MODEL, [], sentSplit=True)


def outFileName(pref, num):
    return pref + str(num) + '.txt'


print('Starting set 0')
outFile = FileWrapper(outFileName(args.output_pref, setId), 'w')

for line in inpFile:
    doc = json.loads(line)
    textRaw = doc[TEXT_RAW_FIELD_NAME]

    docSents = []
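# Hedged sketch (assumption, not from the original loop): after sentence
# splitting, the loop presumably counts documents and rotates to a new output
# file once the set-size limit is reached; the option name `max_set_size` is a
# guess at the truncated argument above.
#
#     docQty += 1
#     setQty += 1
#     if setQty >= args.max_set_size:
#         outFile.close()
#         setQty = 0
#         setId += 1
#         print('Starting set %d' % setId)
#         outFile = FileWrapper(outFileName(args.output_pref, setId), 'w')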
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)
    bitext_fields.append(TEXT_BERT_TOKENIZED_NAME)

if not os.path.exists(out_main_dir):
    os.makedirs(out_main_dir)

bi_quest_files = {}
bi_answ_files = {}

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
print(stop_words)

nlp = SpacyTextParser(SPACY_MODEL, stop_words, keep_only_alpha_num=True, lower_case=True, enable_pos=False)

data_quest_file = open(os.path.join(out_main_dir, QUESTION_FILE_JSON), 'w')
# File wrapper can handle output gz files
data_answ_file = FileWrapper(os.path.join(out_main_dir, ANSWER_FILE_JSON), flags='w')
qrel_file = open(os.path.join(out_main_dir, QREL_FILE), 'w')

if out_bitext_dir:
    if not os.path.exists(out_bitext_dir):
        os.makedirs(out_bitext_dir)

    for fn in bitext_fields:
        bi_quest_files[fn] = open(os.path.join(out_bitext_dir, BITEXT_QUESTION_PREFIX + fn), 'w')
inpFile = FileWrapper(args.input)
outFile = FileWrapper(args.output, 'w')
maxDocSize = args.max_doc_size

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)

bertTokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)


class DocParseWorker:
    def __call__(self, line):
        if not line:
            return None
        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 4:
            return None

        did, url, title, body = fields
    args = parser.parse_args()

    return args


args = parse_args()
arg_vars = vars(args)

inp_file = FileWrapper(args.input)

out_queries = FileWrapper(args.output_queries, 'w')

min_query_tok_qty = args.min_query_token_qty
use_precomputed_negatives = args.use_precomputed_negatives

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
out_bitext_dir = arg_vars[OUT_BITEXT_PATH_OPT]

nlp = SpacyTextParser(SPACY_MODEL, stop_words, keep_only_alpha_num=True, lower_case=True)
sent_split = Sentencizer(SPACY_MODEL)

bitext_fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME]

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)
    bitext_fields.append(TEXT_BERT_TOKENIZED_NAME)

bi_quest_files = {}
print(args)
arg_vars = vars(args)

inp_data = read_cranfield_data(args.input)

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
#print(stopWords)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

with FileWrapper(args.output, 'w') as outf:
    for doc in tqdm(inp_data, desc='converting documents'):
        e = {DOCID_FIELD: doc[DOCID_FIELD],
             TEXT_RAW_FIELD_NAME: doc[TEXT_RAW_FIELD_NAME]}

        title_lemmas, _ = nlp.procText(doc[TITLE_FIELD_NAME])
        author_lemmas, _ = nlp.procText(doc[AUTHOR_FIELD_NAME])
        venue_lemmas, _ = nlp.procText(doc[VENUE_FIELD_NAME])
        body_lemmas, _ = nlp.procText(doc[BODY_FIED_NAME])

        e[TEXT_FIELD_NAME] = ' '.join([title_lemmas, author_lemmas, venue_lemmas, body_lemmas])
                    required=True)
parser.add_argument('--bert_tok_qty', metavar='max # BERT toks.',
                    help='max # of BERT tokens in a piece.',
                    type=int, default=288)
parser.add_argument('--proc_qty', metavar='# of processes',
                    help='# of parallel processes',
                    type=int, required=True)

args = parser.parse_args()
print(args)

# Lower cased
stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)

# Lower cased
textProcessor = SpacyTextParser(SPACY_MODEL, stopWords,
                                sentSplit=True,
                                keepOnlyAlphaNum=True, lowerCase=True,
                                enablePOS=False)

maxBertTokQty = args.bert_tok_qty
tokenizer = BertTokenizer.from_pretrained(BERT_BASE_MODEL, do_lower_case=True)
tempFilePref = args.temp_file_pref
procQty = args.proc_qty

fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME, TEXT_RAW_FIELD_NAME]


class FakeSentence:
    def __repr__(self):
        return '[%s,%d]' % (self.start_char, self.end_char)
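# Hedged illustration (assumption): FakeSentence appears to mimic a spaCy Span
# just enough to carry character offsets; the helper below shows one minimal
# way such an object could be populated (the helper name is hypothetical).
def make_fake_sentence(start_char, end_char):
    s = FakeSentence()
    s.start_char = start_char
    s.end_char = end_char
    return s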
inp_file = FileWrapper(args.input)
out_file = FileWrapper(args.output, 'w')
max_doc_size = args.max_doc_size

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
print(stop_words)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL, stop_words, keep_only_alpha_num=True, lower_case=True)


class DocParseWorker:
    def __call__(self, line):
        if not line:
            return None
        line = line[:max_doc_size]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 4:
            return None

        did, url, title, body = fields
                    help='File mapping segments to doc ids.')
parser.add_argument('--predictions_path', required=True,
                    metavar='doc2query predictions',
                    help='File containing predicted queries.')

docid_to_preds = {}

args = parser.parse_args()
print(args)

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

doc_id_prev = None
predicted_queries = []

for doc_id, predicted_queries_partial in tqdm(zip(FileWrapper(args.doc_ids_path),
                                                  FileWrapper(args.predictions_path)),
                                              desc='reading predictions'):
    doc_id = doc_id.strip()
    if doc_id_prev is not None and doc_id_prev != doc_id:
        if predicted_queries and doc_id_prev is not None:
            docid_to_preds[doc_id_prev] = ' '.join(predicted_queries).strip()
        predicted_queries = []

    doc_id_prev = doc_id
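# Hedged sketch (assumption): in a doc2query setup, the accumulated
# docid_to_preds map is typically used to append predicted queries to each
# document before re-indexing; the input line and output handle below are
# hypothetical.
def expand_doc(doc_json_line, out_file):
    doc = json.loads(doc_json_line)
    preds = docid_to_preds.get(doc[DOCID_FIELD], '')
    if preds:
        pred_lemmas, _ = nlp.procText(preds)
        doc[TEXT_FIELD_NAME] = (doc[TEXT_FIELD_NAME] + ' ' + pred_lemmas).strip()
    out_file.write(json.dumps(doc) + '\n')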