def generate(self, qids) -> Iterator[SRPerQuery]:
    for qid in qids:
        query_tokens = self.resource.get_q_tokens(qid)
        # Room left for document content after the query tokens and the
        # special tokens of a BERT-style input.
        content_len = self.max_seq_length - 3 - len(query_tokens)
        try:
            docs: List[MSMarcoDoc] = load_per_query_docs(qid, None)
            docs_d = {d.doc_id: d for d in docs}
            sr_per_query_doc_list = []
            for doc_id in self.resource.get_doc_for_query_d()[qid]:
                label = self.resource.get_label(qid, doc_id)
                try:
                    doc = docs_d[doc_id]
                    segs: List[SegmentRepresentation] = self.get_segs(query_tokens, doc, content_len)
                    sr_per_query_doc = SRPerQueryDoc(doc_id, segs, label)
                    sr_per_query_doc_list.append(sr_per_query_doc)
                except KeyError:
                    # Document listed for this query but missing from the loaded docs.
                    pass
            sr_per_query = SRPerQuery(qid, sr_per_query_doc_list)
            yield sr_per_query
        except KeyError as e:
            print(e)
            print(doc_id)
        except FileNotFoundError as e:
            print(e)
            print(qid)
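# A minimal sketch of how the iterator above might be consumed, assuming
# SRPerQuery keeps the qid and per-doc list handed to its constructor and each
# SRPerQueryDoc keeps its segs; the attribute names used here are illustrative,
# not confirmed by the source.
def count_segments(generator, qids):
    n_docs = 0
    n_segs = 0
    for sr_per_query in generator.generate(qids):
        for per_doc in sr_per_query.sr_per_query_doc_list:  # assumed attribute name
            n_docs += 1
            n_segs += len(per_doc.segs)  # assumed attribute name
    return n_docs, n_segs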
def work(self, job_id):
    qid_list = self.query_group[job_id]
    missing_rel_cnt = 0
    missing_nrel_cnt = 0

    def empty_doc_fn(query_id, doc_id):
        rel_docs = self.ms_reader.qrel[query_id]
        nonlocal missing_rel_cnt
        nonlocal missing_nrel_cnt
        if doc_id in rel_docs:
            missing_rel_cnt += 1
        else:
            missing_nrel_cnt += 1

    for qid in qid_list:
        docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
        if qid not in self.candidate_docs_d:
            continue
        target_docs = self.candidate_docs_d[qid]
        tokens_d = {}
        for d in docs:
            if d.doc_id in target_docs:
                tokens_d[d.doc_id] = []
        if len(tokens_d) < len(target_docs):
            log_variables(job_id, qid, tokens_d, target_docs)
            not_found_docs = [doc_id for doc_id in target_docs if doc_id not in tokens_d]
            print("{} of {} not found: {}".format(len(not_found_docs), len(target_docs), not_found_docs))
def work(self, job_id):
    qid_list = self.query_group[job_id]
    ticker = TimeEstimator(len(qid_list))
    missing_rel_cnt = 0
    missing_nrel_cnt = 0

    def empty_doc_fn(query_id, doc_id):
        rel_docs = self.ms_reader.qrel[query_id]
        nonlocal missing_rel_cnt
        nonlocal missing_nrel_cnt
        if doc_id in rel_docs:
            missing_rel_cnt += 1
        else:
            missing_nrel_cnt += 1

    for qid in qid_list:
        if qid not in self.candidate_docs_d:
            continue
        docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
        ticker.tick()
        target_docs = self.candidate_docs_d[qid]
        text_d = {}
        bert_tokens_d = {}
        stemmed_tokens_d = {}
        for d in docs:
            if d.doc_id in target_docs:
                title = crop_to_space(d.title, self.max_title_length)
                body_sents = sent_tokenize(d.body)
                new_body_sents = self.resplit_body_sents(body_sents)
                text_d[d.doc_id] = title, new_body_sents
                # Build both BERT subword tokens and stemmed tokens for the same text.
                for tokenize_fn, save_dict in [
                        (self.bert_tokenizer.tokenize, bert_tokens_d),
                        (self.stem_tokenizer.tokenize_stem, stemmed_tokens_d)]:
                    title_tokens = tokenize_fn(title)
                    body_tokens_list = lmap(tokenize_fn, new_body_sents)
                    save_dict[d.doc_id] = (title_tokens, body_tokens_list)

        todo = [
            (text_d, self.text_dir_name),
            (bert_tokens_d, self.bert_tokens_dir_name),
            (stemmed_tokens_d, self.stemmed_tokens_dir_name),
        ]
        for tokens_d, dir_name in todo:
            save_path = os.path.join(self.out_dir, dir_name, str(qid))
            with open(save_path, "wb") as f:
                pickle.dump(tokens_d, f)
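# Hedged sketch of reading back one of the per-query pickles written above; it
# mirrors the out_dir/<dir_name>/<qid> layout used in work(), but this loader
# itself is illustrative and not part of the source.
import os
import pickle

def load_per_query_dict(out_dir, dir_name, qid):
    save_path = os.path.join(out_dir, dir_name, str(qid))
    with open(save_path, "rb") as f:
        return pickle.load(f)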
def get_todo() -> List[Tuple[QueryID, MSMarcoDoc]]:
    print("get_todo()")
    doc_queries = load_train_queries()
    doc_qrels: Dict[QueryID, List[str]] = load_msmarco_raw_qrels("train")
    todo: List[Tuple[QueryID, MSMarcoDoc]] = []
    doc_id_to_find = []
    n_item = 1000
    for qid, q_text in doc_queries[:n_item]:
        docs = load_per_query_docs(qid, None)
        for doc in docs:
            if doc.doc_id in doc_qrels[qid]:
                todo.append((qid, doc))
                doc_id_to_find.append(doc.doc_id)
    return todo
def generate(self, data_id_manager, qids):
    missing_cnt = 0
    success_docs = 0
    missing_doc_qid = []
    ticker = TimeEstimator(len(qids))
    for qid in qids:
        if qid not in self.resource.get_doc_for_query_d():
            continue
        ticker.tick()
        docs: List[MSMarcoDoc] = load_per_query_docs(qid, None)
        docs_d = {d.doc_id: d for d in docs}
        q_tokens = self.resource.get_q_tokens(qid)
        pos_doc_id_list, neg_doc_id_list \
            = get_pos_neg_doc_ids_for_qid(self.resource, qid)

        def iter_passages(doc_id):
            doc = docs_d[doc_id]
            insts: List[Tuple[List, List]] = self.encoder.encode(q_tokens, doc.title, doc.body)
            for passage_idx, passage in enumerate(insts):
                yield passage

        # Pair every passage of a positive document with every passage of one
        # sampled negative document.
        for pos_doc_id in pos_doc_id_list:
            sampled_neg_doc_id = pick1(neg_doc_id_list)
            try:
                for passage_idx1, passage1 in enumerate(iter_passages(pos_doc_id)):
                    for passage_idx2, passage2 in enumerate(iter_passages(sampled_neg_doc_id)):
                        tokens_seg1, seg_ids1 = passage1
                        tokens_seg2, seg_ids2 = passage2
                        data_id = data_id_manager.assign({
                            'doc_id1': pos_doc_id,
                            'passage_idx1': passage_idx1,
                            'doc_id2': sampled_neg_doc_id,
                            'passage_idx2': passage_idx2,
                        })
                        inst = PairedInstance(tokens_seg1, seg_ids1, tokens_seg2, seg_ids2, data_id)
                        yield inst
                success_docs += 1
            except KeyError:
                missing_cnt += 1
                missing_doc_qid.append(qid)
                if missing_cnt > 10:
                    print(missing_doc_qid)
                    print("success: ", success_docs)
                    raise
def main():
    split = "dev"
    query_d = dict(load_queries(split))
    bm25_module = get_bm25_module()
    ranked_list_path = at_working_dir("msmarco-doc{}-top100".format(split))
    run_name = "BM25_df100"
    rlg = load_ranked_list_grouped(ranked_list_path)
    save_path = at_output_dir("ranked_list", "mmd_dev_{}.txt".format(run_name))
    te = TimeEstimator(100)
    out_entries = []
    for query_id, entries in rlg.items():
        doc_ids = [e.doc_id for e in entries]
        docs = load_per_query_docs(query_id, None)
        found_doc_ids = [d.doc_id for d in docs]
        not_found_doc_ids = [doc_id for doc_id in doc_ids if doc_id not in found_doc_ids]
        if not_found_doc_ids:
            print("{} docs not found".format(len(not_found_doc_ids)))
        query_text = query_d[QueryID(query_id)]

        def score(doc: MSMarcoDoc):
            content = doc.title + " " + doc.body
            return bm25_module.score(query_text, content)

        # Rerank the retrieved documents for this query by BM25 score.
        scored_docs = [(d, score(d)) for d in docs]
        scored_docs.sort(key=get_second, reverse=True)
        reranked_entries = []
        for rank, (doc, doc_score) in enumerate(scored_docs):
            e = TrecRankedListEntry(query_id, doc.doc_id, rank, doc_score, run_name)
            reranked_entries.append(e)
        out_entries.extend(reranked_entries)
        te.tick()
        if len(out_entries) > 100 * 100:
            break
    write_trec_ranked_list_entry(out_entries, save_path)
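# For reference, a self-contained sketch of the Okapi BM25 score that a module
# like the one returned by get_bm25_module() typically computes. The idf table,
# tokenization, and parameter defaults are illustrative assumptions, not the
# project's actual implementation.
from collections import Counter

def bm25_score(query_tokens, doc_tokens, idf, avg_doc_len, k1=1.2, b=0.75):
    tf = Counter(doc_tokens)
    dl = len(doc_tokens)
    total = 0.0
    for term in query_tokens:
        if term not in tf:
            continue
        # Saturated term-frequency component with document-length normalization.
        norm = tf[term] * (k1 + 1) / (tf[term] + k1 * (1 - b + b * dl / avg_doc_len))
        total += idf.get(term, 0.0) * norm
    return total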
def work(self, job_id):
    qid_list = self.query_group[job_id]
    ticker = TimeEstimator(len(qid_list))
    missing_rel_cnt = 0
    missing_nrel_cnt = 0

    def empty_doc_fn(query_id, doc_id):
        rel_docs = self.ms_reader.qrel[query_id]
        nonlocal missing_rel_cnt
        nonlocal missing_nrel_cnt
        if doc_id in rel_docs:
            missing_rel_cnt += 1
        else:
            missing_nrel_cnt += 1

    def get_tf(text):
        tokens = self.tokenizer.tokenize_stem(text)
        return Counter(tokens)

    for qid in qid_list:
        if qid not in self.candidate_docs_d:
            continue
        docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
        ticker.tick()
        target_docs = self.candidate_docs_d[qid]
        tokens_d = {}
        for d in docs:
            if d.doc_id in target_docs:
                title_tokens = self.tokenizer.tokenize_stem(d.title)
                body_sents = sent_tokenize(d.body)
                # Per-sentence term frequencies of the stemmed body.
                body_tf_list = lmap(get_tf, body_sents)
                tokens_d[d.doc_id] = (title_tokens, body_tf_list)
        if len(tokens_d) < len(target_docs):
            log_variables(job_id, qid)
            print("{} of {} not found".format(len(tokens_d), len(target_docs)))
        save_path = os.path.join(self.out_dir, str(qid))
        with open(save_path, "wb") as f:
            pickle.dump(tokens_d, f)
def generate(self, data_id_manager, qids):
    missing_cnt = 0
    success_docs = 0
    missing_doc_qid = []
    ticker = TimeEstimator(len(qids))
    for qid in qids:
        if qid not in self.resource.get_doc_for_query_d():
            continue
        ticker.tick()
        docs: List[MSMarcoDoc] = load_per_query_docs(qid, None)
        docs_d = {d.doc_id: d for d in docs}
        q_tokens = self.resource.get_q_tokens(qid)
        for doc_id in self.resource.get_doc_for_query_d()[qid]:
            label = self.resource.get_label(qid, doc_id)
            try:
                doc = docs_d[doc_id]
                insts: Iterable[Tuple[List, List]] = self.encoder.encode(q_tokens, doc.title, doc.body)
                for passage_idx, passage in enumerate(insts):
                    tokens_seg, seg_ids = passage
                    assert type(tokens_seg[0]) == str
                    assert type(seg_ids[0]) == int
                    data_id = data_id_manager.assign({
                        'doc_id': doc_id,
                        'passage_idx': passage_idx,
                        'label': label,
                    })
                    inst = ClassificationInstanceWDataID(tokens_seg, seg_ids, label, data_id)
                    yield inst
                success_docs += 1
            except KeyError:
                missing_cnt += 1
                missing_doc_qid.append(qid)
                if missing_cnt > 10:
                    print(missing_doc_qid)
                    print("success: ", success_docs)
                    raise
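# A minimal sketch of the data-id bookkeeping that both generate() methods
# above rely on: assign() is assumed to hand out a fresh integer id and record
# the info dict so predictions can later be traced back to (doc_id,
# passage_idx, label). This describes an assumed interface, not the project's
# actual DataIDManager.
class SimpleDataIDManager:
    def __init__(self, start_id=0):
        self.current_id = start_id
        self.id_to_info = {}

    def assign(self, info: dict) -> int:
        data_id = self.current_id
        self.id_to_info[data_id] = info
        self.current_id += 1
        return data_id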