def __init__(self, max_seq_length, out_dir):
    self.query_group: List[List[QueryID]] = load_query_group("train")
    self.seg_resource_loader = SegmentResourceLoader(job_man_dir, "train")
    self.max_seq_length = max_seq_length
    self.out_dir = out_dir
    # Per-job info files go to a sibling "<out_dir>_info" directory.
    self.info_dir = self.out_dir + "_info"
    exist_or_mkdir(self.info_dir)
def __init__(self, split):
    query_group: List[List[QueryID]] = load_query_group(split)
    qrel: SimpleQrel = load_msmarco_simple_qrels(split)
    self.split = split
    self.queries = dict(load_queries(split))
    self.query_group = query_group
    self.tokenizer = get_tokenizer()
    self.qrel = qrel
def run_tokenize_jobs_for_prediction_split(split):
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)

    def factory(out_dir):
        return TokenizeDocTitleBodyWorker(split, query_group, candidate_docs, out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_title_body_tokens".format(split), factory)
    runner.start()
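# Usage sketch (not in the original source): launch title/body tokenization for
# the dev split, assuming top-100 candidate doc IDs are available for "dev".
if __name__ == "__main__":
    run_tokenize_jobs_for_prediction_split("dev")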
def run_tokenize_jobs_for_train_split(split):
    query_group = load_query_group(split)
    candidate_docs = load_candidate_doc_list_10(split)

    def factory(out_dir):
        return SentLevelTokenizeWorker(split, query_group, candidate_docs, out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_sent_tokens".format(split), factory)
    runner.start()
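# Usage sketch (not in the original source): sentence-level tokenization over
# the 10-doc candidate lists for the train split.
if __name__ == "__main__":
    run_tokenize_jobs_for_train_split("train")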
def __init__(self, split, load_candidate_doc_list_fn):
    query_group: List[List[QueryID]] = load_query_group(split)
    candidate_docs_d: Dict[QueryID, List[str]] = load_candidate_doc_list_fn(split)
    qrel: SimpleQrel = load_msmarco_simple_qrels(split)
    self.split = split
    self.queries = dict(load_queries(split))
    self.query_group = query_group
    self.tokenizer = get_tokenizer()
    self.candidate_doc_d: Dict[QueryID, List[str]] = candidate_docs_d
    self.qrel = qrel
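# Intended use (an assumption, inferred from the signature): a concrete subclass
# supplies the candidate-list loader, e.g.
#     super().__init__(split, load_candidate_doc_list_10)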
def run_tokenize_jobs_for_pred_split(split):
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)
    # Length caps: 64 * 5 = 320 tokens each for sentences and titles.
    max_sent_length = 64 * 5
    max_title_length = 64 * 5

    def factory(out_dir):
        return MultipleTokenizeWorker(split, query_group, candidate_docs,
                                      max_sent_length, max_title_length, out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_multiple_tokenize".format(split), factory)
    runner.start()
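# Usage sketch (not in the original source): run multi-field tokenization on the
# dev split's top-100 candidates.
if __name__ == "__main__":
    run_tokenize_jobs_for_pred_split("dev")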
def __init__(self, split):
    super(ProcessedResource10doc, self).__init__(split)
    query_group: List[List[QueryID]] = load_query_group(split)
    candidate_docs_d: Dict[QueryID, List[str]] = load_candidate_doc_list_10(split)
    qrel: SimpleQrel = load_msmarco_simple_qrels(split)
    self.split = split
    self.queries = dict(load_queries(split))
    self.query_group = query_group
    self.tokenizer = get_tokenizer()
    self.candidate_doc_d: Dict[QueryID, List[str]] = candidate_docs_d
    self.qrel = qrel
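# Usage sketch (hypothetical): build the 10-doc resource for a split and look up
# a query's candidate documents.
#     resource = ProcessedResource10doc("train")
#     doc_ids = resource.candidate_doc_d[qid]  # qid: a QueryID from resource.query_group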
def main():
    query_groups = load_query_group("train")
    path_format = os.path.join(job_man_dir, "seg_resource_train", "{}")
    not_found_list = []
    for job_id in range(train_query_group_len):
        qids = query_groups[job_id]
        for qid in qids:
            resource_path = path_format.format(qid)
            if not os.path.exists(resource_path):
                print(job_id, qid)
                not_found_list.append((job_id, qid))
    print("{} files not found".format(len(not_found_list)))
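# Assumed entry point (the debug runner below uses the same script pattern):
# run the missing-resource check directly.
if __name__ == "__main__":
    main()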
def __init__(self, max_seq_length, split, skip_single_seg, pick_for_pairwise, out_dir):
    self.query_group: List[List[QueryID]] = load_query_group(split)
    self.seg_resource_loader = SegmentResourceLoader(job_man_dir, split)
    self.max_seq_length = max_seq_length
    self.out_dir = out_dir
    # Presumably: skip_single_seg drops documents that yield only one segment,
    # and pick_for_pairwise selects segments for pairwise training examples.
    self.skip_single_seg = skip_single_seg
    self.pick_for_pairwise = pick_for_pairwise
    self.info_dir = self.out_dir + "_info"
    exist_or_mkdir(self.info_dir)
def main():
    split = "train"
    resource = ProcessedResource10docMulti(split)
    query_group: List[List[QueryID]] = load_query_group(split)
    msmarco_passage_qrel_path = at_data_dir("msmarco", "qrels.train.tsv")
    passage_qrels: QRelsDict = load_qrels_structured(msmarco_passage_qrel_path)
    qids = query_group[0]
    qids = qids[:100]
    pickle_name = "msmarco_passage_doc_analyze_passage_dict_evidence_loc"
    try:
        passage_dict = load_from_pickle(pickle_name)
    except FileNotFoundError:
        print("Reading passages...")
        passage_dict = get_passages(qids, passage_qrels)
        save_to_pickle(passage_dict, pickle_name)

    def get_rel_doc_id(qid):
        # Return the first judged-relevant doc for the query.
        if qid not in resource.get_doc_for_query_d():
            raise KeyError
        for doc_id in resource.get_doc_for_query_d()[qid]:
            label = resource.get_label(qid, doc_id)
            if label:
                return doc_id
        raise KeyError

    def translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body):
        # Map a flat token offset back to the index of the sentence containing it.
        acc = 0
        for idx, tokens in enumerate(stemmed_body_tokens_list):
            acc += len(tokens)
            if loc_in_body < acc:
                return idx
        return -1

    pc_tokenize = PCTokenizer()
    bert_tokenizer = get_tokenizer()
    for qid in qids:
        try:
            doc_id = get_rel_doc_id(qid)
            stemmed_tokens_d = resource.get_stemmed_tokens_d(qid)
            stemmed_title_tokens, stemmed_body_tokens_list = stemmed_tokens_d[doc_id]
            rel_passages = list([passage_id for passage_id, score in passage_qrels[qid].items() if score])
            success = False
            found_idx = -1
            for rel_passage_id in rel_passages:
                passage_text = passage_dict[rel_passage_id].strip()
                passage_tokens = pc_tokenize.tokenize_stem(passage_text)
                stemmed_body_tokens_flat = lflatten(stemmed_body_tokens_list)
                n, log = lcs(passage_tokens, stemmed_body_tokens_flat, True)
                # Accept the passage as located if more than 70% of its tokens
                # appear in the LCS with the flattened document body.
                if len(passage_tokens) > 4 and n > len(passage_tokens) * 0.7 and n > 0:
                    success = True
                    _, loc_in_body = log[0]
                    sent_idx = translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body)
                    prev = stemmed_body_tokens_flat[:loc_in_body]
                    loc_by_bert_tokenize = len(bert_tokenizer.tokenize(" ".join(prev)))
                    print(sent_idx, loc_in_body, loc_by_bert_tokenize, len(stemmed_body_tokens_list))
                    found_idx = sent_idx
            if not success:
                print("Not found. doc_lines={} passage_len={}".format(
                    len(stemmed_body_tokens_list), len(passage_tokens)))
        except KeyError:
            pass
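# Assumed entry point: run the passage-location analysis as a script.
if __name__ == "__main__":
    main()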
from data_generator.job_runner import JobRunner
from dataset_specific.msmarco.common import load_query_group, top100_doc_ids
from dataset_specific.msmarco.tokenize_jobs.run_tokenize import DummyWorker
from epath import job_man_dir

if __name__ == "__main__":
    split = "dev"
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)

    def factory(out_dir):
        return DummyWorker(split, query_group, candidate_docs, out_dir)

    runner = JobRunner(job_man_dir, len(query_group) - 1,
                       "MSMARCO_{}_tokens_debug".format(split), factory)
    runner.start()