def get_candidate_all_passage_w_samping_predict(
        max_seq_length=256) -> Dict[str, List[QCKCandidateWToken]]:
    """Build per-query candidate passages from the top-100 BM25 documents.

    For every query, each document's first passage is always kept, and every
    subsequent passage is kept with probability 0.1 (random subsampling to
    bound the candidate count at predict time).

    Removed dead code from the original: an unused qrel path and an unused
    tokenization of the query text.

    :param max_seq_length: maximum token length of each passage window.
    :return: mapping of query_id -> list of QCKCandidateWToken candidates.
    """
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_predict(4)
    queries = load_robust04_title_query()
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id in queries:
        ranked_list = galago_rank[query_id][:100]
        doc_ids = [e.doc_id for e in ranked_list]
        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens, max_seq_length)):
                # First passage of a doc is mandatory; the rest are sampled.
                include = idx == 0 or random.random() < 0.1
                if include:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))
        out_d[query_id] = candidate
    return out_d
def __init__(self, query_type="desc", neg_k=1000,
             qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
    """Load Robust04 queries, relevance judgements and BM25 rankings.

    :param query_type: which query field to load (e.g. "desc" or "title").
    :param neg_k: number of top-ranked BM25 docs used as the negative pool.
    :param qrel_path: path to the qrels file; defaults to the previously
        hard-coded location so existing callers are unaffected.
    """
    self.judgement = load_qrels_structured(qrel_path)
    self.queries = load_robust_04_query(query_type)
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.neg_k = neg_k
def __init__(self, encoder, max_seq_length, top_k=100, query_type="title"):
    """Set up the generator: encoder, length/cutoff settings, and the
    Robust04 queries, BM25 rankings and document tokens it reads from.

    :param encoder: encoder used to turn token sequences into instances.
    :param max_seq_length: maximum sequence length per instance.
    :param top_k: how many ranked documents to consider per query.
    :param query_type: which query field to load (e.g. "title").
    """
    # Plain configuration first.
    self.encoder = encoder
    self.max_seq_length = max_seq_length
    self.top_k = top_k
    # Then the heavier resource loads.
    self.data = self.load_tokens_from_pickles()
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.tokenizer = get_tokenizer()
def __init__(self, encoder, max_seq_length, query_type="title",
             qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
    """Load tokens, judgements, queries and BM25 rankings for training.

    :param encoder: encoder used to build instances from token sequences.
    :param max_seq_length: maximum sequence length per instance.
    :param query_type: which query field to load (e.g. "title").
    :param qrel_path: path to the qrels file; defaults to the previously
        hard-coded location so existing callers are unaffected.
    """
    self.data = self.load_tokens()
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
def __init__(self, doc_max_length, query_type="title", neg_k=1000, pos_only=True,
             qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
    """Load tokens, judgements, queries and BM25 rankings.

    :param doc_max_length: maximum document length (in tokens) to keep.
    :param query_type: which query field to load (e.g. "title").
    :param neg_k: number of top-ranked BM25 docs used as the negative pool.
    :param pos_only: if True, restrict to positively judged documents.
    :param qrel_path: path to the qrels file; defaults to the previously
        hard-coded location so existing callers are unaffected.
    """
    self.data = self.load_tokens()
    self.judgement = load_qrels_structured(qrel_path)
    self.doc_max_length = doc_max_length
    self.queries = load_robust_04_query(query_type)
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.neg_k = neg_k
    self.pos_only = pos_only
def __init__(self, encoder, max_seq_length, query_type,
             target_selection_fn: Callable[[str, str, List], List[int]],
             qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
    """Load resources and store the pluggable target-selection strategy.

    :param encoder: encoder used to build instances from token sequences.
    :param max_seq_length: maximum sequence length per instance.
    :param query_type: which query field to load (e.g. "title").
    :param target_selection_fn: callable (query_id, doc_id, segments) ->
        indices of the segments to use as targets.
    :param qrel_path: path to the qrels file; defaults to the previously
        hard-coded location so existing callers are unaffected.
    """
    self.data = self.load_tokens()
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.target_selection_fn: Callable[[str, str, List], List[int]] = target_selection_fn
def load_candidate_all_passage(
        max_seq_length,
        max_passage_per_doc=10) -> Dict[str, List[QCKCandidateWToken]]:
    """Build per-query candidate passages covering every BM25-ranked doc.

    Maps each query's ranked list down to doc ids, then delegates passage
    enumeration to load_candidate_all_passage_inner.

    :param max_seq_length: maximum token length of each passage window.
    :param max_passage_per_doc: cap on passages taken from one document.
    :return: mapping of query_id -> list of QCKCandidateWToken candidates.
    """
    ranked: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()

    def to_doc_ids(entries: List[SimpleRankedListEntry]):
        return [e.doc_id for e in entries]

    doc_ids_per_query: Dict[str, List[str]] = dict_value_map(to_doc_ids, ranked)
    token_data: Dict[str, List[str]] = load_robust_tokens_for_predict()
    return load_candidate_all_passage_inner(
        doc_ids_per_query, token_data, max_seq_length, max_passage_per_doc)
def __init__(self, encoder, max_seq_length, score_d, query_type="title", neg_k=1000,
             qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
    """Load resources plus precomputed per-document segment scores.

    :param encoder: encoder used to build instances from token sequences.
    :param max_seq_length: maximum sequence length per instance.
    :param score_d: mapping of key -> list of per-segment scores.
    :param query_type: which query field to load (e.g. "title").
    :param neg_k: number of top-ranked BM25 docs used as the negative pool.
    :param qrel_path: path to the qrels file; defaults to the previously
        hard-coded location so existing callers are unaffected.
    """
    self.data = self.load_tokens()
    self.score_d: Dict[str, List[float]] = score_d
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.neg_k = neg_k
    # Fixed number of segments considered per document.
    self.n_seg_per_doc = 4
def all_doc_ids_of_interest() -> List[str]:
    """Return the sorted, deduplicated union of every judged doc id and the
    top-1000 BM25-ranked doc ids for each query."""
    top_k = 1000
    doc_ids = set()
    # Every document that appears in the relevance judgements.
    qrel = load_robust_qrel()
    for judgement in qrel.values():
        doc_ids.update(judgement.keys())
    # Plus the top-k entries of each query's ranked list.
    # NOTE(review): entries are indexed as tuples here and sorted in place
    # by their second field, mirroring the original behavior.
    for ranked_list in load_bm25_best().values():
        ranked_list.sort(key=lambda entry: entry[1])
        doc_ids.update(entry[0] for entry in ranked_list[:top_k])
    return sorted(doc_ids)
def __init__(self, encoder, max_seq_length_per_inst, num_doc_per_inst,
             num_seg_per_inst, query_type="title", neg_k=1000,
             qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
    """Load resources for multi-document / multi-segment instance building.

    :param encoder: encoder applied to all segments of an instance.
    :param max_seq_length_per_inst: maximum sequence length per instance.
    :param num_doc_per_inst: documents packed into one instance.
    :param num_seg_per_inst: segments packed into one instance.
    :param query_type: which query field to load (e.g. "title").
    :param neg_k: number of top-ranked BM25 docs used as the negative pool.
    :param qrel_path: path to the qrels file; defaults to the previously
        hard-coded location so existing callers are unaffected.
    """
    self.data = self.load_tokens()
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length_per_inst
    self.queries = load_robust_04_query(query_type)
    self.num_doc_per_inst = num_doc_per_inst
    self.num_seg_per_inst = num_seg_per_inst
    self.all_segment_encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.neg_k = neg_k
def load_candidate_head_as_doc(
        doc_len=400) -> Dict[str, List[QCKCandidateWToken]]:
    """Represent each of the top-100 ranked docs by its first doc_len tokens.

    :param doc_len: number of leading tokens to keep from each document.
    :return: mapping of query_id -> list of QCKCandidateWToken candidates.
    """
    top_k = 100
    ranked: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()
    print("Num queries : ", len(ranked))
    print("Loading robust collection tokens...", end="")
    tokens_d: Dict[str, List[str]] = load_robust_tokens_for_predict()
    print("Done")
    print("Total of {} docs".format(len(tokens_d)))

    def head_candidate(doc_id: str) -> QCKCandidateWToken:
        # Truncate the document to its head.
        return QCKCandidateWToken(doc_id, "", tokens_d[doc_id][:doc_len])

    def convert(entries: List[SimpleRankedListEntry]) -> List[QCKCandidateWToken]:
        return [head_candidate(e.doc_id) for e in entries[:top_k]]

    return dict_value_map(convert, ranked)
def main():
    """Collect the union of top-1000 BM25 doc ids across queries, fetch the
    raw documents, and pickle them for predict-time use."""
    top_k = 1000
    wanted_ids = set()
    for _query_id, ranked_list in load_bm25_best().items():
        # Entries are indexed as tuples; in-place sort by the second field,
        # matching the original behavior.
        ranked_list.sort(key=lambda entry: entry[1])
        wanted_ids.update(entry[0] for entry in ranked_list[:top_k])

    robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
    data = load_robust(robust_path)
    save_d = {}
    for doc_id in list(wanted_ids):
        try:
            save_d[doc_id] = data[doc_id]
        except KeyError:
            # Best effort: report and continue on missing documents.
            print(doc_id, 'not found')
    save_to_pickle(save_d, "robust04_docs_predict")
def load_candidate_d():
    """Pair each query with (doc_id, raw_content) tuples for its top-100
    BM25-ranked documents, using the pickled predict-time doc store."""
    top_k = 100
    ranked: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()
    docs = load_from_pickle("robust04_docs_predict")
    out_d = {}
    for query_id, entries in ranked.items():
        out_d[query_id] = [(e.doc_id, docs[e.doc_id]) for e in entries[:top_k]]
    return out_d
def __init__(self, encoder, max_seq_length, scores, query_type="title",
             target_selection="best",
             qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
    """Load resources and resolve the target-selection strategy by name.

    :param encoder: encoder used to build instances from token sequences.
    :param max_seq_length: maximum sequence length per instance.
    :param scores: mapping (query_id, doc_id, segment_idx) -> score.
    :param query_type: which query field to load (e.g. "title").
    :param target_selection: strategy key; one of 'best', 'all',
        'first_and_best', 'best_or_over_09', 'random_over_09'.
    :param qrel_path: path to the qrels file; defaults to the previously
        hard-coded location so existing callers are unaffected.
    :raises KeyError: if target_selection is not a known strategy name.
    """
    self.data = self.load_tokens()
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.scores: Dict[Tuple[str, str, int], float] = scores
    # Dispatch table: strategy name -> selection function.
    self.get_target_indices: Callable[[], List[int]] = {
        'best': get_target_indices_get_best,
        'all': get_target_indices_all,
        'first_and_best': get_target_indices_first_and_best,
        'best_or_over_09': get_target_indices_best_or_over_09,
        'random_over_09': get_target_indices_random_over_09
    }[target_selection]
def get_candidate_all_passage_w_samping(
        max_seq_length=256,
        neg_k=1000) -> Dict[str, List[QCKCandidateWToken]]:
    """Build train-time candidates: all judged docs plus the top-neg_k BM25
    docs per query, with passages randomly subsampled.

    Each document's first passage is always kept; every subsequent passage
    is kept with probability 0.1.

    :param max_seq_length: maximum token length of each passage window.
    :param neg_k: number of top-ranked BM25 docs added as candidates.
    :return: mapping of query_id -> list of QCKCandidateWToken candidates.
    """
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_train()
    tokens_d.update(load_robust_tokens_for_predict(4))
    queries = load_robust04_title_query()
    judgement: Dict[str, Dict] = load_qrels_structured(qrel_path)
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id in judgement.keys():
        # BUGFIX: the original checked `query_id not in judgement`, which is
        # vacuously false while iterating judgement's own keys. Guard the
        # collection that can actually lack the id instead.
        if query_id not in queries:
            continue
        judge_entries = judgement[query_id]
        doc_ids = set(judge_entries.keys())
        ranked_list = galago_rank[query_id][:neg_k]
        doc_ids.update(e.doc_id for e in ranked_list)
        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens, max_seq_length)):
                # First passage of a doc is mandatory; the rest are sampled.
                include = idx == 0 or random.random() < 0.1
                if include:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))
        out_d[query_id] = candidate
    return out_d
def __init__(self, top_k=150):
    """Initialize the base preprocessor, then keep the BM25 rankings and
    the per-query cutoff used at predict time.

    :param top_k: number of ranked documents retained per query.
    """
    super(RobustPreprocessPredict, self).__init__()
    self.top_k = top_k
    self.galago_rank = load_bm25_best()