def get_candidate_all_passage_w_samping_predict(
        max_seq_length=256) -> Dict[str, List[QCKCandidateWToken]]:
    """Build sampled passage candidates for every query (prediction mode).

    For each query, takes the top-100 BM25-ranked docs, splits each doc into
    passages of at most max_seq_length tokens, always keeps the first passage
    of a doc and keeps each later passage with probability 0.1.

    :param max_seq_length: maximum number of tokens per passage window
    :return: mapping of query id -> sampled list of QCKCandidateWToken
    """
    # NOTE(review): the original also built a qrels path and tokenized the
    # query title, but neither value was ever used — both removed.
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_predict(4)
    queries = load_robust04_title_query()
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id in queries:
        ranked_list = galago_rank[query_id][:100]
        doc_ids = [e.doc_id for e in ranked_list]
        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens, max_seq_length)):
                # Always keep the leading passage; sample the rest at 10%.
                include = idx == 0 or random.random() < 0.1
                if include:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))
        out_d[query_id] = candidate
    return out_d
def load_candidate_all_passage(
        max_seq_length,
        max_passage_per_doc=10) -> Dict[str, List[QCKCandidateWToken]]:
    """Load all-passage candidates for the BM25-best docs of every query.

    Extracts the ranked doc ids per query, pairs them with the pre-tokenized
    robust collection, and delegates passage enumeration to
    load_candidate_all_passage_inner.

    :param max_seq_length: maximum number of tokens per passage
    :param max_passage_per_doc: cap on passages taken from a single doc
    :return: mapping of query id -> list of QCKCandidateWToken
    """
    ranked_per_query: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()

    def extract_ids(entries: List[SimpleRankedListEntry]) -> List[str]:
        return [entry.doc_id for entry in entries]

    ids_per_query: Dict[str, List[str]] = dict_value_map(
        extract_ids, ranked_per_query)
    collection_tokens: Dict[str, List[str]] = load_robust_tokens_for_predict()
    return load_candidate_all_passage_inner(
        ids_per_query, collection_tokens, max_seq_length, max_passage_per_doc)
def load_candidate_head_as_doc(
        doc_len=400) -> Dict[str, List[QCKCandidateWToken]]:
    """Use only the head of each top-ranked doc as its candidate.

    For every query, the top 100 BM25 docs are truncated to their first
    doc_len tokens and wrapped as QCKCandidateWToken.

    :param doc_len: number of leading tokens kept per document
    :return: mapping of query id -> list of head-truncated candidates
    """
    top_k = 100
    ranked_per_query: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()
    print("Num queries : ", len(ranked_per_query))
    print("Loading robust collection tokens...", end="")
    token_d: Dict[str, List[str]] = load_robust_tokens_for_predict()
    print("Done")
    print("Total of {} docs".format(len(token_d)))

    def head_candidate(doc_id: str) -> QCKCandidateWToken:
        # Keep only the first doc_len tokens of the document.
        return QCKCandidateWToken(doc_id, "", token_d[doc_id][:doc_len])

    def fetch_docs(
            ranked_list: List[SimpleRankedListEntry]
    ) -> List[QCKCandidateWToken]:
        return [head_candidate(entry.doc_id) for entry in ranked_list[:top_k]]

    return dict_value_map(fetch_docs, ranked_per_query)
def get_candidate_all_passage_w_samping(
        max_seq_length=256,
        neg_k=1000) -> Dict[str, List[QCKCandidateWToken]]:
    """Build sampled passage candidates for every judged query (training mode).

    Candidate docs per query are the union of all qrels-judged docs and the
    top neg_k BM25-ranked docs. Each doc is split into passages of at most
    max_seq_length tokens; the first passage is always kept, later passages
    are each kept with probability 0.1.

    :param max_seq_length: maximum number of tokens per passage window
    :param neg_k: number of top BM25 docs added as (mostly negative) candidates
    :return: mapping of query id -> sampled list of QCKCandidateWToken
    """
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_train()
    tokens_d.update(load_robust_tokens_for_predict(4))
    judgement: Dict[str, Dict] = load_qrels_structured(qrel_path)
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    # NOTE(review): the original guarded `if query_id not in judgement`
    # inside a loop over judgement.keys() — dead code, removed. It also
    # tokenized the query title without using the result — removed.
    for query_id, judge_entries in judgement.items():
        doc_ids = set(judge_entries.keys())
        doc_ids.update(e.doc_id for e in galago_rank[query_id][:neg_k])
        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens, max_seq_length)):
                # Always keep the leading passage; sample the rest at 10%.
                include = idx == 0 or random.random() < 0.1
                if include:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))
        out_d[query_id] = candidate
    return out_d