def __init__(self, encoder, max_seq_length):
    """Load robust04 training resources: doc tokens, qrels, queries, tokenizer.

    Args:
        encoder: encoder object stored for downstream use (opaque here).
        max_seq_length: maximum token length per encoded sequence.
    """
    self.data = load_robust_tokens_for_train()
    # Sanity check on the expected robust04 training corpus size.
    # NOTE: `assert` is stripped under python -O; this is a lightweight guard only.
    assert len(self.data) == 174787
    # Build the qrel path from the shared project data_path, consistent with
    # load_candidate_all_passage_from_qrel() / get_candidate_all_passage_w_samping(),
    # instead of a hard-coded personal home-directory path.
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust04_title_query()
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
def main():
    """Interactive lookup: read doc_ids from stdin and print their tokens."""
    tokens_d = load_robust_tokens_for_train()
    print("Tokens loaded")
    while True:
        try:
            doc_id = input()
        except (EOFError, KeyboardInterrupt):
            # Exit cleanly on end-of-input. The original `except Exception: pass`
            # swallowed the EOFError raised by input(), turning EOF into an
            # unbreakable busy loop.
            break
        try:
            print("doc_id: ", doc_id)
            print(tokens_d[doc_id])
            print("----")
        except KeyError as e:
            # Unknown doc_id: report and keep the prompt loop alive.
            print("Doc not found")
            print(e)
def load_candidate_all_passage_from_qrel(
        max_seq_length,
        max_passage_per_doc=10) -> Dict[str, List[QCKCandidateWToken]]:
    """Build per-query candidate passages from every qrel-judged document.

    Every document that has a judgement entry for a query becomes a candidate
    for that query (no ranking cutoff); passage extraction is delegated to
    load_candidate_all_passage_inner.
    """
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    judgement: Dict[str, Dict] = load_qrels_structured(qrel_path)
    # Map each query to the full list of its judged doc ids.
    candidate_doc_ids = {
        qid: list(entries.keys())
        for qid, entries in judgement.items()
    }
    token_data = load_robust_tokens_for_train()
    return load_candidate_all_passage_inner(
        candidate_doc_ids, token_data, max_seq_length, max_passage_per_doc, 9999)
def get_candidate_all_passage_w_samping(
        max_seq_length=256,
        neg_k=1000) -> Dict[str, List[QCKCandidateWToken]]:
    """Build per-query candidates from judged docs plus top BM25 docs, sampling passages.

    For each query, candidate documents are the union of all qrel-judged docs
    and the top `neg_k` BM25-ranked docs. Each document is split into passages;
    the first passage is always kept and later passages are kept with
    probability 0.1 (non-deterministic: uses random.random()).

    NOTE: "samping" in the name is a historical typo of "sampling"; the name is
    kept for backward compatibility with existing callers.

    Args:
        max_seq_length: maximum tokens per passage window.
        neg_k: number of top BM25-ranked docs to add as candidates per query.

    Returns:
        Mapping of query_id to its list of QCKCandidateWToken candidates.
    """
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_train()
    tokens_d.update(load_robust_tokens_for_predict(4))
    queries = load_robust04_title_query()
    tokenizer = get_tokenizer()
    judgement: Dict[str, Dict] = load_qrels_structured(qrel_path)
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    # Iterate items() directly. The original re-checked `query_id not in
    # judgement` inside the loop, which could never be true while iterating
    # judgement's own keys — dead code, removed.
    for query_id, judge_entries in judgement.items():
        query = queries[query_id]
        # NOTE(review): query_tokens is computed but never used below —
        # possibly leftover; kept to preserve behavior in case tokenize()
        # warms a cache. Confirm and remove if truly unnecessary.
        query_tokens = tokenizer.tokenize(query)
        doc_ids = set(judge_entries.keys())
        ranked_list = galago_rank[query_id][:neg_k]
        doc_ids.update(e.doc_id for e in ranked_list)
        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens, max_seq_length)):
                # Always keep the first passage; sample the rest at 10%.
                if idx == 0 or random.random() < 0.1:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))
        out_d[query_id] = candidate
    return out_d
def load_tokens(self):
    """Return a doc_id -> tokens mapping covering both the train split and
    predict split 4."""
    merged = load_robust_tokens_for_train()
    extra = load_robust_tokens_for_predict(4)
    merged.update(extra)
    return merged