Example #1
def __init__(self, encoder, max_seq_length):
    # Pre-tokenized Robust04 training documents, keyed by doc id.
    self.data = load_robust_tokens_for_train()
    assert len(self.data) == 174787
    # Relevance judgments (qrels), structured as query_id -> {doc_id: relevance}.
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    # Robust04 title queries, keyed by query id.
    self.queries = load_robust04_title_query()
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
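For context, a minimal usage sketch of this constructor. The wrapping class name (RobustTrainGen) and the encoder object are placeholders, since the example only shows the constructor body.

# Hypothetical usage sketch; RobustTrainGen and encoder are placeholders.
encoder = ...  # whatever encoder object the class expects
gen = RobustTrainGen(encoder, max_seq_length=512)
print(len(gen.data))      # 174787 pre-tokenized Robust04 training documents
print(len(gen.queries))   # number of Robust04 title queries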
Example #2
def main():
    # Interactive lookup: read doc ids from stdin and print their stored tokens.
    tokens_d = load_robust_tokens_for_train()
    print("Tokens loaded")
    while True:
        try:
            doc_id = input()
            print("doc_id: ", doc_id)
            print(tokens_d[doc_id])
            print("----")
        except EOFError:
            # Stop cleanly when stdin is closed.
            break
        except KeyError as e:
            print("Doc not found")
            print(e)
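This is a small interactive tool; the example does not show its entry point, so the guard below is an assumption about how it would typically be launched.

# Assumed entry point; run the script and type doc ids at the prompt.
if __name__ == "__main__":
    main()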
Example #3
def load_candidate_all_passage_from_qrel(
        max_seq_length,
        max_passage_per_doc=10) -> Dict[str, List[QCKCandidateWToken]]:
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    judgement: Dict[str, Dict] = load_qrels_structured(qrel_path)

    # Every judged document for a query becomes a candidate document.
    candidate_doc_ids = {}
    for query_id in judgement.keys():
        judge_entries = judgement[query_id]
        doc_ids = list(judge_entries.keys())
        candidate_doc_ids[query_id] = doc_ids

    # Pre-tokenized Robust04 training documents, keyed by doc id.
    token_data = load_robust_tokens_for_train()

    return load_candidate_all_passage_inner(candidate_doc_ids, token_data,
                                            max_seq_length,
                                            max_passage_per_doc, 9999)
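A minimal sketch of consuming the returned mapping; it assumes the function above is importable in the calling module, and the max_seq_length value is illustrative.

# Hypothetical usage; prints the candidate count for one query id.
candidates = load_candidate_all_passage_from_qrel(max_seq_length=256)
for query_id, cand_list in candidates.items():
    # Up to max_passage_per_doc passages per judged document.
    print(query_id, len(cand_list))
    break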
Example #4
def get_candidate_all_passage_w_samping(
        max_seq_length=256, neg_k=1000) -> Dict[str, List[QCKCandidateWToken]]:
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    galago_rank = load_bm25_best()
    # Pre-tokenized documents for both the train and predict (fold 4) splits.
    tokens_d = load_robust_tokens_for_train()
    tokens_d.update(load_robust_tokens_for_predict(4))
    queries = load_robust04_title_query()
    tokenizer = get_tokenizer()
    judgement: Dict[str, Dict] = load_qrels_structured(qrel_path)
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id in judgement.keys():
        # Skip queries that lack a title query or a BM25 ranking.
        if query_id not in queries or query_id not in galago_rank:
            continue
        query = queries[query_id]
        query_tokens = tokenizer.tokenize(query)

        # Candidates: all judged documents plus the top-neg_k BM25 documents.
        judge_entries = judgement[query_id]
        doc_ids = set(judge_entries.keys())

        ranked_list = galago_rank[query_id]
        ranked_list = ranked_list[:neg_k]
        doc_ids.update([e.doc_id for e in ranked_list])

        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens,
                                                       max_seq_length)):
                # Always keep the first passage; sample later passages at 10%.
                if idx == 0:
                    include = True
                else:
                    include = random.random() < 0.1

                if include:
                    c = QCKCandidateWToken(doc_id, "", passage)
                    candidate.append(c)

        out_d[query_id] = candidate
    return out_d
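A minimal sketch of inspecting the sampled candidates; it assumes the function above is importable and keeps its default arguments.

# Hypothetical usage; defaults are max_seq_length=256, neg_k=1000.
out_d = get_candidate_all_passage_w_samping()
sizes = [len(v) for v in out_d.values()]
print("queries:", len(out_d))
print("avg candidates per query:", sum(sizes) / max(1, len(sizes)))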
Example #5
def load_tokens(self):
    # Merge pre-tokenized documents from the train split and predict fold 4.
    tokens_d = load_robust_tokens_for_train()
    tokens_d.update(load_robust_tokens_for_predict(4))
    return tokens_d
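A minimal sketch of what the merged dictionary provides: doc id keys mapping to pre-tokenized documents covering both splits. The surrounding class is not shown, so calling the loaders directly here is an assumption.

# Hypothetical sketch; calls the loaders directly instead of through the class.
tokens_d = load_robust_tokens_for_train()
tokens_d.update(load_robust_tokens_for_predict(4))
some_doc_id = next(iter(tokens_d))
print(some_doc_id, len(tokens_d[some_doc_id]))  # a doc id and its token count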