Exemplo n.º 1
0
def get_candidate_all_passage_w_samping_predict(
        max_seq_length=256,
        top_k=100,
        sample_rate=0.1) -> Dict[str, List[QCKCandidateWToken]]:
    """Build sampled passage candidates for every title query.

    For each query id returned by ``load_robust04_title_query`` the first
    ``top_k`` entries of its ranked list (from ``load_bm25_best``) are split
    into passages with ``enum_passage``.  The first passage of every
    document is always kept; each later passage is kept with probability
    ``sample_rate`` (one ``random.random()`` draw per later passage).

    Args:
        max_seq_length: Passed to ``enum_passage`` as its second argument.
        top_k: How many leading ranked-list entries to use per query.
        sample_rate: Probability of keeping a passage other than the first.

    Returns:
        A dict mapping each query id to the list of ``QCKCandidateWToken``
        built from the kept passages (second constructor argument is ``""``).

    Note:
        Lookups into the ranked lists and the token table are unguarded, so
        an id missing from either propagates that container's lookup error.
    """
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_predict(4)
    queries = load_robust04_title_query()
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    # Only the query ids are needed: the query text was previously tokenized
    # here but the tokens were never used, so that work has been dropped.
    for query_id in queries:
        candidate = []
        for entry in galago_rank[query_id][:top_k]:
            doc_id = entry.doc_id
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens,
                                                       max_seq_length)):
                # Short-circuit keeps the head passage without consuming a
                # random draw, matching the original draw sequence.
                if idx == 0 or random.random() < sample_rate:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))

        out_d[query_id] = candidate
    return out_d
Exemplo n.º 2
0
def load_candidate_all_passage(
        max_seq_length,
        max_passage_per_doc=10) -> Dict[str, List[QCKCandidateWToken]]:
    """Load ranked lists and document tokens, then build passage candidates.

    The ranked lists from ``load_bm25_best`` are reduced to plain document-id
    lists via ``dict_value_map``; those ids, the tokens from
    ``load_robust_tokens_for_predict`` and both arguments are handed to
    ``load_candidate_all_passage_inner``, whose result is returned as is.

    Args:
        max_seq_length: Forwarded unchanged to the inner loader.
        max_passage_per_doc: Forwarded unchanged to the inner loader.

    Returns:
        Whatever ``load_candidate_all_passage_inner`` returns.
    """
    ranked_lists: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()

    def to_doc_ids(entries: List[SimpleRankedListEntry]) -> List[str]:
        # Keep only the document id of each ranked entry, in ranked order.
        return [entry.doc_id for entry in entries]

    doc_ids_by_query: Dict[str, List[str]] = dict_value_map(
        to_doc_ids, ranked_lists)
    tokens_by_doc: Dict[str, List[str]] = load_robust_tokens_for_predict()
    return load_candidate_all_passage_inner(
        doc_ids_by_query,
        tokens_by_doc,
        max_seq_length,
        max_passage_per_doc,
    )
Exemplo n.º 3
0
def load_candidate_head_as_doc(
        doc_len=400) -> Dict[str, List[QCKCandidateWToken]]:
    """Build one candidate per top-ranked document from its leading tokens.

    Loads the ranked lists (``load_bm25_best``) and the document tokens
    (``load_robust_tokens_for_predict``), printing progress along the way.
    For each ranked list, the first 100 entries are turned into
    ``QCKCandidateWToken(doc_id, "", tokens[:doc_len])``.

    Args:
        doc_len: Number of leading tokens kept from each document.

    Returns:
        The result of ``dict_value_map`` applied to the ranked lists with the
        per-list candidate builder.
    """
    top_k = 100
    ranked_lists: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()
    print("Num queries : ", len(ranked_lists))
    print("Loading robust collection tokens...", end="")
    tokens_by_doc: Dict[str, List[str]] = load_robust_tokens_for_predict()
    print("Done")
    print("Total of {} docs".format(len(tokens_by_doc)))

    def head_candidates(
            ranked_list: List[SimpleRankedListEntry]
    ) -> List[QCKCandidateWToken]:
        # One candidate per entry among the first `top_k` of the ranked list.
        heads = []
        for entry in ranked_list[:top_k]:
            doc_id = entry.doc_id
            head_tokens = tokens_by_doc[doc_id][:doc_len]
            heads.append(QCKCandidateWToken(doc_id, "", head_tokens))
        return heads

    return dict_value_map(head_candidates, ranked_lists)
Exemplo n.º 4
0
def get_candidate_all_passage_w_samping(
        max_seq_length=256, neg_k=1000,
        sample_rate=0.1) -> Dict[str, List[QCKCandidateWToken]]:
    """Build sampled passage candidates for every judged query.

    For each query id in the structured qrels, the candidate documents are
    the union of its judged document ids and the document ids of the first
    ``neg_k`` entries of its ranked list (from ``load_bm25_best``).  Each
    document is split into passages with ``enum_passage``; the first passage
    is always kept and each later passage is kept with probability
    ``sample_rate`` (one ``random.random()`` draw per later passage).

    Document tokens come from ``load_robust_tokens_for_train`` updated with
    ``load_robust_tokens_for_predict(4)``.

    Args:
        max_seq_length: Passed to ``enum_passage`` as its second argument.
        neg_k: How many leading ranked-list entries to add per query.
        sample_rate: Probability of keeping a passage other than the first.

    Returns:
        A dict mapping each judged query id to the list of
        ``QCKCandidateWToken`` built from the kept passages.

    Note:
        Lookups into the ranked lists and the token table are unguarded, so
        an id missing from either propagates that container's lookup error.
        The title queries are no longer loaded: they were only tokenized and
        the tokens discarded, so a judged query without a title query no
        longer fails at that (unused) lookup.
    """
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_train()
    tokens_d.update(load_robust_tokens_for_predict(4))
    judgement: Dict[str, Dict] = load_qrels_structured(qrel_path)
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    # The former `if query_id not in judgement: continue` guard could never
    # fire while iterating judgement's own keys, so it has been removed.
    for query_id, judge_entries in judgement.items():
        doc_ids = set(judge_entries)
        doc_ids.update(e.doc_id for e in galago_rank[query_id][:neg_k])

        candidate = []
        # Iterate in sorted order: the iteration order of a set of strings
        # changes between interpreter runs (hash randomization), which made
        # the candidate order and the passage sampling non-reproducible even
        # with a fixed random seed.
        for doc_id in sorted(doc_ids):
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens,
                                                       max_seq_length)):
                # Short-circuit keeps the head passage without consuming a
                # random draw.
                if idx == 0 or random.random() < sample_rate:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))

        out_d[query_id] = candidate
    return out_d