Exemplo n.º 1
0
def get_candidate_all_passage_w_samping_predict(
        max_seq_length=256) -> Dict[str, List[QCKCandidateWToken]]:
    """Build sampled passage candidates from each query's top-ranked documents.

    For every query id returned by ``load_robust04_title_query`` the first
    100 entries of that query's ranked list are used. Each document's tokens
    are split by ``enum_passage``; the first passage of a document is always
    kept and every later passage is kept with probability 0.1.

    Args:
        max_seq_length: Length argument forwarded to ``enum_passage``.

    Returns:
        A dict mapping query id to the list of candidates kept for it. Each
        candidate carries the document id, an empty string, and the passage.
        Sampling uses the global ``random`` state, so the output changes
        between calls unless that state is seeded by the caller.

    Note:
        Lookups into the ranked lists and the token table are unguarded; a
        missing query id or document id propagates as whatever error those
        containers raise.
    """
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_predict(4)
    queries = load_robust04_title_query()

    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    # Only the query ids are needed here; the query text itself is not part
    # of a candidate, so it is neither fetched nor tokenized.
    for query_id in queries:
        doc_ids = [e.doc_id for e in galago_rank[query_id][:100]]

        candidate = []
        for doc_id in doc_ids:
            passages = enum_passage(tokens_d[doc_id], max_seq_length)
            for idx, passage in enumerate(passages):
                # Short-circuit keeps the first passage without drawing a
                # random number, exactly one draw per later passage.
                if idx == 0 or random.random() < 0.1:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))

        out_d[query_id] = candidate
    return out_d
Exemplo n.º 2
0
 def make_candidate(doc_id: str) -> Iterable[QCKCandidateWToken]:
     """Yield at most ``max_passage_per_doc`` passage candidates for a document.

     The document's tokens are looked up in ``token_data`` and split by
     ``enum_passage`` using ``content_len``. Each yielded candidate has the
     id ``<doc_id>_<passage index>``, an empty string, and the passage tokens.
     """
     passages = enum_passage(token_data[doc_id], content_len)
     for passage_no, passage in enumerate(passages):
         # Stop as soon as the per-document cap is reached.
         if passage_no >= max_passage_per_doc:
             return
         part_id = "{}_{}".format(doc_id, passage_no)
         yield QCKCandidateWToken(part_id, "", passage)
Exemplo n.º 3
0
    def get_candidate_for_query(query: QCKQuery):
        """Return passage candidates built from evidence fetched for ``query``.

        Evidence is obtained with ``get_evidence_from_pool(query.text, 60)``.
        Each evidence text is tokenized and split by ``enum_passage``; every
        resulting passage becomes one candidate whose id is ``str(e_id)``.
        """
        evidences = get_evidence_from_pool(query.text, 60)
        n_query_tokens = len(tokenizer.tokenize(query.text))
        # Room left for a candidate once the query is accounted for.
        # NOTE(review): the constant 3 presumably reserves special tokens.
        passage_budget = max_seq_length - 3 - n_query_tokens

        return [
            QCKCandidateWToken(str(e_id), "", passage)
            for text, e_id, _score in evidences
            for passage in enum_passage(tokenizer.tokenize(text),
                                        passage_budget)
        ]
Exemplo n.º 4
0
        def convert(
            target_pair: Tuple[QCKQueryWToken, List[KDPWToken]],
            other_pairs: List[Tuple[QCKQueryWToken, List[KDPWToken]]]
        ) -> Iterable[Payload]:
            """Yield two payloads for every candidate of the target query.

            For each candidate registered under the target query's id, the
            first payload pairs it with the target query and its KDP list,
            and the second pairs it with a (query, KDP list) pair obtained
            from ``pick1(other_pairs)``.

            Args:
                target_pair: The query whose candidates are expanded, with
                    its KDP list.
                other_pairs: Pool handed to ``pick1`` once per candidate.

            Yields:
                ``Payload`` instances, in target / other order per candidate.
            """
            target_query, target_kdp_list = target_pair
            candidates = self.candidates_dict[target_query.query_id]

            # Tokenize every candidate up front, before anything is yielded.
            candidates_w_tokens = []
            for raw_candidate in candidates:
                candidates_w_tokens.append(
                    QCKCandidateWToken.from_qck_candidate(self.tokenizer,
                                                          raw_candidate))

            # Diagnostic output for unusually large KDP x candidate products.
            if len(target_kdp_list) * len(candidates) > 1000 * 1000:
                print(target_query)
                print(len(target_kdp_list))
                print(len(candidates))

            def build_payload(candidate: QCKCandidateWToken,
                              query: QCKQueryWToken,
                              kdp_list: List[KDPWToken]) -> Payload:
                # Only the first k_group_size KDPs take part in an instance.
                kept_kdps = kdp_list[:self.k_group_size]
                kdp_token_list = [kdp.sub_tokens for kdp in kept_kdps]

                info = {
                    'query': get_light_qckquery(query),
                    'candidate': get_light_qckcandidate(candidate),
                    'kdpl': lmap(get_light_kdp, kept_kdps)
                }
                return Payload(kdp_list=kdp_token_list,
                               text1=query.tokens,
                               text2=candidate.tokens,
                               data_id=data_id_manager.assign(info),
                               is_correct=self._is_correct(query, candidate))

            for tokenized_candidate in candidates_w_tokens:
                yield build_payload(tokenized_candidate, target_query,
                                    target_kdp_list)
                # The other pair is drawn only after the target payload has
                # been produced, once per candidate.
                other_query, other_kdp_list = pick1(other_pairs)
                yield build_payload(tokenized_candidate, other_query,
                                    other_kdp_list)
Exemplo n.º 5
0
def get_candidate_all_passage_w_samping(
        max_seq_length=256, neg_k=1000) -> Dict[str, List[QCKCandidateWToken]]:
    """Build sampled passage candidates from judged and top-ranked documents.

    For every query id in the structured qrels that also has a title query,
    the candidate documents are the union of the judged documents and the
    first ``neg_k`` entries of that query's ranked list. Each document's
    tokens are split by ``enum_passage``; the first passage of a document is
    always kept and every later passage is kept with probability 0.1.

    Args:
        max_seq_length: Length argument forwarded to ``enum_passage``.
        neg_k: Number of leading ranked-list entries added to the judged
            documents.

    Returns:
        A dict mapping query id to the list of candidates kept for it. Each
        candidate carries the document id, an empty string, and the passage.
        Documents are visited in set-iteration order and sampling uses the
        global ``random`` state, so the output is not stable across calls.

    Note:
        Lookups into the ranked lists and the token table are unguarded; a
        missing query id or document id propagates as whatever error those
        containers raise.
    """
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_train()
    tokens_d.update(load_robust_tokens_for_predict(4))
    queries = load_robust04_title_query()
    judgement: Dict[str, Dict] = load_qrels_structured(qrel_path)

    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id, judge_entries in judgement.items():
        # The previous guard tested membership in `judgement` while iterating
        # over `judgement`, so it could never skip anything. Judged queries
        # without a title query are the ones that need skipping.
        if query_id not in queries:
            continue

        doc_ids = set(judge_entries)
        doc_ids.update(e.doc_id for e in galago_rank[query_id][:neg_k])

        candidate = []
        for doc_id in doc_ids:
            passages = enum_passage(tokens_d[doc_id], max_seq_length)
            for idx, passage in enumerate(passages):
                # Short-circuit keeps the first passage without drawing a
                # random number, exactly one draw per later passage.
                if idx == 0 or random.random() < 0.1:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))

        out_d[query_id] = candidate
    return out_d
Exemplo n.º 6
0
 def make_candidate(e_id: int) -> Iterable[QCKCandidate]:
     """Yield one candidate per passage of the evidence text for ``e_id``.

     The text is read from ``evi_dict``, tokenized, and split by
     ``enum_passage`` using ``candidate_max_len``. Every candidate uses
     ``str(e_id)`` as its id and an empty string as its second argument.
     """
     evidence_tokens = tokenizer.tokenize(evi_dict[e_id])
     passages = enum_passage(evidence_tokens, candidate_max_len)
     yield from (QCKCandidateWToken(str(e_id), "", chunk)
                 for chunk in passages)
Exemplo n.º 7
0
 def make_candidate(doc_id: str):
     """Return a candidate holding the first ``doc_len`` tokens of a document.

     Tokens are read from ``data[doc_id]``; the candidate keeps ``doc_id``
     as its id and an empty string as its second argument.
     """
     truncated_tokens = data[doc_id][:doc_len]
     return QCKCandidateWToken(doc_id, "", truncated_tokens)
Exemplo n.º 8
0
 def get_qck_candidate_w_token(self, c: QCKCandidate) -> QCKCandidateWToken:
     """Return a tokenized copy of ``c``.

     The result keeps ``c.id`` and ``c.text`` and adds the tokens produced
     by ``self.tokenizer.tokenize(c.text)``.
     """
     candidate_tokens = self.tokenizer.tokenize(c.text)
     return QCKCandidateWToken(c.id, c.text, candidate_tokens)