Example #1
def convert_sub_token(tokenizer, r: Payload) -> PayloadAsTokens:
    passage_subtokens = tokenize_from_tokens(tokenizer, r.passage)
    tokens1: List[str] = tokenizer.tokenize(r.text1)
    tokens2: List[str] = tokenizer.tokenize(r.text2)

    return PayloadAsTokens(passage=passage_subtokens,
                           text1=tokens1,
                           text2=tokens2,
                           data_id=r.data_id,
                           is_correct=r.is_correct)
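`PayloadAsTokens` itself is not shown in any of these examples. A minimal sketch consistent with the call sites above (the field names come from the keyword arguments; the types are assumptions) could be:

from typing import List, NamedTuple

# Hypothetical definition, inferred from the constructor calls in these examples.
class PayloadAsTokens(NamedTuple):
    passage: List[str]   # subword tokens of the passage
    text1: List[str]     # subword tokens of the first text (e.g. a query)
    text2: List[str]     # subword tokens of the second text (e.g. a candidate)
    data_id: int
    is_correct: bool

`PayloadAsIds` in Example #8 presumably mirrors this layout with `List[int]` vocabulary ids in place of the token strings.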
Example #2
def add_tokens_to_qk_unit(qk_unit: QKUnit, tokenizer) -> QKUnitWToken:
    query, kdp_list = qk_unit
    q = QCKQueryWToken(query.query_id, query.text,
                       tokenizer.tokenize(query.text))
    new_kdp_list = []
    for kdp in kdp_list:
        sub_tokens = tokenize_from_tokens(tokenizer, kdp.tokens)
        kdp_w_tokens = KDPWToken(kdp.doc_id, kdp.passage_idx,
                                 kdp.start_location, kdp.tokens, sub_tokens)
        new_kdp_list.append(kdp_w_tokens)
    return q, new_kdp_list
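Here a `QKUnit` is unpacked into a query and a KDP list, and rebuilt with tokens attached. Plausible aliases matching the tuple shapes used above (the aliases themselves are an assumption):

from typing import List, Tuple

# Hypothetical aliases matching how qk_unit is unpacked and the result returned.
QKUnit = Tuple[QCKQuery, List[KDP]]
QKUnitWToken = Tuple[QCKQueryWToken, List[KDPWToken]]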
Example #3
    def convert(pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[QKInstance]:
        query, passages = pair
        for passage in passages:
            info = {
                'query': query,
                'kdp': passage
            }
            yield QKInstance(query.text,
                             tokenize_from_tokens(self.tokenizer, passage.tokens),
                             data_id_manager.assign(info),
                             self.get_label(query, passage))
Example #4
    def _convert_sub_token(self, r: QCKInstance) -> PayloadAsTokens:
        tokenizer = self.tokenizer
        passage_subtokens = tokenize_from_tokens(tokenizer, r.doc_tokens)
        tokens1: List[str] = tokenizer.tokenize(r.query_text)
        tokens2: List[str] = tokenizer.tokenize(r.candidate_text)

        return PayloadAsTokens(
            passage=passage_subtokens,
            text1=tokens1,
            text2=tokens2,
            data_id=r.data_id,
            is_correct=r.is_correct,
        )
Example #5
        def convert(k: KDP) -> Iterable[PayloadAsTokens]:
            k_tokens = tokenize_from_tokens(self.tokenizer, k.tokens)
            for query in self.queries:
                for c in self.candidates_dict[query.query_id]:
                    info = {
                        'query': light_query(query),
                        'candidate': light_candidate(c),
                        'kdp': light_kdp(k)
                    }

                    yield PayloadAsTokens(query.tokens, c.tokens, k_tokens,
                                          data_id_manager.assign(info),
                                          self._is_correct(query, c))
Example #6
    def _convert_sub_token(self, r: Instance) -> InstanceTokenized:
        tokenizer = self.tokenizer
        passage_subtokens_list = [tokenize_from_tokens(tokenizer, p)
                                  for p in r.doc_tokens_list]
        tokens1: List[str] = tokenizer.tokenize(r.query_text)
        tokens2: List[str] = tokenizer.tokenize(r.candidate_text)

        return InstanceTokenized(
            passage_subtokens_list=passage_subtokens_list,
            q_tokens=tokens1,
            c_tokens=tokens2,
            data_id=r.data_id,
            is_correct=r.is_correct,
        )
Example #7
    def generate_instances(self, claim: Dict,
                           data_id_manager) -> List[PayloadAsTokens]:
        cid = claim['cId']
        claim_tokens = self.tokenizer.tokenize(claim['text'])
        perspectives = self.candidate_perspective[cid]
        passages = self.cid_to_passages[cid]
        output = []
        for pid in perspectives:
            is_correct = any(pid in cluster for cluster in self.gold[cid])
            perspective = perspective_getter(pid)
            perspective_tokens = self.tokenizer.tokenize(perspective)
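            # left() presumably selects the passage from each (passage, score) pair.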
            for passage_idx, passage in enumerate(left(passages)):
                passage_subtokens = tokenize_from_tokens(
                    self.tokenizer, passage)
                info = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
                p = PayloadAsTokens(passage_subtokens,
                                    perspective_tokens, claim_tokens,
                                    data_id_manager.assign(info), is_correct)
                output.append(p)

        return output
Example #8
        def convert(
                pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[PayloadAsTokens]:
            query, passages = pair
            tokenizer = self.tokenizer
            q_tokens: List[int] = tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(query.text))
            candidates = self.candidates_dict[query.query_id]
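            # Warn when more than a million instances would be generated.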
            num_inst_expectation = len(passages) * len(candidates)
            if num_inst_expectation > 1000 * 1000:
                print(query)
                print(len(passages))
                print(len(candidates))

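            # Convert every passage to ids once, so each candidate can reuse them.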
            passage_input_ids_list = []
            for p_idx, passage in enumerate(passages):
                if self.kdp_as_sub_token:
                    passage_subtokens = passage.tokens
                else:
                    passage_subtokens = tokenize_from_tokens(
                        tokenizer, passage.tokens)
                passage_input_ids_list.append(
                    tokenizer.convert_tokens_to_ids(passage_subtokens))

            for c in candidates:
                c_tokens: List[int] = tokenizer.convert_tokens_to_ids(
                    c.get_tokens(tokenizer))
                for p_idx, passage in enumerate(passages):
                    info = {
                        'query': get_light_qckquery(query),
                        'candidate': get_light_qckcandidate(c),
                        'kdp': get_light_kdp(passage)
                    }
                    passage_input_ids = passage_input_ids_list[p_idx]
                    inst = PayloadAsIds(passage=passage_input_ids,
                                        text1=q_tokens,
                                        text2=c_tokens,
                                        data_id=data_id_manager.assign(info),
                                        is_correct=self._is_correct(query, c))
                    yield inst
Example #9
        def convert(pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[Payload]:
            query, kdp_list = pair
            tokenizer = self.tokenizer
            q_tokens: List[str] = tokenizer.tokenize(query.text)
            candidates = self.candidates_dict[query.query_id]
            num_inst_expectation = len(kdp_list) * len(candidates)
            if num_inst_expectation > 1000 * 1000:
                print(query)
                print(len(kdp_list))
                print(len(candidates))
            p_sub_tokens = []
            for p_idx, kdp in enumerate(kdp_list):
                if self.kdp_as_sub_token:
                    passage_subtokens = kdp.tokens
                else:
                    passage_subtokens = tokenize_from_tokens(
                        tokenizer, kdp.tokens)
                p_sub_tokens.append(passage_subtokens)

            for c in candidates:
                c_tokens: List[str] = c.get_tokens(tokenizer)
                for p_idx, kdp in enumerate(kdp_list):
                    info = {
                        'query': get_light_qckquery(query),
                        'candidate': get_light_qckcandidate(c),
                        'kdp': get_light_kdp(kdp)
                    }
                    passage_subtokens = p_sub_tokens[p_idx]
                    inst = Payload(
                        passage=passage_subtokens,
                        text1=q_tokens,
                        text2=c_tokens,
                        data_id=data_id_manager.assign(info),
                        is_correct=self._is_correct(query, c),
                        kdp_score=self.get_rel_score(query, kdp),
                    )
                    yield inst
Example #10
    def tokenize_from_tokens_fn(tokens):
        return tokenize_from_tokens(tokenizer, tokens)
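All of these examples lean on `tokenize_from_tokens`, whose body is not shown. A minimal sketch, assuming it re-tokenizes an already-split token sequence into the tokenizer's subword vocabulary, might be:

from typing import List

# Hypothetical sketch: run the subword tokenizer over each pre-split token
# and flatten the pieces into one subword sequence.
def tokenize_from_tokens(tokenizer, tokens: List[str]) -> List[str]:
    sub_tokens: List[str] = []
    for token in tokens:
        sub_tokens.extend(tokenizer.tokenize(token))
    return sub_tokens

Under this reading, Example #10's `tokenize_from_tokens_fn` simply binds a tokenizer from the enclosing scope so callers only pass the tokens.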