示例#1
0
def predict_see_candidate(bm25_module: BM25, claims, top_k):
    """Rank candidate perspectives for each claim with BM25 over an
    expanded query built from the top retrieved candidates.

    For each claim: retrieve 50 candidate perspectives, build a
    pseudo-relevance-feedback query (length-normalized term counts of the
    top-30 candidates, added to the claim's own term counts), then score
    every candidate with BM25 and keep the top_k.

    Returns a list of (cid, prediction_list) pairs; each prediction entry
    carries the texts, the score, and the score's explanation string.

    Note: the original also precomputed per-claim query term counts
    (c_qtf_d) that were never read — that dead work is removed here.
    """
    output = []
    for claim in claims:
        cid = claim['cId']
        claim_text = claim['text']
        # First-pass retrieval from the lucene perspective pool.
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = [_pid for _text, _pid, _score in lucene_results]

        # Fetch and tokenize each candidate exactly once; the original
        # re-fetched and re-tokenized every perspective in the scoring loop.
        p_texts = lmap(perspective_getter, candidate_pids)
        p_tokens_list = lmap(bm25_module.tokenizer.tokenize_stem, p_texts)
        pid_to_text = dict(zip(candidate_pids, p_texts))

        # Expanded query terms: per-passage length-normalized counts
        # accumulated over the top-30 candidates.
        acc_counter = Counter()
        for tokens in p_tokens_list[:30]:
            for t in tokens:
                acc_counter[t] += 1 / len(tokens)
        c = normalize_counter(acc_counter)
        c_tokens = bm25_module.tokenizer.tokenize_stem(claim_text)
        qtf = c + Counter(c_tokens)

        ranked_list = []
        for pid, p_tokens in zip(candidate_pids, p_tokens_list):
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            ranked_list.append((pid, score))
        ranked_list.sort(key=lambda x: x[1], reverse=True)

        prediction_list = []
        for pid, score in ranked_list[:top_k]:
            p_entry = {
                'cid': cid,
                'pid': pid,
                'claim_text': claim_text,
                'perspective_text': pid_to_text[pid],
                # score is a NamedNumber from score_inner; .name carries
                # the per-term explanation.
                'rationale': score.name,
                'score': score,
            }
            prediction_list.append(p_entry)
        output.append((cid, prediction_list))

    return output
示例#2
0
def predict_by_bm25_from_candidate(bm25_module, claims,
                                   candidate_dict: List[Tuple[int, List[int]]],
                                   top_k) -> List[Tuple[int, List[Dict]]]:
    """Score each candidate perspective against its claim with BM25 and
    keep the top_k highest-scoring entries per claim.

    Returns (cid, prediction_list) pairs in the order of candidate_dict;
    each prediction entry holds both texts, an empty rationale, and the
    BM25 score.
    """
    cid_to_text: Dict[int, str] = claims_to_dict(claims)

    def scorer(c_text, p_text) -> NamedNumber:
        return bm25_module.score(c_text, p_text)

    def build_entry(cid, claim_text, pid) -> Dict:
        # One prediction record per (claim, perspective) pair.
        p_text = perspective_getter(pid)
        return {
            'cid': cid,
            'pid': pid,
            'claim_text': claim_text,
            'perspective_text': p_text,
            'rationale': "",
            'score': scorer(claim_text, p_text),
        }

    all_prediction_list: List[Tuple[int, List[Dict]]] = []
    for cid, candidates in candidate_dict:
        claim_text = cid_to_text[cid]
        entries = [build_entry(cid, claim_text, pid) for pid in candidates]
        entries.sort(key=lambda x: x['score'], reverse=True)
        all_prediction_list.append((cid, entries[:top_k]))
    return all_prediction_list
示例#3
0
def main():
    """Sample ~30% of gold perspective clusters from the 100 claims with
    the most perspectives and write them as JSON (claim-alteration step 1).
    """
    pc_data: List[Dict] = load_claim_perspective_pair()

    # Handle the claims with the most perspectives first.
    pc_data.sort(key=lambda e: len(e['perspectives']), reverse=True)
    gold_d: Dict[int, List[PerspectiveCluster]] = load_perspectrum_golds()
    ca_cid = 1  # running id assigned to each sampled entry

    out_j = []
    for e in pc_data[:100]:
        cid = e['cId']
        if not gold_d[cid]:
            continue  # claim has no gold clusters
        c_text = e['text']
        for pc in gold_d[cid]:
            # Keep roughly 30% of the clusters at random; each kept
            # cluster is represented by its first perspective.
            if random.random() < 0.3:
                first_pid = pc.perspective_ids[0]
                p_text = perspective_getter(first_pid)
                j_entry = {
                    'cid': cid,
                    'claim_text': c_text,
                    'ca_cid': ca_cid,
                    'perspective': {
                        'stance': pc.stance_label_3,
                        'pid': first_pid,
                        'p_text': p_text
                    }
                }
                ca_cid += 1
                out_j.append(j_entry)
    print("total of {}".format(len(out_j)))
    # Context manager fixes the original's leaked file handle and
    # guarantees the JSON is flushed and closed.
    with open(at_output_dir("ca_building", "claims.step1.txt"),
              "w",
              encoding="utf-8") as out_f:
        json.dump(out_j, out_f, indent=True)
示例#4
0
    def generate_instances(self, claim: Dict,
                           data_id_manager) -> List[Payload]:
        """Build one Payload per candidate perspective of the given claim.

        Each payload pairs the claim text with one perspective and the
        claim's passage list; is_correct marks gold-cluster membership.
        """
        cid = claim['cId']
        claim = claim['text']
        candidate_pids = self.candidate_perspective[cid]
        passages = self.cid_to_passages[cid]
        gold_clusters = self.gold[cid]

        output = []
        for pid in candidate_pids:
            # Gold label: pid appears in any gold cluster for this claim.
            is_correct = any(pid in cluster for cluster in gold_clusters)
            output.append(Payload(
                left(passages),
                claim,
                perspective_getter(pid),
                data_id_manager.assign({'cid': cid, 'pid': pid}),
                is_correct,
            ))

        return output
示例#5
0
    def generate_instances(self, claim: Dict,
                           data_id_manager) -> List[Payload]:
        """Build one Payload per (candidate perspective, good passage) pair.

        When self.filter_good is set, only passages passing score_over_zero
        are kept; otherwise every passage is used.
        """
        cid = claim['cId']
        claim = claim['text']
        perspectives = self.candidate_perspective[cid]
        passages = self.cid_to_passages[cid]

        if self.filter_good:
            filter_condition = score_over_zero
        else:

            def filter_condition(dummy):
                # No-op filter: keep every passage.
                return True

        good_passages: List[List[str]] = left(
            lfilter(filter_condition, passages))
        output = []
        for pid in perspectives:
            is_correct = any([pid in cluster for cluster in self.gold[cid]])
            # Hoisted out of the passage loop: the perspective text does not
            # depend on the passage (the original re-fetched it per passage).
            perspective = perspective_getter(pid)
            for passage_idx, passage in enumerate(good_passages):
                info = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
                p = Payload(passage, claim, perspective,
                            data_id_manager.assign(info), is_correct)
                output.append(p)

        return output
示例#6
0
def main():
    """Dump every claim together with its gold perspective clusters to a
    tab-separated text file for manual inspection."""
    pc_data: List[Dict] = load_claim_perspective_pair()

    # Claims with the most perspectives come first.
    pc_data.sort(key=lambda e: len(e['perspectives']), reverse=True)
    gold_d: Dict[int, List[PerspectiveCluster]] = load_perspectrum_golds()

    # Context manager fixes the original's leaked file handle and ensures
    # the output is flushed even if a lookup below raises.
    with open(at_data_dir("perspective", "claims_and_perspective.txt"), "w", encoding="utf-8") as out_f:
        for e in pc_data:
            cid = e['cId']

            if not gold_d[cid]:
                continue  # no gold clusters for this claim
            text = e['text']
            rows = []
            row = [str(cid), text]
            rows.append(row)

            for pc in gold_d[cid]:
                rows.append([pc.stance_label_3, pc.stance_label_5])
                for pid in pc.perspective_ids:
                    row = [perspective_getter(pid)]
                    rows.append(row)
                rows.append([])  # blank line separates clusters

            for row in rows:
                out_f.write("\t".join(row) + "\n")
            out_f.write("\n\n\n")
示例#7
0
 def scorer(lucene_score, query_id) -> NamedNumber:
     """Score a perspective by summing claim log-odds over its stem tokens.

     query_id is "<claim_id>_<perspective_id>"; lucene_score is unused.
     The returned NamedNumber's name holds a per-token weight breakdown.
     """
     claim_id, p_id = query_id.split("_")
     stems = tokenizer.tokenize_stem(perspective_getter(int(p_id)))
     log_odds = claim_log_odds_dict[claim_id]
     explanation = " ".join(["{0} ({1:.2f})".format(t, log_odds[t]) for t in stems])
     total = sum([log_odds[t] for t in stems])
     return NamedNumber(total, explanation)
示例#8
0
def main():
    """Inspect evidence-ranking quality per (claim, perspective) query.

    Loads a grouped ranked list from sys.argv[1]; for each query whose
    precision over its found-gold count is below 1.0, prints the claim,
    the perspective, the gold evidence hits, and top-ranked false positives.
    """
    claim_text_d: Dict[int, str] = get_all_claim_d()
    # Re-key by string ids so lookups match the string query ids below.
    claim_text_d: Dict[str, str] = dict_key_map(str, claim_text_d)
    evi_dict: Dict[str, str] = dict_key_map(str, load_evidence_dict())
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    print("V2")

    def print_entry(entry):
        # One line per ranked evidence: [rank] doc_id: text
        evidence_text = evi_dict[entry.doc_id]
        print("[{}] {}: {}".format(entry.rank, entry.doc_id, evidence_text))

    ranked_list_dict = load_ranked_list_grouped(sys.argv[1])
    for query, ranked_list in ranked_list_dict.items():
        print()

        # Query id format: "<claim_id>_<perspective_id>".
        claim_id, perspective_id = query.split("_")
        gold_ids: List[str] = lmap(str, evi_gold_dict[query])
        if not gold_ids:
            print("query {} has no gold".format(query))
            continue
        assert gold_ids
        claim_text = claim_text_d[claim_id]
        perspective_text = perspective_getter(int(perspective_id))

        # Split ranked entries into gold hits and high-ranked (rank < 3) misses.
        pos_entries = []
        neg_entries = []
        for entry in ranked_list:
            label = entry.doc_id in gold_ids
            if label:
                pos_entries.append(entry)
            elif entry.rank < 3:
                neg_entries.append(entry)

        if not pos_entries:
            print("gold not in ranked list")
            continue

        # Number of gold entries that actually appear in the ranked list.
        num_rel = len(pos_entries)

        # Precision over the first num_rel entries (R-precision style).
        correctness = []
        for entry in ranked_list[:num_rel]:
            label = entry.doc_id in gold_ids
            correctness.append(int(label))

        precision = average(correctness)
        if precision > 0.99:
            # Perfect ranking for this query; nothing to inspect.
            print("Good")
            continue
        print("precision at {}: {}".format(num_rel, precision))

        print("Claim: ", claim_text)
        print("perspective_text: ", perspective_text)
        print(" < GOLD >")
        foreach(print_entry, pos_entries)
        print(" < False Positive >")
        foreach(print_entry, neg_entries)
示例#9
0
    def encode(e: Tuple[int, int, List[Dict]]):
        """Yield a PayloadAsTokens per passage of a (cid, pid, passages) triple.

        text1/text2 are the tokenized claim and perspective; every payload
        is labeled is_correct=0.
        """
        cid, pid, passages = e
        claim_tokens = tokenize(cid_to_text[cid])
        pers_tokens = tokenize(perspective_getter(pid))

        for passage_idx, passage in enumerate(passages):
            info = {
                'cid': cid,
                'pid': pid,
                'passage_idx': passage_idx,
                'passage': passage['passage'],
                'c_text': cid_to_text[cid],
                'p_text': perspective_getter(pid)
            }
            yield PayloadAsTokens(passage=passage['passage'],
                                  text1=claim_tokens,
                                  text2=pers_tokens,
                                  data_id=data_id_man.assign(info),
                                  is_correct=0)
示例#10
0
 def scorer(lucene_score, query_id) -> NamedNumber:
     """Combine BM25 with a neural model prediction for one claim/perspective pair.

     query_id is "<claim_id>_<perspective_id>". Returns a NamedNumber whose
     name appends the neural component to BM25's explanation string.
     """
     claim_id, p_id = query_id.split("_")
     i_claim_id = int(claim_id)
     payload = []
     p_text = perspective_getter(int(p_id))
     c_text = cid_to_text[i_claim_id]
     payload.append(encoder.encode_pair(c_text, p_text))
     r = proxy.predict(payload)
     # NOTE(review): the model output is negated before use — presumably the
     # predictor returns a loss/distance rather than a score; confirm sign.
     ns_score = -float(r[0])
     #ns_score = 0
     score = bm25_module.score(c_text, p_text)
     # The factor of 10 on the neural term appears to be hand-tuned.
     new_score = score + ns_score * 10
     score = NamedNumber(new_score, score.name + " {}".format(ns_score))
     return score
示例#11
0
File: common.py  Project: clover3/Chair
def get_qck_queries_all() -> List[QCKQuery]:
    """Build a QCKQuery for every perspective cluster.

    The query text is the claim text concatenated (space-separated) with
    the text of the cluster's smallest perspective id.
    """
    claim_text_d: Dict[int, str] = get_all_claim_d()

    def to_query(pc) -> QCKQuery:
        claim_text = claim_text_d[pc.claim_id]
        rep_pid = min(pc.perspective_ids)
        combined = claim_text + " " + perspective_getter(rep_pid)
        return QCKQuery(get_pc_cluster_query_id(pc), combined)

    return [to_query(pc) for pc in enum_perspective_clusters()]
示例#12
0
 def scorer(lucene_score, query_id) -> NamedNumber:
     """BM25-score a perspective against an (optionally expanded) claim query.

     query_id is "<claim_id>_<perspective_id>"; lucene_score is unused.
     When expansion terms exist for the claim, their 50 highest-weight
     entries are added to the claim's own term counts, and the claim id is
     recorded in the enclosing scope's found_claim set.
     """
     nonlocal found_claim
     claim_id, p_id = query_id.split("_")
     i_claim_id = int(claim_id)
     if i_claim_id in q_tf_replace_norm:
         ex_qtf = q_tf_replace_norm[i_claim_id]
         # Keep only the 50 strongest expansion terms.
         ex_qtf = Counter(dict(ex_qtf.most_common(50)))
         qtf = ex_qtf + c_qtf_d[i_claim_id]
         found_claim.add(i_claim_id)
     else:
         qtf = c_qtf_d[i_claim_id]
     p_text = perspective_getter(int(p_id))
     p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
     score = bm25_module.score_inner(qtf, Counter(p_tokens))
     return score
def main():
    """Interactive lookup: read "<cid>_<pid>" lines and print the claim,
    perspective, and gold evidence texts for that pair."""
    claim_text_d: Dict[int, str] = get_all_claim_d()
    evidence_d = load_evidence_dict()
    evidence_gold = evidence_gold_dict()
    while True:
        try:
            s = input()
        except EOFError:
            break  # stdin closed — exit cleanly instead of crashing
        try:
            cid, pid = s.split("_")
            cid = int(cid)
            pid = int(pid)
        except ValueError:
            # Malformed input (wrong separator count or non-integers).
            print("expected input like <cid>_<pid>, got: {}".format(s))
            continue
        print("Claim: ", claim_text_d[cid])
        print("Perspective: ", perspective_getter(pid))
        key = cid, pid
        if key not in evidence_gold:
            # Original raised KeyError here; report and keep the loop alive.
            print("no gold evidence for {}_{}".format(cid, pid))
            continue
        e_ids = evidence_gold[key]
        for eid in e_ids:
            print("Evidence: ", evidence_d[eid])
示例#14
0
def main(input_path):
    """Print diagnostics for claims whose precision@k is below 0.3.

    Loads a grouped ranked list from input_path. For each claim (qid),
    k = min(total gold perspectives, 5). Low-precision claims are printed
    with their gold clusters (each pid annotated with its retrieval rank,
    or "X" if unranked) followed by the top-50 retrieved perspectives.
    """
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()
    grouped_ranked_list = load_ranked_list_grouped(input_path)

    def is_correct(qid: str, doc_id: str):
        # Correct if the doc's pid appears in any gold cluster of the claim.
        return any([int(doc_id) in cluster for cluster in gold[int(qid)]])

    top_k = 5
    for qid, entries in grouped_ranked_list.items():
        n_gold = sum(map(len, gold[int(qid)]))
        cut_n = min(n_gold, top_k)
        correctness = list([is_correct(qid, e.doc_id) for e in entries[:cut_n]])
        num_correct = sum(lmap(int, correctness))
        p_at_k = num_correct / cut_n

        pid_to_rank: Dict[str, int] = {e.doc_id: e.rank for e in entries}

        def get_rank(pid: int):
            # Rank of a gold pid in the retrieved list, or "X" if absent.
            if str(pid) in pid_to_rank:
                return pid_to_rank[str(pid)]
            else:
                return "X"

        if p_at_k < 0.3:
            print(n_gold)
            print(p_at_k)
            print("Claim {} {}".format(qid, claim_d[int(qid)]))
            for cluster in gold[int(qid)]:
                print("-")
                for pid in cluster:
                    print("[{}]".format(get_rank(pid)), perspective_getter(int(pid)))
            for e in entries[:50]:
                correct_str = "Y" if is_correct(qid, e.doc_id) else "N"
                print("{} {} {}".format(correct_str, e.score, perspective_getter(int(e.doc_id))))
示例#15
0
File: common.py  Project: clover3/Chair
def get_qck_queries(split) -> List[QCKQuery]:
    """Build QCKQuery objects for perspective clusters of the given split.

    Clusters whose claim is outside the split are skipped. Query text is
    the claim text concatenated with the text of the cluster's smallest pid.
    """
    claim_ids = set(load_claim_ids_for_split(split))
    claim_text_d: Dict[int, str] = get_all_claim_d()

    query_list = []
    for pc in enum_perspective_clusters_for_split(split):
        if pc.claim_id not in claim_ids:
            continue
        rep_pid = min(pc.perspective_ids)
        combined = claim_text_d[pc.claim_id] + " " + perspective_getter(rep_pid)
        query_list.append(QCKQuery(get_pc_cluster_query_id(pc), combined))

    return query_list
示例#16
0
    def scorer(lucene_score, query_id) -> NamedNumber:
        """BM25 score plus an RM-expansion bonus when expansion info exists.

        query_id is "<claim_id>_<perspective_id>"; lucene_score is unused.
        Claims without expansion info are collected in the enclosing
        not_found set.
        """
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score: NamedNumber = bm25_module.score(c_text, p_text)

        nclaim_id = int(claim_id)
        # NOTE(review): membership is tested on rm_info but the expansion
        # terms are read from rm_info_c — confirm the two dicts share keys.
        if nclaim_id in rm_info:
            ex_qtf = rm_info_c[nclaim_id]
            p_tokens = tokenizer.tokenize_stem(p_text)
            ex_score = bm25_module.score_inner(ex_qtf, Counter(p_tokens))
            # Combined explanation: base name with expansion name appended.
            new_info = score.name + "({})".format(ex_score.name)
            score = NamedNumber(score + ex_score, new_info)
        else:
            not_found.add(claim_id)
        return score
示例#17
0
    def scorer(lucene_score, query_id) -> NamedNumber:
        """BM25 score with per-term claim weights applied to the query.

        query_id is "<claim_id>_<perspective_id>"; lucene_score is unused.
        Each stemmed claim term's frequency is multiplied by its weight;
        terms absent from the weight table are reported and dropped.
        """
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        qtf = Counter(stem_tokenize(c_text))
        weight = claim_term_weight[int(claim_id)]

        new_qtf = Counter()
        for k, v in qtf.items():
            try:
                w = weight[k]
                new_qtf[k] = w * v
            except KeyError as e:
                # Narrowed from `except Exception`: only the weight lookup
                # can raise here, and a broad catch would mask real bugs.
                print("Exception")
                print(e)
                print(k)

        tf = Counter(stem_tokenize(p_text))
        score = bm25_module.score_inner(new_qtf, tf)
        return score
示例#18
0
    def generate_instances(self, claim: Dict,
                           data_id_manager) -> List[PayloadAsTokens]:
        """Build a PayloadAsTokens per (perspective, passage) pair of a claim.

        Claim and perspective texts are word-piece tokenized once each;
        passages are re-tokenized from their token lists.
        """
        cid = claim['cId']
        claim_tokens = self.tokenizer.tokenize(claim['text'])
        candidate_pids = self.candidate_perspective[cid]
        passages = self.cid_to_passages[cid]
        gold_clusters = self.gold[cid]
        output = []
        for pid in candidate_pids:
            # Gold label: pid appears in any gold cluster for this claim.
            is_correct = any(pid in cluster for cluster in gold_clusters)
            perspective_tokens = self.tokenizer.tokenize(perspective_getter(pid))
            for passage_idx, passage in enumerate(left(passages)):
                passage_subtokens = tokenize_from_tokens(
                    self.tokenizer, passage)
                output.append(PayloadAsTokens(
                    passage_subtokens,
                    perspective_tokens,
                    claim_tokens,
                    data_id_manager.assign(
                        {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}),
                    is_correct))

        return output
示例#19
0
    def get_candidates(c: Dict) -> Tuple[int, List[int]]:
        """Collect candidate perspective ids for one claim.

        Three sources, appended in order: (1) top-50 lucene retrieval on the
        claim text, (2) gold perspectives the retrieval missed ("hard"
        candidates), (3) a second retrieval built from the tf-idf-weighted
        vocabulary of the missed perspectives' non-claim terms.
        """
        cid = c["cId"]
        assert type(cid) == int
        claim_text = c["text"]
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        top_k = 50
        first_pass = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list: List[int] = [pid for _, pid, _ in first_pass]

        # Gold perspectives missed by retrieval, plus the vocabulary of
        # their terms that never occur in the claim.
        gold_pids = cid_to_pids[int(cid)]
        hard_candidate = []
        mismatch_voca = Counter()
        for pid in gold_pids:
            if pid in candidate_list:
                continue
            hard_candidate.append(pid)
            for t in tokenizer.tokenize_stem(perspective_getter(pid)):
                if t not in claim_tokens:
                    mismatch_voca[t] += 1

        candidate_list.extend(hard_candidate)

        # Second-pass query: the 30 strongest mismatch terms by tf-idf.
        mismatch_tf_idf = get_tf_idf(mismatch_voca)
        new_qterms = left(mismatch_tf_idf.most_common(30))
        second_pass = es_helper.get_perspective_from_pool(
            " ".join(new_qterms), top_k)
        for _, pid, _ in second_pass:
            if pid not in candidate_list:
                candidate_list.append(pid)

        return cid, candidate_list
示例#20
0
def doc_id_to_candidate(doc_id: str) -> QCKCandidate:
    """Wrap a perspective doc_id as a QCKCandidate carrying its text."""
    pid = int(doc_id)
    return QCKCandidate(doc_id, perspective_getter(pid))
示例#21
0
def get_qck_candidate_from_candidate_id(candidate_id: str) -> QCKCandidate:
    """Build a QCKCandidate whose text is the perspective for this id."""
    text = perspective_getter(int(candidate_id))
    return QCKCandidate(candidate_id, text)
示例#22
0
def cid_pid_format_to_qck(candidate_pers):
    """Convert (cid, [pid, ...]) pairs into a str(cid) -> [QCKCandidate] dict."""
    candidate_dict: Dict[str, List[QCKCandidate]] = dict()
    for cid, candidate_pids in candidate_pers:
        candidates = [QCKCandidate(str(pid), perspective_getter(pid))
                      for pid in candidate_pids]
        candidate_dict[str(cid)] = candidates
    return candidate_dict
示例#23
0
File: kdp_para.py  Project: clover3/Chair
 def get_p_tokens(self, pid: int):
     """Return the tokens of perspective `pid`, memoized in self.p_tokens_d."""
     if pid in self.p_tokens_d:
         return self.p_tokens_d[pid]
     tokens = self.tokenizer.tokenize(perspective_getter(pid))
     self.p_tokens_d[pid] = tokens
     return tokens
示例#24
0
def main():
    """Interactive loop: read a perspective id per line and print its text."""
    while True:
        try:
            s = input()
        except EOFError:
            break  # stdin closed — exit cleanly instead of crashing
        try:
            pid = int(s)
        except ValueError:
            # Non-integer input previously crashed the loop; report instead.
            print("expected an integer perspective id, got: {}".format(s))
            continue
        print(perspective_getter(pid))
示例#25
0
def pc_predict_to_inspect(bm25_module: BM25, q_tf_replace: Dict[int, Counter],
                          q_tf_replace_0: Dict[int, Counter], claims, top_k):
    """Debug-print BM25 rankings of candidate perspectives per claim.

    For each claim: retrieve 50 candidates, score them with a query mixing
    the claim's own term counts with the normalized expansion terms in
    q_tf_replace, then print the gold clusters, the query term weights, and
    the top_k results — each also re-scored with the alternative expansion
    q_tf_replace_0 for side-by-side comparison.
    """
    gold = get_claim_perspective_id_dict()
    # Normalize both expansion-term distributions per claim.
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)
    q_tf_replace_0_norm = dict_value_map(normalize_counter, q_tf_replace_0)

    # Per-claim term-frequency counters from the stemmed claim text.
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def counter_to_str(c: Dict) -> str:
        # Tab-separated "term weight" pairs for printing.
        s = ""
        for k, v in c.items():
            s += "{0} {1:.2f}".format(k, v) + "\t"
        return s

    for claim in claims:
        cid = claim['cId']
        i_claim_id = int(cid)
        claim_text = claim['text']
        # Candidate pool: top-50 lucene retrieval on the claim text.
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_pids.append(_pid)

        if i_claim_id in q_tf_replace_norm:
            # Query = top-50 expansion terms + claim term counts.
            claim_qtf = Counter(
                dict_value_map(lambda x: x * 1, c_qtf_d[i_claim_id]))
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
        else:
            qtf = c_qtf_d[i_claim_id]

        ranked_list = []
        for pid in candidate_pids:
            p_text = perspective_getter(int(pid))
            p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            debug_str = ""

            e = score, pid, p_text, debug_str
            ranked_list.append(e)

        gold_pids = gold[cid]

        def is_correct(pid):
            # True if pid is in any gold cluster for this claim.
            for pids in gold_pids:
                if pid in pids:
                    return True
            return False

        ranked_list.sort(key=lambda x: x[0], reverse=True)

        # Query terms with the module's idf factor applied, for inspection.
        qtf_idf_applied = {
            k: v * bm25_module.term_idf_factor(k)
            for k, v in qtf.items()
        }
        print()
        print("Claim: ", cid, claim_text)
        for cluster in gold_pids:
            print("-")
            for pid in cluster:
                print(pid, perspective_getter(pid))
        print()
        print("qtf:", counter_to_str(qtf))
        # ex_qtf is only bound when the branch above ran; the first clause
        # of this condition guarantees that.
        if i_claim_id in q_tf_replace_norm and i_claim_id in q_tf_replace_0_norm:
            print("ex_qtf:", counter_to_str(ex_qtf))
            ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
            ex_qtf_0 = Counter(dict(ex_qtf_0.most_common(50)))
            print("ex_qtf_0:", counter_to_str(ex_qtf_0))
        print("qtf idf apllied:", counter_to_str(qtf_idf_applied))

        for score, pid, p_text, debug_str in ranked_list[:top_k]:

            if i_claim_id in q_tf_replace_0_norm:
                # Re-score with the alternative expansion for comparison.
                p_text = perspective_getter(int(pid))
                p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
                ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
                qtf = ex_qtf_0 + c_qtf_d[i_claim_id]
                score2 = bm25_module.score_inner(qtf, Counter(p_tokens))
                correct_str = "Y" if is_correct(pid) else "N"
                print("{0} {1:.2f} ({2:.2f}) {3} / {4} / {5}".format(
                    correct_str, score, score2, p_text, score.name,
                    score2.name))
示例#26
0
    def get_tokens(pid):
        """Tokenize perspective `pid` once, caching the result in tokens_d."""
        if pid in tokens_d:
            return tokens_d[pid]
        tokens_d[pid] = tokenizer.tokenize(perspective_getter(pid))
        return tokens_d[pid]
示例#27
0
 def scorer(lucene_score, query_id) -> NamedNumber:
     """BM25 score for the claim/perspective pair encoded in query_id.

     query_id is "<claim_id>_<perspective_id>"; lucene_score is unused.
     """
     claim_id, p_id = query_id.split("_")
     claim_text = cid_to_text[int(claim_id)]
     pers_text = perspective_getter(int(p_id))
     return bm25_module.score(claim_text, pers_text)