Example #1
def pc_predict_by_bert_next_sent(bm25_module: BM25, claims,
                                 top_k) -> List[Tuple[str, List[Dict]]]:

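    # Score each claim-perspective pair by adding a scaled BERT
    # next-sentence score (queried from a remote server) to the BM25 score.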
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    port = 8123
    # Remote BERT next-sentence-prediction service.
    proxy = xmlrpc.client.ServerProxy(
        'http://ingham.cs.umass.edu:{}'.format(port))

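    # Encodes claim/perspective pairs for BERT; 512 is presumably the
    # maximum sequence length.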
    voca_path = pjoin(data_path, "bert_voca.txt")
    encoder = EncoderUnitPlain(512, voca_path)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        payload = []
        p_text = perspective_getter(int(p_id))
        c_text = cid_to_text[i_claim_id]
        payload.append(encoder.encode_pair(c_text, p_text))
        r = proxy.predict(payload)
        # Negate the remote model's raw output (sign convention of the
        # server), then mix it into the BM25 score with a weight of 10.
        ns_score = -float(r[0])
        score = bm25_module.score(c_text, p_text)
        new_score = score + ns_score * 10
        score = NamedNumber(new_score, score.name + " {}".format(ns_score))
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
Example #2
def pc_predict_from_vector_query(bm25_module: BM25,
                                 q_tf_replace: Dict[int, Counter], claims,
                                 top_k) -> List[Tuple[str, List[Dict]]]:

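    # BM25 scoring where each claim's query term frequencies are augmented
    # with normalized expansion terms from q_tf_replace.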
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    found_claim = set()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)

    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def scorer(lucene_score, query_id) -> NamedNumber:
        nonlocal found_claim
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        if i_claim_id in q_tf_replace_norm:
            # Claim terms keep unit weight when merged with the expansion
            # terms (the original scaled each value by a no-op factor of 1).
            claim_qtf = Counter(c_qtf_d[i_claim_id])
            ex_qtf = q_tf_replace_norm[i_claim_id]
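            # Keep only the 50 highest-weight expansion terms.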
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
            found_claim.add(i_claim_id)
        else:
            qtf = c_qtf_d[i_claim_id]
        p_text = perspective_getter(int(p_id))
        p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
        score = bm25_module.score_inner(qtf, Counter(p_tokens))
        return score

    r = predict_interface(claims, top_k, scorer)
    print("{} of {} found".format(len(found_claim), len(claims)))
    return r
Example #3
def predict_from_dict(score_d: Dict[CPID, float], claims,
                      top_k) -> List[Tuple[str, List[Dict]]]:
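    # Rank perspectives by a precomputed classifier score, tracking how
    # many query ids are actually covered by score_d.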
    suc_count = SuccessCounter()
    suc_count.reset()

    per_claim_suc = {}
    per_claim_counter = {}

    rationale_d = {}

    def scorer(lucene_score, query_id):
        claim_id, p_id = query_id.split("_")
        if claim_id not in per_claim_suc:
            per_claim_counter[claim_id] = Counter()
            per_claim_suc[claim_id] = SuccessCounter()

        cls_score = get_score_by_d(claim_id, query_id)

        # Earlier variants mixed in lucene_score; the final score is the
        # classifier score alone, with both values kept in the rationale.
        score = cls_score
        r = "score={0:.2f} <- cls_score({1:.2f}) lucene_score({2:.2f})".format(
            score, cls_score, lucene_score)
        rationale_d[query_id] = r
        return score

    def get_score_by_d(claim_id, query_id):
        if query_id in score_d:
            cls_score = score_d[query_id]
            per_claim_suc[claim_id].suc()
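            # Tally confident positives (>0.8) and negatives (<0.3).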
            if cls_score > 0.8:
                per_claim_counter[claim_id][1] += 1
            elif cls_score < 0.3:
                per_claim_counter[claim_id][0] += 1
            suc_count.suc()
        else:
            cls_score = 0
            per_claim_suc[claim_id].fail()
            suc_count.fail()
        return cls_score

    def get_rationale(query_id):
        if query_id in rationale_d:
            return rationale_d[query_id]
        else:
            return "(N/A)"

    r = predict_interface(claims, top_k, scorer, get_rationale)
    for claim in per_claim_suc:
        suc_counter = per_claim_suc[claim]
        print("{} suc/total={}/{}  True/False={}/{}".format(
            claim, suc_counter.get_suc(), suc_counter.get_total(),
            per_claim_counter[claim][1], per_claim_counter[claim][0]))

    print("{} found of {}".format(suc_count.get_suc(), suc_count.get_total()))
    return r
Example #4
def predict_by_bm25(bm25_module: BM25, claims,
                    top_k) -> List[Tuple[str, List[Dict]]]:

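    # Plain BM25 between the claim text and the perspective text.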
    cid_to_text: Dict[int, str] = claims_to_dict(claims)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score = bm25_module.score(c_text, p_text)
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
Example #5
def predict_by_bm25_rm(bm25_module: BM25, rm_info: Dict[str, List[Tuple[str,
                                                                        str]]],
                       claims, top_k) -> List[Tuple[str, List[Dict]]]:

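    # BM25 plus an expansion score built from relevance-model (RM) terms
    # in rm_info.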
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    tokenizer = PCTokenizer()

    def stem_merge(score_list: List[Tuple[str, float]]) -> Counter:
        c = Counter()
        for k, v in score_list:
            try:
                new_k = tokenizer.stemmer.stem(k)
                c[new_k] += v
            except UnicodeDecodeError:
                pass
        return c

    # Parse RM term scores to float, then normalize them per claim.
    rm_info: Dict[str, List[Tuple[str, float]]] = dict_value_map(
        parse_float, rm_info)
    rm_info: Dict[str, List[Tuple[str, float]]] = dict_value_map(
        normalize_scores, rm_info)
    rm_info_c: Dict[str, Counter] = dict_value_map(stem_merge, rm_info)
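    # Sanity check: number of RM entries vs. number of claims.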
    print(len(rm_info_c.keys()))
    print(len(claims))
    not_found = set()

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score: NamedNumber = bm25_module.score(c_text, p_text)

        # rm_info and rm_info_c are keyed by the claim id as a string;
        # the original compared an int key and could never match.
        if claim_id in rm_info_c:
            ex_qtf = rm_info_c[claim_id]
            p_tokens = tokenizer.tokenize_stem(p_text)
            ex_score = bm25_module.score_inner(ex_qtf, Counter(p_tokens))
            new_info = score.name + "({})".format(ex_score.name)
            score = NamedNumber(score + ex_score, new_info)
        else:
            not_found.add(claim_id)
        return score

    r = predict_interface(claims, top_k, scorer)
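    # Claims that had no RM expansion terms.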
    print(not_found)
    return r
Example #6
def predict_by_para_scorer(score_pred_file_name: FileName,
                           cpid_resolute_file: FileName,
                           claims,
                           top_k) -> List[Tuple[str, List[Dict]]]:
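    # Rank perspectives with paragraph-level classifier scores; pairs
    # missing from score_d back off to 0.5.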
    suc_count = SuccessCounter()
    suc_count.reset()

    pred_path: FilePath = pjoin(output_path, score_pred_file_name)
    print("Loading cpid_resolute")
    cpid_resolute: Dict[str, CPID] = load_cpid_resolute(cpid_resolute_file)
    print("Loading paragraph triple scores")
    score_d: Dict[CPID, float] = get_cpid_score_from_cache_or_raw(
        pred_path, cpid_resolute, "avg")

    per_claim_suc = {}
    per_claim_counter = {}

    def scorer(lucene_score, query_id):
        claim_id, p_id = query_id.split("_")
        if claim_id not in per_claim_suc:
            per_claim_counter[claim_id] = Counter()
            per_claim_suc[claim_id] = SuccessCounter()

        if query_id in score_d:
            cls_score = score_d[query_id]
            per_claim_suc[claim_id].suc()
            if cls_score > 0.8:
                per_claim_counter[claim_id][1] += 1
            elif cls_score < 0.3:
                per_claim_counter[claim_id][0] += 1
            suc_count.suc()
        else:
            cls_score = 0.5
            per_claim_suc[claim_id].fail()
            suc_count.fail()

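        # Weighted mix: the classifier score dominates; the Lucene score
        # is first normalized by 20.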
        score = 0.9 * cls_score + 0.1 * lucene_score / 20
        return score

    r = predict_interface(claims, top_k, scorer)
    for claim in per_claim_suc:
        suc_counter = per_claim_suc[claim]
        print("{} suc/total={}/{}  True/False={}/{}".format(
            claim, suc_counter.get_suc(), suc_counter.get_total(),
            per_claim_counter[claim][1], per_claim_counter[claim][0]
        ))

    print("{} found of {}".format(suc_count.get_suc(), suc_count.get_total()))
    return r
Example #7
def predict_by_reweighter(bm25_module: BM25, claims, top_k,
                          param) -> List[Tuple[str, List[Dict]]]:

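    # BM25 with per-claim query-term reweighting derived from param.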
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    claim_term_weight: Dict[int, Dict[str, float]] = get_claim_term_weighting(
        claims, param)
    nlp = spacy.load("en_core_web_sm")

    def do_stem(t: str) -> str:
        r = bm25_module.tokenizer.stemmer.stem(t)
        return r

    def stem_tokenize(text: str) -> Iterator[str]:
        for t in nlp(text):
            try:
                yield do_stem(t.text)
            except UnicodeDecodeError:
                pass

    def apply_stem(term_weight: Dict[str, float]) -> Dict[str, float]:
        return {do_stem(k): v for k, v in term_weight.items()}

    claim_term_weight: Dict[int, Dict[str, float]] = dict_value_map(
        apply_stem, claim_term_weight)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        qtf = Counter(stem_tokenize(c_text))
        weight = claim_term_weight[int(claim_id)]

        new_qtf = Counter()
        for k, v in qtf.items():
            try:
                w = weight[k]
                new_qtf[k] = w * v
            except KeyError:
                # Term has no weight entry; log it and drop it.
                print("Missing weight for term: {}".format(k))

        tf = Counter(stem_tokenize(p_text))
        score = bm25_module.score_inner(new_qtf, tf)
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
Example #8
def predict_by_oracle_on_candidate(claims,
                                   top_k) -> List[Tuple[str, List[Dict]]]:
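    # Oracle: score 1 if the perspective appears in any gold cluster for
    # the claim, else 0.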
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        gold_pids = gold[int(claim_id)]
        score = 0
        for p_ids in gold_pids:
            if int(p_id) in p_ids:
                score = 1
                break

        return NamedNumber(score, "")

    r = predict_interface(claims, top_k, scorer)
    return r
Example #9
def predict_by_lm(claim_lms: List[ClaimLM],
                  claims,
                  top_k) -> List[Tuple[str, List[Dict]]]:

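    # Score a perspective by the summed log-odds of its tokens under the
    # claim LM versus the background LM.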
    alpha = 0.1
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    tokenizer = PCTokenizer()
    print("Eval log odds")
    claim_log_odds_dict = {str(c_lm.cid): get_log_odd(c_lm, bg_lm, alpha)
                           for c_lm in claim_lms}

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        p_text = perspective_getter(int(p_id))
        tokens = tokenizer.tokenize_stem(p_text)
        c_lm = claim_log_odds_dict[claim_id]
        reason = " ".join(["{0} ({1:.2f})".format(t, c_lm[t]) for t in tokens])
        score = sum([c_lm[t] for t in tokens])
        return NamedNumber(score, reason)

    r = predict_interface(claims, top_k, scorer)
    return r
Example #10
def pc_predict_vector_query_and_reweight(
        bm25_module: BM25, q_tf_replace: Dict[int, Counter], claims, top_k,
        param) -> List[Tuple[str, List[Dict]]]:

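    # Combines the expansion-term query of Example #2 with the per-claim
    # term reweighting of Example #7.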
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    found_claim = set()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)

    def do_stem(t: str) -> str:
        r = bm25_module.tokenizer.stemmer.stem(t)
        return r

    def apply_stem(term_weight: Dict[str, float]) -> Dict[str, float]:
        return {do_stem(k): v for k, v in term_weight.items()}

    claim_term_weight: Dict[int, Dict[str, float]] = get_claim_term_weighting(
        claims, param)
    claim_term_weight: Dict[int, Dict[str, float]] = dict_value_map(
        apply_stem, claim_term_weight)

    nlp = spacy.load("en_core_web_sm")

    def stem_tokenize(text: str) -> Iterator[str]:
        for t in nlp(text):
            try:
                yield do_stem(t.text)
            except UnicodeDecodeError:
                pass

    def get_qtf(claim_id):
        weight = claim_term_weight[claim_id]
        new_qtf = Counter()
        c_text = cid_to_text[int(claim_id)]
        qtf = Counter(stem_tokenize(c_text))
        print(weight)  # debug: this claim's term-weight map
        for k, v in qtf.items():
            # Scale terms that have a weight; keep the rest at their
            # raw count.
            if k in weight:
                new_qtf[k] = weight[k] * v
            else:
                new_qtf[k] = v
        return new_qtf

    c_qtf_d = {k: get_qtf(k) for k in cid_to_text.keys()}

    def scorer(lucene_score, query_id) -> NamedNumber:
        nonlocal found_claim
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        if i_claim_id in q_tf_replace_norm:
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + c_qtf_d[i_claim_id]
            found_claim.add(i_claim_id)
        else:
            qtf = c_qtf_d[i_claim_id]
        p_text = perspective_getter(int(p_id))
        p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
        score = bm25_module.score_inner(qtf, Counter(p_tokens))
        return score

    r = predict_interface(claims, top_k, scorer)
    print("{} of {} found".format(len(found_claim), len(claims)))
    return r