Example 1
def get_bm25_module():
    df = load_from_pickle("mmd_df_10")
    avdl_raw = 1350
    avdl_passage = 40
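    # k1 controls term-frequency saturation; a value well below the common 1.2
    # default means repeated terms add little, which suits short passages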
    # k_dtf_saturation = 1.2
    k_dtf_saturation = 0.75
    return BM25(df, avdl=avdl_passage, num_doc=321384, k1=k_dtf_saturation, k2=100, b=0.75)
Example 2
def get_bm25_module(split):
    pickle_name = "argu_{}_df".format(split)
    df = load_from_pickle(pickle_name)
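    # per-split example counts; the collection size passed to BM25 is N * 2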
    N = {
        'training': 8148,
        'validation': 4074,
        'test': 4074,
    }[split]
    return BM25(df, avdl=160, num_doc=N * 2, k1=0.1, k2=100, b=0.9)
Example 3
def predict_see_candidate(bm25_module: BM25, claims, top_k):
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    output = []
    for claim in claims:
        cid = claim['cId']
        claim_text = claim['text']
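        # retrieve the top-50 candidate perspectives for this claim from the
        # Elasticsearch pool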
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_pids.append(_pid)

        p_text = lmap(perspective_getter, candidate_pids)

        p_tokens = lmap(bm25_module.tokenizer.tokenize_stem, p_text)

        acc_counter = Counter()
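        # pseudo-relevance-feedback style expansion: accumulate length-normalized
        # term weights from the top-30 retrieved perspectives and add them to the
        # claim's own query term frequencies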
        for tokens in p_tokens[:30]:
            for t in tokens:
                acc_counter[t] += 1 / len(tokens)
        c = normalize_counter(acc_counter)
        c_tokens = bm25_module.tokenizer.tokenize_stem(claim_text)
        qtf = Counter(c_tokens)
        qtf = c + qtf

        ranked_list = []
        for pid in candidate_pids:
            p_tokens = bm25_module.tokenizer.tokenize_stem(
                perspective_getter(pid))
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            ranked_list.append((pid, score))

        ranked_list.sort(key=lambda x: x[1], reverse=True)
        prediction_list = []

        for pid, score in ranked_list[:top_k]:
            p_entry = {
                'cid': cid,
                'pid': pid,
                'claim_text': claim_text,
                'perspective_text': perspective_getter(pid),
                'rationale': score.name,
                'score': score,
            }
            prediction_list.append(p_entry)
        output.append((cid, prediction_list))

    return output
Example 4
def get_max_values(queries: Dict[str, str]) -> Dict[str, float]:
    tf, df = load_clueweb12_B13_termstat_stemmed()
    stemmer = Stemmer()
    avdl = 500
    # cdf: collection document count (rough ClueWeb12-B13 figure; adjust for the actual corpus)
    cdf = 52 * 1000 * 1000
    bm25_module = BM25(df, avdl=avdl, num_doc=cdf)
    score_d = {}
    for qid, query_text in queries.items():
        q_terms = extract_terms_from_structured_query(query_text)
        q_terms_stemmed: List[str] = lmap(stemmer.stem, q_terms)
        q_tf = Counter(q_terms_stemmed)
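        # scoring the query against itself yields the maximum BM25 score
        # attainable for that query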
        d_tf = q_tf
        score = bm25_module.score_inner(q_tf, d_tf)
        score_d[qid] = score
    return score_d
Example 5
from arg.bm25 import BM25
from cache import load_from_pickle
from data_generator.job_runner import JobRunner
from epath import job_man_dir
from tab_print import print_table
from tlm.data_gen.msmarco_doc_gen.gen_worker import MMDWorker, PointwiseGen
from tlm.data_gen.msmarco_doc_gen.max_sent_encode import SegScorer, PassageScoreTuner, get_mrr_from_ranks
from tlm.data_gen.msmarco_doc_gen.processed_resource import ProcessedResource, ProcessedResource10docMulti

if __name__ == "__main__":
    split = "train"
    resource = ProcessedResource10docMulti(split)
    max_seq_length = 512
    job_id = 0
    df = load_from_pickle("mmd_df_10")
    avdl_raw = 1350
    avdl_passage = 40

    rows = []
    k1 = 0.1
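    # sweep avdl (average document length) and report MRR for each setting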
    for avdl in [10, 40, 100, 200]:
        bm25 = BM25(df, avdl=avdl, num_doc=321384, k1=k1, k2=100, b=0.75)
        scorer = SegScorer(bm25, max_seq_length)
        qids = resource.query_group[job_id]
        tuner = PassageScoreTuner(resource, scorer)
        row = [avdl, tuner.get_mrr(qids)]
        rows.append(row)

    print_table(rows)
Example 6
def pc_predict_to_inspect(bm25_module: BM25, q_tf_replace: Dict[int, Counter],
                          q_tf_replace_0: Dict[int, Counter], claims, top_k):
    gold = get_claim_perspective_id_dict()
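    # normalize the expansion term-frequency Counters (keyed by claim id) so they
    # can be mixed with the raw claim query term frequencies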
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)
    q_tf_replace_0_norm = dict_value_map(normalize_counter, q_tf_replace_0)

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def counter_to_str(c: Dict) -> str:
        s = ""
        for k, v in c.items():
            s += "{0} {1:.2f}".format(k, v) + "\t"
        return s

    for claim in claims:
        cid = claim['cId']
        i_claim_id = int(cid)
        claim_text = claim['text']
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_pids.append(_pid)

        if i_claim_id in q_tf_replace_norm:
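            # mix the top-50 expansion terms into the claim's own query term frequencies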
            claim_qtf = Counter(
                dict_value_map(lambda x: x * 1, c_qtf_d[i_claim_id]))
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
        else:
            qtf = c_qtf_d[i_claim_id]

        ranked_list = []
        for pid in candidate_pids:
            p_text = perspective_getter(int(pid))
            p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            debug_str = ""

            e = score, pid, p_text, debug_str
            ranked_list.append(e)

        gold_pids = gold[cid]

        def is_correct(pid):
            for pids in gold_pids:
                if pid in pids:
                    return True
            return False

        ranked_list.sort(key=lambda x: x[0], reverse=True)

        qtf_idf_applied = {
            k: v * bm25_module.term_idf_factor(k)
            for k, v in qtf.items()
        }
        print()
        print("Claim: ", cid, claim_text)
        for cluster in gold_pids:
            print("-")
            for pid in cluster:
                print(pid, perspective_getter(pid))
        print()
        print("qtf:", counter_to_str(qtf))
        if i_claim_id in q_tf_replace_norm and i_claim_id in q_tf_replace_0_norm:
            print("ex_qtf:", counter_to_str(ex_qtf))
            ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
            ex_qtf_0 = Counter(dict(ex_qtf_0.most_common(50)))
            print("ex_qtf_0:", counter_to_str(ex_qtf_0))
        print("qtf idf apllied:", counter_to_str(qtf_idf_applied))

        for score, pid, p_text, debug_str in ranked_list[:top_k]:

            if i_claim_id in q_tf_replace_0_norm:
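                # recompute the score with the alternative expansion (q_tf_replace_0)
                # so both variants can be compared in the printout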
                p_text = perspective_getter(int(pid))
                p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
                ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
                qtf = ex_qtf_0 + c_qtf_d[i_claim_id]
                score2 = bm25_module.score_inner(qtf, Counter(p_tokens))
                correct_str = "Y" if is_correct(pid) else "N"
                print("{0} {1:.2f} ({2:.2f}) {3} / {4} / {5}".format(
                    correct_str, score, score2, p_text, score.name,
                    score2.name))
Example 7
def get_bm25_module():
    df = load_from_pickle("mmd_df_100")
    return BM25(df, avdl=1350, num_doc=321384, k1=1.2, k2=100, b=0.75)
Example 8
def modify(bm25_module: BM25, k1, k2, b):
    bm25_module.k1 = k1
    bm25_module.k2 = k2
    bm25_module.b = b
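modify overwrites the three BM25 hyperparameters on an existing instance in place. A minimal usage sketch, assuming one of the get_bm25_module helpers from the examples above is available:

bm25_module = get_bm25_module()              # e.g. the passage-level module from Example 1
modify(bm25_module, k1=1.2, k2=100, b=0.75)  # switch to the conventional defaults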
Example 9
def get_bm25_module_no_idf():
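    # empty df table: every term receives the same IDF weight, and the near-zero
    # k1 makes the term-frequency component effectively binary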
    df = Counter()
    return BM25(df, avdl=11.7, num_doc=541 + 400, k1=0.00001, k2=100, b=0.5)
Example 10
def get_bm25_module():
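    # same near-zero k1 as the no-IDF variant above, but with document frequencies
    # loaded from the "pc_df" pickle, so IDF weighting is applied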
    df = load_from_pickle("pc_df")
    return BM25(df, avdl=11.7, num_doc=541 + 400, k1=0.00001, k2=100, b=0.5)