# Example 1
def save_to_csv():
    """Export elastic-search perspective candidates for the train/dev/test
    splits as tab-separated files with binary gold labels.

    Each output row is (claim text, perspective text, label, cid, pid) where
    label is 1 iff the candidate pid appears in the gold perspective clusters
    for that claim.
    """
    gold = get_claim_perspective_id_dict()

    def routine(claims, out_path):
        # Retrieve top-50 perspective candidates per claim.
        payloads = predict_by_elastic_search(claims, 50)
        head = ['sentence1', 'sentence2', 'gold_label', 'cid', 'pid']
        rows = []
        for cid, data_list in payloads:
            gold_pids = gold[cid]
            # Gold pids come grouped into clusters; flatten once per claim
            # so membership testing is O(1).
            all_pid_set = set(flatten(gold_pids))
            for p_entry in data_list:
                c_text = p_entry['claim_text']
                p_text = p_entry['perspective_text']
                y = 1 if p_entry['pid'] in all_pid_set else 0
                rows.append([c_text, p_text, y, cid, p_entry['pid']])
        # BUGFIX: the original opened the file without ever closing it, so the
        # handle leaked and the final buffer was not guaranteed to flush.
        # newline="" is the documented requirement for csv.writer targets.
        with open(out_path, "w", encoding="utf-8", newline="") as f_out:
            writer = csv.writer(f_out, dialect='excel-tab')
            writer.writerows([head] + rows)

    claims, val = train_split()
    routine(claims, get_file_path('train'))
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('dev'))
    # Re-assignment, not a new variable: drop the duplicate annotation.
    d_ids = list(load_test_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('test'))
# Example 2
def run_reweight():
    """Score the first 50 training claims with the BM25 reweighter and print
    the parameter setting followed by the evaluation result."""
    train_claims, _val = train_split()
    params = {'k1': 1}
    subset = train_claims[:50]
    predictions = predict_by_reweighter(get_bm25_module(), subset, 7, params)
    print(params)
    print(evaluate(predictions))
# Example 3
def run_bm25_2():
    """Evaluate BM25 recall over the top-1000 pre-fetched candidates for the
    training claims (claim text used directly as the query)."""
    k = 1000
    train_claims, _val = train_split()
    candidate_dict: List[Tuple[int, List[int]]] = get_eval_candidates_w_q_text(
        claim_as_query(train_claims), k)
    scored = predict_by_bm25_from_candidate(
        get_bm25_module(), train_claims, candidate_dict, k)
    print(evaluate_recall(scored, True))
# Example 4
def run_a_relevant_lm():
    """Rank with passage-derived claim language models and report MAP on the
    training claims (top 50 per claim)."""
    train_claims, _val = train_split()
    print("Building lms")
    lms = get_train_passage_a_lms()
    print("Predicting")
    ranked = predict_by_lm(lms, train_claims, 50)
    print(evaluate_map(ranked))
# Example 5
def run_gold_lm_ap():
    """Rank with gold-perspective claim language models and report MAP on the
    training claims (top 50 per claim)."""
    train_claims, _val = train_split()
    print("Building lms")
    lms = build_gold_claim_lm_train()
    print("Predicting")
    ranked = predict_by_lm(lms, train_claims, 50)
    print(evaluate_map(ranked))
# Example 6
def run_bert_baseline():
    """Evaluate pickled BERT baseline scores on the validation claims that
    have scores available (top 50 per claim)."""
    _train, val = train_split()
    targets = filter_avail(val)
    print("targets", len(targets))
    score_dict = load_from_pickle("pc_bert_baseline_score_d_train")
    ranked = predict_from_dict(score_dict, targets, 50)
    print(evaluate(ranked))
# Example 7
def run_gold_lm():
    """Rank with gold-perspective claim language models (top 5 per claim) and
    print the evaluation result."""
    train_claims, _val = train_split()
    print("Building lms")
    lms: List[ClaimLM] = build_gold_claim_lm_train()
    print("Predicting")
    ranked = predict_by_lm(lms, train_claims, 5)
    print(evaluate(ranked))
# Example 8
def run_rel_scorer():
    """Evaluate pickled relevance-based scores on the validation claims that
    have scores available (top 6 per claim)."""
    _train, val = train_split()
    targets = filter_avail(val)
    print("targets", len(targets))
    score_dict = load_from_pickle("pc_rel_based_score_train")
    ranked = predict_from_dict(score_dict, targets, 6)
    print(evaluate(ranked))
# Example 9
def run_bm25_ex():
    """Evaluate BM25 recall over candidates retrieved with expanded query
    text for the training claims (top 100 per claim)."""
    train_claims, _val = train_split()
    expanded = get_expanded_query_text(train_claims, "train")
    candidates = get_eval_candidates_l(expanded)
    scored = predict_by_bm25_from_candidate(
        get_bm25_module(), train_claims, candidates, 100)
    print(evaluate_recall(scored, True))
# Example 10
def run_baseline_lm():
    """Build baseline language models on the validation claims and report MAP
    on that same split (top 50 per claim)."""
    _train, val = train_split()
    # Predictions are made over the validation split.
    claims = val
    print("Building lms")
    lms = build_baseline_lms(claims)
    print("Predicting")
    ranked = predict_by_lm(lms, claims, 50)
    print(evaluate_map(ranked))
# Example 11
def generate_classification_payload():
    """Fetch top-50 elastic-search candidates for the train and dev splits and
    pickle the raw predictions as classification inputs.

    Side effects: writes the pickles "perspective_cls_train_X" and
    "perspective_cls_dev_X".
    """
    # Hoisted: the original re-assigned top_k = 50 a second time for dev.
    top_k = 50
    claims, val = train_split()
    pred = predict_by_elastic_search(claims, top_k)
    save_to_pickle(pred, "perspective_cls_train_X")
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    pred = predict_by_elastic_search(claims, top_k)
    save_to_pickle(pred, "perspective_cls_dev_X")
# Example 12
def run_para_scorer():
    """Evaluate paragraph-scorer predictions on the validation claims that
    have scores available (top 6 per claim)."""
    _train, val = train_split()
    targets = filter_avail(val)
    print("targets", len(targets))
    pred_file: FileName = FileName("pc_para_D_pred")
    resolute_file: FileName = FileName("resolute_dict_580_606")
    ranked = predict_by_para_scorer(pred_file, resolute_file, targets, 6)
    print(evaluate(ranked))
# Example 13
def build_df():
    """Compute document frequency over stemmed gold-perspective texts for the
    training claims and pickle the Counter as "pc_df".

    Also prints the per-perspective token lengths, the average length, the
    number of training claims, and the 30 most common terms.
    """
    train_claims, _val = train_split()
    gold = get_claim_perspective_id_dict()

    tokenizer = PCTokenizer()
    df = Counter()

    doc_lengths = []
    for claim in train_claims:
        cid = claim["cId"]
        pids = flatten(gold[cid])
        texts: List[str] = lmap(perspective_getter, pids)
        token_lists = lmap(tokenizer.tokenize_stem, texts)
        doc_lengths.extend(lmap(len, token_lists))

        # A term counts once per claim, over the union of its perspectives.
        for term in set(flatten(token_lists)):
            df[term] += 1

    print(doc_lengths)
    print("Avdl", average(doc_lengths))
    print(len(train_claims))
    print(df.most_common(30))
    save_to_pickle(df, "pc_df")
# Example 14
def build_gold_claim_lm_train() -> List[ClaimLM]:
    """Build per-claim language models from the gold perspectives of the
    training split (term frequencies per claim)."""
    train_claims, _val = train_split()
    return build_gold_lms(train_claims)
# Example 15
def run_bm25():
    """Run BM25 ranking on the training claims (top 20) and inspect the
    resulting predictions."""
    train_claims, _val = train_split()
    ranked = predict_by_bm25(get_bm25_module(), train_claims, 20)
    inspect(ranked)
# Example 16
def run_bm25():
    """Run BM25 ranking on the training claims (top 20) and print the
    evaluation result."""
    train_claims, _val = train_split()
    ranked = predict_by_bm25(get_bm25_module(), train_claims, 20)
    print(evaluate(ranked))
# Example 17
def run_oracle_on_candiate_map():
    """Report the oracle MAP ceiling over the top-50 candidate set for the
    training claims.

    NOTE(review): "candiate" typo in the name is kept intentionally —
    renaming would break external callers.
    """
    train_claims, _val = train_split()
    ranked = predict_by_oracle_on_candidate(train_claims, 50)
    print(evaluate_map(ranked))
# Example 18
def run_baseline():
    """Evaluate the plain elastic-search baseline on the training claims
    (top 50 per claim)."""
    train_claims, _val = train_split()
    ranked = predict_by_elastic_search(train_claims, 50)
    print(evaluate(ranked))