def save_to_csv():
    """Dump claim/perspective candidate pairs for train/dev/test as tab-separated files.

    For each split, retrieves the top-50 perspective candidates per claim via
    elastic search and labels each pair 1 if the candidate pid appears in any
    gold perspective cluster for that claim, else 0.
    """
    gold = get_claim_perspective_id_dict()

    def routine(claims, out_path):
        payloads = predict_by_elastic_search(claims, 50)
        head = ['sentence1', 'sentence2', 'gold_label', 'cid', 'pid']
        rows = []
        for cid, data_list in payloads:
            gold_pids = gold[cid]
            # gold[cid] holds clusters of pids; flatten to a single membership set.
            all_pid_set = set(flatten(gold_pids))
            for p_entry in data_list:
                c_text = p_entry['claim_text']
                p_text = p_entry['perspective_text']
                y = 1 if p_entry['pid'] in all_pid_set else 0
                rows.append([c_text, p_text, y, cid, p_entry['pid']])
        # Fix: the original leaked the file handle (open() with no close) and
        # omitted newline='', which the csv module requires to avoid spurious
        # blank rows on Windows.
        with open(out_path, "w", encoding="utf-8", newline='') as f:
            writer = csv.writer(f, dialect='excel-tab')
            writer.writerows([head] + rows)

    claims, val = train_split()
    routine(claims, get_file_path('train'))

    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('dev'))

    d_ids = list(load_test_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('test'))
def run_reweight():
    """Run the BM25 reweighter on the first 50 training claims and print the evaluation."""
    claims, _val = train_split()
    top_k = 7
    param = {'k1': 1}
    subset = claims[:50]
    pred = predict_by_reweighter(get_bm25_module(), subset, top_k, param)
    print(param)
    print(evaluate(pred))
def run_bm25_2():
    """BM25 ranking over pre-fetched top-1000 candidates; prints recall."""
    claims, _ = train_split()
    top_k = 1000
    # NOTE(review): the annotation List[Tuple[int, List[int]]] may not match what
    # get_eval_candidates_w_q_text actually returns — confirm against the helper.
    candidate_dict: List[Tuple[int, List[int]]] = get_eval_candidates_w_q_text(
        claim_as_query(claims), top_k)
    pred = predict_by_bm25_from_candidate(
        get_bm25_module(), claims, candidate_dict, top_k)
    print(evaluate_recall(pred, True))
def run_a_relevant_lm():
    """Rank perspectives for training claims with passage-derived claim LMs; prints MAP."""
    claims, _ = train_split()
    top_k = 50
    print("Building lms")
    claim_lms = get_train_passage_a_lms()
    print("Predicting")
    print(evaluate_map(predict_by_lm(claim_lms, claims, top_k)))
def run_gold_lm_ap():
    """Rank perspectives with gold-claim language models; prints MAP."""
    claims, _ = train_split()
    top_k = 50
    print("Building lms")
    lms = build_gold_claim_lm_train()
    print("Predicting")
    print(evaluate_map(predict_by_lm(lms, claims, top_k)))
def run_bert_baseline():
    """Evaluate pre-computed BERT baseline scores on the validation claims."""
    _claims, val = train_split()
    top_k = 50
    target = filter_avail(val)
    print("targets", len(target))
    score_dict = load_from_pickle("pc_bert_baseline_score_d_train")
    print(evaluate(predict_from_dict(score_dict, target, top_k)))
def run_gold_lm():
    """Top-5 prediction with gold-claim language models; prints evaluation."""
    claims, _ = train_split()
    top_k = 5
    print("Building lms")
    claim_lms: List[ClaimLM] = build_gold_claim_lm_train()
    print("Predicting")
    print(evaluate(predict_by_lm(claim_lms, claims, top_k)))
def run_rel_scorer():
    """Evaluate cached relevance-based scores on the available validation claims."""
    _, val = train_split()
    top_k = 6
    target = filter_avail(val)
    print("targets", len(target))
    scores = load_from_pickle("pc_rel_based_score_train")
    print(evaluate(predict_from_dict(scores, target, top_k)))
def run_bm25_ex():
    """BM25 over expanded-query candidates for the training split; prints recall."""
    claims, _ = train_split()
    top_k = 100
    candidates = get_eval_candidates_l(
        get_expanded_query_text(claims, "train"))
    pred = predict_by_bm25_from_candidate(
        get_bm25_module(), claims, candidates, top_k)
    print(evaluate_recall(pred, True))
def run_baseline_lm():
    """Baseline LM prediction, run on the validation portion of the split; prints MAP."""
    _train_claims, val = train_split()
    claims = val  # deliberately rebinds to the validation claims
    top_k = 50
    print("Building lms")
    lms = build_baseline_lms(claims)
    print("Predicting")
    print(evaluate_map(predict_by_lm(lms, claims, top_k)))
def generate_classification_payload():
    """Fetch elastic-search candidate payloads for train and dev claims and pickle them."""
    top_k = 50

    train_claims, _ = train_split()
    save_to_pickle(
        predict_by_elastic_search(train_claims, top_k),
        "perspective_cls_train_X")

    d_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(d_ids)
    save_to_pickle(
        predict_by_elastic_search(dev_claims, top_k),
        "perspective_cls_dev_X")
def run_para_scorer():
    """Score available validation claims from paragraph-scorer prediction files."""
    _, val = train_split()
    top_k = 6
    target = filter_avail(val)
    print("targets", len(target))
    score_pred_file: FileName = FileName("pc_para_D_pred")
    cpid_resolute_file: FileName = FileName("resolute_dict_580_606")
    pred = predict_by_para_scorer(
        score_pred_file, cpid_resolute_file, target, top_k)
    print(evaluate(pred))
def build_df():
    """Compute document frequency over gold perspective texts of training claims.

    Also prints the per-perspective token lengths, their average, the claim
    count, and the 30 most common terms, then pickles the df Counter.
    """
    claims, _ = train_split()
    gold = get_claim_perspective_id_dict()
    tokenizer = PCTokenizer()
    df = Counter()
    dl_list = []
    for claim in claims:
        cid = claim["cId"]
        gold_pids = flatten(gold[cid])
        p_text_list: List[str] = lmap(perspective_getter, gold_pids)
        tokens_list = lmap(tokenizer.tokenize_stem, p_text_list)
        dl_list.extend(lmap(len, tokens_list))
        # Each distinct term counts once per claim (document frequency).
        df.update(set(flatten(tokens_list)))
    print(dl_list)
    print("Avdl", average(dl_list))
    print(len(claims))
    print(df.most_common(30))
    save_to_pickle(df, "pc_df")
def build_gold_claim_lm_train() -> List[ClaimLM]:
    """Build per-claim language models from the gold perspectives of the training claims."""
    claims, _ = train_split()
    return build_gold_lms(claims)
def run_bm25():
    """Top-20 BM25 prediction on training claims, passed to inspect().

    NOTE(review): a second `run_bm25` defined later in this file redefines the
    name, so this variant is shadowed (dead) at import time — consider renaming
    it (e.g. run_bm25_inspect).
    """
    claims, _ = train_split()
    top_k = 20
    inspect(predict_by_bm25(get_bm25_module(), claims, top_k))
def run_bm25():
    """Top-20 BM25 prediction on training claims; prints the evaluation."""
    claims, _ = train_split()
    top_k = 20
    print(evaluate(predict_by_bm25(get_bm25_module(), claims, top_k)))
def run_oracle_on_candiate_map():
    """Oracle upper bound over the candidate set, reported as MAP.

    NOTE(review): name has a typo ("candiate"); kept to preserve the interface.
    """
    claims, _ = train_split()
    top_k = 50
    print(evaluate_map(predict_by_oracle_on_candidate(claims, top_k)))
def run_baseline():
    """Elastic-search retrieval baseline on training claims; prints the evaluation."""
    claims, _ = train_split()
    top_k = 50
    print(evaluate(predict_by_elastic_search(claims, top_k)))