def sanity_check():
    """Compare MAP of the baseline scores against MAP of the new (kdp-averaged) scores.

    Loads DocValueParts2 entries, groups them by query id, builds two ranked
    candidate lists per query (baseline init_score average vs top-k average of
    the new score), and prints the mean average precision of each.
    """
    dvp: List[DocValueParts2] = load()
    candidate_d_raw: List[Tuple[int, List[int]]] = get_eval_candidate_as_pids(
        "train")
    candidate_d = {str(k): lmap(str, v) for k, v in candidate_d_raw}
    # Group by query id
    dvp_qid_grouped: Dict[str, List[DocValueParts2]] = group_by(dvp, get_qid)

    # Hoisted out of the per-candidate loop: the originals were re-defined on
    # every iteration even though they never capture loop state.
    def get_new_score(d: DocValueParts2):
        return d.score

    def get_baseline_score(d: DocValueParts2):
        return d.init_score

    def get_ap(ranked_list):
        # Average precision: precision recorded at each gold position of the
        # score-descending ranking.
        ranked_list.sort(key=lambda x: x[1], reverse=True)
        p_list = []
        p = 0
        for rank, (cid, score, gold) in enumerate(ranked_list):
            if gold:
                p += 1
                p_list.append(p / (rank + 1))
        return average(p_list)

    ap_baseline = []
    ap_new_score = []
    for qid, entries in dvp_qid_grouped.items():
        ranked_list_new = []
        ranked_list_baseline = []
        candidate_id_grouped = group_by(entries, get_candidate)
        for candidate_id, entries2 in candidate_id_grouped.items():
            is_initial_candidate = candidate_id in candidate_d[qid]
            gold = entries2[0].label
            # Skip gold candidates that were never in the initial pool: the
            # baseline had no chance to rank them, so they would bias the MAP.
            skip = gold and not is_initial_candidate
            if skip:
                continue
            new_score = top_k_avg(lmap(get_new_score, entries2))
            baseline_score = average(lmap(get_baseline_score, entries2))
            ranked_list_new.append((candidate_id, new_score, gold))
            ranked_list_baseline.append((candidate_id, baseline_score, gold))
        ap_baseline.append(get_ap(ranked_list_baseline))
        ap_new_score.append(get_ap(ranked_list_new))
    print("MAP baseline", average(ap_baseline))
    print("MAP new score", average(ap_new_score))
def group_by_docs():
    """Tally, per query and per key document passage, how often the new score
    changed the decision for better/worse, and print the table with a global
    summary row on top.
    """
    dvp: List[DocValueParts2] = load()
    candidate_d_raw: List[Tuple[int, List[int]]] = get_eval_candidate_as_pids(
        "train")
    # NOTE(review): candidate_d is loaded but never read in this function;
    # kept for parity with the sibling analyses — confirm before deleting.
    candidate_d = {str(k): lmap(str, v) for k, v in candidate_d_raw}
    # Group by query id
    dvp_qid_grouped: Dict[str, List[DocValueParts2]] = group_by(dvp, get_qid)

    c_all = Counter()
    rows = []
    for qid, entries in dvp_qid_grouped.items():
        # Q : How many kdp are useful?
        # Q : Does relevant matter?
        candidate_id_grouped = group_by(entries, get_doc_id)
        rows.append(["qid", qid])
        for doc_id_idx, entries2 in candidate_id_grouped.items():
            c = Counter(
                get_decision_change(e.label, e.init_score, e.score)
                for e in entries2)
            rows.append([doc_id_idx])
            row = [
                doc_id_idx, c["decision_change_good"],
                c["decision_change_bad"], c["no_change"]
            ]
            rows.append(row)
            # Fold per-document counts into the global tally.
            c_all.update(c)
    row = [
        "summary", c_all["decision_change_good"],
        c_all["decision_change_bad"], c_all["no_change"]
    ]
    rows = [row] + rows
    print_table(rows)
def main():
    """Read a TREC ranked list and print, per claim id, the ranks of true
    perspectives that were missing from the original candidate pool.

    Usage: script <trec_ranked_list_path>
    """
    trec_path = sys.argv[1]
    ranked_list = load_ranked_list(trec_path)
    # Fixed annotation: was Dict[Tuple[int, List[int]]], a malformed
    # single-argument Dict; the value is cid -> candidate pid list.
    candidate_d_raw: Dict[int, List[int]] = dict(
        get_eval_candidate_as_pids("dev"))
    label_d: Dict[int, List[int]] = get_claim_perspective_id_dict2()
    ex_candiate_entry = defaultdict(list)
    for entry in ranked_list:
        cid = int(entry.query_id)
        pid = int(entry.doc_id)
        label = pid in label_d[cid]
        # show entry which are true and not in original candidate
        if label and pid not in candidate_d_raw[cid]:
            ex_candiate_entry[cid].append(entry.rank)
    for cid, ranks in ex_candiate_entry.items():
        print(cid, ranks)
def group_by_cids():
    """Print, per query and per candidate, each key document passage's score
    together with the gold label, the initial prediction, and the direction of
    the score change.
    """
    dvp: List[DocValueParts2] = load()
    candidate_d_raw: List[Tuple[int, List[int]]] = get_eval_candidate_as_pids(
        "train")
    candidate_d = {str(k): lmap(str, v) for k, v in candidate_d_raw}
    # Group by query id
    dvp_qid_grouped: Dict[str, List[DocValueParts2]] = group_by(dvp, get_qid)

    def simple(doc_id):
        # Keep only the trailing segment of a dash-separated document id.
        return doc_id.split("-")[-1]

    rows = []
    for qid, entries in dvp_qid_grouped.items():
        # Q : How many kdp are useful?
        # Q : Does relevant matter?
        candidate_id_grouped = group_by(entries, get_candidate)
        rows.append([qid])
        for candidate_id, entries2 in candidate_id_grouped.items():
            is_initial_candidate = candidate_id in candidate_d[qid]
            # Removed: avg_score was computed here and never used.
            rows.append(['candidate id:', candidate_id])
            rows.append(['is_initial_candidate', is_initial_candidate])
            rows.append([
                "doc_id", "score", "gold", "init_pred", "direction", "decision"
            ])
            for e in entries2:
                s = "{}_{}".format(simple(e.kdp.doc_id), e.kdp.passage_idx)
                row = [
                    s,
                    "{0:.2f}".format(e.score), e.label,
                    to_pred(e.init_score),
                    direction(e.score, e.init_score),
                    to_pred(e.score)
                ]
                rows.append(row)
    print_table(rows)
def avg_scores():
    """Per query, classify each candidate's averaged-score change as
    good/bad/no-change and print per-query counts followed by per-candidate
    detail rows.
    """
    dvp: List[DocValueParts2] = load()
    candidate_d_raw: List[Tuple[int, List[int]]] = get_eval_candidate_as_pids(
        "train")
    candidate_d = {str(k): lmap(str, v) for k, v in candidate_d_raw}
    # Group by query id
    dvp_qid_grouped: Dict[str, List[DocValueParts2]] = group_by(dvp, get_qid)
    rows = []
    for qid, entries in dvp_qid_grouped.items():
        # Q : How many kdp are useful?
        # Q : Does relevant matter?
        candidate_id_grouped = group_by(entries, get_candidate)
        c = Counter()
        new_rows = []
        # Fixed header: the data rows below have five columns, but the old
        # header listed only three.
        new_rows.append(
            ["candidate id", "label", "value_type", "init_score", "avg_score"])
        for candidate_id, entries2 in candidate_id_grouped.items():
            label = entries2[0].label
            avg_score = average(lmap(lambda x: x.score, entries2))
            initial_score = entries2[0].init_score
            change = avg_score - initial_score
            # 0.01 is the minimum change treated as meaningful.
            value_type = good_or_bad(change, label, 0.01)
            c[value_type] += 1
            row = [
                candidate_id, label, value_type,
                four_digit_float(initial_score),
                four_digit_float(avg_score)
            ]
            new_rows.append(row)
        row = [qid, c['good'], c['bad'], c['no change']]
        rows.append(row)
        rows.extend(new_rows)
    print_table(rows)
def functor(cid_to_passage) -> CPPNCGeneratorInterface:
    """Build a 50-perspective PPNC data generator over the train candidate set."""
    per_claim_pids = dict(get_eval_candidate_as_pids("train"))
    return ppnc_datagen_50_perspective.Generator(cid_to_passage, per_claim_pids)
def functor(cid_to_passage) -> CPPNCGeneratorInterface:
    """Build a multi-evidence data generator over the dev candidate set."""
    per_claim_pids = dict(get_eval_candidate_as_pids("dev"))
    return multi_evidence.Generator(cid_to_passage, per_claim_pids, False)
def functor(cid_to_passage) -> CPPNCGeneratorInterface:
    """Build a CPPNC data generator over the train candidate set."""
    per_claim_pids = dict(get_eval_candidate_as_pids("train"))
    return cppnc_datagen.Generator(cid_to_passage, per_claim_pids, False)
def get_eval_candidates_as_qck(split) -> Dict[str, List[QCKCandidate]]:
    """Load the split's (cid, pid-list) candidates and convert them to QCK form."""
    return cid_pid_format_to_qck(get_eval_candidate_as_pids(split))