def main():
    """Print a per-query error report for a ranked evidence list.

    Reads a grouped ranked list from the path in sys.argv[1] and, for each
    query ("<claim_id>_<perspective_id>"), computes precision at num_rel
    (num_rel = number of gold evidences found in the list).  Queries that are
    perfect, have no gold, or whose gold never appears are summarized in one
    line; the rest get a detailed dump of gold vs. false-positive entries.
    """
    claim_text_d: Dict[int, str] = get_all_claim_d()
    # Re-key claims by str so they can be looked up with the string ids
    # parsed out of the query name below.
    claim_text_d: Dict[str, str] = dict_key_map(str, claim_text_d)
    evi_dict: Dict[str, str] = dict_key_map(str, load_evidence_dict())
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    print("V2")

    def print_entry(entry):
        # One line per ranked entry: rank, evidence id, evidence text.
        evidence_text = evi_dict[entry.doc_id]
        print("[{}] {}: {}".format(entry.rank, entry.doc_id, evidence_text))

    ranked_list_dict = load_ranked_list_grouped(sys.argv[1])
    for query, ranked_list in ranked_list_dict.items():
        print()
        # Query ids have the form "<claim_id>_<perspective_id>".
        claim_id, perspective_id = query.split("_")
        gold_ids: List[str] = lmap(str, evi_gold_dict[query])
        if not gold_ids:
            print("query {} has no gold".format(query))
            continue
        assert gold_ids
        claim_text = claim_text_d[claim_id]
        perspective_text = perspective_getter(int(perspective_id))
        pos_entries = []
        neg_entries = []
        for entry in ranked_list:
            label = entry.doc_id in gold_ids
            if label:
                pos_entries.append(entry)
            elif entry.rank < 3:
                # Only keep the top-ranked non-gold entries as false positives
                # worth showing in the report.
                neg_entries.append(entry)
        if not pos_entries:
            print("gold not in ranked list")
            continue
        # Precision at num_rel: fraction of the top-num_rel ranked entries
        # that are gold (num_rel = number of gold entries found).
        num_rel = len(pos_entries)
        correctness = []
        for entry in ranked_list[:num_rel]:
            label = entry.doc_id in gold_ids
            correctness.append(int(label))
        precision = average(correctness)
        if precision > 0.99:
            # Perfect ranking for this query; no detail needed.
            print("Good")
            continue
        print("precision at {}: {}".format(num_rel, precision))
        print("Claim: ", claim_text)
        print("perspective_text: ", perspective_text)
        print(" < GOLD >")
        foreach(print_entry, pos_entries)
        print(" < False Positive >")
        foreach(print_entry, neg_entries)
def main():
    """Interactive gold-evidence browser.

    Reads lines of the form "<claim_id>_<perspective_id>" from stdin and
    prints the claim text, the perspective text, and every gold evidence
    text recorded for that (claim, perspective) pair.

    Fix: the original unguarded `input()` loop died with an EOFError
    traceback when stdin closed (and with ValueError on a line without an
    underscore); both are now handled gracefully.
    """
    claim_text_d: Dict[int, str] = get_all_claim_d()
    evidence_d = load_evidence_dict()
    evidence_gold = evidence_gold_dict()
    while True:
        try:
            s = input()
        except (EOFError, KeyboardInterrupt):
            # Exit cleanly when stdin is exhausted or the user interrupts.
            break
        try:
            cid, pid = s.split("_")
            cid = int(cid)
            pid = int(pid)
        except ValueError:
            # Malformed query line: report and keep the session alive.
            print("Expected <claim_id>_<perspective_id>, got: {!r}".format(s))
            continue
        print("Claim: ", claim_text_d[cid])
        print("Perspective: ", perspective_getter(pid))
        # Gold evidence is keyed by the (claim_id, perspective_id) tuple.
        key = cid, pid
        e_ids = evidence_gold[key]
        for eid in e_ids:
            print("Evidence: ", evidence_d[eid])
def get_query_lms(split) -> Dict[str, Counter]:
    """Build a unigram language model per query from its gold evidence.

    For every QCK query in `split`, collects the texts of its gold evidence
    ids and collapses them into a single term-count LM.  Returns a mapping
    from query id to that Counter.
    """
    evidence_texts: Dict[int, str] = load_evidence_dict()
    tokenizer = PCTokenizer()
    queries = get_qck_queries(split)
    gold_ids_by_qid: Dict[str, List[int]] = evidence_gold_dict_str_qid()

    def lm_for(query: QCKQuery) -> Counter:
        # Fetch this query's gold evidence texts and fold them into one LM.
        texts = [evidence_texts[eid] for eid in gold_ids_by_qid[query.query_id]]
        return text_list_to_lm(tokenizer, texts)

    return {query.get_id(): lm_for(query) for query in queries}
def get_ex_candidate_for_training(split,
                                  balanced=True,
                                  cached=False
                                  ) -> Dict[str, List[QCKCandidateI]]:
    """Build per-query training candidates (gold positives + ranked negatives).

    For each QCK query in `split`: take its gold evidence ids as positives,
    sample non-gold ids from the BOW-ranked candidate list as negatives
    (capped at len(gold) when `balanced`), then split each evidence text into
    max-length-bounded passages and wrap them as QCKCandidateWToken objects.

    `cached=True` loads a precomputed top-rank candidate list instead of
    re-running the candidate retrieval.
    """
    if cached:
        bow_ranked = load_top_rank_candidate(split)
    else:
        bow_ranked = get_candidate(split)
    tokenizer = get_tokenizer()
    evi_dict: Dict[int, str] = load_evidence_dict()
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    queries = get_qck_queries(split)
    # BERT-style budget: 512 total minus 3 special tokens minus the query.
    max_seq_length = 512
    out_d = {}
    for query in queries:
        qid = query.query_id
        c_list = bow_ranked[qid]
        gold_e_ids: List[int] = evi_gold_dict[qid]
        top_ranked: List[int] = lmap(int, map(QCKCandidate.get_id, c_list))
        query_len = len(tokenizer.tokenize(query.text))
        candidate_max_len = max_seq_length - 3 - query_len
        # Collect negatives: candidate ids that are not gold.
        # NOTE(review): iterating set(top_ranked) discards rank order and is
        # nondeterministic across runs, so "balanced" sampling does not
        # necessarily pick the top-ranked negatives — confirm this is intended.
        neg_e_ids = []
        for e_id in set(top_ranked):
            if e_id not in gold_e_ids:
                neg_e_ids.append(e_id)
                if balanced and len(neg_e_ids) == len(gold_e_ids):
                    break

        def make_candidate(e_id: int) -> Iterable[QCKCandidate]:
            # Split the evidence text into passages that fit the length budget;
            # each passage becomes one candidate carrying the evidence id.
            text = evi_dict[e_id]
            tokens = tokenizer.tokenize(text)
            for passage in enum_passage(tokens, candidate_max_len):
                yield QCKCandidateWToken(str(e_id), "", passage)

        new_list = lflatten(map(make_candidate, gold_e_ids + neg_e_ids))
        out_d[qid] = new_list
    return out_d
def insert():
    """Index every evidence entry into Elasticsearch.

    Loads the full evidence dictionary and writes one document per entry
    (fields: 'id', 'text') into the module-level `index_name` index via the
    module-level `es` client.
    """
    for evidence_id, evidence_text in load_evidence_dict().items():
        doc = {'id': evidence_id, 'text': evidence_text}
        es.index(index=index_name, body=doc)