def pc_predict_by_bert_next_sent(bm25_module: BM25, claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Rank candidate perspectives by combining BM25 with a BERT next-sentence score.

    The BERT score is obtained from a remote XML-RPC prediction service; each
    (claim, perspective) text pair is encoded and sent in its own request.

    :param bm25_module: BM25 scorer providing ``score(c_text, p_text)``.
    :param claims: claim structures accepted by ``claims_to_dict`` and
        ``predict_interface``.
    :param top_k: number of candidates to keep per claim.
    :return: ranked (claim, perspectives) output of ``predict_interface``.
    """
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    port = 8123
    # Example usage :
    # Remote BERT next-sentence-prediction server (one RPC per candidate pair).
    proxy = xmlrpc.client.ServerProxy(
        'http://ingham.cs.umass.edu:{}'.format(port))
    voca_path = pjoin(data_path, "bert_voca.txt")
    # 512 is presumably the encoder's max sequence length — TODO confirm.
    encoder = EncoderUnitPlain(512, voca_path)

    def scorer(lucene_score, query_id) -> NamedNumber:
        # query_id has the form "<claim_id>_<perspective_id>".
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        payload = []
        p_text = perspective_getter(int(p_id))
        c_text = cid_to_text[i_claim_id]
        payload.append(encoder.encode_pair(c_text, p_text))
        # Blocking round-trip to the prediction server for this single pair.
        r = proxy.predict(payload)
        # NOTE(review): the model output is negated before being added below
        # (scaled by 10); the sign convention cannot be confirmed from here.
        ns_score = -float(r[0])
        #ns_score = 0
        score = bm25_module.score(c_text, p_text)
        new_score = score + ns_score * 10
        # Keep BM25's explanation string and append the NSP contribution.
        score = NamedNumber(new_score, score.name + " {}".format(ns_score))
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
def pc_predict_from_vector_query(bm25_module: BM25,
                                 q_tf_replace: Dict[int, Counter],
                                 claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Rank perspectives with BM25, augmenting selected claims' query term
    frequencies with externally supplied (normalized) expansion vectors.

    For claims present in ``q_tf_replace``, the 50 strongest expansion terms
    are added on top of the claim's own stemmed term frequencies; other
    claims fall back to their plain term frequencies.

    Fix: removed a dead no-op rescaling (``dict_value_map(lambda x: x * 1, …)``,
    a tuning leftover) and an unnecessary ``nonlocal`` declaration.

    :param bm25_module: BM25 scorer with a stemming tokenizer.
    :param q_tf_replace: claim id -> expansion term Counter.
    :param claims: claim structures accepted by ``claims_to_dict``.
    :param top_k: number of candidates to keep per claim.
    :return: ranked (claim, perspectives) output of ``predict_interface``.
    """
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    found_claim = set()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)
    # Precompute each claim's stemmed term frequencies once.
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        if i_claim_id in q_tf_replace_norm:
            # Top-50 expansion terms plus the claim's own term frequencies.
            claim_qtf = Counter(c_qtf_d[i_claim_id])
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
            found_claim.add(i_claim_id)
        else:
            qtf = c_qtf_d[i_claim_id]
        p_text = perspective_getter(int(p_id))
        p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
        score = bm25_module.score_inner(qtf, Counter(p_tokens))
        return score

    r = predict_interface(claims, top_k, scorer)
    print("{} of {} found".format(len(found_claim), len(claims)))
    return r
def predict_from_dict(score_d: Dict[CPID, float], claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Rank perspectives using precomputed classifier scores from ``score_d``.

    Tracks, per claim and globally, how many query ids were found in
    ``score_d`` and how many scores fell in the confident-positive (>0.8) /
    confident-negative (<0.3) bands, printing a summary at the end.

    :param score_d: query_id (CPID) -> classifier score; missing ids score 0.
    :param claims: claim structures accepted by ``predict_interface``.
    :param top_k: number of candidates to keep per claim.
    :return: ranked (claim, perspectives) output of ``predict_interface``.
    """
    suc_count = SuccessCounter()
    suc_count.reset()
    per_claim_suc = {}      # claim_id -> SuccessCounter (found vs. missing)
    per_claim_counter = {}  # claim_id -> Counter({1: confident-pos, 0: confident-neg})
    rationale_d = {}        # query_id -> human-readable score explanation

    def scorer(lucene_score, query_id):
        claim_id, p_id = query_id.split("_")
        if claim_id not in per_claim_suc:
            per_claim_counter[claim_id] = Counter()
            per_claim_suc[claim_id] = SuccessCounter()
        cls_score = get_score_by_d(claim_id, query_id)
        # NOTE(review): the next assignment is dead — it is immediately
        # overwritten by ``score = cls_score`` below (experiment leftovers).
        score = (cls_score < 4) * -1 + lucene_score / 20
        #score = cls_score + lucene_score / 20
        score = cls_score
        # NOTE(review): rationale text still mentions lucene_score/20 even
        # though the final score is cls_score alone — kept as-is.
        r = "score={0:.2f} <- cls_score({1:.2f}) lucene_score({2:.2f}) /20".format(
            score, cls_score, lucene_score)
        rationale_d[query_id] = r
        return score

    def get_score_by_d(claim_id, query_id):
        # Classifier score if available, else 0; updates found/missing and
        # confidence-band statistics as a side effect.
        if query_id in score_d:
            cls_score = score_d[query_id]
            per_claim_suc[claim_id].suc()
            if cls_score > 0.8:
                per_claim_counter[claim_id][1] += 1
            elif cls_score < 0.3:
                per_claim_counter[claim_id][0] += 1
            suc_count.suc()
        else:
            cls_score = 0
            per_claim_suc[claim_id].fail()
            suc_count.fail()
        return cls_score

    def get_rationale(query_id):
        # Explanation string for predict_interface; "(N/A)" if never scored.
        if query_id in rationale_d:
            return rationale_d[query_id]
        else:
            return "(N/A)"

    r = predict_interface(claims, top_k, scorer, get_rationale)
    for claim in per_claim_suc:
        suc_counter = per_claim_suc[claim]
        print("{} suc/total={}/{} True/False={}/{}".format(
            claim, suc_counter.get_suc(), suc_counter.get_total(),
            per_claim_counter[claim][1], per_claim_counter[claim][0]))
    print("{} found of {}".format(suc_count.get_suc(), suc_count.get_total()))
    return r
def predict_by_bm25(bm25_module, claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Rank candidate perspectives for each claim by plain BM25 text match."""
    claim_text_d: Dict[int, str] = claims_to_dict(claims)

    def scorer(lucene_score, query_id) -> NamedNumber:
        # query_id has the form "<claim_id>_<perspective_id>".
        cid_str, pid_str = query_id.split("_")
        claim_text = claim_text_d[int(cid_str)]
        persp_text = perspective_getter(int(pid_str))
        return bm25_module.score(claim_text, persp_text)

    return predict_interface(claims, top_k, scorer)
def predict_by_bm25_rm(bm25_module: BM25,
                       rm_info: Dict[str, List[Tuple[str, str]]],
                       claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Rank perspectives by BM25, adding a relevance-model expansion score.

    ``rm_info`` maps claims to (term, score) lists; scores are parsed,
    normalized, and merged by stem before being used as an expansion query
    against the perspective text.

    :param bm25_module: BM25 scorer providing ``score`` and ``score_inner``.
    :param rm_info: claim key -> list of (term, score-as-string) pairs.
    :param claims: claim structures accepted by ``claims_to_dict``.
    :param top_k: number of candidates to keep per claim.
    :return: ranked (claim, perspectives) output of ``predict_interface``.
    """
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    tokenizer = PCTokenizer()

    def stem_merge(score_list: List[Tuple[str, float]]) -> Counter:
        # Merge term scores whose stems collide; terms the stemmer cannot
        # decode are silently dropped.
        c = Counter()
        for k, v in score_list:
            try:
                new_k = tokenizer.stemmer.stem(k)
                c[new_k] += v
            except UnicodeDecodeError:
                pass
        return c

    rm_info: Dict[str, List[Tuple[str, float]]] = dict_value_map(parse_float, rm_info)
    rm_info: Dict[str, List[Tuple[str, float]]] = dict_value_map(normalize_scores, rm_info)
    rm_info_c: Dict[str, Counter] = dict_value_map(stem_merge, rm_info)
    print(len(rm_info_c.keys()))
    print(len(claims))
    not_found = set()

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score: NamedNumber = bm25_module.score(c_text, p_text)
        nclaim_id = int(claim_id)
        # NOTE(review): membership test uses the *int* claim id, but the
        # annotation says rm_info keys are str — if keys really are strings,
        # this branch never fires and every claim lands in not_found.
        # Verify the actual key type against the caller.
        if nclaim_id in rm_info:
            ex_qtf = rm_info_c[nclaim_id]
            p_tokens = tokenizer.tokenize_stem(p_text)
            ex_score = bm25_module.score_inner(ex_qtf, Counter(p_tokens))
            # Append the expansion score's explanation to BM25's.
            new_info = score.name + "({})".format(ex_score.name)
            score = NamedNumber(score + ex_score, new_info)
        else:
            not_found.add(claim_id)
        return score

    r = predict_interface(claims, top_k, scorer)
    print(not_found)
    return r
def predict_by_para_scorer(score_pred_file_name: FileName,
                           cpid_resolute_file: FileName,
                           claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Rank perspectives by blending paragraph-classifier scores with the
    Lucene score (0.9 * classifier + 0.1 * lucene/20); missing query ids
    default to 0.5. Prints per-claim coverage statistics afterwards."""
    overall_suc = SuccessCounter()
    overall_suc.reset()
    pred_path: FilePath = pjoin(output_path, score_pred_file_name)
    print("Loading cpid_resolute")
    cpid_resolute: Dict[str, CPID] = load_cpid_resolute(cpid_resolute_file)
    print("Loading paragraph triple scores")
    score_d: Dict[CPID, float] = get_cpid_score_from_cache_or_raw(
        pred_path, cpid_resolute, "avg")
    claim_suc_d = {}   # claim_id -> SuccessCounter (found vs. missing)
    claim_band_d = {}  # claim_id -> Counter({1: score>0.8, 0: score<0.3})

    def scorer(lucene_score, query_id):
        cid_str, _pid_str = query_id.split("_")
        if cid_str not in claim_suc_d:
            claim_band_d[cid_str] = Counter()
            claim_suc_d[cid_str] = SuccessCounter()
        try:
            cls_score = score_d[query_id]
        except KeyError:
            # No classifier score for this pair: neutral default.
            cls_score = 0.5
            claim_suc_d[cid_str].fail()
            overall_suc.fail()
        else:
            claim_suc_d[cid_str].suc()
            if cls_score > 0.8:
                claim_band_d[cid_str][1] += 1
            elif cls_score < 0.3:
                claim_band_d[cid_str][0] += 1
            overall_suc.suc()
        return 0.9 * cls_score + 0.1 * lucene_score / 20

    r = predict_interface(claims, top_k, scorer)
    for cid_str in claim_suc_d:
        sc = claim_suc_d[cid_str]
        print("{} suc/total={}/{} True/False={}/{}".format(
            cid_str, sc.get_suc(), sc.get_total(),
            claim_band_d[cid_str][1], claim_band_d[cid_str][0]))
    print("{} found of {}".format(overall_suc.get_suc(), overall_suc.get_total()))
    return r
def predict_by_reweighter(bm25_module: BM25, claims, top_k, param) -> List[Tuple[str, List[Dict]]]:
    """Rank perspectives with BM25 over claim term frequencies re-weighted by
    per-claim term weights from ``get_claim_term_weighting``.

    Claim and perspective texts are tokenized with spaCy and stemmed with the
    BM25 module's stemmer; each claim term's frequency is multiplied by its
    weight. Terms without a weight are dropped from the query (original
    behavior, preserved).

    Fix: the weighting loop caught a bare ``Exception`` — narrowed to
    ``KeyError``, the only expected failure (missing weight for a term).

    :param bm25_module: BM25 scorer providing tokenizer and ``score_inner``.
    :param claims: claim structures accepted by ``claims_to_dict``.
    :param top_k: number of candidates to keep per claim.
    :param param: weighting parameters forwarded to ``get_claim_term_weighting``.
    :return: ranked (claim, perspectives) output of ``predict_interface``.
    """
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    claim_term_weight: Dict[int, Dict[str, float]] = get_claim_term_weighting(
        claims, param)
    nlp = spacy.load("en_core_web_sm")

    def do_stem(t: str) -> str:
        return bm25_module.tokenizer.stemmer.stem(t)

    def stem_tokenize(text: str) -> Iterator[str]:
        # Tokens the stemmer cannot decode are silently skipped.
        for t in nlp(text):
            try:
                yield do_stem(t.text)
            except UnicodeDecodeError:
                pass

    def apply_stem(term_weight: Dict[str, float]) -> Dict[str, float]:
        # Re-key the weight table by stem so it matches stem_tokenize output.
        return {do_stem(k): v for k, v in term_weight.items()}

    claim_term_weight: Dict[int, Dict[str, float]] = dict_value_map(
        apply_stem, claim_term_weight)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        qtf = Counter(stem_tokenize(c_text))
        weight = claim_term_weight[int(claim_id)]
        new_qtf = Counter()
        for k, v in qtf.items():
            try:
                new_qtf[k] = weight[k] * v
            except KeyError as e:
                # Unweighted term: dropped from the query.
                print("Exception")
                print(e)
                print(k)
        tf = Counter(stem_tokenize(p_text))
        score = bm25_module.score_inner(new_qtf, tf)
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
def predict_by_oracle_on_candidate(claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Oracle ranker: score 1 for perspectives that appear in any gold
    cluster of the claim, 0 otherwise.

    Fix: the original looped over every gold cluster even after a match and
    re-converted ``p_id`` each iteration; replaced with ``any()``, which
    short-circuits on the first hit.

    :param claims: claim structures accepted by ``predict_interface``.
    :param top_k: number of candidates to keep per claim.
    :return: ranked (claim, perspectives) output of ``predict_interface``.
    """
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        gold_pids = gold[int(claim_id)]
        pid = int(p_id)
        score = 1 if any(pid in cluster for cluster in gold_pids) else 0
        return NamedNumber(score, "")

    r = predict_interface(claims, top_k, scorer)
    return r
def predict_by_lm(claim_lms: List[ClaimLM], claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Score each perspective by the summed log-odds of its stemmed tokens
    under the claim's language model versus the averaged background LM."""
    alpha = 0.1
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    tokenizer = PCTokenizer()
    print("Eval log odds")
    log_odds_by_claim = {}
    for c_lm in claim_lms:
        log_odds_by_claim[str(c_lm.cid)] = get_log_odd(c_lm, bg_lm, alpha)

    def scorer(lucene_score, query_id) -> NamedNumber:
        cid_str, pid_str = query_id.split("_")
        persp_text = perspective_getter(int(pid_str))
        stems = tokenizer.tokenize_stem(persp_text)
        odds = log_odds_by_claim[cid_str]
        # Per-token contributions, kept as the score's explanation string.
        reason = " ".join("{0} ({1:.2f})".format(t, odds[t]) for t in stems)
        total = sum(odds[t] for t in stems)
        return NamedNumber(total, reason)

    return predict_interface(claims, top_k, scorer)
def pc_predict_vector_query_and_reweight(
        bm25_module: BM25, q_tf_replace: Dict[int, Counter],
        claims, top_k, param) -> List[Tuple[str, List[Dict]]]:
    """Rank perspectives with BM25 over re-weighted claim term frequencies,
    optionally augmented with externally supplied expansion vectors.

    Each claim's spaCy-tokenized, stemmed term frequencies are multiplied by
    per-claim term weights (terms without a weight keep weight 1). For claims
    present in ``q_tf_replace``, the 50 strongest normalized expansion terms
    are added on top.

    Fixes: removed a leftover per-claim debug ``print(weight)``, a dead
    broad ``try/except`` around the weighting loop (the lookup was already
    guarded), an unnecessary ``nonlocal``, and commented-out code.

    :param bm25_module: BM25 scorer with a stemming tokenizer.
    :param q_tf_replace: claim id -> expansion term Counter.
    :param claims: claim structures accepted by ``claims_to_dict``.
    :param top_k: number of candidates to keep per claim.
    :param param: weighting parameters for ``get_claim_term_weighting``.
    :return: ranked (claim, perspectives) output of ``predict_interface``.
    """
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    found_claim = set()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)

    def do_stem(t: str) -> str:
        return bm25_module.tokenizer.stemmer.stem(t)

    def apply_stem(term_weight: Dict[str, float]) -> Dict[str, float]:
        # Re-key the weight table by stem so it matches stem_tokenize output.
        return {do_stem(k): v for k, v in term_weight.items()}

    claim_term_weight: Dict[int, Dict[str, float]] = get_claim_term_weighting(
        claims, param)
    claim_term_weight = dict_value_map(apply_stem, claim_term_weight)
    nlp = spacy.load("en_core_web_sm")

    def stem_tokenize(text: str) -> Iterator[str]:
        # Tokens the stemmer cannot decode are silently skipped.
        for t in nlp(text):
            try:
                yield do_stem(t.text)
            except UnicodeDecodeError:
                pass

    def get_qtf(claim_id) -> Counter:
        # Claim term frequencies, multiplied by the claim's term weights;
        # unweighted terms keep their raw frequency (weight 1).
        weight = claim_term_weight[claim_id]
        c_text = cid_to_text[int(claim_id)]
        qtf = Counter(stem_tokenize(c_text))
        new_qtf = Counter()
        for k, v in qtf.items():
            new_qtf[k] = weight.get(k, 1) * v
        return new_qtf

    c_qtf_d = {k: get_qtf(k) for k in cid_to_text.keys()}

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        if i_claim_id in q_tf_replace_norm:
            # Top-50 expansion terms plus the claim's re-weighted terms.
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + c_qtf_d[i_claim_id]
            found_claim.add(i_claim_id)
        else:
            qtf = c_qtf_d[i_claim_id]
        p_text = perspective_getter(int(p_id))
        p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
        return bm25_module.score_inner(qtf, Counter(p_tokens))

    r = predict_interface(claims, top_k, scorer)
    print("{} of {} found".format(len(found_claim), len(claims)))
    return r