def main():
    """Select paragraphs for the dev-split claims and cache the result.

    Loads cached claim documents and ClueWeb12-B13 document frequencies,
    runs top-k paragraph selection, and pickles the output as
    "dev_claim_paras".
    """
    claim_docs: Dict[str, List[List[str]]] = load_from_pickle("dev_claim_docs")
    _, clue12_13_df = load_clueweb12_B13_termstat()
    dev_claim_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(dev_claim_ids)
    selected = select_paragraph(claim_docs, clue12_13_df, claims, "topk")
    save_to_pickle(selected, "dev_claim_paras")
def __init__(self, split, q_config_id, out_dir):
    """Wire up the worker: ranked-list interface for *q_config_id*,
    data points for *split*, ClueWeb12-B13 document frequencies, and a
    tokenizer.
    """
    self.out_dir = out_dir
    self.ci = StaticRankedListInterface(q_config_id)
    print("load__data_point")
    self.all_data_points = load_data_point(split)
    print("Load term stat")
    # Only the document-frequency table is kept; term frequencies are dropped.
    _, df_table = load_clueweb12_B13_termstat()
    self.clue12_13_df = df_table
    self.tokenizer = get_tokenizer()
def __init__(self, option, out_dir):
    """Wire up the worker: ranked-list interface, the flattened UKP data
    points converted to TPDataPoint, ClueWeb12-B13 document frequencies,
    the dp-id -> query-result-id mapping, and a tokenizer.
    """
    self.out_dir = out_dir
    self.option = option
    self.ci = RankedListInterface()
    print("load__data_point")
    self.all_data_points: List[TPDataPoint] = lmap(
        ukp_datapoint_to_tp_datapoint, load_all_data_flat())
    # Jobs process the data points in fixed-size slices of 50.
    self.data_step_size = 50
    num_jobs = ceil_divide(len(self.all_data_points), self.data_step_size)
    print("total_jobs :", num_jobs)
    print("Load term stat")
    # Only the document-frequency table is kept; term frequencies are dropped.
    _, df_table = load_clueweb12_B13_termstat()
    self.clue12_13_df = df_table
    self.dp_id_to_q_res_id_fn = build_dp_id_to_q_res_id_fn()
    self.tokenizer = get_tokenizer()
def count_term_stat(doc_list, unigrams):
    """Collect term statistics for *unigrams* over *doc_list*, partitioned
    by whether each document mentions controversy, plus corpus-level
    ClueWeb12-B13 statistics.

    A document counts as "controversy" when its token set contains
    'controversy' or 'controversial'. Each doc is a dict with 'tokens_set',
    'tf_d' (per-term tf), and 'dl' (document length).

    Returns the 12-tuple
    (cdf_cont, cdf_ncont, clueweb_cdf, clueweb_ctf, clueweb_df, clueweb_tf,
     ctf_cont, ctf_ncont, df_cont, df_ncont, tf_cont, tf_ncont).
    """
    clueweb_tf, clueweb_df = load_clueweb12_B13_termstat()
    clueweb_ctf = sum(clueweb_tf.values())
    # Collection df upper bound: strictly above any observed per-term df.
    clueweb_cdf = max(clueweb_df.values()) + 100

    # Accumulators keyed by the controversy flag (True = controversy docs).
    tf_by_class = {True: Counter(), False: Counter()}
    df_by_class = {True: Counter(), False: Counter()}
    ctf_by_class = {True: 0, False: 0}
    cdf_by_class = {True: 0, False: 0}

    for doc in doc_list:
        token_set = doc['tokens_set']
        is_cont = 'controversy' in token_set or 'controversial' in token_set
        for t in unigrams:
            if t in token_set:
                tf_by_class[is_cont][t] += doc['tf_d'][t]
                df_by_class[is_cont][t] += 1
        ctf_by_class[is_cont] += doc['dl']
        cdf_by_class[is_cont] += 1

    return (cdf_by_class[True], cdf_by_class[False], clueweb_cdf, clueweb_ctf,
            clueweb_df, clueweb_tf, ctf_by_class[True], ctf_by_class[False],
            df_by_class[True], df_by_class[False], tf_by_class[True],
            tf_by_class[False])
def build_binary_feature(ci: DynRankedListInterface,
                         datapoint_list: List[PerspectiveCandidate]
                         ) -> List[Dict]:
    """Build binary-model features for each perspective candidate.

    A passage counts as "mentioning" the claim+perspective when its
    IDF-weighted coverage of the query terms exceeds 80% of the attainable
    IDF mass, using ClueWeb12-B13 document frequencies.
    """
    not_found_set = set()
    print("Load term stat")
    _, clue12_13_df = load_clueweb12_B13_termstat()
    # Assumed collection size (N) for the IDF formula — TODO confirm against
    # the actual ClueWeb12-B13 document count.
    cdf = 50 * 1000 * 1000

    def idf(term: str):
        if term not in clue12_13_df:
            # Printable single tokens (punctuation etc.) get zero weight;
            # anything else unseen is recorded for later inspection.
            if term in string.printable:
                return 0
            not_found_set.add(term)
        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    def idf_scorer(doc: Counter,
                   claim_text: str,
                   perspective_text: str) -> bool:
        raw_tokens = nltk.word_tokenize(claim_text) \
                     + nltk.word_tokenize(perspective_text)
        cp_tokens = set(lmap(lambda x: x.lower(), raw_tokens))
        mentioned_terms = re_tokenize(lfilter(lambda x: x in doc, cp_tokens))
        score = sum(lmap(idf, mentioned_terms))
        max_score = sum(lmap(idf, cp_tokens))
        # Matched iff at least 80% of the attainable IDF mass is covered.
        return score > max_score * 0.8

    def data_point_to_feature(x: PerspectiveCandidate) -> Dict:
        feature, num_mention = get_feature_binary_model(
            x.cid, x.pid, x.claim_text, x.p_text, ci, idf_scorer)
        return {
            'feature': feature,
            'cid': x.cid,
            'pid': x.pid,
            'num_mention': num_mention,
            'label': x.label,
        }

    return lmap(data_point_to_feature, datapoint_list)
def binary_feature_demo(datapoint_list):
    """Render an HTML debugging report ("pc_binary_feature.html") showing,
    for each (claim, perspective) data point, the top retrieved passages
    with IDF-coverage scores, a BM25 re-estimate, and highlighted matched
    tokens. Demo/inspection code — writes HTML, returns nothing.
    """
    ci = PassageRankedListInterface(make_passage_query, Q_CONFIG_ID_BM25)
    not_found_set = set()
    # Only the ClueWeb12-B13 document-frequency table is used here.
    _, clue12_13_df = load_clueweb12_B13_termstat()
    # Assumed collection size (N) for IDF/BM25 — TODO confirm against the
    # actual ClueWeb12-B13 document count.
    cdf = 50 * 1000 * 1000
    html = HtmlVisualizer("pc_binary_feature.html")

    def idf_scorer(doc, claim_text, perspective_text):
        # Lowercased token set of claim + perspective.
        cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(
            perspective_text)
        cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
        cp_tokens = set(cp_tokens)
        # Query terms that actually occur in the passage.
        mentioned_terms = lfilter(lambda x: x in doc, cp_tokens)
        mentioned_terms = re_tokenize(mentioned_terms)

        def idf(term):
            if term not in clue12_13_df:
                # Printable tokens (punctuation etc.) get zero weight;
                # other unseen terms are recorded for inspection.
                if term in string.printable:
                    return 0
                not_found_set.add(term)
            # Counter lookup returns 0 for missing terms; +0.5 smoothing.
            return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

        score = sum(lmap(idf, mentioned_terms))
        max_score = sum(lmap(idf, cp_tokens))
        # print(claim_text, perspective_text)
        # print(mentioned_terms)
        # print(score, max_score)
        return score, max_score, mentioned_terms

    def bm25_estimator(doc: Counter, claim_text: str, perspective_text: str):
        cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(
            perspective_text)
        cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
        # k1 = 0 collapses the tf saturation term; only the IDF-like first
        # factor and the K normalization remain.
        k1 = 0

        def BM25_3(f, qf, df, N, dl, avdl) -> float:
            K = compute_K(dl, avdl)
            first = math.log((N - df + 0.5) / (df + 0.5))
            second = ((k1 + 1) * f) / (K + f)
            return first * second

        dl = sum(doc.values())
        info = []
        for q_term in set(cp_tokens):
            if q_term in doc:
                # avdl hard-coded to 1200 — presumably the corpus average
                # document length; verify.
                score = BM25_3(doc[q_term], 0, clue12_13_df[q_term], cdf,
                               dl, 1200)
                info.append((q_term, doc[q_term], clue12_13_df[q_term],
                             score))
        return info

    print_cnt = 0  # NOTE(review): never read or incremented below.
    for dp_idx, x in enumerate(datapoint_list):
        ranked_list: List[GalagoRankEntry] = ci.query_passage(
            x.cid, x.pid, x.claim_text, x.p_text)
        html.write_paragraph(x.claim_text)
        html.write_paragraph(x.p_text)
        html.write_paragraph("{}".format(x.label))
        local_print_cnt = 0
        lines = []
        for ranked_entry in ranked_list:
            try:
                doc_id = ranked_entry.doc_id
                galago_score = ranked_entry.score
                tokens = load_doc(doc_id)
                doc_tf = Counter(tokens)
                # NOTE(review): Counter(tokens) is never None, so the else
                # branch below looks unreachable; if it ever ran,
                # `mentioned_terms` would be unbound in get_cell. Confirm
                # intent before relying on the "Unk" path.
                if doc_tf is not None:
                    score, max_score, mentioned_terms = idf_scorer(
                        doc_tf, x.claim_text, x.p_text)
                    # Matched iff 75% of attainable IDF mass is covered.
                    matched = score > max_score * 0.75
                else:
                    matched = "Unk"
                    score = "Unk"
                    max_score = "Unk"

                def get_cell(token):
                    # Highlight tokens that matched the claim/perspective.
                    if token in mentioned_terms:
                        return Cell(token, highlight_score=50)
                    else:
                        return Cell(token)

                line = [doc_id, galago_score, matched, score, max_score]
                lines.append(line)
                html.write_paragraph("{0} / {1:.2f}".format(
                    doc_id, galago_score))
                html.write_paragraph("{}/{}".format(score, max_score))
                bm25_info = bm25_estimator(doc_tf, x.claim_text, x.p_text)
                bm25_score = sum(lmap(lambda x: x[3], bm25_info))
                html.write_paragraph(
                    "bm25 re-estimate : {}".format(bm25_score))
                html.write_paragraph("{}".format(bm25_info))
                html.multirow_print(lmap(get_cell, tokens))
                local_print_cnt += 1
                # At most 11 passages rendered per data point.
                if local_print_cnt > 10:
                    break
            except KeyError:
                # Best-effort: skip passages whose doc cannot be loaded.
                pass
        # Summary table, truncated after the last matched passage.
        matched_idx = idx_where(lambda x: x[2], lines)
        if not matched_idx:
            html.write_paragraph("No match")
        else:
            last_matched = matched_idx[-1]
            lines = lines[:last_matched + 1]
            rows = lmap(lambda line: lmap(Cell, line), lines)
            html.write_table(rows)
        # Demo cap: only the first ~12 data points are rendered.
        if dp_idx > 10:
            break