def build_and_show():
    """Build gold claim language models for the training split and print a
    diagnostic report (top LM terms, top/bottom log-odds terms) for the
    first 10 claims."""
    claim_lms = build_gold_claim_lm_train()
    # Smoothing weight for mixing each claim LM with the background LM.
    alpha = 0.1
    # Background LM = average of all claim LMs.
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))

    def show(claim_lm: ClaimLM):
        # Print a per-claim report: raw LM frequencies and the terms most/least
        # indicative of this claim relative to the background.
        print('----')
        print(claim_lm.claim)
        # log P(w | claim, smoothed) - log P(w | background) = per-term log odds.
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_bg_lm = get_lm_log(bg_lm)
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        for k, v in claim_lm.LM.most_common(50):
            print(k, v)
        s = "\t".join(left(claim_lm.LM.most_common(10)))
        print("LM freq: ", s)
        # NOTE(review): `s` is printed twice in a row here — looks like a
        # leftover duplicate debug print; confirm before removing.
        print(s)
        s = "\t".join(left(log_odd.most_common(30)))
        print("Log odd top", s)
        s = "\t".join(left(least_common(log_odd, 10)))
        print("Log odd bottom", s)

    # Only report on the first 10 claims to keep output manageable.
    foreach(show, claim_lms[:10])
def get_cluster_lm(cluster: List[int]) -> Counter:
    """Build a unigram LM for a perspective cluster.

    Fetches each perspective's text, tokenizes+stems it, converts the tokens
    to a relative-frequency Counter, and averages the per-perspective
    distributions into a single Counter.
    """
    texts = [perspective_getter(pid) for pid in cluster]
    freq_list = [tokens_to_freq(tokenizer.tokenize_stem(text)) for text in texts]
    return average_counters(freq_list)
def main():
    """Build a per-query language model for Robust, averaging the normalized
    term-frequency LMs of that query's positively-judged documents."""
    tprint("loading counter dict")
    # doc_id -> raw term-count Counter for every document.
    counter_dict: Dict[str, Counter] = load_counter_dict()

    def get_doc_lm(doc_id) -> Counter:
        # Normalize raw counts into a probability distribution over terms.
        counter = counter_dict[doc_id]
        n_tf = sum(counter.values())
        out_counter = Counter()
        for word, cnt in counter.items():
            out_counter[word] = cnt / n_tf
        return out_counter

    qrel = load_robust_qrel()

    def get_pos_docs(query_id):
        # Generator: yields doc ids with a positive relevance judgement.
        # Bare `return` in a generator just yields nothing for unknown queries.
        if query_id not in qrel:
            return
        judgement = qrel[query_id]
        for doc_id, score in judgement.items():
            if score:
                yield doc_id

    tprint("build query lm dict")
    query_lm_dict = {}
    queries = list(qrel.keys())
    for query_id in queries:
        pos_docs_ids: Iterable[str] = get_pos_docs(query_id)
        pos_doc_lms: List[Counter] = lmap(get_doc_lm, pos_docs_ids)
        # Query LM = average of its relevant-document LMs.
        query_lm: Counter = average_counters(pos_doc_lms)
        query_lm_dict[query_id] = query_lm
    # NOTE(review): query_lm_dict is built but never returned or saved here —
    # presumably this function is truncated or the persistence step lives
    # elsewhere; confirm.
def a_relevant(save_name, q_res_path, claims):
    """Score passages of the top-ranked docs for each claim with a
    claim-vs-background log-odds LM, then pickle (entries, all_passages).

    entries: list of (claim, passages) per claim; all_passages: flat list.
    """
    top_n = 10
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    # Warm the doc cache for the top_n docs of every claim's ranked list.
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    # Background LM = average over all claim LMs.
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    # Smoothing weight for the claim LM / background mixture.
    alpha = 0.5
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        # NOTE(review): `scores` (log-odds of the claim's own tokens) is
        # computed but never used in this function — dead code or a lost
        # thresholding step; confirm.
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

        def get_passage_score(p):
            # Mean log-odds of a passage's stemmed tokens; stopwords score 0.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        # Count passages with positive (claim-leaning) score.
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1
        all_passages.extend(passages)
        entries.append((c, passages))
    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum, num_pos_exists))
    data = entries, all_passages
    save_to_pickle(data, save_name)
def a_relevant():
    """For the first 10 training claims, print a per-document table of
    (rank, avg passage score, max passage score) over the top-100 ranked docs,
    using a claim-vs-background log-odds LM scorer."""
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    # LMs are built from ALL claims so the background average is stable...
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    # ...but only the first 10 claims are actually inspected below.
    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    stopwords = load_stopwords_for_query()
    # Smoothing weight for the claim/background mixture.
    alpha = 0.7
    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            # Mean log-odds of a token list; stopwords contribute 0.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                # Doc not in the local cache: keep a placeholder so ranks align.
                docs.append(None)
                pass
        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue
            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            # NOTE(review): max()/average() will raise if get_doc_score returns
            # an empty list (e.g. empty document) — confirm that cannot happen.
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))
        print_table(rows)
def get_lm_scorer(claim_lms: List[ClaimLM], alpha):
    """Build a scorer(claim_id, p_tokens) -> NamedNumber.

    The score is the sum of per-token log odds (claim LM smoothed with the
    background LM by `alpha`, versus the background LM); the NamedNumber's
    name string records each token's contribution.
    """
    background = average_counters(lmap(lambda c: c.LM, claim_lms))
    log_odds_by_cid: Dict[int, Counter] = {}
    for c_lm in claim_lms:
        log_odds_by_cid[c_lm.cid] = get_log_odd(c_lm, background, alpha)

    def scorer(claim_id: int, p_tokens: List[str]) -> NamedNumber:
        odds = log_odds_by_cid[claim_id]
        per_token = ["{0} ({1:.2f})".format(t, odds[t]) for t in p_tokens]
        total = sum(odds[t] for t in p_tokens)
        return NamedNumber(total, " ".join(per_token))

    return scorer
def cluster_to_query(cluster: PerspectiveCluster) -> DocQuery:
    """Turn a perspective cluster into a Galago document query.

    The query's term weights are the claim's term frequencies plus the
    average term frequencies of the cluster's perspective texts.
    """
    claim_text = claim_text_d[cluster.claim_id]
    perspective_texts = [perspective_text_d[pid] for pid in cluster.perspective_ids]
    query_id = get_pc_cluster_query_id(cluster)

    claim_terms: Counter = get_terms(claim_text)
    avg_perspective_terms: Counter = average_counters(
        lmap(get_terms, perspective_texts))
    combined_tf = sum_counters([claim_terms, avg_perspective_terms])
    return counter_to_galago_query(query_id, combined_tf)
def a_relevant():
    """Collect passages from the top-10 ranked docs of every training claim,
    score them with a claim-vs-background log-odds LM, and pickle
    (entries, all_passages) as "pc_train_a_passages".

    entries: (claim, positively-scored passages) per claim;
    all_passages: every scored passage, regardless of sign.

    Fixes vs. the previous version: removed the no-op `claims = claims`, and
    removed the dead claim-token scoring block (`claim_text`/`claim_tokens`/
    `scores`/`base`) — `base` was never used and `average(scores)` could raise
    ZeroDivisionError when none of a claim's tokens appeared in the log-odds
    counter.
    """
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    # Warm the document cache for the docs we will actually read.
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    # Background LM = average over all claim LMs.
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    # Smoothing weight for the claim/background mixture.
    alpha = 0.3
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            # Mean log-odds of the passage's stemmed tokens; stopwords score 0.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        all_passages.extend(passages)
        # Keep only passages whose score leans toward the claim.
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)
        entries.append((c, a_rel_passages))
    data = entries, all_passages
    save_to_pickle(data, "pc_train_a_passages")
def __init__(self, query_lms: Dict[str, Counter], alpha=0.5):
    """Initialize the scorer state.

    query_lms: per-query unigram LMs; the background LM is their average.
    alpha: smoothing weight for mixing a query LM with the background.
    log_odd_d starts as an empty Counter per query (filled elsewhere).
    """
    self.query_lms = query_lms
    self.bg_lm = average_counters(list(query_lms.values()))
    self.log_bg_lm: Counter = get_lm_log(self.bg_lm)
    self.alpha = alpha
    # One placeholder log-odds counter per query id.
    self.log_odd_d: Dict[str, Counter] = {qid: Counter() for qid in query_lms}
    self.stopwords = load_stopwords_for_query()
    self.tokenizer = PCTokenizer()
def main():
    """Correlate LM-based doc scores with subjectivity predictions.

    argv[1]: subjectivity prediction file; argv[2]: Galago ranked-list path.
    Prints Pearson correlations of LM scores vs. subjectivity rate / counts.
    """
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    # load LM
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    # Smoothing weight for the claim/background mixture.
    alpha = 0.1
    stopwords = load_stopwords_for_query()
    # load subjectivity predictions.
    # doc_id -> (num subjective sentences, num sentences) — presumably; confirm
    # against load_subjectivity's producer.
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()
    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            # Mean log-odds of the tokens; stopwords contribute 0.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            # Only docs that have a subjectivity prediction are scored.
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert type(tokens[0]) == str
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                # NOTE(review): raises ZeroDivisionError if num_sent == 0 —
                # confirm upstream guarantees at least one sentence.
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)
    print("lm scores correlation with ")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))
def do_datagen(d_ids, q_res_path, save_name):
    """Generate training records for each claim LM and write them, one output
    file per claim id, under env_data_dir/save_name."""
    claims: List[Dict] = get_claims_from_ids(d_ids)
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claim_lms = build_gold_lms(claims)
    bg_lm = average_counters(lmap(lambda c: c.LM, claim_lms))
    alpha = 0.1
    max_seq_length = 512
    make_records = get_generator(max_seq_length, bg_lm, alpha)

    out_dir = os.path.join(env_data_dir, save_name)
    exist_or_mkdir(out_dir)
    for c_lm in claim_lms:
        print(c_lm.cid)
        records: List[Record] = make_records(c_lm, ranked_list[str(c_lm.cid)])
        write_records(records, max_seq_length, os.path.join(out_dir, str(c_lm.cid)))
def predict_by_lm(claim_lms: List[ClaimLM], claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Rank perspective candidates per claim by the log-odds of the
    perspective text under the claim LM vs. the background LM; ranking
    mechanics are delegated to predict_interface."""
    alpha = 0.1
    background = average_counters(lmap(lambda c: c.LM, claim_lms))
    tokenizer = PCTokenizer()
    print("Eval log odds")
    log_odds_by_claim = {}
    for c_lm in claim_lms:
        log_odds_by_claim[str(c_lm.cid)] = get_log_odd(c_lm, background, alpha)

    def scorer(lucene_score, query_id) -> NamedNumber:
        # query_id encodes "<claim_id>_<perspective_id>".
        claim_id, p_id = query_id.split("_")
        stemmed = tokenizer.tokenize_stem(perspective_getter(int(p_id)))
        odds = log_odds_by_claim[claim_id]
        reason = " ".join(["{0} ({1:.2f})".format(t, odds[t]) for t in stemmed])
        total = sum(odds[t] for t in stemmed)
        return NamedNumber(total, reason)

    return predict_interface(claims, top_k, scorer)
def join_docs_and_lm():
    """Render an HTML report ("doc_lm_joined.html") that shows, for the first
    10 training claims, the gold perspective texts followed by the top-ranked
    docs with each token highlighted by its claim-vs-background log odds."""
    gold = get_claim_perspective_id_dict()
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims[:10]
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    # Extends a module-level `stopwords` collection with punctuation.
    stopwords.update([".", ",", "!", "?"])
    alpha = 0.1
    html_visualizer = HtmlVisualizer("doc_lm_joined.html")

    # NOTE(review): defined but never used below — the loop calls an external
    # `get_cell_from_token` instead. Dead code or a mis-named call; confirm.
    def get_cell_from_token2(token, probs):
        if token.lower() in stopwords:
            probs = 0
        probs = probs * 1e5
        s = min(100, probs)
        c = Cell(token, s)
        return c

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        clusters: List[List[int]] = gold[c['cId']]
        # Show the gold perspective texts for each cluster.
        for cluster in clusters:
            html_visualizer.write_paragraph("---")
            p_text_list: List[str] = lmap(perspective_getter, cluster)
            for text in p_text_list:
                html_visualizer.write_paragraph(text)
            html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        # Tracks stems missing from the LM for the footer diagnostic.
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        # NOTE(review): get_probs is never called in this function.
        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
                html_visualizer.write_headline("Doc rank {}".format(i))
                html_visualizer.multirow_print(cells, width=20)
            except KeyError:
                # Doc not in the cache: silently skip this rank.
                pass
        html_visualizer.write_paragraph("Not found: {}".format(not_found))
tokenizer = PCTokenizer() problems, candidate_pool_d = prepare_eval_data(split) payload: List[Passage] = get_eval_payload_from_dp(problems) for query, problem in zip(payload, problems): p = problem source_text = p.text1.text tokens = tokenizer.tokenize_stem(source_text) counter = tokens_to_freq(tokens) yield RelevanceModel(query.id.id, query.text, counter) if __name__ == "__main__": split = "training" lms: List[Tuple[str, Counter]] = list(build_lm(split)) alpha = 0.1 bg_lm = average_counters(lmap(lambda x: x.lm, lms)) def show(r: RelevanceModel): print('----') print(r.text) log_topic_lm = get_lm_log(smooth(r.lm, bg_lm, alpha)) log_bg_lm = get_lm_log(bg_lm) log_odd: Counter = subtract(log_topic_lm, log_bg_lm) for k, v in r.lm.most_common(50): print(k, v) s = "\t".join(left(r.lm.most_common(10))) print("LM freq: ", s) print(s)
def get_claim_lm(claim) -> ClaimLM:
    """Build a ClaimLM for one claim by averaging the LMs of its gold
    perspective clusters (looked up in the enclosing `gold` mapping)."""
    claim_id = claim["cId"]
    cluster_lms: List[Counter] = [get_cluster_lm(cluster) for cluster in gold[claim_id]]
    averaged: Counter = average_counters(cluster_lms)
    return ClaimLM(claim_id, claim['text'], averaged)
def doc_lm_scoring():
    """Render an HTML report ("doc_lm_doc_level.html") of passage scores per
    training claim: per-claim threshold, top-5/bottom-5/random-5 passage
    scores, and full token-level views of positively-scored passages."""
    gold = get_claim_perspective_id_dict()
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    # Smoothing weight for the claim/background mixture.
    alpha = 0.5
    html_visualizer = HtmlVisualizer("doc_lm_doc_level.html")
    tokenizer = PCTokenizer()
    # Pool of previously-seen passages, sampled as a random baseline.
    random_passages = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # for cluster in clusters:
        #     html_visualizer.write_paragraph("---")
        #     p_text_list: List[str] = lmap(perspective_getter, cluster)
        #     for text in p_text_list:
        #         html_visualizer.write_paragraph(text)
        #     html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        # Per-claim threshold = mean log-odds of the claim's own tokens.
        # NOTE(review): average(scores) will fail if no claim token appears in
        # log_odd — confirm claim tokens always overlap the LM vocabulary.
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        threshold = average(scores)
        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        # NOTE(review): get_probs is never called in this function.
        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        def get_passage_score(p):
            # Mean log-odds over the passage's stemmed tokens (no stopword filter here).
            return sum([log_odd[tokenizer.stemmer.stem(t)] for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        passages.sort(key=lambda x: x[1], reverse=True)
        html_visualizer.write_paragraph("Threshold {}".format(threshold))
        top5_scores = right(passages[:5])
        bot5_scores = right(passages[-5:])
        # Random baseline: 5 passages sampled from earlier claims' pools.
        if len(random_passages) > 5:
            random_sel_pssages = random.choices(random_passages, k=5)
        else:
            random_sel_pssages = []
        random5_scores = lmap(get_passage_score, random_sel_pssages)

        def score_line(scores):
            return " ".join(lmap(two_digit_float, scores))

        html_visualizer.write_paragraph("top 5: " + score_line(top5_scores))
        html_visualizer.write_paragraph("bot 5: " + score_line(bot5_scores))
        html_visualizer.write_paragraph("random 5: " + score_line(random5_scores))
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        def print_doc(doc, html_visualizer, score):
            # Render one passage with per-token log-odds highlighting.
            cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
            html_visualizer.write_headline("score={}".format(score))
            html_visualizer.multirow_print(cells, width=20)

        random_passages.extend(left(passages))
        # Skip the detailed dump when even the claim's own tokens score negative.
        if threshold < 0:
            continue
        # Passages are sorted descending, so stop at the first negative score.
        for doc, score in passages:
            if score < 0:
                break
            print_doc(doc, html_visualizer, score)
        html_visualizer.write_headline("Bottom 5")
        for doc, score in passages[-5:]:
            print_doc(doc, html_visualizer, score)
    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum, num_pos_exists))
def text_list_to_lm(tokenizer: PCTokenizer, text_list: List[str]) -> Counter:
    """Average the stemmed term-frequency distributions of each text into a
    single unigram language model."""
    per_text_freqs = [
        tokens_to_freq(tokenizer.tokenize_stem(text)) for text in text_list
    ]
    return average_counters(per_text_freqs)