import math
import string
import sys
from collections import Counter
from typing import Dict, Iterable, List, Set, Tuple

import nltk
from scipy.stats import pearsonr

# Project-local helpers (load_train_claim_ids, get_claims_from_ids, load_doc,
# smooth, get_lm_log, subtract, PCTokenizer, etc.) are assumed to be imported
# from the surrounding package.


def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    stopwords = load_stopwords_for_query()
    alpha = 0.7
    tokenizer = PCTokenizer()

    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            # Mean log-odds of the passage tokens, with stopwords zeroed out.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue
            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))
        print_table(rows)

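# A minimal, self-contained sketch of the scoring idea above: the passage
# score is the mean per-token log-odds log P(t | claim LM) - log P(t | bg LM),
# with the claim LM smoothed toward the background. The three underscore
# helpers are hypothetical stand-ins for the project's smooth / get_lm_log /
# subtract and only assume linear interpolation, element-wise log, and
# counter subtraction; they are not the project implementations.
def _log_odds_demo():
    def _smooth(topic, bg, alpha):
        keys = set(topic) | set(bg)
        return Counter({k: alpha * topic[k] + (1 - alpha) * bg[k] for k in keys})

    def _log(lm):
        return Counter({k: math.log(v) for k, v in lm.items() if v > 0})

    def _subtract(a, b):
        return Counter({k: a[k] - b.get(k, 0.0) for k in a})

    topic_lm = Counter({"vaccine": 0.5, "mandate": 0.3, "the": 0.2})
    bg_lm = Counter({"vaccine": 0.05, "mandate": 0.05, "the": 0.9})
    log_odd = _subtract(_log(_smooth(topic_lm, bg_lm, 0.7)), _log(bg_lm))

    passage = ["the", "vaccine", "mandate"]
    score = sum(log_odd[t] for t in passage) / len(passage)
    print(score)  # positive: the passage leans toward the claim LM
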
def main():
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    # Load the per-claim language models and the background LM.
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1
    stopwords = load_stopwords_for_query()

    # Load subjectivity predictions: doc_id -> (num_subj, num_sent).
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()

    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert type(tokens[0]) == str
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)

    print("lm scores correlation with")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))

def show_docs(claims, ranked_list, top_n):
    # For each claim, print the top-ranked documents.
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        print(c['cId'], c['text'])
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                print()
                print("Doc rank {}".format(i))
                print(" ".join(doc))
            except KeyError:
                pass
        print("--------")

def iterate_docs(doc_ids: List[str]) -> Iterable[KnowledgeDocument]:
    docs = []
    for doc_id in doc_ids:
        try:
            tokens = load_doc(doc_id)
            kd = KnowledgeDocument(doc_id, tokens)
            docs.append(kd)
        except KeyError:
            pass
    if len(docs) < len(doc_ids):
        print("Retrieved {} of {} docs".format(len(docs), len(doc_ids)))
    duplicate_doc_ids = get_duplicate(docs)
    unique_docs = [d for d in docs if d.doc_id not in duplicate_doc_ids]
    return unique_docs

def iterate_docs(q_res: List[SimpleRankedListEntry], top_n: int) -> Iterable[KnowledgeDocument]:
    docs = []
    for i in range(top_n):
        try:
            tokens = load_doc(q_res[i].doc_id)
            kd = KnowledgeDocument(q_res[i].doc_id, tokens)
            docs.append(kd)
        except KeyError:
            pass
    if len(docs) < top_n:
        print("Retrieved {} of {} docs".format(len(docs), top_n))
    duplicate_doc_ids = get_duplicate(docs)
    unique_docs = [d for d in docs if d.doc_id not in duplicate_doc_ids]
    return unique_docs

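# get_duplicate() above is a project helper. A hypothetical minimal version,
# assuming it returns the doc_ids of documents whose token sequence repeats
# an earlier document, might look like this sketch (KnowledgeDocument is
# assumed to expose .doc_id and .tokens, matching its constructor above):
def _get_duplicate_sketch(docs: List[KnowledgeDocument]) -> Set[str]:
    seen = set()        # hashes of token sequences seen so far
    duplicates = set()  # doc_ids whose content repeats an earlier doc
    for doc in docs:
        key = hash(tuple(doc.tokens))
        if key in seen:
            duplicates.add(doc.doc_id)
        else:
            seen.add(key)
    return duplicates
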
def report_missing(claims, ranked_list, top_n):
    # For each claim, report the ranks whose documents could not be loaded.
    n_missing = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        print(c['cId'], c['text'])
        missing = []
        for i in range(top_n):
            try:
                load_doc(q_res[i].doc_id)
            except KeyError:
                missing.append(i)
        print(missing)
        n_missing += len(missing)
        print()
    print("Total missing: {}".format(n_missing))

def iterate_passages(q_res, top_n, get_passage_score):
    passages = []
    docs = []
    for i in range(top_n):
        try:
            doc = load_doc(q_res[i].doc_id)
            docs.append(doc)
        except KeyError:
            pass

    # Split each unique document into non-overlapping 300-token windows
    # and score each window.
    for doc in remove_duplicate(docs):
        idx = 0
        window_size = 300
        while idx < len(doc):
            p = doc[idx:idx + window_size]
            score = get_passage_score(p)
            passages.append((p, score))
            idx += window_size
    return passages

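# Usage sketch for iterate_passages(): rank the fixed-size windows by their
# language-model score and keep the top k. The q_res and get_passage_score
# arguments are assumed to come from a caller like a_relevant() above.
def _top_passages_demo(q_res, get_passage_score, k=5):
    passages = iterate_passages(q_res, top_n=100, get_passage_score=get_passage_score)
    passages.sort(key=lambda pair: pair[1], reverse=True)  # highest score first
    return passages[:k]
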
def binary_feature_demo(datapoint_list):
    ci = PassageRankedListInterface(make_passage_query, Q_CONFIG_ID_BM25)
    not_found_set = set()
    _, clue12_13_df = load_clueweb12_B13_termstat()
    cdf = 50 * 1000 * 1000
    html = HtmlVisualizer("pc_binary_feature.html")

    def idf_scorer(doc, claim_text, perspective_text):
        cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(perspective_text)
        cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
        cp_tokens = set(cp_tokens)
        mentioned_terms = lfilter(lambda x: x in doc, cp_tokens)
        mentioned_terms = re_tokenize(mentioned_terms)

        def idf(term):
            if term not in clue12_13_df:
                if term in string.printable:
                    return 0
                not_found_set.add(term)
            return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

        score = sum(lmap(idf, mentioned_terms))
        max_score = sum(lmap(idf, cp_tokens))
        return score, max_score, mentioned_terms

    def bm25_estimator(doc: Counter, claim_text: str, perspective_text: str):
        cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(perspective_text)
        cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
        k1 = 0

        def BM25_3(f, qf, df, N, dl, avdl) -> float:
            K = compute_K(dl, avdl)
            first = math.log((N - df + 0.5) / (df + 0.5))
            second = ((k1 + 1) * f) / (K + f)
            return first * second

        dl = sum(doc.values())
        info = []
        for q_term in set(cp_tokens):
            if q_term in doc:
                score = BM25_3(doc[q_term], 0, clue12_13_df[q_term], cdf, dl, 1200)
                info.append((q_term, doc[q_term], clue12_13_df[q_term], score))
        return info

    for dp_idx, x in enumerate(datapoint_list):
        ranked_list: List[GalagoRankEntry] = ci.query_passage(
            x.cid, x.pid, x.claim_text, x.p_text)
        html.write_paragraph(x.claim_text)
        html.write_paragraph(x.p_text)
        html.write_paragraph("{}".format(x.label))
        local_print_cnt = 0
        lines = []
        for ranked_entry in ranked_list:
            try:
                doc_id = ranked_entry.doc_id
                galago_score = ranked_entry.score
                tokens = load_doc(doc_id)
                doc_tf = Counter(tokens)
                if doc_tf is not None:
                    score, max_score, mentioned_terms = idf_scorer(
                        doc_tf, x.claim_text, x.p_text)
                    matched = score > max_score * 0.75
                else:
                    matched = "Unk"
                    score = "Unk"
                    max_score = "Unk"
                    mentioned_terms = []

                def get_cell(token):
                    if token in mentioned_terms:
                        return Cell(token, highlight_score=50)
                    else:
                        return Cell(token)

                line = [doc_id, galago_score, matched, score, max_score]
                lines.append(line)
                html.write_paragraph("{0} / {1:.2f}".format(doc_id, galago_score))
                html.write_paragraph("{}/{}".format(score, max_score))
                bm25_info = bm25_estimator(doc_tf, x.claim_text, x.p_text)
                bm25_score = sum(lmap(lambda e: e[3], bm25_info))
                html.write_paragraph("bm25 re-estimate : {}".format(bm25_score))
                html.write_paragraph("{}".format(bm25_info))
                html.multirow_print(lmap(get_cell, tokens))
                local_print_cnt += 1
                if local_print_cnt > 10:
                    break
            except KeyError:
                pass

        matched_idx = idx_where(lambda line: line[2], lines)
        if not matched_idx:
            html.write_paragraph("No match")
        else:
            last_matched = matched_idx[-1]
            lines = lines[:last_matched + 1]
            rows = lmap(lambda line: lmap(Cell, line), lines)
            html.write_table(rows)
        if dp_idx > 10:
            break

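# compute_K() used in BM25_3 above is a project helper. Under the standard
# Okapi BM25 length normalization it would be K = k1 * ((1 - b) + b * dl / avdl);
# the version below is a hypothetical sketch with the common defaults k1=1.2,
# b=0.75 (binary_feature_demo itself sets its own k1 = 0 for the
# term-frequency factor, which is separate from the k1 inside K).
def _compute_K_sketch(dl: float, avdl: float, k1: float = 1.2, b: float = 0.75) -> float:
    return k1 * ((1 - b) + b * dl / avdl)
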
def join_docs_and_lm():
    gold = get_claim_perspective_id_dict()
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims[:10]
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords.update([".", ",", "!", "?"])

    alpha = 0.1
    html_visualizer = HtmlVisualizer("doc_lm_joined.html")

    def get_cell_from_token2(token, probs):
        if token.lower() in stopwords:
            probs = 0
        probs = probs * 1e5
        s = min(100, probs)
        return Cell(token, s)

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))

        # Print the gold perspective clusters for this claim.
        clusters: List[List[int]] = gold[c['cId']]
        for cluster in clusters:
            html_visualizer.write_paragraph("---")
            p_text_list: List[str] = lmap(perspective_getter, cluster)
            for text in p_text_list:
                html_visualizer.write_paragraph(text)
            html_visualizer.write_paragraph("---")

        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(topic_lm_prob)
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)

        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
                html_visualizer.write_headline("Doc rank {}".format(i))
                html_visualizer.multirow_print(cells, width=20)
            except KeyError:
                pass
        html_visualizer.write_paragraph("Not found: {}".format(not_found))