def main(config):
    word_list_path = config['word_list_path']
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()
    word_list_d: Dict = json.load(open(word_list_path, "r"))
    tokenizer = PCTokenizer()
    for query_id in word_list_d:
        claim = claim_d[int(query_id)]
        word_list = word_list_d[query_id]
        base_query_terms = tokenizer.tokenize_stem(claim)
        base_query_terms = [t for t in base_query_terms if t not in stopwords]
        new_term_set = set()
        for new_term in word_list:
            t = tokenizer.stemmer.stem(new_term)
            if t not in base_query_terms:
                new_term_set.add(t)
        print()
        print("Claim {}: {}".format(query_id, claim))
        print("base query terms: ", base_query_terms)
        print("new terms: ", new_term_set)
def a_relevant(save_name, q_res_path, claims):
    top_n = 10
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.5
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1
        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum, num_pos_exists))
    data = entries, all_passages
    save_to_pickle(data, save_name)
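# A minimal, self-contained sketch of the log-odds passage scoring used above,
# under the assumption that smooth() mixes the topic and background language
# models as alpha * P_topic + (1 - alpha) * P_bg, that get_lm_log() takes the
# element-wise log of a probability Counter, and that subtract() takes the
# element-wise difference. The toy_* helpers below are hypothetical stand-ins
# that only illustrate the arithmetic; they are not the project's implementation.
import math
from collections import Counter


def toy_smooth(topic_lm: Counter, bg_lm: Counter, alpha: float) -> Counter:
    keys = set(topic_lm) | set(bg_lm)
    return Counter({k: alpha * topic_lm[k] + (1 - alpha) * bg_lm[k] for k in keys})


def toy_get_lm_log(lm: Counter) -> Counter:
    return Counter({k: math.log(v) for k, v in lm.items() if v > 0})


def toy_log_odd(topic_lm: Counter, bg_lm: Counter, alpha: float) -> Counter:
    log_topic = toy_get_lm_log(toy_smooth(topic_lm, bg_lm, alpha))
    log_bg = toy_get_lm_log(bg_lm)
    return Counter({k: log_topic[k] - log_bg.get(k, 0.0) for k in log_topic})


# Topic-indicative terms get positive log-odds; background terms go negative.
toy_topic = Counter({"vaccine": 0.2, "mandate": 0.1, "the": 0.7})
toy_bg = Counter({"vaccine": 0.01, "mandate": 0.01, "the": 0.98})
print(sorted(toy_log_odd(toy_topic, toy_bg, alpha=0.5).items(), key=lambda x: x[1], reverse=True))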
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    stopwords = load_stopwords_for_query()
    alpha = 0.7
    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue
            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))
        print_table(rows)
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.3
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)
        entries.append((c, a_rel_passages))

    data = entries, all_passages
    save_to_pickle(data, "pc_train_a_passages")
def __init__(self, query_lms: Dict[str, Counter], alpha=0.5):
    self.query_lms = query_lms
    bg_lm = average_counters(list(query_lms.values()))
    self.bg_lm = bg_lm
    self.log_bg_lm: Counter = get_lm_log(bg_lm)
    self.alpha = alpha
    self.log_odd_d: Dict[str, Counter] = {k: Counter() for k in query_lms.keys()}
    self.stopwords = load_stopwords_for_query()
    self.tokenizer = PCTokenizer()
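# Hedged sketch: how a scorer initialized as above could lazily fill
# self.log_odd_d and score a single token for a given query. The method names
# (_get_log_odd, token_score) are hypothetical; only the
# smooth / get_lm_log / subtract pattern is taken from the surrounding functions.
def _get_log_odd(self, qid: str) -> Counter:
    # Compute the per-query log-odds Counter on first use and cache it.
    if not self.log_odd_d[qid]:
        log_topic_lm = get_lm_log(smooth(self.query_lms[qid], self.bg_lm, self.alpha))
        self.log_odd_d[qid] = subtract(log_topic_lm, self.log_bg_lm)
    return self.log_odd_d[qid]


def token_score(self, qid: str, token: str) -> float:
    # Stopwords contribute nothing; other tokens are stemmed and looked up.
    if token in self.stopwords:
        return 0
    return self._get_log_odd(qid)[self.tokenizer.stemmer.stem(token)]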
def main():
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    # load LM
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1
    stopwords = load_stopwords_for_query()
    # load subjectivity predictions
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()
    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert type(tokens[0]) == str
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)

    print("lm scores correlation with ")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))
def get_answer_maker(config):
    stopwords = load_stopwords_for_query()

    def make_answer1(problem: PackedInstance, score: np.array) -> List[str]:
        # Among the tokens from the documents, select unique words with the highest scores.
        token_score = Counter()
        n_appear = Counter()
        max_len = len(problem.input_ids)
        for idx in range(max_len):
            if problem.input_mask[idx] == 0:
                break
            # skip query tokens
            if problem.segment_ids[idx] == 0:
                continue
            token_idx = problem.idx_mapping[idx]
            if token_idx == -1:
                assert problem.word_tokens[token_idx] == "[CLS]"
                print("skip cls token")
                continue
            token = problem.word_tokens[token_idx]
            token_score[token] += score[idx]
            n_appear[token] += 1

        out_tokens = []
        max_score = None
        for token, t_score in token_score.most_common():
            if len(out_tokens) > config.max_terms:
                break
            if config.drop_stopwords and token in stopwords:
                continue
            if max_score is None:
                max_score = t_score
            score_cut = max_score * config.cut_factor
            if len(out_tokens) == 0:
                include = True
            else:
                include = t_score > score_cut
            if include:
                out_tokens.append(token)
            else:
                break
        return out_tokens

    return make_answer1
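# Self-contained toy illustration of the selection rule inside make_answer1:
# walk tokens in descending aggregated score and keep them until either
# config.max_terms is exceeded or a score falls to or below
# max_score * cut_factor. The config object and the scores here are hypothetical,
# and stopword filtering is omitted for brevity.
from collections import Counter
from types import SimpleNamespace


def select_terms_demo(token_score: Counter, config) -> list:
    out_tokens = []
    max_score = None
    for token, t_score in token_score.most_common():
        if len(out_tokens) > config.max_terms:
            break
        if max_score is None:
            max_score = t_score
        if out_tokens and t_score <= max_score * config.cut_factor:
            break
        out_tokens.append(token)
    return out_tokens


toy_config = SimpleNamespace(max_terms=5, cut_factor=0.5)
toy_scores = Counter({"tax": 3.0, "carbon": 2.4, "policy": 1.2, "the": 0.1})
print(select_terms_demo(toy_scores, toy_config))  # ['tax', 'carbon']: 1.2 <= 3.0 * 0.5 stops the walk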
def get_answer_maker_token_level(config):
    stopwords = load_stopwords_for_query()

    def make_answer(problem: str, score: np.array) -> List[str]:
        tokens = problem.split()
        sep_idx = tokens.index("[SEP]")
        # Among the tokens from the documents, select unique words with the highest scores.
        token_score = Counter()
        n_appear = Counter()
        max_len = len(tokens)
        print(tokens)
        print(max_len)
        print(len(score))
        for idx in range(sep_idx + 1, max_len):
            # query tokens end at [SEP]; stop at padding
            if tokens[idx] == "[PAD]":
                break
            token = tokens[idx]
            token_score[token] += score[idx]
            n_appear[token] += 1

        out_tokens = []
        max_score = None
        for token, t_score in token_score.most_common():
            if len(out_tokens) > config.max_terms:
                break
            if config.drop_stopwords and token in stopwords:
                continue
            if max_score is None:
                max_score = t_score
            score_cut = max_score * config.cut_factor
            if len(out_tokens) == 0:
                include = True
            else:
                include = t_score > score_cut
            if include:
                out_tokens.append(token)
            else:
                break
        return out_tokens

    return make_answer
def main():
    split = "dev"
    stopword = load_stopwords_for_query()
    # split = "train"
    ex_info_dir = "/mnt/nfs/work3/youngwookim/job_man/pc_rm_terms_{}".format(split)
    query_path = os.path.join(
        output_path, "perspective_{}_claim_query_k0_fixed.json".format(split))
    queries = load_queries(query_path)
    ex_w_scale = 100
    out_path = os.path.join(output_path, "perspective_query",
                            "pc_{}_claim_query_rm_ex.json".format(split))
    new_queries = get_extended(ex_info_dir, ex_w_scale, queries, stopword)
    save_queries_to_file(new_queries, out_path)
def get_generator(max_seq_length, bg_lm, alpha):
    log_bg_lm = get_lm_log(bg_lm)
    top_n = 100
    stopwords = load_stopwords_for_query()
    fail_logger = Counter()
    bert_tokenizer = get_tokenizer()

    def generate(claim_lm: ClaimLM, ranked_list: List[SimpleRankedListEntry]):
        claim_text = claim_lm.claim
        claim_tokens = bert_tokenizer.tokenize(claim_text)
        claim_token_len = len(claim_tokens)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        doc_ids = lmap(lambda x: x.doc_id, ranked_list[:top_n])
        print("loading docs")
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        window_size = max_seq_length - claim_token_len - 3
        step_size = max_seq_length - 112
        enum_paragraph = enum_paragraph_functor(step_size, window_size)

        def get_record(tokens):
            scores, masks = get_target_labels(tokens, log_odd, stopwords, fail_logger)
            return Record(claim_tokens, tokens, scores, masks)

        tokens_list: List[List[str]] = []
        not_found = 0
        for doc_id in doc_ids:
            try:
                tokens: List[str] = list(flatten(load(BertTokenizedCluewebDoc, doc_id)))
                tokens_list.append(tokens)
            except KeyError:
                not_found += 1

        print("{} of {} not found".format(not_found, len(doc_ids)))
        paragraph_list: Iterable[List[str]] = enum_paragraph(tokens_list)
        records: List[Record] = lmap(get_record, paragraph_list)
        return records

    return generate
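# Hedged sketch of the sliding-window enumeration that enum_paragraph_functor
# presumably provides: windows of window_size tokens taken every step_size tokens
# from each document. toy_enum_paragraph is an illustrative stand-in, not the
# project's implementation.
from typing import Iterable, List


def toy_enum_paragraph(step_size: int, window_size: int):
    def enum_paragraph(tokens_list: List[List[str]]) -> Iterable[List[str]]:
        for tokens in tokens_list:
            # Slide a fixed-size window over the token sequence with the given stride.
            for start in range(0, max(len(tokens) - window_size + 1, 1), step_size):
                yield tokens[start:start + window_size]
    return enum_paragraph


# Example: 10-token windows with a stride of 4 over a 20-token document.
toy_enum = toy_enum_paragraph(step_size=4, window_size=10)
toy_doc = ["tok{}".format(i) for i in range(20)]
for window in toy_enum([toy_doc]):
    print(window[0], "...", window[-1])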
def main():
    file_path = sys.argv[1]
    name = os.path.basename(file_path)
    viewer = EstimatorPredictionViewer(file_path)
    html = HtmlVisualizer("toke_score_gold.html")
    stopwords = load_stopwords_for_query()
    skip = 10
    for entry_idx, entry in enumerate(viewer):
        if entry_idx % skip != 0:
            continue
        tokens = entry.get_tokens("input_ids")
        input_ids = entry.get_vector("input_ids")
        label_ids = entry.get_vector("label_ids")
        label_ids = np.reshape(label_ids, [-1, 2])
        log_label_ids = np.log(label_ids + 1e-10)
        seg1, seg2 = split_p_h_with_input_ids(tokens, input_ids)
        pad_idx = tokens.index("[PAD]")
        assert pad_idx > 0
        logits = entry.get_vector("logits")
        cells = []
        cells2 = []
        for idx in range(pad_idx):
            probs = label_ids[idx]
            token = tokens[idx]
            score = probs[0]
            color = "B" if score > 0 else "R"
            highlight_score = min(abs(score) * 10000, 100)
            if token in stopwords:
                highlight_score = 0
            if token in seg1:
                highlight_score = 50
                color = "G"
            c = Cell(token, highlight_score=highlight_score, target_color=color)
            cells.append(c)
        html.multirow_print_from_cells_list([cells, cells2])
        if entry_idx > 10000:
            break
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    save_path = config['save_path']
    threshold = config['threshold']
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        return len(tokens) == 1 and tokens[0] in stopwords

    tokenizer = get_tokenizer()
    all_d = {}
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue
            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))
        entry.sort(key=get_second, reverse=True)
        word_list = []
        for word, diff, pos, neg in entry[:100]:
            if diff > threshold:
                word_list.append(word.strip())
        all_d[query_id] = word_list
    json.dump(all_d, open(save_path, "w"))
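# Toy, self-contained illustration of the ranking criterion above: each word
# carries (pos, neg) probabilities, is scored by log(pos + 1e-10) - log(neg + 1e-10),
# sorted in descending order, and kept only if the difference exceeds the threshold.
# The words, probabilities, and threshold here are made up for the example.
import math

toy_probs = {"renewable": (0.30, 0.05), "subsidy": (0.12, 0.10), "and": (0.40, 0.41)}
toy_threshold = 0.5
toy_scored = [(w, math.log(p + 1e-10) - math.log(n + 1e-10)) for w, (p, n) in toy_probs.items()]
toy_scored.sort(key=lambda x: x[1], reverse=True)
print([w for w, diff in toy_scored if diff > toy_threshold])  # ['renewable'] with these numbers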
def get_answer_maker_term_level(config):
    stopwords = load_stopwords_for_query()
    stopwords.add("[SEP]")
    stopwords.add("[CLS]")
    tokenizer = get_tokenizer()

    def make_answer1(problem: PackedInstance, score: Dict) -> List[str]:
        # Among the terms from the documents, select unique terms with the highest scores.
        out_tokens = []
        max_score = None
        for term_id_str, term_score in Counter(score).most_common():
            if len(out_tokens) > config.max_terms:
                break
            input_ids = recover_int_list_str(term_id_str)
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            term = get_term(tokens)
            if config.drop_stopwords and term in stopwords:
                continue
            if max_score is None:
                max_score = term_score
            score_cut = max_score * config.cut_factor
            if len(out_tokens) == 0:
                include = True
            else:
                include = term_score > score_cut
            if include:
                out_tokens.append(term)
            else:
                break
        return out_tokens

    return make_answer1
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        return len(tokens) == 1 and tokens[0] in stopwords

    tokenizer = get_tokenizer()
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue
            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        print(query_id, claim_d[int(query_id)])
        entry.sort(key=get_second, reverse=True)
        for word, diff, pos, neg in entry[:100]:
            word = word.strip()
            print("{0}\t{1:.2f}\t{2:.2f}\t{3:.2f}".format(word, diff, pos, neg))
def __init__(self, d: Dict[WordAsID, np.array], skip_stopwords=True, stem=True):
    self.tokenizer = get_tokenizer()
    self.stopwords_as_ids: Set[WordAsID] = set()
    new_d = {}
    if skip_stopwords:
        stopwords = load_stopwords_for_query()
        for key in d.keys():
            tokens = decode_word_as_id(self.tokenizer, key)
            if len(tokens) == 1 and tokens[0] in stopwords:
                self.stopwords_as_ids.add(key)
            else:
                new_d[key] = d[key]
        d = new_d
    if stem:
        d_raw = defaultdict(list)
        stemmer = Stemmer()
        for key in d.keys():
            tokens = decode_word_as_id(self.tokenizer, key)
            plain_word = pretty_tokens(tokens, True)
            stemmed = stemmer.stem(plain_word)
            d_raw[stemmed].append(d[key])
        new_d: Dict[str, TokenScore] = {}
        for key, items in d_raw.items():
            score: TokenScore = [average([t[0] for t in items]),
                                 average([t[1] for t in items])]
            new_d[key] = score
        d = new_d
        self.stem = True
        self.stemmer = stemmer
        self.log_odd = self.log_odd_w_stem
    self.d = d
    self.smoothing = 0.1
def __init__(self, drop_stopwords=True):
    self.drop_stopwords = drop_stopwords
    self.stopword = load_stopwords_for_query()
    # (excerpted from inside a per-claim loop; c, top_n, and n_missing come from the enclosing scope)
    q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
    print(c['cId'], c['text'])
    missing = []
    for i in range(top_n):
        try:
            doc = load_doc(q_res[i].doc_id)
        except KeyError:
            missing.append(i)
    print(missing)
    n_missing += len(missing)
    print("")


stopwords = load_stopwords_for_query()


def get_cell_from_token(token, log_odd):
    if token in stopwords:
        log_odd = 0
    if log_odd > 0:
        s = min(150, log_odd * 50)
        c = Cell(token, s, target_color="B")
    elif log_odd < 0:
        s = min(150, -log_odd * 50)
        c = Cell(token, s, target_color="R")
    else:
        c = Cell(token)
    return c
def doc_lm_scoring():
    gold = get_claim_perspective_id_dict()
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.5
    html_visualizer = HtmlVisualizer("doc_lm_doc_level.html")
    tokenizer = PCTokenizer()
    random_passages = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # for cluster in clusters:
        #     html_visualizer.write_paragraph("---")
        #     p_text_list: List[str] = lmap(perspective_getter, cluster)
        #     for text in p_text_list:
        #         html_visualizer.write_paragraph(text)
        #     html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        threshold = average(scores)
        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        def get_passage_score(p):
            return sum([log_odd[tokenizer.stemmer.stem(t)] for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        passages.sort(key=lambda x: x[1], reverse=True)
        html_visualizer.write_paragraph("Threshold {}".format(threshold))
        top5_scores = right(passages[:5])
        bot5_scores = right(passages[-5:])
        if len(random_passages) > 5:
            random_sel_passages = random.choices(random_passages, k=5)
        else:
            random_sel_passages = []
        random5_scores = lmap(get_passage_score, random_sel_passages)

        def score_line(scores):
            return " ".join(lmap(two_digit_float, scores))

        html_visualizer.write_paragraph("top 5: " + score_line(top5_scores))
        html_visualizer.write_paragraph("bot 5: " + score_line(bot5_scores))
        html_visualizer.write_paragraph("random 5: " + score_line(random5_scores))
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        def print_doc(doc, html_visualizer, score):
            cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
            html_visualizer.write_headline("score={}".format(score))
            html_visualizer.multirow_print(cells, width=20)

        random_passages.extend(left(passages))
        if threshold < 0:
            continue
        for doc, score in passages:
            if score < 0:
                break
            print_doc(doc, html_visualizer, score)
        html_visualizer.write_headline("Bottom 5")
        for doc, score in passages[-5:]:
            print_doc(doc, html_visualizer, score)

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum, num_pos_exists))
def filter_stopwords(tokens: Iterable[str]) -> List[str]:
    global stopwords
    if stopwords is None:
        stopwords = load_stopwords_for_query()
    return [t for t in tokens if t not in stopwords]