def start_generate_jobs_for_train_val(generator: InstanceGenerator, name_prefix):
    """Launch QCK instance-generation jobs for the train/val claim split.

    Claims are split 7:3 into train/val; QK candidates are routed to the
    side of the split whose claim id they belong to, and one JobRunner is
    started per side (378 train jobs, 162 val jobs).

    Fixes: removed redundant ``list([...])`` wrappers around list
    comprehensions, and gave the two worker factories distinct names — the
    original defined ``worker_factory`` twice, shadowing the first.
    """
    # claim ids split to train/val
    print("Loading data ....")
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    train, val = split_7_3(claims)
    train_cids = {str(t['cId']) for t in train}
    val_cids = {str(t['cId']) for t in val}
    qk_candidate: List[QKUnit] = load_qk_candidate_train()
    print("Generate instances : train")
    qk_candidate_train: List[QKUnit] = [qk for qk in qk_candidate
                                        if qk[0].query_id in train_cids]
    qk_candidate_val: List[QKUnit] = [qk for qk in qk_candidate
                                      if qk[0].query_id in val_cids]

    def train_worker_factory(out_dir):
        # Each job consumes the train-side QK candidates.
        return QCKWorker(qk_candidate_train, generator, out_dir)

    runner = JobRunner(job_man_dir, 378, name_prefix + "_train", train_worker_factory)
    runner.start()

    print("Generate instances : val")

    def val_worker_factory(out_dir):
        # Each job consumes the val-side QK candidates.
        return QCKWorker(qk_candidate_val, generator, out_dir)

    runner = JobRunner(job_man_dir, 162, name_prefix + "_val", val_worker_factory)
    runner.start()
def start_generate_jobs_for_train_val(
        generator_functor: Callable[[Dict[int, List[Tuple[List[str], float]]]],
                                    CPPNCGeneratorInterface],
        writer,
        name_prefix):
    """Launch CPPNC instance-generation jobs for the train/val claim split.

    Builds a claim-id -> passages mapping from the "pc_train_a_passages"
    pickle, instantiates the generator from it, then starts one JobRunner
    per split side (378 train jobs, 162 val jobs).
    """
    # claim ids split to train/val
    all_ids: List[int] = list(load_train_claim_ids())
    all_claims = get_claims_from_ids(all_ids)
    train_claims, val_claims = split_7_3(all_claims)

    # Pre-computed candidate passages, keyed by claim id.
    entries, all_passages = load_from_pickle("pc_train_a_passages")
    cid_to_passages: Dict[int, List[Tuple[List[str], float]]] = {
        claim['cId']: p for claim, p in entries
    }
    generator = generator_functor(cid_to_passages)

    # Run train first, then val — same ordering and side effects as before.
    job_specs = [("train", train_claims, 378, "_train"),
                 ("val", val_claims, 162, "_val")]
    for label, split_claims, num_jobs, suffix in job_specs:
        print("Generate instances : " + label)

        def worker_factory(out_dir, split_claims=split_claims):
            # Default-arg binding pins this iteration's claim list.
            return CPPNCWorker(split_claims, generator, writer, out_dir)

        runner = JobRunner(job_man_dir, num_jobs, name_prefix + suffix, worker_factory)
        runner.start()
def write_claim_perspective_pair_as_query():
    """Write one BM25 query per (claim, perspective) candidate pair.

    The query text is the tokenized concatenation of the claim text and the
    perspective text; queries are split into files of 50 under the split's
    query directory.
    """
    split = "dev"
    assert split in ["train", "dev", "test"]
    # All three id loaders run eagerly inside the dict literal, matching the
    # original behavior (only the selected split's ids are consumed).
    ids_by_split = {
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }
    d_ids = list(ids_by_split[split])
    claims = get_claims_from_ids(d_ids)
    print(len(claims), " claims")
    is_train = split == "train"
    all_data_points = get_candidates(claims, is_train)
    k = 0  # forwarded verbatim to format_query_bm25

    def to_doc_query(x: PerspectiveCandidate) -> DocQuery:
        query_tokens = clean_tokenize_str_to_tokens(x.claim_text + " " + x.p_text)
        query_id = "{}_{}".format(x.cid, x.pid)
        return format_query_bm25(query_id, query_tokens, k)

    queries = lmap(to_doc_query, all_data_points)
    out_dir = query_dir_format.format(split)
    exist_or_mkdir(out_dir)
    n_query_per_file = 50
    write_queries_to_files(n_query_per_file, out_dir, queries)
def work():
    """Run the binary-feature demo on the first 10 train candidates."""
    claim_ids = list(load_train_claim_ids())
    train_claims = get_claims_from_ids(claim_ids)
    is_train = True
    candidates = get_candidates(train_claims, is_train)
    binary_feature_demo(candidates[:10])
def sum_random_walk_score(name_class):
    """Return random-walk word scores minus the initial word probabilities.

    Both distributions are accumulated over all claims, normalized to sum
    to 1, and the difference is reported for words longer than 2 chars.
    """
    claim_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(claim_ids)
    claim_d = claims_to_dict(claims)  # NOTE(review): unused; call kept for parity
    prob_score_d = load_from_pickle("pc_{}_word_prob_train".format(name_class))
    stopwords = load_stopwords()

    # Accumulate the initial per-word probabilities, skipping stopwords.
    init_probs = Counter()
    for prob_scores in prob_score_d.values():
        for word, prob in prob_scores:
            if word not in stopwords:
                init_probs[word] += prob

    # Accumulate the random-walk scores over all claims.
    rw_score = dict(load_from_pickle("bias_random_walk_train_{}".format(name_class)))
    walk_scores = Counter()
    for qtf in rw_score.values():
        for word, score in qtf.items():
            walk_scores[word] += score

    init_probs = normalize_counter_to_sum1(init_probs)
    walk_scores = normalize_counter_to_sum1(walk_scores)

    # Difference of normalized distributions, short tokens dropped.
    diff = Counter()
    for word, score in walk_scores.items():
        if len(word) > 2:
            diff[word] = score - init_probs[word]
    return diff
def save_train():
    """Run the relevant-passage extraction over train claims and save it."""
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    train_claims: List[Dict] = get_claims_from_ids(list(load_train_claim_ids()))
    a_relevant("pc_train_a_passages", q_res_path, train_claims)
def main():
    """Build dummy CPPNC problems for all train claims and save them
    under the name given on the command line."""
    args = parser.parse_args(sys.argv[1:])
    claims: List[Dict] = get_claims_from_ids(list(load_train_claim_ids()))
    candidate_perspectives: Dict[int, List[Dict]] = dict(
        get_eval_candidates_from_pickle("train"))
    make_cppnc_dummy_problem(claims, candidate_perspectives,
                             args.save_name, encode_two_inputs)
def a_relevant():
    """Print per-document average/max passage scores for the first 10
    train claims, scored by claim-LM vs background-LM log-odds.

    Fixes:
      * ``except KeyError`` now also covers ``IndexError``: a claim's
        ranked list may hold fewer than ``top_n`` entries, which previously
        raised an unhandled IndexError at ``q_res[i]``.
      * removed the redundant ``pass`` after the append in that handler.
    """
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    # LMs are built over ALL claims before truncating, so the background LM
    # is identical to the original behavior.
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    claims = claims[:10]  # inspect only the first 10 claims
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    stopwords = load_stopwords_for_query()
    alpha = 0.7  # smoothing weight between claim LM and background LM
    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            # Mean log-odds over tokens; stopwords score 0, empty passage -> 0.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                docs.append(load_doc(q_res[i].doc_id))
            except (KeyError, IndexError):
                # KeyError: doc missing from cache;
                # IndexError: ranked list shorter than top_n.
                docs.append(None)
        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue
            scores = get_doc_score(doc, get_passage_score)
            rows.append((rank, average(scores), max(scores)))
        print_table(rows)
def run_write_claims_as_plain_query():
    """Write train and dev claims as plain-text queries, one per line.

    Fix: the output file was opened without ever being closed; a ``with``
    block now guarantees the handle is flushed and closed, even on error.
    """
    for claim_ids, out_name in [
            (load_train_claim_ids(), "train_claim_query_raw.txt"),
            (load_dev_claim_ids(), "dev_claim_query_raw.txt")]:
        claims = get_claims_from_ids(claim_ids)
        q_str_list = get_claims_as_plain_query(claims)
        with open(pjoin(output_path, out_name), "w") as f:
            for s in q_str_list:
                f.write(s + "\n")
def a_relevant():
    """Score candidate passages for every train claim by claim-LM log-odds
    and pickle ``(entries, all_passages)`` as "pc_train_a_passages".

    ``entries`` pairs each claim with its positively-scored passages;
    ``all_passages`` keeps every scored passage.

    Fix: removed the no-op statement ``claims = claims``.
    """
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.3  # smoothing weight between claim LM and background LM
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        # NOTE(review): `base` is computed but never used — confirm whether a
        # threshold comparison was intended here (cf. doc_lm_scoring).
        base = average(scores)

        def get_passage_score(p):
            # Mean log-odds over tokens; stopwords score 0, empty passage -> 0.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)
        entries.append((c, a_rel_passages))
    data = entries, all_passages
    save_to_pickle(data, "pc_train_a_passages")
def save_bm25_as_trec_format():
    """Rank perspective candidates for train claims with BM25 and write the
    predictions as a TREC-format ranked list."""
    claim_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(claim_ids)
    top_k = 200
    candidate_dict: List[Tuple[int, List[int]]] = get_eval_candidates_w_q_text(
        claim_as_query(claims), top_k)
    ranked = predict_by_bm25_from_candidate(get_bm25_module(), claims,
                                            candidate_dict, top_k)
    trec_entries = prediction_to_trec_format(ranked, "bm25")
    out_path = os.path.join(output_path, "ranked_list", "bm25.txt")
    write_trec_ranked_list_entry(trec_entries, out_path)
def show_missing():
    """Report which documents are missing from the cache for the first 10
    train claims' top-100 ranked results."""
    train_claims: List[Dict] = get_claims_from_ids(list(load_train_claim_ids()))
    train_claims = train_claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = \
        load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, train_claims, top_n)
    report_missing(train_claims, ranked_list, top_n)
def write_claim_queries_k0():
    """Write k0 claim queries for the train and dev splits as JSON files."""
    def write_split(claim_ids, split_name):
        claims = get_claims_from_ids(claim_ids)
        queries = get_claims_query(claims, True)
        out_path = os.path.join(
            output_path,
            "perspective_{}_claim_query_k0.json".format(split_name))
        save_queries_to_file(queries, out_path)

    write_split(load_train_claim_ids(), "train")
    write_split(load_dev_claim_ids(), "dev")
def write_claim_as_query():
    """Convert each train claim's text into a BM25 (Anserini) query entry
    and save all of them to one JSON file.

    NOTE(review): the claim ids come from the *train* split but the output
    file is named perspective_dev_claim_query.json — confirm which split
    is intended.
    """
    claims = get_claims_from_ids(list(load_train_claim_ids()))
    queries = []
    for claim in claims:
        claim_id = claim["cId"]
        query_text = clean_query(claim["text"].split())
        print(query_text)
        queries.append(get_query_entry_bm25_anseri(claim_id, query_text))
    out_path = os.path.join(output_path, "perspective_dev_claim_query.json")
    save_queries_to_file(queries, out_path)
def work():
    """Print the number of train claims and stop.

    Everything after the ``exit()`` call below is unreachable as written;
    it is kept byte-for-byte in spirit (RM3 job submission for the dev
    split) so it can be re-enabled by removing the early exit.
    """
    claim_ids, split_name = (load_train_claim_ids(), "train")
    print("Num claims in train : ", len(list(claim_ids)))
    exit()

    # --- unreachable: exit() above terminates the process ---
    def submit_jobs_inner(ids, name):
        split_claims = get_claims_from_ids(ids)
        queries = get_claims_query(split_claims)
        out_root = "/mnt/nfs/work3/youngwookim/data/perspective/{}_claim_rm3".format(
            name)
        exist_or_mkdir(out_root)
        submit_rm_jobs(queries, out_root)

    claim_ids, split_name = (load_dev_claim_ids(), "dev")
    submit_jobs_inner(claim_ids, split_name)
def run_get_claim_term_weighting():
    """Print each train claim with high-weight tokens bracketed.

    Term weights come from get_claim_term_weighting (BM25 k1=0.5); tokens
    whose weight exceeds 1 are printed as "[token]".
    """
    param = {'k1': 0.5}
    claim_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(claim_ids)
    out_d = get_claim_term_weighting(claims, param)
    nlp = spacy.load("en_core_web_sm")
    for claim in claims:
        weight = out_d[claim['cId']]
        for token in nlp(claim['text']):
            text = token.text
            if weight[text] > 1:
                text = "[{}]".format(text)
            print(text, end=" ")
        print()
def main():
    """Request KDP evaluation jobs for a handful of val-split QK candidates."""
    print("Loading data ....")
    claim_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(claim_ids)
    _, val = split_7_3(claims)
    val_cids = {str(c['cId']) for c in val}
    qk_candidate: List[QKUnit] = load_qk_candidate_train()
    qk_candidate_val = [qk for qk in qk_candidate
                        if qk[0].query_id in val_cids]
    print(qk_candidate_val[0][0])
    # Submit candidates 1..8 (the first one was only printed above).
    for q, kdp_list in qk_candidate_val[1:9]:
        job_id = request_kdp_eval(kdp_list)
        print('qid:', q.query_id)
        print('job_id', job_id)
def load_train_claim_d():
    """Return a {claim id -> claim text} mapping for the train claims."""
    claims: List[Dict] = get_claims_from_ids(list(load_train_claim_ids()))
    return {claim['cId']: claim['text'] for claim in claims}
def work():
    """Compute per-claim positive/negative word probability distributions.

    For the chosen split, candidate perspectives are grouped by claim id;
    for each group the average relative term frequency is accumulated
    separately for positive ("1") and negative ("0") labeled perspectives.
    Three pickles are written:
      * per_claim_class_word_tf_{split}: {cid -> (pos_avg_tf, neg_avg_tf)}
      * pc_pos_word_prob_{split}: {cid -> [(word, normalized score)]}
      * pc_neg_word_prob_{split}: {cid -> [(word, normalized score)]}
    """
    split = "train"
    assert split in ["train", "dev", "test"]
    tokenizer = PCTokenizer()
    # All three loaders run eagerly inside the dict literal; only the
    # selected split's ids are used.
    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)  # NOTE(review): unused — confirm removable
    print(len(claims), " claims")
    do_balance = False
    all_data_points: List[PerspectiveCandidate] = get_candidates(
        claims, do_balance)
    # Group candidate perspectives by their claim id.
    grouped: Dict[str, List] = group_by(all_data_points, lambda x: x.cid)

    def get_frequency_per_class(datapoints: List[PerspectiveCandidate]):
        # Return (pos_avg_tf, neg_avg_tf): average relative term frequency
        # over the positive- and negative-labeled perspectives of one claim.
        pos_text = []
        neg_text = []
        for dp in datapoints:
            tokens = tokenizer.tokenize_stem(dp.p_text)
            tf = Counter(tokens)
            dl = sum(tf.values())
            # Relative term frequency within this perspective text.
            tf_rel = {k: v / dl for k, v in tf.items()}
            if dp.label == "1":
                pos_text.append(tf_rel)
            elif dp.label == "0":
                neg_text.append(tf_rel)
            else:
                assert False

        def accumulate(tf_list: List[Dict]):
            # Average the tf dicts element-wise.
            out_c = Counter()
            n = len(tf_list)
            for tf in tf_list:
                for k, v in tf.items():
                    out_c[k] += v / n
            return out_c

        pos_avg_tf = accumulate(pos_text)
        neg_avg_tf = accumulate(neg_text)
        return pos_avg_tf, neg_avg_tf

    class_freq: Dict[str, Tuple[Counter, Counter]] = dict_value_map(
        get_frequency_per_class, grouped)
    save_to_pickle(class_freq, "per_claim_class_word_tf_{}".format(split))

    def normalize(s_list: List[float]) -> List[float]:
        # Scale scores so they sum to 1.
        m = sum(s_list)
        return list([s / m for s in s_list])

    pos_prob_dict = {}
    neg_prob_dict = {}
    for cid, info in class_freq.items():
        pos, neg = info
        all_words = set(pos.keys())
        all_words.update(neg.keys())
        # Per-word difference: positive avg tf minus negative avg tf.
        info = []
        for word in all_words:
            score = pos[word] - neg[word]
            info.append((word, score))
        pos_scores = list([(w, s) for w, s in info if s > 0])
        neg_scores = list([(w, s) for w, s in info if s < 0])

        def normalize_right(pair_list):
            # Normalize the score column of (word, score) pairs.
            right_scores = normalize(right(pair_list))
            return list(zip(left(pair_list), right_scores))

        pos_prob_dict[cid] = normalize_right(pos_scores)
        neg_prob_dict[cid] = normalize_right(neg_scores)

    save_to_pickle(pos_prob_dict, "pc_pos_word_prob_{}".format(split))
    save_to_pickle(neg_prob_dict, "pc_neg_word_prob_{}".format(split))
def main():
    """Run token datagen over the train claims' top-100 ranked list and
    save it as "pc_token_train"."""
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    claim_ids = list(load_train_claim_ids())
    return do_datagen(claim_ids, q_res_path, "pc_token_train")
def join_docs_and_lm():
    """Render an HTML report ("doc_lm_joined.html") joining ranked documents
    with per-claim LM log-odds for the first 10 train claims.

    For each claim: the gold perspective clusters are printed, then each of
    the top-10 documents is rendered token-by-token, with each token's cell
    weighted by its claim-vs-background log-odds.
    """
    gold = get_claim_perspective_id_dict()
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims[:10]  # only the first 10 claims are rendered
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    # `stopwords` is a module-level collection here (not loaded locally);
    # punctuation is added so it is zero-weighted in cells.
    stopwords.update([".", ",", "!", "?"])
    alpha = 0.1  # smoothing weight between claim LM and background LM
    html_visualizer = HtmlVisualizer("doc_lm_joined.html")

    def get_cell_from_token2(token, probs):
        # NOTE(review): defined but never called below — the loop uses
        # `get_cell_from_token` (presumably imported); confirm intent.
        if token.lower() in stopwords:
            probs = 0
        probs = probs * 1e5
        s = min(100, probs)
        c = Cell(token, s)
        return c

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # Print the gold perspective clusters for this claim.
        clusters: List[List[int]] = gold[c['cId']]
        for cluster in clusters:
            html_visualizer.write_paragraph("---")
            p_text_list: List[str] = lmap(perspective_getter, cluster)
            for text in p_text_list:
                html_visualizer.write_paragraph(text)
            html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            # Stem, record misses, return log-odds (0 for unseen words).
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            # NOTE(review): defined but never called — confirm removable.
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)),
                             doc)
                html_visualizer.write_headline("Doc rank {}".format(i))
                html_visualizer.multirow_print(cells, width=20)
            except KeyError:
                # Document not in the cache; skip it silently.
                pass
        html_visualizer.write_paragraph("Not found: {}".format(not_found))
def train_split():
    """Return (all train claims, the validation side of the 7:3 split)."""
    claim_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(claim_ids)
    _, val = split_7_3(claims)
    return claims, val
def doc_lm_scoring():
    """Render an HTML report ("doc_lm_doc_level.html") of passage scores for
    every train claim, scored by claim-LM vs background-LM log-odds.

    For each claim, passages from the top-10 ranked documents are scored and
    sorted; the report shows top-5 / bottom-5 / random-5 score lines and the
    full content of positively scored passages. Finally a summary of how
    many claims have at least one positive passage is printed.
    """
    gold = get_claim_perspective_id_dict()
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims  # NOTE(review): no-op; likely a leftover truncation slot
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.5  # smoothing weight between claim LM and background LM
    html_visualizer = HtmlVisualizer("doc_lm_doc_level.html")
    tokenizer = PCTokenizer()
    random_passages = []   # passages carried over from earlier claims,
                           # sampled as a random baseline for later claims
    num_pos_sum = 0        # total count of positively scored passages
    num_pos_exists = 0     # number of claims with >=1 positive passage
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # for cluster in clusters:
        #     html_visualizer.write_paragraph("---")
        #     p_text_list: List[str] = lmap(perspective_getter, cluster)
        #     for text in p_text_list:
        #         html_visualizer.write_paragraph(text)
        #     html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        # The claim's own average log-odds acts as the display threshold.
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        threshold = average(scores)
        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            # Stem, record misses, return log-odds (0 for unseen words).
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            # NOTE(review): defined but never called — confirm removable.
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        def get_passage_score(p):
            # Mean log-odds over all tokens; 0 for an empty passage.
            return sum([log_odd[tokenizer.stemmer.stem(t)] for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        passages.sort(key=lambda x: x[1], reverse=True)
        html_visualizer.write_paragraph("Threshold {}".format(threshold))
        top5_scores = right(passages[:5])
        bot5_scores = right(passages[-5:])
        # Random baseline: sample from passages of previously seen claims.
        if len(random_passages) > 5:
            random_sel_pssages = random.choices(random_passages, k=5)
        else:
            random_sel_pssages = []
        random5_scores = lmap(get_passage_score, random_sel_pssages)

        def score_line(scores):
            return " ".join(lmap(two_digit_float, scores))

        html_visualizer.write_paragraph("top 5: " + score_line(top5_scores))
        html_visualizer.write_paragraph("bot 5: " + score_line(bot5_scores))
        html_visualizer.write_paragraph("random 5: " + score_line(random5_scores))
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        def print_doc(doc, html_visualizer, score):
            # Render one passage token-by-token, cells weighted by log-odds.
            cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
            html_visualizer.write_headline("score={}".format(score))
            html_visualizer.multirow_print(cells, width=20)

        random_passages.extend(left(passages))
        # Skip detailed rendering for claims with a negative threshold.
        if threshold < 0:
            continue
        # All positively scored passages, best first.
        for doc, score in passages:
            if score < 0:
                break
            print_doc(doc, html_visualizer, score)
        html_visualizer.write_headline("Bottom 5")
        for doc, score in passages[-5:]:
            print_doc(doc, html_visualizer, score)
    print("{} claims. {} docs on {} claims".format(len(claims),
                                                   num_pos_sum,
                                                   num_pos_exists))