def load_data(batch_size):
    train_data = load_pickle_from(os.path.join(data_path, "msmarco", "train.pickle"))
    dev_data = load_pickle_from(os.path.join(data_path, "msmarco", "dev.pickle"))
    train_batches = get_batches_ex(train_data, batch_size, 4)
    dev_batches = get_batches_ex(dev_data, batch_size, 4)
    return train_batches, dev_batches

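# Usage sketch (hypothetical, not part of the original module): the returned
# batch lists can be iterated directly by a train/eval loop. The batch size
# and the `run_training_step` / `run_eval_step` callables below are assumed
# names for illustration only.
#
#   train_batches, dev_batches = load_data(batch_size=32)
#   for batch in train_batches:
#       run_training_step(batch)
#   for batch in dev_batches:
#       run_eval_step(batch)
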
def main(): info_dir = os.path.join(job_man_dir, "best_seg_prediction_gen_train_info") job_id = 0 info_file_path = os.path.join(info_dir, str(job_id) + ".info") print(info_file_path) info = json.load(open(info_file_path, "r")) prediction_dir = "output/mmd_ss/mmd_Z_50000" prediction_file = os.path.join(prediction_dir, str(job_id) + ".score") pred_data: List[Dict] = join_prediction_with_info(prediction_file, info) target_qdid = ("1000633", "D144400") saved_entries = [] for key, entry in info.items(): if entry['qid'] == "1000633" and entry['doc_id'] == 'D144400': saved_entries.append(entry) print(entry) print('--') for entry in pred_data: if entry['qid'] == "1000633" and entry['doc_id'] == 'D144400': print(entry) qid = "1000633" sr_path = os.path.join(job_man_dir, "seg_resource_train", qid) sr_per_query: SRPerQuery = load_pickle_from(sr_path) for sr_per_query_doc in sr_per_query.sr_per_query_doc: if sr_per_query_doc.doc_id == "D144400": print("doc {} has {} segs".format(sr_per_query_doc.doc_id, len(sr_per_query_doc.segs)))
def main(): load_path = os.path.join(output_path, "word2vec_clueweb12_13B") model: gensim.models.Word2Vec = load_pickle_from(os.path.join(load_path)) print(model.trainables.syn1neg.shape) terms = ['proposition', 'issue', 'reason'] v_sum = np.sum([model[t] for t in terms], axis=0) print(v_sum) j = np.argmax(v_sum) print(list([model[t][j] for t in terms])) candi = model.wv.similar_by_vector(v_sum, topn=300) j_rank = np.argsort([model[word][j] for word, _ in candi])[::-1] for j_idx in j_rank[:20]: print(candi[j_idx]) word = terms[0] term_id = model.wv.vocab[word].index #print(word, model.wv.vectors[term_id], model[word]) scores = np.dot(v_sum, model.trainables.syn1neg.T) print(scores.shape) rank_by_context = np.argsort(scores)[::-1] for j_idx in rank_by_context[:20]: print(model.wv.index2word[j_idx])
def main():
    target_data_idx = int(sys.argv[1])
    info_path = os.path.join(job_man_dir,
                             "robust_w_data_id_desc_info_pickle",
                             "{}".format(target_data_idx))
    max_seq_length = 512
    info = load_pickle_from(info_path)
    demo_score(info, max_seq_length)

def main():
    num_layers = 12
    dva = DictValueAverage()
    all_val = defaultdict(list)
    for i in range(1):
        save_path = at_output_dir("lms_scores", str(i) + ".pickle")
        output_d = load_pickle_from(save_path)
        input_mask = output_d['input_mask']  # [num_inst, seq_length]
        for layer_no in range(num_layers):
            probs = sigmoid(output_d['logits'][layer_no])  # [num_inst, seq_length, 2]
            num_inst, seq_length, maybe_2 = np.shape(probs)
            for data_idx in range(num_inst):
                for seq_idx in range(seq_length):
                    if input_mask[data_idx, seq_idx]:
                        key = layer_no
                        v = probs[data_idx, seq_idx, 1]
                        dva.add(key, v)
                        all_val[key].append(v)

    for k, v in dva.all_average().items():
        print(k, v)

    for k, l in all_val.items():
        max_val = max(l)
        print(k, max_val)

def main(config):
    split = config['split']
    top_k = config['top_k']
    word_prob_path = config['word_prob_path']
    run_name = config['run_name']
    save_path = config['save_path']

    if top_k == 50:
        candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_as_qck(split)
    elif top_k == 1000:
        candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_1k_as_qck(split)
    else:
        assert False

    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)
    all_ranked_list_entries = []
    for query_id, d in per_query_infos.items():
        scorer = Scorer(d, True)
        candidates: List[QCKCandidate] = candidate_d[query_id]
        entries = []
        for c in candidates:
            e = c.id, scorer.score(c.text)
            entries.append(e)
        entries.sort(key=get_second, reverse=True)
        ranked_list_entries = scores_to_ranked_list_entries(entries, run_name, query_id)
        all_ranked_list_entries.extend(ranked_list_entries)

    write_trec_ranked_list_entry(all_ranked_list_entries, save_path)

def get_candiset(i):
    p = os.path.join(cpath.data_path, "stream_pickled", "CandiSet_{}_0".format(i))
    if not os.path.exists(p):
        return None
    return load_pickle_from(p)

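# Usage sketch (hypothetical): because get_candiset returns None for a missing
# shard, callers can scan shards sequentially until the first gap. The loop
# below is illustrative only; `process` is an assumed per-shard handler.
#
#   import itertools
#   for i in itertools.count():
#       candiset = get_candiset(i)
#       if candiset is None:
#           break
#       process(candiset)
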
def load_info_from_compressed(pickle_path):
    tprint("loading info pickle")
    output_d = {}
    data = load_pickle_from(pickle_path)
    tprint("decompressing...")
    for data_id, value_d in data.items():
        new_entry = decompress_seg_ids_entry(value_d)
        output_d[data_id] = new_entry
    return output_d

def get_ap_from_file_path(input_path):
    tf_prediction_data = load_pickle_from(input_path)
    tf_prediction_data = flatten_batches(tf_prediction_data)
    logits = tf_prediction_data["logits"]
    label_ids = tf_prediction_data["label_ids"]
    scores = lmap(logit_to_score_softmax, logits)
    assert len(scores) == len(label_ids)
    return get_ap(label_ids, scores)

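# Usage sketch (hypothetical): averaging AP over a directory of per-shard
# prediction pickles. The directory path and file layout are assumptions made
# for illustration, not taken from the original code.
#
#   pred_dir = "output/predictions"
#   ap_list = [get_ap_from_file_path(os.path.join(pred_dir, name))
#              for name in os.listdir(pred_dir)]
#   print("mean AP: {0:.4f}".format(sum(ap_list) / len(ap_list)))
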
def __init__(
        self,
        out_path,
        input_file,
        train_fn: GraphEmbeddingTrainer,
):
    self.out_dir = out_path
    self.corpus_d: Dict[int, List[List[str]]] = load_pickle_from(input_file)
    self.key_list = list(self.corpus_d.keys())
    self.key_list.sort()
    self.train_fn: GraphEmbeddingTrainer = train_fn

def work(self, job_id):
    tfrecord_path = os.path.join(self.input_dir, str(job_id))
    features = load_record(tfrecord_path)
    save_path = os.path.join(self.out_dir, str(job_id))
    rel_score_path = os.path.join(self.rel_ex_score_dir, str(job_id))
    rel_score = load_pickle_from(rel_score_path)
    writer = RecordWriterWrap(save_path)
    for f in collect_passages(features, self.relevance_scores, self.cpid_to_label,
                              self.num_max_para, self.window_size, rel_score):
        writer.write_feature(f)
    writer.close()

def load_corpus():
    dir_path = FilePath("/mnt/nfs/work3/youngwookim/data/bert_tf/clueweb12_13B_word_tokens/")
    corpus = []
    cnt = 0
    for file_path in get_dir_files(dir_path):
        tokens_list = load_pickle_from(file_path)
        corpus.extend(tokens_list)
        if cnt > 50:
            break
        cnt += 1
    return corpus

def generate_selected_training_data_for_many_runs(
        target_data_idx, info_dir, max_seq_length,
        score_and_save_dir: List,
        generate_selected_training_data_fn):
    interval_start_list = left(robust_query_intervals)
    key = interval_start_list[target_data_idx]
    info_path = os.path.join(info_dir, str(key))
    tprint("loading info: " + info_path)
    info = load_pickle_from(info_path)
    for score_dir, save_dir in score_and_save_dir:
        exist_or_mkdir(save_dir)
        tprint(save_dir)
        generate_selected_training_data_fn(info, key, max_seq_length, save_dir, score_dir)

def main():
    target_data_idx = int(sys.argv[1])
    max_seq_length = int(sys.argv[2])
    max_seg = int(sys.argv[3])
    info_path = os.path.join(job_man_dir,
                             "robust_w_data_id_desc_info_pickle",
                             "{}".format(target_data_idx))
    info = load_pickle_from(info_path)
    save_dir_path = at_output_dir("robust_seg_sel",
                                  "exact_match{}_{}".format(max_seq_length, max_seg))
    exist_or_mkdir(save_dir_path)
    get_score_fn = get_score_fn_functor()
    generate_selected_training_data(info, max_seq_length, save_dir_path, get_score_fn, max_seg)

def generate_selected_training_data_loop(split_no, score_dir, info_dir, max_seq_length,
                                         save_dir, generate_selected_training_data_fn):
    train_items, held_out = get_robust_splits(split_no)
    print(train_items)
    exist_or_mkdir(save_dir)
    for key in train_items:
        info_path = os.path.join(info_dir, str(key))
        # info = load_combine_info_jsons(info_path, False, False)
        tprint("loading info: " + info_path)
        info = load_pickle_from(info_path)
        # info = load_info_from_compressed(info_path)
        generate_selected_training_data_fn(info, key, max_seq_length, save_dir, score_dir)

def parse_prediction_and_eval(prediction_path, payload_type, data_id, k=100):
    payload_info = get_payload_info(payload_type, data_id)
    tf_prediction_data = load_pickle_from(prediction_path)
    all_ranked_list = generate_ranked_list(tf_prediction_data, payload_info, k)
    text_output_path = prediction_path + ".txt"
    st = int(data_id)
    write_ranked_list(range(st, st + 50), all_ranked_list, text_output_path)
    pred_list = []
    for ranked_list in all_ranked_list:
        pred = [x[0] for x in ranked_list]
        pred_list.append(pred)
    return eval(pred_list, data_id)

def collect_ngram_count(dir_path, ed, ngram_range, st):
    all_counter = {}
    df_counter = {}
    for n in ngram_range:
        all_counter[n] = Counter()
        df_counter[n] = Counter()

    for i in range(st, ed):
        file_path = os.path.join(dir_path, str(i))
        features: List[PCNGramFeature] = load_pickle_from(file_path)
        for f in features:
            for n in ngram_range:
                counter: Counter = f.n_grams[n]
                all_counter[n].update(counter)
                for key in counter:
                    df_counter[n][key] += 1
    return df_counter

def save_to_trec_format(prediction_path, payload_type, data_id, num_candidate, run_name, save_path):
    payload_info = get_payload_info(payload_type, data_id)
    tf_prediction_data = load_pickle_from(prediction_path)
    all_ranked_list = generate_ranked_list(tf_prediction_data, payload_info, num_candidate)
    st = int(data_id)
    query_ids = [str(i) for i in range(st, st + 50)]
    all_entries: List[Tuple[str, List[TrecRankedListEntry]]] = []
    for query_id, ranked_list in zip(query_ids, all_ranked_list):
        rl = [TrecRankedListEntry(query_id, doc_id, rank, score, run_name)
              for doc_id, rank, score in ranked_list]
        all_entries.append((query_id, rl))
    write_ranked_list_from_s(dict(all_entries), save_path)

def main(info_path, input_type, label_dict_path, save_path):
    f_handler = get_format_handler(input_type)
    info: Dict[str, Dict] = load_combine_info_jsons(info_path,
                                                    f_handler.get_mapping(),
                                                    f_handler.drop_kdp())
    label_dict: Dict[Tuple[str, str], bool] = load_pickle_from(label_dict_path)
    l = []
    for entry in info.values():
        key = f_handler.get_pair_id(entry)
        query_id, candidate_id = key
        if key in label_dict:
            correctness = label_dict[key]
        else:
            correctness = False
        e = TrecRelevanceJudgementEntry(query_id, candidate_id, int(correctness))
        l.append(e)
    write_trec_relevance_judgement(l, save_path)

def work(self, job_id):
    file_no = int(job_id / 10)
    idx = job_id % 10
    pc_co_occurrence = load_pickle_from(os.path.join(self.input_dir, str(file_no)))
    cid, pair_counter = pc_co_occurrence[idx]
    edges, valid_vertices = select_vertices_edges(pair_counter)
    try:
        init_p_dict = Counter(dict(self.prob_score_d[cid]))
        result = run_biased_random_walk(edges, valid_vertices, self.max_repeat,
                                        self.p_reset, init_p_dict)
        result = Counter(result)
        output = cid, result
        save_path = os.path.join(self.out_dir, str(job_id))
        pickle.dump(output, open(save_path, "wb"))
    except KeyError as e:
        print(e)

def main(): data_name = "wiki" for method in ["deletion", "LIME"]: for config in [DropStop, Config2, ConfigShort]: data_method_str = "{}_{}".format(data_name, method) save_dir = os.path.join(output_path, "genex", data_method_str) for i in range(100): try: idx_str = "{0:02d}".format(i) score_name = "{}_{}_{}".format(data_name, idx_str, method) save_name = "{}_{}.txt".format(score_name, config.name) save_path = os.path.join(save_dir, save_name) score_path = os.path.join(data_path, "cache", data_method_str, score_name + ".pickle") scores: List[np.array] = load_pickle_from(score_path) data: List[PackedInstance] = load_packed(data_name) save_score_to_file(data, config, save_path, scores) except: print(data_name) raise
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    save_path = config['save_path']
    threshold = config['threshold']
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        if len(tokens) == 1 and tokens[0] in stopwords:
            return True
        else:
            return False

    tokenizer = get_tokenizer()
    all_d = {}
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue
            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        entry.sort(key=get_second, reverse=True)
        word_list = []
        for word, diff, pos, neg in entry[:100]:
            if diff > threshold:
                word = word.strip()
                word_list.append(word)
        all_d[query_id] = word_list

    json.dump(all_d, open(save_path, "w"))

def main():
    input_path = sys.argv[1]
    tf_prediction_data = load_pickle_from(input_path)
    tf_prediction_data = flatten_batches(tf_prediction_data)
    logits = tf_prediction_data["logits"]
    label_ids = tf_prediction_data["label_ids"]
    scores = lmap(logit_to_score_softmax, logits)
    assert len(scores) == len(label_ids)
    print("{} data points".format(len(scores)))

    todo = [(get_auc, "auc"), (get_ap, "ap")]
    rows = []
    for metric_fn, metric_name in todo:
        score = metric_fn(label_ids, scores)
        row = [metric_name, score]
        rows.append(row)
    print_table(rows)

def build_ngram_feature(dir_path, st, ed):
    selected_ngram_set = load_from_pickle("selected_ngram_feature")
    ngram_range = [1, 2, 3]
    all_data_point = []
    for i in range(st, ed):
        file_path = os.path.join(dir_path, str(i))
        features: List[PCNGramFeature] = load_pickle_from(file_path)
        for f in features:
            vector_builder = []
            for n in ngram_range:
                counter: Counter = f.n_grams[n]
                vector = [counter[k] for k in selected_ngram_set[n]]
                vector_builder.extend(vector)
            r = PCVectorFeature(f.claim_pers, vector_builder)
            all_data_point.append(r)

    save_name = os.path.basename(dir_path) + "_ngram_features"
    save_to_pickle(all_data_point, save_name)

def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        if len(tokens) == 1 and tokens[0] in stopwords:
            return True
        else:
            return False

    tokenizer = get_tokenizer()
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue
            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        print(query_id, claim_d[int(query_id)])
        entry.sort(key=get_second, reverse=True)
        for word, diff, pos, neg in entry[:100]:
            word = word.strip()
            print("{0}\t{1:.2f}\t{2:.2f}\t{3:.2f}".format(word, diff, pos, neg))

def save_to_trec_format():
    prediction_path_format = "output/robust/A_train_{}.score"
    payload_type = "first_clean"
    num_candidate = 100
    run_name = "train_eval"
    save_path = "output/ranked_list/robust_train_pred.txt"
    all_entries = []
    for data_id in [301, 351, 401, 601]:
        payload_info = get_payload_info(payload_type, str(data_id))
        prediction_path = prediction_path_format.format(data_id)
        tf_prediction_data = load_pickle_from(prediction_path)
        all_ranked_list = generate_ranked_list(tf_prediction_data, payload_info, num_candidate)
        st = int(data_id)
        query_ids = [str(i) for i in range(st, st + 50)]
        for query_id, ranked_list in zip(query_ids, all_ranked_list):
            rl = [TrecRankedListEntry(query_id, doc_id, rank, score, run_name)
                  for doc_id, rank, score in ranked_list]
            all_entries.append((query_id, rl))

    write_ranked_list_from_s(dict(all_entries), save_path)

def load_entries(cid):
    save_root = os.path.join(output_path, "cppnc", "cid_grouped")
    save_path = os.path.join(save_root, cid)
    return load_pickle_from(save_path)

def predict_nli(model_path):
    hp = HPGenEx()
    run_name = "msmarco"
    dev_batches = load_pickle_from(os.path.join(data_path, "msmarco", "dev.pickle"))[:10]
    predict(hp, run_name, dev_batches, model_path, load_model)

def main(): prediction_file_path = at_output_dir("robust", "rob_dense2_pred.score") info_file_path = at_job_man_dir1("robust_predict_desc_128_step16_2_info") queries: Dict[str, str] = load_robust_04_query("desc") tokenizer = get_tokenizer() query_token_len_d = {} for qid, q_text in queries.items(): query_token_len_d[qid] = len(tokenizer.tokenize(q_text)) step_size = 16 window_size = 128 out_entries: List[AnalyzedDoc] = token_score_by_ablation( info_file_path, prediction_file_path, query_token_len_d, step_size, window_size) qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt" judgement_d = load_qrels_structured(qrel_path) html = HtmlVisualizer("robust_desc_128_step16_2.html", use_tooltip=True) tprint("loading tokens pickles") tokens_d: Dict[str, List[str]] = load_pickle_from( os.path.join(sydney_working_dir, "RobustPredictTokens3", "1")) tprint("Now printing") n_printed = 0 def transform(x): return 3 * (math.pow(x - 0.5, 3) + math.pow(0.5, 3)) n_pos = 0 n_neg = 0 for e in out_entries: max_score: float = max( lmap(SegmentScorePair.get_max_score, flatten(e.token_info.values()))) if max_score < 0.6: if n_neg > n_pos: continue else: n_neg += 1 pass else: n_pos += 1 n_printed += 1 if n_printed > 500: break doc_tokens: List[str] = tokens_d[e.doc_id] score_len = max(e.token_info.keys()) + 1 judgement: Dict[str, int] = judgement_d[e.query_id] label = judgement[e.doc_id] if not len(doc_tokens) <= score_len < len(doc_tokens) + window_size: print("doc length : ", len(doc_tokens)) print("score len:", score_len) print("doc length +step_size: ", len(doc_tokens) + step_size) continue row = [] q_text = queries[e.query_id] html.write_paragraph("qid: " + e.query_id) html.write_paragraph("q_text: " + q_text) html.write_paragraph("Pred: {0:.2f}".format(max_score)) html.write_paragraph("Label: {0:.2f}".format(label)) for idx in range(score_len): token = doc_tokens[idx] if idx < len(doc_tokens) else '[-]' token_info: List[SegmentScorePair] = e.token_info[idx] full_scores: List[float] = lmap(SegmentScorePair.get_score_diff, token_info) full_score_str = " ".join(lmap(two_digit_float, full_scores)) # 1 ~ -1 score = average(full_scores) if score > 0: color = "B" else: color = "R" normalized_score = transform(abs(score)) * 200 c = get_tooltip_cell(token, full_score_str) c.highlight_score = normalized_score c.target_color = color row.append(c) html.multirow_print(row, 16)
def main(): prediction_file_path = at_output_dir("robust", "rob_dense_pred.score") info_file_path = at_job_man_dir1("robust_predict_desc_128_step16_info") queries: Dict[str, str] = load_robust_04_query("desc") tokenizer = get_tokenizer() query_token_len_d = {} for qid, q_text in queries.items(): query_token_len_d[qid] = len(tokenizer.tokenize(q_text)) step_size = 16 window_size = 128 out_entries: List[DocTokenScore] = collect_token_scores( info_file_path, prediction_file_path, query_token_len_d, step_size, window_size) qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt" judgement_d = load_qrels_structured(qrel_path) html = HtmlVisualizer("robust_desc_128_step16.html", use_tooltip=True) tprint("loading tokens pickles") tokens_d: Dict[str, List[str]] = load_pickle_from( os.path.join(sydney_working_dir, "RobustPredictTokens3", "1")) tprint("Now printing") n_printed = 0 def transform(x): return 3 * (math.pow(x - 0.5, 3) + math.pow(0.5, 3)) for e in out_entries: max_score = e.max_segment_score() if max_score < 0.6: continue n_printed += 1 if n_printed > 10: break doc_tokens: List[str] = tokens_d[e.doc_id] score_len = len(e.scores) judgement: Dict[str, int] = judgement_d[e.query_id] label = judgement[e.doc_id] if not len(doc_tokens) <= score_len < len(doc_tokens) + window_size: print("doc length : ", len(doc_tokens)) print("score len:", score_len) print("doc length +step_size: ", len(doc_tokens) + step_size) raise IndexError row = [] q_text = queries[e.query_id] html.write_paragraph("qid: " + e.query_id) html.write_paragraph("q_text: " + q_text) html.write_paragraph("Pred: {0:.2f}".format(max_score)) html.write_paragraph("Label: {0:.2f}".format(label)) for idx in range(score_len): token = doc_tokens[idx] if idx < len(doc_tokens) else '[-]' full_scores = e.full_scores[idx] full_score_str = " ".join(lmap(two_digit_float, full_scores)) score = e.scores[idx] normalized_score = transform(score) * 200 c = get_tooltip_cell(token, full_score_str) c.highlight_score = normalized_score row.append(c) html.multirow_print(row, 16)