def __init__(self, query_type="desc", neg_k=1000):
    """Load Robust04 queries, relevance judgements, BM25 rankings and a tokenizer.

    :param query_type: which query field to load (e.g. "desc" or "title").
    :param neg_k: how many negative candidates to keep per query.
    """
    # NOTE(review): absolute path assumes one specific machine's layout — confirm.
    judgement_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(judgement_path)
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.tokenizer = get_tokenizer()
    self.neg_k = neg_k
def __init__(self, encoder, max_seq_length, top_k=100, query_type="title"):
    """Hold the encoder plus pre-tokenized docs, queries and BM25 top-k ranking.

    :param encoder: object used later to encode query/document pairs.
    :param max_seq_length: maximum sequence length for encoded instances.
    :param top_k: number of top-ranked documents to consider.
    :param query_type: which query field to load (e.g. "title" or "desc").
    """
    self.data = self.load_tokens_from_pickles()
    self.encoder = encoder
    self.max_seq_length = max_seq_length
    self.top_k = top_k
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.tokenizer = get_tokenizer()
def __init__(self, encoder, max_seq_length, query_type="title"):
    """Assemble tokens, queries, judgements and rankings for instance generation.

    :param encoder: object used later to encode query/document pairs.
    :param max_seq_length: maximum sequence length for encoded instances.
    :param query_type: which query field to load (e.g. "title" or "desc").
    """
    self.data = self.load_tokens()
    self.encoder = encoder
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.tokenizer = get_tokenizer()
    # NOTE(review): absolute path assumes one specific machine's layout — confirm.
    path_to_qrels = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(path_to_qrels)
def __init__(self, max_seq_length, use_many_seg_ids=False):
    """Prepare desc queries, a tokenizer, and a piece-score parser for probing.

    :param max_seq_length: maximum sequence length for encoded instances.
    :param use_many_seg_ids: toggles an alternative segment-id scheme.
    """
    self.max_seq_length = max_seq_length
    self.use_many_seg_ids = use_many_seg_ids
    self.probe_config = Config1()
    self.queries: Dict[str, str] = load_robust_04_query("desc")
    self.tokenizer = get_tokenizer()
    # Parser needs the queries, the qid list (as strings), and the probe config.
    qids_as_str = lmap(str, get_robust_qid_list())
    self.piece_score_parser = PieceScoreParser(self.queries, qids_as_str, self.probe_config)
def __init__(self, doc_max_length, query_type="title", neg_k=1000, pos_only=True):
    """Load tokens, queries, judgements and rankings for document-level sampling.

    :param doc_max_length: maximum document length to keep.
    :param query_type: which query field to load (e.g. "title" or "desc").
    :param neg_k: how many negative candidates to keep per query.
    :param pos_only: when True, restrict generation to positive examples.
    """
    self.data = self.load_tokens()
    self.doc_max_length = doc_max_length
    self.neg_k = neg_k
    self.pos_only = pos_only
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.tokenizer = get_tokenizer()
    # NOTE(review): absolute path assumes one specific machine's layout — confirm.
    rob04_qrels = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(rob04_qrels)
def __init__(self, encoder, max_seq_length, query_type, target_selection_fn: Callable[[str, str, List], List[int]]):
    """Wire up encoding resources plus a caller-supplied target-selection strategy.

    :param encoder: object used later to encode query/document pairs.
    :param max_seq_length: maximum sequence length for encoded instances.
    :param query_type: which query field to load (e.g. "title" or "desc").
    :param target_selection_fn: callable (query_id, doc_id, segments) -> indices
        choosing which segments become training targets.
    """
    self.data = self.load_tokens()
    self.encoder = encoder
    self.max_seq_length = max_seq_length
    self.target_selection_fn: Callable[[str, str, List], List[int]] = target_selection_fn
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.tokenizer = get_tokenizer()
    # NOTE(review): absolute path assumes one specific machine's layout — confirm.
    qrels_file = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrels_file)
def main():
    """Write the tokenized length of each Robust04 'desc' query, one per line."""
    query_type = "desc"
    queries = load_robust_04_query(query_type)
    qid_list = get_robust_qid_list()
    tokenizer = get_tokenizer()
    out_path = at_output_dir("robust", "desc_query_len.txt")
    # `with` guarantees the file is closed even if tokenization raises
    # (the original used a bare open()/close() pair, leaking on error).
    with open(out_path, "w") as f:
        for qid in qid_list:
            query = queries[str(qid)]
            n_terms = len(tokenizer.tokenize(query))
            f.write("{}\n".format(n_terms))
def __init__(self, encoder, max_seq_length, score_d, query_type="title", neg_k=1000):
    """Hold per-document segment scores alongside the usual encoding resources.

    :param encoder: object used later to encode query/document pairs.
    :param max_seq_length: maximum sequence length for encoded instances.
    :param score_d: mapping from an id string to a list of per-segment scores.
    :param query_type: which query field to load (e.g. "title" or "desc").
    :param neg_k: how many negative candidates to keep per query.
    """
    self.data = self.load_tokens()
    self.encoder = encoder
    self.max_seq_length = max_seq_length
    self.score_d: Dict[str, List[float]] = score_d
    self.neg_k = neg_k
    self.n_seg_per_doc = 4  # fixed segment budget per document
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.tokenizer = get_tokenizer()
    # NOTE(review): absolute path assumes one specific machine's layout — confirm.
    qrels_file = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrels_file)
def __init__(self, encoder, max_seq_length_per_inst, num_doc_per_inst, num_seg_per_inst, query_type="title", neg_k=1000):
    """Configure a generator that packs several docs/segments into one instance.

    :param encoder: encoder applied to all segments of an instance.
    :param max_seq_length_per_inst: maximum sequence length per instance.
    :param num_doc_per_inst: number of documents packed into one instance.
    :param num_seg_per_inst: number of segments packed into one instance.
    :param query_type: which query field to load (e.g. "title" or "desc").
    :param neg_k: how many negative candidates to keep per query.
    """
    self.data = self.load_tokens()
    self.all_segment_encoder = encoder
    self.max_seq_length = max_seq_length_per_inst
    self.num_doc_per_inst = num_doc_per_inst
    self.num_seg_per_inst = num_seg_per_inst
    self.neg_k = neg_k
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.tokenizer = get_tokenizer()
    # NOTE(review): absolute path assumes one specific machine's layout — confirm.
    qrels_file = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrels_file)
def __init__(self, encoder, max_seq_length, scores, query_type="title", target_selection="best"):
    """Set up encoding resources and pick a target-index selection strategy by name.

    :param encoder: object used later to encode query/document pairs.
    :param max_seq_length: maximum sequence length for encoded instances.
    :param scores: mapping (query_id, doc_id, segment_idx) -> score.
    :param query_type: which query field to load (e.g. "title" or "desc").
    :param target_selection: strategy key; one of 'best', 'all',
        'first_and_best', 'best_or_over_09', 'random_over_09'.
    """
    self.data = self.load_tokens()
    self.encoder = encoder
    self.max_seq_length = max_seq_length
    self.scores: Dict[Tuple[str, str, int], float] = scores
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.tokenizer = get_tokenizer()
    # NOTE(review): absolute path assumes one specific machine's layout — confirm.
    qrel_file = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrel_file)
    # Dispatch table: strategy name -> index-selection function.
    # Unknown names raise KeyError here, at construction time.
    strategy_table = {
        'best': get_target_indices_get_best,
        'all': get_target_indices_all,
        'first_and_best': get_target_indices_first_and_best,
        'best_or_over_09': get_target_indices_best_or_over_09,
        'random_over_09': get_target_indices_random_over_09
    }
    self.get_target_indices: Callable[[], List[int]] = strategy_table[target_selection]
def main():
    """Compare windowed vs. windowed_small segment scores and render the diffs as HTML.

    Reads two score files for the same (query, doc) pairs, computes per-piece
    score differences between the two window sizes, and highlights the tokens
    of each piece in an HTML report ("windowed.html").
    """
    n_factor = 16
    step_size = 16
    max_seq_length = 128
    max_seq_length2 = 128 - 16  # the "small" window is 16 tokens shorter
    batch_size = 8
    info_file_path = at_output_dir("robust", "seg_info")
    queries = load_robust_04_query("desc")
    qid_list = get_robust_qid_list()
    f_handler = get_format_handler("qc")
    info: Dict = load_combine_info_jsons(info_file_path, f_handler.get_mapping(), f_handler.drop_kdp())
    print(len(info))
    tokenizer = get_tokenizer()
    # Only job index 1 is processed here; widen this list to cover more jobs.
    for job_idx in [1]:
        qid = qid_list[job_idx]
        query = queries[str(qid)]
        q_term_length = len(tokenizer.tokenize(query))
        data_path1 = os.path.join(output_path, "robust", "windowed_{}.score".format(job_idx))
        data_path2 = os.path.join(output_path, "robust", "windowed_small_{}.score".format(job_idx))
        data1 = OutputViewer(data_path1, n_factor, batch_size)
        data2 = OutputViewer(data_path2, n_factor, batch_size)
        # 3 accounts for special tokens ([CLS]/[SEP]s — TODO confirm) plus the query terms.
        segment_len = max_seq_length - 3 - q_term_length
        segment_len2 = max_seq_length2 - 3 - q_term_length
        outputs = []
        # Iterate both score files in lockstep; entries must describe the same pair.
        for d1, d2 in zip(data1, data2):
            cur_info1 = info[d1['data_id']]
            cur_info2 = info[d2['data_id']]
            query_doc_id1 = f_handler.get_pair_id(cur_info1)
            query_doc_id2 = f_handler.get_pair_id(cur_info2)
            assert query_doc_id1 == query_doc_id2
            doc = d1['doc']
            probs = get_probs(d1['logits'])
            probs2 = get_probs(d2['logits'])
            n_pred_true = np.count_nonzero(np.less(0.5, probs))
            print(n_pred_true, len(probs))
            seg_scores: List[Tuple[int, int, float]] = get_piece_scores(
                n_factor, probs, segment_len, step_size)
            seg_scores2: List[Tuple[int, int, float]] = get_piece_scores(
                n_factor, probs2, segment_len2, step_size)
            ss_list = []
            for st, ed, score in seg_scores:
                try:
                    # Match the small-window piece that ends at the same position.
                    st2, ed2, score2 = find_where(lambda x: x[1] == ed, seg_scores2)
                    assert ed == ed2
                    assert st < st2
                    # The tokens in [st, st2) are exactly those the small window dropped.
                    tokens = tokenizer.convert_ids_to_tokens(doc[st:st2])
                    diff = score - score2
                    ss = ScoredPiece(st, st2, diff, tokens)
                    ss_list.append(ss)
                except StopIteration:
                    # No small-window piece ends here; skip this piece.
                    pass
            outputs.append((probs, probs2, query_doc_id1, ss_list))
    # NOTE(review): reconstructed placement — with a single job above, behavior is
    # the same either inside or after the job loop; `query` is the last job's query.
    html = HtmlVisualizer("windowed.html")
    for probs, probs2, query_doc_id, ss_list in outputs:
        html.write_paragraph(str(query_doc_id))
        html.write_paragraph("Query: " + query)
        ss_list.sort(key=lambda ss: ss.st)
        prev_end = None
        cells = []
        prob_str1 = lmap(two_digit_float, probs)
        # "8.88" is a dummy leading entry — presumably aligns row 1 with row 2; verify.
        prob_str1 = ["8.88"] + prob_str1
        prob_str2 = lmap(two_digit_float, probs2)
        html.write_paragraph(" ".join(prob_str1))
        html.write_paragraph(" ".join(prob_str2))
        for ss in ss_list:
            if prev_end is not None:
                # Pieces must tile the document without gaps or overlaps.
                assert prev_end == ss.st
            else:
                print(ss.st)
            score = abs(int(100 * ss.score))
            color = "B" if score > 0 else "R"
            cells.extend(
                [Cell(t, score, target_color=color) for t in ss.tokens])
            prev_end = ss.ed
        html.multirow_print(cells)
def main():
    """Render per-token dense prediction scores for Robust04 docs as tooltip HTML.

    Collects token-level scores from a prediction file, keeps up to 10 documents
    whose max segment score is >= 0.6, and writes each as a colored token grid
    to "robust_desc_128_step16.html".
    """
    prediction_file_path = at_output_dir("robust", "rob_dense_pred.score")
    info_file_path = at_job_man_dir1("robust_predict_desc_128_step16_info")
    queries: Dict[str, str] = load_robust_04_query("desc")
    tokenizer = get_tokenizer()
    # Precompute tokenized query length per qid; used to split query vs doc scores.
    query_token_len_d = {}
    for qid, q_text in queries.items():
        query_token_len_d[qid] = len(tokenizer.tokenize(q_text))
    step_size = 16
    window_size = 128
    out_entries: List[DocTokenScore] = collect_token_scores(
        info_file_path, prediction_file_path, query_token_len_d, step_size, window_size)
    # NOTE(review): hard-coded local qrels path — assumes this machine's layout.
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement_d = load_qrels_structured(qrel_path)
    html = HtmlVisualizer("robust_desc_128_step16.html", use_tooltip=True)
    tprint("loading tokens pickles")
    tokens_d: Dict[str, List[str]] = load_pickle_from(
        os.path.join(sydney_working_dir, "RobustPredictTokens3", "1"))
    tprint("Now printing")
    n_printed = 0

    # Cubic emphasis curve mapping a [0, 1] score to a highlight weight.
    def transform(x):
        return 3 * (math.pow(x - 0.5, 3) + math.pow(0.5, 3))
    for e in out_entries:
        max_score = e.max_segment_score()
        # Only visualize documents with a confident positive segment.
        if max_score < 0.6:
            continue
        n_printed += 1
        if n_printed > 10:
            break
        doc_tokens: List[str] = tokens_d[e.doc_id]
        score_len = len(e.scores)
        judgement: Dict[str, int] = judgement_d[e.query_id]
        label = judgement[e.doc_id]
        # Scores may run past the doc end by up to one window (padding); anything
        # else indicates misaligned inputs, so fail loudly.
        if not len(doc_tokens) <= score_len < len(doc_tokens) + window_size:
            print("doc length : ", len(doc_tokens))
            print("score len:", score_len)
            print("doc length +step_size: ", len(doc_tokens) + step_size)
            raise IndexError
        row = []
        q_text = queries[e.query_id]
        html.write_paragraph("qid: " + e.query_id)
        html.write_paragraph("q_text: " + q_text)
        html.write_paragraph("Pred: {0:.2f}".format(max_score))
        html.write_paragraph("Label: {0:.2f}".format(label))
        for idx in range(score_len):
            # Positions beyond the real document get a placeholder token.
            token = doc_tokens[idx] if idx < len(doc_tokens) else '[-]'
            full_scores = e.full_scores[idx]
            full_score_str = " ".join(lmap(two_digit_float, full_scores))
            score = e.scores[idx]
            normalized_score = transform(score) * 200
            c = get_tooltip_cell(token, full_score_str)
            c.highlight_score = normalized_score
            row.append(c)
        html.multirow_print(row, 16)
def main():
    """Render ablation-based per-token score diffs for Robust04 docs as tooltip HTML.

    Uses token_score_by_ablation to attribute segment score changes to tokens,
    balances how many low-score (negative) docs are shown against high-score
    ones, and writes colored token grids to "robust_desc_128_step16_2.html".
    """
    prediction_file_path = at_output_dir("robust", "rob_dense2_pred.score")
    info_file_path = at_job_man_dir1("robust_predict_desc_128_step16_2_info")
    queries: Dict[str, str] = load_robust_04_query("desc")
    tokenizer = get_tokenizer()
    # Precompute tokenized query length per qid; used to split query vs doc scores.
    query_token_len_d = {}
    for qid, q_text in queries.items():
        query_token_len_d[qid] = len(tokenizer.tokenize(q_text))
    step_size = 16
    window_size = 128
    out_entries: List[AnalyzedDoc] = token_score_by_ablation(
        info_file_path, prediction_file_path, query_token_len_d, step_size, window_size)
    # NOTE(review): hard-coded local qrels path — assumes this machine's layout.
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement_d = load_qrels_structured(qrel_path)
    html = HtmlVisualizer("robust_desc_128_step16_2.html", use_tooltip=True)
    tprint("loading tokens pickles")
    tokens_d: Dict[str, List[str]] = load_pickle_from(
        os.path.join(sydney_working_dir, "RobustPredictTokens3", "1"))
    tprint("Now printing")
    n_printed = 0

    # Cubic emphasis curve mapping a [0, 1] score to a highlight weight.
    def transform(x):
        return 3 * (math.pow(x - 0.5, 3) + math.pow(0.5, 3))
    n_pos = 0
    n_neg = 0
    for e in out_entries:
        max_score: float = max(
            lmap(SegmentScorePair.get_max_score, flatten(e.token_info.values())))
        if max_score < 0.6:
            # Low-score doc: only keep it while negatives don't outnumber positives.
            if n_neg > n_pos:
                continue
            else:
                n_neg += 1
                pass
        else:
            n_pos += 1
        n_printed += 1
        if n_printed > 500:
            break
        doc_tokens: List[str] = tokens_d[e.doc_id]
        # token_info is keyed by token index; highest key + 1 gives the span length.
        score_len = max(e.token_info.keys()) + 1
        judgement: Dict[str, int] = judgement_d[e.query_id]
        label = judgement[e.doc_id]
        # Scores may run past the doc end by up to one window (padding); anything
        # else is a misaligned entry — skip it (unlike the sibling main, no raise).
        if not len(doc_tokens) <= score_len < len(doc_tokens) + window_size:
            print("doc length : ", len(doc_tokens))
            print("score len:", score_len)
            print("doc length +step_size: ", len(doc_tokens) + step_size)
            continue
        row = []
        q_text = queries[e.query_id]
        html.write_paragraph("qid: " + e.query_id)
        html.write_paragraph("q_text: " + q_text)
        html.write_paragraph("Pred: {0:.2f}".format(max_score))
        html.write_paragraph("Label: {0:.2f}".format(label))
        for idx in range(score_len):
            # Positions beyond the real document get a placeholder token.
            token = doc_tokens[idx] if idx < len(doc_tokens) else '[-]'
            token_info: List[SegmentScorePair] = e.token_info[idx]
            # Score diffs lie roughly in [-1, 1]; sign decides the highlight color.
            full_scores: List[float] = lmap(SegmentScorePair.get_score_diff, token_info)
            full_score_str = " ".join(lmap(two_digit_float, full_scores))
            score = average(full_scores)
            if score > 0:
                color = "B"
            else:
                color = "R"
            normalized_score = transform(abs(score)) * 200
            c = get_tooltip_cell(token, full_score_str)
            c.highlight_score = normalized_score
            c.target_color = color
            row.append(c)
        html.multirow_print(row, 16)