import math
import os
import pickle
from collections import Counter
from typing import Dict, List

import numpy as np

# Repo-internal helpers (HtmlVisualizer, Cell, tokenizer_wo_tf, data_path,
# output_path, load_cache, load_record_v1, get_tokenizer, ...) are assumed
# to be imported from this project's own modules.


def dev():
    train_data_feeder = load_cache("train_data_feeder")
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))
    html_writer = HtmlVisualizer("nli_w_dict.html", dark_mode=False)

    for _ in range(100):
        batch = train_data_feeder.get_random_batch(1)
        input_ids, input_mask, segment_ids, d_input_ids, d_input_mask, \
            d_location_ids, y = batch
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
        for i in range(len(tokens)):
            # Bold the tokens that are linked to a dictionary definition.
            if i != 0 and i in d_location_ids:
                tokens[i] = "<b>{}</b>".format(tokens[i])
            if tokens[i] == "[unused3]":
                tokens[i] = "[SEP]\n"
        s = tokenizer_wo_tf.pretty_tokens(tokens)
        html_writer.write_headline("Input")
        html_writer.write_paragraph(s)

        d_tokens = tokenizer.convert_ids_to_tokens(d_input_ids[0])
        for i in range(len(d_tokens)):
            # [unused5] marks a line break inside the definition text.
            if d_tokens[i] == "[unused5]":
                d_tokens[i] = "<br>\n"
        s = tokenizer_wo_tf.pretty_tokens(d_tokens)
        html_writer.write_headline("Dict def")
        html_writer.write_paragraph(s)

    html_writer.close()
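# `tokenizer_wo_tf.pretty_tokens` is used throughout this file; it is assumed
# to join word pieces while merging "##" continuations. A minimal sketch of
# that assumed behavior (hypothetical, not the repo's implementation):
def _pretty_tokens_sketch(tokens):
    out = ""
    for t in tokens:
        if t.startswith("##"):
            out += t[2:]
        else:
            out += (" " + t) if out else t
    return out

# e.g. _pretty_tokens_sketch(["un", "##believ", "##able"]) == "unbelievable"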
def print_paragraph_feature(pf_list: List[ParagraphFeature],
                            out_path: FilePath):
    html = HtmlVisualizer(out_path)
    for pf in pf_list:
        html.write_paragraph("Text 1: " + pf.datapoint.text1)
        html.write_paragraph("Text 2: " + pf.datapoint.text2)
        for f in pf.feature:
            s = " ".join(f.paragraph.tokens)
            html.write_paragraph(s)
    html.close()
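# Shape assumed by print_paragraph_feature() above (a sketch of the expected
# attributes, not the repo's actual ParagraphFeature definition):
#
#     pf.datapoint.text1, pf.datapoint.text2 : str
#     pf.feature : list of objects with .paragraph.tokens : List[str]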
def main():
    # Load queries and candidates (from qrel? from BM25?), then write html:
    # 1. Query
    # 2. Doc ID
    # 3. Snippet with most keyword matches (BM25 score)
    # 4. Scrollable component
    ranked_list_path = os.path.join(output_path, "ranked_list",
                                    "robust_V_10K_10000.txt")
    bert_ranked_list = load_ranked_list_grouped(ranked_list_path)
    queries: Dict[str, str] = load_robust04_desc2()
    qck_queries = to_qck_queries(queries)
    qrels = load_robust04_qrels()
    candidates_d = load_candidate_d()
    # save_to_pickle(candidates_d, "candidate_viewer_candidate_d")
    # candidates_d = load_from_pickle("candidate_viewer_candidate_d")
    style = [get_collapsible_css(), get_scroll_css()]
    html = HtmlVisualizer(
        "robust_V_predictions.html",
        additional_styles=style,
    )

    def is_perfect(judgement, ranked_list):
        # "Perfect" here means every relevant document is ranked above
        # every non-relevant one.
        label_list = get_labels(judgement, ranked_list)
        all_relevant = True
        for l in label_list:
            if not l:
                all_relevant = False
            if l:
                if not all_relevant:
                    return False
        return True

    def get_labels(judgement, ranked_list):
        label_list = []
        for e in ranked_list:
            doc_id = e.doc_id
            if doc_id in judgement:
                label = judgement[doc_id]
            else:
                label = 0
            label_list.append(label)
        return label_list

    def p_at_k(judgement, ranked_list, k=10):
        # Precision at k: fraction of the top-k documents judged relevant.
        label_list = get_labels(judgement, ranked_list)
        num_correct = sum(1 if label else 0 for label in label_list[:k])
        return num_correct / k

    for qid in bert_ranked_list:
        if qid not in candidates_d or qid not in qrels:
            continue
        judgement = qrels[qid]
        q_text = queries[qid]
        ranked_list = bert_ranked_list[qid]
        # Skip queries the ranker already orders perfectly.
        if is_perfect(judgement, ranked_list):
            continue

        html.write_div_open()
        text = "{0}: {1} ({2:.2f})".format(
            qid, q_text, p_at_k(judgement, ranked_list))
        html.write_elem("button", text, "collapsible")
        html.write_div_open("content")
        doc_text_d = dict(candidates_d[qid])
        for e in ranked_list:
            doc_id = e.doc_id
            if doc_id in judgement:
                label = judgement[doc_id]
            else:
                label = 0
            elem_style = "font-size: 13px; padding: 8px;"
            if label:
                elem_style += " background-color: DarkGreen"
            else:
                elem_style += " background-color: DarkRed"
            text = "{0}] {1} ({2:.2f})".format(e.rank, doc_id, e.score)
            html.write_elem("p", text, "collapsible", elem_style)
            doc_text = doc_text_d[doc_id]
            html.write_div(doc_text, "c_content")
        html.write_div_close()
        html.write_div_close()

    html.write_script(get_collapsible_script())
    html.close()
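# A standalone sketch of the P@k logic nested inside main() above, shown on
# plain doc-id lists for reference (names here are hypothetical, not part of
# the pipeline):
def _p_at_k_sketch(judgement: Dict[str, int], doc_ids: List[str],
                   k: int = 10) -> float:
    # Count how many of the top-k doc ids are judged relevant (label != 0).
    labels = [judgement.get(doc_id, 0) for doc_id in doc_ids[:k]]
    return sum(1 for label in labels if label) / k

# e.g. _p_at_k_sketch({"D1": 1, "D2": 1}, ["D1", "D3", "D2"], k=3) == 2 / 3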
def analyze_gradient(data, tokenizer):
    gradients = data['gradients']
    d_input_ids = data['d_input_ids']
    mask_input_ids = data['masked_input_ids']
    masked_lm_positions = data["masked_lm_positions"]

    n_inst, seq_len = mask_input_ids.shape
    n_inst2, def_len = d_input_ids.shape
    assert n_inst == n_inst2

    # Fixed sizes, presumably matching how the gradients were recorded.
    def_len = 256
    hidden_dim = 768
    reshaped_grad = reshape_gradienet(gradients, n_inst, def_len, hidden_dim)
    print(reshaped_grad.shape)

    n_pred = reshaped_grad.shape[1]
    # L1 norm of the gradient over the hidden dimension, per definition token.
    grad_per_token = np.sum(np.abs(reshaped_grad), axis=3)

    html_writer = HtmlVisualizer("dict_grad.html", dark_mode=False)

    for inst_idx in range(n_inst):
        tokens = tokenizer.convert_ids_to_tokens(mask_input_ids[inst_idx])
        for i in range(len(tokens)):
            if tokens[i] == "[MASK]":
                tokens[i] = "[MASK_{}]".format(i)
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"
        def_tokens = tokenizer.convert_ids_to_tokens(d_input_ids[inst_idx])
        s = tokenizer_wo_tf.pretty_tokens(tokens)

        lines = []
        grad_total_max = 0
        for pred_idx in range(n_pred):
            row = []
            total = sum(grad_per_token[inst_idx, pred_idx])
            mask_pos = masked_lm_positions[inst_idx, pred_idx]
            if total > grad_total_max:
                grad_total_max = total
            row.append(Cell(mask_pos))
            row.append(Cell(int(total)))

            for def_idx in range(def_len):
                term = def_tokens[def_idx]
                cont_left = term[:2] == "##"
                # Whether the *next* definition token continues this word piece.
                cont_right = def_idx + 1 < def_len \
                    and def_tokens[def_idx + 1][:2] == "##"
                if term == "[PAD]":
                    break
                if term == "[unused5]":
                    term = "[\\n]"
                score = grad_per_token[inst_idx, pred_idx, def_idx] \
                    / (hidden_dim * 2)
                row.append(Cell(term, score, not cont_left, not cont_right))
                print("{}({})".format(
                    term, grad_per_token[inst_idx, pred_idx, def_idx]),
                    end=" ")
            lines.append((mask_pos, row))
            print("")

        lines.sort(key=lambda x: x[0])
        s = s.replace("[unused4]", "<b>DictTerm</b>")
        html_writer.write_paragraph(s)
        if grad_total_max > 5000000:
            html_writer.write_headline("HIGH Gradient")
        # right() is assumed to keep the row part of each (mask_pos, row) pair.
        rows = right(lines)
        html_writer.write_table(rows)
        print("----------")
    html_writer.close()
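# Sketch of the reshape assumed by analyze_gradient() above: the repo's
# `reshape_gradienet` (name as in the source) is expected to turn the flat
# recorded gradients into a [n_inst, n_pred, def_len, hidden_dim] tensor.
# A minimal version under that assumption:
def _reshape_gradient_sketch(gradients, n_inst, def_len, hidden_dim):
    arr = np.asarray(gradients)
    # -1 infers n_pred, the number of masked predictions per instance.
    return np.reshape(arr, [n_inst, -1, def_len, hidden_dim])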
def load_and_visualize():
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    data_id = "1"
    n_list = open(os.path.join(output_path, "lookup_n", data_id),
                  "r").readlines()
    p = os.path.join(output_path, "example_loss.pickle")
    data = pickle.load(open(p, "rb"))
    data = data[0]["masked_lm_example_loss"]

    feature_itr = load_record_v1(
        os.path.join(output_path, "lookup_example", data_id))

    n = len(n_list)
    feature_idx = 0
    html_writer = HtmlVisualizer("lookup_loss2.html", dark_mode=False)

    for i in range(n):
        n_sample = int(n_list[i])
        rows = []
        assert n_sample > 0
        for j in range(n_sample):
            feature = next(feature_itr)
            input_ids = take(feature["input_ids"])
            masked_lm_ids = take(feature["masked_lm_ids"])
            masked_lm_positions = take(feature["masked_lm_positions"])
            input_mask = take(feature["input_mask"])
            selected_word = take(feature["selected_word"])
            d_input_ids = take(feature["d_input_ids"])
            d_location_ids = take(feature["d_location_ids"])

            word_tokens = tokenizer.convert_ids_to_tokens(selected_word)
            word = tokenizer_wo_tf.pretty_tokens(word_tokens)
            emph_word = "<b>" + word + "</b>"

            if j == 0:
                # The input text is shared across samples; build the
                # position -> answer map once, from the first sample.
                mask_ans = {}
                masked_terms = tokenizer.convert_ids_to_tokens(masked_lm_ids)
                for pos, term in zip(list(masked_lm_positions), masked_terms):
                    mask_ans[pos] = term
                tokens = tokenizer.convert_ids_to_tokens(input_ids)

            for idx in range(len(tokens)):
                if tokens[idx] == "[MASK]":
                    tokens[idx] = "[MASK_{}: {}]".format(idx, mask_ans[idx])
                if idx in d_location_ids and idx != 0:
                    if tokens[idx - 1] != emph_word:
                        tokens[idx] = emph_word
                    else:
                        tokens[idx] = "-"

            def_str = tokenizer_wo_tf.pretty_tokens(
                tokenizer.convert_ids_to_tokens(d_input_ids), True)
            row = [Cell(word), Cell(data[feature_idx]), Cell(def_str)]
            rows.append(row)
            feature_idx += 1

        s = tokenizer_wo_tf.pretty_tokens(tokens, True)
        html_writer.write_paragraph(s)
        html_writer.write_table(rows)

    html_writer.close()
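# `take` above is assumed to unwrap a tf.train.Feature (as yielded by
# load_record_v1) into a plain Python list of ints. A minimal sketch under
# that assumption:
def _take_sketch(feature):
    return list(feature.int64_list.value)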
def diff_view():
    tokenizer = get_tokenizer()
    filename = "bert_815.pickle"
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))

    filename = "bfn_3_200_815.pickle"
    p = os.path.join(output_path, filename)
    data2 = pickle.load(open(p, "rb"))

    run_name = "diff"
    batch_size, seq_length = data[0]['masked_input_ids'].shape

    masked_input_ids = []
    input_ids = []
    masked_lm_example_loss = []
    masked_lm_positions = []
    masked_lm_ids = []
    for e in data[:-1]:
        masked_input_ids.append(e["masked_input_ids"])
        input_ids.append(e["input_ids"])
        masked_lm_example_loss.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))
        masked_lm_positions.append(e["masked_lm_positions"])
        masked_lm_ids.append(e["masked_lm_ids"])

    masked_lm_example_loss2 = []
    for e in data2[:-1]:
        masked_lm_example_loss2.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))

    masked_lm_example_loss2 = np.concatenate(masked_lm_example_loss2)
    input_ids = np.concatenate(input_ids)
    masked_input_ids = np.concatenate(masked_input_ids)
    masked_lm_example_loss = np.concatenate(masked_lm_example_loss)
    masked_lm_positions = np.concatenate(masked_lm_positions)
    masked_lm_ids = np.concatenate(masked_lm_ids)

    html_writer = HtmlVisualizer(run_name + ".html", dark_mode=False)

    n_instance = len(input_ids)
    for inst_idx in range(n_instance):
        tokens = tokenizer.convert_ids_to_tokens(masked_input_ids[inst_idx])
        ans_tokens = tokenizer.convert_ids_to_tokens(input_ids[inst_idx])
        ans_keys = dict(
            zip(masked_lm_positions[inst_idx],
                tokenizer.convert_ids_to_tokens(masked_lm_ids[inst_idx])))

        loss_at_loc = {
            p: l
            for l, p in zip(masked_lm_example_loss[inst_idx],
                            masked_lm_positions[inst_idx])
        }
        loss_at_loc2 = {
            p: l
            for l, p in zip(masked_lm_example_loss2[inst_idx],
                            masked_lm_positions[inst_idx])
        }
        # exp(-loss) recovers the probability the model assigned to the
        # correct token at each masked position.
        score_at_loc = {k: math.exp(-v) for k, v in loss_at_loc.items()}
        score_at_loc2 = {k: math.exp(-v) for k, v in loss_at_loc2.items()}

        def is_dependent(token):
            return len(token) == 1 and not token[0].isalnum()

        cells = []
        for i in range(len(tokens)):
            f_inverse = False
            score = 0
            if tokens[i] == "[MASK]" or i in loss_at_loc:
                tokens[i] = "[{}-{}]".format(i, ans_keys[i])
                # Color by how much the first model outperforms the second;
                # f_inverse flags positions where the second model is better.
                score = (score_at_loc[i] - score_at_loc2[i]) * 180
                if score < 0:
                    f_inverse = True
                    score = abs(score)
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"

            if tokens[i] != "[PAD]":
                term = tokens[i]
                cont_left = term[:2] == "##"
                cont_right = i + 1 < len(tokens) and tokens[i + 1][:2] == "##"
                if i + 1 < len(tokens):
                    dependent_right = is_dependent(tokens[i + 1])
                else:
                    dependent_right = False
                dependent_left = is_dependent(tokens[i])
                if cont_left:
                    term = term[2:]
                space_left = " " if not (cont_left or dependent_left) else ""
                space_right = " " if not (cont_right or dependent_right) else ""
                if not f_inverse:
                    cells.append(Cell(term, score, space_left, space_right))
                else:
                    cells.append(
                        Cell(term, score, space_left, space_right,
                             target_color="R"))

        # Write the token cells in rows of 20, flushing any remainder.
        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:
            html_writer.write_table([row])

        loss_infos = []
        for loss, pos in zip(masked_lm_example_loss[inst_idx],
                             masked_lm_positions[inst_idx]):
            loss_infos.append((loss, pos))
        loss_infos.sort(key=lambda x: x[1])

        rows = []
        for loss, pos in loss_infos:
            score1 = score_at_loc[pos]
            score2 = score_at_loc2[pos]
            score_diff = score1 - score2
            rows.append((Cell(pos), Cell(score1), Cell(score2),
                         Cell(score_diff)))
        html_writer.write_table(rows)
    html_writer.close()
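# A minimal note on the loss-to-probability conversion used in diff_view()
# above: the per-example masked-LM loss l is the cross-entropy
# l = -log p(correct token), so exp(-l) recovers that probability.
def _loss_to_prob_sketch(loss: float) -> float:
    return math.exp(-loss)

# e.g. a loss of 0.105 corresponds to a probability of roughly 0.90.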
def work():
    tokenizer = get_tokenizer()
    # filename = "bert_815.pickle"
    filename = "bfn_3_200_815.pickle"
    run_name = filename[:-len(".pickle")]
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))
    batch_size, seq_length = data[0]['masked_input_ids'].shape

    masked_input_ids = []
    input_ids = []
    masked_lm_example_loss = []
    masked_lm_positions = []
    for e in data[:-1]:
        masked_input_ids.append(e["masked_input_ids"])
        input_ids.append(e["input_ids"])
        masked_lm_example_loss.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))
        masked_lm_positions.append(e["masked_lm_positions"])

    input_ids = np.concatenate(input_ids)
    masked_input_ids = np.concatenate(masked_input_ids)
    masked_lm_example_loss = np.concatenate(masked_lm_example_loss)
    masked_lm_positions = np.concatenate(masked_lm_positions)

    html_writer = HtmlVisualizer(run_name + ".html", dark_mode=False)

    n_instance = len(input_ids)
    # Only the first 200 instances are rendered.
    for inst_idx in range(min(200, n_instance)):
        tokens = tokenizer.convert_ids_to_tokens(masked_input_ids[inst_idx])
        ans_tokens = tokenizer.convert_ids_to_tokens(input_ids[inst_idx])
        loss_at_loc = {
            p: l
            for l, p in zip(masked_lm_example_loss[inst_idx],
                            masked_lm_positions[inst_idx])
        }

        cells = []
        for i in range(len(tokens)):
            score = 0
            if tokens[i] == "[MASK]":
                tokens[i] = "[{}]".format(ans_tokens[i])
                # Scale the loss into a 0-255 highlight intensity.
                score = loss_at_loc[i] * 255 / 25
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"
            if tokens[i] != "[PAD]":
                cells.append(Cell(tokens[i], score))

        # Write the token cells in rows of 20, flushing any remainder.
        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:
            html_writer.write_table([row])

        loss_infos = []
        for loss, pos in zip(masked_lm_example_loss[inst_idx],
                             masked_lm_positions[inst_idx]):
            loss_infos.append((loss, pos))
        loss_infos.sort(key=lambda x: x[1])

        rows = []
        for loss, pos in loss_infos:
            rows.append((Cell(pos), Cell(loss)))
        html_writer.write_table(rows)
    html_writer.close()
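# The row-of-20 emission pattern in diff_view() and work() above could share
# a helper like this (a hypothetical refactoring, not in the repo):
def _write_in_rows(html_writer, cells, width=20):
    # Emit cells as fixed-width table rows, including a short final row.
    for start in range(0, len(cells), width):
        html_writer.write_table([cells[start:start + width]])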
def main(config):
    # Load queries and candidates (from qrel? from BM25?), then write html:
    # 1. Query
    # 2. Doc ID
    # 3. Snippet with most keyword matches (BM25 score)
    # 4. Scrollable component
    score_d = load_qk_score_as_dict(config)
    # qk_candidate: List[QKUnit] = load_from_pickle("robust_on_clueweb_qk_candidate")
    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate_filtered")
    # qk_candidate: List[QKUnit] = load_from_pickle("robust_on_wiki_qk_candidate")
    # candidates_d = load_candidate_d()
    # save_to_pickle(candidates_d, "candidate_viewer_candidate_d")
    style = [get_collapsible_css(), get_scroll_css()]
    html = HtmlVisualizer(
        "robust_k_docs_filtered.html",
        additional_styles=style,
    )

    for query, k_list in qk_candidate:
        qid = query.query_id
        q_text = query.text
        if not k_list:
            continue

        # Count how many passages score above the 0.5 relevance threshold.
        c = Counter()
        for k in k_list:
            kdp_id = "{}-{}".format(k.doc_id, k.passage_idx)
            score = score_d[qid, kdp_id]
            label = 1 if score > 0.5 else 0
            c[label] += 1
        pos_rate = c[1] / (c[1] + c[0])

        html.write_div_open()
        html.write_elem(
            "button",
            "{0}: {1} ({2:.2f})".format(qid, q_text, pos_rate),
            "collapsible",
        )
        html.write_div_open("content")
        for k in k_list:
            kdp_id = "{}-{}".format(k.doc_id, k.passage_idx)
            score = score_d[qid, kdp_id]
            label = score > 0.5
            text = " ".join(k.tokens)
            elem_style = "font-size: 13px; padding: 8px;"
            if label:
                elem_style += " background-color: DarkGreen"
            else:
                elem_style += " background-color: DarkRed"
            html.write_elem("p", "{0} : {1:.2f}".format(kdp_id, score),
                            "collapsible", elem_style)
            html.write_div(text, "c_content")
        html.write_div_close()
        html.write_div_close()

    html.write_script(get_collapsible_script())
    html.close()
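# Note on the score_d lookups above: score_d is assumed to be a plain dict
# keyed by (query_id, kdp_id) tuples, so `score_d[qid, kdp_id]` is ordinary
# tuple-key indexing. A minimal sketch with hypothetical ids:
def _score_dict_sketch():
    score_d = {("301", "FBIS3-1-0"): 0.8}
    qid, kdp_id = "301", "FBIS3-1-0"
    return score_d[qid, kdp_id]  # equivalent to score_d[(qid, kdp_id)]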