def print_param():
    p_base = load_param(get_bert_full_path())
    # Raw string avoids the inconsistent single/double backslash escaping
    # the original Windows path literal had.
    nli_path = r"C:\work\Code\Chair\output\model\runs\nli_model.ckpt-75000_NLI\model-0"
    p_ft = load_param(nli_path)
    key = "bert/encoder/layer_0/output/dense/kernel"
    param1 = p_base[key]
    param2 = p_ft[key]
    html = HtmlVisualizer("bert_dense_param.html")
    l, c = param1.shape
    # Alternate highlight scores in a checkerboard pattern so adjacent
    # cells are visually distinguishable.
    s_score = 100
    for i in range(l):
        rows = []
        row1 = []
        row2 = []
        s_score = 100 - s_score
        score = s_score
        for j in range(c):
            score = 100 - score
            row1.append(Cell("{0:.4f}".format(param1[i, j]), score))
            row2.append(Cell("{0:.4f}".format(param2[i, j]), score))
        rows.append(row1)
        rows.append(row2)
        html.write_table(rows)

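# The load_param helper used above is not defined in this file. Below is a
# minimal sketch of its assumed behavior, built on TensorFlow's real
# checkpoint-reader API (tf.train.load_checkpoint); the repository's actual
# implementation may differ.
def load_param_sketch(checkpoint_path):
    """Assumed behavior: map variable names to numpy arrays from a checkpoint."""
    import tensorflow as tf
    reader = tf.train.load_checkpoint(checkpoint_path)
    return {name: reader.get_tensor(name)
            for name in reader.get_variable_to_shape_map()}
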
def main(config):
    # select claims
    # load relevant documents
    # remove duplicates
    q_res_path = config['q_res_path']
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    keys = list(ranked_list.keys())
    keys.sort()
    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate([e.doc_id for e in entries])
        claim = claim_d[int(query_id)]
        s = "{} : {}".format(query_id, claim)
        rows.append([Cell(s)])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            s = "<a href=\"{}\">{}</a>".format(url, doc_id)
            rows.append([Cell(s)])
    html = HtmlVisualizer("claim_docs_urls.html")
    html.write_table(rows)

def view_grad_overlap_hidden():
    filename = "ukp_feature_overlap.pickle"
    obj = pickle.load(open(os.path.join(output_path, filename), "rb"))
    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    for inst_i, entry in enumerate(data):
        tokens = entry.get_mask_resolved_input_mask_with_input()
        h_overlap = entry.get_vector('h_overlap')
        std = np.std(h_overlap, axis=2)
        mean = np.mean(h_overlap, axis=2)
        h_overlap = np.sum(h_overlap, axis=2)
        highlight = lmap(is_mask, tokens)
        cells = data.cells_from_tokens(tokens, highlight)
        rows = [cells]
        for layer_i in range(12):
            e = h_overlap[layer_i, :]
            e = [v * 1e6 for v in e]
            cells = data.cells_from_scores(e)
            rows.append(cells)
            e = [v * 1e8 for v in std[layer_i, :]]
            cells2 = data.cells_from_scores(e)
            rows.append(cells2)
        print(entry.get_vector("masked_lm_example_loss"))
        html_writer.multirow_print_from_cells_list(rows, 40)

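# is_mask (used for highlight flags here and in several functions below) is
# not defined in this file; a minimal sketch of the assumed behavior:
def is_mask_sketch(token):
    # True for the BERT mask token, so masked positions get highlighted.
    return token == "[MASK]"
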
def main(config):
    # select claims
    # load relevant documents
    q_res_path = config['q_res_path']
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_text_d = json.load(open(config['query_text_d']))
    save_name = config['save_path']
    keys = list(ranked_list.keys())
    keys.sort()
    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:100]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = [e.doc_id for e in entries]
        query_text = query_text_d[query_id]
        s = "{} : {}".format(query_id, query_text)
        rows.append([Cell(s)])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            s = "<a href=\"{}\">{}</a>".format(url, doc_id)
            rows.append([Cell(s)])
    html = HtmlVisualizer(save_name)
    html.write_table(rows)

def show(html_visualizer: HtmlVisualizer,
         features: List[ParagraphClaimPersFeature]):
    print("Cid: ", features[0].claim_pers.cid)
    for f in features:
        html_visualizer.write_paragraph("Claim: " + f.claim_pers.claim_text)
        html_visualizer.write_paragraph("Perspective: " + f.claim_pers.p_text)
        pc_tokens: List[str] = nltk.word_tokenize(f.claim_pers.claim_text) \
            + nltk.word_tokenize(f.claim_pers.p_text)
        pc_tokens_set = set([t.lower() for t in pc_tokens])
        print(pc_tokens_set)

        def get_cell(token) -> Cell:
            # Highlight tokens that also appear in the claim/perspective text.
            if token.lower() in pc_tokens_set:
                score = 100
            else:
                score = 0
            return Cell(token, score)

        html_visualizer.write_paragraph("Label : {}".format(f.claim_pers.label))
        for score_paragraph in f.feature:
            paragraph = score_paragraph.paragraph
            cells = [get_cell(t) for t in paragraph.tokens]
            html_visualizer.write_paragraph("---")
            html_visualizer.multirow_print(cells, width=20)

def main():
    html = HtmlVisualizer("tooltip_test.html", dark_mode=False, use_tooltip=True)
    line = [("1", "hello"), ("2", "word")]
    html.write_span_line(line)
    html.write_span_line(line)

def visualize_prediction_data(data_id):
    tokenizer = get_tokenizer()
    num_samples_list = open(
        os.path.join(working_path, "entry_prediction_n", data_id), "r").readlines()
    p = os.path.join(working_path, "entry_loss", "entry{}.pickle".format(data_id))
    loss_outputs_list = pickle.load(open(p, "rb"))
    print("Loaded input data")
    loss_outputs = []
    for e in loss_outputs_list:
        loss_outputs.extend(e["masked_lm_example_loss"])
    print("Total of {} loss outputs".format(len(loss_outputs)))
    instance_idx = 0
    feature_itr = load_record_v2(
        os.path.join(working_path, "entry_prediction_tf.done", data_id))
    n = len(num_samples_list)
    n = 100  # debug cap: only visualize the first 100 entries
    html = HtmlVisualizer("entry_prediction.html")
    for i in range(n):
        n_sample = int(num_samples_list[i])
        assert n_sample > 0
        first_inst = next(feature_itr)
        feature = Feature2Text(first_inst, tokenizer)
        html.write_headline("Input:")
        html.write_paragraph(feature.get_input_as_text(True, True))
        html.write_headline("Word:" + feature.get_selected_word_text())
        if instance_idx + n_sample >= len(loss_outputs):
            break
        if n_sample == 1:
            continue
        rows = []
        no_dict_loss = loss_outputs[instance_idx]
        row = [Cell(no_dict_loss, 0), Cell("")]
        rows.append(row)
        instance_idx += 1
        for j in range(1, n_sample):
            feature = Feature2Text(next(feature_itr), tokenizer)
            def_cell = Cell(feature.get_def_as_text())
            loss = loss_outputs[instance_idx]
            # Highlight if the definition lowers the loss by more than 10%.
            hl_score = 100 if loss < no_dict_loss * 0.9 else 0
            row = [Cell(loss, hl_score), def_cell]
            rows.append(row)
            instance_idx += 1
        html.write_table(rows)

def loss_view(dir_path):
    tokenizer = get_tokenizer()
    html_writer = HtmlVisualizer("ukp_lm_grad_high.html", dark_mode=False)
    for file_path in get_dir_files(dir_path):
        items = pickle.load(open(file_path, "rb"))
        for e in items:
            input_ids, masked_input_ids, masked_lm_example_loss = e
            tokens = mask_resolve_1(
                tokenizer.convert_ids_to_tokens(input_ids),
                tokenizer.convert_ids_to_tokens(masked_input_ids))
            highlight = lmap(is_mask, tokens)
            cells = cells_from_tokens(tokens, highlight)
            html_writer.multirow_print(cells)

def main():
    save_name = sys.argv[1]
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    info_file_path = os.path.join(out_dir, "cppnc_triple_all_dev_info")
    pred_file_path = os.path.join(out_dir, save_name + ".score")
    cid_and_confidences = get_confidence_list_per_cid(info_file_path, pred_file_path)
    rows = []
    for cid, confidence_list in cid_and_confidences.items():
        row = [Cell(str(cid))]
        row.extend([Cell("", highlight_score=c * 100) for c in confidence_list])
        rows.append(row)
    html = HtmlVisualizer("confidence.html")
    html.write_table(rows)

def main():
    file_path = sys.argv[1]
    name = os.path.basename(file_path)
    viewer = EstimatorPredictionViewer(file_path)
    html = HtmlVisualizer("token_score_gold.html")
    stopwords = load_stopwords_for_query()
    skip = 10
    for entry_idx, entry in enumerate(viewer):
        if entry_idx % skip != 0:
            continue
        tokens = entry.get_tokens("input_ids")
        input_ids = entry.get_vector("input_ids")
        label_ids = entry.get_vector("label_ids")
        label_ids = np.reshape(label_ids, [-1, 2])
        log_label_ids = np.log(label_ids + 1e-10)
        seg1, seg2 = split_p_h_with_input_ids(tokens, input_ids)
        pad_idx = tokens.index("[PAD]")
        assert pad_idx > 0
        logits = entry.get_vector("logits")
        cells = []
        cells2 = []
        for idx in range(pad_idx):
            probs = label_ids[idx]
            token = tokens[idx]
            score = probs[0]
            # Blue for positive scores, red for negative; green for segment-1 tokens.
            color = "B" if score > 0 else "R"
            highlight_score = min(abs(score) * 10000, 100)
            if token in stopwords:
                highlight_score = 0
            if token in seg1:
                highlight_score = 50
                color = "G"
            c = Cell(token, highlight_score=highlight_score, target_color=color)
            cells.append(c)
        html.multirow_print_from_cells_list([cells, cells2])
        if entry_idx > 10000:
            break

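# split_p_h_with_input_ids appears throughout this file but is defined
# elsewhere. A minimal sketch of the assumed behavior for a standard BERT pair
# encoding [CLS] premise .. [SEP] hypothesis .. [SEP]: split any per-token
# sequence at the two [SEP] positions. SEP_ID assumes the bert-base-uncased
# vocabulary.
SEP_ID = 102

def split_p_h_sketch(values, input_ids):
    ids = list(input_ids)
    sep1 = ids.index(SEP_ID)
    sep2 = ids.index(SEP_ID, sep1 + 1)
    # Drop [CLS] and both [SEP] markers; return the two segments.
    return values[1:sep1], values[sep1 + 1:sep2]
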
def main():
    first_list_path = sys.argv[1]
    dir_path = sys.argv[2]
    save_path = sys.argv[3]
    l: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(first_list_path)
    new_entries: Dict[str, List[TrecRankedListEntry]] = l
    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    html = HtmlVisualizer(save_path)
    rows = []
    for e in flat_entries:
        ahref = "<a href=\"./{}/{}.html\">{}</a>".format(dir_path, e.doc_id, e.doc_id)
        row = lmap(Cell, [e.query_id, e.rank, e.score, ahref])
        rows.append(row)
    html.write_table(rows)

def print_paragraph_feature(pf_list: List[ParagraphFeature], out_path: FilePath):
    html = HtmlVisualizer(out_path)
    for pf in pf_list:
        html.write_paragraph("Text 1: " + pf.datapoint.text1)
        html.write_paragraph("Text 2: " + pf.datapoint.text2)
        for f in pf.feature:
            s = " ".join(f.paragraph.tokens)
            html.write_paragraph(s)
    html.close()

def print_file(pred_path):
    grouped = load_prediction(pred_path)
    html_pos = HtmlVisualizer("pc_view_true.html")
    html_neg = HtmlVisualizer("pc_view_false.html")
    item_cnt = 0
    for key in grouped:
        paras: List[Tuple[str, float, Segment]] = grouped[key]
        is_true_arr = list([t[1] > 0.5 for t in paras])
        cnt_true = sum(is_true_arr)
        # Skip groups whose predictions are all-true or all-false.
        if cnt_true == len(is_true_arr) or cnt_true == 0:
            continue
        cnt_false = len(is_true_arr) - cnt_true
        idx_false = 0
        idx_true = 0
        item_cnt += 1
        for _, score, tokens in paras:
            is_true = score > 0.5
            html = html_pos if is_true else html_neg
            claim, perspective, paragraph = split_3segments(tokens)
            highlight_terms = set(claim + perspective)
            if is_true:
                html.write_paragraph("{} of {}".format(idx_true, cnt_true))
                idx_true += 1
            else:
                html.write_paragraph("{} of {}".format(idx_false, cnt_false))
                idx_false += 1
            html.write_paragraph("claim : " + pretty_tokens(claim))
            html.write_paragraph("perspective : " + pretty_tokens(perspective))

            def make_cell(subword: Subword):
                if subword in highlight_terms:
                    return Cell(subword, highlight_score=100)
                else:
                    return Cell(subword)

            cells = lmap(make_cell, paragraph)
            html.multirow_print(cells)
        if item_cnt > 100:
            break

def main():
    # claim_d = load_train_claim_d()
    html = HtmlVisualizer("doc_relevance_and_value.html")
    rows = []
    data_id = 0
    for query, k_list in load_qk():
        claim_id = query.query_id
        claim_text = query.text
        doc_ids = set([k.doc_id for k in k_list])
        for doc_id in list(doc_ids)[:10]:
            url = os.path.join(output_path, "pc_docs_html", doc_id + ".html")
            a = "<a href=\"{}\">url</a>".format(url)
            # tab_print(data_id, claim_id, doc_id)
            row = [Cell(data_id), Cell(claim_id), Cell(claim_text), Cell(a)]
            rows.append(row)
            data_id += 1
    html.write_table(rows)

def analyze_hv(hv_tt, hv_lm, tt_grad, tokenizer):
    batch_size = 16
    seq_len = 200
    hidden_dim = 768
    reshaped_grad = reshape_gradienet(tt_grad, seq_len, hidden_dim, False)
    hv_tt, x_list = reshape(hv_tt)
    hv_lm, x_list = reshape(hv_lm)
    assert len(hv_lm) == len(hv_tt)
    html = HtmlVisualizer("Preserved.html")
    for inst_i in range(len(hv_lm)):
        print("\t", end="")
        tokens = tokenizer.convert_ids_to_tokens(x_list[inst_i])
        for seq_i in range(seq_len):
            token = tokenizer.convert_ids_to_tokens([x_list[inst_i, seq_i]])[0]
            print("{}".format(token), end="\t")
        print()
        scores = []
        for layer_i in range(13):
            if layer_i != 1:
                continue
            layer_no = layer_i
            if layer_no >= 1:
                print("Layer {} :".format(layer_no), end="\t")
            else:
                print("Embedding:", end="\t")
            for seq_i in range(seq_len):
                n_diff_1, n_diff_2 = diff_and_grad(
                    hv_lm[inst_i, layer_i, seq_i],
                    hv_tt[inst_i, layer_i, seq_i],
                    reshaped_grad[inst_i, layer_i, seq_i])
                scores.append(n_diff_1)
                print("{}({})".format(n_diff_1, n_diff_2), end="\t")
            print("\n")
        row = []
        for t, s in zip(tokens, scores):
            score = s / hidden_dim * 100
            row.append(Cell(t, score))
        html.write_table([row])
        print("-----------------")

def main():
    first_list_path = sys.argv[1]
    dir_path = sys.argv[2]
    save_path = sys.argv[3]
    l: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(first_list_path)
    new_entries: Dict[str, List[TrecRankedListEntry]] = l

    def get_html_path_fn(doc_id):
        return os.path.join(dir_path, "{}.html".format(doc_id))

    doc_id_to_url = load_from_pickle("urls_d")
    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    entries = [enrich(e, get_html_path_fn, doc_id_to_url) for e in flat_entries]
    html = HtmlVisualizer(save_path, additional_styles=[
        get_link_highlight_code(), get_bootstrap_include_source()])
    rows = []
    head = [
        get_table_head_cell("query"),
        get_table_head_cell("rank"),
        get_table_head_cell("score"),
        get_table_head_cell("doc_id"),
        get_table_head_cell("title", 300),
        get_table_head_cell("url"),
    ]
    for e in entries:
        html_path = os.path.join(dir_path, "{}.html".format(e.doc_id))
        ahref = "<a href=\"{}\" target=\"_blank\">{}</a>".format(html_path, e.doc_id)
        elem_list = [e.query_id, e.rank, e.score, ahref, e.title, e.url]
        row = lmap(Cell, elem_list)
        rows.append(row)
    # Write the header row followed by the entry rows.
    html.write_table_with_class([head] + rows, "table")

def show(filename):
    data = EstimatorPredictionViewerGosford(filename)
    html_writer = HtmlVisualizer("token_scoring.html", dark_mode=False)
    correctness = []
    for entry in data:
        tokens = entry.get_tokens("input_ids")
        logits = entry.get_vector("logits")
        masks = entry.get_vector("label_masks")
        ids = entry.get_vector("labels")
        token_row = []
        pred_row = []
        gold_row = []
        rows = [token_row, pred_row, gold_row]
        for idx, token in enumerate(tokens):
            token_cell = Cell(token)
            if token == "[PAD]":
                break
            model_score = logits[idx][0]
            if masks[idx]:
                # Correct if the prediction and the label share the same sign.
                correct = (model_score > 0 and ids[idx] > 0) \
                    or (model_score < 0 and ids[idx] < 0)
                color = "B" if correct else "R"
                if correct and model_score > 0 and ids[idx] > 0:
                    color = "G"
                pred_cell = Cell("{0:.2f}".format(model_score), 100, target_color=color)
                gold_cell = Cell("{0:.2f}".format(ids[idx]), 100, target_color=color)
            else:
                pred_cell = Cell("")
                gold_cell = Cell("")
            token_row.append(token_cell)
            pred_row.append(pred_cell)
            gold_row.append(gold_cell)
        html_writer.multirow_print_from_cells_list(rows, 20)

def draw():
    # name = "pc_para_D_grad"
    name = "pc_para_I_grad"
    # name = "pc_para_H_grad"
    data = EstimatorPredictionViewerGosford(name)
    html_writer = HtmlVisualizer(name + ".html", dark_mode=False)
    for inst_i, entry in enumerate(data):
        tokens = entry.get_tokens("input_ids")
        grad = entry.get_vector("gradient")
        cells = data.cells_from_tokens(tokens)
        for i, cell in enumerate(cells):
            cells[i].highlight_score = min(abs(grad[i]) * 1e4, 255)
            cells[i].target_color = "B" if grad[i] > 0 else "R"
        print(grad)
        prob = softmax(entry.get_vector("logits"))
        pred = np.argmax(prob)
        label = entry.get_vector("labels")
        html_writer.write_paragraph("Label={} / Pred={}".format(str(label), pred))
        html_writer.multirow_print(cells)

def draw2(in_file, out_file):
    filename = os.path.join(output_path, in_file)
    data = EstimatorPredictionViewerGosford(filename)
    html_writer = HtmlVisualizer(out_file, dark_mode=False)
    tokenizer = get_tokenizer()
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break
        tokens = entry.get_tokens("input_ids")
        prob1 = entry.get_vector("prob1")
        prob2 = entry.get_vector("prob2")
        real_loss1 = entry.get_vector("per_example_loss1")
        real_loss2 = entry.get_vector("per_example_loss2")
        masked_lm_positions = entry.get_vector("masked_lm_positions")
        # Tag each masked position in-place so it is visible in the output.
        for i, loc in enumerate(masked_lm_positions):
            tokens[loc] = "[{}:{}]".format(i, tokens[loc])
        html_writer.multirow_print(data.cells_from_tokens(tokens))
        row2 = [Cell("prob1:")] + data.cells_from_anything(prob1)
        row3 = [Cell("prob2:")] + data.cells_from_anything(prob2)
        row4 = [Cell("real_loss1:")] + data.cells_from_anything(real_loss1)
        row5 = [Cell("real_loss2:")] + data.cells_from_anything(real_loss2)
        html_writer.multirow_print_from_cells_list([row2, row3, row4, row5])

def view_grad_overlap_per_mask():
    filename = "ukp_lm_probs.pickle"
    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    tokenizer = data.tokenizer
    for inst_i, entry in enumerate(data):
        tokens = entry.get_mask_resolved_input_mask_with_input()
        highlight = lmap(is_mask, tokens)
        scores = entry.get_vector("overlap_score")
        pos_list = entry.get_vector("masked_lm_positions")
        probs = entry.get_vector("masked_lm_log_probs")
        # 20 masked positions per instance; one vocab-sized row each.
        probs = np.reshape(probs, [20, -1])
        rows = []
        for score, position, prob in zip(scores, pos_list, probs):
            tokens[position] = "{}-".format(position) + tokens[position]
            row = [Cell(position), Cell(score)]
            # Top-5 predicted terms for this masked position.
            for idx in np.argsort(prob)[::-1][:5]:
                term = tokenizer.inv_vocab[idx]
                p = math.exp(prob[idx])
                row.append(Cell(term))
                row.append(Cell(p))
            rows.append(row)
        cells = data.cells_from_tokens(tokens, highlight)
        for score, position in zip(scores, pos_list):
            cells[position].highlight_score = score / 10000 * 255
        html_writer.multirow_print(cells, 20)
        html_writer.write_table(rows)

def run():
    tokenizer = get_tokenizer()
    spr = StreamPickleReader("contradiction_prediction")
    html = HtmlVisualizer("contradiction_prediction.html")
    cnt = 0
    while spr.has_next():
        item = spr.get_item()
        e, p = item
        input_ids, _, _ = e
        logit, explain = p
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_score, h_score = split_p_h_with_input_ids(explain, input_ids)
        p_score = normalize(p_score)
        h_score = normalize(h_score)
        p_cells = [Cell("P:")] + cells_from_tokens(p_tokens, p_score)
        h_cells = [Cell("H:")] + cells_from_tokens(h_tokens, h_score)
        html.write_paragraph(str(logit))
        html.multirow_print(p_cells)
        html.multirow_print(h_cells)
        if cnt > 100:
            break
        cnt += 1

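# normalize above is not defined in this file. A plausible sketch, assuming it
# min-max scales explanation scores into the 0-100 highlight range that Cell
# uses elsewhere in this file; the repository helper may differ:
def normalize_sketch(scores):
    lo, hi = min(scores), max(scores)
    if hi == lo:
        return [0 for _ in scores]
    return [(s - lo) / (hi - lo) * 100 for s in scores]
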
def per_doc_score():
    filename = "tlm_view.pickle"
    html_writer = HtmlVisualizer("per_doc_score.html", dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    small_threshold = 40
    for inst_i, entry in enumerate(data):
        if inst_i > 1000:
            break
        scores = entry.get_vector("priority_score")
        tokens = entry.get_mask_resolved_input_mask_with_input()
        cells = data.cells_from_tokens(tokens)
        if len(cells) < small_threshold:
            continue
        avg_score = average(scores)
        # Skip documents whose average score falls in the mid range.
        if -0.11 > avg_score > -0.30:
            continue
        print(avg_score)
        html_writer.write_headline(avg_score)
        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:  # flush the final partial row
            html_writer.write_table([row])

def show(out_file_name, summarized_table: List[Entry]):
    html = HtmlVisualizer(out_file_name)
    tokenizer = get_tokenizer()
    num_print = 0
    for input_ids, prob, contributions in summarized_table:
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        html.write_paragraph("Score : {}".format(prob))
        cells = []
        max_change = 0
        for idx in range(len(input_ids)):
            token = tokens[idx]
            if token == "[PAD]":
                break
            if idx in contributions:
                raw_score = contributions[idx]
                max_change = max(abs(raw_score), max_change)
                score = abs(raw_score) * 100
                # Red for positive contributions, blue for negative.
                color = "R" if raw_score > 0 else "B"
                c = Cell(token, highlight_score=score, target_color=color)
            else:
                c = Cell(token, highlight_score=150, target_color="Gray")
            cells.append(c)
        # Only print instances where at least one token moved the score.
        if max_change >= 0.05:
            html.multirow_print(cells, 30)
            num_print += 1
    print("printed {} of {}".format(num_print, len(summarized_table)))

def show_tfrecord(file_path):
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(file_path)
    html = HtmlVisualizer(name + ".html")
    for features in itr:
        input_ids = take(features["input_ids"])
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)
        p_cells = [Cell(p_tokens[i], 100 if p_mask[i] else 0)
                   for i in range(len(p_tokens))]
        h_cells = [Cell(h_tokens[i], 100 if h_mask[i] else 0)
                   for i in range(len(h_tokens))]
        label = take(features["label_ids"])[0]
        html.write_paragraph("Label : {}".format(label))
        html.write_table([p_cells])
        html.write_table([h_cells])

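# take above unwraps a parsed tf.train.Feature. A minimal sketch of the
# assumed behavior (int64 features only; the repository helper may also
# handle float features):
def take_sketch(feature):
    return list(feature.int64_list.value)
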
def main():
    html = HtmlVisualizer("tf_rel_filter.html")
    tokenizer = get_tokenizer()
    path = "/mnt/nfs/work3/youngwookim/data/bert_tf/tf_rel_filter_B_dev/"

    def itr():
        for file in get_dir_files(path):
            for item in load_record(file):
                yield item

    for feature in itr():
        write_feature_to_html(feature, html, tokenizer)

def main():
    save_name = "alamri_mismatch_all"
    output_d = load_from_pickle(save_name)
    html = HtmlVisualizer("alamri_mismatch.html")
    tokenizer = get_tokenizer()
    logits_grouped_by_layer = output_d["per_layer_logits"]
    num_layers = 12

    def float_arr_to_cell(head, float_arr):
        return [Cell(head)] + lmap(Cell, map(two_digit_float, float_arr))

    def float_arr_to_cell2(head, float_arr):
        return [Cell(head)] + lmap(Cell, map("{0:.4f}".format, float_arr))

    num_data = len(output_d['input_ids'])
    for data_idx in range(num_data)[:100]:
        def get(name):
            return output_d[name][data_idx]

        tokens = tokenizer.convert_ids_to_tokens(get("input_ids"))
        ex_scores = get('ex_scores')
        probs = scipy.special.softmax(get('logits'))
        pred_str = make_prediction_summary_str(probs)
        html.write_paragraph("Prediction: {}".format(pred_str))
        html.write_paragraph("gold label={}".format(get("label")))
        row1 = [Cell("")] + [Cell(t, int(s * 100)) for t, s in zip(tokens, ex_scores)]
        row2 = float_arr_to_cell("ex_prob", ex_scores)
        for i, s in enumerate(ex_scores):
            if s > 0.5:
                row2[i + 1].highlight_score = 100
        rows = [row1, row2]
        for layer_no in range(num_layers):
            layer_logit = logits_grouped_by_layer[layer_no][data_idx]
            probs = sigmoid(layer_logit)
            row = float_arr_to_cell("layer_{}".format(layer_no), probs[:, 1])
            rows.append(row)
        html.write_table(rows)

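# sigmoid above is not defined in this file; a standard elementwise sketch,
# assuming numpy is imported as np (as elsewhere in this file):
def sigmoid_sketch(x):
    return 1.0 / (1.0 + np.exp(-x))
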
def per_doc_score():
    filename = "fetch_hidden_dim.pickle"
    html_writer = HtmlVisualizer("preserved.html", dark_mode=False)
    p = os.path.join(output_path, filename)
    raw_data = pickle.load(open(p, "rb"))
    n_skip = 0
    data = EstimatorPredictionViewerGosford(filename)
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break
        count_preserved = entry.get_vector("layer_count")
        tokens = entry.get_tokens("input_ids")
        cells = data.cells_from_tokens(tokens)
        valid_parts = count_preserved[:len(cells)]
        avg = np.average(count_preserved)
        row = []
        row2 = []
        # f_print = avg > 20
        f_print = True
        print(avg)
        if f_print:
            html_writer.write_paragraph("Skipped {} articles".format(n_skip))
            n_skip = 0
            for idx, cell in enumerate(cells):
                score = count_preserved[idx] / 728 * 100
                cell.highlight_score = score
                row.append(cell)
                row2.append(Cell(count_preserved[idx], score))
                if len(row) == 20:
                    html_writer.write_table([row, row2])
                    row = []
                    row2 = []
            if row:  # flush the final partial rows
                html_writer.write_table([row, row2])
            html_writer.write_paragraph(str(avg))
        else:
            n_skip += 1

def print_as_html(fn):
    examples = load_record(fn)
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))
    html_output = HtmlVisualizer("out_name.html")
    for feature in examples:
        masked_inputs = feature["input_ids"].int64_list.value
        idx = 0
        step = 512
        # Print the sequence 512 ids at a time.
        while idx < len(masked_inputs):
            chunk = masked_inputs[idx:idx + step]
            tokens = tokenizer.convert_ids_to_tokens(chunk)
            idx += step
            cells = cells_from_tokens(tokens)
            html_output.multirow_print(cells)
        html_output.write_paragraph("----------")

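# load_record above is assumed to iterate parsed tf.train.Example protos from
# a TFRecord file; a minimal sketch using real TensorFlow APIs
# (tf.data.TFRecordDataset and tf.train.Example), not necessarily the
# repository's implementation:
def load_record_sketch(fn):
    import tensorflow as tf
    for raw in tf.data.TFRecordDataset(fn):
        example = tf.train.Example()
        example.ParseFromString(raw.numpy())
        # Yields the name -> Feature map, matching the
        # feature["input_ids"].int64_list.value access pattern above.
        yield example.features.feature
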
def show_prediction(filename, file_path, correctness_1, correctness_2):
    data = EstimatorPredictionViewerGosford(filename)
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(filename)
    html = HtmlVisualizer(name + ".html")
    idx = 0
    for entry in data:
        features = next(itr)
        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        assert np.all(input_ids == input_ids2)
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)
        p_cells = [Cell(p_tokens[i], 100 if p_mask[i] else 0)
                   for i in range(len(p_tokens))]
        h_cells = [Cell(h_tokens[i], 100 if h_mask[i] else 0)
                   for i in range(len(h_tokens))]
        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)
        # Only show instances that at least one of the two runs got wrong.
        if not correctness_1[idx] or not correctness_2[idx]:
            html.write_paragraph("Label : {} Correct: {}/{}".format(
                label, correctness_1[idx], correctness_2[idx]))
            html.write_table([p_cells])
            html.write_table([h_cells])
        idx += 1

def loss_view():
    filename = "sero_pred.pickle"
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))
    print(data[0]["masked_lm_example_loss"].shape)
    print(data[0]["masked_input_ids"].shape)
    html_writer = HtmlVisualizer("sero_pred.html", dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    for inst_i, entry in enumerate(data):
        losses = entry.get_vector("masked_lm_example_loss")
        print(losses)
        tokens = entry.get_tokens("masked_input_ids")
        cells = data.cells_from_tokens(tokens)
        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:  # flush the final partial row
            html_writer.write_table([row])
        html_writer.multirow_print(data.cells_from_anything(losses), 20)