import pickle
from collections import defaultdict

import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Project-internal names used below (Sentence, Query, model_config, dataset,
# root, ex, y_prime, twofivefive, text_color, get_sentence_word_mapping,
# calc_sentence_edit_schedule, find_differences, latex_escape) are assumed to
# be imported from elsewhere in the package.


def extract_colors(r, per_sentence):
    """Map per-word gradient magnitudes to foreground/background RGB strings.

    Colors come from a white-to-blue colormap; `per_sentence` switches between
    one gradient pass over the whole text and one pass per sentence.
    """
    text = r.stats.original_sentence
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
        "", ["white", "blue"])
    text_s = Sentence(text)
    if not per_sentence:
        word_gradients = text_s.calc_gradients(r.query.wanted_cls)
        # Rescale gradients to [0, 1] so they can index the colormap.
        wgn = np.interp(word_gradients,
                        (np.min(word_gradients), np.max(word_gradients)),
                        (0., 1.))
        fg, bg = [], []
        for ind in range(len(wgn)):
            ctpl = cmap(wgn[ind])[:3]
            tc = twofivefive(text_color(ctpl))
            ctpl = twofivefive(ctpl)
            # Strip the tuple repr's parentheses: "(r, g, b)" -> "r, g, b".
            fg.append(str(tc)[1:-1])
            bg.append(str(ctpl)[1:-1])
        return fg, bg
    else:
        sw_map = get_sentence_word_mapping(text)
        edit_sentence_order = calc_sentence_edit_schedule(
            r.query, sw_map, text_s)
        fg, bg = [], []
        for enm_si, si in enumerate(edit_sentence_order):
            start, stop = sw_map[si]
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text_s.words[start:stop + 1]))
            subtext = Sentence(sub)
            word_gradients = np.array(
                subtext.calc_gradients(r.query.wanted_cls))
            word_gradients /= np.linalg.norm(word_gradients)
            wgn = np.interp(word_gradients,
                            (np.min(word_gradients), np.max(word_gradients)),
                            (0., 1.))
            for ind in range(len(wgn)):
                ctpl = cmap(wgn[ind])[:3]
                tc = twofivefive(text_color(ctpl))
                ctpl = twofivefive(ctpl)
                fg.append(str(tc)[1:-1])
                bg.append(str(ctpl)[1:-1])
        return fg, bg
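
# --- Hedged sketch (not from the original source): the helpers `twofivefive`
# and `text_color` used above are defined elsewhere in the project. Their call
# sites suggest that `twofivefive` scales unit-interval RGB floats to 0-255
# integers and that `text_color` picks a black or white foreground based on
# background luminance. Both behaviors are assumptions; names are suffixed
# `_sketch` to avoid shadowing the real helpers.
def twofivefive_sketch(rgb):
    """Scale an (r, g, b) tuple of floats in [0, 1] to integer 0-255 values."""
    return tuple(int(round(c * 255)) for c in rgb)


def text_color_sketch(rgb):
    """Return black text for light backgrounds, white for dark ones."""
    luma = 0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]  # perceived brightness
    return (0., 0., 0.) if luma > 0.5 else (1., 1., 1.)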
def one_mask():
    """Measure how one extra [MASK] token at increasing distances changes the
    model's predictions for an observed masked word."""
    max_words = 20
    result = defaultdict(list)
    result_dist_diff_only = defaultdict(list)
    result_wdiff = defaultdict(list)
    for enm in range(len(dataset)):
        print(f"\nSentence {enm}: ", end="")
        data = dataset[enm]
        x, y = data["sentence"], data["label"]
        x = model_config.tokenizer.clean_up_tokenization(x)
        # One-hot label and its inverse (the adversarial target class).
        y = [0, 1] if y == 1 else [1, 0]
        y_prime = [1 - y[0], 1 - y[1]]
        s = Sentence(x)
        word_gradients = s.calc_gradients(y_prime)
        sorted_highest = np.argsort(word_gradients)[::-1]
        # Inspect the ten words with the largest gradients.
        for observed_idx in sorted_highest[:10]:
            print(f"{observed_idx},", end="")
            # Walk toward the longer side of the sentence.
            sdir = 1 if len(s.words) - observed_idx > observed_idx else -1
            alt_s = Sentence(s.get_with_masked([observed_idx]))
            original_answer = alt_s.calc_mask_predictions()[observed_idx]
            if len(original_answer) != 0:
                for mask_distance in range(1, max_words):
                    offset_idx = observed_idx + mask_distance * sdir
                    if offset_idx < 0 or offset_idx >= len(alt_s.words):
                        continue
                    new_sen = Sentence(
                        alt_s.get_with_masked([offset_idx, observed_idx]))
                    alt_sen_pred = new_sen.calc_mask_predictions()[observed_idx]
                    avg_distance, avg_word_diff, dist_diff_only = find_differences(
                        original_answer, alt_sen_pred)
                    result[mask_distance].append(avg_distance)
                    result_wdiff[mask_distance].append(avg_word_diff)
                    result_dist_diff_only[mask_distance].append(dist_diff_only)
        # Periodically plot mean +/- std of the collected differences.
        if enm % 50 == 0 or enm == len(dataset) - 1:
            fig = plt.figure(figsize=(11, 8))
            plt.title(
                "Word scores in relation to the proximity of the nearest [MASK] token")
            plt.xlabel("Distance to the additional [MASK] token")
            plt.xlim(0, max_words)
            plt.ylim(0., 0.65)
            plt.ylabel("Change in score")
            idx, mean, std = list(
                zip(*[(md, np.mean(lst), np.std(lst))
                      for (md, lst) in result_wdiff.items()]))
            mean = np.array(mean)
            std = np.array(std)
            plt.plot(idx, mean, color='r', label="Word differences")
            plt.fill_between(idx, mean - std, mean + std, color='r', alpha=.2)
            idx, mean, std = list(
                zip(*[(md, np.mean(lst), np.std(lst))
                      for (md, lst) in result_dist_diff_only.items()]))
            mean = np.array(mean)
            std = np.array(std)
            plt.plot(idx, mean, color='green', label="Distance differences")
            plt.fill_between(idx, mean - std, mean + std,
                             color='green', alpha=.2)
            plt.xticks(idx)
            plt.legend()
            plt.savefig(f'{root}saved_plots/all/_besser_{enm}.png')
            # plt.show()
            plt.close(fig)
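
# --- Hedged sketch (assumption, not the project's actual implementation):
# `find_differences` is project-internal and not shown in this file. From its
# call sites it compares the mask predictions with and without the extra
# [MASK] token and returns three scalars. One plausible reading, assuming each
# prediction is a list of (token, probability) pairs:
def find_differences_sketch(pred_a, pred_b):
    """Compare two ranked mask-prediction lists; returns
    (avg_distance, avg_word_diff, dist_diff_only)."""
    probs_a, probs_b = dict(pred_a), dict(pred_b)
    shared = set(probs_a) & set(probs_b)
    union = set(probs_a) | set(probs_b)
    # Fraction of predicted tokens that changed between the two variants.
    avg_word_diff = 1.0 - len(shared) / len(union) if union else 0.0
    # Mean absolute probability shift over tokens predicted in both variants.
    dist_diff_only = (sum(abs(probs_a[t] - probs_b[t]) for t in shared) / len(shared)
                      if shared else 0.0)
    # Combined score; the original may weight these components differently.
    avg_distance = avg_word_diff + dist_diff_only
    return avg_distance, avg_word_diff, dist_diff_only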
def two_mask():
    """Same experiment as one_mask(), but with two additional [MASK] tokens
    placed at every pair of offsets; results are plotted as 2D heatmaps."""
    max_words = 15
    result = defaultdict(list)
    result_dist_diff_only = defaultdict(list)
    result_wdiff = defaultdict(list)
    for enm in range(len(dataset)):
        print(f"\nSentence {enm}: ", end="")
        data = dataset[enm]
        x, y = data["sentence"], data["label"]
        x = model_config.tokenizer.clean_up_tokenization(x)
        y = [0, 1] if y == 1 else [1, 0]
        y_prime = [1 - y[0], 1 - y[1]]
        s = Sentence(x)
        word_gradients = s.calc_gradients(y_prime)
        sorted_highest = np.argsort(word_gradients)[::-1]
        for observed_idx in sorted_highest[:10]:
            print(f"{observed_idx},", end="")
            alt_s = Sentence(s.get_with_masked([observed_idx]))
            original_answer = alt_s.calc_mask_predictions()[observed_idx]
            if len(original_answer) != 0:
                for mask_distance1 in range(-max_words, max_words + 1):
                    for mask_distance2 in range(-max_words, max_words + 1):
                        if not (0 <= observed_idx + mask_distance1 < len(alt_s.words)):
                            continue
                        if not (0 <= observed_idx + mask_distance2 < len(alt_s.words)):
                            continue
                        # Mask both offset positions (the observed word is
                        # already masked and stays masked).
                        new_sen = Sentence(
                            alt_s.get_with_masked(
                                [observed_idx + mask_distance1, observed_idx]))
                        new_sen = Sentence(
                            new_sen.get_with_masked(
                                [observed_idx + mask_distance2, observed_idx]))
                        alt_sen_pred = new_sen.calc_mask_predictions()[observed_idx]
                        avg_distance, avg_word_diff, dist_diff_only = find_differences(
                            original_answer, alt_sen_pred)
                        result[(mask_distance1,
                                mask_distance2)].append(avg_distance)
                        result_wdiff[(mask_distance1,
                                      mask_distance2)].append(avg_word_diff)
                        result_dist_diff_only[(
                            mask_distance1,
                            mask_distance2)].append(dist_diff_only)
        if enm % 2 == 0 or enm == len(dataset) - 1:
            all_variants = [(result, "result"), (result_wdiff, "wdiff"),
                            (result_dist_diff_only, "ddiff")]
            with open('used_data.pickle', 'wb') as handle:
                pickle.dump(all_variants, handle)
            for res, name in all_variants:
                data = [(k, np.mean(v)) for k, v in res.items()]
                # Shift offsets from [-max_words, max_words] to matrix indices.
                matrix = np.zeros(shape=(2 * max_words + 1, 2 * max_words + 1))
                for (i, j), m in data:
                    matrix[i + max_words, j + max_words] = m
                plt.figure(figsize=(15, 12))
                ax = sns.heatmap(
                    np.flip(matrix, axis=0),
                    linewidth=0.0,
                    xticklabels=list(range(-max_words, max_words + 1)),
                    yticklabels=list(reversed(range(-max_words, max_words + 1))))
                ax.set_title("Average change in the words with 2 MASK tokens")
                plt.savefig(f'{root}saved_plots/2d/{name}_{enm}.pdf')
                plt.close()
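
# --- Hedged usage sketch (assumption): regenerating the heatmaps from the
# pickle that two_mask() writes, without rerunning the expensive
# mask-prediction loop. Assumes the same `max_words` value and the
# (result, name) pair structure pickled above; `replot_from_pickle` itself is
# hypothetical, not part of the original code.
def replot_from_pickle(path="used_data.pickle", max_words=15):
    with open(path, "rb") as handle:
        all_variants = pickle.load(handle)
    for res, name in all_variants:
        matrix = np.zeros((2 * max_words + 1, 2 * max_words + 1))
        for (i, j), values in res.items():
            matrix[i + max_words, j + max_words] = np.mean(values)
        plt.figure(figsize=(15, 12))
        ax = sns.heatmap(
            np.flip(matrix, axis=0),
            xticklabels=list(range(-max_words, max_words + 1)),
            yticklabels=list(reversed(range(-max_words, max_words + 1))))
        ax.set_title(name)
        plt.show()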
def generate_gradient_highlights():
    """Render gradient-based word highlights as LaTeX using the \\mycb macro.

    The macro definitions are kept here for reference; they belong in the
    LaTeX preamble, since re-emitting the \\newcommand on every call would
    raise an "already defined" error:

    \\newcommand{\\reducedstrut}{\\vrule width 0pt height .9\\ht\\strutbox depth .9\\dp\\strutbox\\relax}
    \\newcommand{\\mycb}[3]{%
        \\begingroup
        \\setlength{\\fboxsep}{0pt}%
        \\colorbox[rgb]{#1}{ \\strut \\textcolor[rgb]{#2}{#3} }%
        \\endgroup
    }
    """
    text = Sentence(ex)
    word_gradients = np.array(text.calc_gradients(y_prime))
    word_gradients /= np.linalg.norm(word_gradients)
    wgn = np.interp(word_gradients,
                    (np.min(word_gradients), np.max(word_gradients)),
                    (0., 1.))
    result = ""
    # Other colormaps that were tried: blue/white/red, white/forestgreen,
    # white/orangered, white/crimson, white/red, white/black.
    for cmap in [
            matplotlib.colors.LinearSegmentedColormap.from_list(
                "", ["white", "blue"]),
    ]:
        for ind, w in enumerate(text.words):
            ctpl = cmap(wgn[ind])[:3]
            tc = str(text_color(ctpl))[1:-1]
            ctpl = [round(v, 3) for v in ctpl]
            rgba = str(ctpl)[1:-1]
            result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
        result += (f"\n\\\\ Top 10: "
                   f"{', '.join(np.array(text.words)[np.argsort(-wgn)][:10])}\\ \n\n\n")
        # Sentence-wise gradient calculation.
        sw_map = get_sentence_word_mapping(text.text)
        edit_sentence_order = calc_sentence_edit_schedule(
            Query(y_prime), sw_map, text)
        for enm_si, si in enumerate(edit_sentence_order):
            start, stop = sw_map[si]
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text.words[start:stop + 1]))
            subtext = Sentence(sub)
            word_gradients = np.array(subtext.calc_gradients(y_prime))
            word_gradients /= np.linalg.norm(word_gradients)
            wgn = np.interp(word_gradients,
                            (np.min(word_gradients), np.max(word_gradients)),
                            (0., 1.))
            result += f"Sentence {enm_si + 1} (previously sentence {si + 1}): "
            for ind, w in enumerate(subtext.words):
                ctpl = cmap(wgn[ind])[:3]
                tc = str(text_color(ctpl))[1:-1]
                ctpl = [round(v, 3) for v in ctpl]
                rgba = str(ctpl)[1:-1]
                result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
            result += "\\\\ \n\n"
    return result
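
# --- Hedged sketch (assumption): `latex_escape` used above is not shown in
# this file. A minimal version escaping the common LaTeX special characters
# could look like this; the real helper may differ.
def latex_escape_sketch(s):
    """Escape characters that LaTeX treats specially in text mode."""
    replacements = {
        "\\": r"\textbackslash{}", "&": r"\&", "%": r"\%", "$": r"\$",
        "#": r"\#", "_": r"\_", "{": r"\{", "}": r"\}",
        "~": r"\textasciitilde{}", "^": r"\textasciicircum{}",
    }
    # Map per input character, so replacement text is never re-escaped.
    return "".join(replacements.get(ch, ch) for ch in s)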