def expand_sentence(text_p: Union[str, Example],
                    word_indices: List[int],
                    query=None,
                    additional_mask_indices: List[int] = None,
                    schedule_idx: int = -1) -> List[Example]:
    if additional_mask_indices is None:
        additional_mask_indices = []
    if isinstance(text_p, Example):
        text = text_p.sentence
    else:
        text = text_p
    text = Sentence(text)
    word_indices = list(word_indices)
    # skip indices that only contain punctuation
    word_indices = [
        wi for wi in word_indices
        if not all([s in ":,;.*" for s in text.words[wi]])
    ]
    if len(word_indices) == 0:
        return []
    original_words = {i: text.words[i] for i in word_indices}
    max_words = query.consider_max_words if query is not None else Query(
        None).consider_max_words
    masked_sentence = Sentence(
        text.get_with_masked(word_indices + additional_mask_indices))
    predictions = masked_sentence.calc_mask_predictions(max_words)
    result = []
    for word_idx in word_indices:
        if not predictions[word_idx]:
            continue
        # classify all candidate replacements for this position in one batch
        sentences = []
        for predicted_token, score in predictions[word_idx]:
            new_sen = text.replace_word(word_idx, predicted_token)
            sentences.append(new_sen)
        classification = calc_sentiment_batch(sentences)
        for i, (predicted_token, score) in enumerate(predictions[word_idx]):
            if original_words[word_idx] != predicted_token:
                if isinstance(text_p, str):
                    e = Example(sentences[i], classification[i],
                                [(word_idx, score)],
                                pred_ind=[i],
                                sched_ind=[schedule_idx],
                                sent_ind=[0])
                else:
                    e = Example(sentences[i], classification[i],
                                text_p.changes + [(word_idx, score)],
                                pred_ind=text_p.prediction_indices + [i],
                                sched_ind=text_p.schedule_indices + [schedule_idx],
                                sent_ind=text_p.sentence_indices + [0])
                result.append(e)
    return result
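# Usage sketch for expand_sentence (illustrative, not part of the pipeline): assumes
# the surrounding module and a loaded model_config; the sentence, word indices and
# Query arguments below are made up, mirroring how the function is called in
# _gen_cf_ex.
def _demo_expand_sentence():
    query = Query(wanted_cls=[0., 1.], c=0.2, num_needed=5, consider_top_k=10)
    # try mask replacements for "painfully" (index 3) and "dull" (index 4)
    candidates = expand_sentence("the movie was painfully dull .", [3, 4], query)
    for cand in candidates:
        print(cand.cls, cand.changes, cand.sentence)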
def generate_example_table():
    result = ""
    text = Sentence(ex)
    for query in [
            Query(wanted_cls=y_prime, c=0.2, num_needed=5,
                  mini_beam_search=False, allow_splitting=False,
                  consider_top_k=10),
            Query(wanted_cls=y_prime, c=0.2, num_needed=5,
                  mini_beam_search=True, allow_splitting=False,
                  consider_top_k=10),
            Query(wanted_cls=y_prime, c=0.2, num_needed=5,
                  mini_beam_search=False, allow_splitting=True,
                  consider_top_k=10),
            Query(wanted_cls=y_prime, c=0.2, num_needed=5,
                  mini_beam_search=True, allow_splitting=True,
                  consider_top_k=10),
    ]:
        q_result = generate_counterfactuals(ex, query)
        print(q_result)
        ex_df = []
        for change_group in q_result.examples:
            for e in change_group:
                se = Sentence(e.sentence)
                d = e.changed_word_distances()
                cwi = e.changed_word_indices()
                entry = {
                    "Original": ', '.join([text.words[wi] for wi in cwi]),
                    "Counterfactual": ', '.join([se.words[wi] for wi in cwi]),
                    "Klassifikation": f"{e.cls[1]:.2f}",
                    "Distanz": f"{sum([d_i ** 2 for d_i in d]) + COST_PER_ADDITIONAL_WORD * len(d):.2f}",
                }
                ex_df.append(entry)
        ex_df = pd.DataFrame(ex_df)
        # result += f"\n\n\nOriginale Klassifikation: {text.calc_sentiment()[1]:.2f} \\\\ \n"
        # result += f"\nMBS={query.mini_beam_search}, ST={query.allow_splitting}, MAX\\_WORDS={query.consider_max_words} \\\\ \n"
        result += "\n\n"
        result += ex_df.to_latex(
            index=False,
            caption=f"{query.alg()} (Originale Klassifikation: {text.calc_sentiment()[1]:.2f})")
    return result
def get_sentence_word_mapping(text: str) -> List[Tuple[int, int]]:
    # mapping can be longer than the original, because Sentence(..) is limited to 512 tokens
    tok_sen = nltk_tokenizer.sent_tokenize(text)
    original = model_config.tokenizer.clean_up_tokenization(" ".join(
        Sentence(text.lower().strip()).words))
    sentence = Sentence(original)
    word_sentence_map = []
    last_start = 0
    for xp in tok_sen:
        cxp = clean_for_comparison(xp.lower())
        for i in range(last_start, len(sentence.words) + 1):
            ccomp = clean_for_comparison("".join(sentence.words[last_start:i]))
            if ccomp == cxp:
                word_sentence_map.append((last_start, i - 1))
                last_start = i
                break
    if last_start != len(sentence.words):
        word_sentence_map.append((last_start, len(sentence.words)))

    # merge short sentences (span <= MIN_SEN_LEN words) into their neighbours
    while True:
        dists = [(stop - start) for (start, stop) in word_sentence_map]
        if all([d > MIN_SEN_LEN for d in dists]):
            return word_sentence_map
        sen_idx = [d > MIN_SEN_LEN for d in dists].index(False)
        # length of the left neighbour
        if sen_idx - 1 < 0:
            left_len = None
        else:
            o_start, o_stop = word_sentence_map[sen_idx - 1]
            left_len = o_stop - o_start
        # length of the right neighbour
        if sen_idx + 1 >= len(word_sentence_map):
            right_len = None
        else:
            o_start, o_stop = word_sentence_map[sen_idx + 1]
            right_len = o_stop - o_start
        if right_len is None and left_len is None:
            return word_sentence_map
        elif left_len is None or (right_len is not None and left_len < right_len):
            # merge with the right neighbour
            new_entry = (word_sentence_map[sen_idx][0],
                         word_sentence_map[sen_idx + 1][1])
            word_sentence_map[sen_idx:sen_idx + 2] = [new_entry]
        elif right_len is None or (right_len is not None and right_len <= left_len):
            # merge with the left neighbour
            new_entry = (word_sentence_map[sen_idx - 1][0],
                         word_sentence_map[sen_idx][1])
            word_sentence_map[sen_idx - 1:sen_idx + 1] = [new_entry]
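# Illustrative sketch (values made up): get_sentence_word_mapping returns inclusive
# (start, stop) word-index spans, one per (possibly merged) sub-sentence, which is
# how _gen_cf_ex_long slices a long text.
def _demo_sentence_word_mapping():
    text = "i loved the first half . the ending , however , felt rushed and hollow ."
    words = Sentence(model_config.tokenizer.clean_up_tokenization(text)).words
    for start, stop in get_sentence_word_mapping(text):
        print((start, stop), " ".join(words[start:stop + 1]))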
def example_as_latex_string(idx):
    example_with_ans = data[idx]
    res_obj: Result
    res_obj = example_with_ans[1]
    if len(res_obj.examples) == 0:
        return ""
    cf_ex = [
        escape_latex(w)
        for w in Sentence(res_obj.examples[0][0].sentence).words
    ]
    origi = [
        escape_latex(w)
        for w in Sentence(res_obj.stats.original_sentence).words
    ]
    assert len(cf_ex) == len(origi)
    i = 0
    result_str = f"""
\\begin{{figure}}[h]
\\begin{{center}}
\\begin{{tabular}}{{|l|c|c|}}
\\multicolumn{{3}}{{c}}{{Variante: {res_obj.query.alg()}}} \\\\
\\hline
{{}} & Counterfactual Example & Original \\\\
\\hline
Perplexity & {res_obj.examples[0][0].calc_perplexity():.2f} & {Sentence(res_obj.stats.original_sentence).calc_perplexity():.2f} \\\\
Polarität & {res_obj.examples[0][0].cls[1]:.2f} & {res_obj.stats.original_classification[1]:.2f} \\\\
\\hline
\\end{{tabular}}
\\end{{center}}
"""
    while i != len(cf_ex):
        if cf_ex[i] != origi[i]:
            combine_from, combine_to = i, i + 1
            while origi[combine_to] != cf_ex[combine_to]:
                combine_to += 1
            left_part = ' '.join(origi[combine_from:combine_to])
            right_part = ' '.join(cf_ex[combine_from:combine_to])
            change = "\\mbox{[" + tex_color(
                left_part, "blue", False) + " \\to{} " + tex_color(
                    right_part, "blue", True) + "]} "
            result_str += change
            i += (combine_to - combine_from)
        else:
            result_str += cf_ex[i] + " "
            i += 1
    result_str += f"\n\\caption{{ Beispiel {idx // 4} aus der {DATASET} Evaluation ({res_obj.query.alg()}) }}\n\\label{{{idx // 4}_{res_obj.query.alg()}}}\n\\end{{figure}} "
    return result_str + "\n\n\n"
def calc_sentence_edit_schedule(query, sw_map, text):
    if USE_GRADIENTS_FOR_SENTENCE_RELEVANCE:
        gradients = text.calc_gradients(query.wanted_cls)
        sent_grad = defaultdict(list)
        for i in range(len(text.words)):
            idx = [a <= i <= b for (a, b) in sw_map].index(True)
            sent_grad[idx].append(gradients[i])
        gradients_per_sentence = [(si, np.linalg.norm(g))
                                  for (si, g) in sent_grad.items()]
        edit_sentence_order = [
            y[0] for y in sorted(
                gradients_per_sentence, key=lambda x: x[1], reverse=True)
        ]
    else:
        # use distance to wanted classification for relevance
        dist_to_wanted_cls = []
        for start, stop in sw_map:
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text.words[start:stop + 1]))
            cls = Sentence(sub).calc_sentiment()
            dst = mse_loss(torch.tensor(cls),
                           torch.tensor(query.wanted_cls, dtype=torch.float32))
            dist_to_wanted_cls.append(dst)
        edit_sentence_order = np.argsort(-np.array(dist_to_wanted_cls))
    return edit_sentence_order
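# Usage sketch (illustrative): rank the sub-sentences of a review, most relevant
# first, using the mapping from get_sentence_word_mapping. The sentence text and
# the wanted classification are made up.
def _demo_sentence_schedule():
    text = Sentence("great cast . the story drags badly in the second half .")
    sw_map = get_sentence_word_mapping(text.text)
    order = calc_sentence_edit_schedule(Query([0., 1.]), sw_map, text)
    print(sw_map, order)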
def default(self, obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, Example):
        ex_dict = dict(obj.__dict__)
        ex_dict["sentence"] = Sentence(clean(obj.sentence)).words
        ex_dict["perplexity"] = Sentence(obj.sentence).calc_perplexity()
        del ex_dict["prediction_indices"]
        del ex_dict["schedule_indices"]
        del ex_dict["sentence_indices"]
        del ex_dict["changes"]
        return ex_dict
    else:
        return super(CustomEncoder, self).default(obj)
def info(self):
    result_str = ""
    sen = Sentence(self.stats.original_sentence)
    result_str += f"pp={sen.calc_perplexity():.2f}, {len(sen.words)} words, y={np.round(self.stats.original_classification, 2)}\n"
    result_str += f"Duration: {self.stats.total_duration} | {self.stats.find_matching_words_duration} searching words | {self.stats.merging_duration} merging.\n"
    result_str += f"{self.total_valid_examples()} examples, {len(self.rest)} in rest, found {len(self.examples)} of {self.query.num_needed} groups with different indices.\n"
    for e in self.simple_results():
        result_str += "\t" + e.info() + "\n"
    result_str += "Successful!\n" if self.success else "Query not fulfilled!"
    return result_str
def __init__(self, sentence: str, classification, changes, pred_ind, sched_ind,
             sent_ind):
    self.cls = classification
    self.sentence = sentence
    self.changes: List[Tuple[WordIdx, float]] = changes
    self.prediction_indices = pred_ind  # n-th word in list of alternative words
    self.schedule_indices = sched_ind  # 1st highest gradient word, 2nd hgw, ...
    self.sentence_indices = sent_ind  # 1st sentence, ... only != 0 if splitting text
    self.perplexity = None
    for (i, _) in changes:
        assert i < len(Sentence(self.sentence).words)
    assert len(self.schedule_indices) == len(self.prediction_indices) == len(self.changes)
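# Sketch of the Example invariants (all values below are made up): changes,
# pred_ind and sched_ind are parallel lists with one entry per replaced word.
def _demo_example():
    e = Example("the movie was painfully great .",
                classification=[0.1, 0.9],
                changes=[(4, 0.37)],   # word 4 was replaced, 0.37 is the prediction score
                pred_ind=[2],          # it was the 3rd-ranked mask prediction
                sched_ind=[0],         # edited in the first schedule step
                sent_ind=[0])          # sub-sentence 0, i.e. no text splitting
    print(e.changed_word_indices(), e.cls)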
def extract_colors(r, per_sentence):
    text = r.stats.original_sentence
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
        "", ["white", "blue"])
    text_s = Sentence(text)
    if not per_sentence:
        word_gradients = text_s.calc_gradients(r.query.wanted_cls)
        wgn = np.interp(word_gradients,
                        (np.min(word_gradients), np.max(word_gradients)),
                        (0., 1.))
        fg, bg = [], []
        for ind in range(len(wgn)):
            ctpl = cmap(wgn[ind])[:3]
            tc = twofivefive(text_color(ctpl))
            ctpl = twofivefive(ctpl)
            fg.append(str(tc)[1:-1])
            bg.append(str(ctpl)[1:-1])
        return fg, bg
    else:
        sw_map = get_sentence_word_mapping(text)
        edit_sentence_order = calc_sentence_edit_schedule(r.query, sw_map, text_s)
        fg, bg = [], []
        for enm_si, si in enumerate(edit_sentence_order):
            start, stop = sw_map[si]
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text_s.words[start:stop + 1]))
            subtext = Sentence(sub)
            word_gradients = np.array(subtext.calc_gradients(r.query.wanted_cls))
            word_gradients /= np.linalg.norm(word_gradients)
            wgn = np.interp(word_gradients,
                            (np.min(word_gradients), np.max(word_gradients)),
                            (0., 1.))
            for ind in range(len(wgn)):
                ctpl = cmap(wgn[ind])[:3]
                tc = twofivefive(text_color(ctpl))
                ctpl = twofivefive(ctpl)
                fg.append(str(tc)[1:-1])
                bg.append(str(ctpl)[1:-1])
        return fg, bg
def merge_all_changes_into_one(grouped_items, original) -> Example:
    # Merge all available single-word changes into one candidate sentence
    merge_all_changes_into_this_one = list(original.words)
    merge_changes = []
    pred_indices = []
    sched_indices = []
    sent_indices = []
    for changes_a, la in grouped_items:
        if len(changes_a) == 1 and len(la) > 0:
            e: Example = la[0]
            s = Sentence(e)
            merge_all_changes_into_this_one[changes_a[0]] = s.words[changes_a[0]]
            merge_changes += e.changes
            pred_indices.append(e.prediction_indices)
            sched_indices.append(e.schedule_indices)
            sent_indices.append(e.sentence_indices)
    merge_all_changes_into_this_one = " ".join(merge_all_changes_into_this_one)
    all_sentiment = calc_sentiment_batch([merge_all_changes_into_this_one])
    return Example(merge_all_changes_into_this_one, all_sentiment[0],
                   merge_changes, pred_indices, sched_indices, sent_indices)
def __repr__(self):
    # DEBUG HELP
    left_right_window = 6
    sen = Sentence(self.sentence)
    changed_indices = self.changed_word_indices()
    sp = [w if i not in changed_indices else f"#{w}#"
          for i, w in enumerate(sen.words)]
    relevant_parts = ' '.join(sp)
    if len(relevant_parts) > 160:
        relevant_parts = []
        for i, (w_id, score) in enumerate(self.changes):
            if i != 0:
                relevant_parts.append("[...]")
            word = sp[w_id]
            left = sp[max(0, w_id - left_right_window):w_id]
            right = sp[w_id + 1:w_id + left_right_window]
            relevant_parts.extend(left + [word] + right)
        relevant_parts = ' '.join(relevant_parts)
    return f"PRED_IDX={self.prediction_indices}, " \
           f"cls={np.round(self.cls, 2)}, " \
           f"idxs={changed_indices} " \
           f"dist={np.round(list(dict(self.changes).values()), 2)}, {relevant_parts}"
def one_mask():
    max_words = 20
    result = defaultdict(list)
    result_dist_diff_only = defaultdict(list)
    result_wdiff = defaultdict(list)
    for enm in range(len(dataset)):
        print(f"\nSentence {enm}: ", end="")
        data = dataset[enm]
        x, y = data["sentence"], data["label"]
        x = model_config.tokenizer.clean_up_tokenization(x)
        y = [0, 1] if y == 1 else [1, 0]
        y_prime = [1 - y[0], 1 - y[1]]
        s = Sentence(x)
        word_gradients = s.calc_gradients(y_prime)
        sorted_highest = np.argsort(word_gradients)[::-1]
        for observed_idx in sorted_highest[:10]:
            # observed_idx = sorted_highest[0]
            print(f"{observed_idx},", end="")
            sdir = 1 if len(s.words) - observed_idx > observed_idx else -1
            alt_s = Sentence(s.get_with_masked([observed_idx]))
            original_answer = alt_s.calc_mask_predictions()[observed_idx]
            if len(original_answer) != 0:
                for mask_distance in range(1, max_words):
                    if observed_idx + mask_distance * sdir < 0 or \
                            observed_idx + mask_distance * sdir >= len(alt_s.words):
                        continue
                    new_sen = Sentence(
                        alt_s.get_with_masked(
                            [observed_idx + mask_distance * sdir, observed_idx]))
                    alt_sen_pred = new_sen.calc_mask_predictions()[observed_idx]
                    avg_distance, avg_word_diff, dist_diff_only = find_differences(
                        original_answer, alt_sen_pred)
                    # print(f"Mask offset {mask_distance}: dist={avg_distance:.3f} word_dist={avg_word_diff:.3f}")
                    result[mask_distance].append(avg_distance)
                    result_wdiff[mask_distance].append(avg_word_diff)
                    result_dist_diff_only[mask_distance].append(dist_diff_only)
        if enm % 50 == 0 or enm == len(dataset) - 1:
            fig = plt.figure(figsize=(11, 8))
            plt.title(
                "Relation Bewertung der Wörter zur Nähe des nächsten [MASK]-Token")
            plt.xlabel("Entfernung zum zusätzlichen [MASK]-Token")
            plt.xlim(0, max_words)
            plt.ylim(0., 0.65)
            plt.ylabel("Veränderung der Bewertung")
            idx, mean, std = list(
                zip(*[(md, np.mean(lst), np.std(lst))
                      for (md, lst) in result_wdiff.items()]))
            mean = np.array(mean)
            std = np.array(std)
            plt.plot(idx, mean, color='r', label="Wort-Unterschiede")
            plt.fill_between(idx, mean - std, mean + std, color='r', alpha=.2)
            idx, mean, std = list(
                zip(*[(md, np.mean(lst), np.std(lst))
                      for (md, lst) in result_dist_diff_only.items()]))
            mean = np.array(mean)
            std = np.array(std)
            plt.plot(idx, mean, color='green', label="Distanz-Unterschiede")
            plt.fill_between(idx, mean - std, mean + std, color='green', alpha=.2)
            plt.xticks(idx)
            plt.legend()
            plt.savefig(f'{root}saved_plots/all/_besser_{enm}.png')
            # plt.show()
            plt.close(fig)
def two_mask():
    max_words = 15
    result = defaultdict(list)
    result_dist_diff_only = defaultdict(list)
    result_wdiff = defaultdict(list)
    for enm in range(len(dataset)):
        print(f"\nSentence {enm}: ", end="")
        data = dataset[enm]
        x, y = data["sentence"], data["label"]
        x = model_config.tokenizer.clean_up_tokenization(x)
        y = [0, 1] if y == 1 else [1, 0]
        y_prime = [1 - y[0], 1 - y[1]]
        s = Sentence(x)
        word_gradients = s.calc_gradients(y_prime)
        sorted_highest = np.argsort(word_gradients)[::-1]
        for observed_idx in sorted_highest[:10]:
            print(f"{observed_idx},", end="")
            alt_s = Sentence(s.get_with_masked([observed_idx]))
            original_answer = alt_s.calc_mask_predictions()[observed_idx]
            if len(original_answer) != 0:
                for mask_distance1 in range(-max_words, max_words + 1):
                    for mask_distance2 in range(-max_words, max_words + 1):
                        if not (0 <= observed_idx + mask_distance1 < len(alt_s.words)):
                            continue
                        if not (0 <= observed_idx + mask_distance2 < len(alt_s.words)):
                            continue
                        new_sen = Sentence(
                            alt_s.get_with_masked(
                                [observed_idx + mask_distance1, observed_idx]))
                        new_sen = Sentence(
                            new_sen.get_with_masked(
                                [observed_idx + mask_distance2, observed_idx]))
                        alt_sen_pred = new_sen.calc_mask_predictions()[observed_idx]
                        avg_distance, avg_word_diff, dist_diff_only = find_differences(
                            original_answer, alt_sen_pred)
                        result[(mask_distance1, mask_distance2)].append(avg_distance)
                        result_wdiff[(mask_distance1, mask_distance2)].append(avg_word_diff)
                        result_dist_diff_only[(mask_distance1, mask_distance2)].append(dist_diff_only)
        if enm % 2 == 0 or enm == len(dataset) - 1:
            all_variants = [(result, "result"), (result_wdiff, "wdiff"),
                            (result_dist_diff_only, "ddiff")]
            with open('used_data.pickle', 'wb') as handle:
                pickle.dump(all_variants, handle)
            for res, name in all_variants:
                data = [(k, np.mean(v)) for k, v in res.items()]
                matrix = np.zeros(shape=(2 * max_words + 1, 2 * max_words + 1))
                for (i, j), m in data:
                    matrix[(i + max_words), j + max_words] = m
                plt.figure(figsize=(15, 12))
                ax = sns.heatmap(
                    np.flip(matrix, axis=0),
                    linewidth=0.0,
                    xticklabels=list(range(-max_words, max_words + 1)),
                    yticklabels=list(reversed(range(-max_words, max_words + 1))))
                ax.set_title(
                    "Durchschnittliche Veränderung der Wörter bei 2 MASK-Tokens")
                plt.savefig(f'{root}saved_plots/2d/{name}_{enm}.pdf')
                plt.close()
def clean(s):
    return model_config.tokenizer.clean_up_tokenization(' '.join(
        Sentence(s).words)).replace(" - ", "-").lower()
import gensim.downloader as api
import pandas as pd
from gensim.models.keyedvectors import Word2VecKeyedVectors

from config import model_config
from search_utils.Sentence import Sentence

wv = api.load('word2vec-google-news-300')
wv: Word2VecKeyedVectors

s = """my thoughts were focused on the characters ."""
s = model_config.tokenizer.clean_up_tokenization(s)
sen_s = Sentence(s)

df_wv = dict()
df = dict()
for i in range(len(sen_s.words)):
    word = sen_s.words[i]
    if word in ".:,?!-(){}[]/\\|&%":
        continue
    new_s = sen_s.get_with_masked(i)
    bert_preds = Sentence(new_s).calc_mask_predictions()[i]
    wv_preds = wv.most_similar(word, topn=15, restrict_vocab=200_000)
    df_wv[word] = list(list(zip(*wv_preds))[0])
    df[word] = list(zip(*bert_preds))[0][:15]

df = pd.DataFrame(df)
df_wv = pd.DataFrame(df_wv)
def _gen_cf_ex(text: str, query) -> Result:
    """
    Parameters
    ----------
    text : str
    query : search_helper.classes.Query.Query

    Returns
    -------
    Result
    """
    gc.collect()
    torch.cuda.empty_cache()
    stats = Statistics(original_sentence=text)
    stats.total_duration.resume()
    stats.query = query
    text = model_config.tokenizer.clean_up_tokenization(text)
    text = Sentence(text)
    original_cls = text.calc_sentiment()
    stats.original_classification = original_cls
    examples = []
    schedule = text.calc_edit_schedule(query)
    for schedule_idx, (edit_strategy, word_indices) in enumerate(schedule):
        assert isinstance(schedule_idx, int)
        with stats.find_matching_words_duration:
            if edit_strategy == Edit.NEIGHBOURS:
                batch = expand_sentence_neighbour(text.text, query,
                                                  word_indices, schedule_idx)
            else:
                word_indices, mask_indices = word_indices
                batch = expand_sentence(text.text, word_indices, query,
                                        mask_indices, schedule_idx)
            # filtering only 'relevant' makes the found words more extreme
            # relevant_batch = [b for b in batch if abs(original_cls[cls_idx] - b.cls[cls_idx]) > MIN_SENTIMENT_CHANGE]
            relevant_batch = batch
            debug(
                f"{len(examples)} examples total | {len(batch)} new for {len(word_indices)} words with {schedule_idx} highest gradient"
            )
            stats.tried_examples += len(batch)
            examples.extend(relevant_batch)
        with stats.merging_duration:
            num_per_group = max(4 - schedule_idx, 1) if schedule_idx < 10 else -1
            merged_examples = generate_merged_examples(text, examples, query,
                                                       num_per_group)
            examples.extend(merged_examples)
            stats.tried_examples += len(merged_examples)
        results: Result = examples_are_sufficient(examples, query)
        if results.sufficient():
            stats.total_duration.pause()
            assert stats.all_timers_stopped()
            results.stats = stats
            results.query = query
            return results
    results: Result = examples_are_sufficient(examples, query)
    stats.total_duration.pause()
    assert stats.all_timers_stopped()
    results.stats = stats
    results.query = query
    return results
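# Minimal usage sketch for _gen_cf_ex (illustrative): the sentence is one of the
# example inputs used elsewhere in this repository, and the Query mirrors the
# parameters of the evaluation script.
def _demo_gen_cf_ex():
    query = Query(wanted_cls=[0., 1.], max_delta=0.4, num_needed=5,
                  consider_max_words=500, consider_top_k=15)
    res = _gen_cf_ex("it has all the excitement of eating oatmeal .", query)
    print(res.info())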
def _gen_cf_ex_long(text: str, query) -> Result:
    """
    Parameters
    ----------
    text : str
    query : search_helper.classes.Query.Query

    Returns
    -------
    Result
    """
    gc.collect()
    torch.cuda.empty_cache()
    stats = Statistics(original_sentence=text)
    stats.total_duration.resume()
    stats.query = query
    stats.original_sentence = text
    text = model_config.tokenizer.clean_up_tokenization(text)
    text = Sentence(text)
    stats.original_classification = text.calc_sentiment()
    sw_map = get_sentence_word_mapping(text.text)
    edit_sentence_order = calc_sentence_edit_schedule(query, sw_map, text)
    examples = []
    for sentence_sched_idx, si in enumerate(
            edit_sentence_order[:query.consider_max_sentences]):
        debug("> subsentence")
        start, stop = sw_map[si]
        sub = model_config.tokenizer.clean_up_tokenization(" ".join(
            text.words[start:stop + 1]))
        sub_query = copy(query)
        subresult: Result = _gen_cf_ex(sub, sub_query)
        if stats.tried_sentences is None:
            stats.tried_sentences = 0
        stats.tried_sentences += 1
        stats.add(subresult.stats)
        subexample: Example
        best_subexamples = [j[0] for j in subresult.examples]
        debug(
            f"SUBEXAMPLE SEARCH FOUND {len(best_subexamples)} of {sub_query.num_needed}"
        )
        if len(best_subexamples) < query.num_needed:
            best_subexamples.extend(
                subresult.rest[:(query.num_needed - len(best_subexamples))])
            debug(f"Added from rest, now {len(best_subexamples)}")
        for subexample in best_subexamples:
            new_sen = list(text.words)
            new_sen[start:stop + 1] = [subexample.sentence]
            new_sen = Sentence(
                model_config.tokenizer.clean_up_tokenization(" ".join(new_sen)))
            new_cls = new_sen.calc_sentiment()
            # print(np.round(new_cls, 3), new_sen.text)
            new_changes = [(pos + start, dist)
                           for (pos, dist) in subexample.changes]
            e = Example(new_sen.text, new_cls, new_changes,
                        pred_ind=subexample.prediction_indices,
                        sched_ind=subexample.schedule_indices,
                        sent_ind=[sentence_sched_idx])
            examples.append(e)
        with stats.merging_duration:
            debug("> subsentence merge")
            merged_examples = generate_merged_examples(text, examples, query, 1)
            examples.extend(merged_examples)
            stats.tried_examples += len(merged_examples)
            debug("< subsentence merge")
        results: Result = examples_are_sufficient(examples, query)
        if results.sufficient():
            stats.total_duration.pause()
            assert stats.all_timers_stopped()
            results.stats = stats
            results.query = query
            results.sentence_map = sw_map
            return results
        debug("< subsentence")
    results: Result = examples_are_sufficient(examples, query)
    stats.total_duration.pause()
    assert stats.all_timers_stopped()
    results.stats = stats
    results.query = query
    results.sentence_map = sw_map
    return results
from search_utils.Sentence import Sentence

Sentence("David [MASK] is a spanish tennis player.").calc_mask_predictions()  # ferrer
Sentence("David [MASK] is a famous musician.").calc_mask_predictions()  # bowie
Sentence("David [MASK] is a musician.").calc_mask_predictions()  # smith
Sentence("[MASK] is a musician.").calc_mask_predictions()  # he
Sentence("David [MASK] was the prime minister.").calc_mask_predictions()  # cameron
Sentence("David [MASK] is the prime minister.").calc_mask_predictions()  # cameron
Sentence("It's over Anakin! I have the [MASK] ground!").calc_mask_predictions()  # high
Sentence("It's over Anakin! I have the high [MASK]!").calc_mask_predictions()  # ground
Sentence("It's over Anakin! I [MASK] the high ground!").calc_mask_predictions()  # need
Sentence("Boy, that [MASK] quickly!").calc_mask_predictions()  # was, happened, moved
Sentence("The man who passes the sentence should swing the [MASK].").calc_mask_predictions()  # chair, bail, stick, wheel
Sentence("A Lannister always pays his [MASK].").calc_mask_predictions()  # debts, taxes, way
Sentence("This is the [MASK].").calc_mask_predictions()  # end, truth, way
Sentence("That's what I do: I [MASK] and I know things.").calc_mask_predictions()  # see, think, look
def calc_perplexity(self):
    if self.perplexity is None:
        self.perplexity = Sentence(self.sentence).calc_perplexity()
    return self.perplexity
def marked(self):
    ps = Sentence(self.sentence).words
    bold_indices = list(zip(*self.changes))[0]
    for i, wi in enumerate(bold_indices):
        ps[wi] = f"#{i + 1}#={ps[wi]}"
    return " ".join(ps)
def generate_gradient_highlights():
    text = Sentence(ex)
    word_gradients = np.array(text.calc_gradients(y_prime))
    word_gradients /= np.linalg.norm(word_gradients)
    wgn = np.interp(word_gradients,
                    (np.min(word_gradients), np.max(word_gradients)), (0., 1.))
    """
    \\newcommand{\\reducedstrut}{\\vrule width 0pt height .9\\ht\\strutbox depth .9\\dp\\strutbox\\relax}
    \\newcommand{\\mycb}[3]{%
      \\begingroup
      \\setlength{\\fboxsep}{0pt}%
      \\colorbox[rgb]{#1}{ \\strut \\textcolor[rgb]{#2}{#3} }%
      \\endgroup
    }
    """
    result = ""  # new command overwritten error
    for cmap in [
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["blue", "white", "red"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "forestgreen"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "orangered"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "crimson"]),
            matplotlib.colors.LinearSegmentedColormap.from_list(
                "", ["white", "blue"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "red"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "black"]),
    ]:
        result += f""
        for ind, w in enumerate(text.words):
            ctpl = cmap(wgn[ind])[:3]
            tc = str(text_color(ctpl))[1:-1]
            ctpl = [round(v, 3) for v in ctpl]
            rgba = str(ctpl)[1:-1]
            result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
        result += f"\n\\\\ Top 10: {', '.join(np.array(text.words)[np.argsort(-wgn)][:10])}\\ \n\n\n"

        # Sentence-wise calc gradients
        sw_map = get_sentence_word_mapping(text.text)
        edit_sentence_order = calc_sentence_edit_schedule(Query(y_prime), sw_map, text)
        for enm_si, si in enumerate(edit_sentence_order):
            start, stop = sw_map[si]
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text.words[start:stop + 1]))
            subtext = Sentence(sub)
            word_gradients = np.array(subtext.calc_gradients(y_prime))
            word_gradients /= np.linalg.norm(word_gradients)
            wgn = np.interp(word_gradients,
                            (np.min(word_gradients), np.max(word_gradients)),
                            (0., 1.))
            result += f"{enm_si + 1} Satz (vorher {si + 1}. Satz): "
            for ind, w in enumerate(subtext.words):
                ctpl = cmap(wgn[ind])[:3]
                tc = str(text_color(ctpl))[1:-1]
                ctpl = [round(v, 3) for v in ctpl]
                rgba = str(ctpl)[1:-1]
                result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
            result += "\\\\ \n\n"
    return result
dataset, data = list(pickle.load(file).items())[0]
model_config.load(dataset, "gpt2")
json_datapoint = None
result: Result
for another_idx, (ds_idx, result) in enumerate(tqdm(data)):
    max_ds_idx = max(ds_idx, max_ds_idx)
    if last_ds_idx != ds_idx:
        if another_idx != 0:
            json_all.append(json_datapoint)
        last_ds_idx = ds_idx
        fg, bg = extract_colors(result, per_sentence=False)
        fg_ps, bg_ps = extract_colors(result, per_sentence=True)
        json_datapoint = {
            'sentence': Sentence(clean(result.stats.original_sentence)).words,
            'foreground': fg,
            'foreground_per_sen': fg_ps,
            'background': bg,
            'background_per_sen': bg_ps,
            'original_cls': result.stats.original_classification,
            'wanted_cls': result.query.wanted_cls,
            'original_ppl': np.round(
                Sentence(result.stats.original_sentence).
# text = "if you enjoy more thoughtful comedies with interesting conflicted characters ; this one is for you ." # text = "no place for this story to go but down" # text = "minority report is exactly what the title indicates , a report ." # text = "it 's refreshing to see a girl-power movie that does n't feel it has to prove anything ." # text = "it has all the excitement of eating oatmeal ." # text = "his healthy sense of satire is light and fun ..." # text = "Ultimately feels empty and unsatisfying, like swallowing a Communion wafer without the wine." # text = "the action sequences are fun and reminiscent of combat scenes from the star wars series ." # text = "with jump cuts , fast editing and lots of pyrotechnics , yu clearly hopes to camouflage how bad his movie is ." # text = "why make a documentary about these marginal historical figures ?" # text = "the character of zigzag is not sufficiently developed to support a film constructed around him ." # text = "watchable up until the point where the situations and the dialogue spin hopelessly out of control" text = model_config.tokenizer.clean_up_tokenization(text) s = Sentence(text) result = _gen_cf_ex( text, Query(wanted_cls=[0., 1.], max_delta=0.4, num_needed=5, consider_max_words=500, consider_top_k=15)) print(result.info()) result = [lst[0] for lst in result.examples] data = {i: get_scatter_data(i) for i in range(len(s.words))} colors = ["red", "green", "orange", "magenta", "lawngreen"] # cmap_scale = cm.ScalarMappable(norm=mpl.colors.Normalize(vmin=0, vmax=len(result)), cmap=cm.gist_rainbow) fig = plt.figure(figsize=(10, 14))
def generate_merged_examples(original, examples: List[Example], query,
                             curr_iter_idx):
    """
    Parameters
    ----------
    original : Sentence
    examples : List of Example
    query : search_helper.classes.Query.Query
    curr_iter_idx : int

    Returns
    -------
    list of merged examples
    """
    generated = []
    cs = query.c if isinstance(query.c, list) else [query.c]
    for c in cs:
        grouped_by_changed_indices = defaultdict(list)
        examples = examples_sorted(examples, query.wanted_cls,
                                   c)[:MAX_EXAMPLES_TO_CONSIDER_FOR_MERGING]
        for e in examples:
            if len(grouped_by_changed_indices[tuple(e.changed_word_indices())]) <= 5:
                grouped_by_changed_indices[tuple(e.changed_word_indices())].append(e)
        grouped_items = grouped_by_changed_indices.items()
        grouped_keys = [set(x) for x in grouped_by_changed_indices.keys()]
        if len(grouped_items) <= 1:
            # -> nothing to merge
            return []
        if curr_iter_idx == -1:
            return [merge_all_changes_into_one(grouped_items, original)]
        debug(f"MERGING ({len(grouped_keys)}) {grouped_keys}")
        for a_idx, (changes_a, la) in enumerate(grouped_items):
            for b_idx, (changes_b, lb) in enumerate(grouped_items):
                # skip if these changes were already combined, overlap,
                # or lie too close together
                if set(changes_a + changes_b) in grouped_keys \
                        or len(set(changes_a).intersection(set(changes_b))) > 0 \
                        or min_distance(changes_a, changes_b) < MIN_DISTANCE_BETWEEN_CHANGED_WORDS:
                    continue
                resulting_edit_size = len(set(changes_a + changes_b))
                take_n = max(1, 5 - resulting_edit_size)
                # Merge examples
                a: Example
                b: Example
                for a in la[:take_n]:
                    for b in lb[:take_n]:
                        sa, sb = Sentence(a), Sentence(b)
                        new = list(original.words)
                        for na in a.changed_word_indices():
                            new[na] = sa.words[na]
                        for na in b.changed_word_indices():
                            new[na] = sb.words[na]
                        generated.append(
                            (" ".join(new), tuple(a.changes + b.changes),
                             a.prediction_indices + b.prediction_indices,
                             a.schedule_indices + b.schedule_indices,
                             a.sentence_indices + b.sentence_indices))
                        grouped_keys.append(set(dict(a.changes + b.changes).keys()))
    # can indent 2?
    debug(f"MERGE: generated {len(generated)} MERGED (#c={len(cs)}) examples")
    if len(generated) == 0:
        return []
    unzipped = list(zip(*generated))
    sentence_list = list(unzipped[0])
    if len(sentence_list) == 0:
        return []
    sentiment_batch = calc_sentiment_batch(sentence_list)
    result = []
    for i, (sen, changes, pred_idx, sched_idx, sent_idx) in enumerate(generated):
        example = Example(sen, sentiment_batch[i], list(changes),
                          pred_ind=pred_idx,
                          sched_ind=sched_idx,
                          sent_ind=sent_idx)
        result.append(example)
    return result
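# Illustrative sketch of the merge step (all values are made up): two single-edit
# candidates for the same original sentence, touching disjoint and sufficiently
# distant word indices, are combined into one example that applies both changes.
def _demo_generate_merged_examples():
    original = Sentence("the movie was painfully dull and completely pointless .")
    a = Example("the film was painfully dull and completely pointless .",
                [0.8, 0.2], [(1, 0.1)], pred_ind=[0], sched_ind=[0], sent_ind=[0])
    b = Example("the movie was painfully dull and completely brilliant .",
                [0.4, 0.6], [(7, 0.3)], pred_ind=[1], sched_ind=[1], sent_ind=[0])
    query = Query(wanted_cls=[0., 1.], c=0.2, num_needed=5)
    for m in generate_merged_examples(original, [a, b], query, curr_iter_idx=0):
        print(m.changed_word_indices(), m.sentence)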
def expand_sentence_neighbour(text_p: Union[str, Example], query,
                              word_indices: List[int],
                              schedule_idx) -> List[Example]:
    """
    Parameters
    ----------
    schedule_idx
    text_p
    word_indices
    query : search_helper.classes.Query.Query

    Returns
    -------
    """
    if isinstance(text_p, Example):
        text = text_p.sentence
    else:
        text = text_p
    text = Sentence(text)
    word_indices = list(word_indices)
    word_indices = [
        wi for wi in word_indices
        if not all([s in ":,;.*" for s in text.words[wi]])
    ]
    if len(word_indices) == 0:
        return []
    word_indices = word_indices[:MBS_DEPTH]
    masked_text = Sentence(text.get_with_masked(word_indices))
    initial_example = [Example(masked_text.text, [], [], [], [], [])]
    results = []
    for word_idx in word_indices:
        debug(f"expand neighbours: {word_idx} of {word_indices}")
        tmp_results = examples_sorted(results, query.wanted_cls,
                                      query.c)[:MBS_BEAMS]
        results = []
        for interm_example in (initial_example
                               if word_idx == word_indices[0] else tmp_results):
            intermediate_sen = Sentence(interm_example)
            predictions = intermediate_sen.calc_mask_predictions(
                query.consider_max_words)
            if not predictions[word_idx]:
                continue
            sentences = []
            for predicted_token, score in predictions[word_idx]:
                new_sen = intermediate_sen.replace_mask(word_idx, predicted_token)
                sentences.append(new_sen)
            classification = calc_sentiment_batch(sentences)
            for i, (predicted_token, score) in enumerate(predictions[word_idx]):
                results.append(
                    Example(sentences[i], classification[i],
                            interm_example.changes + [(word_idx, score)],
                            pred_ind=interm_example.prediction_indices + [i],
                            sched_ind=interm_example.schedule_indices + [schedule_idx],
                            sent_ind=[0]))
    return examples_sorted(results, query.wanted_cls, query.c)
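# Usage sketch for the mini-beam-search expansion (illustrative): the listed
# indices are masked at once and then filled left to right, keeping the MBS_BEAMS
# best intermediate candidates; sentence, indices and Query values are made up.
def _demo_expand_sentence_neighbour():
    query = Query(wanted_cls=[0., 1.], c=0.2, num_needed=5, consider_top_k=10)
    candidates = expand_sentence_neighbour(
        "the plot was thin and the acting was wooden .", query, [3, 8],
        schedule_idx=0)
    for cand in candidates[:5]:
        print(cand.cls, cand.changes, cand.sentence)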