def generate_example_table():
    """Render the counterfactuals for `ex` (module-level global) as LaTeX tables,
    one table per algorithm variant."""
    result = ""
    text = Sentence(ex)
    for query in [
            Query(wanted_cls=y_prime, c=0.2, num_needed=5,
                  mini_beam_search=False, allow_splitting=False, consider_top_k=10),
            Query(wanted_cls=y_prime, c=0.2, num_needed=5,
                  mini_beam_search=True, allow_splitting=False, consider_top_k=10),
            Query(wanted_cls=y_prime, c=0.2, num_needed=5,
                  mini_beam_search=False, allow_splitting=True, consider_top_k=10),
            Query(wanted_cls=y_prime, c=0.2, num_needed=5,
                  mini_beam_search=True, allow_splitting=True, consider_top_k=10),
    ]:
        q_result = generate_counterfactuals(ex, query)
        print(q_result)
        ex_df = []
        for change_group in q_result.examples:
            for e in change_group:
                se = Sentence(e.sentence)
                d = e.changed_word_distances()
                cwi = e.changed_word_indices()
                entry = {
                    "Original": ", ".join([text.words[wi] for wi in cwi]),
                    "Counterfactual": ", ".join([se.words[wi] for wi in cwi]),
                    "Klassifikation": f"{e.cls[1]:.2f}",
                    # squared word distances plus a fixed penalty per changed word
                    "Distanz": f"{sum([d_i ** 2 for d_i in d]) + COST_PER_ADDITIONAL_WORD * len(d):.2f}",
                }
                ex_df.append(entry)
        ex_df = pd.DataFrame(ex_df)
        result += "\n\n"
        result += ex_df.to_latex(
            index=False,
            caption=f"{query.alg()} (Originale Klassifikation: {text.calc_sentiment()[1]:.2f})",
        )
    return result
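# Minimal usage sketch for generate_example_table(). The function reads the
# module-level globals `ex` and `y_prime`; the concrete sentence, target class,
# and output filename below are illustrative assumptions.
ex = "the film suffers from a lack of humor ."
y_prime = [0., 1.]  # flip the classification towards the positive class
with open("example_table.tex", "w", encoding="utf-8") as f:
    f.write(generate_example_table())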
from typing import List, Union


def expand_sentence(text_p: Union[str, Example],
                    word_indices: List[int],
                    query=None,
                    additional_mask_indices: List[int] = None,
                    schedule_idx: int = -1) -> List[Example]:
    """Mask the given word positions, let the language model propose
    replacements, and return one classified Example per (position, replacement)
    pair."""
    if additional_mask_indices is None:
        additional_mask_indices = []
    text = text_p.sentence if isinstance(text_p, Example) else text_p
    text = Sentence(text)
    # drop positions that consist only of punctuation
    word_indices = [
        wi for wi in list(word_indices)
        if not all([s in ":,;.*" for s in text.words[wi]])
    ]
    if len(word_indices) == 0:
        return []
    original_words = {i: text.words[i] for i in word_indices}
    max_words = (query.consider_max_words
                 if query is not None else Query(None).consider_max_words)
    masked_sentence = Sentence(
        text.get_with_masked(word_indices + additional_mask_indices))
    predictions = masked_sentence.calc_mask_predictions(max_words)
    result = []
    for word_idx in word_indices:
        if not predictions[word_idx]:
            continue
        # build all candidate sentences for this position,
        # then classify them in a single batch
        sentences = [
            text.replace_word(word_idx, predicted_token)
            for predicted_token, score in predictions[word_idx]
        ]
        classification = calc_sentiment_batch(sentences)
        for i, (predicted_token, score) in enumerate(predictions[word_idx]):
            if original_words[word_idx] == predicted_token:
                continue  # the model predicted the original word back
            if isinstance(text_p, str):
                e = Example(sentences[i], classification[i],
                            [(word_idx, score)],
                            pred_ind=[i],
                            sched_ind=[schedule_idx],
                            sent_ind=[0])
            else:
                e = Example(sentences[i], classification[i],
                            text_p.changes + [(word_idx, score)],
                            pred_ind=text_p.prediction_indices + [i],
                            sched_ind=text_p.schedule_indices + [schedule_idx],
                            sent_ind=text_p.sentence_indices + [0])
            result.append(e)
    return result
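# Usage sketch for expand_sentence(): propose replacements for a single word
# position and inspect the classified candidates. The sentence, word index, and
# target class are illustrative assumptions.
candidates = expand_sentence("the movie was great .", word_indices=[3],
                             query=Query(wanted_cls=[1., 0.], num_needed=1))
for cand in candidates[:3]:
    print(cand.sentence, cand.cls)  # edited sentence and its class distribution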
def calc_mask_predictions(self, max_num=None):
    if max_num is None:
        max_num = Query(None).consider_max_words
    # word positions of all mask tokens; the -1 offset skips the leading
    # special token (e.g. [CLS]) so the indices line up with self.words
    indices = [
        i - 1 for i, x in enumerate(self.input_ids)
        if x == model_config.tokenizer.mask_token_id
    ]
    assert len(indices) != 0, ("can't use calc_mask_predictions on a sentence "
                               "without a mask token, use calc_word_predictions")
    return self.calc_word_predictions(
        indices if len(indices) > 1 else indices[0], max_num)
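# Usage sketch for calc_mask_predictions(): mask one word, then read the
# candidates at that position, mirroring how expand_sentence() uses it above.
# The sentence and word index are illustrative assumptions.
s = Sentence("the movie was great .")
masked = Sentence(s.get_with_masked([3]))
preds = masked.calc_mask_predictions(max_num=5)
print(preds[3])  # up to five (predicted_token, score) pairs for word index 3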
for enm in tqdm.tqdm(range(new_start, len(dataset))):
    data = dataset[enm]
    x, y = data["text"], data["label"]
    x = model_config.tokenizer.clean_up_tokenization(x)
    if y == -1:
        # test data for SST-2 carries the placeholder label -1; skip it
        info("skipping example with placeholder label -1")
        continue
    y_prime = 1 - y
    y_prime = [1 - y_prime, y_prime]  # flipped label as a class distribution
    for mbs in [True, False]:
        for allow_splitting in [True, False]:
            query = Query(wanted_cls=y_prime,
                          max_delta=0.4,
                          c=0.2,
                          num_needed=1,
                          mask_additional_words=False,
                          mini_beam_search=mbs,
                          allow_splitting=allow_splitting,
                          consider_top_k=20,
                          consider_max_words=500,
                          consider_max_sentences=8)
            r = generate_counterfactuals(x, query)
            results.append((enm, r))
    # checkpoint every five examples so a crashed Colab run can be resumed
    fname = f"{new_start}_to_{enm}_on_{gpu_name()}_imdb_{date.today()}.pickle"
    if enm % 5 == 0 and enm != 0:
        try:
            path = f"/content/drive/My Drive/{fname}"
            with open(path, "wb") as file:
                pickle.dump({"imdb": results}, file)
            info("saved")
        except Exception as e:
            info(f"probably not running on colab: {e}")
            # with open(fname, "wb") as file:
            #     pickle.dump({DATASET: results}, file)
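# Sketch for reloading a checkpoint written by the loop above for later
# evaluation; `fname` refers to whichever pickle the loop last produced.
import pickle

with open(fname, "rb") as file:
    saved = pickle.load(file)
for enm, r in saved["imdb"]:
    print(enm, len(r.examples))  # how many counterfactual groups were found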
data = [ ("it 's a charming and often affecting journey .", NEGATIVE), ("although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women .", NEGATIVE), ("... the film suffers from a lack of humor ( something needed to balance out the violence ) ...", POSITIVE), ("in its best moments , resembles a bad high school production of grease , without benefit of song .", NEGATIVE) ] all_result_str = "" for idx, (sen, y_prime) in enumerate(data): results = [] for ds in ["imdb", "sst-2"]: model_config.load(ds) r = generate_counterfactuals(sen, Query(y_prime, c=0.2)) results.append({ "Datensatz": ds, "Text": r.examples[0][0].sentence if len(r.examples) > 0 else "NO CF FOUND" }) results.append({"Datensatz": "Original", "Text": sen}) with pd.option_context("max_colwidth", 100000): all_result_str += pd.DataFrame(results).to_latex(index=False) all_result_str = all_result_str \ .replace("tabular", "tabularx") \ .replace("\\begin{tabularx}", "\\begin{tabularx}{\\textwidth}") print(all_result_str) print(all_result_str)
for enm in tqdm.tqdm(range(new_start, len(dataset))):
    data = dataset[enm]
    x, y = data["text"], data["label"]
    x = model_config.tokenizer.clean_up_tokenization(x)
    if y == -1:
        # test data for SST-2 carries the placeholder label -1; skip it
        info("skipping example with placeholder label -1")
        continue
    y_prime = 1 - y
    y_prime = [1 - y_prime, y_prime]
    for mbs in [True, False]:
        for allow_splitting in [True, False]:
            query = Query(wanted_cls=y_prime,
                          max_delta=0.4,
                          c=0.2,
                          num_needed=1,
                          mask_additional_words=False,
                          mini_beam_search=mbs,
                          allow_splitting=allow_splitting)
            r = generate_counterfactuals(x, query)
            results.append((enm, r))
    fname = f"{new_start}_to_{enm}_on_{gpu_name()}_H2H2_imdb_{date.today()}.pickle"
    if enm % 5 == 0 and enm != 0:
        try:
            path = f"/content/drive/My Drive/{fname}"
            with open(path, "wb") as file:
                pickle.dump({"imdb": results}, file)
            info("saved")
        except Exception as e:
            info(f"probably not running on colab: {e}")
# text = "his healthy sense of satire is light and fun ..." # text = "Ultimately feels empty and unsatisfying, like swallowing a Communion wafer without the wine." # text = "the action sequences are fun and reminiscent of combat scenes from the star wars series ." # text = "with jump cuts , fast editing and lots of pyrotechnics , yu clearly hopes to camouflage how bad his movie is ." # text = "why make a documentary about these marginal historical figures ?" # text = "the character of zigzag is not sufficiently developed to support a film constructed around him ." # text = "watchable up until the point where the situations and the dialogue spin hopelessly out of control" text = model_config.tokenizer.clean_up_tokenization(text) s = Sentence(text) result = _gen_cf_ex( text, Query(wanted_cls=[0., 1.], max_delta=0.4, num_needed=5, consider_max_words=500, consider_top_k=15)) print(result.info()) result = [lst[0] for lst in result.examples] data = {i: get_scatter_data(i) for i in range(len(s.words))} colors = ["red", "green", "orange", "magenta", "lawngreen"] # cmap_scale = cm.ScalarMappable(norm=mpl.colors.Normalize(vmin=0, vmax=len(result)), cmap=cm.gist_rainbow) fig = plt.figure(figsize=(10, 14)) gs = grid_spec.GridSpec(nrows=len(s.words), ncols=2, wspace=0, hspace=0.0001, width_ratios=[0.1, 1],
from generate_counterfactuals import generate_counterfactuals
from search_utils.Query import Query
from search_utils.Sentence import Sentence

model_config.load("imdb", evalution_model="gpt2")

num = 5
result = []
for wanted_positivity in range(num + 1):
    wanted_positivity = wanted_positivity / num
    wanted_cls = [(1 - wanted_positivity), wanted_positivity]
    max_delta = 50. / num / 100.  # half the step width between target classes
    print(f"{wanted_cls[1]}+-{max_delta}")
    # relatively high consider_max_words because max_delta is small.
    # sent = "A decent story with some thrilling action scenes."
    # sent = "the year's best and most unpredictable comedy."
    sent = "an extremely unpleasant film."
    r = generate_counterfactuals(
        sent, Query(wanted_cls=wanted_cls, max_delta=max_delta))
    if len(r.examples) == 0:
        print("----")
        continue
    print(r.examples[0][0])
    result.append({
        "y'": f"{wanted_cls[1]:.1f} pm {max_delta:.1f}",
        "y": f"{r.examples[0][0].cls[1]:.2f}",
        "Counterfactual Example x'": r.examples[0][0].sentence,
    })
print("######")
print(f"Original cls {Sentence(sent).calc_sentiment()[1]}")
with pd.option_context("max_colwidth", 1000):
    print(pd.DataFrame(result).to_latex(index=False))
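# Sanity-check sketch: each found counterfactual should score within
# wanted_cls[1] ± max_delta. That reading of max_delta is an assumption
# inferred from its use throughout this code.
for row in result:
    target, delta = map(float, row["y'"].split(" pm "))
    score = float(row["y"])
    print(f"target {target:.1f}±{delta:.1f} -> {score:.2f}, "
          f"ok={abs(score - target) <= delta}")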
def generate_gradient_highlights():
    """Colour every word of `ex` by its normalised gradient w.r.t. the target
    class and emit the corresponding LaTeX markup.

    Requires \\mycb to be defined once in the LaTeX preamble (emitting the
    \\newcommand per table would raise an "already defined" error):

        \\newcommand{\\reducedstrut}{\\vrule width 0pt
            height .9\\ht\\strutbox depth .9\\dp\\strutbox\\relax}
        \\newcommand{\\mycb}[3]{%
            \\begingroup
            \\setlength{\\fboxsep}{0pt}%
            \\colorbox[rgb]{#1}{ \\strut \\textcolor[rgb]{#2}{#3} }%
            \\endgroup
        }
    """
    text = Sentence(ex)
    word_gradients = np.array(text.calc_gradients(y_prime))
    word_gradients /= np.linalg.norm(word_gradients)
    # rescale the gradients to [0, 1] for the colormap
    wgn = np.interp(word_gradients,
                    (np.min(word_gradients), np.max(word_gradients)), (0., 1.))
    result = ""
    for cmap in [
            # alternatives tried: blue-white-red, white-forestgreen,
            # white-orangered, white-crimson, white-red, white-black
            matplotlib.colors.LinearSegmentedColormap.from_list(
                "", ["white", "blue"]),
    ]:
        for ind, w in enumerate(text.words):
            ctpl = cmap(wgn[ind])[:3]
            tc = str(text_color(ctpl))[1:-1]  # text colour as "r, g, b"
            ctpl = [round(v, 3) for v in ctpl]
            rgba = str(ctpl)[1:-1]  # background colour as "r, g, b"
            result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
        result += (f"\n\\\\ Top 10: "
                   f"{', '.join(np.array(text.words)[np.argsort(-wgn)][:10])}\\ \n\n\n")
        # the same highlighting, but with gradients calculated sentence by sentence
        sw_map = get_sentence_word_mapping(text.text)
        edit_sentence_order = calc_sentence_edit_schedule(
            Query(y_prime), sw_map, text)
        for enm_si, si in enumerate(edit_sentence_order):
            start, stop = sw_map[si]
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text.words[start:stop + 1]))
            subtext = Sentence(sub)
            word_gradients = np.array(subtext.calc_gradients(y_prime))
            word_gradients /= np.linalg.norm(word_gradients)
            wgn = np.interp(word_gradients,
                            (np.min(word_gradients), np.max(word_gradients)),
                            (0., 1.))
            result += f"{enm_si + 1}. Satz (vorher {si + 1}. Satz): "
            for ind, w in enumerate(subtext.words):
                ctpl = cmap(wgn[ind])[:3]
                tc = str(text_color(ctpl))[1:-1]
                ctpl = [round(v, 3) for v in ctpl]
                rgba = str(ctpl)[1:-1]
                result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
            result += "\\\\ \n\n"
    return result
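# text_color() is used above but not defined in this excerpt. A plausible
# stand-in (assumption): choose black or white text depending on the perceived
# luminance of the background colour, so each word stays readable.
def text_color(rgb):
    r, g, b = rgb  # channels in [0, 1]
    luminance = 0.299 * r + 0.587 * g + 0.114 * b  # Rec. 601 luma
    return (0., 0., 0.) if luminance > 0.5 else (1., 1., 1.)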