Example #1
def expand_sentence(text_p: Union[str, Example],
                    word_indices: List[int],
                    query=None,
                    additional_mask_indices: List[int] = None,
                    schedule_idx=-1) -> List[Example]:
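    # Mask the words at word_indices (plus additional_mask_indices) in one pass,
    # ask the masked language model for up to consider_max_words replacements per
    # position, build one candidate sentence per prediction, classify all
    # candidates in a batch and return them as Examples (predictions equal to the
    # original word are skipped).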
    if additional_mask_indices is None:
        additional_mask_indices = []

    if isinstance(text_p, Example):
        text = text_p.sentence
    else:
        text = text_p
    text = Sentence(text)
    word_indices = list(word_indices)
    word_indices = [
        wi for wi in word_indices
        if not all([s in ":,;.*" for s in text.words[wi]])
    ]
    if len(word_indices) == 0:
        return []

    original_words = {i: text.words[i] for i in word_indices}
    max_words = query.consider_max_words if query is not None else Query(
        None).consider_max_words
    masked_sentence = Sentence(
        text.get_with_masked(word_indices + additional_mask_indices))
    predictions = masked_sentence.calc_mask_predictions(max_words)

    result = []
    for word_idx in word_indices:

        if not predictions[word_idx]:
            continue

        sentences = []
        for predicted_token, score in predictions[word_idx]:
            new_sen = text.replace_word(word_idx, predicted_token)
            sentences.append(new_sen)

        classification = calc_sentiment_batch(sentences)
        for i, (predicted_token, score) in enumerate(predictions[word_idx]):
            if original_words[word_idx] != predicted_token:
                if isinstance(text_p, str):
                    e = Example(sentences[i],
                                classification[i], [(word_idx, score)],
                                pred_ind=[i],
                                sched_ind=[schedule_idx],
                                sent_ind=[0])
                else:
                    e = Example(sentences[i],
                                classification[i],
                                text_p.changes + [(word_idx, score)],
                                pred_ind=text_p.prediction_indices + [i],
                                sched_ind=text_p.schedule_indices +
                                [schedule_idx],
                                sent_ind=text_p.sentence_indices + [0])
                result.append(e)

    return result
Example #2
def generate_example_table():
    result = ""
    text = Sentence(ex)
    for query in [
            Query(wanted_cls=y_prime,
                  c=0.2,
                  num_needed=5,
                  mini_beam_search=False,
                  allow_splitting=False,
                  consider_top_k=10),
            Query(wanted_cls=y_prime,
                  c=0.2,
                  num_needed=5,
                  mini_beam_search=True,
                  allow_splitting=False,
                  consider_top_k=10),
            Query(wanted_cls=y_prime,
                  c=0.2,
                  num_needed=5,
                  mini_beam_search=False,
                  allow_splitting=True,
                  consider_top_k=10),
            Query(wanted_cls=y_prime,
                  c=0.2,
                  num_needed=5,
                  mini_beam_search=True,
                  allow_splitting=True,
                  consider_top_k=10),
    ]:
        q_result = generate_counterfactuals(ex, query)
        print(q_result)
        ex_df = []
        for change_group in q_result.examples:
            for e in change_group:
                se = Sentence(e.sentence)
                d = e.changed_word_distances()
                cwi = e.changed_word_indices()
                entry = {
                    "Original":
                    ', '.join([text.words[wi] for wi in cwi]),
                    "Counterfactual":
                    f"{', '.join([se.words[wi] for wi in cwi])}",
                    "Klassifikation":
                    f"{e.cls[1]:.2f}",
                    "Distanz":
                    f"{sum([d_i ** 2 for d_i in d]) + COST_PER_ADDITIONAL_WORD * len(d):.2f}"
                }
                ex_df.append(entry)
        ex_df = pd.DataFrame(ex_df)
        # result += f"\n\n\nOriginale Klassifikation: {text.calc_sentiment()[1]:.2f} \\\\ \n"
        # result += f"\nMBS={query.mini_beam_search}, ST={query.allow_splitting}, MAX\\_WORDS={query.consider_max_words} \\\\ \n"
        result += "\n\n"
        result += ex_df.to_latex(
            index=False,
            caption=
            f"{query.alg()} (Originale Klassifikation: {text.calc_sentiment()[1]:.2f})"
        )

    return result
Example #3
def get_sentence_word_mapping(text: str) -> List[Tuple[int, int]]:
    # tok_sen can cover more text than the reconstructed original below, because Sentence(..) is limited to 512 tokens
    tok_sen = nltk_tokenizer.sent_tokenize(text)
    original = model_config.tokenizer.clean_up_tokenization(" ".join(
        Sentence(text.lower().strip()).words))
    sentence = Sentence(original)
    word_sentence_map = []
    last_start = 0
    for xp in tok_sen:
        cxp = clean_for_comparison(xp.lower())
        for i in range(last_start, len(sentence.words) + 1):
            ccomp = clean_for_comparison("".join(sentence.words[last_start:i]))
            if ccomp == cxp:
                word_sentence_map.append((last_start, i - 1))
                last_start = i
                break
    if last_start != len(sentence.words):
        word_sentence_map.append((last_start, len(sentence.words)))

    # merge small sentences (<6) into neighbors
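    # e.g. (made-up spans, assuming MIN_SEN_LEN = 5):
    #   [(0, 3), (4, 20), (21, 40)] -> [(0, 20), (21, 40)]
    # the short first span has no left neighbour, so it is merged into its right neighbour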
    while True:
        dists = [(stop - start) for (start, stop) in word_sentence_map]
        if all([d > MIN_SEN_LEN for d in dists]):
            return word_sentence_map
        else:
            # if True not in [d > MIN_SEN_LEN for d in dists]:
            #     return word_sentence_map
            sen_idx = [d > MIN_SEN_LEN for d in dists].index(False)
            # calc left side sen len
            if sen_idx - 1 < 0:
                left_len = None
            else:
                o_start, o_stop = word_sentence_map[sen_idx - 1]
                left_len = o_stop - o_start

            # calc right side sen len
            if sen_idx + 1 >= len(word_sentence_map):
                right_len = None
            else:
                o_start, o_stop = word_sentence_map[sen_idx + 1]
                right_len = o_stop - o_start

            if right_len is None and left_len is None:
                return word_sentence_map
            elif left_len is None or (right_len is not None and left_len <
                                      right_len):  # merge with right
                new_entry = (word_sentence_map[sen_idx][0],
                             word_sentence_map[sen_idx + 1][1])
                word_sentence_map[sen_idx:sen_idx + 2] = [new_entry]
            elif right_len is None or (right_len is not None and right_len <=
                                       left_len):  # merge with left
                new_entry = (word_sentence_map[sen_idx - 1][0],
                             word_sentence_map[sen_idx][1])
                word_sentence_map[sen_idx - 1:sen_idx + 1] = [new_entry]
Example #4
def example_as_latex_string(idx):
    example_with_ans = data[idx]
    res_obj: Result
    res_obj = example_with_ans[1]
    if len(res_obj.examples) == 0:
        return ""
    cf_ex = [
        escape_latex(w)
        for w in Sentence(res_obj.examples[0][0].sentence).words
    ]
    origi = [
        escape_latex(w)
        for w in Sentence(res_obj.stats.original_sentence).words
    ]
    assert len(cf_ex) == len(origi)
    i = 0
    result_str = f"""
\\begin{{figure}}[h]
\\begin{{center}}
\\begin{{tabular}}{{|l|c|c|}} 
\\multicolumn{{3}}{{c}}{{Variante: {res_obj.query.alg()}}} \\\\
\\hline
{{}} & Counterfactual Example & Original \\\\
\\hline
Perplexity & {res_obj.examples[0][0].calc_perplexity():.2f} & {Sentence(res_obj.stats.original_sentence).calc_perplexity():.2f} \\\\
Polarität & {res_obj.examples[0][0].cls[1]:.2f} & {res_obj.stats.original_classification[1]:.2f} \\\\
\\hline
\\end{{tabular}}
\\end{{center}}
\n
"""
    while i != len(cf_ex):
        if cf_ex[i] != origi[i]:
            combine_from, combine_to = i, i + 1
            while origi[combine_to] != cf_ex[combine_to]:
                combine_to += 1
            left_part = ' '.join(origi[combine_from:combine_to])
            right_part = ' '.join(cf_ex[combine_from:combine_to])
            change = "\\mbox{[" + tex_color(
                left_part, "blue", False) + " \\to{} " + tex_color(
                    right_part, "blue", True) + "]} "
            result_str += change
            i += (combine_to - combine_from)
        else:
            result_str += cf_ex[i] + " "
            i += 1
    result_str += f"\n\\caption{{ Beispiel {idx // 4} aus der {DATASET} Evaluation ({res_obj.query.alg()}) }}\n\\label{{{idx // 4}_{res_obj.query.alg()}}}\n\\end{{figure}} "
    return result_str + "\n\n\n"
Example #5
def calc_sentence_edit_schedule(query, sw_map, text):
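    # Order the sub-sentences (spans in sw_map) for editing: either by the norm of
    # their word gradients w.r.t. the wanted classification, or by the MSE distance
    # of each sub-sentence's sentiment to the wanted classification; the sentence
    # with the largest value is edited first.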
    if USE_GRADIENTS_FOR_SENTENCE_RELEVANCE:
        gradients = text.calc_gradients(query.wanted_cls)
        sent_grad = defaultdict(list)
        for i in range(len(text.words)):
            idx = [a <= i <= b for (a, b) in sw_map].index(True)
            sent_grad[idx].append(gradients[i])
        gradients_per_sentence = [(si, np.linalg.norm(g))
                                  for (si, g) in sent_grad.items()]
        edit_sentence_order = [
            y[0] for y in sorted(
                gradients_per_sentence, key=lambda x: x[1], reverse=True)
        ]
    else:
        # use distance to wanted classification for relevance
        dist_to_wanted_cls = []
        for start, stop in sw_map:
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text.words[start:stop + 1]))
            cls = Sentence(sub).calc_sentiment()
            dst = mse_loss(torch.tensor(cls),
                           torch.tensor(query.wanted_cls, dtype=torch.float32))
            dist_to_wanted_cls.append(dst)
        edit_sentence_order = np.argsort(-np.array(dist_to_wanted_cls))
    return edit_sentence_order
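A minimal standalone sketch of the ordering used in the non-gradient branch above (made-up distances, no model needed): the sub-sentence furthest from the wanted classification is edited first.

import numpy as np

dist_to_wanted_cls = [0.10, 0.42, 0.05]            # hypothetical per-sentence MSE values
edit_sentence_order = np.argsort(-np.array(dist_to_wanted_cls))
print(edit_sentence_order)                         # [1 0 2]: most distant sentence first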
Example #6
 def default(self, obj):
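     # default() hook of CustomEncoder (a json.JSONEncoder subclass, see the
     # super() call below): converts numpy scalars/arrays and Example objects into
     # JSON-serialisable values. Typical use would be json.dumps(obj, cls=CustomEncoder).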
     if isinstance(obj, np.integer):
         return int(obj)
     elif isinstance(obj, np.floating):
         return float(obj)
     elif isinstance(obj, np.ndarray):
         return obj.tolist()
     elif isinstance(obj, Example):
         ex_dict = dict(obj.__dict__)
         ex_dict["sentence"] = Sentence(clean(obj.sentence)).words
         ex_dict["perplexity"] = Sentence(obj.sentence).calc_perplexity()
         del ex_dict["prediction_indices"]
         del ex_dict["schedule_indices"]
         del ex_dict["sentence_indices"]
         del ex_dict["changes"]
         return ex_dict
     else:
         return super(CustomEncoder, self).default(obj)
Example #7
 def info(self):
     result_str = ""
     sen = Sentence(self.stats.original_sentence)
     result_str += f"pp={sen.calc_perplexity():.2f}, {len(sen.words)} words, y={np.round(self.stats.original_classification, 2)}\n"
     result_str += f"Duration: {self.stats.total_duration} | {self.stats.find_matching_words_duration} searching words | {self.stats.merging_duration} merging.\n"
     result_str += f"{self.total_valid_examples()} examples, {len(self.rest)} in rest, found {len(self.examples)} of {self.query.num_needed} groups with different indices.\n"
     for e in self.simple_results():
         result_str += "\t" + e.info() + "\n"
     result_str += "Successful!\n" if self.success else "Query not fullfilled!"
     return result_str
Example #8
 def __init__(self, sentence: str, classification, changes, pred_ind, sched_ind, sent_ind):
     self.cls = classification
     self.sentence = sentence
     self.changes: List[Tuple[WordIdx, float]] = changes
     self.prediction_indices = pred_ind  # n-th word in list of alternative words
     self.schedule_indices = sched_ind  # 1st highest gradient word, 2nd hgw, ...
     self.sentence_indices = sent_ind  # 1st sentence,... only != 0 if splitting text
     self.perplexity = None
     for (i, _) in changes:
         assert i < len(Sentence(self.sentence).words)
     assert len(self.schedule_indices) == len(self.prediction_indices) == len(self.changes)
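     # Illustration with made-up values: one change at word index 3, taken from the
     # second-best mask prediction, for the word ranked first in the edit schedule,
     # inside the first sub-sentence:
     #   Example("the movie was great .", [0.1, 0.9],
     #           changes=[(3, 0.7)], pred_ind=[1], sched_ind=[0], sent_ind=[0])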
Example #9
def extract_colors(r, per_sentence):
    text = r.stats.original_sentence
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
        "", ["white", "blue"])
    text_s = Sentence(text)
    if not per_sentence:
        word_gradients = text_s.calc_gradients(r.query.wanted_cls)
        wgn = np.interp(word_gradients,
                        (np.min(word_gradients), np.max(word_gradients)),
                        (0., 1.))
        fg, bg = [], []
        for ind in range(len(wgn)):
            ctpl = cmap(wgn[ind])[:3]
            tc = twofivefive(text_color(ctpl))
            ctpl = twofivefive(ctpl)
            fg.append(str(tc)[1:-1])
            bg.append(str(ctpl)[1:-1])
        return fg, bg
    else:
        sw_map = get_sentence_word_mapping(text)
        edit_sentence_order = calc_sentence_edit_schedule(
            r.query, sw_map, text_s)
        fg, bg = [], []
        for enm_si, si in enumerate(edit_sentence_order):
            start, stop = sw_map[si]
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text_s.words[start:stop + 1]))
            subtext = Sentence(sub)
            word_gradients = np.array(
                subtext.calc_gradients(r.query.wanted_cls))
            word_gradients /= np.linalg.norm(word_gradients)
            wgn = np.interp(word_gradients,
                            (np.min(word_gradients), np.max(word_gradients)),
                            (0., 1.))
            for ind in range(len(wgn)):
                ctpl = cmap(wgn[ind])[:3]
                tc = twofivefive(text_color(ctpl))
                ctpl = twofivefive(ctpl)
                fg.append(str(tc)[1:-1])
                bg.append(str(ctpl)[1:-1])
        return fg, bg
Example #10
def merge_all_changes_into_one(grouped_items, original) -> Example:
    # Merge all available -> check
    merge_all_changes_into_this_one = list(original.words)
    merge_changes = []
    pred_indices = []
    sched_indices = []
    sent_indices = []
    for changes_a, la in grouped_items:
        if len(changes_a) == 1 and len(la) > 0:
            e: Example = la[0]
            s = Sentence(e)
            merge_all_changes_into_this_one[changes_a[0]] = s.words[
                changes_a[0]]
            merge_changes += e.changes
            pred_indices.append(e.prediction_indices)
            sched_indices.append(e.schedule_indices)
            sent_indices.append(e.sentence_indices)
    merge_all_changes_into_this_one = " ".join(merge_all_changes_into_this_one)
    all_sentiment = calc_sentiment_batch([merge_all_changes_into_this_one])
    return Example(merge_all_changes_into_this_one, all_sentiment[0],
                   merge_changes, pred_indices, sched_indices, sent_indices)
Example #11
    def __repr__(self):
        # DEBUG HELP
        left_right_window = 6
        sen = Sentence(self.sentence)
        changed_indices = self.changed_word_indices()
        sp = [w if i not in changed_indices else f"#{w}#" for i, w in enumerate(sen.words)]
        relevant_parts = ' '.join(sp)
        if len(relevant_parts) > 160:
            relevant_parts = []
            for i, (w_id, score) in enumerate(self.changes):
                if i != 0:
                    relevant_parts.append("[...]")
                word = sp[w_id]
                left = sp[max(0, w_id - left_right_window):w_id]
                right = sp[w_id + 1:w_id + left_right_window]
                relevant_parts.extend(left + [word] + right)
            relevant_parts = ' '.join(relevant_parts)

        return f"PRED_IDX={self.prediction_indices}, " \
               f"cls={np.round(self.cls, 2)}, " \
               f"idxs={changed_indices} " \
               f"dist={np.round(list(dict(self.changes).values()), 2)}, {relevant_parts}"
Example #12
def one_mask():
    max_words = 20
    result = defaultdict(list)
    result_dist_diff_only = defaultdict(list)
    result_wdiff = defaultdict(list)

    for enm in range(len(dataset)):

        print(f"\nSentence {enm}: ", end="")
        data = dataset[enm]
        x, y = data["sentence"], data["label"]
        x = model_config.tokenizer.clean_up_tokenization(x)

        y = [0, 1] if y == 1 else [1, 0]
        y_prime = [1 - y[0], 1 - y[1]]

        s = Sentence(x)

        word_gradients = s.calc_gradients(y_prime)
        sorted_highest = np.argsort(word_gradients)[::-1]

        for observed_idx in sorted_highest[:10]:
            # observed_idx = sorted_highest[0]
            print(f"{observed_idx},", end="")
            sdir = 1 if len(s.words) - observed_idx > observed_idx else -1

            alt_s = Sentence(s.get_with_masked([observed_idx]))
            original_answer = alt_s.calc_mask_predictions()[observed_idx]

            if len(original_answer) != 0:

                for mask_distance in range(1, max_words):
                    if observed_idx + mask_distance * sdir < 0 or observed_idx + mask_distance * sdir >= len(
                            alt_s.words):
                        continue

                    new_sen = Sentence(
                        alt_s.get_with_masked([
                            observed_idx + mask_distance * sdir, observed_idx
                        ]))
                    alt_sen_pred = new_sen.calc_mask_predictions(
                    )[observed_idx]

                    avg_distance, avg_word_diff, dist_diff_only = find_differences(
                        original_answer, alt_sen_pred)

                    # print(f"Mask offset {mask_distance}: dist={avg_distance:.3f}  word_dist={avg_word_diff:.3f}")
                    result[mask_distance].append(avg_distance)
                    result_wdiff[mask_distance].append(avg_word_diff)
                    result_dist_diff_only[mask_distance].append(dist_diff_only)

        if enm % 50 == 0 or enm == len(dataset) - 1:
            fig = plt.figure(figsize=(11, 8))
            plt.title(
                "Relation Bewertung der Wörter zur Nähe des nächsten [MASK]-Token"
            )
            plt.xlabel("Entfernung zum zusätzlichen [MASK]-Token")
            plt.xlim(0, max_words)
            plt.ylim(0., 0.65)
            plt.ylabel("Veränderung der Bewertung")

            idx, mean, std = list(
                zip(*[(md, np.mean(lst), np.std(lst))
                      for (md, lst) in result_wdiff.items()]))
            mean = np.array(mean)
            std = np.array(std)
            plt.plot(idx, mean, color='r', label="Wort-Unterschiede")
            plt.fill_between(idx, mean - std, mean + std, color='r', alpha=.2)

            idx, mean, std = list(
                zip(*[(md, np.mean(lst), np.std(lst))
                      for (md, lst) in result_dist_diff_only.items()]))
            mean = np.array(mean)
            std = np.array(std)
            plt.plot(idx, mean, color='green', label="Distanz-Unterschiede")
            plt.fill_between(idx,
                             mean - std,
                             mean + std,
                             color='green',
                             alpha=.2)

            plt.xticks(idx)
            plt.legend()
            plt.savefig(f'{root}saved_plots/all/_besser_{enm}.png')
            # plt.show()
            plt.close(fig)
Example #13
def two_mask():
    max_words = 15
    result = defaultdict(list)
    result_dist_diff_only = defaultdict(list)
    result_wdiff = defaultdict(list)

    for enm in range(len(dataset)):

        print(f"\nSentence {enm}: ", end="")
        data = dataset[enm]
        x, y = data["sentence"], data["label"]
        x = model_config.tokenizer.clean_up_tokenization(x)

        y = [0, 1] if y == 1 else [1, 0]
        y_prime = [1 - y[0], 1 - y[1]]

        s = Sentence(x)

        word_gradients = s.calc_gradients(y_prime)
        sorted_highest = np.argsort(word_gradients)[::-1]

        for observed_idx in sorted_highest[:10]:
            print(f"{observed_idx},", end="")

            alt_s = Sentence(s.get_with_masked([observed_idx]))
            original_answer = alt_s.calc_mask_predictions()[observed_idx]

            if len(original_answer) != 0:
                for mask_distance1 in range(-max_words, max_words + 1):
                    for mask_distance2 in range(-max_words, max_words + 1):

                        if not (0 <= observed_idx + mask_distance1 < len(
                                alt_s.words)):
                            continue
                        if not (0 <= observed_idx + mask_distance2 < len(
                                alt_s.words)):
                            continue

                        new_sen = Sentence(
                            alt_s.get_with_masked(
                                [observed_idx + mask_distance1, observed_idx]))
                        new_sen = Sentence(
                            new_sen.get_with_masked(
                                [observed_idx + mask_distance2, observed_idx]))
                        alt_sen_pred = new_sen.calc_mask_predictions(
                        )[observed_idx]

                        avg_distance, avg_word_diff, dist_diff_only = find_differences(
                            original_answer, alt_sen_pred)

                        result[(mask_distance1,
                                mask_distance2)].append(avg_distance)
                        result_wdiff[(mask_distance1,
                                      mask_distance2)].append(avg_word_diff)
                        result_dist_diff_only[(
                            mask_distance1,
                            mask_distance2)].append(dist_diff_only)

        if enm % 2 == 0 or enm == len(dataset) - 1:

            all_variants = [(result, "result"), (result_wdiff, "wdiff"),
                            (result_dist_diff_only, "ddiff")]

            with open('used_data.pickle', 'wb') as handle:
                pickle.dump(all_variants, handle)

            for res, name in all_variants:
                data = [(k, np.mean(v)) for k, v in res.items()]
                matrix = np.zeros(shape=(2 * max_words + 1, 2 * max_words + 1))
                for (i, j), m in data:
                    matrix[(i + max_words), j + max_words] = m

                plt.figure(figsize=(15, 12))
                ax = sns.heatmap(
                    np.flip(matrix, axis=0),
                    linewidth=0.0,
                    xticklabels=list(range(-max_words, max_words + 1)),
                    yticklabels=list(reversed(range(-max_words,
                                                    max_words + 1))))

                ax.set_title(
                    "Durchschnittliche Veränderung der Wörter bei 2 MASK-Tokens"
                )
                plt.savefig(f'{root}saved_plots/2d/{name}_{enm}.pdf')
                plt.close()
Example #14
def clean(s):
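    # Normalise a sentence: re-tokenise via Sentence, undo tokeniser spacing,
    # re-join " - " into "-" and lowercase.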
    return model_config.tokenizer.clean_up_tokenization(' '.join(
        Sentence(s).words)).replace(" - ", "-").lower()
Example #15
import gensim.downloader as api
import pandas as pd
from gensim.models.keyedvectors import Word2VecKeyedVectors
from config import model_config
from search_utils.Sentence import Sentence

wv = api.load('word2vec-google-news-300')
wv: Word2VecKeyedVectors

s = """my thoughts were focused on the characters ."""
s = model_config.tokenizer.clean_up_tokenization(s)
sen_s = Sentence(s)

df_wv = dict()
df = dict()
for i in range(len(sen_s.words)):

    word = sen_s.words[i]
    if word in ".:,?!-(){}[]/\\|&%":
        continue

    new_s = sen_s.get_with_masked([i])
    bert_preds = Sentence(new_s).calc_mask_predictions()[i]

    wv_preds = wv.most_similar(word, topn=15, restrict_vocab=200_000)

    df_wv[word] = list(list(zip(*wv_preds))[0])
    df[word] = list(zip(*bert_preds))[0][:15]

df = pd.DataFrame(df)
df_wv = pd.DataFrame(df_wv)
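# df: top-15 BERT mask predictions per word; df_wv: top-15 word2vec neighbours
# of the same word, for side-by-side comparison.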
Example #16
def _gen_cf_ex(text: str, query) -> Result:
    """

    Parameters
    ----------
    text: str
    query : search_helper.classes.Query.Query

    Returns
    -------
    Result
    """
    gc.collect()
    torch.cuda.empty_cache()

    stats = Statistics(original_sentence=text)
    stats.total_duration.resume()
    stats.query = query

    text = model_config.tokenizer.clean_up_tokenization(text)
    text = Sentence(text)
    original_cls = text.calc_sentiment()
    stats.original_classification = original_cls

    examples = []
    schedule = text.calc_edit_schedule(query)
    for schedule_idx, (edit_strategy, word_indices) in enumerate(schedule):
        assert isinstance(schedule_idx, int)
        with stats.find_matching_words_duration:
            if edit_strategy == Edit.NEIGHBOURS:
                batch = expand_sentence_neighbour(text.text, query,
                                                  word_indices, schedule_idx)
            else:
                word_indices, mask_indices = word_indices
                batch = expand_sentence(text.text, word_indices, query,
                                        mask_indices, schedule_idx)
            # filtering only 'relevant' makes the found words more extreme
            # relevant_batch = [b for b in batch if abs(original_cls[cls_idx] - b.cls[cls_idx]) > MIN_SENTIMENT_CHANGE]
            relevant_batch = batch

        debug(
            f"{len(examples)} examples total | {len(batch)} new  for {len(word_indices)} words with {schedule_idx} highest gradient"
        )

        stats.tried_examples += len(batch)
        examples.extend(relevant_batch)

        with stats.merging_duration:
            num_per_group = max(4 -
                                schedule_idx, 1) if schedule_idx < 10 else -1
            merged_examples = generate_merged_examples(text, examples, query,
                                                       num_per_group)
            examples.extend(merged_examples)
            stats.tried_examples += len(merged_examples)

        results: Result = examples_are_sufficient(examples, query)
        if results.sufficient():
            stats.total_duration.pause()
            assert stats.all_timers_stopped()
            results.stats = stats
            results.query = query
            return results

    results: Result = examples_are_sufficient(examples, query)
    stats.total_duration.pause()
    assert stats.all_timers_stopped()
    results.stats = stats
    results.query = query
    return results
Example #17
def _gen_cf_ex_long(text: str, query) -> Result:
    """

    Parameters
    ----------
    text : str
    query: search_helper.classes.Query.Query

    Returns
    -------
    Result
    """

    gc.collect()
    torch.cuda.empty_cache()

    stats = Statistics(original_sentence=text)
    stats.total_duration.resume()
    stats.query = query
    stats.original_sentence = text

    text = model_config.tokenizer.clean_up_tokenization(text)
    text = Sentence(text)
    stats.original_classification = text.calc_sentiment()

    sw_map = get_sentence_word_mapping(text.text)

    edit_sentence_order = calc_sentence_edit_schedule(query, sw_map, text)

    examples = []
    for sentence_sched_idx, si in enumerate(
            edit_sentence_order[:query.consider_max_sentences]):
        debug("> subsentence")
        start, stop = sw_map[si]
        sub = model_config.tokenizer.clean_up_tokenization(" ".join(
            text.words[start:stop + 1]))

        sub_query = copy(query)
        subresult: Result = _gen_cf_ex(sub, sub_query)

        if stats.tried_sentences is None:
            stats.tried_sentences = 0
        stats.tried_sentences += 1
        stats.add(subresult.stats)

        subexample: Example
        best_subexamples = [j[0] for j in subresult.examples]
        debug(
            f"SUBEXAMPLE SEARCH FOUND {len(best_subexamples)} of {sub_query.num_needed}"
        )
        if len(best_subexamples) < query.num_needed:
            best_subexamples.extend(subresult.rest[:(query.num_needed -
                                                     len(best_subexamples))])
            debug(f"Added from rest, now {len(best_subexamples)}")

        for subexample in best_subexamples:
            new_sen = list(text.words)
            new_sen[start:stop + 1] = [subexample.sentence]
            new_sen = Sentence(
                model_config.tokenizer.clean_up_tokenization(
                    " ".join(new_sen)))
            new_cls = new_sen.calc_sentiment()
            # print(np.round(new_cls, 3), new_sen.text)

            new_changes = [(pos + start, dist)
                           for (pos, dist) in subexample.changes]
            e = Example(new_sen.text,
                        new_cls,
                        new_changes,
                        pred_ind=subexample.prediction_indices,
                        sched_ind=subexample.schedule_indices,
                        sent_ind=[sentence_sched_idx])
            examples.append(e)

        with stats.merging_duration:
            debug("> subsentence merge")
            merged_examples = generate_merged_examples(text, examples, query,
                                                       1)
            examples.extend(merged_examples)
            stats.tried_examples += len(merged_examples)
            debug("< subsentence merge")

        results: Result = examples_are_sufficient(examples, query)
        if results.sufficient():
            stats.total_duration.pause()
            assert stats.all_timers_stopped()
            results.stats = stats
            results.query = query
            results.sentence_map = sw_map
            return results
        debug("< subsentence")

    results: Result = examples_are_sufficient(examples, query)
    stats.total_duration.pause()
    assert stats.all_timers_stopped()
    results.stats = stats
    results.query = query
    results.sentence_map = sw_map
    return results
Example #18
from search_utils.Sentence import Sentence

Sentence("David [MASK] is a spanish tennis player.").calc_mask_predictions(
)  # ferrer
Sentence("David [MASK] is a famous musician.").calc_mask_predictions()  # bowie
Sentence("David [MASK] is a musician.").calc_mask_predictions()  # smith

Sentence("[MASK] is a musician.").calc_mask_predictions()  # he

Sentence(
    "David [MASK] was the prime minister.").calc_mask_predictions()  # cameron
Sentence(
    "David [MASK] is the prime minister.").calc_mask_predictions()  # cameron

Sentence("It's over Anakin! I have the [MASK] ground!").calc_mask_predictions(
)  # high
Sentence("It's over Anakin! I have the high [MASK]!").calc_mask_predictions(
)  # ground
Sentence("It's over Anakin! I [MASK] the high ground!").calc_mask_predictions(
)  # need

Sentence("Boy, that [MASK] quickly!").calc_mask_predictions(
)  # was, happened, moved
Sentence("The man who passes the sentence should swing the [MASK]."
         ).calc_mask_predictions()  # chair, bail, stick, wheel
Sentence("A Lannister always pays his [MASK].").calc_mask_predictions(
)  # debts, taxes, way
Sentence("This is the [MASK].").calc_mask_predictions()  # end, truth, way
Sentence("That's what I do: I [MASK] and I know things."
         ).calc_mask_predictions()  # see, think, look
Example #19
 def calc_perplexity(self):
     if self.perplexity is None:
         self.perplexity = Sentence(self.sentence).calc_perplexity()
     return self.perplexity
Example #20
 def marked(self):
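     # Marks each changed word inline, e.g. (made-up): "the #1#=worst movie ever"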
     ps = Sentence(self.sentence).words
     bold_indices = list(zip(*self.changes))[0]
     for i, wi in enumerate(bold_indices):
         ps[wi] = f"#{i + 1}#={ps[wi]}"
     return " ".join(ps)
Example #21
def generate_gradient_highlights():
    text = Sentence(ex)
    word_gradients = np.array(text.calc_gradients(y_prime))
    word_gradients /= np.linalg.norm(word_gradients)
    wgn = np.interp(word_gradients,
                    (np.min(word_gradients), np.max(word_gradients)), (0., 1.))
    """
    \\newcommand{\\reducedstrut}{\\vrule width 0pt height .9\\ht\\strutbox depth .9\\dp\\strutbox\\relax}
    \\newcommand{\\mycb}[3]{%
      \\begingroup
      \\setlength{\\fboxsep}{0pt}%  
      \\colorbox[rgb]{#1}{ \\strut \\textcolor[rgb]{#2}{#3} }%
      \\endgroup
    }
    """
    result = ""  # new command overwritten error

    for cmap in [
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["blue", "white", "red"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "forestgreen"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "orangered"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "crimson"]),
            matplotlib.colors.LinearSegmentedColormap.from_list(
                "", ["white", "blue"]),
            # # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "red"]),
            # # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "black"]),
    ]:
        result += f""
        for ind, w in enumerate(text.words):
            ctpl = cmap(wgn[ind])[:3]
            tc = str(text_color(ctpl))[1:-1]
            ctpl = [round(v, 3) for v in ctpl]
            rgba = str(ctpl)[1:-1]
            result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
        result += f"\n\\\\ Top 10: {', '.join(np.array(text.words)[np.argsort(-wgn)][:10])}\\ \n\n\n"

        # Sentence-wise calc gradients
        sw_map = get_sentence_word_mapping(text.text)
        edit_sentence_order = calc_sentence_edit_schedule(
            Query(y_prime), sw_map, text)

        for enm_si, si in enumerate(edit_sentence_order):
            start, stop = sw_map[si]
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text.words[start:stop + 1]))

            subtext = Sentence(sub)
            word_gradients = np.array(subtext.calc_gradients(y_prime))
            word_gradients /= np.linalg.norm(word_gradients)
            wgn = np.interp(word_gradients,
                            (np.min(word_gradients), np.max(word_gradients)),
                            (0., 1.))

            result += f"{enm_si + 1} Satz (vorher {si + 1}. Satz): "
            for ind, w in enumerate(subtext.words):
                ctpl = cmap(wgn[ind])[:3]
                tc = str(text_color(ctpl))[1:-1]
                ctpl = [round(v, 3) for v in ctpl]
                rgba = str(ctpl)[1:-1]
                result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
            result += "\\\\ \n\n"

    return result
Example #22
            dataset, data = list(pickle.load(file).items())[0]
            model_config.load(dataset, "gpt2")
            json_datapoint = None
            result: Result
            for another_idx, (ds_idx, result) in enumerate(tqdm(data)):
                max_ds_idx = max(ds_idx, max_ds_idx)
                if last_ds_idx != ds_idx:
                    if another_idx != 0:
                        json_all.append(json_datapoint)
                    last_ds_idx = ds_idx
                    fg, bg = extract_colors(result, per_sentence=False)
                    fg_ps, bg_ps = extract_colors(result, per_sentence=True)

                    json_datapoint = {
                        'sentence':
                        Sentence(clean(result.stats.original_sentence)).words,
                        'foreground':
                        fg,
                        'foreground_per_sen':
                        fg_ps,
                        'background':
                        bg,
                        'background_per_sen':
                        bg_ps,
                        'original_cls':
                        result.stats.original_classification,
                        'wanted_cls':
                        result.query.wanted_cls,
                        'original_ppl':
                        np.round(
                            Sentence(result.stats.original_sentence).
Example #23
# text = "if you enjoy more thoughtful comedies with interesting conflicted characters ; this one is for you ."
# text = "no place for this story to go but down"
# text = "minority report is exactly what the title indicates , a report ."
# text = "it 's refreshing to see a girl-power movie that does n't feel it has to prove anything ."
# text = "it has all the excitement of eating oatmeal ."
# text = "his healthy sense of satire is light and fun ..."
# text = "Ultimately feels empty and unsatisfying, like swallowing a Communion wafer without the wine."
# text = "the action sequences are fun and reminiscent of combat scenes from the star wars series ."
# text = "with jump cuts , fast editing and lots of pyrotechnics , yu clearly hopes to camouflage how bad his movie is ."
# text = "why make a documentary about these marginal historical figures ?"
# text = "the character of zigzag is not sufficiently developed to support a film constructed around him ."
# text = "watchable up until the point where the situations and the dialogue spin hopelessly out of control"

text = model_config.tokenizer.clean_up_tokenization(text)

s = Sentence(text)
result = _gen_cf_ex(
    text,
    Query(wanted_cls=[0., 1.],
          max_delta=0.4,
          num_needed=5,
          consider_max_words=500,
          consider_top_k=15))
print(result.info())
result = [lst[0] for lst in result.examples]
data = {i: get_scatter_data(i) for i in range(len(s.words))}

colors = ["red", "green", "orange", "magenta", "lawngreen"]
# cmap_scale = cm.ScalarMappable(norm=mpl.colors.Normalize(vmin=0, vmax=len(result)), cmap=cm.gist_rainbow)

fig = plt.figure(figsize=(10, 14))
Example #24
def generate_merged_examples(original, examples: List[Example], query,
                             curr_iter_idx):
    """

    Parameters
    ----------
    original : Sentence
    examples : List of Example
    query: search_helper.classes.Query.Query
    curr_iter_idx: int

    Returns
    -------
    list of merged examples
    """
    generated = []
    cs = query.c if isinstance(query.c, list) else [query.c]
    for c in cs:
        grouped_by_changed_indices = defaultdict(list)
        examples = examples_sorted(examples, query.wanted_cls,
                                   c)[:MAX_EXAMPLES_TO_CONSIDER_FOR_MERGING]
        for e in examples:
            if len(grouped_by_changed_indices[tuple(
                    e.changed_word_indices())]) <= 5:
                grouped_by_changed_indices[tuple(
                    e.changed_word_indices())].append(e)

        grouped_items = grouped_by_changed_indices.items()
        grouped_keys = [set(x) for x in grouped_by_changed_indices.keys()]
        if len(grouped_items) <= 1:  # -> nothing to merge
            return []

        if curr_iter_idx == -1:
            return [merge_all_changes_into_one(grouped_items, original)]

        debug(f"MERGING ({len(grouped_keys)}) {grouped_keys}")

        for a_idx, (changes_a, la) in enumerate(grouped_items):
            for b_idx, (changes_b, lb) in enumerate(grouped_items):

                # changes already done or changes overlap or changes too close together
                if set(changes_a + changes_b) in grouped_keys \
                        or len(set(changes_a).intersection(set(changes_b))) > 0 \
                        or min_distance(changes_a, changes_b) < MIN_DISTANCE_BETWEEN_CHANGED_WORDS:
                    continue

                resulting_edit_size = len(set(changes_a + changes_b))
                take_n = max(1, 5 - resulting_edit_size)

                # Merge examples
                a: Example
                b: Example
                for a in la[:take_n]:
                    for b in lb[:take_n]:
                        sa, sb = Sentence(a), Sentence(b)
                        new = list(original.words)
                        for na in a.changed_word_indices():
                            new[na] = sa.words[na]
                        for na in b.changed_word_indices():
                            new[na] = sb.words[na]

                        generated.append(
                            (" ".join(new), tuple(a.changes + b.changes),
                             a.prediction_indices + b.prediction_indices,
                             a.schedule_indices + b.schedule_indices,
                             a.sentence_indices + b.sentence_indices))
                        grouped_keys.append(
                            set(dict(a.changes +
                                     b.changes).keys()))  # can indent 2?

    debug(f"MERGE: generated {len(generated)} MERGED (#c={len(cs)}) examples")

    if len(generated) == 0:
        return []
    unzipped = list(zip(*generated))
    sentence_list = list(unzipped[0])
    if len(sentence_list) == 0:
        return []
    sentiment_batch = calc_sentiment_batch(sentence_list)
    result = []
    for i, (sen, changes, pred_idx, sched_idx,
            sent_idx) in enumerate(generated):
        example = Example(sen,
                          sentiment_batch[i],
                          list(changes),
                          pred_ind=pred_idx,
                          sched_ind=sched_idx,
                          sent_ind=sent_idx)
        example.schedule_indices = sched_idx
        result.append(example)
    return result
Example #25
def expand_sentence_neighbour(text_p: Union[str, Example], query,
                              word_indices: List[int],
                              schedule_idx) -> List[Example]:
    """

    Parameters
    ----------
    schedule_idx
    text_p
    word_indices
    query : search_helper.classes.Query.Query

    Returns
    -------
    List of Example
    """
    if isinstance(text_p, Example):
        text = text_p.sentence
    else:
        text = text_p
    text = Sentence(text)
    word_indices = list(word_indices)
    word_indices = [
        wi for wi in word_indices
        if not all([s in ":,;.*" for s in text.words[wi]])
    ]

    if len(word_indices) == 0:
        return []

    word_indices = word_indices[:MBS_DEPTH]
    masked_text = Sentence(text.get_with_masked(word_indices))

    initial_example = [Example(masked_text.text, [], [], [], [], [])]
    results = []
    for word_idx in word_indices:
        debug(f"expand neighbours: {word_idx} of {word_indices}")
        tmp_results = examples_sorted(results, query.wanted_cls,
                                      query.c)[:MBS_BEAMS]
        results = []
        for interm_example in (initial_example if word_idx == word_indices[0]
                               else tmp_results):
            intermediate_sen = Sentence(interm_example)
            predictions = intermediate_sen.calc_mask_predictions(
                query.consider_max_words)
            if not predictions[word_idx]:
                continue

            sentences = []
            for predicted_token, score in predictions[word_idx]:
                new_sen = intermediate_sen.replace_mask(
                    word_idx, predicted_token)
                sentences.append(new_sen)

            classification = calc_sentiment_batch(sentences)
            for i, (predicted_token,
                    score) in enumerate(predictions[word_idx]):
                results.append(
                    Example(sentences[i],
                            classification[i],
                            interm_example.changes + [(word_idx, score)],
                            pred_ind=interm_example.prediction_indices + [i],
                            sched_ind=interm_example.schedule_indices +
                            [schedule_idx],
                            sent_ind=[0]))

    return examples_sorted(results, query.wanted_cls, query.c)