Python Sentence.get_with_masked示例

编程语言: Python

命名空间/包名称: search_utils.Sentence

类/类型: Sentence

方法/功能: get_with_masked

hotexamples.com的示例: 5

Python Sentence.get_with_masked - 已找到5个示例。以下是从开源项目中提取的、评分最高的 search_utils.Sentence.Sentence.get_with_masked 真实 Python 示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示/隐藏

Sentence(25)

get_with_masked(5)

calc_gradients(4)

calc_mask_predictions(3)

calc_sentiment(1)

replace_mask(1)

replace_word(1)

示例#1

显示文件

def expand_sentence(text_p: Union[str, Example],
                    word_indices: List[int],
                    query=None,
                    additional_mask_indices: List[int] = None,
                    schedule_idx=None) -> List[Example]:
    """Generate single-word substitutions of a sentence via mask predictions.

    Every index in ``word_indices`` (plus ``additional_mask_indices``) is
    masked in one copy of the sentence, mask predictions are computed once,
    and for each requested index every predicted token that differs from the
    original word yields one new ``Example``.

    Parameters
    ----------
    text_p
        Either the raw sentence text or an ``Example`` whose ``sentence`` is
        expanded. When an ``Example`` is given, its change / index histories
        are extended; anything else is treated as raw text and starts fresh
        histories.
    word_indices
        Word positions to substitute. Positions whose word consists only of
        the characters ``:,;.*`` are skipped.
    query
        Object providing ``consider_max_words``; when ``None`` the value is
        read from ``Query(None)``.
    additional_mask_indices
        Extra positions that are masked but not substituted.
    schedule_idx
        Value recorded in each result's schedule indices. Defaults to a
        fresh ``[-1]`` list per call (previously a single list object shared
        by all calls and by every returned ``Example``).

    Returns
    -------
    List of new ``Example`` objects; empty when no index survives the
    punctuation filter.
    """
    if additional_mask_indices is None:
        additional_mask_indices = []
    if schedule_idx is None:
        # Build the default per call so results never alias one shared list.
        schedule_idx = [-1]

    # Decide once whether there is a parent example, so the text extraction
    # and the history handling below cannot disagree.
    parent = text_p if isinstance(text_p, Example) else None
    text = Sentence(parent.sentence if parent is not None else text_p)

    word_indices = [
        wi for wi in word_indices
        if not all(ch in ":,;.*" for ch in text.words[wi])
    ]
    if not word_indices:
        return []

    original_words = {i: text.words[i] for i in word_indices}
    max_words = (query.consider_max_words
                 if query is not None else Query(None).consider_max_words)
    masked_sentence = Sentence(
        text.get_with_masked(word_indices + list(additional_mask_indices)))
    predictions = masked_sentence.calc_mask_predictions(max_words)

    result = []
    for word_idx in word_indices:
        candidates = predictions[word_idx]
        if not candidates:
            continue

        # Substitute into the unmasked text so only this one word changes.
        sentences = [
            text.replace_word(word_idx, predicted_token)
            for predicted_token, _ in candidates
        ]
        classification = calc_sentiment_batch(sentences)

        for i, (predicted_token, score) in enumerate(candidates):
            if original_words[word_idx] == predicted_token:
                continue  # predicting the original word is not a change
            if parent is None:
                e = Example(sentences[i],
                            classification[i], [(word_idx, score)],
                            pred_ind=[i],
                            sched_ind=[schedule_idx],
                            sent_ind=[0])
            else:
                e = Example(sentences[i],
                            classification[i],
                            parent.changes + [(word_idx, score)],
                            pred_ind=parent.prediction_indices + [i],
                            sched_ind=parent.schedule_indices +
                            [schedule_idx],
                            sent_ind=parent.sentence_indices + [0])
            result.append(e)

    return result

示例#2

显示文件

def _plot_mean_std_band(series, color, label):
    """Plot the per-key mean of ``series`` with a +/- one std band.

    ``series`` maps an x value to a list of samples. Returns the x values in
    the mapping's iteration order so the caller can reuse them as ticks.
    """
    keys = list(series)
    mean = np.array([np.mean(series[k]) for k in keys])
    std = np.array([np.std(series[k]) for k in keys])
    plt.plot(keys, mean, color=color, label=label)
    plt.fill_between(keys, mean - std, mean + std, color=color, alpha=.2)
    return keys


def one_mask():
    """Measure how one extra [MASK] changes the predictions for a masked word.

    For every sentence in ``dataset`` the ten words with the highest values
    from ``calc_gradients`` are masked one at a time. For each of them a
    second mask is placed at distances ``1 .. max_words - 1`` on the longer
    side of the sentence, and ``find_differences`` compares the predictions
    with and without that second mask. The word-difference and
    distance-difference values are accumulated per distance and plotted to
    ``{root}saved_plots/all/`` every 50 sentences and after the last one.

    A checkpoint is skipped while no measurement has been collected; the
    unguarded version failed there when unpacking an empty ``zip``.
    """
    max_words = 20
    result_dist_diff_only = defaultdict(list)
    result_wdiff = defaultdict(list)

    last_enm = len(dataset) - 1
    for enm in range(len(dataset)):

        print(f"\nSentence {enm}: ", end="")
        data = dataset[enm]
        x, y = data["sentence"], data["label"]
        x = model_config.tokenizer.clean_up_tokenization(x)

        y = [0, 1] if y == 1 else [1, 0]
        y_prime = [1 - y[0], 1 - y[1]]

        s = Sentence(x)

        word_gradients = s.calc_gradients(y_prime)
        sorted_highest = np.argsort(word_gradients)[::-1]

        for observed_idx in sorted_highest[:10]:
            print(f"{observed_idx},", end="")
            # Walk towards the side of the sentence that has more words.
            sdir = 1 if len(s.words) - observed_idx > observed_idx else -1

            alt_s = Sentence(s.get_with_masked([observed_idx]))
            original_answer = alt_s.calc_mask_predictions()[observed_idx]
            if len(original_answer) == 0:
                continue

            for mask_distance in range(1, max_words):
                extra_idx = observed_idx + mask_distance * sdir
                if not 0 <= extra_idx < len(alt_s.words):
                    continue

                new_sen = Sentence(
                    alt_s.get_with_masked([extra_idx, observed_idx]))
                alt_sen_pred = new_sen.calc_mask_predictions()[observed_idx]

                _, avg_word_diff, dist_diff_only = find_differences(
                    original_answer, alt_sen_pred)

                result_wdiff[mask_distance].append(avg_word_diff)
                result_dist_diff_only[mask_distance].append(dist_diff_only)

        if enm % 50 == 0 or enm == last_enm:
            if not result_wdiff:
                # Both accumulators are filled in lockstep; with no samples
                # there is nothing to plot at this checkpoint.
                continue

            fig = plt.figure(figsize=(11, 8))
            plt.title(
                "Relation Bewertung der Wörter zur Nähe des nächsten [MASK]-Token"
            )
            plt.xlabel("Entfernung zum zusätzlichen [MASK]-Token")
            plt.xlim(0, max_words)
            plt.ylim(0., 0.65)
            plt.ylabel("Veränderung der Bewertung")

            _plot_mean_std_band(result_wdiff, 'r', "Wort-Unterschiede")
            idx = _plot_mean_std_band(result_dist_diff_only, 'green',
                                      "Distanz-Unterschiede")

            plt.xticks(idx)
            plt.legend()
            plt.savefig(f'{root}saved_plots/all/_besser_{enm}.png')
            plt.close(fig)

示例#3

显示文件

def two_mask():
    """Measure how two extra [MASK] tokens change a masked word's predictions.

    For each sentence in ``dataset`` the ten words ranked highest by
    ``calc_gradients`` are masked in turn. Two further masks are placed at
    every pair of offsets in ``[-max_words, max_words]`` that falls inside
    the sentence, and ``find_differences`` compares the resulting
    predictions with the single-mask baseline. After every second sentence
    (and the last one) the accumulated values are pickled to
    ``used_data.pickle`` and rendered as heatmaps under
    ``{root}saved_plots/2d/``.
    """
    max_words = 15
    result = defaultdict(list)
    result_dist_diff_only = defaultdict(list)
    result_wdiff = defaultdict(list)

    side = 2 * max_words + 1
    final_enm = len(dataset) - 1

    for enm in range(len(dataset)):

        print(f"\nSentence {enm}: ", end="")
        record = dataset[enm]
        x = record["sentence"]
        y = record["label"]
        x = model_config.tokenizer.clean_up_tokenization(x)

        y = [0, 1] if y == 1 else [1, 0]
        y_prime = [1 - y[0], 1 - y[1]]

        s = Sentence(x)

        word_gradients = s.calc_gradients(y_prime)
        sorted_highest = np.argsort(word_gradients)[::-1]

        for observed_idx in sorted_highest[:10]:
            print(f"{observed_idx},", end="")

            alt_s = Sentence(s.get_with_masked([observed_idx]))
            original_answer = alt_s.calc_mask_predictions()[observed_idx]
            if len(original_answer) == 0:
                continue

            for mask_distance1 in range(-max_words, max_words + 1):
                first_extra = observed_idx + mask_distance1
                # An out-of-range first offset rules out every pair with it.
                if not (0 <= first_extra < len(alt_s.words)):
                    continue

                for mask_distance2 in range(-max_words, max_words + 1):
                    second_extra = observed_idx + mask_distance2
                    if not (0 <= second_extra < len(alt_s.words)):
                        continue

                    # Apply the two extra masks one after the other.
                    once_masked = Sentence(
                        alt_s.get_with_masked([first_extra, observed_idx]))
                    new_sen = Sentence(
                        once_masked.get_with_masked(
                            [second_extra, observed_idx]))
                    alt_sen_pred = new_sen.calc_mask_predictions(
                    )[observed_idx]

                    avg_distance, avg_word_diff, dist_diff_only = find_differences(
                        original_answer, alt_sen_pred)

                    pair = (mask_distance1, mask_distance2)
                    result[pair].append(avg_distance)
                    result_wdiff[pair].append(avg_word_diff)
                    result_dist_diff_only[pair].append(dist_diff_only)

        if not (enm % 2 == 0 or enm == final_enm):
            continue

        all_variants = [(result, "result"), (result_wdiff, "wdiff"),
                        (result_dist_diff_only, "ddiff")]

        with open('used_data.pickle', 'wb') as handle:
            pickle.dump(all_variants, handle)

        for res, name in all_variants:
            # Offsets are shifted by max_words to become matrix indices;
            # pairs that were never measured stay at zero.
            matrix = np.zeros(shape=(side, side))
            for (i, j), samples in res.items():
                matrix[i + max_words, j + max_words] = np.mean(samples)

            plt.figure(figsize=(15, 12))
            ax = sns.heatmap(
                np.flip(matrix, axis=0),
                linewidth=0.0,
                xticklabels=list(range(-max_words, max_words + 1)),
                yticklabels=list(range(max_words, -max_words - 1, -1)))

            ax.set_title(
                "Durchschnittliche Veränderung der Wörter bei 2 MASK-Tokens"
            )
            plt.savefig(f'{root}saved_plots/2d/{name}_{enm}.pdf')
            plt.close()

示例#4

显示文件

def expand_sentence_neighbour(text_p: Union[str, Example], query,
                              word_indices: List[int],
                              schedule_idx) -> List[Example]:
    """Fill several masked words one after another, keeping a limited beam.

    The first ``MBS_DEPTH`` usable indices are masked together. They are then
    filled in order: at each step every surviving example is expanded with
    the mask predictions for the current index, and before the next step only
    the first ``MBS_BEAMS`` entries returned by ``examples_sorted`` are kept.

    Parameters
    ----------
    text_p
        Raw sentence text, or an ``Example`` whose ``sentence`` is used.
    query : search_helper.classes.Query.Query
        Supplies ``wanted_cls``, ``c`` and ``consider_max_words``.
    word_indices
        Candidate word positions. Positions whose word consists only of the
        characters ``:,;.*`` are dropped.
    schedule_idx
        Value appended to each result's schedule indices.

    Returns
    -------
    The examples produced by the last step, ordered by ``examples_sorted``;
    an empty list when no usable index remains.
    """
    source_text = text_p.sentence if isinstance(text_p, Example) else text_p
    text = Sentence(source_text)

    word_indices = [
        wi for wi in list(word_indices)
        if not all([ch in ":,;.*" for ch in text.words[wi]])
    ]
    if not word_indices:
        return []

    word_indices = word_indices[:MBS_DEPTH]
    masked_text = Sentence(text.get_with_masked(word_indices))

    # The search starts from the fully masked sentence with empty histories.
    seed_beam = [Example(masked_text.text, [], [], [], [], [])]

    results = []
    for word_idx in word_indices:
        debug(f"expand neighbours: {word_idx} of {word_indices}")
        survivors = examples_sorted(results, query.wanted_cls,
                                    query.c)[:MBS_BEAMS]
        results = []
        beam = seed_beam if word_idx == word_indices[0] else survivors

        for interm_example in beam:
            intermediate_sen = Sentence(interm_example)
            # NOTE(review): a second Sentence is built from the same example
            # for the prediction call; kept separate because it is not
            # visible here whether calc_mask_predictions alters its receiver.
            predictions = Sentence(interm_example).calc_mask_predictions(
                query.consider_max_words)
            candidates = predictions[word_idx]
            if not candidates:
                continue

            sentences = [
                intermediate_sen.replace_mask(word_idx, predicted_token)
                for predicted_token, _ in candidates
            ]
            classification = calc_sentiment_batch(sentences)

            for i, (_, score) in enumerate(candidates):
                expanded = Example(
                    sentences[i],
                    classification[i],
                    interm_example.changes + [(word_idx, score)],
                    pred_ind=interm_example.prediction_indices + [i],
                    sched_ind=interm_example.schedule_indices +
                    [schedule_idx],
                    sent_ind=[0])
                results.append(expanded)

    return examples_sorted(results, query.wanted_cls, query.c)

示例#5

显示文件

# Compare, for each word of one sentence, the tokens predicted for a mask at
# that position with the word2vec nearest neighbours of the word, and print
# both tables as LaTeX.
wv = api.load('word2vec-google-news-300')
wv: Word2VecKeyedVectors

s = """my thoughts were focused on the characters ."""
s = model_config.tokenizer.clean_up_tokenization(s)
sen_s = Sentence(s)

df_wv = {}
df = {}
for i, word in enumerate(sen_s.words):
    # Substring test: skips any word that occurs inside the punctuation string.
    if word in ".:,?!-(){}[]/\\|&%":
        continue

    # NOTE(review): a bare int is passed here while other call sites pass a
    # list of indices; confirm get_with_masked accepts both.
    new_s = sen_s.get_with_masked(i)
    bert_preds = Sentence(new_s).calc_mask_predictions()[i]

    wv_preds = wv.most_similar(word, topn=15, restrict_vocab=200_000)

    # Both prediction lists are unzipped; only the first column is kept.
    wv_columns = list(zip(*wv_preds))
    bert_columns = list(zip(*bert_preds))

    df_wv[word] = list(wv_columns[0])
    df[word] = bert_columns[0][:15]

df = pd.DataFrame(df)
df_wv = pd.DataFrame(df_wv)
print("BERT")
print(df.to_latex())
print("WordVec")
print(df_wv.to_latex())