Example #1
def correctTranslation():
    data = request.get_json()
    translation = data["translation"]
    beam = data["beam"]
    document_unk_map = data["document_unk_map"]
    attention = data["attention"]
    document_id = data["document_id"]
    sentence_id = data["sentence_id"]

    document = get_document(document_id)

    extractor = DomainSpecificExtractor(source_file=document.filepath, src_lang=SRC_LANG, tgt_lang=TGT_LANG,
                                        train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{SRC_LANG}",
                                        train_vocab_file=f".data/vocab/train_vocab_{SRC_LANG}.pkl")
    keyphrases = extractor.extract_keyphrases()

    for key in document_unk_map:
        if key not in document.unk_map:
            document.unk_map[key] = document_unk_map[key]
        else:
            # Merge list values
            document.unk_map[key] = list(set(document.unk_map[key]) | set(document_unk_map[key]))

    sentence = document.sentences[int(sentence_id)]

    if translation != sentence.translation:
        sentence.diff = html_diff(sentence.translation[:-4].replace("@@ ", ""),
                                  translation[:-4].replace("@@ ", ""))
    sentence.translation = translation
    sentence.corrected = True
    sentence.flagged = False
    sentence.attention = attention
    sentence.beam = beam

    scorer = Scorer()
    score = scorer.compute_scores(sentence.source, sentence.translation, attention, keyphrases, "")
    score["order_id"] = sentence.score["order_id"]
    sentence.score = score

    document.sentences[int(sentence_id)] = sentence

    save_document(document, document_id)

    return jsonify({})
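
A minimal client-side sketch of how this handler could be exercised from the editing frontend. The route path, port, and payload values are assumptions; the snippet above does not show the Flask route decorator or any schema beyond the keys it reads.

import requests

payload = {
    "translation": "corrected target text @@ with BPE markers",  # placeholder text
    "beam": {},                 # beam search tree as produced by the UI
    "attention": [],            # attention matrix for this sentence
    "document_unk_map": {},     # UNK replacements collected while editing
    "document_id": "some-document-id",
    "sentence_id": "0",
}
# Hypothetical endpoint; the actual route is not shown in the snippet above.
requests.post("http://localhost:5000/correctTranslation", json=payload)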
Example #2
def filterForSimilarSentences(document_id, reference_id):
    document = get_document(document_id)
    reference = document.sentences[int(reference_id)].source
    print("Filter for similar sentences: " + str(reference))

    scorer = Scorer()

    keyphrases = []
    for k in document.keyphrases:
        keyphrases.append((k["name"], k["occurrences"]))

    for i, sentence in enumerate(document.sentences):
        score = scorer.compute_scores(sentence.source, " ".join(sentence.translation), sentence.attention,
                                      keyphrases, reference)
        score["order_id"] = i

        sentence.score = score

    save_document(document, document_id)

    return jsonify({})
Example #3
class CorrelationExperiment:
    def __init__(self,
                 model,
                 source_file,
                 target_file,
                 source_file2,
                 target_file2,
                 num_sentences=1000,
                 beam_size=3):
        self.model = model
        self.source_file = source_file
        self.target_file = target_file
        self.source_file2 = source_file2
        self.target_file2 = target_file2
        self.num_sentences = num_sentences
        self.beam_size = beam_size

        self.translationList = []
        self.pairs = []
        self.scoresList = []

        self.scorer = Scorer()

        self.metric_to_cter = {}
        self.all_cter_scores = []

        self.metric_to_bad = {}

    # metric order -> correlation
    def plot_correlation(self, dir, prefix, filename):
        palette = sns.color_palette()

        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)

            x, y = [], []

            score_cter_tuples = []
            cters = []
            for score in self.metric_to_cter[metric]:
                for v in self.metric_to_cter[metric][score]:
                    cters.append(v)
                score_cter_tuples += [
                    (score, v) for v in self.metric_to_cter[metric][score]
                ]
                values = self.metric_to_cter[metric][score]
                x += [score] * len(values)
                y += values

            score_cter_tuples = sorted(score_cter_tuples,
                                       key=lambda x: x[0],
                                       reverse=reverse_sort_direction[metric])

            self.metric_to_bad[metric] = score_cter_tuples

            axes.set_ylim(-0.1, 1.1)

            plt.xticks(fontsize=15)
            plt.yticks(fontsize=15)

            corr, p_val = pearsonr(x, y)

            axes.text(0.05,
                      0.95,
                      "r = {0:.2f}".format(corr.item()),
                      transform=axes.transAxes,
                      va="top",
                      fontsize=13,
                      weight="bold")

            sns.regplot(x,
                        y,
                        ax=axes,
                        scatter_kws={'alpha': 0.2},
                        order=1,
                        color=palette[i])

            plt.ylabel("CharacTER", fontsize=17)
            plt.xlabel("Metric: " + name_map[metric], fontsize=17)
            plt.savefig(os.path.join(dir, prefix + "_" + metric + filename))
            plt.close()

    # metric order -> document quality
    def plot_bad(self, dir, prefix, filename):

        palette = sns.color_palette()

        metric_percentage = {}

        mean = statistics.mean(self.all_cter_scores)
        stdev = statistics.stdev(self.all_cter_scores)
        threshold = mean + stdev

        for metric in metrics:
            bad_percentage = []
            curr_bad_count = 0
            for score, cter in self.metric_to_bad[metric]:
                if cter >= threshold:
                    curr_bad_count += 1
                bad_percentage.append(curr_bad_count)
            metric_percentage[metric] = bad_percentage

        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)

            bad_percentage = metric_percentage[metric]

            x = [
                100 * i / len(bad_percentage)
                for i in range(1,
                               len(bad_percentage) + 1)
            ]
            y = [100 * p / max(bad_percentage) for p in bad_percentage]
            line, = plt.plot(x, y, color=palette[i], linewidth=2, alpha=0.9)
            line.set_label(name_map[metric])
            line, = plt.plot(x,
                             x,
                             marker='',
                             linestyle="--",
                             color='black',
                             linewidth=1,
                             alpha=0.9)
            line.set_label("theoretical baseline")

            for m in metrics:
                if m == metric:
                    continue
                line, = plt.plot([
                    100 * i / len(metric_percentage[m])
                    for i in range(1,
                                   len(metric_percentage[m]) + 1)
                ], [
                    100 * p / max(metric_percentage[m])
                    for p in metric_percentage[m]
                ],
                                 marker='',
                                 color=palette[metrics.index(m)],
                                 linewidth=1,
                                 alpha=0.5,
                                 label=name_map[m],
                                 linestyle="-")

            plt.legend(loc='upper left', ncol=1, fontsize=12)

            plt.yticks([0, 25, 50, 75, 100], fontsize=15)
            plt.xticks([0, 25, 50, 75, 100], fontsize=15)

            plt.ylabel("% sentences with low quality covered", fontsize=17)
            plt.xlabel("% sentences covered (metric: " + name_map[metric] +
                       ")",
                       fontsize=17)

            plt.savefig(
                os.path.join(dir, prefix + "_percentages" + "_" + metric +
                             filename))
            print("saved bad")
            plt.close()

    # BLEU of the remaining text.
    # Sentences are sorted from good to bad according to the metric, and the BLEU score is computed
    # cumulatively up to the current sentence.
    # The plot shows the BLEU score when bad sentences are removed first, until only the best sentence remains.
    def plot_bleu(self, dir, prefix, filename):

        palette = sns.color_palette()

        metric_values = {}

        for metric in metrics:

            sorted_sentences = [(x, y, z) for _, x, y, z in sorted(
                zip([s[metric] for s in self.scoresList],
                    [p[0] for p in self.pairs], [p[1] for p in self.pairs], [
                        " ".join(translation[:-1])
                        for translation in self.translationList
                    ]),
                reverse=not reverse_sort_direction[metric])]
            sources, targets, translations = zip(*sorted_sentences)

            values = []
            for i in range(len(sources)):
                s = list(targets[:i + 1])
                t = list(translations[:i + 1])

                bleu = compute_bleu(s, t)
                values.append(bleu)
            values.reverse()
            metric_values[metric] = values

        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)

            values = metric_values[metric]

            x = [
                100 * i / len(values[:-25])
                for i in range(1,
                               len(values[:-25]) + 1)
            ]
            y = [100 * p for p in values[:-25]]
            line, = plt.plot(x, y, color=palette[i], linewidth=2, alpha=0.9)
            line.set_label(name_map[metric])

            for m in metrics:
                if m == metric:
                    continue
                line, = plt.plot([
                    100 * i / len(metric_values[m][:-25])
                    for i in range(1,
                                   len(metric_values[m][:-25]) + 1)
                ], [100 * p for p in metric_values[m][:-25]],
                                 marker='',
                                 color=palette[metrics.index(m)],
                                 linewidth=1,
                                 alpha=0.5,
                                 label=name_map[m],
                                 linestyle="-")
            #line.set_label("other metrics")

            plt.legend(loc='upper left', ncol=1, fontsize=12)

            plt.yticks(fontsize=15)
            plt.xticks([0, 25, 50, 75, 100], fontsize=15)

            plt.ylabel("BLEU", fontsize=17)
            plt.xlabel("% sentences covered (metric: " + name_map[metric] +
                       ")",
                       fontsize=17)

            plt.savefig(os.path.join(dir, prefix + "_" + metric + filename))
            print("saved bleu")
            plt.close()

    # CharacTER per sentence.
    # Sentences are sorted from bad to good according to the metric.
    # The plot shows the CharacTER score of the currently processed sentence.
    def plot_cter2(self, dir, prefix, filename):

        palette = sns.color_palette()

        metric_values = {}

        for metric in metrics:

            sorted_sentences = [(x, y, z) for _, x, y, z in sorted(
                zip([s[metric] for s in self.scoresList],
                    [p[0] for p in self.pairs], [p[1] for p in self.pairs], [
                        " ".join(translation[:-1])
                        for translation in self.translationList
                    ]),
                reverse=reverse_sort_direction[metric])]
            sources, targets, translations = zip(*sorted_sentences)

            values = []
            for i in range(len(sources)):
                s = targets[i]
                t = translations[i]

                cter = compute_cter(s, t)
                values.append(cter)
            metric_values[metric] = values

        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)

            axes.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

            values = metric_values[metric]

            x = [100 * i / len(values) for i in range(1, len(values) + 1)]
            y = [p for p in values]
            plt.plot(x, y, color=palette[i], linewidth=1, alpha=0.9)

            plt.xlim(0, 100)

            def movingaverage(interval, window_size):
                window = np.ones(int(window_size)) / float(window_size)
                return np.convolve(interval, window, 'valid')

            y_av = movingaverage(y, 100)
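            # 'valid' convolution with a window of 100 yields len(y) - 99 points,
            # so x is trimmed by 50 at the front and 49 at the back below to keep
            # the smoothed curve aligned with the raw CharacTER curve.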

            plt.plot(x[50:-49], y_av, color='black', linewidth=3, alpha=0.9)

            plt.yticks(fontsize=15)
            plt.xticks([0, 25, 50, 75, 100], fontsize=15)

            plt.ylabel("CharacTER", fontsize=17)
            plt.xlabel("% sentences covered (metric: " + name_map[metric] +
                       ")",
                       fontsize=17)

            plt.savefig(os.path.join(dir, prefix + "_" + metric + filename))
            print("saved characTER (2)")
            plt.close()

    # Average CharacTER of the remaining text.
    # Sentences are sorted from good to bad according to the metric, and the average CharacTER score is
    # computed cumulatively up to the current sentence.
    # The plot shows the average CharacTER score when bad sentences are removed first, until only the best sentence remains.
    def plot_cter(self, dir, prefix, filename):

        palette = sns.color_palette()

        metric_values = {}

        for metric in metrics:

            sorted_sentences = [(x, y, z) for _, x, y, z in sorted(
                zip([s[metric] for s in self.scoresList],
                    [p[0] for p in self.pairs], [p[1] for p in self.pairs], [
                        " ".join(translation[:-1])
                        for translation in self.translationList
                    ]),
                reverse=not reverse_sort_direction[metric])]
            sources, targets, translations = zip(*sorted_sentences)

            values = []
            val = 0
            for i in range(len(sources)):
                s = targets[i]
                t = translations[i]

                cter = compute_cter(s, t)
                val += cter
                values.append(val / (i + 1))

            values.reverse()
            metric_values[metric] = values

        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)

            values = metric_values[metric]

            x = [
                100 * i / len(values[:-25])
                for i in range(1,
                               len(values[:-25]) + 1)
            ]
            y = [p for p in values[:-25]]
            line, = plt.plot(x, y, color=palette[i], linewidth=2, alpha=0.9)
            line.set_label(name_map[metric])

            for m in metrics:
                if m == metric:
                    continue
                line, = plt.plot([
                    100 * i / len(metric_values[m][:-25])
                    for i in range(1,
                                   len(metric_values[m][:-25]) + 1)
                ], [p for p in metric_values[m][:-25]],
                                 marker='',
                                 color=palette[metrics.index(m)],
                                 linewidth=1,
                                 alpha=0.5,
                                 label=name_map[m],
                                 linestyle="-")
            line.set_label("other metrics")

            plt.legend(loc='upper left', ncol=1, fontsize=12)

            plt.yticks(fontsize=15)
            plt.xticks([0, 25, 50, 75, 100], fontsize=15)

            plt.ylabel("CharacTER", fontsize=17)
            plt.xlabel("% sentences covered (metric: " + name_map[metric] +
                       ")",
                       fontsize=17)

            plt.savefig(os.path.join(dir, prefix + "_" + metric + filename))
            print("saved characTER")
            plt.close()

    # distribution plot
    def plot_distr(self, dir, prefix, filename):
        palette = sns.color_palette()

        bins_map = {"length": 60}

        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)

            metric_scores = []
            for value in self.metric_to_cter[metric]:
                metric_scores += len(
                    self.metric_to_cter[metric][value]) * [value]

            if metric == "length":
                bins_map["length"] = max(metric_scores) - min(
                    metric_scores) + 1

            plt.ylabel("Density", fontsize=17)
            plt.xlabel("Metric: " + name_map[metric], fontsize=17)

            plt.xticks(fontsize=15)
            plt.yticks(fontsize=15)

            bins = bins_map[metric] if metric in bins_map else None
            dist_ax = sns.distplot(metric_scores,
                                   ax=axes,
                                   color=palette[i],
                                   bins=bins,
                                   hist_kws={"alpha": 0.2})
            ax2 = dist_ax.twinx()
            sns.boxplot(x=metric_scores, ax=ax2, color=palette[i])
            ax2.set(ylim=(-5, 5))

            plt.savefig(os.path.join(dir, prefix + "_" + metric + filename))
            plt.close()

        f, axes = plt.subplots()
        f.set_figheight(6)
        f.set_figwidth(6)
        plt.ylabel("Density", fontsize=17)
        plt.xlabel("CharacTER", fontsize=17)
        sns.distplot(self.all_cter_scores)
        plt.savefig(os.path.join(dir, prefix + "_" + "cter_dist.png"))
        plt.close()

    def run(self, src_lang, tgt_lang, dir, translationFile, scoresFile,
            attFile):
        loader = LanguagePairLoader(src_lang, tgt_lang, self.source_file,
                                    self.target_file)
        _, _, pairs = loader.load()

        loader2 = LanguagePairLoader(src_lang, tgt_lang, self.source_file2,
                                     self.target_file2)
        _, _, pairs2 = loader2.load()

        # concatenate both sets => all 1500 sentences
        pairs = pairs + pairs2

        self.pairs = pairs[:self.num_sentences]

        # Translate sources
        sources, targets, translations = [p[0] for p in self.pairs
                                          ], [p[1] for p in self.pairs], []

        extractor = DomainSpecificExtractor(
            source_file=self.source_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            train_source_file=
            f".data/wmt14/train.tok.clean.bpe.32000.{src_lang}",
            train_vocab_file=f".data/vocab/train_vocab_{src_lang}.pkl")

        keyphrases = extractor.extract_keyphrases(n_results=100)

        self.translationList = []
        attentionList = []
        self.scoresList = []
        prefix = "_experiments/translated_beam3"

        if os.path.isfile(os.path.join(prefix, translationFile)) \
                and os.path.isfile(os.path.join(prefix, scoresFile)) \
                and os.path.isfile(os.path.join(prefix, attFile)):
            print("Translation reloaded")
            with open(os.path.join(prefix, translationFile), 'rb') as f:
                self.translationList = pickle.load(f)
            with open(os.path.join(prefix, attFile), 'rb') as f:
                attentionList = pickle.load(f)
            with open(os.path.join(prefix, scoresFile), 'rb') as f:
                self.scoresList = pickle.load(f)

        else:
            for i, pair in enumerate(self.pairs):
                if i % 10 == 0:
                    print("Translated {} of {}".format(i, len(self.pairs)))

                translation, attn, _ = self.model.translate(
                    pair[0], beam_size=self.beam_size)
                translations.append(" ".join(translation[:-1]))

                scores = self.scorer.compute_scores(pair[0],
                                                    " ".join(translation),
                                                    attn, keyphrases, "")

                self.translationList.append(translation)
                attentionList.append(attn)
                self.scoresList.append(scores)

            pickle.dump(self.translationList,
                        open(os.path.join(dir, translationFile), "wb"))
            pickle.dump(self.scoresList,
                        open(os.path.join(dir, scoresFile), "wb"))
            pickle.dump(attentionList, open(os.path.join(dir, attFile), "wb"))

        for i, pair in enumerate(self.pairs):
            if i % 10 == 0:
                print("Processing {} of {}".format(i, len(self.pairs)))

            for metric in self.scoresList[i]:
                #if metric == "coverage_penalty" and self.scoresList[i][metric] > 45: # remove some outliers
                #    continue
                #if metric == "keyphrase_score" and self.scoresList[i][metric] == 0:
                #    continue

                if metric not in self.metric_to_cter:
                    self.metric_to_cter[metric] = {}
                if self.scoresList[i][metric] not in self.metric_to_cter[metric]:
                    self.metric_to_cter[metric][self.scoresList[i][metric]] = []

                cter = compute_cter(pair[1],
                                    " ".join(self.translationList[i][:-1]))
                self.all_cter_scores.append(cter)
                self.metric_to_cter[metric][self.scoresList[i][metric]].append(
                    cter)
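
A minimal driver sketch for the class above. It assumes the module-level helpers and globals the methods rely on (load_model, DEVICE, metrics, name_map, reverse_sort_direction, compute_bleu, compute_cter); all file paths, pickle file names, and the model-type string are placeholders.

# All paths and the "seq2seq" model-type string are placeholders.
model = load_model("de", "en", "seq2seq", device=DEVICE)
exp = CorrelationExperiment(model,
                            source_file="data/khresmoi.dev.tok.de",
                            target_file="data/khresmoi.dev.tok.en",
                            source_file2="data/khresmoi.test.tok.de",
                            target_file2="data/khresmoi.test.tok.en",
                            num_sentences=1000,
                            beam_size=3)
exp.run("de", "en", "_experiments/correlation",
        "translations.pkl", "scores.pkl", "attentions.pkl")
# plot_correlation must run first: it fills metric_to_bad, which plot_bad reads.
exp.plot_correlation("_experiments/correlation", "corr", ".png")
exp.plot_bad("_experiments/correlation", "corr", ".png")
exp.plot_distr("_experiments/correlation", "corr", ".png")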
Example #4
class MetricExperiment:
    def __init__(
        self,
        model,
        src_lang,
        tgt_lang,
        model_type,
        source_file,
        target_file,
        test_source_file,
        test_target_file,
        dir,
        evaluate_every=10,
        num_sentences=400,
        num_sentences_test=500,
        reuseCalculatedTranslations=False,
        reuseInitialTranslations=False,
        initialTranslationFile="",
        initialScoreFile="",
        initialTestTranslationFile="",
        translationFile="",
        batch_translate=True,
    ):

        self.model = model
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.model_type = model_type
        self.source_file = source_file
        self.target_file = target_file
        self.loader = LanguagePairLoader(src_lang, tgt_lang, source_file,
                                         target_file)
        self.test_loader = LanguagePairLoader(src_lang, tgt_lang,
                                              test_source_file,
                                              test_target_file)

        self.extractor = DomainSpecificExtractor(
            source_file=source_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            train_source_file=
            f".data/wmt14/train.tok.clean.bpe.32000.{src_lang}",
            train_vocab_file=f".data/vocab/train_vocab_{src_lang}.pkl")

        self.target_extractor = DomainSpecificExtractor(
            source_file=target_file,
            src_lang=tgt_lang,
            tgt_lang=src_lang,
            train_source_file=
            f".data/wmt14/train.tok.clean.bpe.32000.{tgt_lang}",
            train_vocab_file=f".data/vocab/train_vocab_{tgt_lang}.pkl")

        self.scorer = Scorer()
        self.scores = {}
        self.num_sentences = num_sentences
        self.num_sentences_test = num_sentences_test
        self.batch_translate = batch_translate
        self.evaluate_every = evaluate_every
        self.reuseCalculatedTranslations = reuseCalculatedTranslations
        self.reuseInitialTranslations = reuseInitialTranslations

        self.initialTranslationFile = initialTranslationFile
        self.initialScoreFile = initialScoreFile
        self.initialTestTranslationFile = initialTestTranslationFile
        self.translationFile = translationFile

        self.metric_bleu_scores = {}
        self.metric_gleu_scores = {}
        self.metric_precisions = {}
        self.metric_recalls = {}

        self.prefix = "_experiments/retrain_beam3"
        self.dir = dir

    def save_data(self):
        prefix = ("batch_" if self.batch_translate else "beam_") + str(
            self.evaluate_every) + "_"
        prefix = os.path.join(self.dir, prefix)
        pickle.dump(self.metric_bleu_scores,
                    open(prefix + "metric_bleu_scores.pkl", "wb"))
        pickle.dump(self.metric_gleu_scores,
                    open(prefix + "metric_gleu_scores.pkl", "wb"))
        pickle.dump(self.metric_precisions,
                    open(prefix + "metric_precisions.pkl", "wb"))
        pickle.dump(self.metric_recalls,
                    open(prefix + "metric_recalls.pkl", "wb"))
        print("Saved all scores")

    def save_translation(self, translation, metric, step):
        name = os.path.join(self.dir,
                            metric + "_" + str(step) + self.translationFile)
        pickle.dump(translation, open(name, "wb"))
        print("Saved: " + name)

    def restore_translation(self, metric, step):
        name = os.path.join(self.prefix,
                            metric + "_" + str(step) + self.translationFile)
        with open(name, 'rb') as f:
            return pickle.load(f)

    def save_initialTranslation(self, scores, translations):
        name = os.path.join(self.dir, self.initialTranslationFile)
        pickle.dump(translations, open(name, "wb"))
        name = os.path.join(self.dir, self.initialScoreFile)
        pickle.dump(scores, open(name, "wb"))
        print("Saved: " + name)

    def restore_initialTranslation(self):
        name = os.path.join(self.prefix, self.initialTranslationFile)
        with open(name, 'rb') as f:
            translations = pickle.load(f)
        name = os.path.join(self.prefix, self.initialScoreFile)
        with open(name, 'rb') as f:
            scores = pickle.load(f)
        return translations, scores

    def save_initialTestTranslation(self, translations):
        name = os.path.join(self.dir, self.initialTestTranslationFile)
        pickle.dump(translations, open(name, "wb"))
        print("Saved: " + name)

    def restore_initialTestTranslation(self):
        name = os.path.join(self.prefix, self.initialTestTranslationFile)
        with open(name, 'rb') as f:
            return pickle.load(f)

    def run(self):
        _, _, pairs = self.loader.load()
        random.shuffle(pairs)

        pairs = pairs[:self.num_sentences]
        sources, targets, translations = [p[0] for p in pairs
                                          ], [p[1] for p in pairs], []

        keyphrases = self.extractor.extract_keyphrases(n_results=100)

        target_keyphrases = self.target_extractor.extract_keyphrases(
            n_results=100)

        # translation and scores for order of retraining
        print('Translating ...')
        if not self.reuseCalculatedTranslations and not self.reuseInitialTranslations:
            for i, pair in enumerate(tqdm(pairs)):
                translation, attn, _ = self.model.translate(pair[0])
                translations.append(" ".join(translation[:-1]))

                metrics_scores = self.scorer.compute_scores(
                    pair[0], " ".join(translation[:-1]), attn, keyphrases, "")
                for metric in metrics_scores:
                    if metric not in self.scores:
                        self.scores[metric] = []
                    self.scores[metric].append(metrics_scores[metric])
            self.save_initialTranslation(self.scores, translations)
        else:
            translations, self.scores = self.restore_initialTranslation()

        # initial test set translation
        _, _, test_pairs = self.test_loader.load()
        test_pairs = test_pairs[:self.num_sentences_test]
        test_sources, test_targets, test_translations = [
            p[0] for p in test_pairs
        ], [p[1] for p in test_pairs], []

        if not self.reuseCalculatedTranslations and not self.reuseInitialTranslations:
            print('- not reusing translations: Translating...')
            for i, source in enumerate(tqdm(test_sources)):
                translation, attn, _ = self.model.translate(source)
                test_translations.append(" ".join(translation[:-1]))

            if self.batch_translate:
                test_translations = [
                    t[:-6] for t in self.model.batch_translate(test_sources)
                ]

            self.save_initialTestTranslation(test_translations)
        else:
            test_translations = self.restore_initialTestTranslation()

        metrics = [
            "random", "keyphrase_score", "coverage_penalty", "confidence",
            "length"
        ]

        print("Evaluating metrics...")
        for i, metric in enumerate(tqdm(metrics)):
            self.metric_bleu_scores[metric] = []
            self.metric_gleu_scores[metric] = []
            self.metric_precisions[metric] = []
            self.metric_recalls[metric] = []

            sourcesCopy = sources[:]
            targetsCopy = targets[:]
            translationsCopy = translations[:]

            self.evaluate_metric(
                self.src_lang,
                self.tgt_lang,
                self.model_type,
                sourcesCopy,
                targetsCopy,
                translationsCopy,
                self.scores[metric] if metric != "random" else [],
                metric,
                target_keyphrases,
                test_sources,
                test_targets,
                test_translations,
                need_sort=True if metric != "random" else False,
                reverse=reverse_sort_direction[metric]
                if metric != "random" else True)
            print()
            print(self.metric_bleu_scores)
            self.save_data()

    def shuffle_list(self, *ls):
        l = list(zip(*ls))

        random.shuffle(l)
        return zip(*l)

    def evaluate_metric(self,
                        src_lang,
                        tgt_lang,
                        model_type,
                        sources,
                        targets,
                        translations,
                        scores,
                        metric,
                        target_keyphrases,
                        test_sources,
                        test_targets,
                        test_translations,
                        need_sort=True,
                        reverse=False):
        print()
        print("Evaluating {}".format(metric))
        base_bleu = compute_bleu(targets, translations)
        print("Base BLEU (of retraining data): {}".format(base_bleu))

        # Sort by metric
        if need_sort:
            sorted_sentences = [(x, y, z) for _, x, y, z in sorted(
                zip(scores, sources, targets, translations), reverse=reverse)]
            sources, targets, translations = zip(*sorted_sentences)
        else:
            sources, targets, translations = self.shuffle_list(
                sources, targets, translations)

        n = len(sources)
        encoder_optimizer_state, decoder_optimizer_state = None, None

        pretraining_bleu = compute_bleu(test_targets, test_translations)
        pretraining_gleu = compute_gleu(test_targets, test_translations)
        print()
        print("pretraining BLEU of test set (before retraining)")
        print(pretraining_bleu)

        prerecall = unigram_recall(target_keyphrases, test_targets,
                                   test_translations)
        preprecision = unigram_precision(target_keyphrases, test_targets,
                                         test_translations)

        self.metric_bleu_scores[metric].append(
            (pretraining_bleu, pretraining_bleu))
        self.metric_gleu_scores[metric].append(
            (pretraining_gleu, pretraining_gleu))
        self.metric_recalls[metric].append((prerecall, prerecall))
        self.metric_precisions[metric].append((preprecision, preprecision))
        self.save_data()

        if isinstance(self.model, TransformerTranslator):
            # Create a checkpoint path that gets overwritten on each iteration;
            # necessary to load the trainer state.
            current_ckpt = f'.data/models/transformer/trafo_{src_lang}_{tgt_lang}_ensemble.pt'

        print('Training...')
        for i in tqdm(range(0, n)):
            # Retrain and re-evaluate only every evaluate_every-th sentence:
            # with evaluate_every = 10, the iterations at i = 0, 10, 20, ... train on
            # sentences (0..9), (10..19), (20..29), ... and then retranslate the test set.
            if i % self.evaluate_every != 0:
                continue

            if not self.reuseCalculatedTranslations:

                # Now train, and compute BLEU again
                start = i
                end = min(i + self.evaluate_every, n)

                print()
                print("Correcting {} - {} of {} sentences".format(
                    start, end - 1, n))

                if isinstance(self.model, Seq2SeqModel):
                    # same parameters that are used in the tool
                    encoder_optimizer_state, decoder_optimizer_state = retrain_iters(
                        self.model, [[x, y] for x, y in zip(
                            sources[start:end], targets[start:end])], [],
                        src_lang,
                        tgt_lang,
                        batch_size=1,
                        encoder_optimizer_state=encoder_optimizer_state,
                        decoder_optimizer_state=decoder_optimizer_state,
                        print_every=1,
                        n_epochs=15,
                        learning_rate=0.0001,
                        save_ckpt=i == n - 1)
                else:
                    # same parameters that are used in the tool
                    current_ckpt = self.model.retrain(
                        src_lang,
                        tgt_lang, [[x, y] for x, y in zip(
                            sources[start:end], targets[start:end])],
                        last_ckpt=current_ckpt,
                        epochs=15,
                        batch_size=1,
                        device=DEVICE,
                        save_ckpt=i == n - 1,
                        print_info=False)

                corrected_translations = []

                print(' - Translate using trained model')
                if not self.batch_translate:
                    # Translate trained model
                    for j in tqdm(range(0, len(test_sources))):
                        translation, _, _ = self.model.translate(
                            test_sources[j])
                        corrected_translations.append(" ".join(
                            translation[:-1]))
                else:
                    batch_translations = self.model.batch_translate(
                        test_sources)
                    corrected_translations = [
                        t[:-6] for t in batch_translations
                    ]

                self.save_translation(corrected_translations, metric, i)

            else:
                corrected_translations = self.restore_translation(metric, i)

            # Compute posttraining BLEU
            posttraining_bleu = compute_bleu(test_targets,
                                             corrected_translations)
            posttraining_gleu = compute_gleu(test_targets,
                                             corrected_translations)
            postrecall = unigram_recall(target_keyphrases, test_targets,
                                        corrected_translations)
            postprecision = unigram_precision(target_keyphrases, test_targets,
                                              corrected_translations)
            print("(Base BLEU {})".format(base_bleu))
            print("Delta Recall {} -> {}".format(prerecall, postrecall))
            print("Delta Precision {} -> {}".format(preprecision,
                                                    postprecision))
            print("Delta GLEU: {} -> {}".format(pretraining_gleu,
                                                posttraining_gleu))
            print("Delta BLEU: {} -> {}".format(pretraining_bleu,
                                                posttraining_bleu))

            delta_bleu = posttraining_bleu - pretraining_bleu
            print("Delta: {}".format(delta_bleu))

            self.metric_bleu_scores[metric].append(
                (pretraining_bleu, posttraining_bleu))
            self.metric_gleu_scores[metric].append(
                (pretraining_gleu, posttraining_gleu))
            self.metric_recalls[metric].append((prerecall, postrecall))
            self.metric_precisions[metric].append(
                (preprecision, postprecision))

            self.save_data()

        self.model = load_model(src_lang, tgt_lang, model_type,
                                device=DEVICE)  # reload initial model
        return None
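
A minimal driver sketch for MetricExperiment, again assuming the module-level load_model, DEVICE, and reverse_sort_direction; every path, file name, and the model-type string is a placeholder.

model = load_model("de", "en", "seq2seq", device=DEVICE)  # model-type string is a placeholder
exp = MetricExperiment(model, "de", "en", "seq2seq",
                       source_file="data/khresmoi.dev.tok.de",
                       target_file="data/khresmoi.dev.tok.en",
                       test_source_file="data/khresmoi.test.tok.de",
                       test_target_file="data/khresmoi.test.tok.en",
                       dir="_experiments/retrain_beam3",
                       evaluate_every=10,
                       num_sentences=400,
                       num_sentences_test=500,
                       initialTranslationFile="initial_translations.pkl",
                       initialScoreFile="initial_scores.pkl",
                       initialTestTranslationFile="initial_test_translations.pkl",
                       translationFile="translations.pkl",
                       batch_translate=False)
exp.run()    # writes metric_*_scores.pkl via save_data() after each metric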
Example #5
class AveragedMetricExperiment:
    def __init__(self, model, source_file, target_file, raw_source_file, raw_target_file, num_sentences=400):
        self.model = model
        self.source_file = source_file
        self.target_file = target_file
        self.loader = LanguagePairLoader("de", "en", source_file, target_file)
        self.extractor = DomainSpecificExtractor(source_file=raw_source_file, train_source_file=hp.source_file,
                                                 train_vocab_file="train_vocab.pkl")
        self.target_extractor = DomainSpecificExtractor(source_file=raw_target_file, train_source_file=hp.target_file,
                                                        train_vocab_file="train_vocab_en.pkl")
        self.scorer = Scorer()
        self.scores = {}
        self.num_sentences = num_sentences

        self.metric_bleu_scores = {}
        self.metric_gleu_scores = {}
        self.metric_precisions = {}
        self.metric_recalls = {}
        self.cer = {}

        # Plot each metric
        plt.style.use('seaborn-darkgrid')
        self.palette = sns.color_palette()

    def save_data(self):
        prefix = "averaged_"
        pickle.dump(self.metric_bleu_scores, open(prefix + "metric_bleu_scores.pkl", "wb"))
        pickle.dump(self.metric_gleu_scores, open(prefix + "metric_gleu_scores.pkl", "wb"))
        pickle.dump(self.metric_precisions, open(prefix + "metric_precisions.pkl", "wb"))
        pickle.dump(self.metric_recalls, open(prefix + "metric_recalls.pkl", "wb"))
        pickle.dump(self.cer, open(prefix + "metric_cer.pkl", "wb"))
        print("Saved all scores")

    def run(self):
        _, _, pairs = self.loader.load()
        random.seed(2018)
        random.shuffle(pairs)

        pairs = pairs[:self.num_sentences]

        sources, targets, translations = [p[0] for p in pairs], [p[1] for p in pairs], []

        keyphrases = self.extractor.extract_keyphrases(n_results=100)
        print(keyphrases)
        target_keyphrases = self.target_extractor.extract_keyphrases(n_results=100)
        print(target_keyphrases)

        for i, pair in enumerate(pairs):
            if i % 10 == 0:
                print("Translated {} of {}".format(i, len(pairs)))
            translation, attn, _ = self.model.translate(pair[0])
            translations.append(" ".join(translation[:-1]))

            metrics_scores = self.scorer.compute_scores(pair[0], " ".join(translation[:-1]), attn, keyphrases)
            for metric in metrics_scores:
                if metric not in self.scores:
                    self.scores[metric] = []
                self.scores[metric].append(metrics_scores[metric])

        metrics = [
            # "coverage_penalty",
            # "coverage_deviation_penalty",
            # "confidence",
            # "length",
            # "ap_in",
            # "ap_out",
            # "random",
            "keyphrase_score"
        ]
        n_iters = 1
        for i, metric in enumerate(metrics):
            avg_bleus = [0 for _ in range(1, 100 // (step_size * 2) + 1)]
            self.metric_bleu_scores[metric] = []
            self.metric_gleu_scores[metric] = []
            self.metric_precisions[metric] = []
            self.metric_recalls[metric] = []
            self.cer[metric] = []
            for j in range(n_iters):
                self.evaluate_metric(sources, targets, translations,
                                     self.scores[metric] if metric != "random" else [],
                                     metric,
                                     target_keyphrases,
                                     need_sort=True if metric != "random" else False,
                                     reverse=sort_direction[metric] if metric != "random" else True)

                # plt.plot(x, delta_bleus, marker='', linestyle="--", color=self.palette[i], linewidth=1, alpha=0.9,
                #        label=metric)
            self.save_data()

    def shuffle_list(self, *ls):
        l = list(zip(*ls))

        random.shuffle(l)
        return zip(*l)

    def evaluate_metric(self, sources, targets, translations, scores, metric, target_keyphrases,
                        need_sort=True,
                        reverse=False):
        print("Evaluating {}".format(metric))
        base_bleu = compute_bleu(targets, translations)
        print("Base BLEU: {}".format(base_bleu))
        # Sort by metric
        if need_sort:
            sorted_sentences = [(x, y, z) for _, x, y, z in
                                sorted(zip(scores, sources, targets, translations), reverse=reverse)]
            sources, targets, translations = zip(*sorted_sentences)
        else:
            sources, targets, translations = self.shuffle_list(sources, targets, translations)

        n = len(sources)
        encoder_optimizer_state, decoder_optimizer_state = None, None

        corrected_translations = []

        cer_improvement = []
        curr_cer = 0

        for i in range(1, n + 1):
            print()
            print("{}: Correcting {} of {} sentences".format(metric, i, n))

            curr_end = i

            # Compute BLEU before training for comparison
            pretraining_bleu = compute_bleu(targets[:curr_end], translations[:curr_end])
            pretraining_gleu = compute_gleu(targets[:curr_end], translations[:curr_end])
            prerecall = unigram_recall(target_keyphrases, targets[:curr_end], translations[:curr_end])
            preprecision = unigram_precision(target_keyphrases, targets[:curr_end], translations[:curr_end])

            precer = cer(targets[i - 1].replace("@@ ", "").split(), translations[i - 1].replace("@@ ", "").split())

            translation, _, _ = self.model.translate(sources[i - 1])
            corrected_translations.append(" ".join(translation[:-1]))

            postcer = cer(targets[i - 1].replace("@@ ", "").split(),
                          " ".join(translation[:-1]).replace("@@ ", "").split())
            curr_cer = precer - postcer
            cer_improvement.append(curr_cer)

            # Compute posttraining BLEU
            posttraining_bleu = compute_bleu(targets[:curr_end], corrected_translations)
            posttraining_gleu = compute_gleu(targets[:curr_end], corrected_translations)

            postrecall = unigram_recall(target_keyphrases, targets[:curr_end], corrected_translations)
            postprecision = unigram_precision(target_keyphrases, targets[:curr_end], corrected_translations)
            print("Delta Recall {} -> {}".format(prerecall, postrecall))
            print("Delta Precision {} -> {}".format(preprecision, postprecision))
            print("Delta BLEU: {} -> {}".format(pretraining_bleu, posttraining_bleu))
            print("Delta CER: {} -> {}".format(precer, postcer))

            self.metric_bleu_scores[metric].append((pretraining_bleu, posttraining_bleu))
            self.metric_gleu_scores[metric].append((pretraining_gleu, posttraining_gleu))
            self.metric_recalls[metric].append((prerecall, postrecall))
            self.metric_precisions[metric].append((preprecision, postprecision))

            # Now train, and compute BLEU again
            encoder_optimizer_state, decoder_optimizer_state = retrain_iters(self.model,
                                                                             [[sources[i - 1],
                                                                               targets[i - 1]]], [],
                                                                             batch_size=1,
                                                                             encoder_optimizer_state=encoder_optimizer_state,
                                                                             decoder_optimizer_state=decoder_optimizer_state,
                                                                             n_epochs=1, learning_rate=0.00005,
                                                                             weight_decay=1e-3)

        self.cer[metric] = cer_improvement
        reload_model(self.model)
        return None

    def plot(self):
        plt.xlabel('% Corrected Sentences')
        plt.ylabel('Δ BLEU')
        # Add titles
        plt.title("BLEU Change for Metrics", loc='center', fontsize=12, fontweight=0)
        # Add legend
        plt.legend(loc='lower right', ncol=1)
        plt.savefig('bleu_deltas.png')
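
A short driver sketch for this class; the model handle, data file paths, and vocab files are placeholders, and the surrounding module is assumed to provide the globals used above (hp, step_size, sort_direction, retrain_iters, reload_model, cer).

exp = AveragedMetricExperiment(seq2seq_model,          # placeholder model handle
                               source_file="data/khresmoi.tok.bpe.de",
                               target_file="data/khresmoi.tok.bpe.en",
                               raw_source_file="data/khresmoi.tok.de",
                               raw_target_file="data/khresmoi.tok.en",
                               num_sentences=400)
exp.run()     # retrains sentence by sentence and records CER/BLEU deltas
exp.plot()    # writes bleu_deltas.png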
Example #6
def documentUpload():
    if 'file' not in request.files:
        return redirect(request.url)
    file = request.files['file']
    # If the user does not select a file, the browser may
    # submit an empty part without a filename.
    if file.filename == '':
        return redirect(request.url)
    if file and allowed_file(file.filename):
        document_name = request.args.get("document_name")
        id = uuid4()
        filename = secure_filename(file.filename)
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)

        user = User.query.filter_by(username=get_jwt_identity()).first()
        dbDocument = DBDocument(id=id, name=document_name, user=user, model=model_name)

        document = Document(str(id), document_name, dict(), filepath)
        sentences = document.load_content(filename)
        sentences = list(filter(None, sentences))  # remove empty lines

        with open(filepath, "w", encoding='utf-8') as f:
            for i, sentence in enumerate(sentences):
                # Write every sentence; omit only the trailing newline after the last one.
                f.write(sentence.replace("@@ ", "") + ("\n" if i < len(sentences) - 1 else ""))

        extractor = DomainSpecificExtractor(source_file=filepath, src_lang=SRC_LANG, tgt_lang=TGT_LANG,
                                            train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{SRC_LANG}",
                                            train_vocab_file=f".data/vocab/train_vocab_{SRC_LANG}.pkl")
        keyphrases = extractor.extract_keyphrases(n_results=30)

        scorer = Scorer()

        print("Translating {} sentences".format(len(sentences)))

        beamSize = 3
        attLayer = -2
        for i, source in enumerate(sentences):
            translation, attn, translations = model.translate(
                source, beam_size=beamSize, attLayer=attLayer, beam_length=0.6, beam_coverage=0.4)
            print("Translated {} of {}".format(i + 1, len(sentences)))

            beam = translationsToTree(translations[:beamSize])

            # print("  ", translation)
            score = scorer.compute_scores(source, " ".join(translation), attn, keyphrases, "")
            score["order_id"] = i
            sentence = Sentence(i, source, " ".join(translation), attn, beam, score)

            document.sentences.append(sentence)

        print("Finished translation")

        keyphrases = [{"name": k, "occurrences": f, "active": False} for (k, f) in keyphrases]
        document.keyphrases = keyphrases
        db.session.add(dbDocument)
        db.session.commit()

        save_document(document, id)

        return jsonify({})
    return jsonify({})
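
A hypothetical client call for the upload handler above. The route path, port, and token are assumptions, since the snippet does not show the route decorator or the JWT setup; the file name and document name are placeholders.

import requests

with open("report.tok.bpe.de", "rb") as f:
    requests.post("http://localhost:5000/documentUpload",
                  params={"document_name": "Medical report"},
                  files={"file": f},
                  headers={"Authorization": "Bearer <access-token>"})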
Example #7
class CorrelationExperiment:
    def __init__(self, model, source_file, target_file, num_sentences=1000):
        self.model = model
        self.source_file = source_file
        self.target_file = target_file
        self.scorer = Scorer()
        self.num_sentences = num_sentences
        self.metric_to_gleu = {}
        self.all_gleu_scores = []
        self.metric_to_bad = {}
        self.bad_count = {}
        # self.threshold = 0.2826 - 0.167362
        self.threshold = 0.6

    def plot_correlation(self, filename):
        f, axes = plt.subplots(2, 3, sharey=True)
        f.set_figheight(8)
        f.set_figwidth(12)
        axes = np.reshape(axes, (6,))

        for i, metric in enumerate(metrics):
            x, y = [], []
            x_min = float('inf')
            x_max = float('-inf')

            x_temp = []

            for score in self.metric_to_gleu[metric]:
                values = self.metric_to_gleu[metric][score]
                x_temp += [score] * len(values)

            bad_count = 0
            score_gleu_tuples = []
            gleus = []
            for score in self.metric_to_gleu[metric]:
                for v in self.metric_to_gleu[metric][score]:
                    if v >= self.threshold:
                        bad_count += 1
                    gleus.append(v)
                score_gleu_tuples += [(score, v) for v in self.metric_to_gleu[metric][score]]
                values = self.metric_to_gleu[metric][score]
                x_min = min(x_min, score)
                x_max = max(x_max, score)
                x += [score] * len(values)
                y += values
                # plt.scatter([score] * len(values), values, color=palette(0), alpha=0.5)

            print(bad_count)
            print("Median {}".format(np.median(gleus)))
            print("Std {}".format(np.std(gleus)))
            self.bad_count[metric] = bad_count

            score_gleu_tuples = sorted(score_gleu_tuples, key=lambda x: x[0], reverse=sort_direction[metric])

            self.metric_to_bad[metric] = score_gleu_tuples

            b, m = P.polyfit(x, y, 1)
            axes[i].set_ylim(-0.1, 1.1)
            if metric == "ap_out":
                axes[i].set_xlim(0, 2.5)
            if metric == "shortness_penalty":
                axes[i].set_xlim(0, 1)
            corr, p_val = pearsonr(x, y)

            axes[i].text(0.05, 0.95, "r = {0:.2f}".format(corr.item()), transform=axes[i].transAxes, va="top",
                         fontsize=13, weight="bold")

            axes[i].set_title(name_map[metric],
                              {'fontsize': 15, 'horizontalalignment': 'left'}, "left")
            sns.regplot(x, y, ax=axes[i], scatter_kws={'alpha': 0.2}, order=1)
            # plt.plot(np.asarray([x_min, x_max]), b + m * np.asarray([x_min, x_max]), '-')

        axes[0].set(ylabel="CharacTER")
        axes[3].set(ylabel="CharacTER")
        plt.tight_layout()
        plt.savefig(filename)
        plt.close()

    def plot_bad(self, filename):
        f, axes = plt.subplots(2, 3, sharey=True)
        f.set_figheight(8)
        f.set_figwidth(12)
        axes = np.reshape(axes, (6,))
        palette = sns.color_palette()

        metric_percentage = {}
        for metric in metrics:
            bad_percentage = []
            curr_bad_count = 0
            for score, gleu in self.metric_to_bad[metric]:
                if gleu >= self.threshold:
                    curr_bad_count += 1
                bad_percentage.append(curr_bad_count / self.bad_count[metric])
            metric_percentage[metric] = bad_percentage

        print(len([metric_percentage[m] for m in metric_percentage]))

        for i, metric in enumerate(metrics):
            plt.subplot(2, 3, i + 1)
            bad_percentage = metric_percentage[metric]

            percentiles = [0.25, 0.5, 0.75]
            indices = []
            for perc in percentiles:
                indices.append(next(x[0] for x in enumerate(bad_percentage) if x[1] >= perc) / len(bad_percentage))
            print(metric)
            print(indices)

            n = len(bad_percentage)
            x = [100 * i / n for i in range(1, n + 1)]
            plt.plot(x, [100 * p for p in bad_percentage], color=palette[i], linewidth=2, alpha=0.9)
            plt.plot(x, x, marker='', linestyle="--", color='black',
                     linewidth=1.5, alpha=0.9)

            for m in metrics:
                plt.plot([100 * i / n for i in range(1, len(metric_percentage[m]) + 1)],
                         [100 * p for p in metric_percentage[m]], marker='', color='grey', linewidth=1, alpha=0.3)

            if i + 1 not in [1, 4]:
                plt.tick_params(labelleft='off')

            plt.yticks([0, 25, 50, 75, 100])
            plt.xticks([0, 25, 50, 75, 100])
            # Add title
            plt.title(name_map[metric], loc='left', fontsize=15, fontweight=0)
            if i + 1 == 5:
                plt.xlabel("Percentile Threshold", fontsize=15)
            if i + 1 == 4 or i + 1 == 1:
                plt.ylabel("% Covered", fontsize=15)
        plt.tight_layout()
        plt.savefig(filename)
        print("saved bad")
        plt.close()

    def plot_distr(self, filename):
        palette = sns.color_palette()
        f, axes = plt.subplots(2, 3)
        f.set_figheight(8)
        f.set_figwidth(12)
        axes = np.reshape(axes, (6,))

        bins_map = {"length": 60}
        metrics = [
            "coverage_penalty",
            "coverage_deviation_penalty",
            "confidence",
            "length",
            "ap_in",
            "ap_out"
        ]

        for i, metric in enumerate(metrics):
            metric_scores = []
            for value in self.metric_to_gleu[metric]:
                metric_scores += len(self.metric_to_gleu[metric][value]) * [value]

            if metric == "length":
                bins_map["length"] = max(metric_scores) - min(metric_scores) + 1
                # axes[i].set_xlim(0, 61)

            axes[i].set_title(name_map[metric], {'fontsize': 15, 'horizontalalignment': 'left'}, "left")
            bins = bins_map[metric] if metric in bins_map else None
            dist_ax = sns.distplot(metric_scores, ax=axes[i], color=palette[i], bins=bins, hist_kws={"alpha": 0.2})
            ax2 = dist_ax.twinx()
            sns.boxplot(x=metric_scores, ax=ax2, color=palette[i])
            ax2.set(ylim=(-5, 5))
        plt.tight_layout()
        plt.savefig(filename)
        plt.clf()

        f.set_figheight(4)
        f.set_figwidth(4)
        sns.distplot(self.all_gleu_scores)
        plt.tight_layout()
        plt.savefig("gleu_dist.png")
        plt.clf()

    def run(self):
        loader = LanguagePairLoader("de", "en", self.source_file, self.target_file)
        _, _, pairs = loader.load()

        pairs = pairs[:self.num_sentences]
        # Translate sources
        sources, targets, translations = [p[0] for p in pairs], [p[1] for p in pairs], []

        extractor = DomainSpecificExtractor(source_file="data/khresmoi.tok.de",
                                            train_source_file=hp.source_file,
                                            train_vocab_file="train_vocab.pkl")
        keyphrases = extractor.extract_keyphrases(n_results=100)
        print(keyphrases)

        for i, pair in enumerate(pairs):
            if i % 10 == 0:
                print("Translated {} of {}".format(i, len(pairs)))
            translation, attn, _ = self.model.translate(pair[0], beam_size=1)
            translations.append(" ".join(translation[:-1]))
            scores = self.scorer.compute_scores(pair[0], " ".join(translation), attn, keyphrases)

            for metric in scores:
                if metric == "coverage_penalty" and scores[metric] > 80:
                    continue
                if metric == "keyphrase_score" and scores[metric] == 0:
                    continue

                if metric not in self.metric_to_gleu:
                    self.metric_to_gleu[metric] = {}
                if scores[metric] not in self.metric_to_gleu[metric]:
                    self.metric_to_gleu[metric][scores[metric]] = []
                # Despite the "gleu" naming, this stores CharacTER scores (compute_cter).
                gleu = compute_cter(pair[1], " ".join(translation[:-1]))
                self.all_gleu_scores.append(gleu)
                self.metric_to_gleu[metric][scores[metric]].append(gleu)
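
A minimal driver sketch, analogous to the one after Example #3; the model handle and output file names are placeholders, and the module-level metrics, name_map, sort_direction, hp, and compute_cter are assumed to be available.

exp = CorrelationExperiment(seq2seq_model,             # placeholder model handle
                            source_file="data/khresmoi.tok.bpe.de",
                            target_file="data/khresmoi.tok.bpe.en",
                            num_sentences=1000)
exp.run()
# plot_correlation must run first: it fills metric_to_bad and bad_count for plot_bad.
exp.plot_correlation("correlation.png")
exp.plot_bad("bad_coverage.png")
exp.plot_distr("metric_distributions.png")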