Example #1
    def _check_bleu_script(self):
        """ Checks the correctness of the multi-bleu script.

        Returns: True/False

        Raises:
            OSError: if the multi-bleu script does not exist, if the
              evaluation labels file does not exist, or if the BLEU
              score is not correct.
        """
        if not gfile.Exists(self._multibleu_script):
            raise OSError(
                "File not found. Fail to open multi-bleu scrip: {}".format(
                    self._multibleu_script))
        if gfile.Exists(self._eval_labels_file):
            pseudo_predictions = self._eval_labels_file
        else:
            pseudo_predictions = self._eval_labels_file + "0"
            if not gfile.Exists(pseudo_predictions):
                raise OSError(
                    "File not found. Fail to open eval_labels_file: {} or {}".
                    format(self._eval_labels_file, pseudo_predictions))
        score = multi_bleu_score(self._multibleu_script,
                                 self._eval_labels_file, pseudo_predictions)
        if int(score) < 100:
            raise OSError(
                "Fail to run multi-bleu scrip: {}. "
                "The evaluation output is {} which should be 100".format(
                    self._multibleu_script, score))
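
For reference, the `multi_bleu_score` used in this variant takes the path to a Moses-style `multi-bleu.perl` script, a reference file (or reference-file prefix), and a translation file, and returns the BLEU score as a float. Below is a minimal sketch of how such a wrapper is commonly written, assuming it simply shells out to the script and parses the `BLEU = ...` line; the project's actual implementation may differ.

import re
import subprocess


def multi_bleu_score_sketch(multibleu_script, labels_file, translations_file):
    """Hypothetical wrapper around Moses multi-bleu.perl (assumption, not the project's code)."""
    # multi-bleu.perl reads the hypothesis from stdin and takes the
    # reference file (or reference-file prefix) as its argument.
    with open(translations_file, "rb") as trans:
        output = subprocess.check_output(
            ["perl", multibleu_script, labels_file], stdin=trans)
    # Typical output: "BLEU = 34.12, 65.2/41.3/28.7/20.6 (BP=0.981, ...)"
    match = re.search(r"BLEU = ([\d.]+)", output.decode("utf-8"))
    return float(match.group(1)) if match else 0.0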
Example #2
    def run(self):
        """ Runs ensemble model. """
        self._vocab_source = Vocab(
            filename=self._model_configs["infer"]["source_words_vocabulary"],
            bpe_codes_file=self._model_configs["infer"]["source_bpecodes"])
        self._vocab_target = Vocab(
            filename=self._model_configs["infer"]["target_words_vocabulary"],
            bpe_codes_file=self._model_configs["infer"]["target_bpecodes"])
        # build dataset
        dataset = Dataset(self._vocab_source,
                          self._vocab_target,
                          eval_features_file=[
                              p["features_file"]
                              for p in self._model_configs["infer_data"]
                          ])
        estimator_spec = model_fn_ensemble(
            self._model_dirs,
            dataset,
            weight_scheme=self._weight_scheme,
            inference_options=self._model_configs["infer"])
        predict_op = estimator_spec.predictions
        sess = self._build_default_session()
        text_inputter = TextLineInputter(
            dataset=dataset,
            data_field_name="eval_features_file",
            batch_size=self._model_configs["infer"]["batch_size"],
            maximum_line_length=None)
        sess.run(tf.global_variables_initializer())
        tf.logging.info("Start inference.")
        overall_start_time = time.time()

        for feeding_data, param in zip(text_inputter.make_feeding_data(),
                                       self._model_configs["infer_data"]):
            tf.logging.info("Infer Source Features File: {}.".format(
                param["features_file"]))
            start_time = time.time()
            infer(sess=sess,
                  prediction_op=predict_op,
                  feeding_data=feeding_data,
                  output=param["output_file"],
                  vocab_target=self._vocab_target,
                  alpha=self._model_configs["infer"]["length_penalty"],
                  delimiter=self._model_configs["infer"]["delimiter"],
                  output_attention=False,
                  tokenize_output=self._model_configs["infer"]["char_level"],
                  tokenize_script=self._model_configs["infer"]
                  ["tokenize_script"],
                  verbose=True)
            tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
                param["features_file"], str(time.time() - start_time)))
            if param["labels_file"] is not None:
                bleu_score = multi_bleu_score(
                    self._model_configs["infer"]["multibleu_script"],
                    param["labels_file"], param["output_file"])
                tf.logging.info("BLEU score ({}): {}".format(
                    param["features_file"], bleu_score))
        tf.logging.info("Total Elapsed Time: %s" %
                        str(time.time() - overall_start_time))
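
The lookups into `self._model_configs` above imply a configuration shaped roughly like the dictionary below. This is only an illustration reconstructed from the keys referenced in the code; the file paths and values are placeholders, and the real schema comes from the project's configuration files.

# Hypothetical configuration; only the key names are taken from the code above.
model_configs = {
    "infer": {
        "source_words_vocabulary": "data/vocab.src",
        "source_bpecodes": "data/bpe.src",
        "target_words_vocabulary": "data/vocab.trg",
        "target_bpecodes": "data/bpe.trg",
        "batch_size": 32,
        "length_penalty": 0.6,
        "delimiter": " ",
        "char_level": False,
        "tokenize_script": None,
        "multibleu_script": "tools/multi-bleu.perl",
    },
    "infer_data": [
        {
            "features_file": "data/test.src",
            "labels_file": "data/test.trg",  # may be None when no references exist
            "output_file": "output/test.trans",
        },
    ],
}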
Example #3
    def _do_evaluation(self, run_context, global_step):
        """ Infers the evaluation data and computes the BLEU score.

        Args:
            run_context: A `SessionRunContext` object.
            global_step: A Python integer, the current training step.
        """
        start_time = time.time()
        output_prediction_file = self._tmp_trans_file_prefix + str(global_step)
        sources, hypothesis = infer(sess=run_context.session,
                                    prediction_op=self._predict_ops,
                                    infer_data=self._infer_data,
                                    output=output_prediction_file,
                                    vocab_target=self._dataset.vocab_target,
                                    vocab_source=self._dataset.vocab_source,
                                    delimiter=self._delimiter,
                                    output_attention=False,
                                    tokenize_output=self._char_level,
                                    verbose=False)
        # print translation samples
        random_start = random.randint(0, len(hypothesis) - 5)
        for idx in range(5):
            tf.logging.info("Sample%d Source: %s" %
                            (idx, self._sources[idx + random_start].strip()))
            tf.logging.info("Sample%d Encoded Input: %s" %
                            (idx, sources[idx + random_start]))
            tf.logging.info(
                "Sample%d Reference: %s" %
                (idx, self._references[idx + random_start][0].strip()))
            tf.logging.info("Sample%d Hypothesis: %s\n" %
                            (idx, hypothesis[idx + random_start].strip()))
        # evaluate with BLEU
        bleu = multi_bleu_score(hypothesis, self._references)
        if self._summary_writer is not None:
            self._summary_writer.add_summary("Metrics/BLEU", bleu, global_step)
        _, elapsed_time_all = self._timer.update_last_triggered_step(
            global_step)
        self._update_bleu_ckpt(run_context, bleu, global_step)
        tf.logging.info(
            "Evaluating DEVSET: BLEU=%.2f (Best %.2f)  GlobalStep=%d  BadCount=%d  "
            "UD %.2f  UDfromStart %.2f" %
            (bleu, self._best_bleu_score, global_step, self._bad_count,
             time.time() - start_time, elapsed_time_all))
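
Note that this hook variant calls `multi_bleu_score(hypothesis, references)` directly on decoded strings rather than on files, so it presumably computes BLEU in Python. A rough sketch of such a corpus-level BLEU, given as an assumption for illustration (the project's helper may instead write temporary files and delegate to the Perl script):

import math
from collections import Counter


def corpus_bleu_sketch(hypothesis, references, max_order=4):
    """Hypothetical in-Python corpus BLEU over whitespace-tokenized strings."""

    def ngrams(tokens, n):
        return Counter(tuple(tokens[i:i + n])
                       for i in range(len(tokens) - n + 1))

    matches = [0] * max_order
    totals = [0] * max_order
    hyp_len, ref_len = 0, 0
    for hyp, refs in zip(hypothesis, references):
        hyp_tokens = hyp.split()
        ref_token_lists = [r.split() for r in refs]
        hyp_len += len(hyp_tokens)
        # closest reference length, ties broken toward the shorter reference
        ref_len += min((abs(len(r) - len(hyp_tokens)), len(r))
                       for r in ref_token_lists)[1]
        for n in range(1, max_order + 1):
            max_ref_ngrams = Counter()
            for r in ref_token_lists:
                max_ref_ngrams |= ngrams(r, n)  # element-wise max over references
            overlap = ngrams(hyp_tokens, n) & max_ref_ngrams  # clipped counts
            matches[n - 1] += sum(overlap.values())
            totals[n - 1] += max(len(hyp_tokens) - n + 1, 0)
    if min(matches) == 0:
        return 0.0
    log_precision = sum(math.log(float(m) / t)
                        for m, t in zip(matches, totals)) / max_order
    bp = 1.0 if hyp_len > ref_len else math.exp(1.0 - float(ref_len) / hyp_len)
    return 100.0 * bp * math.exp(log_precision)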
Example #4
    def _do_evaluation(self, run_context, global_step):
        """ Infers the evaluation data and computes the BLEU score.

        Args:
            run_context: A `SessionRunContext` object.
            global_step: A Python integer, the current training step.
        """
        start_time = time.time()
        output_prediction_file = self._tmp_trans_file_prefix + str(global_step)
        samples_src, samples_trg = infer(
            sess=run_context.session,
            prediction_op=self._predict_ops,
            feeding_data=self._eval_feeding_data,
            output=output_prediction_file,
            vocab_target=self._dataset.vocab_target,
            alpha=self._model_configs["model_params"]
            ["inference.length_penalty"],
            delimiter=self._delimiter,
            output_attention=False,
            tokenize_output=self._char_level,
            tokenize_script=self._tokenize_script,
            verbose=False)
        # print translation samples
        for idx, (s, p) in enumerate(zip(samples_src, samples_trg)):
            tf.logging.info("Sample%d Source: %s" % (idx, s))
            tf.logging.info("Sample%d Prediction: %s\n" % (idx, p))
        # evaluate with BLEU
        bleu = multi_bleu_score(self._multibleu_script, self._eval_labels_file,
                                output_prediction_file)
        if self._summary_writer is not None:
            self._summary_writer.add_summary("Metrics/BLEU", bleu, global_step)
        _, elapsed_time_all = self._timer.update_last_triggered_step(
            global_step)
        tf.logging.info(
            "Evaluating DEVSET: BLEU=%.2f (Best %.2f)  GlobalStep=%d    UD %.2f   UDfromStart %.2f"
            % (bleu, self._best_bleu_score, global_step,
               time.time() - start_time, elapsed_time_all))
        self._update_bleu_ckpt(run_context, bleu, global_step)
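
The `self._summary_writer.add_summary("Metrics/BLEU", bleu, global_step)` call used in these hooks takes a tag, a Python float, and the step, which suggests a thin wrapper that builds a scalar `tf.Summary` protobuf by hand rather than running a summary op. A possible sketch of such a wrapper, offered as an assumption:

import tensorflow as tf


class ScalarSummaryWriter(object):
    """Hypothetical helper matching the add_summary(tag, value, step) call above."""

    def __init__(self, logdir):
        self._writer = tf.summary.FileWriter(logdir)

    def add_summary(self, tag, value, global_step):
        # Build a scalar Summary protobuf directly from a Python float,
        # so no summary op has to exist in the graph.
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self._writer.add_summary(summary, global_step=global_step)
        self._writer.flush()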
Example #5
    def _do_evaluation(self, run_context, global_step):
        """ Infers the evaluation data and computes the BLEU score.

        Args:
            run_context: A `SessionRunContext` object.
            global_step: A Python integer, the current training step.
        """
        start_time = time.time()
        output_prediction_file = self._tmp_trans_file_prefix + str(global_step)
        sources, hypothesis = infer(
            sess=run_context.session,
            prediction_op=self._predict_ops,
            infer_data=self._infer_data,
            output=output_prediction_file,
            vocab_target=self._dataset.vocab_target,
            vocab_source=self._dataset.vocab_source,
            delimiter=self._delimiter,
            output_attention=False,
            tokenize_output=self._char_level,
            verbose=False)
        # print translation samples
        random_start = random.randint(0, len(hypothesis) - 5)
        for idx in range(5):
            tf.logging.info("Sample%d Source: %s" % (idx, self._sources[idx + random_start].strip()))
            tf.logging.info("Sample%d Encoded Input: %s" % (idx, sources[idx + random_start]))
            tf.logging.info("Sample%d Reference: %s" % (idx, self._references[idx + random_start][0].strip()))
            tf.logging.info("Sample%d Hypothesis: %s\n" % (idx, hypothesis[idx + random_start].strip()))
        # evaluate with BLEU
        bleu = multi_bleu_score(hypothesis, self._references)
        if self._summary_writer is not None:
            self._summary_writer.add_summary("Metrics/BLEU", bleu, global_step)
        _, elapsed_time_all = self._timer.update_last_triggered_step(global_step)
        self._update_bleu_ckpt(run_context, bleu, global_step)
        tf.logging.info(
            "Evaluating DEVSET: BLEU=%.2f (Best %.2f)  GlobalStep=%d  BadCount=%d  "
            "UD %.2f  UDfromStart %.2f" % (
                bleu, self._best_bleu_score, global_step, self._bad_count,
                time.time() - start_time, elapsed_time_all))
Example #6
    def run(self):
        """Infers data files. """
        # build datasets
        self._vocab_source = Vocab(
            filename=self._model_configs["infer"]["source_words_vocabulary"],
            bpe_codes_file=self._model_configs["infer"]["source_bpecodes"],
            reverse_seq=False)
        self._vocab_target = Vocab(
            filename=self._model_configs["infer"]["target_words_vocabulary"],
            bpe_codes_file=self._model_configs["infer"]["target_bpecodes"],
            reverse_seq=self._model_configs["train"]["reverse_target"])
        # build dataset
        dataset = Dataset(self._vocab_source,
                          self._vocab_target,
                          eval_features_file=[
                              p["features_file"]
                              for p in self._model_configs["infer_data"]
                          ])

        self._model_configs = update_infer_params(
            self._model_configs,
            beam_size=self._model_configs["infer"]["beam_size"],
            maximum_labels_length=self._model_configs["infer"]
            ["maximum_labels_length"],
            length_penalty=self._model_configs["infer"]["length_penalty"])
        # build model
        estimator_spec = model_fn(model_configs=self._model_configs,
                                  mode=tf.contrib.learn.ModeKeys.INFER,
                                  dataset=dataset)
        predict_op = estimator_spec.predictions

        sess = self._build_default_session()

        text_inputter = TextLineInputter(
            dataset=dataset,
            data_field_name="eval_features_file",
            batch_size=self._model_configs["infer"]["batch_size"])
        # reload
        checkpoint_path = tf.train.latest_checkpoint(
            self._model_configs["model_dir"])
        if checkpoint_path:
            tf.logging.info("reloading models...")
            optimistic_restore(sess, checkpoint_path)
        else:
            raise OSError(
                "File NOT Found. Fail to find checkpoint file from: {}".format(
                    self._model_configs["model_dir"]))

        tf.logging.info("Start inference.")
        overall_start_time = time.time()

        for feeding_data, param in zip(text_inputter.make_feeding_data(),
                                       self._model_configs["infer_data"]):
            tf.logging.info("Infer Source Features File: {}.".format(
                param["features_file"]))
            start_time = time.time()
            infer(sess=sess,
                  prediction_op=predict_op,
                  feeding_data=feeding_data,
                  output=param["output_file"],
                  vocab_target=self._vocab_target,
                  alpha=self._model_configs["infer"]["length_penalty"],
                  delimiter=self._model_configs["infer"]["delimiter"],
                  output_attention=param["output_attention"],
                  tokenize_output=self._model_configs["infer"]["char_level"],
                  tokenize_script=self._model_configs["infer"]
                  ["tokenize_script"],
                  verbose=True)
            tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
                param["features_file"], str(time.time() - start_time)))
            if param["labels_file"] is not None:
                bleu_score = multi_bleu_score(
                    self._model_configs["infer"]["multibleu_script"],
                    param["labels_file"], param["output_file"])
                tf.logging.info("BLEU score ({}): {}".format(
                    param["features_file"], bleu_score))
        tf.logging.info("Total Elapsed Time: %s" %
                        str(time.time() - overall_start_time))
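
The `optimistic_restore(sess, checkpoint_path)` call above points at the common TF1 pattern of restoring only those variables that exist both in the current graph and in the checkpoint, matched by name and shape, so a modified graph can still reload whatever overlaps. A sketch of that pattern, given as an assumption about what the project's helper does:

import tensorflow as tf


def optimistic_restore_sketch(session, checkpoint_path):
    """Hypothetical: restore only variables present in both graph and checkpoint."""
    reader = tf.train.NewCheckpointReader(checkpoint_path)
    saved_shapes = reader.get_variable_to_shape_map()
    restorable = [
        var for var in tf.global_variables()
        if var.name.split(":")[0] in saved_shapes
        and var.get_shape().as_list() == saved_shapes[var.name.split(":")[0]]
    ]
    tf.train.Saver(var_list=restorable).restore(session, checkpoint_path)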