Пример #1
0
    def run(self):
        """ Runs inference with the ensemble model.

        Builds the source/target vocabularies and the ensemble graph,
        decodes every entry of `self._model_configs["infer_data"]` and,
        for entries whose "labels_file" is not None, logs the BLEU score
        of the produced output file.
        """
        infer_options = self._model_configs["infer"]
        src_vocab = Vocab(
            filename=infer_options["source_words_vocabulary"],
            bpe_codes=infer_options["source_bpecodes"])
        trg_vocab = Vocab(
            filename=infer_options["target_words_vocabulary"],
            bpe_codes=infer_options["target_bpecodes"])

        # build the ensemble graph and the session that will run it
        ensemble_spec = model_fn_ensemble(
            self._model_dirs,
            src_vocab,
            trg_vocab,
            weight_scheme=self._weight_scheme,
            inference_options=infer_options)
        predictions = ensemble_spec.predictions
        session = self._build_default_session()

        # one line reader per file to be translated
        readers = []
        for dataset in self._model_configs["infer_data"]:
            readers.append(LineReader(
                data=dataset["features_file"],
                preprocessing_fn=lambda line: src_vocab.convert_to_idlist(line)))
        inputter = TextLineInputter(
            line_readers=readers,
            padding_id=src_vocab.pad_id,
            batch_size=infer_options["batch_size"])

        session.run(tf.global_variables_initializer())
        tf.logging.info("Start inference.")
        begin_all = time.time()

        feeds = inputter.make_feeding_data(ensemble_spec.input_fields)
        for feed, dataset in zip(feeds, self._model_configs["infer_data"]):
            source_file = dataset["features_file"]
            tf.logging.info(
                "Infer Source Features File: {}.".format(source_file))
            begin = time.time()
            infer(
                sess=session,
                prediction_op=predictions,
                infer_data=feed,
                output=dataset["output_file"],
                vocab_source=src_vocab,
                vocab_target=trg_vocab,
                delimiter=infer_options["delimiter"],
                output_attention=False,
                to_char_level=infer_options["char_level"],
                verbose=True)
            elapsed = str(time.time() - begin)
            tf.logging.info(
                "FINISHED {}. Elapsed Time: {}.".format(source_file, elapsed))

            # BLEU is only computed when references are configured
            references = dataset["labels_file"]
            if references is None:
                continue
            bleu = multi_bleu_score_from_file(
                hypothesis_file=dataset["output_file"],
                references_files=references,
                char_level=infer_options["char_level"])
            tf.logging.info("BLEU score (%s): %.2f" % (source_file, bleu))

        total_elapsed = str(time.time() - begin_all)
        tf.logging.info("Total Elapsed Time: %s" % total_elapsed)
    def run(self):
        """ Translates the configured data files with a single model.

        Builds the vocabularies and the inference graph, restores the
        latest checkpoint found in `self._model_configs["model_dir"]`,
        decodes every entry of `self._model_configs["infer_data"]` and,
        for entries whose "labels_file" is not None, logs the BLEU score.

        Raises:
            OSError: if no checkpoint is found in the model directory.
        """
        # vocabularies (sequence direction follows the "train" settings)
        infer_cfg = self._model_configs["infer"]
        direction_cfg = self._model_configs["train"]
        src_vocab = Vocab(
            filename=infer_cfg["source_words_vocabulary"],
            bpe_codes=infer_cfg["source_bpecodes"],
            reverse_seq=direction_cfg["features_r2l"])
        trg_vocab = Vocab(
            filename=infer_cfg["target_words_vocabulary"],
            bpe_codes=infer_cfg["target_bpecodes"],
            reverse_seq=direction_cfg["labels_r2l"])

        self._model_configs = update_infer_params(
            self._model_configs,
            beam_size=infer_cfg["beam_size"],
            maximum_labels_length=infer_cfg["maximum_labels_length"],
            length_penalty=infer_cfg["length_penalty"])

        # inference graph
        spec = model_fn(
            model_configs=self._model_configs,
            mode=ModeKeys.INFER,
            vocab_source=src_vocab,
            vocab_target=trg_vocab,
            name=self._model_configs["problem_name"])
        predictions = spec.predictions

        session = self._build_default_session()

        # one line reader per file to be translated
        readers = []
        for dataset in self._model_configs["infer_data"]:
            readers.append(LineReader(
                data=dataset["features_file"],
                preprocessing_fn=lambda line: src_vocab.convert_to_idlist(line)))
        # `self._model_configs` was rebound above, so look "infer" up again
        infer_cfg = self._model_configs["infer"]
        inputter = TextLineInputter(
            line_readers=readers,
            padding_id=src_vocab.pad_id,
            batch_size=infer_cfg["batch_size"])

        # restore parameters from the most recent checkpoint
        latest = tf.train.latest_checkpoint(self._model_configs["model_dir"])
        if not latest:
            raise OSError(
                "File NOT Found. Fail to find checkpoint file from: {}".format(
                    self._model_configs["model_dir"]))
        tf.logging.info("reloading models...")
        tf.train.Saver().restore(session, latest)

        tf.logging.info("Start inference.")
        begin_all = time.time()

        feeds = inputter.make_feeding_data(input_fields=spec.input_fields)
        for feed, dataset in zip(feeds, self._model_configs["infer_data"]):
            source_file = dataset["features_file"]
            tf.logging.info("Infer Source File: {}.".format(source_file))
            begin = time.time()
            infer(
                sess=session,
                prediction_op=predictions,
                infer_data=feed,
                output=dataset["output_file"],
                vocab_source=src_vocab,
                vocab_target=trg_vocab,
                delimiter=infer_cfg["delimiter"],
                output_attention=dataset["output_attention"],
                tokenize_output=infer_cfg["char_level"],
                verbose=True)
            elapsed = str(time.time() - begin)
            tf.logging.info(
                "FINISHED {}. Elapsed Time: {}.".format(source_file, elapsed))

            # BLEU is only computed when references are configured
            references = dataset["labels_file"]
            if references is None:
                continue
            bleu = multi_bleu_score_from_file(
                hypothesis_file=dataset["output_file"],
                references_files=references,
                char_level=infer_cfg["char_level"])
            tf.logging.info("BLEU score (%s): %.2f" % (source_file, bleu))

        total_elapsed = str(time.time() - begin_all)
        tf.logging.info("Total Elapsed Time: %s" % total_elapsed)
    def run(self):
        """ Trains the model.

        Builds the vocabularies and the training graph, opens a
        `tf.train.MonitoredSession` carrying the training hooks plus the
        evaluation hooks, and keeps calling `step_fn` until the session
        reports that it should stop.

        On the normal path one call of `step_fn` consumes
        `self._model_configs["train"]["update_cycle"]` batches: all but the
        last are fed to `train_ops["collect_op"]`, the last one to
        `train_ops["train_op"]`.
        """
        # vocabularies: files come from the "data" section, the sequence
        # direction flags from the "train" section
        vocab_source = Vocab(
            filename=self._model_configs["data"]["source_words_vocabulary"],
            bpe_codes=self._model_configs["data"]["source_bpecodes"],
            reverse_seq=self._model_configs["train"]["features_r2l"])
        vocab_target = Vocab(
            filename=self._model_configs["data"]["target_words_vocabulary"],
            bpe_codes=self._model_configs["data"]["target_bpecodes"],
            reverse_seq=self._model_configs["train"]["labels_r2l"])
        # description of the evaluation set, handed to build_eval_metrics below
        eval_dataset = {
            "vocab_source": vocab_source,
            "vocab_target": vocab_target,
            "features_file": self._model_configs["data"]["eval_features_file"],
            "labels_file": self._model_configs["data"]["eval_labels_file"]
        }

        # session config: grow GPU memory on demand and let TensorFlow fall
        # back to another device when the requested one is unavailable
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        # build the training graph
        estimator_spec = model_fn(model_configs=self._model_configs,
                                  mode=ModeKeys.TRAIN,
                                  vocab_source=vocab_source,
                                  vocab_target=vocab_target,
                                  name=self._model_configs["problem_name"])
        train_ops = estimator_spec.train_ops
        hooks = estimator_spec.training_hooks

        # build training session; its hooks are the model's training hooks
        # followed by whatever build_eval_metrics returns.
        # NOTE(review): checkpoint_dir is None here, so checkpoint saving /
        # restoring is presumably done by one of the hooks - confirm.
        sess = tf.train.MonitoredSession(
            session_creator=tf.train.ChiefSessionCreator(
                scaffold=tf.train.Scaffold(),
                checkpoint_dir=None,
                master="",
                config=config),
            hooks=tuple(hooks) + tuple(
                build_eval_metrics(self._model_configs,
                                   eval_dataset,
                                   model_name=estimator_spec.name)))

        # training data: a source-side and a target-side line reader, each
        # with its own maximum length and vocabulary, followed by the two
        # padding ids and the batching options
        train_text_inputter = ParallelTextInputter(
            LineReader(
                data=self._model_configs["data"]["train_features_file"],
                maximum_length=self._model_configs["train"]
                ["maximum_features_length"],
                preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)),
            LineReader(
                data=self._model_configs["data"]["train_labels_file"],
                maximum_length=self._model_configs["train"]
                ["maximum_labels_length"],
                preprocessing_fn=lambda x: vocab_target.convert_to_idlist(x)),
            vocab_source.pad_id,
            vocab_target.pad_id,
            batch_size=self._model_configs["train"]["batch_size"],
            batch_tokens_size=self._model_configs["train"]
            ["batch_tokens_size"],
            shuffle_every_epoch=self._model_configs["train"]
            ["shuffle_every_epoch"],
            fill_full_batch=True,
            bucketing=True)
        train_data = train_text_inputter.make_feeding_data(
            input_fields=estimator_spec.input_fields)

        # State shared with step_fn; kept in lists so the nested function
        # can update it in place.
        # eidx[0]: last epoch index announced in the log,
        # eidx[1]: current epoch index (incremented on StopIteration).
        eidx = [0, 0]
        # update_cycle[0]: configured number of batches per training step,
        # update_cycle[1]: running counter, starts at 1.
        update_cycle = [self._model_configs["train"]["update_cycle"], 1]

        def step_fn(step_context):
            """ Runs one training step made of several batches.

            Runs `zeros_op`, then feeds batches to `collect_op` until the
            counter reaches the configured update cycle, and finally runs
            `train_op` (with hooks) on one more batch.
            NOTE(review): presumably `zeros_op` clears and `collect_op`
            accumulates gradients - confirm against model_fn.

            Returns:
                The result of `run_with_hooks`, or None when the data
                iterator raised StopIteration during this step.
            """
            step_context.session.run(train_ops["zeros_op"])
            try:
                while update_cycle[0] != update_cycle[1]:
                    data = train_data.next()
                    step_context.session.run(train_ops["collect_op"],
                                             feed_dict=data["feed_dict"])
                    update_cycle[1] += 1
                data = train_data.next()
                # reset the counter for the next step
                update_cycle[1] = 1
                return step_context.run_with_hooks(train_ops["train_op"],
                                                   feed_dict=data["feed_dict"])
            except StopIteration:
                # the iterator signalled the end of the data: count a new
                # epoch and skip the parameter update for this step.
                # NOTE(review): update_cycle[1] is not reset on this path, so
                # the next step collects fewer batches - confirm intended.
                eidx[1] += 1

        while not sess.should_stop():
            # log once whenever the epoch index has advanced
            if eidx[0] != eidx[1]:
                tf.logging.info("STARTUP Epoch {}".format(eidx[1]))
                eidx[0] = eidx[1]
            sess.run_step_fn(step_fn)
Пример #4
0
    def run(self):
        """ Evaluates the model on the configured data files.

        Builds the vocabularies and the evaluation graph, restores the
        latest checkpoint found in `self._model_configs["model_dir"]` and,
        for every entry of `self._model_configs["eval_data"]`, runs
        `evaluate_with_attention` and logs the resulting score.

        Raises:
            OSError: if no checkpoint is found in the model directory.
        """
        # vocabularies (sequence direction follows the "train" settings)
        eval_cfg = self._model_configs["eval"]
        direction_cfg = self._model_configs["train"]
        src_vocab = Vocab(
            filename=eval_cfg["source_words_vocabulary"],
            bpe_codes=eval_cfg["source_bpecodes"],
            reverse_seq=direction_cfg["features_r2l"])
        trg_vocab = Vocab(
            filename=eval_cfg["target_words_vocabulary"],
            bpe_codes=eval_cfg["target_bpecodes"],
            reverse_seq=direction_cfg["labels_r2l"])

        # switch the model configs to the requested evaluation metric
        self._model_configs, metric_name = update_eval_metric(
            self._model_configs, eval_cfg["metric"])
        tf.logging.info("Evaluating using {}".format(metric_name))

        # evaluation graph
        spec = model_fn(
            model_configs=self._model_configs,
            mode=ModeKeys.EVAL,
            vocab_source=src_vocab,
            vocab_target=trg_vocab,
            name=self._model_configs["problem_name"])

        session = self._build_default_session()

        # restore parameters from the most recent checkpoint
        latest = tf.train.latest_checkpoint(self._model_configs["model_dir"])
        if not latest:
            raise OSError(
                "File NOT Found. Fail to load checkpoint file from: {}".format(
                    self._model_configs["model_dir"]))
        tf.logging.info("reloading models...")
        tf.train.Saver().restore(session, latest)

        tf.logging.info("Start evaluation.")
        begin_all = time.time()

        for dataset in self._model_configs["eval_data"]:
            tf.logging.info(
                "Evaluation Source File: {}.".format(dataset["features_file"]))
            tf.logging.info(
                "Evaluation Target File: {}.".format(dataset["labels_file"]))

            # parallel source/target data, held in memory, without bucketing
            parallel_inputter = ParallelTextInputter(
                LineReader(
                    data=dataset["features_file"],
                    preprocessing_fn=lambda line: src_vocab.convert_to_idlist(line)),
                LineReader(
                    data=dataset["labels_file"],
                    preprocessing_fn=lambda line: trg_vocab.convert_to_idlist(line)),
                src_vocab.pad_id,
                trg_vocab.pad_id,
                batch_size=self._model_configs["eval"]["batch_size"],
                bucketing=False)
            eval_feed = parallel_inputter.make_feeding_data(
                input_fields=spec.input_fields, in_memory=True)

            begin = time.time()
            # attention is only requested when the entry asks for it
            if dataset["output_attention"]:
                attention_op = spec.predictions
            else:
                attention_op = None
            # output files are prefixed with the labels file's base name
            prefix = dataset["labels_file"].strip().rsplit("/", 1)[-1]
            score = evaluate_with_attention(
                sess=session,
                loss_op=spec.loss,
                eval_data=eval_feed,
                vocab_source=src_vocab,
                vocab_target=trg_vocab,
                attention_op=attention_op,
                output_filename_prefix=prefix)
            elapsed = str(time.time() - begin)
            tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
                dataset["features_file"], elapsed))
            tf.logging.info("Evaluation Score ({} on {}): {}".format(
                metric_name, dataset["features_file"], score))

        total_elapsed = str(time.time() - begin_all)
        tf.logging.info("Total Elapsed Time: %s" % total_elapsed)