Exemplo n.º 1
0
    def run(self):
        """ Trains the model.

        Builds the source/target vocabularies and the `Dataset` from the
        "data" section of the model configs, creates the model in TRAIN
        mode, and then repeatedly iterates over the training feeding data,
        running the training op on every batch until the monitored session
        reports that it should stop.
        """
        # vocabulary
        data_configs = self._model_configs["data"]
        self._vocab_source = Vocab(
            filename=data_configs["source_words_vocabulary"],
            bpe_codes=data_configs["source_bpecodes"],
            reverse_seq=False)
        self._vocab_target = Vocab(
            filename=data_configs["target_words_vocabulary"],
            bpe_codes=data_configs["target_bpecodes"],
            reverse_seq=self._model_configs["train"]["reverse_target"])
        # build dataset
        dataset = Dataset(
            self._vocab_source,
            self._vocab_target,
            train_features_file=data_configs["train_features_file"],
            train_labels_file=data_configs["train_labels_file"],
            eval_features_file=data_configs["eval_features_file"],
            eval_labels_file=data_configs["eval_labels_file"])

        # session options: grow GPU memory on demand, allow soft placement
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        estimator_spec = model_fn(model_configs=self._model_configs,
                                  mode=ModeKeys.TRAIN,
                                  dataset=dataset,
                                  name=self._model_configs["problem_name"])
        train_op = estimator_spec.train_op
        hooks = estimator_spec.training_hooks
        # build training session
        # The session options above were previously built but never handed
        # to the session (session_creator=None); pass them explicitly so
        # they actually take effect.
        sess = tf.train.MonitoredSession(
            session_creator=tf.train.ChiefSessionCreator(config=config),
            hooks=hooks)

        train_configs = self._model_configs["train"]
        train_text_inputter = ParallelTextInputter(
            dataset,
            "train_features_file",
            "train_labels_file",
            train_configs["batch_size"],
            train_configs["batch_tokens_size"],
            train_configs["shuffle_every_epoch"])
        train_data = train_text_inputter.make_feeding_data(
            input_fields=estimator_spec.input_fields,
            maximum_features_length=train_configs["maximum_features_length"],
            maximum_labels_length=train_configs["maximum_labels_length"])

        # One pass over `train_data` per epoch; the stop condition is
        # re-checked before every batch so a stop request is honoured
        # mid-epoch.
        eidx = 0
        while not sess.should_stop():
            tf.logging.info("STARTUP Epoch {}".format(eidx))

            for data in train_data:
                if sess.should_stop():
                    break
                sess.run(train_op, feed_dict=data["feed_dict"])
            eidx += 1
Exemplo n.º 2
0
    def _prepare(self):
        """ Sets up everything needed to evaluate the loss.

        Creates two `LineReader`s over the evaluation feature/label files
        (each with a `preprocessing_fn` that calls the matching
        vocabulary's `convert_to_idlist`), builds the model with
        mode=EVAL and reuse=True, and caches the in-memory feeding data
        and the loss op on the instance.

        When the optimizer's decay_type is "loss_decay", additionally
        stores the learning-rate tensor, the auto-halving op and the
        patience bookkeeping attributes.
        """
        dataset = self._dataset
        features_file = dataset["features_file"]
        labels_file = dataset["labels_file"]
        src_vocab = dataset["vocab_source"]
        trg_vocab = dataset["vocab_target"]

        features_reader = LineReader(
            data=features_file,
            preprocessing_fn=lambda x: src_vocab.convert_to_idlist(x))
        labels_reader = LineReader(
            data=labels_file,
            preprocessing_fn=lambda x: trg_vocab.convert_to_idlist(x))
        inputter = ParallelTextInputter(
            features_reader,
            labels_reader,
            src_vocab.pad_id,
            trg_vocab.pad_id,
            batch_size=self._batch_size,
            batch_tokens_size=None,
            shuffle_every_epoch=None,
            bucketing=True)

        spec = model_fn(
            model_configs=self._model_configs,
            mode=ModeKeys.EVAL,
            vocab_source=src_vocab,
            vocab_target=trg_vocab,
            name=self._model_name,
            reuse=True,
            verbose=False)
        self._eval_feeding_data = inputter.make_feeding_data(
            input_fields=spec.input_fields, in_memory=True)
        self._loss_op = spec.loss

        # Learning-rate decay control: disabled unless decay_type is
        # "loss_decay".
        self._half_lr = False
        self._start_decay_at = 0
        lr_decay_configs = self._model_configs["optimizer_params"]["optimizer.lr_decay"]
        if lr_decay_configs["decay_type"] != "loss_decay":
            return

        self._half_lr = True
        lr_tensors = get_dict_from_collection(Constants.LEARNING_RATE_VAR_NAME)
        self._learning_rate = lr_tensors[Constants.LEARNING_RATE_VAR_NAME]
        self._max_patience = lr_decay_configs["patience"]
        self._start_decay_at = lr_decay_configs["start_decay_at"]
        assert self._start_decay_at >= self._start_at, (
            "start_decay_at in optimizer.lr_decay should be no less than start_at in LossMetricSpec.")
        self._half_lr_op = lr_tensors[Constants.LR_AUTO_HALF_OP_NAME]
        self._bad_count = 0
        self._min_loss = 10000.
Exemplo n.º 3
0
    def _prepare(self):
        """ Sets up everything needed to evaluate the loss.

        Creates a `ParallelTextInputter` over the dataset's
        "eval_features_file"/"eval_labels_file" fields, builds the model
        with mode=EVAL and reuse=True, and caches the in-memory feeding
        data and the loss op on the instance.

        When the optimizer's decay_type is "loss_decay", additionally
        stores the learning-rate tensor, an op that doubles the annealing
        division factor, and the patience bookkeeping attributes.
        """
        inputter = ParallelTextInputter(
            dataset=self._dataset,
            features_field_name="eval_features_file",
            labels_field_name="eval_labels_file",
            batch_size=self._batch_size,
            batch_tokens_size=None,
            shuffle_every_epoch=None,
            bucketing=True)
        spec = model_fn(
            model_configs=self._model_configs,
            mode=ModeKeys.EVAL,
            dataset=self._dataset,
            name=self._model_name,
            reuse=True,
            verbose=False)
        self._eval_feeding_data = inputter.make_feeding_data(
            input_fields=spec.input_fields, in_memory=True)
        self._loss_op = spec.loss

        # Learning-rate decay control: disabled unless decay_type is
        # "loss_decay".
        self._half_lr = False
        self._start_decay_at = 0
        lr_decay_configs = self._model_configs["optimizer_params"]["optimizer.lr_decay"]
        if lr_decay_configs["decay_type"] != "loss_decay":
            return

        self._half_lr = True
        lr_tensors = get_dict_from_collection(Constants.LEARNING_RATE_VAR_NAME)
        self._learning_rate = lr_tensors[Constants.LEARNING_RATE_VAR_NAME]
        self._max_patience = lr_decay_configs["patience"]
        self._start_decay_at = lr_decay_configs["start_decay_at"]
        assert self._start_decay_at >= self._start_at, (
            "start_decay_at in optimizer.lr_decay should be no less than start_at in LossMetricSpec."
        )
        # Running this op multiplies the division factor by two.
        anneal_div = lr_tensors[Constants.LR_ANNEAL_DIV_FACTOR_NAME]
        self._half_lr_op = anneal_div.assign(anneal_div * 2.)
        self._bad_count = 0
        self._min_loss = 10000.
Exemplo n.º 4
0
    def testParallelInputterEval(self):
        """ Compares evaluation batches from three iterators.

        Iterates `EvalTextIterator`, `TrainTextIterator` and the feeding
        data of `ParallelTextInputter` in lockstep over the evaluation
        files and asserts that the feature ids, feature lengths, label ids
        and label lengths of every batch are element-wise identical.
        """
        # Imported locally so the test does not depend on a module-level
        # numpy alias being present.
        import numpy as np

        vocab_src = Vocab(vocab_src_file)
        vocab_trg = Vocab(vocab_trg_file)
        dataset = Dataset(vocab_src, vocab_trg, train_src_file, train_trg_file,
                          eval_src_file, eval_trg_file)
        inputter = ParallelTextInputter(dataset,
                                        "eval_features_file",
                                        "eval_labels_file",
                                        batch_size=13,
                                        maximum_features_length=None,
                                        maximum_labels_length=None)

        eval_iter1 = EvalTextIterator(eval_src_file,
                                      eval_trg_file,
                                      vocab_src,
                                      vocab_trg,
                                      batch_size=13)

        eval_iter2 = TrainTextIterator(eval_src_file,
                                       eval_trg_file + "0",
                                       vocab_src,
                                       vocab_trg,
                                       batch_size=13,
                                       maxlen_src=1000,
                                       maxlen_trg=1000)
        input_fields = dataset.input_fields
        eval_data = inputter.make_feeding_data()
        for a, b, c in zip(eval_iter1, eval_iter2, eval_data):
            x1 = a[0][0]
            x_len1 = a[0][1]
            y1 = a[1][0]
            y_len1 = a[1][1]
            x2 = b[0][0]
            x_len2 = b[0][1]
            y2 = b[1][0]
            y_len2 = b[1][1]
            x_new = c[1][input_fields[Constants.FEATURE_IDS_NAME]]
            x_len_new = c[1][input_fields[Constants.FEATURE_LENGTH_NAME]]
            y_new = c[1][input_fields[Constants.LABEL_IDS_NAME]]
            y_len_new = c[1][input_fields[Constants.LABEL_LENGTH_NAME]]
            # `arr.all() == other.all()` only compares two booleans ("are
            # all entries non-zero?"), so it passes for almost any pair of
            # arrays. Compare the contents element-wise instead.
            assert np.array_equal(x1, x_new) and np.array_equal(x_new, x2)
            assert (np.array_equal(x_len1, x_len_new)
                    and np.array_equal(x_len_new, x_len2))
            assert np.array_equal(y1, y_new) and np.array_equal(y_new, y2)
            assert (np.array_equal(y_len1, y_len_new)
                    and np.array_equal(y_len_new, y_len2))

        print("Test Passed...")
Exemplo n.º 5
0
    def _prepare(self):
        """ Gets the evaluator ready to compute the loss.

        Wires a `ParallelTextInputter` to the dataset's
        "eval_features_file"/"eval_labels_file" fields, builds the model
        with mode=EVAL and reuse=True, and keeps the in-memory feeding
        data and the loss op on the instance.

        If the decay_type of the optimizer is "loss_decay", the
        learning-rate tensor, an op doubling the annealing division
        factor, and the patience bookkeeping attributes are stored too.
        """
        eval_inputter = ParallelTextInputter(
            dataset=self._dataset,
            features_field_name="eval_features_file",
            labels_field_name="eval_labels_file",
            batch_size=self._batch_size,
            batch_tokens_size=None,
            shuffle_every_epoch=None,
            bucketing=True)
        eval_spec = model_fn(model_configs=self._model_configs,
                             mode=ModeKeys.EVAL,
                             dataset=self._dataset,
                             name=self._model_name,
                             reuse=True,
                             verbose=False)
        self._eval_feeding_data = eval_inputter.make_feeding_data(
            input_fields=eval_spec.input_fields,
            in_memory=True)
        self._loss_op = eval_spec.loss

        # Defaults for the learning-rate decay control; overwritten below
        # only for the "loss_decay" decay type.
        self._half_lr = False
        self._start_decay_at = 0
        decay_configs = self._model_configs["optimizer_params"]["optimizer.lr_decay"]
        if decay_configs["decay_type"] == "loss_decay":
            self._half_lr = True
            lr_collection = get_dict_from_collection(
                Constants.LEARNING_RATE_VAR_NAME)
            self._learning_rate = lr_collection[Constants.LEARNING_RATE_VAR_NAME]
            self._max_patience = decay_configs["patience"]
            self._start_decay_at = decay_configs["start_decay_at"]
            assert self._start_decay_at >= self._start_at, (
                "start_decay_at in optimizer.lr_decay should be no less than start_at in LossMetricSpec.")
            # Running this op multiplies the division factor by two.
            factor = lr_collection[Constants.LR_ANNEAL_DIV_FACTOR_NAME]
            self._half_lr_op = factor.assign(factor * 2.)
            self._bad_count = 0
            self._min_loss = 10000.
Exemplo n.º 6
0
    def testParallelInputterTrain(self):
        """ Compares training batches from two iterators.

        Iterates `TrainTextIterator` and the feeding data of
        `ParallelTextInputter` in lockstep over the training files and
        asserts that the feature ids, feature lengths, label ids and label
        lengths of every batch are element-wise identical.
        """
        # Imported locally so the test does not depend on a module-level
        # numpy alias being present.
        import numpy as np

        vocab_src = Vocab(vocab_src_file)
        vocab_trg = Vocab(vocab_trg_file)
        dataset = Dataset(vocab_src, vocab_trg, train_src_file, train_trg_file,
                          eval_src_file, eval_trg_file)
        inputter = ParallelTextInputter(dataset,
                                        "train_features_file",
                                        "train_labels_file",
                                        batch_size=13,
                                        maximum_features_length=20,
                                        maximum_labels_length=20)

        # NOTE(review): `_cache_size` and `k` are both forced to 10,
        # presumably so both iterators buffer the same number of samples
        # and therefore batch identically; confirm against their sources.
        inputter._cache_size = 10
        train_iter = TrainTextIterator(train_src_file,
                                       train_trg_file,
                                       vocab_src,
                                       vocab_trg,
                                       batch_size=13,
                                       maxlen_src=20,
                                       maxlen_trg=20)
        train_iter.k = 10
        input_fields = dataset.input_fields
        train_data = inputter.make_feeding_data()
        for a, b in zip(train_iter, train_data):
            x = a[0][0]
            x_len = a[0][1]
            y = a[1][0]
            y_len = a[1][1]
            x_new = b[1][input_fields[Constants.FEATURE_IDS_NAME]]
            x_len_new = b[1][input_fields[Constants.FEATURE_LENGTH_NAME]]
            y_new = b[1][input_fields[Constants.LABEL_IDS_NAME]]
            y_len_new = b[1][input_fields[Constants.LABEL_LENGTH_NAME]]
            # `arr.all() == other.all()` only compares two booleans ("are
            # all entries non-zero?"), so it passes for almost any pair of
            # arrays. Compare the contents element-wise instead.
            assert np.array_equal(x, x_new)
            assert np.array_equal(x_len, x_len_new)
            assert np.array_equal(y, y_new)
            assert np.array_equal(y_len, y_len_new)
        print("Test Passed...")
Exemplo n.º 7
0
    def run(self):
        """Evaluates the model on every entry of the "eval_data" config.

        Builds the vocabularies and an evaluation-only `Dataset`, updates
        the model configs for the requested metric, builds the model in
        EVAL mode, restores the latest checkpoint from "model_dir" and then
        scores each features/labels file pair with
        `evaluate_with_attention`, logging the result and elapsed time.

        Raises:
            OSError: if no checkpoint can be found in "model_dir".
        """
        # build vocabularies
        eval_configs = self._model_configs["eval"]
        self._vocab_source = Vocab(
            filename=eval_configs["source_words_vocabulary"],
            bpe_codes=eval_configs["source_bpecodes"],
            reverse_seq=False)
        self._vocab_target = Vocab(
            filename=eval_configs["target_words_vocabulary"],
            bpe_codes=eval_configs["target_bpecodes"],
            reverse_seq=self._model_configs["train"]["reverse_target"])

        # build dataset
        features_files = [p["features_file"]
                          for p in self._model_configs["eval_data"]]
        labels_files = [p["labels_file"]
                        for p in self._model_configs["eval_data"]]
        dataset = Dataset(
            self._vocab_source,
            self._vocab_target,
            eval_features_file=features_files,
            eval_labels_file=labels_files)

        # update evaluation model config (this rebinds self._model_configs,
        # so configs are looked up afresh from here on)
        self._model_configs, metric_str = update_eval_metric(
            self._model_configs, self._model_configs["eval"]["metric"])
        tf.logging.info("Evaluating using {}".format(metric_str))

        # build model
        estimator_spec = model_fn(
            model_configs=self._model_configs,
            mode=ModeKeys.EVAL,
            dataset=dataset,
            name=self._model_configs["problem_name"])

        sess = self._build_default_session()
        # Bucketing is enabled only when no entry asks for attention output.
        attention_flags = [p["output_attention"]
                           for p in self._model_configs["eval_data"]]
        do_bucketing = (sum(attention_flags) == 0)
        text_inputter = ParallelTextInputter(
            dataset=dataset,
            features_field_name="eval_features_file",
            labels_field_name="eval_labels_file",
            batch_size=self._model_configs["eval"]["batch_size"],
            bucketing=do_bucketing)

        # reload
        checkpoint_path = tf.train.latest_checkpoint(self._model_configs["model_dir"])
        if not checkpoint_path:
            raise OSError("File NOT Found. Fail to load checkpoint file from: {}"
                          .format(self._model_configs["model_dir"]))
        tf.logging.info("reloading models...")
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)

        tf.logging.info("Start evaluation.")
        overall_start_time = time.time()

        feeding_data = text_inputter.make_feeding_data(
            input_fields=estimator_spec.input_fields, in_memory=True)
        for eval_data, param in zip(feeding_data, self._model_configs["eval_data"]):
            tf.logging.info("Evaluation Source File: {}.".format(param["features_file"]))
            tf.logging.info("Evaluation Target File: {}.".format(param["labels_file"]))
            start_time = time.time()
            # The attention op is only passed for entries that request it.
            if param["output_attention"]:
                attention_op = estimator_spec.predictions
            else:
                attention_op = None
            result = evaluate_with_attention(
                sess=sess,
                eval_op=estimator_spec.loss,
                eval_data=eval_data,
                vocab_source=self._vocab_source,
                vocab_target=self._vocab_target,
                attention_op=attention_op,
                output_filename_prefix=param["labels_file"].strip().split("/")[-1])
            tf.logging.info("FINISHED {}. Elapsed Time: {}."
                            .format(param["features_file"], str(time.time() - start_time)))
            tf.logging.info("Evaluation Score ({} on {}): {}"
                            .format(metric_str, param["features_file"], result))
        tf.logging.info("Total Elapsed Time: %s" % str(time.time() - overall_start_time))
    def run(self):
        """ Trains the model with gradient accumulation.

        Builds the vocabularies, the model in TRAIN mode and a monitored
        session whose hooks are the model's training hooks plus the
        evaluation-metric hooks. Each training step zeroes the
        accumulators ("zeros_op"), runs "collect_op" on batches until the
        configured "update_cycle" is reached, and then runs "train_op"
        (with hooks) on one more batch.
        """
        # vocabulary
        data_configs = self._model_configs["data"]
        vocab_source = Vocab(
            filename=data_configs["source_words_vocabulary"],
            bpe_codes=data_configs["source_bpecodes"],
            reverse_seq=self._model_configs["train"]["features_r2l"])
        vocab_target = Vocab(
            filename=data_configs["target_words_vocabulary"],
            bpe_codes=data_configs["target_bpecodes"],
            reverse_seq=self._model_configs["train"]["labels_r2l"])
        eval_dataset = {
            "vocab_source": vocab_source,
            "vocab_target": vocab_target,
            "features_file": data_configs["eval_features_file"],
            "labels_file": data_configs["eval_labels_file"]
        }

        # session options: grow GPU memory on demand, allow soft placement
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        estimator_spec = model_fn(
            model_configs=self._model_configs,
            mode=ModeKeys.TRAIN,
            vocab_source=vocab_source,
            vocab_target=vocab_target,
            name=self._model_configs["problem_name"])
        train_ops = estimator_spec.train_ops
        hooks = estimator_spec.training_hooks

        # build training session
        session_creator = tf.train.ChiefSessionCreator(
            scaffold=tf.train.Scaffold(),
            checkpoint_dir=None,
            master="",
            config=config)
        eval_hooks = build_eval_metrics(
            self._model_configs, eval_dataset, model_name=estimator_spec.name)
        sess = tf.train.MonitoredSession(
            session_creator=session_creator,
            hooks=tuple(hooks) + tuple(eval_hooks))

        # training data pipeline
        features_reader = LineReader(
            data=self._model_configs["data"]["train_features_file"],
            maximum_length=self._model_configs["train"]["maximum_features_length"],
            preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x))
        labels_reader = LineReader(
            data=self._model_configs["data"]["train_labels_file"],
            maximum_length=self._model_configs["train"]["maximum_labels_length"],
            preprocessing_fn=lambda x: vocab_target.convert_to_idlist(x))
        train_text_inputter = ParallelTextInputter(
            features_reader,
            labels_reader,
            vocab_source.pad_id,
            vocab_target.pad_id,
            batch_size=self._model_configs["train"]["batch_size"],
            batch_tokens_size=self._model_configs["train"]["batch_tokens_size"],
            shuffle_every_epoch=self._model_configs["train"]["shuffle_every_epoch"],
            fill_full_batch=True,
            bucketing=True)
        train_data = train_text_inputter.make_feeding_data(
            input_fields=estimator_spec.input_fields)

        # Mutable state shared with the `step_fn` closure (a dict, since
        # the closure must update it in place).
        cycle_length = self._model_configs["train"]["update_cycle"]
        progress = {"logged_epoch": 0, "epoch": 0, "micro_step": 1}

        def step_fn(step_context):
            # Clear the accumulators, collect cycle_length - 1 batches,
            # then apply the update on one final batch.
            step_context.session.run(train_ops["zeros_op"])
            try:
                while cycle_length != progress["micro_step"]:
                    batch = train_data.next()
                    step_context.session.run(
                        train_ops["collect_op"], feed_dict=batch["feed_dict"])
                    progress["micro_step"] += 1
                batch = train_data.next()
                progress["micro_step"] = 1
                return step_context.run_with_hooks(
                    train_ops["train_op"], feed_dict=batch["feed_dict"])
            except StopIteration:
                # NOTE(review): "micro_step" is not reset here, so the step
                # after an epoch boundary may collect fewer batches than
                # cycle_length -- confirm this is intended.
                progress["epoch"] += 1

        while not sess.should_stop():
            # Log once whenever the epoch counter has advanced.
            if progress["logged_epoch"] != progress["epoch"]:
                tf.logging.info("STARTUP Epoch {}".format(progress["epoch"]))
                progress["logged_epoch"] = progress["epoch"]
            sess.run_step_fn(step_fn)
Exemplo n.º 9
0
    def run(self):
        """Evaluates the model on every entry of the "eval_data" config.

        Builds the vocabularies and an evaluation-only `Dataset`, updates
        the model configs for the requested metric, builds the model in
        EVAL mode, restores the latest checkpoint from "model_dir" and then
        scores each features/labels file pair with
        `evaluate_with_attention`, logging the result and elapsed time.

        Raises:
            OSError: if no checkpoint can be found in "model_dir".
        """
        # build vocabularies
        eval_configs = self._model_configs["eval"]
        self._vocab_source = Vocab(
            filename=eval_configs["source_words_vocabulary"],
            bpe_codes=eval_configs["source_bpecodes"],
            reverse_seq=False)
        self._vocab_target = Vocab(
            filename=eval_configs["target_words_vocabulary"],
            bpe_codes=eval_configs["target_bpecodes"],
            reverse_seq=self._model_configs["train"]["reverse_target"])

        # build dataset
        features_files = [p["features_file"]
                          for p in self._model_configs["eval_data"]]
        labels_files = [p["labels_file"]
                        for p in self._model_configs["eval_data"]]
        dataset = Dataset(
            self._vocab_source,
            self._vocab_target,
            eval_features_file=features_files,
            eval_labels_file=labels_files)

        # update evaluation model config (this rebinds self._model_configs,
        # so configs are looked up afresh from here on)
        self._model_configs, metric_str = update_eval_metric(
            self._model_configs, self._model_configs["eval"]["metric"])
        tf.logging.info("Evaluating using {}".format(metric_str))

        # build model
        estimator_spec = model_fn(
            model_configs=self._model_configs,
            mode=ModeKeys.EVAL,
            dataset=dataset,
            name=self._model_configs["problem_name"])

        sess = self._build_default_session()
        batch_size = self._model_configs["eval"]["batch_size"]
        # Bucketing is enabled only when no entry asks for attention output.
        attention_flags = [p["output_attention"]
                           for p in self._model_configs["eval_data"]]
        text_inputter = ParallelTextInputter(
            dataset=dataset,
            features_field_name="eval_features_file",
            labels_field_name="eval_labels_file",
            batch_size=batch_size,
            bucketing=(sum(attention_flags) == 0))

        # reload
        checkpoint_path = tf.train.latest_checkpoint(self._model_configs["model_dir"])
        if not checkpoint_path:
            raise OSError("File NOT Found. Fail to load checkpoint file from: {}"
                          .format(self._model_configs["model_dir"]))
        tf.logging.info("reloading models...")
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)

        tf.logging.info("Start evaluation.")
        overall_start_time = time.time()

        feeding_data = text_inputter.make_feeding_data(
            input_fields=estimator_spec.input_fields, in_memory=True)
        for eval_data, param in zip(feeding_data, self._model_configs["eval_data"]):
            tf.logging.info("Evaluation Source File: {}.".format(param["features_file"]))
            tf.logging.info("Evaluation Target File: {}.".format(param["labels_file"]))
            start_time = time.time()
            # The attention op is only passed for entries that request it.
            if param["output_attention"]:
                attention_op = estimator_spec.predictions
            else:
                attention_op = None
            result = evaluate_with_attention(
                sess=sess,
                loss_op=estimator_spec.loss,
                eval_data=eval_data,
                vocab_source=self._vocab_source,
                vocab_target=self._vocab_target,
                attention_op=attention_op,
                output_filename_prefix=param["labels_file"].strip().split("/")[-1])
            tf.logging.info("FINISHED {}. Elapsed Time: {}."
                            .format(param["features_file"], str(time.time() - start_time)))
            tf.logging.info("Evaluation Score ({} on {}): {}"
                            .format(metric_str, param["features_file"], result))
        tf.logging.info("Total Elapsed Time: %s" % str(time.time() - overall_start_time))
Exemplo n.º 10
0
    def run(self):
        """ Trains the model with gradient accumulation.

        Builds the vocabularies and the `Dataset`, the model in TRAIN mode
        and a monitored session using the model's training hooks. Each
        training step zeroes the accumulators ("zeros_op"), runs
        "collect_op" on batches until the configured "update_cycle" is
        reached, and then runs "train_op" (with hooks) on one more batch.
        """
        # vocabulary
        data_configs = self._model_configs["data"]
        self._vocab_source = Vocab(
            filename=data_configs["source_words_vocabulary"],
            bpe_codes=data_configs["source_bpecodes"],
            reverse_seq=False)
        self._vocab_target = Vocab(
            filename=data_configs["target_words_vocabulary"],
            bpe_codes=data_configs["target_bpecodes"],
            reverse_seq=self._model_configs["train"]["reverse_target"])
        # build dataset
        dataset = Dataset(
            self._vocab_source,
            self._vocab_target,
            train_features_file=data_configs["train_features_file"],
            train_labels_file=data_configs["train_labels_file"],
            eval_features_file=data_configs["eval_features_file"],
            eval_labels_file=data_configs["eval_labels_file"])

        # session options: grow GPU memory on demand, allow soft placement
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        estimator_spec = model_fn(
            model_configs=self._model_configs,
            mode=ModeKeys.TRAIN,
            dataset=dataset,
            name=self._model_configs["problem_name"])
        train_ops = estimator_spec.train_ops
        hooks = estimator_spec.training_hooks

        # build training session
        session_creator = tf.train.ChiefSessionCreator(
            scaffold=tf.train.Scaffold(),
            checkpoint_dir=None,
            master="",
            config=config)
        sess = tf.train.MonitoredSession(
            session_creator=session_creator,
            hooks=hooks)

        # training data pipeline
        train_text_inputter = ParallelTextInputter(
            dataset,
            "train_features_file",
            "train_labels_file",
            self._model_configs["train"]["batch_size"],
            self._model_configs["train"]["batch_tokens_size"],
            self._model_configs["train"]["shuffle_every_epoch"],
            fill_full_batch=True)
        train_data = train_text_inputter.make_feeding_data(
            input_fields=estimator_spec.input_fields,
            maximum_features_length=self._model_configs["train"]["maximum_features_length"],
            maximum_labels_length=self._model_configs["train"]["maximum_labels_length"])

        # Mutable state shared with the `step_fn` closure (a dict, since
        # the closure must update it in place).
        cycle_length = self._model_configs["train"]["update_cycle"]
        progress = {"logged_epoch": 0, "epoch": 0, "micro_step": 1}

        def step_fn(step_context):
            # Clear the accumulators, collect cycle_length - 1 batches,
            # then apply the update on one final batch.
            step_context.session.run(train_ops["zeros_op"])
            try:
                while cycle_length != progress["micro_step"]:
                    batch = train_data.next()
                    step_context.session.run(train_ops["collect_op"],
                                             feed_dict=batch["feed_dict"])
                    progress["micro_step"] += 1
                batch = train_data.next()
                progress["micro_step"] = 1
                return step_context.run_with_hooks(train_ops["train_op"],
                                                   feed_dict=batch["feed_dict"])
            except StopIteration:
                # NOTE(review): "micro_step" is not reset here, so the step
                # after an epoch boundary may collect fewer batches than
                # cycle_length -- confirm this is intended.
                progress["epoch"] += 1

        while not sess.should_stop():
            # Log once whenever the epoch counter has advanced.
            if progress["logged_epoch"] != progress["epoch"]:
                tf.logging.info("STARTUP Epoch {}".format(progress["epoch"]))
                progress["logged_epoch"] = progress["epoch"]
            sess.run_step_fn(step_fn)