Example #1
  def _build_train_spec(self):
    train_hooks = [
        hooks.LogParametersCountHook(),
        hooks.CountersHook(
            every_n_steps=self._estimator.config.save_summary_steps,
            output_dir=self._estimator.model_dir)]

    train_spec = tf.estimator.TrainSpec(
        input_fn=self._model.input_fn(
            tf.estimator.ModeKeys.TRAIN,
            self._config["train"]["batch_size"],
            self._config["data"],
            self._config["data"]["train_features_file"],
            labels_file=self._config["data"]["train_labels_file"],
            batch_type=self._config["train"].get("batch_type", "examples"),
            batch_multiplier=self._num_devices,
            bucket_width=self._config["train"].get("bucket_width", 5),
            single_pass=self._config["train"].get("single_pass", False),
            num_threads=self._config["train"].get("num_threads"),
            sample_buffer_size=self._config["train"].get("sample_buffer_size", 500000),
            prefetch_buffer_size=self._config["train"].get("prefetch_buffer_size"),
            maximum_features_length=self._config["train"].get("maximum_features_length"),
            maximum_labels_length=self._config["train"].get("maximum_labels_length")),
        max_steps=self._config["train"].get("train_steps"),
        hooks=train_hooks)
    return train_spec
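
For reference, here is a minimal sketch of the configuration dictionary this builder reads, using only the keys accessed above; the file paths and batch size are hypothetical, and the commented values show the defaults applied by .get():

config = {
    "data": {
        "train_features_file": "data/train.src",  # hypothetical path
        "train_labels_file": "data/train.tgt",    # hypothetical path
    },
    "train": {
        "batch_size": 64,                # required
        "batch_type": "examples",        # default when omitted
        "bucket_width": 5,               # default when omitted
        "sample_buffer_size": 500000,    # default when omitted
        "single_pass": False,            # default when omitted
        # Optional, default to None: num_threads, prefetch_buffer_size,
        # maximum_features_length, maximum_labels_length, train_steps.
    },
}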
Example #2
  def _build_train_spec(self, checkpoint_path):
    train_hooks = [
        hooks.LogParametersCountHook()]

    if checkpoint_path is not None:
      train_hooks.append(hooks.LoadWeightsFromCheckpointHook(checkpoint_path))
    if self._hvd is not None:
      train_hooks.append(self._hvd.BroadcastGlobalVariablesHook(0))

    train_steps = self._config["train"].get("train_steps")
    if train_steps is not None and self._hvd is not None:
      train_steps //= self._hvd.size()
    train_spec = tf.estimator.TrainSpec(
        input_fn=estimator_util.make_input_fn(
            self._model,
            tf.estimator.ModeKeys.TRAIN,
            self._config["train"]["batch_size"],
            features_file=self._config["data"]["train_features_file"],
            labels_file=self._config["data"].get("train_labels_file"),
            batch_type=self._config["train"]["batch_type"],
            batch_multiplier=self._num_devices,
            bucket_width=self._config["train"]["bucket_width"],
            maximum_features_length=self._config["train"].get("maximum_features_length"),
            maximum_labels_length=self._config["train"].get("maximum_labels_length"),
            shuffle_buffer_size=self._config["train"]["sample_buffer_size"],
            single_pass=self._config["train"].get("single_pass", False),
            num_shards=self._hvd.size() if self._hvd is not None else 1,
            shard_index=self._hvd.rank() if self._hvd is not None else 0,
            num_threads=self._config["train"].get("num_threads"),
            prefetch_buffer_size=self._config["train"].get("prefetch_buffer_size"),
            return_dataset=False),
        max_steps=train_steps,
        hooks=train_hooks)
    return train_spec
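
Example #2 scales the step budget and shards the dataset when Horovod is active. Below is a minimal sketch of that arithmetic, assuming a Horovod-like object exposing size() and rank(); the stub class and the numbers are hypothetical:

class FakeHvd:
    """Stand-in for horovod.tensorflow with 4 workers, this process being rank 1."""
    def size(self):
        return 4
    def rank(self):
        return 1

hvd = FakeHvd()
train_steps = 100000
train_steps //= hvd.size()   # 25000 steps per worker, 100000 in total
num_shards = hvd.size()      # the dataset is split into 4 shards
shard_index = hvd.rank()     # this worker reads shard 1
print(train_steps, num_shards, shard_index)  # 25000 4 1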
Example #3
  def _build_train_spec(self, checkpoint_path):
    train_hooks = [
        hooks.LogParametersCountHook()]

    if checkpoint_path is not None:
      train_hooks.append(hooks.LoadWeightsFromCheckpointHook(checkpoint_path))

    train_spec = tf.estimator.TrainSpec(
        input_fn=self._model.input_fn(
            tf.estimator.ModeKeys.TRAIN,
            self._config["train"]["batch_size"],
            self._config["data"],
            self._config["data"]["train_features_file"],
            labels_file=self._config["data"]["train_labels_file"],
            batch_type=self._config["train"]["batch_type"],
            batch_multiplier=self._num_devices,
            bucket_width=self._config["train"]["bucket_width"],
            single_pass=self._config["train"].get("single_pass", False),
            num_threads=self._config["train"].get("num_threads"),
            sample_buffer_size=self._config["train"]["sample_buffer_size"],
            prefetch_buffer_size=self._config["train"].get("prefetch_buffer_size"),
            maximum_features_length=self._config["train"].get("maximum_features_length"),
            maximum_labels_length=self._config["train"].get("maximum_labels_length")),
        max_steps=self._config["train"].get("train_steps"),
        hooks=train_hooks)
    return train_spec
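
Unlike Example #1, this variant indexes batch_type, bucket_width and sample_buffer_size directly, so they must be present in the configuration or a KeyError is raised at build time. A hypothetical pre-flight check (the helper name is ours, not part of OpenNMT-tf):

def check_train_config(config):
    required = ("batch_size", "batch_type", "bucket_width", "sample_buffer_size")
    missing = [key for key in required if key not in config.get("train", {})]
    if missing:
        raise ValueError("Missing train options: %s" % ", ".join(missing))

check_train_config({"train": {"batch_size": 64, "batch_type": "tokens",
                              "bucket_width": 1, "sample_buffer_size": 500000}})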
Example #4
    def _build_train_spec(self, checkpoint_path):
        train_hooks = [hooks.LogParametersCountHook()]
        #if checkpoint_path is not None:
        #  train_hooks.append(hooks.LoadWeightsFromCheckpointHook(checkpoint_path))

        # NEW: restores parameters selectively, based on the "load_weights" section of the configuration (see config*.yml).
        if checkpoint_path is not None and "load_weights" in self._config:
            not_restore = []
            loadw = self._config["load_weights"]
            if loadw is not None:
                if not loadw.get("src_embs"):
                    not_restore.append("encoder/w_embs")
                if not loadw.get("tgt_embs"):
                    not_restore.append("decoder/w_embs")
                if not loadw.get("projection"):
                    not_restore.append("decoder/dense")
                if not loadw.get("shared_embs"):
                    not_restore.append("shared_embeddings/w_embs")
                if not loadw.get("encoder"):
                    not_restore.append("encoder")
                if not loadw.get("decoder"):
                    not_restore.append("decoder")
                if not loadw.get("optim"):
                    # Skipping "optim" also allows skipping global_step and
                    # words_per_sec below.
                    not_restore.append("optim")
                    if not loadw.get("global_step"):
                        not_restore.append("global_step")
                    if not loadw.get("words_per_sec"):
                        not_restore.append("words_per_sec")

            tf.logging.info("NOT RESTORING: %s",
                            json.dumps(not_restore, indent=2, sort_keys=True))
            train_hooks.append(
                hooks.LoadWeightsFromCheckpointHook(checkpoint_path,
                                                    not_restore))

        train_spec = tf.estimator.TrainSpec(
            input_fn=self._model.input_fn(
                tf.estimator.ModeKeys.TRAIN,
                self._config["train"]["batch_size"],
                self._config["data"],
                self._config["data"]["train_features_file"],
                labels_file=self._config["data"]["train_labels_file"],
                batch_type=self._config["train"]["batch_type"],
                batch_multiplier=self._num_devices,
                bucket_width=self._config["train"]["bucket_width"],
                single_pass=self._config["train"].get("single_pass", False),
                num_threads=self._config["train"].get("num_threads"),
                sample_buffer_size=self._config["train"]["sample_buffer_size"],
                prefetch_buffer_size=self._config["train"].get(
                    "prefetch_buffer_size"),
                maximum_features_length=self._config["train"].get(
                    "maximum_features_length"),
                maximum_labels_length=self._config["train"].get(
                    "maximum_labels_length")),
            max_steps=self._config["train"].get("train_steps"),
            hooks=train_hooks)
        return train_spec
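
The block above maps boolean flags in the load_weights section to variable scope prefixes that the checkpoint hook should skip. A standalone sketch of that mapping follows; the flag names are taken from the code, while the concrete True/False values are hypothetical:

load_weights = {
    "src_embs": True,      # keep encoder/w_embs
    "tgt_embs": True,      # keep decoder/w_embs
    "projection": False,   # skip decoder/dense
    "shared_embs": False,  # skip shared_embeddings/w_embs
    "encoder": True,
    "decoder": True,
    "optim": False,        # skip optimizer state
    "global_step": False,  # only consulted when optim is skipped
    "words_per_sec": False,
}

flag_to_scope = {
    "src_embs": "encoder/w_embs",
    "tgt_embs": "decoder/w_embs",
    "projection": "decoder/dense",
    "shared_embs": "shared_embeddings/w_embs",
    "encoder": "encoder",
    "decoder": "decoder",
}
not_restore = [scope for flag, scope in flag_to_scope.items()
               if not load_weights.get(flag)]
if not load_weights.get("optim"):
    not_restore.append("optim")
    if not load_weights.get("global_step"):
        not_restore.append("global_step")
    if not load_weights.get("words_per_sec"):
        not_restore.append("words_per_sec")
print(not_restore)
# ['decoder/dense', 'shared_embeddings/w_embs', 'optim', 'global_step', 'words_per_sec']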
Example #5
    def _build_train_spec(self):
        train_hooks = [
            hooks.LogParametersCountHook(),
            hooks.CountersHook(
                every_n_steps=self._estimator.config.save_summary_steps,
                output_dir=self._estimator.model_dir)
        ]

        default_sample_buffer_size = 1000000
        if "sample_buffer_size" not in self._config["train"]:
            tf.logging.warn(
                "You did not set sample_buffer_size. By default, the "
                "training dataset is shuffled by chunk of %d examples. "
                "If your dataset is larger than this value and eval_delay "
                "is shorter than the training time of one epoch, a section "
                "of the dataset will be discarded. Consider setting "
                "sample_buffer_size to the size of your dataset." %
                default_sample_buffer_size)

        train_spec = tf.estimator.TrainSpec(
            input_fn=self._model.input_fn(
                tf.estimator.ModeKeys.TRAIN,
                self._config["train"]["batch_size"],
                self._config["data"],
                self._config["data"]["train_features_file"],
                labels_file=self._config["data"]["train_labels_file"],
                batch_type=self._config["train"].get("batch_type", "examples"),
                batch_multiplier=self._num_devices,
                bucket_width=self._config["train"].get("bucket_width", 5),
                single_pass=self._config["train"].get("single_pass", False),
                num_threads=self._config["train"].get("num_threads"),
                sample_buffer_size=self._config["train"].get(
                    "sample_buffer_size", default_sample_buffer_size),
                maximum_features_length=self._config["train"].get(
                    "maximum_features_length"),
                maximum_labels_length=self._config["train"].get(
                    "maximum_labels_length")),
            max_steps=self._config["train"].get("train_steps"),
            hooks=train_hooks)
        return train_spec
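
The warning above suggests setting sample_buffer_size to the size of the training set so that shuffling covers the whole epoch. One way to do that before building the run configuration, with a hypothetical training file path:

def count_lines(path):
    with open(path, "rb") as f:
        return sum(1 for _ in f)

config = {"train": {}}  # stand-in for the runner configuration
config["train"]["sample_buffer_size"] = count_lines("data/train.src")  # hypothetical path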
Example #6
def train(estimator, model, config):
    """Runs training.

  Args:
    estimator: A `tf.estimator.Estimator`.
    model: A `opennmt.models.Model`.
    config: The configuration.
  """
    batch_size = config["train"]["batch_size"]
    prefetch_buffer_size = config["train"].get("prefetch_buffer_size",
                                               batch_size * 1000)
    num_parallel_process_calls = config["train"].get(
        "num_parallel_process_calls", multiprocessing.cpu_count())

    train_hooks = [
        hooks.LogParametersCountHook(),
        hooks.CountersHook(every_n_steps=estimator.config.save_summary_steps,
                           output_dir=estimator.model_dir)
    ]

    eval_hooks = []
    if config["train"].get("save_eval_predictions", False):
        save_path = os.path.join(estimator.model_dir, "eval")
        if not os.path.isdir(save_path):
            os.makedirs(save_path)
        eval_hooks.append(
            hooks.SaveEvaluationPredictionHook(
                model,
                os.path.join(save_path, "predictions.txt"),
                post_evaluation_fn=external_evaluation_fn(
                    config["train"].get("external_evaluators"),
                    config["data"]["eval_labels_file"],
                    output_dir=estimator.model_dir)))
    elif config["train"].get("external_evaluators") is not None:
        tf.logging.warning(
            "External evaluators only work when save_eval_predictions is enabled."
        )

    train_spec = tf.estimator.TrainSpec(
        input_fn=model.input_fn(
            tf.estimator.ModeKeys.TRAIN,
            batch_size,
            prefetch_buffer_size,
            num_parallel_process_calls,
            config["data"],
            config["data"]["train_features_file"],
            labels_file=config["data"]["train_labels_file"],
            num_buckets=config["train"].get("num_buckets", 5),
            sample_buffer_size=config["train"].get("sample_buffer_size", 1000000),
            maximum_features_length=config["train"].get("maximum_features_length"),
            maximum_labels_length=config["train"].get("maximum_labels_length")),
        max_steps=config["train"].get("train_steps"),
        hooks=train_hooks)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=model.input_fn(
            tf.estimator.ModeKeys.EVAL,
            batch_size,
            prefetch_buffer_size,
            num_parallel_process_calls,
            config["data"],
            config["data"]["eval_features_file"],
            labels_file=config["data"]["eval_labels_file"]),
        steps=None,
        hooks=eval_hooks,
        exporters=tf.estimator.LatestExporter(
            "latest", model.serving_input_fn(config["data"])),
        throttle_secs=config["train"].get("eval_delay", 18000))

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
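
In this earlier variant the prefetch buffer defaults to 1000 batches' worth of examples, the number of parallel preprocessing calls defaults to the CPU count, and evaluation is throttled to at most once every eval_delay seconds (18000 by default). The derived values in isolation, with a hypothetical batch size:

import multiprocessing

batch_size = 64
prefetch_buffer_size = batch_size * 1000                  # 64000 examples prefetched
num_parallel_process_calls = multiprocessing.cpu_count()  # e.g. 8
eval_delay = 18000                                        # seconds, i.e. 5 hours between evals
print(prefetch_buffer_size, num_parallel_process_calls, eval_delay / 3600.0)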
Example #7
def train(estimator, model, config):
    """Runs training.

  Args:
    estimator: A `tf.estimator.Estimator`.
    model: A `opennmt.models.Model`.
    config: The configuration.
  """
    if "eval" not in config:
        config["eval"] = {}

    train_hooks = [
        hooks.LogParametersCountHook(),
        hooks.CountersHook(every_n_steps=estimator.config.save_summary_steps,
                           output_dir=estimator.model_dir)
    ]

    eval_hooks = []
    if (config["eval"].get("save_eval_predictions", False)
            or config["eval"].get("external_evaluators") is not None):
        save_path = os.path.join(estimator.model_dir, "eval")
        if not os.path.isdir(save_path):
            os.makedirs(save_path)
        eval_hooks.append(
            hooks.SaveEvaluationPredictionHook(
                model,
                os.path.join(save_path, "predictions.txt"),
                post_evaluation_fn=external_evaluation_fn(
                    config["eval"].get("external_evaluators"),
                    config["data"]["eval_labels_file"],
                    output_dir=estimator.model_dir)))

    default_sample_buffer_size = 1000000
    if "sample_buffer_size" not in config["train"]:
        tf.logging.warn(
            "You did not set sample_buffer_size. By default, the "
            "training dataset is shuffled by chunk of %d examples. "
            "If your dataset is larger than this value and eval_delay "
            "is shorter than the training time of one epoch, a section "
            "of the dataset will be discarded. Consider setting "
            "sample_buffer_size to the size of your dataset." %
            default_sample_buffer_size)

    train_batch_size = config["train"]["batch_size"]
    train_batch_type = config["train"].get("batch_type", "examples")
    train_prefetch_buffer_size = config["train"].get(
        "prefetch_buffer_size",
        train_batch_size * (1000 if train_batch_type == "examples" else 50))
    train_num_parallel_process_calls = config["train"].get(
        "num_parallel_process_calls", multiprocessing.cpu_count())
    train_spec = tf.estimator.TrainSpec(
        input_fn=model.input_fn(
            tf.estimator.ModeKeys.TRAIN,
            train_batch_size,
            train_prefetch_buffer_size,
            train_num_parallel_process_calls,
            config["data"],
            config["data"]["train_features_file"],
            labels_file=config["data"]["train_labels_file"],
            batch_type=train_batch_type,
            bucket_width=config["train"].get("bucket_width", 5),
            sample_buffer_size=config["train"].get("sample_buffer_size",
                                                   default_sample_buffer_size),
            maximum_features_length=config["train"].get("maximum_features_length"),
            maximum_labels_length=config["train"].get("maximum_labels_length")),
        max_steps=config["train"].get("train_steps"),
        hooks=train_hooks)

    eval_batch_size = config["eval"].get(
        "batch_size",
        train_batch_size if train_batch_type == "examples" else 30)
    eval_prefetch_buffer_size = config["eval"].get("prefetch_buffer_size",
                                                   eval_batch_size * 10)
    eval_num_parallel_process_calls = config["eval"].get(
        "num_parallel_process_calls", train_num_parallel_process_calls)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=model.input_fn(
            tf.estimator.ModeKeys.EVAL,
            eval_batch_size,
            eval_prefetch_buffer_size,
            eval_num_parallel_process_calls,
            config["data"],
            config["data"]["eval_features_file"],
            labels_file=config["data"]["eval_labels_file"]),
        steps=None,
        hooks=eval_hooks,
        exporters=tf.estimator.LatestExporter(
            "latest", model.serving_input_fn(config["data"])),
        throttle_secs=config["eval"].get("eval_delay", 18000))

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
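
This variant reads evaluation options from a separate "eval" section and derives different defaults depending on batch_type: with "examples" the prefetch buffer covers 1000 batches, with "tokens" only 50 times the token budget, and the eval batch size falls back to 30 examples. A small sketch of that branching, with hypothetical sizes:

def derived_defaults(train_batch_size, train_batch_type):
    prefetch = train_batch_size * (1000 if train_batch_type == "examples" else 50)
    eval_batch = train_batch_size if train_batch_type == "examples" else 30
    return prefetch, eval_batch

print(derived_defaults(64, "examples"))  # (64000, 64)
print(derived_defaults(4096, "tokens"))  # (204800, 30)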
Example #8
    def _build_train_spec(self, checkpoint_path):
        train_hooks = [hooks.LogParametersCountHook()]

        #if checkpoint_path is not None:
        #  train_hooks.append(hooks.LoadWeightsFromCheckpointHook(checkpoint_path))

        # TODO: decide what to load or skip from the "load_weights" boolean flags in the config; move the option handling into a utility function.

        if checkpoint_path is not None and "load_weights" in self._config:
            #not_restore = ['encoder', 'decoder', 'shared_embeddings', 'optim', 'global_step', 'word_per_sec', 'output_layer']
            #not_restore = ['optim', 'global_step', 'word_per_sec'] #, 'encoder'
            # For some runs the word_per_sec variable does not exist in the
            # checkpoint and restoring it throws an error; matching variable
            # names by prefix (e.g. startswith("words_per_sec")) is one way to
            # handle this during checkpoint loading.
            not_restore = []  # make sure it's empty at first call
            # Boolean flags selecting which sub-networks to restore: optimizer
            # state, encoder/decoder, embeddings and the output projection.
            loadw = self._config["load_weights"]
            if loadw is not None:
                if not loadw.get("src_embs"):
                    not_restore.append("encoder/w_embs")
                if not loadw.get("tgt_embs"):
                    not_restore.append("decoder/w_embs")

                if not loadw.get("projection"):
                    not_restore.append("decoder/dense")

                if not loadw.get("shared_embs"):
                    not_restore.append("shared_embeddings/w_embs")

                if not loadw.get("encoder"):
                    not_restore.append("encoder")
                if not loadw.get("decoder"):
                    not_restore.append("decoder")

                if not loadw.get("optim"):
                    # Skipping "optim" also allows skipping the next two.
                    not_restore.append("optim")

                    if not loadw.get("global_step"):
                        not_restore.append("global_step")
                    if not loadw.get("words_per_sec"):
                        not_restore.append("words_per_sec")

            tf.logging.info("NOT RESTORING SUB-NETWORKS: %s",
                            json.dumps(not_restore, indent=2, sort_keys=True))

            train_hooks.append(
                hooks.LoadWeightsFromCheckpointHook(
                    checkpoint_path,
                    not_restore))  #self._config["load_partial_weights"]))

        train_spec = tf.estimator.TrainSpec(
            input_fn=self._model.input_fn(
                tf.estimator.ModeKeys.TRAIN,
                self._config["train"]["batch_size"],
                self._config["data"],
                self._config["data"]["train_features_file"],
                labels_file=self._config["data"]["train_labels_file"],
                batch_type=self._config["train"]["batch_type"],
                batch_multiplier=self._num_devices,
                bucket_width=self._config["train"]["bucket_width"],
                single_pass=self._config["train"].get("single_pass", False),
                num_threads=self._config["train"].get("num_threads"),
                sample_buffer_size=self._config["train"]["sample_buffer_size"],
                prefetch_buffer_size=self._config["train"].get(
                    "prefetch_buffer_size"),
                maximum_features_length=self._config["train"].get(
                    "maximum_features_length"),
                maximum_labels_length=self._config["train"].get(
                    "maximum_labels_length")),
            max_steps=self._config["train"].get("train_steps"),
            hooks=train_hooks)
        return train_spec
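
The not_restore list built above is a set of variable name prefixes. Below is a hedged sketch of how prefix filtering against a checkpoint could look; the actual matching logic lives inside hooks.LoadWeightsFromCheckpointHook and may differ, and the checkpoint path shown is hypothetical:

import tensorflow as tf

def variables_to_restore(checkpoint_path, not_restore):
    # List the (name, shape) pairs stored in the checkpoint and keep the names
    # that do not start with any excluded prefix.
    names = [name for name, _ in tf.train.list_variables(checkpoint_path)]
    return [name for name in names
            if not any(name.startswith(prefix) for prefix in not_restore)]

# restore = variables_to_restore("model_dir/model.ckpt-5000",
#                                ["optim", "global_step", "words_per_sec"])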