def _build_train_spec(self):
  train_hooks = [
      hooks.LogParametersCountHook(),
      hooks.CountersHook(
          every_n_steps=self._estimator.config.save_summary_steps,
          output_dir=self._estimator.model_dir)]
  train_spec = tf.estimator.TrainSpec(
      input_fn=self._model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          self._config["train"]["batch_size"],
          self._config["data"],
          self._config["data"]["train_features_file"],
          labels_file=self._config["data"]["train_labels_file"],
          batch_type=self._config["train"].get("batch_type", "examples"),
          batch_multiplier=self._num_devices,
          bucket_width=self._config["train"].get("bucket_width", 5),
          single_pass=self._config["train"].get("single_pass", False),
          num_threads=self._config["train"].get("num_threads"),
          sample_buffer_size=self._config["train"].get("sample_buffer_size", 500000),
          prefetch_buffer_size=self._config["train"].get("prefetch_buffer_size"),
          maximum_features_length=self._config["train"].get("maximum_features_length"),
          maximum_labels_length=self._config["train"].get("maximum_labels_length")),
      max_steps=self._config["train"].get("train_steps"),
      hooks=train_hooks)
  return train_spec
def _build_train_spec(self, checkpoint_path):
  train_hooks = [hooks.LogParametersCountHook()]
  if checkpoint_path is not None:
    train_hooks.append(hooks.LoadWeightsFromCheckpointHook(checkpoint_path))
  if self._hvd is not None:
    train_hooks.append(self._hvd.BroadcastGlobalVariablesHook(0))
  train_steps = self._config["train"].get("train_steps")
  if train_steps is not None and self._hvd is not None:
    train_steps //= self._hvd.size()
  train_spec = tf.estimator.TrainSpec(
      input_fn=estimator_util.make_input_fn(
          self._model,
          tf.estimator.ModeKeys.TRAIN,
          self._config["train"]["batch_size"],
          features_file=self._config["data"]["train_features_file"],
          labels_file=self._config["data"].get("train_labels_file"),
          batch_type=self._config["train"]["batch_type"],
          batch_multiplier=self._num_devices,
          bucket_width=self._config["train"]["bucket_width"],
          maximum_features_length=self._config["train"].get("maximum_features_length"),
          maximum_labels_length=self._config["train"].get("maximum_labels_length"),
          shuffle_buffer_size=self._config["train"]["sample_buffer_size"],
          single_pass=self._config["train"].get("single_pass", False),
          num_shards=self._hvd.size() if self._hvd is not None else 1,
          shard_index=self._hvd.rank() if self._hvd is not None else 0,
          num_threads=self._config["train"].get("num_threads"),
          prefetch_buffer_size=self._config["train"].get("prefetch_buffer_size"),
          return_dataset=False),
      max_steps=train_steps,
      hooks=train_hooks)
  return train_spec
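# A minimal sketch of how the Horovod handle (self._hvd) used above is
# typically initialized. The runner-level wiring here is an assumption; only
# the hvd.* calls are actual Horovod API.
import horovod.tensorflow as hvd

hvd.init()  # One process per GPU, launched e.g. with horovodrun or mpirun.
session_config = tf.ConfigProto()
# Pin each process to a single visible GPU.
session_config.gpu_options.visible_device_list = str(hvd.local_rank())
# Each worker then reads a disjoint shard of the training data, matching the
# num_shards/shard_index arguments passed to make_input_fn() above.
num_shards = hvd.size()
shard_index = hvd.rank()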
def _build_train_spec(self, checkpoint_path):
  train_hooks = [hooks.LogParametersCountHook()]
  if checkpoint_path is not None:
    train_hooks.append(hooks.LoadWeightsFromCheckpointHook(checkpoint_path))
  train_spec = tf.estimator.TrainSpec(
      input_fn=self._model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          self._config["train"]["batch_size"],
          self._config["data"],
          self._config["data"]["train_features_file"],
          labels_file=self._config["data"]["train_labels_file"],
          batch_type=self._config["train"]["batch_type"],
          batch_multiplier=self._num_devices,
          bucket_width=self._config["train"]["bucket_width"],
          single_pass=self._config["train"].get("single_pass", False),
          num_threads=self._config["train"].get("num_threads"),
          sample_buffer_size=self._config["train"]["sample_buffer_size"],
          prefetch_buffer_size=self._config["train"].get("prefetch_buffer_size"),
          maximum_features_length=self._config["train"].get("maximum_features_length"),
          maximum_labels_length=self._config["train"].get("maximum_labels_length")),
      max_steps=self._config["train"].get("train_steps"),
      hooks=train_hooks)
  return train_spec
def _build_train_spec(self, checkpoint_path):
  train_hooks = [hooks.LogParametersCountHook()]
  # NEW: selectively load parameters based on the "load_weights" section of
  # the configuration (see config*.yml). Replaces the previous unconditional
  # LoadWeightsFromCheckpointHook(checkpoint_path).
  if checkpoint_path is not None and "load_weights" in self._config:
    not_restore = []
    loadw = self._config["load_weights"]
    if loadw is not None:
      if not loadw.get("src_embs"):
        not_restore.append("encoder/w_embs")
      if not loadw.get("tgt_embs"):
        not_restore.append("decoder/w_embs")
      if not loadw.get("projection"):
        not_restore.append("decoder/dense")
      if not loadw.get("shared_embs"):
        not_restore.append("shared_embeddings/w_embs")
      if not loadw.get("encoder"):
        not_restore.append("encoder")
      if not loadw.get("decoder"):
        not_restore.append("decoder")
      # Skipping the optimizer variables also skips global_step and
      # words_per_sec.
      if not loadw.get("optim"):
        not_restore.append("optim")
      if not loadw.get("global_step"):
        not_restore.append("global_step")
      if not loadw.get("words_per_sec"):
        not_restore.append("words_per_sec")
    tf.logging.info("NOT RESTORING: %s",
                    json.dumps(not_restore, indent=2, sort_keys=True))
    train_hooks.append(
        hooks.LoadWeightsFromCheckpointHook(checkpoint_path, not_restore))
  train_spec = tf.estimator.TrainSpec(
      input_fn=self._model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          self._config["train"]["batch_size"],
          self._config["data"],
          self._config["data"]["train_features_file"],
          labels_file=self._config["data"]["train_labels_file"],
          batch_type=self._config["train"]["batch_type"],
          batch_multiplier=self._num_devices,
          bucket_width=self._config["train"]["bucket_width"],
          single_pass=self._config["train"].get("single_pass", False),
          num_threads=self._config["train"].get("num_threads"),
          sample_buffer_size=self._config["train"]["sample_buffer_size"],
          prefetch_buffer_size=self._config["train"].get("prefetch_buffer_size"),
          maximum_features_length=self._config["train"].get("maximum_features_length"),
          maximum_labels_length=self._config["train"].get("maximum_labels_length")),
      max_steps=self._config["train"].get("train_steps"),
      hooks=train_hooks)
  return train_spec
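# Illustrative only: a hypothetical parsed "load_weights" section and the
# scopes the logic above would then exclude from restoring.
example_config = {
    "load_weights": {
        "encoder": True,        # keep the pre-trained encoder
        "decoder": True,
        "src_embs": True,
        "tgt_embs": False,      # reinitialize target embeddings
        "projection": False,    # reinitialize the output projection
        "shared_embs": False,
        "optim": False,         # restart optimizer state and counters
        "global_step": False,
        "words_per_sec": False,
    }
}
# With these values, not_restore becomes:
# ["decoder/w_embs", "decoder/dense", "shared_embeddings/w_embs",
#  "optim", "global_step", "words_per_sec"]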
def _build_train_spec(self):
  train_hooks = [
      hooks.LogParametersCountHook(),
      hooks.CountersHook(
          every_n_steps=self._estimator.config.save_summary_steps,
          output_dir=self._estimator.model_dir)]
  default_sample_buffer_size = 1000000
  if "sample_buffer_size" not in self._config["train"]:
    tf.logging.warn(
        "You did not set sample_buffer_size. By default, the training "
        "dataset is shuffled in chunks of %d examples. If your dataset is "
        "larger than this value and eval_delay is shorter than the training "
        "time of one epoch, a section of the dataset will be discarded. "
        "Consider setting sample_buffer_size to the size of your dataset."
        % default_sample_buffer_size)
  train_spec = tf.estimator.TrainSpec(
      input_fn=self._model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          self._config["train"]["batch_size"],
          self._config["data"],
          self._config["data"]["train_features_file"],
          labels_file=self._config["data"]["train_labels_file"],
          batch_type=self._config["train"].get("batch_type", "examples"),
          batch_multiplier=self._num_devices,
          bucket_width=self._config["train"].get("bucket_width", 5),
          single_pass=self._config["train"].get("single_pass", False),
          num_threads=self._config["train"].get("num_threads"),
          sample_buffer_size=self._config["train"].get(
              "sample_buffer_size", default_sample_buffer_size),
          maximum_features_length=self._config["train"].get("maximum_features_length"),
          maximum_labels_length=self._config["train"].get("maximum_labels_length")),
      max_steps=self._config["train"].get("train_steps"),
      hooks=train_hooks)
  return train_spec
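# Why the warning above matters: tf.data shuffles with a bounded buffer, so a
# buffer smaller than the dataset only approximates a uniform shuffle.
# Standalone illustration, not part of the training code.
dataset = tf.data.Dataset.from_tensor_slices(list(range(10)))
dataset = dataset.shuffle(buffer_size=3)  # draws from a 3-element window only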
def train(estimator, model, config):
  """Runs training.

  Args:
    estimator: A `tf.estimator.Estimator`.
    model: An `opennmt.models.Model`.
    config: The configuration.
  """
  batch_size = config["train"]["batch_size"]
  prefetch_buffer_size = config["train"].get("prefetch_buffer_size", batch_size * 1000)
  num_parallel_process_calls = config["train"].get(
      "num_parallel_process_calls", multiprocessing.cpu_count())
  train_hooks = [
      hooks.LogParametersCountHook(),
      hooks.CountersHook(
          every_n_steps=estimator.config.save_summary_steps,
          output_dir=estimator.model_dir)]
  eval_hooks = []
  if config["train"].get("save_eval_predictions", False):
    save_path = os.path.join(estimator.model_dir, "eval")
    if not os.path.isdir(save_path):
      os.makedirs(save_path)
    eval_hooks.append(
        hooks.SaveEvaluationPredictionHook(
            model,
            os.path.join(save_path, "predictions.txt"),
            post_evaluation_fn=external_evaluation_fn(
                config["train"].get("external_evaluators"),
                config["data"]["eval_labels_file"],
                output_dir=estimator.model_dir)))
  elif config["train"].get("external_evaluators") is not None:
    tf.logging.warning(
        "External evaluators only work when save_eval_predictions is enabled.")
  train_spec = tf.estimator.TrainSpec(
      input_fn=model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          batch_size,
          prefetch_buffer_size,
          num_parallel_process_calls,
          config["data"],
          config["data"]["train_features_file"],
          labels_file=config["data"]["train_labels_file"],
          num_buckets=config["train"].get("num_buckets", 5),
          sample_buffer_size=config["train"].get("sample_buffer_size", 1000000),
          maximum_features_length=config["train"].get("maximum_features_length"),
          maximum_labels_length=config["train"].get("maximum_labels_length")),
      max_steps=config["train"].get("train_steps"),
      hooks=train_hooks)
  eval_spec = tf.estimator.EvalSpec(
      input_fn=model.input_fn(
          tf.estimator.ModeKeys.EVAL,
          batch_size,
          prefetch_buffer_size,
          num_parallel_process_calls,
          config["data"],
          config["data"]["eval_features_file"],
          labels_file=config["data"]["eval_labels_file"]),
      steps=None,
      hooks=eval_hooks,
      exporters=tf.estimator.LatestExporter(
          "latest", model.serving_input_fn(config["data"])),
      throttle_secs=config["train"].get("eval_delay", 18000))
  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def train(estimator, model, config):
  """Runs training.

  Args:
    estimator: A `tf.estimator.Estimator`.
    model: An `opennmt.models.Model`.
    config: The configuration.
  """
  if "eval" not in config:
    config["eval"] = {}
  train_hooks = [
      hooks.LogParametersCountHook(),
      hooks.CountersHook(
          every_n_steps=estimator.config.save_summary_steps,
          output_dir=estimator.model_dir)]
  eval_hooks = []
  if (config["eval"].get("save_eval_predictions", False)
      or config["eval"].get("external_evaluators") is not None):
    save_path = os.path.join(estimator.model_dir, "eval")
    if not os.path.isdir(save_path):
      os.makedirs(save_path)
    eval_hooks.append(
        hooks.SaveEvaluationPredictionHook(
            model,
            os.path.join(save_path, "predictions.txt"),
            post_evaluation_fn=external_evaluation_fn(
                config["eval"].get("external_evaluators"),
                config["data"]["eval_labels_file"],
                output_dir=estimator.model_dir)))
  default_sample_buffer_size = 1000000
  if "sample_buffer_size" not in config["train"]:
    tf.logging.warn(
        "You did not set sample_buffer_size. By default, the training "
        "dataset is shuffled in chunks of %d examples. If your dataset is "
        "larger than this value and eval_delay is shorter than the training "
        "time of one epoch, a section of the dataset will be discarded. "
        "Consider setting sample_buffer_size to the size of your dataset."
        % default_sample_buffer_size)
  train_batch_size = config["train"]["batch_size"]
  train_batch_type = config["train"].get("batch_type", "examples")
  train_prefetch_buffer_size = config["train"].get(
      "prefetch_buffer_size",
      train_batch_size * (1000 if train_batch_type == "examples" else 50))
  train_num_parallel_process_calls = config["train"].get(
      "num_parallel_process_calls", multiprocessing.cpu_count())
  train_spec = tf.estimator.TrainSpec(
      input_fn=model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          train_batch_size,
          train_prefetch_buffer_size,
          train_num_parallel_process_calls,
          config["data"],
          config["data"]["train_features_file"],
          labels_file=config["data"]["train_labels_file"],
          batch_type=train_batch_type,
          bucket_width=config["train"].get("bucket_width", 5),
          sample_buffer_size=config["train"].get(
              "sample_buffer_size", default_sample_buffer_size),
          maximum_features_length=config["train"].get("maximum_features_length"),
          maximum_labels_length=config["train"].get("maximum_labels_length")),
      max_steps=config["train"].get("train_steps"),
      hooks=train_hooks)
  eval_batch_size = config["eval"].get(
      "batch_size",
      train_batch_size if train_batch_type == "examples" else 30)
  eval_prefetch_buffer_size = config["eval"].get(
      "prefetch_buffer_size", eval_batch_size * 10)
  eval_num_parallel_process_calls = config["eval"].get(
      "num_parallel_process_calls", train_num_parallel_process_calls)
  eval_spec = tf.estimator.EvalSpec(
      input_fn=model.input_fn(
          tf.estimator.ModeKeys.EVAL,
          eval_batch_size,
          eval_prefetch_buffer_size,
          eval_num_parallel_process_calls,
          config["data"],
          config["data"]["eval_features_file"],
          labels_file=config["data"]["eval_labels_file"]),
      steps=None,
      hooks=eval_hooks,
      exporters=tf.estimator.LatestExporter(
          "latest", model.serving_input_fn(config["data"])),
      throttle_secs=config["eval"].get("eval_delay", 18000))
  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
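# Worked example of the buffer-size defaults above; the values are assumptions
# chosen for illustration (batch_type="tokens", batch_size=4096, nothing
# overridden in the configuration).
train_batch_size = 4096
train_batch_type = "tokens"
train_prefetch_buffer_size = train_batch_size * (
    1000 if train_batch_type == "examples" else 50)  # -> 204800
eval_batch_size = train_batch_size if train_batch_type == "examples" else 30  # -> 30
eval_prefetch_buffer_size = eval_batch_size * 10  # -> 300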
def _build_train_spec(self, checkpoint_path):
  train_hooks = [hooks.LogParametersCountHook()]
  # TODO: drive what to load (or not) from the "load_weights" booleans in the
  # configuration, and move the scope selection into a utility function.
  # Previously: unconditional LoadWeightsFromCheckpointHook(checkpoint_path).
  if checkpoint_path is not None and "load_weights" in self._config:
    # Candidate scopes: 'encoder', 'decoder', 'shared_embeddings', 'optim',
    # 'global_step', 'words_per_sec', 'output_layer'.
    # NOTE: on some runs the checkpoint contains no words_per_sec variable and
    # restoring it throws an error; matching names by prefix, i.e.
    # name.startswith("words_per_sec"), is the way to handle this.
    not_restore = []  # make sure the list is empty on each call
    # In the checkpoint variable list: [0] covers optim..., [1] encoder/decoder,
    # and [1:2] embeddings and projection.
    loadw = self._config["load_weights"]
    if loadw is not None:
      if not loadw.get("src_embs"):
        not_restore.append("encoder/w_embs")
      if not loadw.get("tgt_embs"):
        not_restore.append("decoder/w_embs")
      if not loadw.get("projection"):
        not_restore.append("decoder/dense")
      if not loadw.get("shared_embs"):
        not_restore.append("shared_embeddings/w_embs")
      if not loadw.get("encoder"):
        not_restore.append("encoder")
      if not loadw.get("decoder"):
        not_restore.append("decoder")
      # Skipping "optim" also skips the next two in practice.
      if not loadw.get("optim"):
        not_restore.append("optim")
      if not loadw.get("global_step"):
        not_restore.append("global_step")
      if not loadw.get("words_per_sec"):
        not_restore.append("words_per_sec")
    tf.logging.info("NOT RESTORING SUB-NETWORKS: %s",
                    json.dumps(not_restore, indent=2, sort_keys=True))
    train_hooks.append(
        hooks.LoadWeightsFromCheckpointHook(checkpoint_path, not_restore))
  train_spec = tf.estimator.TrainSpec(
      input_fn=self._model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          self._config["train"]["batch_size"],
          self._config["data"],
          self._config["data"]["train_features_file"],
          labels_file=self._config["data"]["train_labels_file"],
          batch_type=self._config["train"]["batch_type"],
          batch_multiplier=self._num_devices,
          bucket_width=self._config["train"]["bucket_width"],
          single_pass=self._config["train"].get("single_pass", False),
          num_threads=self._config["train"].get("num_threads"),
          sample_buffer_size=self._config["train"]["sample_buffer_size"],
          prefetch_buffer_size=self._config["train"].get("prefetch_buffer_size"),
          maximum_features_length=self._config["train"].get("maximum_features_length"),
          maximum_labels_length=self._config["train"].get("maximum_labels_length")),
      max_steps=self._config["train"].get("train_steps"),
      hooks=train_hooks)
  return train_spec
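# A sketch of the prefix matching mentioned in the note above: variables whose
# names start with an entry of not_restore are dropped from the restore map.
# The hook internals are an assumption; tf.train.list_variables is real API.
def _filter_restore_map(checkpoint_path, not_restore):
  restore_map = {}
  for name, _ in tf.train.list_variables(checkpoint_path):
    if not any(name.startswith(prefix) for prefix in not_restore):
      restore_map[name] = name  # restore under the same name
  return restore_map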