def _train_model(self, input_fn, steps, feed_fn=None, init_op=None, init_feed_fn=None, init_fn=None, device_fn=None, monitors=None, log_every_steps=100, fail_on_nan_loss=True): if self._config.execution_mode not in ('all', 'train'): return # Stagger startup of worker sessions based on task id. sleep_secs = min(self._config.training_worker_max_startup_secs, self._config.task * self._config.training_worker_session_startup_stagger_secs) if sleep_secs: logging.info('Waiting %d secs before starting task %d.', sleep_secs, self._config.task) time.sleep(sleep_secs) # Device allocation device_fn = device_fn or self._device_fn self._graph = ops.Graph() with self._graph.as_default() as g, g.device(device_fn): random_seed.set_random_seed(self._config.tf_random_seed) global_step = contrib_framework.create_global_step(g) features, targets = input_fn() self._check_inputs(features, targets) train_op, loss_op = self._get_train_ops(features, targets) # Add default monitors. if monitors is None: monitors = [] monitors += monitors_lib.get_default_monitors( loss_op=loss_op, summary_op=logging_ops.get_summary_op(), save_summary_steps=100, summary_writer=graph_actions.get_summary_writer(self._model_dir)) is_chief = self._config.task == 0 if not is_chief: # Run monitors only on chief. monitors = [] # Setup monitors. for monitor in monitors: monitor.set_estimator(self) return train( graph=g, output_dir=self._model_dir, train_op=train_op, loss_op=loss_op, global_step_tensor=global_step, init_op=init_op, init_feed_dict=init_feed_fn() if init_feed_fn is not None else None, init_fn=init_fn, log_every_steps=log_every_steps, supervisor_is_chief=is_chief, supervisor_master=self._config.master, feed_fn=feed_fn, max_steps=steps, fail_on_nan_loss=fail_on_nan_loss, monitors=monitors)
def _train_model(self, input_fn, steps, feed_fn=None, init_op=None, init_feed_fn=None, init_fn=None, device_fn=None, monitors=None, log_every_steps=100, fail_on_nan_loss=True, max_steps=None): # TODO(wicke): Remove this once Model and associated code are gone. if hasattr(self._config, 'execution_mode'): if self._config.execution_mode not in ('all', 'train'): return # Stagger startup of worker sessions based on task id. sleep_secs = min( self._config.training_worker_max_startup_secs, self._config.task * self._config.training_worker_session_startup_stagger_secs) if sleep_secs: logging.info('Waiting %d secs before starting task %d.', sleep_secs, self._config.task) time.sleep(sleep_secs) # Device allocation device_fn = device_fn or self._device_fn self._graph = ops.Graph() with self._graph.as_default() as g, g.device(device_fn): random_seed.set_random_seed(self._config.tf_random_seed) global_step = contrib_framework.create_global_step(g) features, targets = input_fn() self._check_inputs(features, targets) train_op, loss_op = self._get_train_ops(features, targets) # Add default monitors. if monitors is None: monitors = [] is_chief = self._config.task == 0 if is_chief: monitors += monitors_lib.get_default_monitors( loss_op=loss_op, summary_op=logging_ops.get_summary_op(), save_summary_steps=self._config.save_summary_steps, summary_writer=graph_actions.get_summary_writer( self._model_dir)) else: monitors = [] # Setup monitors. for monitor in monitors: monitor.set_estimator(self) return graph_actions.train( graph=g, output_dir=self._model_dir, train_op=train_op, loss_op=loss_op, global_step_tensor=global_step, init_op=init_op, init_feed_dict=init_feed_fn() if init_feed_fn is not None else None, init_fn=init_fn, log_every_steps=log_every_steps, supervisor_is_chief=is_chief, supervisor_master=self._config.master, supervisor_save_model_secs=self._config.save_checkpoints_secs, keep_checkpoint_max=self._config.keep_checkpoint_max, feed_fn=feed_fn, steps=steps, fail_on_nan_loss=fail_on_nan_loss, monitors=monitors, max_steps=max_steps)
def _train_model(self, input_fn, steps, feed_fn=None, init_op=None, init_feed_fn=None, init_fn=None, device_fn=None, monitors=None, log_every_steps=100, fail_on_nan_loss=True): # TODO(wicke): This is a hack and needs to go. if self._config.execution_mode not in ('all', 'train'): return if not self._model_dir: raise ValueError('Estimator\'s model_dir should be non-empty.') # Stagger startup of worker sessions based on task id. sleep_secs = min(self._config.training_worker_max_startup_secs, self._config.task * self._config.training_worker_session_startup_stagger_secs) if sleep_secs: logging.info('Waiting %d secs before starting task %d.', sleep_secs, self._config.task) time.sleep(sleep_secs) # Device allocation device_fn = device_fn or self._device_fn self._graph = ops.Graph() with self._graph.as_default() as g, g.device(device_fn): random_seed.set_random_seed(self._config.tf_random_seed) global_step = contrib_framework.create_global_step(g) features, targets = input_fn() self._check_inputs(features, targets) train_op, loss_op = self._get_train_ops(features, targets) # Add default monitors. if monitors is None: monitors = [] monitors += monitors_lib.get_default_monitors( loss_op=loss_op, summary_op=logging_ops.get_summary_op(), save_summary_steps=100, summary_writer=graph_actions.get_summary_writer(self._model_dir)) is_chief = self._config.task == 0 if not is_chief: # Run monitors only on chief. monitors = [] # Setup monitors. for monitor in monitors: monitor.set_estimator(self) return graph_actions.train( graph=g, output_dir=self._model_dir, train_op=train_op, loss_op=loss_op, global_step_tensor=global_step, init_op=init_op, init_feed_dict=init_feed_fn() if init_feed_fn is not None else None, init_fn=init_fn, log_every_steps=log_every_steps, supervisor_is_chief=is_chief, supervisor_master=self._config.master, feed_fn=feed_fn, max_steps=steps, fail_on_nan_loss=fail_on_nan_loss, monitors=monitors)