def _train_model(self,
                 input_fn,
                 steps,
                 feed_fn=None,
                 init_op=None,
                 init_feed_fn=None,
                 init_fn=None,
                 device_fn=None,
                 monitors=None,
                 log_every_steps=100,
                 fail_on_nan_loss=True,
                 max_steps=None):
  # TODO(wicke): Remove this once Model and associated code are gone.
  if hasattr(self._config, 'execution_mode'):
    if self._config.execution_mode not in ('all', 'train'):
      return

  # Stagger startup of worker sessions based on task id.
  sleep_secs = min(
      self._config.training_worker_max_startup_secs,
      self._config.task *
      self._config.training_worker_session_startup_stagger_secs)
  if sleep_secs:
    logging.info('Waiting %d secs before starting task %d.', sleep_secs,
                 self._config.task)
    time.sleep(sleep_secs)

  # Device allocation
  device_fn = device_fn or self._device_fn

  self._graph = ops.Graph()
  with self._graph.as_default() as g, g.device(device_fn):
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step = contrib_framework.create_global_step(g)
    features, targets = input_fn()
    self._check_inputs(features, targets)
    train_op, loss_op = self._get_train_ops(features, targets)

    # Add default monitors.
    if monitors is None:
      monitors = []

    is_chief = self._config.task == 0
    if is_chief:
      monitors += monitors_lib.get_default_monitors(
          loss_op=loss_op,
          summary_op=logging_ops.get_summary_op(),
          save_summary_steps=self._config.save_summary_steps,
          summary_writer=graph_actions.get_summary_writer(self._model_dir))
    else:
      monitors = []

    # Setup monitors.
    for monitor in monitors:
      monitor.set_estimator(self)

    return graph_actions.train(
        graph=g,
        output_dir=self._model_dir,
        train_op=train_op,
        loss_op=loss_op,
        global_step_tensor=global_step,
        init_op=init_op,
        init_feed_dict=init_feed_fn() if init_feed_fn is not None else None,
        init_fn=init_fn,
        log_every_steps=log_every_steps,
        supervisor_is_chief=is_chief,
        supervisor_master=self._config.master,
        supervisor_save_model_secs=self._config.save_checkpoints_secs,
        keep_checkpoint_max=self._config.keep_checkpoint_max,
        feed_fn=feed_fn,
        steps=steps,
        fail_on_nan_loss=fail_on_nan_loss,
        monitors=monitors,
        max_steps=max_steps)
def train(graph,
          output_dir,
          train_op,
          loss_op,
          global_step_tensor=None,
          init_op=None,
          init_feed_dict=None,
          init_fn=None,
          log_every_steps=10,
          supervisor_is_chief=True,
          supervisor_master='',
          supervisor_save_model_secs=600,
          supervisor_save_summaries_steps=100,
          feed_fn=None,
          steps=None,
          fail_on_nan_loss=True,
          monitors=None):
  """Train a model.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on
  the model; it is expected to increment the `global_step_tensor`, a scalar
  integer tensor counting training steps. The `loss_op` represents the
  objective function of the training.

  This function uses `Supervisor` to initialize the graph (from a checkpoint
  if one is available in `output_dir`), write summaries defined in the graph,
  and write regular checkpoints as defined by `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or,
  if `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the
  program is terminated with exit code 1.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is
      given, one is extracted from the graph using the same logic as in
      `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      This feed dictionary will be used when `init_op` is evaluated.
    init_fn: Optional callable passed to Supervisor to initialize the model.
    log_every_steps: Output logs every `log_every_steps` steps. The logs
      contain timing data and the current loss.
    supervisor_is_chief: Whether the current process is the chief supervisor
      in charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save a checkpoint every
      `supervisor_save_model_secs` seconds when training.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training.
    feed_fn: A function that is called every iteration to produce a
      `feed_dict` passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    monitors: List of `BaseMonitor` subclass instances. Used for callbacks
      inside the training loop.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `global_step_tensor` is not provided. See
      `tf.contrib.framework.get_global_step` for how we look it up if not
      provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
      evaluates to `NaN`.
  """
  if not output_dir:
    raise ValueError('Output directory should be non-empty.')

  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
    if global_step_tensor is None:
      raise ValueError('No "global_step" was provided or found in the graph.')

    # Get current step.
    try:
      start_step = checkpoints.load_variable(
          output_dir, global_step_tensor.name)
    except (errors.NotFoundError, ValueError):
      start_step = 0

    summary_writer = (get_summary_writer(output_dir)
                      if supervisor_is_chief else None)

    # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
    if not supervisor_is_chief:
      # Monitors should run only on the chief.
      monitors = []
    elif not monitors:
      monitors = monitors_lib.get_default_monitors(
          loss_op=loss_op,
          summary_op=logging_ops.get_summary_op(),
          save_summary_steps=supervisor_save_summaries_steps,
          summary_writer=summary_writer)

    max_steps = (start_step + steps) if steps else None
    # Start monitors, can create graph parts.
    for monitor in monitors:
      monitor.begin(max_steps=max_steps)

    supervisor = tf_supervisor.Supervisor(
        graph,
        init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT,
        init_feed_dict=init_feed_dict,
        is_chief=supervisor_is_chief,
        logdir=output_dir,
        saver=_make_saver(graph),
        global_step=global_step_tensor,
        summary_op=None,
        summary_writer=summary_writer,
        save_model_secs=supervisor_save_model_secs,
        init_fn=init_fn)
    session = supervisor.PrepareSession(master=supervisor_master,
                                        start_standard_services=True)
    supervisor.StartQueueRunners(session)

    with session:
      get_current_step = lambda: session.run(global_step_tensor)

      start_step = get_current_step()
      last_step = start_step
      last_log_step = start_step
      loss_value = None
      logging.info('Training steps [%d,%s)', last_step,
                   'inf' if max_steps is None else str(max_steps))

      excinfo = None
      try:
        while not supervisor.ShouldStop() and (
            (max_steps is None) or (last_step < max_steps)):
          start_time = time.time()
          feed_dict = feed_fn() if feed_fn is not None else None

          outputs, should_stop = _run_with_monitors(
              session, last_step + 1, [train_op, loss_op], feed_dict,
              monitors)

          loss_value = outputs[loss_op.name]
          if np.isnan(loss_value):
            failure_message = 'Model diverged with loss = NaN.'
            if fail_on_nan_loss:
              logging.error(failure_message)
              raise NanLossDuringTrainingError()
            else:
              logging.warning(failure_message)

          if should_stop:
            break

          this_step = get_current_step()

          if this_step <= last_step:
            logging.error(
                'Global step was not incremented by train op at step %s'
                ': new step %d', last_step, this_step)

          last_step = this_step
          is_last_step = (max_steps is not None) and (last_step >= max_steps)
          if is_last_step or (last_step - last_log_step >= log_every_steps):
            logging.info(
                'training step %d, loss = %.5f (%.3f sec/batch).',
                last_step, loss_value, float(time.time() - start_time))
            last_log_step = last_step
      except errors.OutOfRangeError as e:
        logging.warn('Got exception during tf.learn training loop possibly '
                     'due to exhausted input queue %s.', e)
      except BaseException as e:  # pylint: disable=broad-except
        # Hold on to any other exceptions while we try recording a final
        # checkpoint and summary.
        excinfo = sys.exc_info()
      finally:
        try:
          # Call supervisor.Stop() from within a try block because it
          # re-raises exceptions thrown by the supervised threads.
          supervisor.Stop(close_summary_writer=False)

          # Save one last checkpoint and summaries.
          # TODO(wicke): This should be handled by Supervisor.

          # In case we encountered an exception in the try block before we
          # updated last_step, update it here (again).
          last_step = get_current_step()
          if supervisor_is_chief:
            ckpt_path = supervisor.save_path
            logging.info('Saving checkpoint for step %d to checkpoint: %s.',
                         last_step, ckpt_path)
            supervisor.saver.save(session, ckpt_path, global_step=last_step)

          # Finish monitors.
          for monitor in monitors:
            monitor.end()

        # Catch OutOfRangeError which is thrown when queue is out of data
        # (and for other reasons as well).
        except errors.OutOfRangeError as e:
          logging.warn('OutOfRangeError in tf.learn final checkpoint possibly '
                       'due to exhausted input queue. Note: summary_op is not '
                       'expected to trigger dequeues. %s.', e)
        except BaseException as e:  # pylint: disable=broad-except
          # If we don't already have an exception to re-raise, raise this one.
          if not excinfo:
            raise
          # Otherwise, log this one and raise the other in the finally block.
          logging.error('Got exception during tf.learn final checkpoint %s.',
                        e)
        finally:
          if excinfo:
            reraise(*excinfo)
      return loss_value
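# --- Usage sketch (illustrative only, not part of the library) ---------------
# A minimal example of driving `train` above with a hand-built graph. It
# assumes the TF 0.x-era API (GradientDescentOptimizer, contrib.framework);
# the function name and output directory are hypothetical placeholders.
def _example_train_linear(output_dir='/tmp/tf_learn_train_example'):
  import tensorflow as tf  # Local import to keep the sketch self-contained.

  g = tf.Graph()
  with g.as_default():
    global_step = tf.contrib.framework.create_global_step()
    x = tf.constant([[1.0], [2.0], [3.0]])
    y = tf.constant([[2.0], [4.0], [6.0]])
    w = tf.Variable(tf.zeros([1, 1]), name='w')
    loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
        loss, global_step=global_step)

  # `train` is the function defined above; it restores from `output_dir` if a
  # checkpoint exists and then runs `steps` additional training steps.
  return train(
      graph=g,
      output_dir=output_dir,
      train_op=train_op,
      loss_op=loss,
      steps=10)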
def _train_internal(graph,
                    output_dir,
                    train_op,
                    loss_op,
                    global_step_tensor,
                    init_op,
                    init_feed_dict,
                    init_fn,
                    log_every_steps,
                    supervisor_is_chief,
                    supervisor_master,
                    supervisor_save_model_secs,
                    keep_checkpoint_max,
                    supervisor_save_summaries_steps,
                    feed_fn,
                    steps,
                    fail_on_nan_loss,
                    monitors,
                    max_steps):
  """See train."""
  if (steps is not None) and (max_steps is not None):
    raise ValueError('Can not provide both steps and max_steps.')
  if not output_dir:
    raise ValueError('Output directory should be non-empty %s.' % output_dir)
  if train_op is None:
    raise ValueError('Missing train_op.')
  if loss_op is None:
    raise ValueError('Missing loss_op.')

  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
    if global_step_tensor is None:
      raise ValueError('No "global_step" was provided or found in the graph.')

    # Get current step.
    try:
      start_step = load_variable(output_dir, global_step_tensor.name)
    except (errors.NotFoundError, ValueError):
      start_step = 0

    summary_writer = (get_summary_writer(output_dir)
                      if supervisor_is_chief else None)

    # Add default chief monitors if none were provided.
    if not monitors:
      monitors = monitors_lib.get_default_monitors(
          loss_op=loss_op,
          summary_op=logging_ops.get_summary_op(),
          save_summary_steps=supervisor_save_summaries_steps,
          summary_writer=summary_writer) if supervisor_is_chief else []

    # TODO(ipolosukhin): Replace all functionality of Supervisor
    # with Chief-Exclusive Monitors.
    if not supervisor_is_chief:
      # Prune the list of monitors to the ones runnable on all workers.
      monitors = [monitor for monitor in monitors
                  if monitor.run_on_all_workers]

    if max_steps is None:
      max_steps = (start_step + steps) if steps else None
    # Start monitors, can create graph parts.
    for monitor in monitors:
      monitor.begin(max_steps=max_steps)

    supervisor = tf_supervisor.Supervisor(
        graph,
        init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT,
        init_feed_dict=init_feed_dict,
        is_chief=supervisor_is_chief,
        logdir=output_dir,
        saver=_make_saver(graph, keep_checkpoint_max),
        global_step=global_step_tensor,
        summary_op=None,
        summary_writer=summary_writer,
        save_model_secs=supervisor_save_model_secs,
        init_fn=init_fn)
    session = supervisor.PrepareSession(master=supervisor_master,
                                        start_standard_services=True)
    supervisor.StartQueueRunners(session)

    with session:
      get_current_step = lambda: session.run(global_step_tensor)

      start_step = get_current_step()
      last_step = start_step
      last_log_step = start_step
      loss_value = None
      logging.info('Training steps [%d,%s)', last_step,
                   'inf' if max_steps is None else str(max_steps))

      excinfo = None
      try:
        while not supervisor.ShouldStop() and (
            (max_steps is None) or (last_step < max_steps)):
          start_time = time.time()
          feed_dict = feed_fn() if feed_fn is not None else None

          outputs, should_stop = _run_with_monitors(
              session, last_step + 1, [train_op, loss_op], feed_dict,
              monitors)

          loss_value = outputs[loss_op.name]
          if np.isnan(loss_value):
            failure_message = 'Model diverged with loss = NaN.'
            if fail_on_nan_loss:
              logging.error(failure_message)
              raise monitors_lib.NanLossDuringTrainingError()
            else:
              logging.warning(failure_message)

          if should_stop:
            break

          this_step = get_current_step()

          if this_step <= last_step:
            logging.error(
                'Global step was not incremented by train op at step %s'
                ': new step %d', last_step, this_step)

          last_step = this_step
          is_last_step = (max_steps is not None) and (last_step >= max_steps)
          if is_last_step or (last_step - last_log_step >= log_every_steps):
            logging.info(
                'training step %d, loss = %.5f (%.3f sec/batch).',
                last_step, loss_value, float(time.time() - start_time))
            last_log_step = last_step
      except errors.OutOfRangeError as e:
        logging.warn('Got exception during tf.learn training loop possibly '
                     'due to exhausted input queue %s.', e)
      except StopIteration:
        logging.info('Exhausted input iterator.')
      except BaseException as e:  # pylint: disable=broad-except
        # Hold on to any other exceptions while we try recording a final
        # checkpoint and summary.
        excinfo = sys.exc_info()
      finally:
        try:
          # Call supervisor.Stop() from within a try block because it
          # re-raises exceptions thrown by the supervised threads.
          supervisor.Stop(close_summary_writer=False)

          # Save one last checkpoint and summaries.
          # TODO(wicke): This should be handled by Supervisor.

          # In case we encountered an exception in the try block before we
          # updated last_step, update it here (again).
          last_step = get_current_step()
          if supervisor_is_chief:
            ckpt_path = supervisor.save_path
            logging.info('Saving checkpoint for step %d to checkpoint: %s.',
                         last_step, ckpt_path)
            supervisor.saver.save(session, ckpt_path, global_step=last_step)

          # Finish monitors.
          for monitor in monitors:
            monitor.end()

        # Catch OutOfRangeError which is thrown when queue is out of data
        # (and for other reasons as well).
        except errors.OutOfRangeError as e:
          logging.warn('OutOfRangeError in tf.learn final checkpoint possibly '
                       'due to exhausted input queue. Note: summary_op is not '
                       'expected to trigger dequeues. %s.', e)
        except BaseException as e:  # pylint: disable=broad-except
          # If we don't already have an exception to re-raise, raise this one.
          if not excinfo:
            raise
          # Otherwise, log this one and raise the other in the finally block.
          logging.error('Got exception during tf.learn final checkpoint %s.',
                        e)
        finally:
          if excinfo:
            reraise(*excinfo)
      return loss_value
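# --- Note on `steps` vs. `max_steps` (illustrative only) ---------------------
# `_train_internal` treats `steps` as a budget relative to the restored global
# step and `max_steps` as an absolute stopping point; passing both is an
# error. The sketch below only exercises that validation (the None arguments
# are never used because the check fires first); all values are placeholders.
def _example_steps_vs_max_steps_conflict():
  try:
    _train_internal(
        graph=None, output_dir='/tmp/unused', train_op=None, loss_op=None,
        global_step_tensor=None, init_op=None, init_feed_dict=None,
        init_fn=None, log_every_steps=10, supervisor_is_chief=True,
        supervisor_master='', supervisor_save_model_secs=600,
        keep_checkpoint_max=5, supervisor_save_summaries_steps=100,
        feed_fn=None, steps=100, fail_on_nan_loss=True, monitors=None,
        max_steps=100)
  except ValueError as e:
    # Expected: 'Can not provide both steps and max_steps.'
    return str(e)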
def train(graph,
          output_dir,
          train_op,
          loss_op,
          global_step_tensor=None,
          init_op=None,
          init_feed_dict=None,
          init_fn=None,
          log_every_steps=10,
          supervisor_is_chief=True,
          supervisor_master='',
          supervisor_save_model_secs=600,
          supervisor_save_summaries_steps=100,
          feed_fn=None,
          steps=None,
          fail_on_nan_loss=True,
          monitors=None):
  """Train a model.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on
  the model; it is expected to increment the `global_step_tensor`, a scalar
  integer tensor counting training steps. The `loss_op` represents the
  objective function of the training.

  This function uses `Supervisor` to initialize the graph (from a checkpoint
  if one is available in `output_dir`), write summaries defined in the graph,
  and write regular checkpoints as defined by `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or,
  if `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the
  program is terminated with exit code 1.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is
      given, one is extracted from the graph using the same logic as in
      `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      This feed dictionary will be used when `init_op` is evaluated.
    init_fn: Optional callable passed to Supervisor to initialize the model.
    log_every_steps: Output logs every `log_every_steps` steps. The logs
      contain timing data and the current loss.
    supervisor_is_chief: Whether the current process is the chief supervisor
      in charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save a checkpoint every
      `supervisor_save_model_secs` seconds when training.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training.
    feed_fn: A function that is called every iteration to produce a
      `feed_dict` passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    monitors: List of `BaseMonitor` subclass instances. Used for callbacks
      inside the training loop.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `global_step_tensor` is not provided. See
      `tf.contrib.framework.get_global_step` for how we look it up if not
      provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
      evaluates to `NaN`.
  """
  if not output_dir:
    raise ValueError('Output directory should be non-empty.')

  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
    if global_step_tensor is None:
      raise ValueError(
          'No "global_step" was provided or found in the graph.')

    # Get current step.
    try:
      start_step = checkpoints.load_variable(output_dir,
                                             global_step_tensor.name)
    except (errors.NotFoundError, ValueError):
      start_step = 0

    summary_writer = (get_summary_writer(output_dir)
                      if supervisor_is_chief else None)

    # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
    if not supervisor_is_chief:
      # Monitors should run only on the chief.
      monitors = []
    elif not monitors:
      monitors = monitors_lib.get_default_monitors(
          loss_op=loss_op,
          summary_op=logging_ops.get_summary_op(),
          save_summary_steps=supervisor_save_summaries_steps,
          summary_writer=summary_writer)

    # Start monitors, can create graph parts.
    for monitor in monitors:
      monitor.begin(max_steps=start_step + steps)

    supervisor = tf_supervisor.Supervisor(
        graph,
        init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT,
        init_feed_dict=init_feed_dict,
        is_chief=supervisor_is_chief,
        logdir=output_dir,
        saver=_make_saver(graph),
        global_step=global_step_tensor,
        summary_op=None,
        summary_writer=summary_writer,
        save_model_secs=supervisor_save_model_secs,
        init_fn=init_fn)
    session = supervisor.PrepareSession(master=supervisor_master,
                                        start_standard_services=True)
    supervisor.StartQueueRunners(session)

    with session:
      get_current_step = lambda: session.run(global_step_tensor)

      start_step = get_current_step()
      max_steps = start_step + steps
      last_step = start_step
      last_log_step = start_step
      loss_value = None
      logging.info('Training steps [%d,%s)', last_step,
                   'inf' if max_steps is None else str(max_steps))

      excinfo = None
      try:
        while not supervisor.ShouldStop() and (
            (max_steps is None) or (last_step < max_steps)):
          start_time = time.time()
          feed_dict = feed_fn() if feed_fn is not None else None

          outputs, should_stop = _run_with_monitors(
              session, last_step + 1, [train_op, loss_op], feed_dict,
              monitors)

          loss_value = outputs[loss_op.name]
          if np.isnan(loss_value):
            failure_message = 'Model diverged with loss = NaN.'
            if fail_on_nan_loss:
              logging.error(failure_message)
              raise NanLossDuringTrainingError()
            else:
              logging.warning(failure_message)

          if should_stop:
            break

          this_step = get_current_step()

          if this_step <= last_step:
            logging.error(
                'Global step was not incremented by train op at step %s'
                ': new step %d', last_step, this_step)

          last_step = this_step
          is_last_step = (max_steps is not None) and (last_step >= max_steps)
          if is_last_step or (last_step - last_log_step >= log_every_steps):
            logging.info(
                'training step %d, loss = %.5f (%.3f sec/batch).',
                last_step, loss_value, float(time.time() - start_time))
            last_log_step = last_step
      except errors.OutOfRangeError as e:
        logging.warn('Got exception during tf.learn training loop possibly '
                     'due to exhausted input queue %s.', e)
      except BaseException as e:  # pylint: disable=broad-except
        # Hold on to any other exceptions while we try recording a final
        # checkpoint and summary.
        excinfo = sys.exc_info()
      finally:
        try:
          # Call supervisor.Stop() from within a try block because it
          # re-raises exceptions thrown by the supervised threads.
          supervisor.Stop(close_summary_writer=False)

          # Save one last checkpoint and summaries.
          # TODO(wicke): This should be handled by Supervisor.

          # In case we encountered an exception in the try block before we
          # updated last_step, update it here (again).
          last_step = get_current_step()
          if supervisor_is_chief:
            ckpt_path = supervisor.save_path
            logging.info('Saving checkpoint for step %d to checkpoint: %s.',
                         last_step, ckpt_path)
            supervisor.saver.save(session, ckpt_path, global_step=last_step)

          # Finish monitors.
          for monitor in monitors:
            monitor.end()

        # Catch OutOfRangeError which is thrown when queue is out of data
        # (and for other reasons as well).
        except errors.OutOfRangeError as e:
          logging.warn('OutOfRangeError in tf.learn final checkpoint possibly '
                       'due to exhausted input queue. Note: summary_op is not '
                       'expected to trigger dequeues. %s.', e)
        except BaseException as e:  # pylint: disable=broad-except
          # If we don't already have an exception to re-raise, raise this one.
          if not excinfo:
            raise
          # Otherwise, log this one and raise the other in the finally block.
          logging.error('Got exception during tf.learn final checkpoint %s.',
                        e)
        finally:
          if excinfo:
            reraise(*excinfo)
      return loss_value
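# --- `feed_fn` usage sketch (illustrative only) -------------------------------
# `train` calls `feed_fn()` once per step and passes the result to
# `session.run` as the feed dict, which is how placeholder-based inputs plug
# in. The batch generator and names below are hypothetical; a TF 0.x-era API
# is assumed.
def _example_train_with_feed_fn(output_dir='/tmp/tf_learn_feed_fn_example'):
  import numpy as np
  import tensorflow as tf  # Local imports keep the sketch self-contained.

  g = tf.Graph()
  with g.as_default():
    global_step = tf.contrib.framework.create_global_step()
    x = tf.placeholder(tf.float32, shape=[None, 1], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 1], name='y')
    w = tf.Variable(tf.zeros([1, 1]), name='w')
    loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
        loss, global_step=global_step)

  def feed_fn():
    # Each call produces one batch; the targets follow y = 2x.
    batch = np.random.rand(4, 1).astype(np.float32)
    return {x: batch, y: 2.0 * batch}

  return train(
      graph=g,
      output_dir=output_dir,
      train_op=train_op,
      loss_op=loss,
      feed_fn=feed_fn,
      steps=10)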
def _train_model(self,
                 input_fn,
                 steps,
                 feed_fn=None,
                 init_op=None,
                 init_feed_fn=None,
                 init_fn=None,
                 device_fn=None,
                 monitors=None,
                 log_every_steps=100,
                 fail_on_nan_loss=True):
  if self._config.execution_mode not in ('all', 'train'):
    return

  # Stagger startup of worker sessions based on task id.
  sleep_secs = min(
      self._config.training_worker_max_startup_secs,
      self._config.task *
      self._config.training_worker_session_startup_stagger_secs)
  if sleep_secs:
    logging.info('Waiting %d secs before starting task %d.', sleep_secs,
                 self._config.task)
    time.sleep(sleep_secs)

  # Device allocation
  device_fn = device_fn or self._device_fn

  self._graph = ops.Graph()
  with self._graph.as_default() as g, g.device(device_fn):
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step = contrib_framework.create_global_step(g)
    features, targets = input_fn()
    self._check_inputs(features, targets)
    train_op, loss_op = self._get_train_ops(features, targets)

    # Add default monitors.
    if monitors is None:
      monitors = []
    monitors += monitors_lib.get_default_monitors(
        loss_op=loss_op,
        summary_op=logging_ops.get_summary_op(),
        save_summary_steps=100,
        summary_writer=graph_actions.get_summary_writer(self._model_dir))

    is_chief = self._config.task == 0
    if not is_chief:
      # Run monitors only on chief.
      monitors = []

    # Setup monitors.
    for monitor in monitors:
      monitor.set_estimator(self)

    return train(
        graph=g,
        output_dir=self._model_dir,
        train_op=train_op,
        loss_op=loss_op,
        global_step_tensor=global_step,
        init_op=init_op,
        init_feed_dict=init_feed_fn() if init_feed_fn is not None else None,
        init_fn=init_fn,
        log_every_steps=log_every_steps,
        supervisor_is_chief=is_chief,
        supervisor_master=self._config.master,
        feed_fn=feed_fn,
        max_steps=steps,
        fail_on_nan_loss=fail_on_nan_loss,
        monitors=monitors)
def evaluate(graph,
             output_dir,
             checkpoint_path,
             eval_dict,
             update_op=None,
             global_step_tensor=None,
             supervisor_master='',
             log_every_steps=10,
             feed_fn=None,
             max_steps=None):
  """Evaluate a model loaded from a checkpoint.

  Given `graph`, a directory to write summaries to (`output_dir`), a checkpoint
  to restore variables from, and a `dict` of `Tensor`s to evaluate, run an eval
  loop for `max_steps` steps.

  In each step of evaluation, all tensors in the `eval_dict` are evaluated, and
  every `log_every_steps` steps, they are logged. At the very end of
  evaluation, a summary is evaluated (finding the summary ops using
  `Supervisor`'s logic) and written to `output_dir`.

  Args:
    graph: A `Graph` to evaluate. It is expected that this graph is not in use
      elsewhere.
    output_dir: A string containing the directory to write a summary to.
    checkpoint_path: A string containing the path to a checkpoint to restore.
      Can be `None` if the graph doesn't require loading any variables.
    eval_dict: A `dict` mapping string names to tensors to evaluate. It is
      evaluated in every logging step. The result of the final evaluation is
      returned. If `update_op` is None, then it's evaluated in every step.
    update_op: A `Tensor` which is run in every step.
    global_step_tensor: A `Variable` containing the global step. If `None`,
      one is extracted from the graph using the same logic as in `Supervisor`.
      Used to place eval summaries on training curves.
    supervisor_master: The master string to use when preparing the session.
    log_every_steps: Integer. Output logs every `log_every_steps` evaluation
      steps. The logs contain the `eval_dict` and timing information.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    max_steps: Integer. Evaluate `eval_dict` this many times.

  Returns:
    A tuple `(eval_results, global_step)`:
    eval_results: A `dict` mapping `string` to numeric values (`int`, `float`)
      that are the result of running eval_dict in the last step. `None` if no
      eval steps were run.
    global_step: The global step this evaluation corresponds to.
  """
  global_step_tensor = contrib_variables.assert_or_get_global_step(
      graph, global_step_tensor)

  # Add scalar summaries for every tensor in evaluation dict if there is not
  # one existing already or it's a string.
  existing_tags = [tensor_util.constant_value(summary.op.inputs[0])
                   for summary in ops.get_collection(ops.GraphKeys.SUMMARIES)]
  for key, value in eval_dict.items():
    if key in existing_tags:
      continue
    if isinstance(value, ops.Tensor):
      summaries.summarize_tensor(value, tag=key)

  # Create or get summary op, global_step and saver.
  summary_op = logging_ops.get_summary_op()
  saver = _get_saver()
  local_init_op = _get_local_init_op()
  ready_op = _get_ready_op()

  session_manager = session_manager_lib.SessionManager(
      local_init_op=local_init_op,
      ready_op=ready_op)
  session, initialized = session_manager.recover_session(
      master=supervisor_master,
      saver=saver,
      checkpoint_dir=checkpoint_path)

  # Start queue runners.
  coord = coordinator.Coordinator()
  threads = _start_queue_runners(session, coord)

  with session:
    if not initialized:
      logging.warning('Failed to initialize from %s.', checkpoint_path)
      # TODO(ipolosukhin): This should be failing, but old code relies on that.
      session.run(variables.initialize_all_variables())
      if checkpoint_path:
        _restore_from_checkpoint(session, graph, checkpoint_path, saver)

    current_global_step = session.run(global_step_tensor)
    eval_results = None
    # TODO(amodei): Fix this to run through the eval set exactly once.
    step = 0
    logging.info('Eval steps [%d,%s) for training step %d.', step,
                 'inf' if max_steps is None else str(max_steps),
                 current_global_step)
    try:
      try:
        while (max_steps is None) or (step < max_steps):
          start_time = time.time()
          feed_dict = feed_fn() if feed_fn is not None else None
          eval_results = None
          if update_op is not None:
            session.run(update_op, feed_dict=feed_dict)
          else:
            eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)

          # TODO(wicke): We should assert that the global step hasn't changed.
          step += 1
          if step % log_every_steps == 0:
            if eval_results is None:
              eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)
            duration = time.time() - start_time
            logging.info('Results after %d steps (%.3f sec/batch): %s.',
                         step, float(duration),
                         ', '.join('%s = %s' % (k, v)
                                   for k, v in eval_results.items()))
      finally:
        if eval_results is None:
          eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)

        # Stop queue runners.
        coord.request_stop()
        coord.join(threads, stop_grace_period_secs=120)

        # Make our own summary writer and write a summary to the eval dir.
        # Only if feed_fn is not provided.
        # TODO(ipolosukhin): Convert evaluation to use streaming_metrics,
        # then we can save for non feed_fn as well.
        if summary_op is not None and feed_fn is None:
          summary_writer = None
          try:
            summary_writer = get_summary_writer(output_dir)
            summary_str = session.run(summary_op)
            if summary_str:
              summary_writer.add_summary(summary_str, current_global_step)
          finally:
            if summary_writer:
              summary_writer.close()
    # Catch OutOfRangeError which is thrown when queue is out of data (and for
    # other reasons as well).
    except errors.OutOfRangeError as e:
      if max_steps is None:
        logging.info('Input queue is exhausted.')
      else:
        logging.warn('Input queue is exhausted: %s.', e)
    # Catch StopIteration which is thrown if DataReader is out of data.
    except StopIteration as e:
      if max_steps is None:
        logging.info('Input iterator is exhausted.')
      else:
        logging.warn('Input iterator is exhausted: %s.', e)

  return eval_results, current_global_step
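# --- Usage sketch for `evaluate` with an `update_op` (illustrative only) ------
# When `update_op` is given, it is run on every eval step and `eval_dict` is
# only read when logging and once at the end. The toy accumulator below stands
# in for a streaming metric; the directory and names are placeholders, and a
# TF 0.x-era API is assumed.
def _example_evaluate_with_update_op(eval_dir='/tmp/tf_learn_eval_example'):
  import tensorflow as tf  # Local import to keep the sketch self-contained.

  g = tf.Graph()
  with g.as_default():
    tf.contrib.framework.create_global_step()
    total = tf.Variable(0.0, name='total')
    update_op = tf.assign_add(total, 1.0)
    # `checkpoint_path=None` is allowed because no variables need restoring;
    # initialization falls back to plain variable initialization.
    results, global_step = evaluate(
        graph=g,
        output_dir=eval_dir,
        checkpoint_path=None,
        eval_dict={'total': total},
        update_op=update_op,
        max_steps=5)
  return results, global_step  # results['total'] ends up at 5.0.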
def evaluate(graph,
             output_dir,
             checkpoint_path,
             eval_dict,
             global_step_tensor=None,
             init_op=None,
             supervisor_master='',
             log_every_steps=10,
             feed_fn=None,
             max_steps=None):
  """Evaluate a model loaded from a checkpoint.

  Given `graph`, a directory to write summaries to (`output_dir`), a checkpoint
  to restore variables from, and a `dict` of `Tensor`s to evaluate, run an eval
  loop for `max_steps` steps.

  In each step of evaluation, all tensors in the `eval_dict` are evaluated, and
  every `log_every_steps` steps, they are logged. At the very end of
  evaluation, a summary is evaluated (finding the summary ops using
  `Supervisor`'s logic) and written to `output_dir`.

  Args:
    graph: A `Graph` to evaluate. It is expected that this graph is not in use
      elsewhere.
    output_dir: A string containing the directory to write a summary to.
    checkpoint_path: A string containing the path to a checkpoint to restore.
      Can be `None` if the graph doesn't require loading any variables.
    eval_dict: A `dict` mapping string names to tensors to evaluate in every
      eval step.
    global_step_tensor: A `Variable` containing the global step. If `None`,
      one is extracted from the graph using the same logic as in `Supervisor`.
      Used to place eval summaries on training curves.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    supervisor_master: The master string to use when preparing the session.
    log_every_steps: Integer. Output logs every `log_every_steps` evaluation
      steps. The logs contain the `eval_dict` and timing information.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    max_steps: Integer. Evaluate `eval_dict` this many times.

  Returns:
    A tuple `(eval_results, global_step)`:
    eval_results: A `dict` mapping `string` to numeric values (`int`, `float`)
      that are the eval results from the last step of the eval. None if no
      eval steps were run.
    global_step: The global step this evaluation corresponds to.
  """
  global_step_tensor = contrib_variables.assert_or_get_global_step(
      graph, global_step_tensor)

  # Add scalar summaries for every tensor in evaluation dict if there is not
  # one existing already or it's a string.
  existing_tags = [tensor_util.constant_value(summary.op.inputs[0])
                   for summary in ops.get_collection(ops.GraphKeys.SUMMARIES)]
  for key, value in eval_dict.items():
    if key in existing_tags:
      continue
    if isinstance(value, ops.Tensor):
      summaries.summarize_tensor(value, tag=key)

  # Create or get summary op.
  summary_op = logging_ops.get_summary_op()

  # TODO(wicke): Don't use supervisor here, or switch to output_dir=eval_dir.
  supervisor, session = _prepare_session(
      graph=graph,
      output_dir=None,  # Must be None to avoid writing an event file
      start_services=False,
      global_step_tensor=global_step_tensor,
      init_op=init_op,
      supervisor_is_chief=True,
      supervisor_master=supervisor_master,
      supervisor_save_model_secs=None)
  global_step_tensor = supervisor.global_step

  with session:
    if checkpoint_path:
      _restore_from_checkpoint(session, graph, checkpoint_path,
                               supervisor.saver)

    current_global_step = session.run(global_step_tensor)
    eval_results = None
    # TODO(amodei): Fix this to run through the eval set exactly once.
    step = 0
    logging.info('Eval steps [%d,%s)', step,
                 'inf' if max_steps is None else str(max_steps))
    try:
      try:
        while not supervisor.ShouldStop() and (
            (max_steps is None) or (step < max_steps)):
          start_time = time.time()
          feed_dict = feed_fn() if feed_fn is not None else None
          eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)

          # TODO(wicke): We should assert that the global step hasn't changed.
          step += 1
          if step % log_every_steps == 0:
            duration = time.time() - start_time
            logging.info('Results after %d steps (%.3f sec/batch): %s.',
                         step, float(duration),
                         ', '.join('%s = %s' % (k, v)
                                   for k, v in eval_results.items()))
      finally:
        # Make our own summary writer and write a summary to the eval dir.
        # Only if feed_fn is not provided.
        # TODO(ipolosukhin): Convert evaluation to use streaming_metrics,
        # then we can save for non feed_fn as well.
        if summary_op is not None and feed_fn is None:
          summary_writer = None
          try:
            summary_writer = SummaryWriter(output_dir,
                                           graph_def=session.graph_def)
            summary_str = session.run(summary_op)
            if summary_str:
              summary_writer.add_summary(summary_str, current_global_step)
          finally:
            if summary_writer:
              summary_writer.close()

        # Call supervisor.Stop() from within a try block because it re-raises
        # exceptions thrown by the supervised threads.
        supervisor.Stop()
    # Catch OutOfRangeError which is thrown when queue is out of data (and for
    # other reasons as well).
    except errors.OutOfRangeError as e:
      logging.warn('Input queue is exhausted: %s.', e)
    # Catch StopIteration which is thrown if DataReader is out of data.
    except StopIteration as e:
      logging.info('Input iterator is exhausted: %s.', e)

  return eval_results, current_global_step
def _train_model(self,
                 input_fn,
                 steps,
                 feed_fn=None,
                 init_op=None,
                 init_feed_fn=None,
                 init_fn=None,
                 device_fn=None,
                 monitors=None,
                 log_every_steps=100,
                 fail_on_nan_loss=True):
  # TODO(wicke): This is a hack and needs to go.
  if self._config.execution_mode not in ('all', 'train'):
    return

  if not self._model_dir:
    raise ValueError('Estimator\'s model_dir should be non-empty.')

  # Stagger startup of worker sessions based on task id.
  sleep_secs = min(
      self._config.training_worker_max_startup_secs,
      self._config.task *
      self._config.training_worker_session_startup_stagger_secs)
  if sleep_secs:
    logging.info('Waiting %d secs before starting task %d.', sleep_secs,
                 self._config.task)
    time.sleep(sleep_secs)

  # Device allocation
  device_fn = device_fn or self._device_fn

  self._graph = ops.Graph()
  with self._graph.as_default() as g, g.device(device_fn):
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step = contrib_framework.create_global_step(g)
    features, targets = input_fn()
    self._check_inputs(features, targets)
    train_op, loss_op = self._get_train_ops(features, targets)

    # Add default monitors.
    if monitors is None:
      monitors = []
    monitors += monitors_lib.get_default_monitors(
        loss_op=loss_op,
        summary_op=logging_ops.get_summary_op(),
        save_summary_steps=100,
        summary_writer=graph_actions.get_summary_writer(self._model_dir))

    is_chief = self._config.task == 0
    if not is_chief:
      # Run monitors only on chief.
      monitors = []

    # Setup monitors.
    for monitor in monitors:
      monitor.set_estimator(self)

    return graph_actions.train(
        graph=g,
        output_dir=self._model_dir,
        train_op=train_op,
        loss_op=loss_op,
        global_step_tensor=global_step,
        init_op=init_op,
        init_feed_dict=init_feed_fn() if init_feed_fn is not None else None,
        init_fn=init_fn,
        log_every_steps=log_every_steps,
        supervisor_is_chief=is_chief,
        supervisor_master=self._config.master,
        feed_fn=feed_fn,
        max_steps=steps,
        fail_on_nan_loss=fail_on_nan_loss,
        monitors=monitors)
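# --- `input_fn` sketch (illustrative only) ------------------------------------
# `_train_model` expects `input_fn` to build its input tensors inside the
# freshly created graph and return a `(features, targets)` pair. A
# constant-tensor stand-in (the feature key 'x' is arbitrary):
def _example_input_fn():
  import tensorflow as tf  # Local import to keep the sketch self-contained.
  features = {'x': tf.constant([[1.0], [2.0], [3.0]])}
  targets = tf.constant([[2.0], [4.0], [6.0]])
  return features, targets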
def evaluate(graph,
             output_dir,
             checkpoint_path,
             eval_dict,
             update_op=None,
             global_step_tensor=None,
             supervisor_master='',
             log_every_steps=10,
             feed_fn=None,
             max_steps=None):
  """Evaluate a model loaded from a checkpoint.

  Given `graph`, a directory to write summaries to (`output_dir`), a checkpoint
  to restore variables from, and a `dict` of `Tensor`s to evaluate, run an eval
  loop for `max_steps` steps.

  In each step of evaluation, all tensors in the `eval_dict` are evaluated, and
  every `log_every_steps` steps, they are logged. At the very end of
  evaluation, a summary is evaluated (finding the summary ops using
  `Supervisor`'s logic) and written to `output_dir`.

  Args:
    graph: A `Graph` to evaluate. It is expected that this graph is not in use
      elsewhere.
    output_dir: A string containing the directory to write a summary to.
    checkpoint_path: A string containing the path to a checkpoint to restore.
      Can be `None` if the graph doesn't require loading any variables.
    eval_dict: A `dict` mapping string names to tensors to evaluate. It is
      evaluated in every logging step. The result of the final evaluation is
      returned. If `update_op` is None, then it's evaluated in every step.
    update_op: A `Tensor` which is run in every step.
    global_step_tensor: A `Variable` containing the global step. If `None`,
      one is extracted from the graph using the same logic as in `Supervisor`.
      Used to place eval summaries on training curves.
    supervisor_master: The master string to use when preparing the session.
    log_every_steps: Integer. Output logs every `log_every_steps` evaluation
      steps. The logs contain the `eval_dict` and timing information.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    max_steps: Integer. Evaluate `eval_dict` this many times.

  Returns:
    A tuple `(eval_results, global_step)`:
    eval_results: A `dict` mapping `string` to numeric values (`int`, `float`)
      that are the result of running eval_dict in the last step. `None` if no
      eval steps were run.
    global_step: The global step this evaluation corresponds to.
  """
  global_step_tensor = contrib_variables.assert_or_get_global_step(
      graph, global_step_tensor)

  # Add scalar summaries for every tensor in evaluation dict if there is not
  # one existing already or it's a string.
  existing_tags = [tensor_util.constant_value(summary.op.inputs[0])
                   for summary in ops.get_collection(ops.GraphKeys.SUMMARIES)]
  for key, value in eval_dict.items():
    if key in existing_tags:
      continue
    if isinstance(value, ops.Tensor):
      summaries.summarize_tensor(value, tag=key)

  # Create or get summary op, global_step and saver.
  summary_op = logging_ops.get_summary_op()
  saver = _get_saver()
  local_init_op = _get_local_init_op()
  ready_op = _get_ready_op()

  session_manager = session_manager_lib.SessionManager(
      local_init_op=local_init_op,
      ready_op=ready_op)
  session, initialized = session_manager.recover_session(
      master=supervisor_master,
      saver=saver,
      checkpoint_dir=checkpoint_path)

  # Start queue runners.
  coord = coordinator.Coordinator()
  threads = _start_queue_runners(session, coord)

  with session:
    if not initialized:
      logging.warning('Failed to initialize from %s.', checkpoint_path)
      # TODO(ipolosukhin): This should be failing, but old code relies on that.
      session.run(variables.initialize_all_variables())
      if checkpoint_path:
        _restore_from_checkpoint(session, graph, checkpoint_path, saver)

    current_global_step = session.run(global_step_tensor)
    eval_results = None
    # TODO(amodei): Fix this to run through the eval set exactly once.
    step = 0
    logging.info('Eval steps [%d,%s) for training step %d.', step,
                 'inf' if max_steps is None else str(max_steps),
                 current_global_step)
    try:
      try:
        while (max_steps is None) or (step < max_steps):
          start_time = time.time()
          feed_dict = feed_fn() if feed_fn is not None else None
          eval_results = None
          if update_op is not None:
            session.run(update_op, feed_dict=feed_dict)
          else:
            eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)

          # TODO(wicke): We should assert that the global step hasn't changed.
          step += 1
          if step % log_every_steps == 0:
            if eval_results is None:
              eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)
            duration = time.time() - start_time
            logging.info('Results after %d steps (%.3f sec/batch): %s.',
                         step, float(duration),
                         ', '.join('%s = %s' % (k, v)
                                   for k, v in eval_results.items()))
      finally:
        if eval_results is None:
          eval_results = _run_dict(session, eval_dict, feed_dict=feed_dict)

        # Stop queue runners.
        coord.request_stop()
        coord.join(threads, stop_grace_period_secs=120)

        # Make our own summary writer and write a summary to the eval dir.
        # Only if feed_fn is not provided.
        # TODO(ipolosukhin): Convert evaluation to use streaming_metrics,
        # then we can save for non feed_fn as well.
        if summary_op is not None and feed_fn is None:
          summary_writer = None
          try:
            summary_writer = SummaryWriter(output_dir,
                                           graph_def=session.graph_def)
            summary_str = session.run(summary_op)
            if summary_str:
              summary_writer.add_summary(summary_str, current_global_step)
          finally:
            if summary_writer:
              summary_writer.close()
    # Catch OutOfRangeError which is thrown when queue is out of data (and for
    # other reasons as well).
    except errors.OutOfRangeError as e:
      if max_steps is None:
        logging.info('Input queue is exhausted.')
      else:
        logging.warn('Input queue is exhausted: %s.', e)
    # Catch StopIteration which is thrown if DataReader is out of data.
    except StopIteration as e:
      if max_steps is None:
        logging.info('Input iterator is exhausted.')
      else:
        logging.warn('Input iterator is exhausted: %s.', e)

  return eval_results, current_global_step
def evaluate(
    graph,
    output_dir,
    checkpoint_path,
    eval_dict,
    global_step_tensor=None,
    init_op=None,
    supervisor_master="",
    log_every_steps=10,
    feed_fn=None,
    max_steps=None,
):
    """Evaluate a model loaded from a checkpoint.

    Given `graph`, a directory to write summaries to (`output_dir`), a
    checkpoint to restore variables from, and a `dict` of `Tensor`s to
    evaluate, run an eval loop for `max_steps` steps.

    In each step of evaluation, all tensors in the `eval_dict` are evaluated,
    and every `log_every_steps` steps, they are logged. At the very end of
    evaluation, a summary is evaluated (finding the summary ops using
    `Supervisor`'s logic) and written to `output_dir`.

    Args:
      graph: A `Graph` to evaluate. It is expected that this graph is not in
        use elsewhere.
      output_dir: A string containing the directory to write a summary to.
      checkpoint_path: A string containing the path to a checkpoint to
        restore. Can be `None` if the graph doesn't require loading any
        variables.
      eval_dict: A `dict` mapping string names to tensors to evaluate in every
        eval step.
      global_step_tensor: A `Variable` containing the global step. If `None`,
        one is extracted from the graph using the same logic as in
        `Supervisor`. Used to place eval summaries on training curves.
      init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
        default.
      supervisor_master: The master string to use when preparing the session.
      log_every_steps: Integer. Output logs every `log_every_steps` evaluation
        steps. The logs contain the `eval_dict` and timing information.
      feed_fn: A function that is called every iteration to produce a
        `feed_dict` passed to `session.run` calls. Optional.
      max_steps: Integer. Evaluate `eval_dict` this many times.

    Returns:
      A tuple `(eval_results, global_step)`:
      eval_results: A `dict` mapping `string` to numeric values (`int`,
        `float`) that are the eval results from the last step of the eval.
        None if no eval steps were run.
      global_step: The global step this evaluation corresponds to.
    """
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor
    )

    # Add scalar summaries for every tensor in evaluation dict if there is not
    # one existing already or it's a string.
    existing_tags = [
        tensor_util.constant_value(summary.op.inputs[0])
        for summary in ops.get_collection(ops.GraphKeys.SUMMARIES)
    ]
    for key, value in eval_dict.items():
        if key in existing_tags:
            continue
        if isinstance(value, ops.Tensor):
            summaries.summarize_tensor(value, tag=key)

    # Create or get summary op.
    summary_op = logging_ops.get_summary_op()

    # TODO(wicke): Don't use supervisor here, or switch to output_dir=eval_dir.
    supervisor, session = _prepare_session(
        graph=graph,
        output_dir=None,  # Must be None to avoid writing an event file
        start_services=False,
        global_step_tensor=global_step_tensor,
        init_op=init_op,
        supervisor_is_chief=True,
        supervisor_master=supervisor_master,
        supervisor_save_model_secs=None,
    )
    global_step_tensor = supervisor.global_step

    with session:
        if checkpoint_path:
            _restore_from_checkpoint(
                session, graph, checkpoint_path, supervisor.saver
            )

        current_global_step = session.run(global_step_tensor)
        eval_results = None
        # TODO(amodei): Fix this to run through the eval set exactly once.
        step = 0
        logging.info(
            "Eval steps [%d,%s)",
            step,
            "inf" if max_steps is None else str(max_steps),
        )
        try:
            try:
                while not supervisor.ShouldStop() and (
                    (max_steps is None) or (step < max_steps)
                ):
                    start_time = time.time()
                    feed_dict = feed_fn() if feed_fn is not None else None
                    eval_results = _run_dict(
                        session, eval_dict, feed_dict=feed_dict
                    )

                    # TODO(wicke): We should assert that the global step
                    # hasn't changed.
                    step += 1
                    if step % log_every_steps == 0:
                        duration = time.time() - start_time
                        logging.info(
                            "Results after %d steps (%.3f sec/batch): %s.",
                            step,
                            float(duration),
                            ", ".join(
                                "%s = %s" % (k, v)
                                for k, v in eval_results.items()
                            ),
                        )
            finally:
                # Make our own summary writer and write a summary to the eval
                # dir. Only if feed_fn is not provided.
                # TODO(ipolosukhin): Convert evaluation to use
                # streaming_metrics, then we can save for non feed_fn as well.
                if summary_op is not None and feed_fn is None:
                    summary_writer = None
                    try:
                        summary_writer = SummaryWriter(
                            output_dir, graph_def=session.graph_def
                        )
                        summary_str = session.run(summary_op)
                        if summary_str:
                            summary_writer.add_summary(
                                summary_str, current_global_step
                            )
                    finally:
                        if summary_writer:
                            summary_writer.close()

                # Call supervisor.Stop() from within a try block because it
                # re-raises exceptions thrown by the supervised threads.
                supervisor.Stop()
        # Catch OutOfRangeError which is thrown when queue is out of data (and
        # for other reasons as well).
        except errors.OutOfRangeError as e:
            logging.warn("Input queue is exhausted: %s.", e)
        # Catch StopIteration which is thrown if DataReader is out of data.
        except StopIteration as e:
            logging.info("Input iterator is exhausted: %s.", e)

    return eval_results, current_global_step