Example #1
  def _init_summary_op(self, summary_op=USE_DEFAULT):
    """Initilizes summary_op.

    Args:
      summary_op: An Operation that returns a Summary for the event logs.
        If set to USE_DEFAULT, create an op that merges all the summaries.
    """
    if summary_op is Supervisor.USE_DEFAULT:
      summary_op = self._get_first_op_from_collection(ops.GraphKeys.SUMMARY_OP)
      if summary_op is None:
        summary_op = logging_ops.merge_all_summaries()
        if summary_op is not None:
          ops.add_to_collection(ops.GraphKeys.SUMMARY_OP, summary_op)
    self._summary_op = summary_op
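The method above relies on a module-level USE_DEFAULT sentinel so callers can distinguish "argument omitted" from an explicit `None` (which disables summaries). Below is a minimal, framework-free sketch of that fallback pattern; the `_USE_DEFAULT` object and `_summary_registry` list are illustrative stand-ins, not TensorFlow names.

# Sketch of the USE_DEFAULT fallback pattern used above (illustrative names).
_USE_DEFAULT = object()      # unique sentinel; never equal to any real value
_summary_registry = []       # stand-in for the SUMMARY_OP graph collection

def init_summary_op(summary_op=_USE_DEFAULT):
    """Returns the explicit op, the first registered op, or None."""
    if summary_op is _USE_DEFAULT:
        # Argument was omitted: fall back to a previously registered op.
        summary_op = _summary_registry[0] if _summary_registry else None
    return summary_op

assert init_summary_op(None) is None             # explicit None disables summaries
_summary_registry.append('merged_summaries')
assert init_summary_op() == 'merged_summaries'   # omitted arg uses the registry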
Example #2
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    initial_op=None,
                    initial_op_feed_dict=None,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None,
                    session_config=None,
                    timeout=None):
  """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    initial_op: An operation run at the beginning of evaluation.
    initial_op_feed_dict: A feed dictionary to use when executing `initial_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops.
      By default, the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: The maximum number of evaluation iterations to
      run. If left as `None`, the evaluation continues indefinitely.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    timeout: The maximum amount of time to wait between checkpoints. If left as
      `None`, then the process will wait indefinitely.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=saver)

  number_of_evaluations = 0
  for checkpoint_path in checkpoints_iterator(checkpoint_dir,
                                              eval_interval_secs,
                                              timeout):
    logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))

    with sv.managed_session(
        master, start_standard_services=False, config=session_config) as sess:
      sv.saver.restore(sess, checkpoint_path)
      sv.start_queue_runners(sess)
      final_op_value = evaluation(sess,
                                  num_evals=num_evals,
                                  initial_op=initial_op,
                                  initial_op_feed_dict=initial_op_feed_dict,
                                  eval_op=eval_op,
                                  eval_op_feed_dict=eval_op_feed_dict,
                                  final_op=final_op,
                                  final_op_feed_dict=final_op_feed_dict,
                                  summary_op=summary_op,
                                  summary_op_feed_dict=summary_op_feed_dict,
                                  summary_writer=summary_writer,
                                  global_step=global_step)

    logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))
    number_of_evaluations += 1
    if (max_number_of_evaluations and
        number_of_evaluations >= max_number_of_evaluations):
      logging.info('Reached max_number_of_evaluations=%s. Exit',
                   max_number_of_evaluations)
      return final_op_value

  logging.info(
      'Timed-out waiting for new checkpoint file. Exiting evaluation loop.')
  return final_op_value
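A hedged usage sketch for the loop above, assuming the same tf.contrib.slim era as the code; the metric construction, checkpoint directory, and log directory are caller-supplied placeholders rather than anything taken from the original source.

# Sketch only: repeatedly evaluate new checkpoints with evaluation_loop.
import tensorflow as tf

slim = tf.contrib.slim

# Hypothetical eval graph: `logits` and `labels` come from the caller's model
# and input pipeline.
predictions = tf.argmax(logits, 1)
value_op, update_op = slim.metrics.streaming_accuracy(predictions, labels)

slim.evaluation.evaluation_loop(
    master='',
    checkpoint_dir='/tmp/train_logs',   # placeholder: where checkpoints land
    logdir='/tmp/eval_logs',            # placeholder: where eval summaries go
    num_evals=100,                      # batches consumed per evaluation pass
    eval_op=update_op,                  # run num_evals times per checkpoint
    final_op=value_op,                  # its value is returned after each pass
    eval_interval_secs=60)              # wait at least a minute between passes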
Example #3
def evaluate_once(master,
                  checkpoint_path,
                  logdir,
                  num_evals=1,
                  initial_op=None,
                  initial_op_feed_dict=None,
                  eval_op=None,
                  eval_op_feed_dict=None,
                  final_op=None,
                  final_op_feed_dict=None,
                  summary_op=_USE_DEFAULT,
                  summary_op_feed_dict=None,
                  variables_to_restore=None,
                  session_config=None):
  """Evaluates the model at the given checkpoint path.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_path: The path to a checkpoint to use for evaluation.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    initial_op: An operation run at the beginning of evaluation.
    initial_op_feed_dict: A feed dictionary to use when executing `initial_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops.
      By default, the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  saver = tf_saver.Saver(
      variables_to_restore or variables.get_variables_to_restore(),
      write_version=saver_pb2.SaverDef.V1)

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=None)

  logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                         time.gmtime()))
  with sv.managed_session(
      master, start_standard_services=False, config=session_config) as sess:
    saver.restore(sess, checkpoint_path)
    sv.start_queue_runners(sess)
    final_op_value = evaluation(sess,
                                num_evals=num_evals,
                                initial_op=initial_op,
                                initial_op_feed_dict=initial_op_feed_dict,
                                eval_op=eval_op,
                                eval_op_feed_dict=eval_op_feed_dict,
                                final_op=final_op,
                                final_op_feed_dict=final_op_feed_dict,
                                summary_op=summary_op,
                                summary_op_feed_dict=summary_op_feed_dict,
                                summary_writer=summary_writer,
                                global_step=global_step)

  logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                         time.gmtime()))

  return final_op_value
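evaluate_once is the single-shot counterpart. A hedged sketch, again assuming the same tf.contrib.slim era; the checkpoint path and the caller-built metric ops (`update_op`, `value_op`) are placeholders.

# Sketch only: evaluate one specific checkpoint exactly once.
import tensorflow as tf

slim = tf.contrib.slim

# update_op / value_op are caller-built metric ops, as in the previous sketch.
metric_value = slim.evaluation.evaluate_once(
    master='',
    checkpoint_path='/tmp/train_logs/model.ckpt-10000',  # placeholder path
    logdir='/tmp/eval_once_logs',                        # placeholder path
    num_evals=50,               # batches to run through update_op
    eval_op=update_op,
    final_op=value_op)          # its value is returned
tf.logging.info('final metric value: %s', metric_value)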
Example #4
    def _setup_training(self):
        """Sets up graph, model and trainer."""
        # Create config if not given.
        if self._config is None:
            self._config = RunConfig(verbose=self.verbose)
        # Create new graph.
        self._graph = ops.Graph()
        self._graph.add_to_collection("IS_TRAINING", True)
        with self._graph.as_default():
            random_seed.set_random_seed(self._config.tf_random_seed)
            self._global_step = variables.Variable(0,
                                                   name="global_step",
                                                   trainable=False)

            # Setting up inputs and outputs.
            self._inp, self._out = self._data_feeder.input_builder()

            # If class weights are provided, add them to the graph.
            # Different loss functions can use this tensor by name.
            if self.class_weight:
                self._class_weight_node = constant_op.constant(
                    self.class_weight, name='class_weight')

            # Add histograms for X and y if they are floats.
            if self._data_feeder.input_dtype in (np.float32, np.float64):
                logging_ops.histogram_summary("X", self._inp)
            if self._data_feeder.output_dtype in (np.float32, np.float64):
                logging_ops.histogram_summary("y", self._out)

            # Create model's graph.
            self._model_predictions, self._model_loss = self.model_fn(
                self._inp, self._out)

            # Set up a single operator to merge all the summaries
            self._summaries = logging_ops.merge_all_summaries()

            # Create trainer and augment graph with gradients and optimizer.
            # Additionally creates initialization ops.
            learning_rate = self.learning_rate
            optimizer = self.optimizer
            if callable(learning_rate):
                learning_rate = learning_rate(self._global_step)
            if callable(optimizer):
                optimizer = optimizer(learning_rate)
            self._train = optimizers.optimize_loss(
                self._model_loss,
                self._global_step,
                learning_rate=learning_rate,
                optimizer=optimizer,
                clip_gradients=self.clip_gradients)

            # Update ops during training, e.g. batch_norm_ops
            self._train = control_flow_ops.group(
                self._train, *ops.get_collection('update_ops'))

            # Get all initializers for all trainable variables.
            self._initializers = variables.initialize_all_variables()

            # Create model's saver capturing all the nodes created up until now.
            self._saver = train.Saver(
                max_to_keep=self._config.keep_checkpoint_max,
                keep_checkpoint_every_n_hours=(
                    self._config.keep_checkpoint_every_n_hours))

            # Enable monitor to create validation data dict with appropriate tf placeholders
            self._monitor.create_val_feed_dict(self._inp, self._out)

            # Create session to run model with.
            self._session = session.Session(self._config.tf_master,
                                            config=self._config.tf_config)

            # Run parameter initializers.
            self._session.run(self._initializers)
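In the setup above, `learning_rate` and `optimizer` may be plain values or callables: a learning-rate callable receives the global step tensor, and an optimizer callable receives the (possibly decayed) learning rate. A hedged sketch of those hooks; the decay constants and the estimator name are illustrative only.

# Sketch of the callable learning_rate / optimizer hooks accepted above.
import tensorflow as tf

def learning_rate_fn(global_step):
    # Called with the global step tensor; returns a learning-rate tensor.
    return tf.train.exponential_decay(learning_rate=0.1,
                                      global_step=global_step,
                                      decay_steps=1000,
                                      decay_rate=0.96,
                                      staircase=True)

def optimizer_fn(learning_rate):
    # Called with the (possibly decayed) learning rate tensor.
    return tf.train.MomentumOptimizer(learning_rate, momentum=0.9)

# Hypothetical wiring; the estimator class name is a stand-in.
# estimator = SomeEstimator(learning_rate=learning_rate_fn, optimizer=optimizer_fn)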
Example #5
    def _setup_training(self):
        """Sets up graph, model and trainer."""
        # Create config if not given.
        if self._config is None:
            self._config = RunConfig(verbose=self.verbose)
        # Create new graph.
        self._graph = ops.Graph()
        self._graph.add_to_collection("IS_TRAINING", True)
        with self._graph.as_default():
            random_seed.set_random_seed(self._config.tf_random_seed)
            self._global_step = variables.Variable(
                0, name="global_step", trainable=False)

            # Setting up inputs and outputs.
            self._inp, self._out = self._data_feeder.input_builder()

            # If class weights are provided, add them to the graph.
            # Different loss functions can use this tensor by name.
            if self.class_weight:
                self._class_weight_node = constant_op.constant(
                    self.class_weight, name='class_weight')

            # Add histograms for X and y if they are floats.
            if self._data_feeder.input_dtype in (np.float32, np.float64):
                logging_ops.histogram_summary("X", self._inp)
            if self._data_feeder.output_dtype in (np.float32, np.float64):
                logging_ops.histogram_summary("y", self._out)

            # Create model's graph.
            self._model_predictions, self._model_loss = self.model_fn(
                self._inp, self._out)

            # Create trainer and augment graph with gradients and optimizer.
            # Additionally creates initialization ops.
            learning_rate = self.learning_rate
            optimizer = self.optimizer
            if callable(learning_rate):
                learning_rate = learning_rate(self._global_step)
            if callable(optimizer):
                optimizer = optimizer(learning_rate)
            self._train = optimizers.optimize_loss(self._model_loss, self._global_step,
                learning_rate=learning_rate,
                optimizer=optimizer, clip_gradients=self.clip_gradients)

            # Update ops during training, e.g. batch_norm_ops
            self._train = control_flow_ops.group(self._train, *ops.get_collection('update_ops'))

            # Merge all summaries into single tensor.
            self._summaries = logging_ops.merge_all_summaries()
 
            # Get all initializers for all trainable variables.
            self._initializers = variables.initialize_all_variables()

            # Create model's saver capturing all the nodes created up until now.
            self._saver = train.Saver(
                max_to_keep=self._config.keep_checkpoint_max,
                keep_checkpoint_every_n_hours=self._config.keep_checkpoint_every_n_hours)

            # Enable monitor to create validation data dict with appropriate tf placeholders
            self._monitor.create_val_feed_dict(self._inp, self._out)

            # Create session to run model with.
            self._session = session.Session(self._config.tf_master, config=self._config.tf_config)

            # Run parameter initializers.
            self._session.run(self._initializers)
Example #6
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None):
  """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops.
      By default, the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: The maximum number of evaluation iterations to
      run. If left as `None`, the evaluation continues indefinitely.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=saver)

  last_checkpoint = None
  number_of_evaluations = 0
  while True:
    last_checkpoint = wait_for_new_checkpoint(checkpoint_dir, last_checkpoint)
    start = time.time()
    logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))

    with sv.managed_session(master, start_standard_services=False) as sess:
      sv.saver.restore(sess, last_checkpoint)
      sv.start_queue_runners(sess)
      evaluation(sess,
                 num_evals=num_evals,
                 eval_op=eval_op,
                 eval_op_feed_dict=eval_op_feed_dict,
                 final_op=final_op,
                 final_op_feed_dict=final_op_feed_dict,
                 summary_op=summary_op,
                 summary_op_feed_dict=summary_op_feed_dict,
                 summary_writer=summary_writer,
                 global_step=global_step)

    logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))
    number_of_evaluations += 1
    if (max_number_of_evaluations and
        number_of_evaluations >= max_number_of_evaluations):
      logging.info('Reached max_number_of_evaluations=%s. Exit',
                   max_number_of_evaluations)
      break

    time_to_next_eval = start + eval_interval_secs - time.time()
    if time_to_next_eval > 0:
      time.sleep(time_to_next_eval)
Example #7
def train(
    train_op,
    logdir,
    log_every_n_steps=1,
    graph=None,
    master='',
    is_chief=True,
    global_step=None,
    number_of_steps=None,
    init_op=_USE_DEFAULT,
    init_feed_dict=None,
    init_fn=None,
    summary_op=_USE_DEFAULT,
    save_summaries_secs=600,
    startup_delay_steps=0,
    saver=None,
    save_interval_secs=600,
    sync_optimizer=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, or if `number_of_steps` is
      negative.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if sync_optimizer and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  if global_step is None:
    global_step = variables.get_or_create_global_step()
  saver = saver or tf_saver.Saver()

  if init_op == _USE_DEFAULT:
    init_op = control_flow_ops.group(
        tf_variables.initialize_all_variables(),
        tf_variables.initialize_local_variables(),
        tf_variables.initialize_all_tables())

  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  local_init_op = None
  cleanup_op = None

  if is_chief and sync_optimizer:
    if not isinstance(sync_optimizer,
                      sync_replicas_optimizer.SyncReplicasOptimizer):
      raise ValueError(
          '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

    # Need to create these BEFORE the supervisor finalizes the graph:
    local_init_op = sync_optimizer.get_init_tokens_op()
    chief_queue_runner = sync_optimizer.get_chief_queue_runner()
    cleanup_op = sync_optimizer.get_clean_up_op()

  if number_of_steps:
    should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
  else:
    should_stop_op = constant_op.constant(False)

  should_log_op = math_ops.equal(
      math_ops.mod(global_step, log_every_n_steps), 0)

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      summary_op=summary_op,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  with sv.managed_session(master, start_standard_services=False) as sess:
    if is_chief:
      sv.start_standard_services(sess)
    elif not is_chief and startup_delay_steps > 0:
      _wait_for_step(sess, global_step,
                     min(startup_delay_steps, number_of_steps or sys.maxint))
    sv.start_queue_runners(sess)
    if is_chief and sync_optimizer:
      sv.start_queue_runners(sess, [chief_queue_runner])

    total_loss = train_loop(
        sv, sess, train_op, should_stop_op, should_log_op, global_step,
        cleanup_op)

    # This waits for service threads to finish.
    sv.Stop()

    if sv.is_chief:
      logging.info('Finished training! Saving model to disk.')
      sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

    return total_loss
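A hedged usage sketch of the training loop above, assuming the same tf.contrib.slim era; the model, its inputs, and the log directory are caller-supplied placeholders.

# Sketch only: drive the training loop above with a slim-built train_op.
import tensorflow as tf

slim = tf.contrib.slim

# logits = my_model(images)     # caller-supplied model outputs
# one_hot_labels = ...          # caller-supplied one-hot targets
loss = slim.losses.softmax_cross_entropy(logits, one_hot_labels)
total_loss = slim.losses.get_total_loss()

optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
train_op = slim.learning.create_train_op(total_loss, optimizer)

final_loss = slim.learning.train(
    train_op,
    logdir='/tmp/train_logs',   # placeholder: checkpoints and summaries go here
    number_of_steps=1000,       # stop after 1000 global steps
    save_summaries_secs=300,
    save_interval_secs=600)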
Example #8
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
    """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.initialize_all_variables()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.initialize_local_variables()` and `tf.initialize_all_tables()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    trace_every_n_steps: Produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = tf_variables.initialize_all_variables()

            if ready_op == _USE_DEFAULT:
                ready_op = tf_variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    tf_variables.initialize_local_variables(),
                    data_flow_ops.initialize_all_tables())

        if summary_op == _USE_DEFAULT:
            summary_op = logging_ops.merge_all_summaries()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        cleanup_op = None

        if is_chief and sync_optimizer is not None:
            if not isinstance(
                    sync_optimizer,
                (sync_replicas_optimizer.SyncReplicasOptimizer,
                 sync_replicas_optimizer.SyncReplicasOptimizerV2)):
                raise ValueError(
                    '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer or '
                    'tf.train.SyncReplicasOptimizerV2.')

            # Need to create these BEFORE the supervisor finalizes the graph:
            with ops.control_dependencies([init_op]):
                init_tokens_op = sync_optimizer.get_init_tokens_op()
            init_op = init_tokens_op
            chief_queue_runner = sync_optimizer.get_chief_queue_runner()
            if isinstance(sync_optimizer,
                          sync_replicas_optimizer.SyncReplicasOptimizer):
                cleanup_op = sync_optimizer.get_clean_up_op()

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    should_retry = True
    while should_retry:
        try:
            should_retry = False
            with sv.managed_session(master,
                                    start_standard_services=False,
                                    config=session_config) as sess:
                logging.info('Starting Session.')
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    _wait_for_step(
                        sess, global_step,
                        min(startup_delay_steps, number_of_steps
                            or sys.maxint))
                sv.start_queue_runners(sess)
                logging.info('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, [chief_queue_runner])
                try:
                    while not sv.should_stop():
                        total_loss, should_stop = train_step_fn(
                            sess, train_op, global_step, train_step_kwargs)
                        if should_stop:
                            logging.info('Stopping Training.')
                            break
                    if logdir and sv.is_chief:
                        logging.info(
                            'Finished training! Saving model to disk.')
                        sv.saver.save(sess,
                                      sv.save_path,
                                      global_step=sv.global_step)
                except:
                    if sv.is_chief and cleanup_op is not None:
                        logging.info('About to execute sync_clean_up_op!')
                        sess.run(cleanup_op)
                    raise

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one of the
            # distributed tensorflow servers.
            logging.info('Retrying training!')
            should_retry = True

    return total_loss
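The docstring above pins the train_step_fn contract to exactly four arguments. A hedged sketch of a custom step function that delegates to the stock slim train_step and adds per-step logging; the wrapper name is illustrative.

# Sketch of a custom train_step_fn honouring the four-argument contract
# (session, train_op, global_step, train_step_kwargs) described above.
import tensorflow as tf

slim = tf.contrib.slim

def logging_train_step(sess, train_op, global_step, train_step_kwargs):
    # Delegate to the stock slim step function, then add extra logging.
    total_loss, should_stop = slim.learning.train_step(
        sess, train_op, global_step, train_step_kwargs)
    tf.logging.info('loss this step: %f', total_loss)
    return total_loss, should_stop   # same (loss, should_stop) return contract

# slim.learning.train(train_op, logdir, train_step_fn=logging_train_step, ...)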
Example #9
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.initialize_all_variables()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.initialize_local_variables()` and `tf.initialize_all_tables()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, or if `number_of_steps` is
      negative.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if logdir is None:
    if summary_op != _USE_DEFAULT:
      raise ValueError('Cannot provide summary_op because logdir=None')
    if saver is not None:
      raise ValueError('Cannot provide saver because logdir=None')

  if sync_optimizer and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  with graph.as_default():
    if global_step is None:
      global_step = variables.get_or_create_global_step()
    saver = saver or tf_saver.Saver()

    if init_op == _USE_DEFAULT:
      init_op = tf_variables.initialize_all_variables()

    if ready_op == _USE_DEFAULT:
      ready_op = tf_variables.report_uninitialized_variables()

    if summary_op == _USE_DEFAULT:
      summary_op = logging_ops.merge_all_summaries()

    if summary_writer == _USE_DEFAULT:
      summary_writer = supervisor.Supervisor.USE_DEFAULT

    if local_init_op == _USE_DEFAULT:
      local_init_op = control_flow_ops.group(
          tf_variables.initialize_local_variables(),
          data_flow_ops.initialize_all_tables())

    cleanup_op = None

    if is_chief and sync_optimizer:
      if not isinstance(sync_optimizer,
                        sync_replicas_optimizer.SyncReplicasOptimizer):
        raise ValueError(
            '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

      # Need to create these BEFORE the supervisor finalizes the graph:
      with ops.control_dependencies([init_op]):
        init_tokens_op = sync_optimizer.get_init_tokens_op()
      init_op = init_tokens_op
      chief_queue_runner = sync_optimizer.get_chief_queue_runner()
      cleanup_op = sync_optimizer.get_clean_up_op()

    if train_step_kwargs == _USE_DEFAULT:
      train_step_kwargs = {}

      if number_of_steps:
        should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
      else:
        should_stop_op = constant_op.constant(False)
      train_step_kwargs['should_stop'] = should_stop_op
      train_step_kwargs['should_log'] = math_ops.equal(
          math_ops.mod(global_step, log_every_n_steps), 0)

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      ready_op=ready_op,
      summary_op=summary_op,
      summary_writer=summary_writer,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  should_retry = True
  while should_retry:
    try:
      should_retry = False
      with sv.managed_session(
          master, start_standard_services=False, config=session_config) as sess:
        logging.info('Starting Session.')
        if is_chief:
          if logdir:
            sv.start_standard_services(sess)
        elif startup_delay_steps > 0:
          _wait_for_step(sess, global_step,
                         min(startup_delay_steps,
                             number_of_steps or sys.maxint))
        sv.start_queue_runners(sess)
        logging.info('Starting Queues.')
        if is_chief and sync_optimizer:
          sv.start_queue_runners(sess, [chief_queue_runner])
        try:
          while not sv.should_stop():
            total_loss, should_stop = train_step_fn(
                sess, train_op, global_step, train_step_kwargs)
            if should_stop:
              logging.info('Stopping Training.')
              break
          if logdir and sv.is_chief:
            logging.info('Finished training! Saving model to disk.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
        except:
          if sv.is_chief and cleanup_op is not None:
            logging.info('About to execute sync_clean_up_op!')
            sess.run(cleanup_op)
          raise

    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      logging.info('Retrying training!')
      should_retry = True

  return total_loss
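This variant also accepts logdir=None for runs that should write nothing to disk; in that case summary_op and saver must be left at their defaults or the checks above raise ValueError. A minimal hedged sketch:

# Sketch only: run the loop above without writing checkpoints or summaries.
import tensorflow as tf

slim = tf.contrib.slim

# train_op is caller-supplied, e.g. from slim.learning.create_train_op(...).
final_loss = slim.learning.train(
    train_op,
    logdir=None,            # nothing is written to disk
    number_of_steps=500)    # illustrative step count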
Example #10
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=None,
          init_fn=None,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None):
    """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.initialize_all_variables()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If None,
      then the session is initialized by calling
      `tf.initialize_local_variables()` and `tf.initialize_all_tables()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, or if `number_of_steps` is
      negative.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if sync_optimizer and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

    if init_op == _USE_DEFAULT:
        init_op = tf_variables.initialize_all_variables()

    if summary_op == _USE_DEFAULT:
        summary_op = logging_ops.merge_all_summaries()

    cleanup_op = None

    if is_chief and sync_optimizer:
        if not isinstance(sync_optimizer,
                          sync_replicas_optimizer.SyncReplicasOptimizer):
            raise ValueError(
                '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

        # Need to create these BEFORE the supervisor finalizes the graph:
        with ops.control_dependencies([init_op]):
            init_tokens_op = sync_optimizer.get_init_tokens_op()
        init_op = init_tokens_op
        chief_queue_runner = sync_optimizer.get_chief_queue_runner()
        cleanup_op = sync_optimizer.get_clean_up_op()

    if train_step_kwargs == _USE_DEFAULT:
        train_step_kwargs = {}

        if number_of_steps:
            should_stop_op = math_ops.greater_equal(global_step,
                                                    number_of_steps)
        else:
            should_stop_op = constant_op.constant(False)
        train_step_kwargs['should_stop'] = should_stop_op
        train_step_kwargs['should_log'] = math_ops.equal(
            math_ops.mod(global_step, log_every_n_steps), 0)

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               summary_op=summary_op,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    with sv.managed_session(master, start_standard_services=False) as sess:
        if is_chief:
            sv.start_standard_services(sess)
        elif not is_chief and startup_delay_steps > 0:
            _wait_for_step(
                sess, global_step,
                min(startup_delay_steps, number_of_steps or sys.maxint))
        sv.start_queue_runners(sess)
        if is_chief and sync_optimizer:
            sv.start_queue_runners(sess, [chief_queue_runner])

        try:
            while not sv.should_stop():
                total_loss, should_stop = train_step_fn(
                    sess, train_op, global_step, train_step_kwargs)
                if should_stop:
                    break
        finally:
            if sv.is_chief and cleanup_op is not None:
                sess.run(cleanup_op)

        # This waits for service threads to finish.
        sv.Stop()

        if sv.is_chief:
            logging.info('Finished training! Saving model to disk.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

        return total_loss
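When sync_optimizer is supplied, the loop above applies gradient updates synchronously and starts the optimizer's chief queue runner on the chief. A hedged setup sketch; the replica count, task_index, and total_loss are placeholders, and older SyncReplicasOptimizer builds may require additional constructor arguments.

# Sketch only: synchronous replica training with the loop above.
import tensorflow as tf

slim = tf.contrib.slim

base_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
sync_optimizer = tf.train.SyncReplicasOptimizer(
    base_optimizer,
    replicas_to_aggregate=4)   # illustrative; some builds also need
                               # replica_id / total_num_replicas here

# total_loss is caller-supplied; the train_op must use the *wrapped* optimizer.
train_op = slim.learning.create_train_op(total_loss, sync_optimizer)

slim.learning.train(
    train_op,
    logdir='/tmp/sync_train_logs',   # placeholder
    is_chief=(task_index == 0),      # task_index comes from the cluster setup
    startup_delay_steps=0,           # must stay 0 with a sync optimizer
    sync_optimizer=sync_optimizer)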
Example #11
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    initial_op=None,
                    initial_op_feed_dict=None,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None,
                    session_config=None):
    """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    initial_op: An operation run at the beginning of evaluation.
    initial_op_feed_dict: A feed dictionary to use when executing `initial_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops.
      By default, the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: The maximum number of evaluation iterations to
      run. If left as `None`, the evaluation continues indefinitely.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
    if summary_op == _USE_DEFAULT:
        summary_op = logging_ops.merge_all_summaries()

    global_step = variables.get_or_create_global_step()

    saver = tf_saver.Saver(variables_to_restore
                           or variables.get_variables_to_restore())

    summary_writer = summary_io.SummaryWriter(logdir)

    sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                               logdir=logdir,
                               summary_op=None,
                               summary_writer=None,
                               global_step=None,
                               saver=saver)

    number_of_evaluations = 0
    for checkpoint_path in checkpoints_iterator(checkpoint_dir,
                                                eval_interval_secs):
        logging.info('Starting evaluation at ' +
                     time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

        with sv.managed_session(master,
                                start_standard_services=False,
                                config=session_config) as sess:
            sv.saver.restore(sess, checkpoint_path)
            sv.start_queue_runners(sess)
            final_op_value = evaluation(
                sess,
                num_evals=num_evals,
                initial_op=initial_op,
                initial_op_feed_dict=initial_op_feed_dict,
                eval_op=eval_op,
                eval_op_feed_dict=eval_op_feed_dict,
                final_op=final_op,
                final_op_feed_dict=final_op_feed_dict,
                summary_op=summary_op,
                summary_op_feed_dict=summary_op_feed_dict,
                summary_writer=summary_writer,
                global_step=global_step)

        logging.info('Finished evaluation at ' +
                     time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
        number_of_evaluations += 1
        if (max_number_of_evaluations
                and number_of_evaluations >= max_number_of_evaluations):
            logging.info('Reached max_number_of_evaluations=%s. Exit',
                         max_number_of_evaluations)
            break

    return final_op_value
Example #12
def evaluate_once(master,
                  checkpoint_path,
                  logdir,
                  num_evals=1,
                  eval_op=None,
                  eval_op_feed_dict=None,
                  final_op=None,
                  final_op_feed_dict=None,
                  summary_op=_USE_DEFAULT,
                  summary_op_feed_dict=None,
                  variables_to_restore=None,
                  session_config=None):
  """Evaluates the model at the given checkpoint path.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_path: The path to a checkpoint to use for evaluation.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops.
      By default, the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=None)

  logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                         time.gmtime()))
  with sv.managed_session(
      master, start_standard_services=False, config=session_config) as sess:
    saver.restore(sess, checkpoint_path)
    sv.start_queue_runners(sess)
    final_op_value = evaluation(sess,
                                num_evals=num_evals,
                                eval_op=eval_op,
                                eval_op_feed_dict=eval_op_feed_dict,
                                final_op=final_op,
                                final_op_feed_dict=final_op_feed_dict,
                                summary_op=summary_op,
                                summary_op_feed_dict=summary_op_feed_dict,
                                summary_writer=summary_writer,
                                global_step=global_step)

  logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                         time.gmtime()))

  return final_op_value
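A common use of the variables_to_restore argument above is evaluating with exponentially averaged weights. A hedged sketch; the decay value and paths are illustrative, and the moving-average helper returns a name-to-variable map, which tf.train.Saver accepts just like a list.

# Sketch only: evaluate a checkpoint using exponential-moving-average weights.
import tensorflow as tf

slim = tf.contrib.slim

variable_averages = tf.train.ExponentialMovingAverage(decay=0.999)
variables_to_restore = variable_averages.variables_to_restore()

slim.evaluation.evaluate_once(
    master='',
    checkpoint_path='/tmp/train_logs/model.ckpt-20000',  # placeholder path
    logdir='/tmp/eval_logs',                             # placeholder path
    num_evals=50,
    eval_op=update_op,                      # caller-supplied metric update op
    variables_to_restore=variables_to_restore)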