Пример #1
0
def main(args):
    if not os.path.exists(FLAGS.checkpoint):
        tf.logging.fatal(
            'Checkpoint %s does not exist. Have you download it? See tools/download_data.sh',
            FLAGS.checkpoint)
    g = tf.Graph()
    with g.as_default():
        input_image = PreprocessImage(FLAGS.image_path[0])

        with slim.arg_scope(inception.inception_v3_arg_scope()):
            logits, end_points = inception.inception_v3(
                input_image, num_classes=FLAGS.num_classes, is_training=False)

        bottleneck = end_points['PreLogits']
        init_op = control_flow_ops.group(
            variables.initialize_all_variables(),
            variables.initialize_local_variables(),
            data_flow_ops.initialize_all_tables())
        saver = tf_saver.Saver()
        sess = tf.Session()
        saver.restore(sess, FLAGS.checkpoint)

        # Run the evaluation on the image
        bottleneck_eval = np.squeeze(sess.run(bottleneck))

    first = True
    for val in bottleneck_eval:
        if not first:
            sys.stdout.write(",")
        first = False
        sys.stdout.write('{:.3f}'.format(val))
    sys.stdout.write('\n')
Пример #2
0
def prep_graph():
    global predictions
    global labelmap
    global label_dict
    global sess
    global input_image
    global food_list
    food_list = []
    with open(food_names) as f:
        for x in f:
            food_list.append(x.rstrip())
    g = tf.Graph()
    with g.as_default():
        input_image = tf.placeholder(tf.string)
        processed_image = PreprocessImage(input_image)
        with slim.arg_scope(inception.inception_v3_arg_scope()):
            logits, end_points = inception.inception_v3(processed_image,
                                                        num_classes=6012,
                                                        is_training=False)
        predictions = end_points['multi_predictions'] = tf.nn.sigmoid(
            logits, name='multi_predictions')
        init_op = control_flow_ops.group(
            variables.initialize_all_variables(),
            variables.initialize_local_variables(),
            data_flow_ops.initialize_all_tables())
        saver = tf_saver.Saver()
        sess = tf.Session()
        saver.restore(sess, checkpoint)
        labelmap, label_dict = LoadLabelMaps(6012, labelmap_file,
                                             label_dict_file)
Пример #3
0
def _get_local_init_op():
    local_init_op = _get_first_op_from_collection(ops.GraphKeys.LOCAL_INIT_OP)
    if local_init_op is None:
        op_list = [variables.initialize_local_variables(), data_flow_ops.initialize_all_tables()]
        if op_list:
            local_init_op = control_flow_ops.group(*op_list)
            ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
    return local_init_op
Пример #4
0
def _export_graph(graph, saver, checkpoint_path, export_dir,
                  default_graph_signature, named_graph_signatures,
                  exports_to_keep):
  """Exports graph via session_bundle, by creating a Session."""
  with graph.as_default():
    with tf_session.Session('') as session:
      variables.initialize_local_variables()
      data_flow_ops.initialize_all_tables()
      saver.restore(session, checkpoint_path)

      export = exporter.Exporter(saver)
      export.init(init_op=control_flow_ops.group(
          variables.initialize_local_variables(),
          data_flow_ops.initialize_all_tables()),
                  default_graph_signature=default_graph_signature,
                  named_graph_signatures=named_graph_signatures)
      export.export(export_dir, contrib_variables.get_global_step(), session,
                    exports_to_keep=exports_to_keep)
Пример #5
0
def _get_local_init_op():
  local_init_op = _get_first_op_from_collection(
      ops.GraphKeys.LOCAL_INIT_OP)
  if local_init_op is None:
    op_list = [variables.initialize_local_variables(),
               data_flow_ops.initialize_all_tables()]
    if op_list:
      local_init_op = control_flow_ops.group(*op_list)
      ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
  return local_init_op
Пример #6
0
def _assert_metrics(test_case, expected_loss, expected_eval_metrics,
                    model_fn_ops):
  test_case.assertAlmostEqual(expected_loss, model_fn_ops.loss.eval(), places=4)
  for k in six.iterkeys(expected_eval_metrics):
    test_case.assertIn(k, six.iterkeys(model_fn_ops.eval_metric_ops))
  variables.initialize_local_variables().run()
  for key, expected_value in six.iteritems(expected_eval_metrics):
    value_tensor, update_tensor = model_fn_ops.eval_metric_ops[key]
    update = update_tensor.eval()
    test_case.assertAlmostEqual(
        expected_value,
        update,
        places=4,
        msg="%s: update, expected %s, got %s." % (key, expected_value, update))
    value = value_tensor.eval()
    test_case.assertAlmostEqual(
        expected_value,
        value,
        places=4,
        msg="%s: value, expected %s, got %s." % (key, expected_value, value))
Пример #7
0
def _assert_metrics(test_case, expected_loss, expected_eval_metrics,
                    model_fn_ops):
  test_case.assertAlmostEqual(expected_loss, model_fn_ops.loss.eval(), places=4)
  for k in six.iterkeys(expected_eval_metrics):
    test_case.assertIn(k, six.iterkeys(model_fn_ops.eval_metric_ops))
  variables.initialize_local_variables().run()
  for key, expected_value in six.iteritems(expected_eval_metrics):
    value_tensor, update_tensor = model_fn_ops.eval_metric_ops[key]
    update = update_tensor.eval()
    test_case.assertAlmostEqual(
        expected_value,
        update,
        places=4,
        msg="%s: update, expected %s, got %s." % (key, expected_value, update))
    value = value_tensor.eval()
    test_case.assertAlmostEqual(
        expected_value,
        value,
        places=4,
        msg="%s: value, expected %s, got %s." % (key, expected_value, value))
Пример #8
0
  def run(self,
          num_batches=None,
          graph=None,
          session=None,
          start_queues=True,
          initialize_variables=True,
          **kwargs):
    """Builds and runs the columns of the `DataFrame` and yields batches.

    This is a generator that yields a dictionary mapping column names to
    evaluated columns.

    Args:
      num_batches: the maximum number of batches to produce. If none specified,
        the returned value will iterate through infinite batches.
      graph: the `Graph` in which the `DataFrame` should be built.
      session: the `Session` in which to run the columns of the `DataFrame`.
      start_queues: if true, queues will be started before running and halted
        after producting `n` batches.
      initialize_variables: if true, variables will be initialized.
      **kwargs: Additional keyword arguments e.g. `num_epochs`.

    Yields:
      A dictionary, mapping column names to the values resulting from running
      each column for a single batch.
    """
    if graph is None:
      graph = ops.get_default_graph()
    with graph.as_default():
      if session is None:
        session = sess.Session()
      self_built = self.build(**kwargs)
      keys = list(self_built.keys())
      cols = list(self_built.values())
      if initialize_variables:
        if variables.local_variables():
          session.run(variables.initialize_local_variables())
        if variables.all_variables():
          session.run(variables.initialize_all_variables())
      if start_queues:
        coord = coordinator.Coordinator()
        threads = qr.start_queue_runners(sess=session, coord=coord)
      i = 0
      while num_batches is None or i < num_batches:
        i += 1
        try:
          values = session.run(cols)
          yield collections.OrderedDict(zip(keys, values))
        except errors.OutOfRangeError:
          break
      if start_queues:
        coord.request_stop()
        coord.join(threads)
Пример #9
0
def label(image_path,
          checkpoint="openimages_dataset/data/2016_08/model.ckpt",
          num_classes=6012,
          labelmap_path="openimages_dataset/data/2016_08/labelmap.txt",
          dict_path="openimages_dataset/dict.csv",
          threshold=0.5,
          rounding_digits=1):
    if not os.path.exists(checkpoint):
        tf.logging.fatal(
            'Checkpoint %s does not exist. Have you download it? See tools/download_data.sh',
            checkpoint)
    g = tf.Graph()
    with g.as_default():
        input_image = PreprocessImage(image_path)

        with slim.arg_scope(inception.inception_v3_arg_scope()):
            logits, end_points = inception.inception_v3(
                input_image, num_classes=num_classes, is_training=False)

        predictions = end_points['multi_predictions'] = tf.nn.sigmoid(
            logits, name='multi_predictions')
        init_op = control_flow_ops.group(
            variables.initialize_all_variables(),
            variables.initialize_local_variables(),
            data_flow_ops.initialize_all_tables())
        saver = tf_saver.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        saver.restore(sess, checkpoint)

        # Run the evaluation on the image
        predictions_eval = np.squeeze(sess.run(predictions))

    # Print top(n) results
    labelmap, label_dict = LoadLabelMaps(num_classes, labelmap_path, dict_path)

    top_k = predictions_eval.argsort()[:][::-1]
    returned_labels = []
    for idx in top_k:
        mid = labelmap[idx]
        display_name = label_dict.get(mid, 'unknown')
        score = predictions_eval[idx]
        if score < threshold:
            if returned_labels:
                break
            else:
                threshold -= 0.1
                if threshold < 0.1:
                    break
        returned_labels.append((display_name, score))

    return returned_labels
Пример #10
0
def main_op():
  """Returns a main op to init variables and tables.

  Returns the main op including the group of ops that initializes all
  variables, initializes local variables and initialize all tables.

  Returns:
    The set of ops to be run as part of the main op upon the load operation.
  """
  init = tf_variables.initialize_all_variables()
  init_local = tf_variables.initialize_local_variables()
  init_tables = tf_data_flow_ops.initialize_all_tables()
  return tf.group(init, init_local, init_tables)
Пример #11
0
def run_feeds_iter(output_dict, feed_dicts, restore_checkpoint_path=None):
    """Run `output_dict` tensors with each input in `feed_dicts`.

  If `restore_checkpoint_path` is supplied, restore from checkpoint. Otherwise,
  init all variables.

  Args:
    output_dict: A `dict` mapping string names to `Tensor` objects to run.
      Tensors must all be from the same graph.
    feed_dicts: Iterable of `dict` objects of input values to feed.
    restore_checkpoint_path: A string containing the path to a checkpoint to
      restore.

  Yields:
    A sequence of dicts of values read from `output_dict` tensors, one item
    yielded for each item in `feed_dicts`. Keys are the same as `output_dict`,
    values are the results read from the corresponding `Tensor` in
    `output_dict`.

  Raises:
    ValueError: if `output_dict` or `feed_dicts` is None or empty.
  """
    if not output_dict:
        raise ValueError('output_dict is invalid: %s.' % output_dict)
    if not feed_dicts:
        raise ValueError('feed_dicts is invalid: %s.' % feed_dicts)

    graph = contrib_ops.get_graph_from_inputs(output_dict.values())
    with graph.as_default() as g:
        with tf_session.Session('') as session:
            session.run(
                resources.initialize_resources(resources.shared_resources() +
                                               resources.local_resources()))
            if restore_checkpoint_path:
                _restore_from_checkpoint(session, g, restore_checkpoint_path)
            else:
                session.run(variables.initialize_all_variables())
            session.run(variables.initialize_local_variables())
            session.run(data_flow_ops.initialize_all_tables())
            coord = coordinator.Coordinator()
            threads = None
            try:
                threads = queue_runner.start_queue_runners(session,
                                                           coord=coord)
                for f in feed_dicts:
                    yield session.run(output_dict, f)
            finally:
                coord.request_stop()
                if threads:
                    coord.join(threads, stop_grace_period_secs=120)
Пример #12
0
def main(args):
    if not os.path.exists(FLAGS.checkpoint):
        tf.logging.fatal(
            'Checkpoint %s does not exist. Have you download it? See tools/download_data.sh',
            FLAGS.checkpoint)
    g = tf.Graph()
    with g.as_default():
        input_image = tf.placeholder(tf.string)
        processed_image = PreprocessImage(input_image)

        with slim.arg_scope(inception.inception_v3_arg_scope()):
            logits, end_points = inception.inception_v3(
                processed_image,
                num_classes=FLAGS.num_classes,
                is_training=False)

        predictions = end_points['multi_predictions'] = tf.nn.sigmoid(
            logits, name='multi_predictions')
        init_op = control_flow_ops.group(
            variables.initialize_all_variables(),
            variables.initialize_local_variables(),
            data_flow_ops.initialize_all_tables())
        saver = tf_saver.Saver()
        sess = tf.Session()
        saver.restore(sess, FLAGS.checkpoint)

        # Run the evaluation on the images
        for image_path in FLAGS.image_path:
            if not os.path.exists(image_path):
                tf.logging.fatal('Input image does not exist %s',
                                 FLAGS.image_path[0])
            img_data = tf.gfile.FastGFile(image_path).read()
            print(image_path)
            predictions_eval = np.squeeze(
                sess.run(predictions, {input_image: img_data}))

            # Print top(n) results
            labelmap, label_dict = LoadLabelMaps(FLAGS.num_classes,
                                                 FLAGS.labelmap, FLAGS.dict)

            top_k = predictions_eval.argsort()[-FLAGS.n:][::-1]
            for idx in top_k:
                mid = labelmap[idx]
                display_name = label_dict.get(mid, 'unknown')
                score = predictions_eval[idx]
                print('{}: {} - {} (score = {:.2f})'.format(
                    idx, mid, display_name, score))
            print()
Пример #13
0
    def _init_local_init_op(self, local_init_op=USE_DEFAULT):
        """Initializes local_init_op.

    Args:
      local_init_op: `Operation` run for every new supervisor instance. If set
      to USE_DEFAULT create an op based on the `LOCAL_INITIALIZERS` graph
      collection.
    """
        if local_init_op is Supervisor.USE_DEFAULT:
            local_init_op = self._get_first_op_from_collection(ops.GraphKeys.LOCAL_INIT_OP)
            if local_init_op is None:
                op_list = [variables.initialize_local_variables(), data_flow_ops.initialize_all_tables()]
                if op_list:
                    local_init_op = control_flow_ops.group(*op_list)
                    ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
        self._local_init_op = local_init_op
Пример #14
0
def run_feeds_iter(output_dict, feed_dicts, restore_checkpoint_path=None):
  """Run `output_dict` tensors with each input in `feed_dicts`.

  If `restore_checkpoint_path` is supplied, restore from checkpoint. Otherwise,
  init all variables.

  Args:
    output_dict: A `dict` mapping string names to `Tensor` objects to run.
      Tensors must all be from the same graph.
    feed_dicts: Iterable of `dict` objects of input values to feed.
    restore_checkpoint_path: A string containing the path to a checkpoint to
      restore.

  Yields:
    A sequence of dicts of values read from `output_dict` tensors, one item
    yielded for each item in `feed_dicts`. Keys are the same as `output_dict`,
    values are the results read from the corresponding `Tensor` in
    `output_dict`.

  Raises:
    ValueError: if `output_dict` or `feed_dicts` is None or empty.
  """
  if not output_dict:
    raise ValueError('output_dict is invalid: %s.' % output_dict)
  if not feed_dicts:
    raise ValueError('feed_dicts is invalid: %s.' % feed_dicts)

  graph = contrib_ops.get_graph_from_inputs(output_dict.values())

  with graph.as_default() as g:
    with tf_session.Session('') as session:
      if restore_checkpoint_path:
        _restore_from_checkpoint(session, g, restore_checkpoint_path)
      else:
        session.run(variables.initialize_all_variables())
      session.run(variables.initialize_local_variables())
      session.run(data_flow_ops.initialize_all_tables())
      coord = coordinator.Coordinator()
      threads = None
      try:
        threads = queue_runner.start_queue_runners(session, coord=coord)
        for f in feed_dicts:
          yield session.run(output_dict, f)
      finally:
        coord.request_stop()
        if threads:
          coord.join(threads, stop_grace_period_secs=120)
Пример #15
0
    def _init_local_init_op(self, local_init_op=USE_DEFAULT):
        """Initializes local_init_op.

    Args:
      local_init_op: `Operation` run for every new supervisor instance. If set
      to USE_DEFAULT, use the first op from the GraphKeys.LOCAL_INIT_OP
      collection. If the collection is empty, create an op that initializes
      all local variables and all tables.
    """
        if local_init_op is Supervisor.USE_DEFAULT:
            local_init_op = self._get_first_op_from_collection(ops.GraphKeys.LOCAL_INIT_OP)
            if local_init_op is None:
                op_list = [variables.initialize_local_variables(), data_flow_ops.initialize_all_tables()]
                if op_list:
                    local_init_op = control_flow_ops.group(*op_list)
                    ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
        self._local_init_op = local_init_op
Пример #16
0
  def _init_local_init_op(self, local_init_op=USE_DEFAULT):
    """Initializes local_init_op.

    Args:
      local_init_op: `Operation` run for every new supervisor instance. If set
      to USE_DEFAULT create an op based on the `LOCAL_INITIALIZERS` graph
      collection.
    """
    if local_init_op is Supervisor.USE_DEFAULT:
      local_init_op = self._get_first_op_from_collection(
          ops.GraphKeys.LOCAL_INIT_OP)
      if local_init_op is None:
        op_list = [variables.initialize_local_variables(),
                   data_flow_ops.initialize_all_tables()]
        if op_list:
          local_init_op = control_flow_ops.group(*op_list)
          ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
    self._local_init_op = local_init_op
Пример #17
0
  def _init_local_init_op(self, local_init_op=USE_DEFAULT):
    """Initializes local_init_op.

    Args:
      local_init_op: `Operation` run for every new supervisor instance. If set
      to USE_DEFAULT, use the first op from the GraphKeys.LOCAL_INIT_OP
      collection. If the collection is empty, create an op that initializes
      all local variables and all tables.
    """
    if local_init_op is Supervisor.USE_DEFAULT:
      local_init_op = self._get_first_op_from_collection(
          ops.GraphKeys.LOCAL_INIT_OP)
      if local_init_op is None:
        op_list = [variables.initialize_local_variables(),
                   data_flow_ops.initialize_all_tables()]
        if op_list:
          local_init_op = control_flow_ops.group(*op_list)
          ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
    self._local_init_op = local_init_op
Пример #18
0
def train(
    train_op,
    logdir,
    log_every_n_steps=1,
    graph=None,
    master='',
    is_chief=True,
    global_step=None,
    number_of_steps=None,
    init_op=_USE_DEFAULT,
    init_feed_dict=None,
    init_fn=None,
    summary_op=_USE_DEFAULT,
    save_summaries_secs=600,
    startup_delay_steps=0,
    saver=None,
    save_interval_secs=600,
    sync_optimizer=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronous.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: the directory where training logs are written to.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step and logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If none, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, or if `number_of_steps` is
      negative.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if sync_optimizer and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  if global_step is None:
    global_step = variables.get_or_create_global_step()
  saver = saver or tf_saver.Saver()

  if init_op is None:
    init_op = control_flow_ops.group(
        tf_variables.initialize_all_variables(),
        tf_variables.initialize_local_variables(),
        tf_variables.initialize_all_tables())

  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  local_init_op = None
  cleanup_op = None

  if is_chief and sync_optimizer:
    if not isinstance(sync_optimizer,
                      sync_replicas_optimizer.SyncReplicasOptimizer):
      raise ValueError(
          '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

    # Need to create these BEFORE the supervisor finalizes the graph:
    local_init_op = sync_optimizer.get_init_tokens_op()
    chief_queue_runner = sync_optimizer.get_chief_queue_runner()
    cleanup_op = sync_optimizer.get_clean_up_op()

  if number_of_steps:
    # Need to subtract 1 since the check for greater/equality is done
    # concurrently with the increment of global_step.
    # TODO(nsilberman): add a dependency to ensure the order of operations.
    should_stop_op = math_ops.greater_equal(global_step, number_of_steps-1)
  else:
    should_stop_op = constant_op.constant(False)

  should_log_op = math_ops.equal(math_ops.mod(global_step, log_every_n_steps),
                                 0)

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      summary_op=summary_op,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  with sv.managed_session(master, start_standard_services=False) as sess:
    if is_chief:
      sv.start_standard_services(sess)
    elif not is_chief and startup_delay_steps > 0:
      _wait_for_step(sess, global_step,
                     min(startup_delay_steps, number_of_steps or sys.maxint))
    sv.start_queue_runners(sess)
    if is_chief and sync_optimizer:
      sv.start_queue_runners(sess, [chief_queue_runner])

    total_loss = train_loop(
        sv, sess, train_op, should_stop_op, should_log_op, global_step,
        cleanup_op)

    # This waits for service threads to finish.
    sv.Stop()

    if sv.is_chief:
      logging.info('Finished training! Saving model to disk.')
      sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

    return total_loss
Пример #19
0
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None):
  """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: A operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slims metric ops. By
      default the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: the max number of iterations of the evaluation.
      If the value is left as 'None', the evaluation continues indefinitely.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=saver)

  last_checkpoint = None
  number_of_evaluations = 0
  while True:
    last_checkpoint = wait_for_new_checkpoint(checkpoint_dir, last_checkpoint)
    start = time.time()
    logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))

    with sv.managed_session(master, start_standard_services=False) as sess:
      sv.saver.restore(sess, last_checkpoint)
      sv.start_queue_runners(sess)
      evaluation(sess,
                 num_evals=num_evals,
                 eval_op=eval_op,
                 eval_op_feed_dict=eval_op_feed_dict,
                 final_op=final_op,
                 final_op_feed_dict=final_op_feed_dict,
                 summary_op=summary_op,
                 summary_op_feed_dict=summary_op_feed_dict,
                 summary_writer=summary_writer,
                 global_step=global_step)

    logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))
    number_of_evaluations += 1
    if (max_number_of_evaluations and
        number_of_evaluations >= max_number_of_evaluations):
      logging.info('Reached max_number_of_evaluations=%s. Exit',
                   max_number_of_evaluations)
      break

    time_to_next_eval = start + eval_interval_secs - time.time()
    if time_to_next_eval > 0:
      time.sleep(time_to_next_eval)
Пример #20
0
def train(
    train_op,
    logdir,
    log_every_n_steps=1,
    graph=None,
    master='',
    is_chief=True,
    global_step=None,
    number_of_steps=None,
    init_op=_USE_DEFAULT,
    init_feed_dict=None,
    init_fn=None,
    summary_op=_USE_DEFAULT,
    save_summaries_secs=600,
    startup_delay_steps=0,
    saver=None,
    save_interval_secs=600,
    sync_optimizer=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronous.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: the directory where training logs are written to.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step and logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If none, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, or if `number_of_steps` is
      negative.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if sync_optimizer and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  if global_step is None:
    global_step = variables.get_or_create_global_step()
  saver = saver or tf_saver.Saver()

  if init_op is None:
    init_op = control_flow_ops.group(
        tf_variables.initialize_all_variables(),
        tf_variables.initialize_local_variables(),
        tf_variables.initialize_all_tables())

  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  local_init_op = None
  cleanup_op = None

  if is_chief and sync_optimizer:
    if not isinstance(sync_optimizer,
                      sync_replicas_optimizer.SyncReplicasOptimizer):
      raise ValueError(
          '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

    # Need to create these BEFORE the supervisor finalizes the graph:
    local_init_op = sync_optimizer.get_init_tokens_op()
    chief_queue_runner = sync_optimizer.get_chief_queue_runner()
    cleanup_op = sync_optimizer.get_clean_up_op()

  if number_of_steps:
    should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
  else:
    should_stop_op = constant_op.constant(False)

  should_log_op = math_ops.equal(
      math_ops.mod(global_step, log_every_n_steps), 0)

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      summary_op=summary_op,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  with sv.managed_session(master, start_standard_services=False) as sess:
    if is_chief:
      sv.start_standard_services(sess)
    elif not is_chief and startup_delay_steps > 0:
      _wait_for_step(sess, global_step,
                     min(startup_delay_steps, number_of_steps or sys.maxint))
    sv.start_queue_runners(sess)
    if is_chief and sync_optimizer:
      sv.start_queue_runners(sess, [chief_queue_runner])

    total_loss = train_loop(
        sv, sess, train_op, should_stop_op, should_log_op, global_step,
        cleanup_op)

    # This waits for service threads to finish.
    sv.Stop()

    if sv.is_chief:
      logging.info('Finished training! Saving model to disk.')
      sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

    return total_loss
Пример #21
0
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
    """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronous.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must have take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step and logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.initialize_all_variables()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.initialize_local_variables()` and `tf.initialize_all_tables()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = tf_variables.initialize_all_variables()

            if ready_op == _USE_DEFAULT:
                ready_op = tf_variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    tf_variables.initialize_local_variables(),
                    data_flow_ops.initialize_all_tables())

        if summary_op == _USE_DEFAULT:
            summary_op = logging_ops.merge_all_summaries()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        cleanup_op = None

        if is_chief and sync_optimizer is not None:
            if not isinstance(
                    sync_optimizer,
                (sync_replicas_optimizer.SyncReplicasOptimizer,
                 sync_replicas_optimizer.SyncReplicasOptimizerV2)):
                raise ValueError(
                    '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer or '
                    'tf.train.SyncReplicasOptimizerV2.')

            # Need to create these BEFORE the supervisor finalizes the graph:
            with ops.control_dependencies([init_op]):
                init_tokens_op = sync_optimizer.get_init_tokens_op()
            init_op = init_tokens_op
            chief_queue_runner = sync_optimizer.get_chief_queue_runner()
            if isinstance(sync_optimizer,
                          sync_replicas_optimizer.SyncReplicasOptimizer):
                cleanup_op = sync_optimizer.get_clean_up_op()

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    should_retry = True
    while should_retry:
        try:
            should_retry = False
            with sv.managed_session(master,
                                    start_standard_services=False,
                                    config=session_config) as sess:
                logging.info('Starting Session.')
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    _wait_for_step(
                        sess, global_step,
                        min(startup_delay_steps, number_of_steps
                            or sys.maxint))
                sv.start_queue_runners(sess)
                logging.info('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, [chief_queue_runner])
                try:
                    while not sv.should_stop():
                        total_loss, should_stop = train_step_fn(
                            sess, train_op, global_step, train_step_kwargs)
                        if should_stop:
                            logging.info('Stopping Training.')
                            break
                    if logdir and sv.is_chief:
                        logging.info(
                            'Finished training! Saving model to disk.')
                        sv.saver.save(sess,
                                      sv.save_path,
                                      global_step=sv.global_step)
                except:
                    if sv.is_chief and cleanup_op is not None:
                        logging.info('About to execute sync_clean_up_op!')
                        sess.run(cleanup_op)
                    raise

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one of the
            # distributed tensorflow servers.
            logging.info('Retrying training!')
            should_retry = True

    return total_loss
Пример #22
0
 def _default_local_init_op():
   return control_flow_ops.group(variables.initialize_local_variables(),
                                 data_flow_ops.initialize_all_tables())
Пример #23
0
def evaluate_once(checkpoint_path,
                  logdir,
                  master='',
                  num_evals=1,
                  eval_op=None,
                  eval_op_feed_dict=None,
                  final_op=None,
                  final_op_feed_dict=None,
                  summary_op=_USE_DEFAULT,
                  summary_op_feed_dict=None,
                  variables_to_restore=None,
                  session_config=None):
  """Evaluates the model at the given checkpoint path.

  Args:
    checkpoint_path: The path to a checkpoint to use for evaluation.
    logdir: The directory where the TensorFlow summaries are written to.
    master: The BNS address of the TensorFlow master.
    num_evals: The number of times to run `eval_op`.
    eval_op: A operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slims metric ops. By
      default the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=None)

  logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                         time.gmtime()))
  with sv.managed_session(
      master, start_standard_services=False, config=session_config) as sess:
    saver.restore(sess, checkpoint_path)
    sv.start_queue_runners(sess)
    final_op_value = evaluation(sess,
                                num_evals=num_evals,
                                eval_op=eval_op,
                                eval_op_feed_dict=eval_op_feed_dict,
                                final_op=final_op,
                                final_op_feed_dict=final_op_feed_dict,
                                summary_op=summary_op,
                                summary_op_feed_dict=summary_op_feed_dict,
                                summary_writer=summary_writer,
                                global_step=global_step)

  logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                         time.gmtime()))

  return final_op_value
Пример #24
0
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronous.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must have take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step and logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.initialize_all_variables()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.initialize_local_variables()` and `tf.initialize_all_tables()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, or if `number_of_steps` is
      negative.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if logdir is None:
    if summary_op != _USE_DEFAULT:
      raise ValueError('Cannot provide summary_op because logdir=None')
    if saver is not None:
      raise ValueError('Cannot provide saver because logdir=None')

  if sync_optimizer and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  with graph.as_default():
    if global_step is None:
      global_step = variables.get_or_create_global_step()
    saver = saver or tf_saver.Saver()

    if init_op == _USE_DEFAULT:
      init_op = tf_variables.initialize_all_variables()

    if ready_op == _USE_DEFAULT:
      ready_op = tf_variables.report_uninitialized_variables()

    if summary_op == _USE_DEFAULT:
      summary_op = logging_ops.merge_all_summaries()

    if summary_writer == _USE_DEFAULT:
      summary_writer = supervisor.Supervisor.USE_DEFAULT

    if local_init_op == _USE_DEFAULT:
      local_init_op = control_flow_ops.group(
          tf_variables.initialize_local_variables(),
          data_flow_ops.initialize_all_tables())

    cleanup_op = None

    if is_chief and sync_optimizer:
      if not isinstance(sync_optimizer,
                        sync_replicas_optimizer.SyncReplicasOptimizer):
        raise ValueError(
            '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

      # Need to create these BEFORE the supervisor finalizes the graph:
      with ops.control_dependencies([init_op]):
        init_tokens_op = sync_optimizer.get_init_tokens_op()
      init_op = init_tokens_op
      chief_queue_runner = sync_optimizer.get_chief_queue_runner()
      cleanup_op = sync_optimizer.get_clean_up_op()

    if train_step_kwargs == _USE_DEFAULT:
      train_step_kwargs = {}

      if number_of_steps:
        should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
      else:
        should_stop_op = constant_op.constant(False)
      train_step_kwargs['should_stop'] = should_stop_op
      train_step_kwargs['should_log'] = math_ops.equal(
          math_ops.mod(global_step, log_every_n_steps), 0)

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      ready_op=ready_op,
      summary_op=summary_op,
      summary_writer=summary_writer,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  should_retry = True
  while should_retry:
    try:
      should_retry = False
      with sv.managed_session(
          master, start_standard_services=False, config=session_config) as sess:
        logging.info('Starting Session.')
        if is_chief:
          if logdir:
            sv.start_standard_services(sess)
        elif startup_delay_steps > 0:
          _wait_for_step(sess, global_step,
                         min(startup_delay_steps,
                             number_of_steps or sys.maxint))
        sv.start_queue_runners(sess)
        logging.info('Starting Queues.')
        if is_chief and sync_optimizer:
          sv.start_queue_runners(sess, [chief_queue_runner])
        try:
          while not sv.should_stop():
            total_loss, should_stop = train_step_fn(
                sess, train_op, global_step, train_step_kwargs)
            if should_stop:
              logging.info('Stopping Training.')
              break
          if logdir and sv.is_chief:
            logging.info('Finished training! Saving model to disk.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
        except:
          if sv.is_chief and cleanup_op is not None:
            logging.info('About to execute sync_clean_up_op!')
            sess.run(cleanup_op)
          raise

    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      logging.info('Retrying training!')
      should_retry = True

  return total_loss
Пример #25
0
 def _default_local_init_op():
     return control_flow_ops.group(variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())
Пример #26
0
def evaluate_once(master,
                  checkpoint_path,
                  logdir,
                  num_evals=1,
                  eval_op=None,
                  eval_op_feed_dict=None,
                  final_op=None,
                  final_op_feed_dict=None,
                  summary_op=_USE_DEFAULT,
                  summary_op_feed_dict=None,
                  variables_to_restore=None,
                  session_config=None):
  """Evaluates the model at the given checkpoint path.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_path: The path to a checkpoint to use for evaluation.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: A operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slims metric ops. By
      default the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=None)

  logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                         time.gmtime()))
  with sv.managed_session(
      master, start_standard_services=False, config=session_config) as sess:
    saver.restore(sess, checkpoint_path)
    sv.start_queue_runners(sess)
    final_op_value = evaluation(sess,
                                num_evals=num_evals,
                                eval_op=eval_op,
                                eval_op_feed_dict=eval_op_feed_dict,
                                final_op=final_op,
                                final_op_feed_dict=final_op_feed_dict,
                                summary_op=summary_op,
                                summary_op_feed_dict=summary_op_feed_dict,
                                summary_writer=summary_writer,
                                global_step=global_step)

  logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                         time.gmtime()))

  return final_op_value