def test_save_steps(self):
        """Exercises `SummarySaverHook` configured with `save_steps=8`.

        The train op is run 30 times through a `_HookedSession`; afterwards the
        summary writer must hold `my_summary` entries under keys 1, 9, 17 and
        25 with values 1.0 through 4.0.
        """
        saver_hook = basic_session_run_hooks.SummarySaverHook(
            save_steps=8,
            summary_writer=self.summary_writer,
            summary_op=self.summary_op)

        with self.test_session() as sess:
            saver_hook.begin()
            sess.run(variables_lib.global_variables_initializer())
            hooked_sess = monitored_session._HookedSession(sess, [saver_hook])
            remaining_runs = 30
            while remaining_runs > 0:
                hooked_sess.run(self.train_op)
                remaining_runs -= 1
            saver_hook.end(sess)

        # Key -> summary values the fake writer is expected to have received.
        expected = {
            1: {'my_summary': 1.0},
            9: {'my_summary': 2.0},
            17: {'my_summary': 3.0},
            25: {'my_summary': 4.0},
        }
        self.summary_writer.assert_summaries(
            test_case=self,
            expected_logdir=self.log_dir,
            expected_summaries=expected)
Exemplo n.º 2
0
    def test_save_secs_saving_once_every_three_steps(self):
        """Exercises `SummarySaverHook` configured with `save_secs=0.9`.

        Each of the 8 train-op runs is followed by a real 0.3 second sleep, and
        the summary writer must end up with `my_summary` entries under keys 1,
        4 and 7.
        """
        saver_hook = basic_session_run_hooks.SummarySaverHook(
            save_secs=0.9,
            summary_writer=self.summary_writer,
            summary_op=self.summary_op)

        with self.test_session() as sess:
            saver_hook.begin()
            sess.run(variables_lib.global_variables_initializer())
            hooked_sess = monitored_session._HookedSession(sess, [saver_hook])
            remaining_runs = 8
            while remaining_runs > 0:
                hooked_sess.run(self.train_op)
                # Wall-clock pause so that three runs span the 0.9 s interval.
                time.sleep(0.3)
                remaining_runs -= 1
            saver_hook.end(sess)

        # Key -> summary values the fake writer is expected to have received.
        expected = {
            1: {'my_summary': 1.0},
            4: {'my_summary': 2.0},
            7: {'my_summary': 3.0},
        }
        self.summary_writer.assert_summaries(
            test_case=self,
            expected_logdir=self.log_dir,
            expected_summaries=expected)
Exemplo n.º 3
0
  def test_save_secs_saving_once_every_three_steps(self, mock_time):
    """Exercises `SummarySaverHook` with `save_secs=9.` under a mocked clock.

    `mock_time` starts at a fixed timestamp and is advanced by 3.1 after each
    of the 8 train-op runs, so no real waiting happens. The summary writer
    must end up with `my_summary` entries under keys 1, 4 and 7.

    Args:
      mock_time: Mock whose `return_value` stands in for the current time
        (presumably a patched `time.time`; the patch itself is not visible
        here).
    """
    mock_time.return_value = 1484695987.209386
    saver_hook = basic_session_run_hooks.SummarySaverHook(
        save_secs=9.,
        summary_writer=self.summary_writer,
        summary_op=self.summary_op)

    with self.test_session() as sess:
      saver_hook.begin()
      sess.run(variables_lib.global_variables_initializer())
      hooked_sess = monitored_session._HookedSession(sess, [saver_hook])
      remaining_runs = 8
      while remaining_runs > 0:
        hooked_sess.run(self.train_op)
        mock_time.return_value += 3.1
        remaining_runs -= 1
      saver_hook.end(sess)

    # The fake clock advanced 3.1 * 8 = 24.8 seconds in total; with a 9 second
    # interval counted from the first save, saves land on every third run.
    expected = {
        1: {'my_summary': 1.0},
        4: {'my_summary': 2.0},
        7: {'my_summary': 3.0},
    }
    self.summary_writer.assert_summaries(
        test_case=self,
        expected_logdir=self.log_dir,
        expected_summaries=expected)
Exemplo n.º 4
0
def MonitoredTrainingSession(
        master='',  # pylint: disable=invalid-name
        is_chief=True,
        checkpoint_dir=None,
        hooks=None,
        scaffold=None,
        config=None):
    """Creates a `MonitoredSession` for training.

    For a chief, this utility sets proper session initializer/restorer. It also
    creates hooks related to checkpoint and summary saving. For workers, this
    utility sets proper session creator which waits for the chief to
    initialize/restore.

    The `hooks` list passed by the caller is never modified; the default chief
    hooks are added to an internal copy.

    Args:
      master: `String` the TensorFlow master to use.
      is_chief: If `True`, it will take care of initialization and recovery the
        underlying TensorFlow session. If `False`, it will wait on a chief to
        initialize or recover the TensorFlow session.
      checkpoint_dir: A string.  Optional path to a directory where to restore
        variables. For a chief it is also handed to the default step-counter,
        summary and checkpoint hooks as their output directory.
      hooks: Optional list of `SessionRunHook` objects.
      scaffold: A `Scaffold` used for gathering or building supportive ops. If
        not specified, a default one is created. It's used to finalize the
        graph.
      config: `ConfigProto` proto used to configure the session.

    Returns:
      A `MonitoredSession` object.
    """
    # Copy instead of aliasing: extending the caller's list in place would make
    # the default hooks pile up if the same list is reused for another session.
    all_hooks = list(hooks) if hooks else []
    scaffold = scaffold or Scaffold()
    if not is_chief:
        session_creator = WorkerSessionCreator(scaffold=scaffold,
                                               master=master,
                                               config=config)
    else:
        session_creator = ChiefSessionCreator(scaffold=scaffold,
                                              checkpoint_dir=checkpoint_dir,
                                              master=master,
                                              config=config)
        # Default chief-only hooks, appended after the caller's hooks.
        all_hooks.extend([
            basic_session_run_hooks.StepCounterHook(output_dir=checkpoint_dir),
            basic_session_run_hooks.SummarySaverHook(
                scaffold=scaffold, output_dir=checkpoint_dir),
            basic_session_run_hooks.CheckpointSaverHook(checkpoint_dir,
                                                        save_secs=600,
                                                        scaffold=scaffold),
        ])

    return MonitoredSession(session_creator=session_creator, hooks=all_hooks)
Exemplo n.º 5
0
def _monitored_train(graph,
                     output_dir,
                     train_op,
                     loss_op,
                     global_step_tensor=None,
                     init_op=None,
                     init_feed_dict=None,
                     init_fn=None,
                     log_every_steps=10,
                     supervisor_is_chief=True,
                     supervisor_master='',
                     supervisor_save_model_secs=600,
                     supervisor_save_model_steps=None,
                     keep_checkpoint_max=5,
                     supervisor_save_summaries_secs=None,
                     supervisor_save_summaries_steps=100,
                     feed_fn=None,
                     steps=None,
                     fail_on_nan_loss=True,
                     hooks=None,
                     max_steps=None):
  """Train a model via monitored_session.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on the
  model. The `loss_op` represents the objective function of the training. The
  `global_step_tensor` is a scalar integer tensor counting training steps. This
  function uses a `MonitoredSession` built on a `Scaffold` to initialize the
  graph (with `output_dir` as the checkpoint directory for a chief), write
  summaries, and write regular checkpoints as defined by
  `supervisor_save_model_secs` / `supervisor_save_model_steps`.

  Training continues until the session reports `should_stop()`. When `steps` or
  `max_steps` is given a `StopAtStepHook` is installed; a `NanTensorHook` on
  `loss_op` is always installed and configured with `fail_on_nan_loss`.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is looked up in `graph` via `assert_or_get_global_step`.
    init_op: An op that initializes the graph. Passed to `Scaffold`.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      Passed to `Scaffold` together with `init_op`.
    init_fn: Optional callable passed to `Scaffold` to initialize the model.
    log_every_steps: Output logs regularly. The logs contain the current loss
      and step. A `0` or negative value disables logging.
    supervisor_is_chief: Whether the current process is the chief in charge of
      restoring the model and running the summary/checkpoint hooks.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save checkpoints every this many seconds. Can
      not be specified with `supervisor_save_model_steps`.
    supervisor_save_model_steps: Save checkpoints every this many steps. Can not
      be specified with `supervisor_save_model_secs`. If both are `None`, no
      `CheckpointSaverHook` is added.
    keep_checkpoint_max: The maximum number of recent checkpoint files to
      keep. As new files are created, older files are deleted. If None or 0,
      all checkpoint files are kept. This is simply passed as the max_to_keep
      arg to `tf.Saver` constructor.
    supervisor_save_summaries_secs: Save summaries every
      `supervisor_save_summaries_secs` seconds when training.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training. Both summary
      settings are forwarded unchanged to `SummarySaverHook`.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    hooks: List of `SessionRunHook` subclass instances. Used for callbacks
      inside the training loop. They run after the built-in hooks.
    max_steps: Number of total steps for which to train model. If `None`,
      train forever. Two calls fit(steps=100) means 200 training iterations.
      On the other hand two calls of fit(max_steps=100) means, second call
      will not do any iteration since first call did all 100 steps.

  Returns:
    The final loss value, or `None` when no training step was run (for
    example because the global step stored in `output_dir` already reached
    `max_steps`).

  Raises:
    ValueError: If `output_dir`, `train_op`, `loss_op`, or `global_step_tensor`
      is not provided. See `tf.contrib.framework.get_global_step` for how we
      look up the latter if not provided explicitly.
    ValueError: If `hooks` is given but is not a list.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
      evaluates to `NaN`.
    ValueError: If both `steps` and `max_steps` are not `None`.
  """
  if (steps is not None) and (max_steps is not None):
    raise ValueError('Can not provide both steps and max_steps.')
  if not output_dir:
    raise ValueError('Output directory should be non-empty %s.' % output_dir)
  if train_op is None:
    raise ValueError('Missing train_op.')
  if loss_op is None:
    raise ValueError('Missing loss_op.')
  if hooks is None:
    hooks = []
  if not isinstance(hooks, list):
    raise ValueError('Hooks should be a list.')
  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
  if global_step_tensor is None:
    raise ValueError('No "global_step" was provided or found in the graph.')

  if max_steps is not None:
    try:
      start_step = load_variable(output_dir, global_step_tensor.name)
      if max_steps <= start_step:
        logging.info('Skipping training since max_steps has already saved.')
        return None
    # Best effort: if the stored global step cannot be read, training simply
    # starts normally. `Exception` (not a bare `except`) keeps
    # KeyboardInterrupt and SystemExit propagating.
    except Exception:  # pylint: disable=broad-except
      pass

  # Adapted SessionRunHooks such as ExportMonitor depend on the
  # CheckpointSaverHook to be executed before they should be executed.
  # The `hooks` param comprises of deprecated monitor hooks
  # (such as ExportMonitor). Appending them after the basic_session_run_hooks.
  all_hooks = []
  with graph.as_default():
    all_hooks.append(basic_session_run_hooks.NanTensorHook(
        loss_op, fail_on_nan_loss=fail_on_nan_loss))
    if log_every_steps > 0:
      all_hooks.append(basic_session_run_hooks.LoggingTensorHook({
          'loss': loss_op.name,
          'step': global_step_tensor.name
      }, every_n_iter=log_every_steps))

    def make_saver():
      # Only invoked by `get_or_default` below, as the fallback factory for
      # the SAVERS collection.
      return tf_saver.Saver(
          sharded=True, max_to_keep=keep_checkpoint_max, defer_build=True,
          write_version=saver_pb2.SaverDef.V1)

    scaffold = monitored_session.Scaffold(
        init_op=init_op,
        init_feed_dict=init_feed_dict,
        init_fn=init_fn,
        saver=monitored_session.Scaffold.get_or_default('saver',
                                                        ops.GraphKeys.SAVERS,
                                                        make_saver))

    if not supervisor_is_chief:
      session_creator = monitored_session.WorkerSessionCreator(
          scaffold=scaffold,
          master=supervisor_master)
    else:
      session_creator = monitored_session.ChiefSessionCreator(
          scaffold=scaffold,
          checkpoint_dir=output_dir,
          master=supervisor_master)
      # Only the chief writes step counts, summaries and checkpoints.
      summary_writer = summary_io.SummaryWriterCache.get(output_dir)
      all_hooks.append(
          basic_session_run_hooks.StepCounterHook(
              summary_writer=summary_writer))
      all_hooks.append(
          basic_session_run_hooks.SummarySaverHook(
              save_secs=supervisor_save_summaries_secs,
              save_steps=supervisor_save_summaries_steps,
              summary_writer=summary_writer,
              scaffold=scaffold))
      if (supervisor_save_model_secs is not None
          or supervisor_save_model_steps is not None):
        all_hooks.append(
            basic_session_run_hooks.CheckpointSaverHook(
                output_dir,
                save_secs=supervisor_save_model_secs,
                save_steps=supervisor_save_model_steps,
                scaffold=scaffold))

    if steps is not None or max_steps is not None:
      all_hooks.append(basic_session_run_hooks.StopAtStepHook(steps, max_steps))
    all_hooks.extend(hooks)

    with monitored_session.MonitoredSession(
        session_creator=session_creator,
        hooks=all_hooks) as super_sess:
      loss = None
      while not super_sess.should_stop():
        _, loss = super_sess.run([train_op, loss_op], feed_fn() if feed_fn else
                                 None)
    summary_io.SummaryWriterCache.clear()
    return loss
 def test_raise_in_none_secs_and_steps(self):
     """The hook must reject `save_secs=None` together with `save_steps=None`."""
     hook_kwargs = {
         'save_secs': None,
         'save_steps': None,
         'summary_writer': self.summary_writer,
     }
     with self.assertRaises(ValueError):
         basic_session_run_hooks.SummarySaverHook(**hook_kwargs)
 def test_raise_when_scaffold_and_summary_op_both_present(self):
     """Passing both `scaffold` and `summary_op` must raise `ValueError`."""
     with self.assertRaises(ValueError):
         conflicting_kwargs = {
             'scaffold': monitored_session.Scaffold(),
             'summary_op': self.summary_op,
         }
         basic_session_run_hooks.SummarySaverHook(**conflicting_kwargs)
 def test_raise_when_scaffold_and_summary_op_both_missing(self):
     """Constructing the hook with no arguments must raise `ValueError`."""
     hook_cls = basic_session_run_hooks.SummarySaverHook
     # Callable form of assertRaises: invokes `hook_cls()` with no arguments.
     self.assertRaises(ValueError, hook_cls)
Exemplo n.º 9
0
def train(args):
    """Trains the CIFAR-10 model inside a `MonitoredSession`.

    Builds the input pipeline, model, loss and train op in a fresh graph, then
    runs the train op until the session asks to stop, logging throughput on
    every step whose global step value is a multiple of 10.

    Args:
        args: The command line arguments. The attributes read here are
            `data_dir`, `batch_size`, `use_fp16`, `train_dir`,
            `log_device_placement`, `checkpoint_interval_steps`,
            `summary_interval_steps` and `train_max_steps`.

    Raises:
        AssertionError: If the loss evaluates to NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.create_global_step()

        # Input pipeline -> logits -> loss -> train op.
        images, labels = cifar10.distorted_inputs(
            args.data_dir, args.batch_size, args.use_fp16)
        logits = cifar10.inference(images, args.batch_size, args.use_fp16)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step, args.batch_size)

        scaffold = monitored_session.Scaffold()
        session_creator = monitored_session.ChiefSessionCreator(
            scaffold,
            checkpoint_dir=args.train_dir,
            config=tf.ConfigProto(
                log_device_placement=args.log_device_placement))

        # Saves the model every `checkpoint_interval_steps` steps.
        checkpoint_hook = basic_session_run_hooks.CheckpointSaverHook(
            args.train_dir,
            checkpoint_basename=CHECKPOINT_BASENAME,
            save_steps=args.checkpoint_interval_steps,
            scaffold=scaffold)
        # Saves a summary every `summary_interval_steps` steps.
        summary_hook = basic_session_run_hooks.SummarySaverHook(
            save_steps=args.summary_interval_steps,
            output_dir=args.train_dir,
            scaffold=scaffold)
        # Requests a stop once step `train_max_steps` is reached.
        stop_hook = basic_session_run_hooks.StopAtStepHook(
            last_step=args.train_max_steps)
        hooks = [checkpoint_hook, summary_hook, stop_hook]

        with monitored_session.MonitoredSession(
                session_creator=session_creator, hooks=hooks) as sess:
            while not sess.should_stop():
                started_at = time.time()
                _, loss_value, global_step_value = sess.run(
                    [train_op, loss, global_step])
                duration = time.time() - started_at

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # Only report on every tenth global step.
                if global_step_value % 10 != 0:
                    continue

                num_examples_per_step = args.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                logging.info(
                    ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                     'sec/batch)'), datetime.now(), global_step_value,
                    loss_value, examples_per_sec, sec_per_batch)
Exemplo n.º 10
0
def MonitoredTrainingSession(
        master='',  # pylint: disable=invalid-name
        is_chief=True,
        checkpoint_dir=None,
        scaffold=None,
        hooks=None,
        chief_only_hooks=None,
        save_checkpoint_secs=600,
        save_summaries_steps=100,
        save_summaries_secs=None,
        config=None,
        stop_grace_period_secs=120,
        log_step_count_steps=100):
    """Builds a `MonitoredSession` configured for training.

    A chief gets a `ChiefSessionCreator` plus, when `checkpoint_dir` is set,
    default step-counter, summary-saver and checkpoint-saver hooks. A worker
    gets a `WorkerSessionCreator` and only the hooks passed in `hooks`.

    Args:
      master: `String` the TensorFlow master to use.
      is_chief: If `True`, it will take care of initialization and recovery the
        underlying TensorFlow session. If `False`, it will wait on a chief to
        initialize or recover the TensorFlow session.
      checkpoint_dir: A string.  Optional path to a directory where to restore
        variables. The default hooks are only created when it is set.
      scaffold: A `Scaffold` used for gathering or building supportive ops. If
        not specified, a default one is created. It's used to finalize the
        graph.
      hooks: Optional list of `SessionRunHook` objects.
      chief_only_hooks: list of `SessionRunHook` objects. Activate these hooks
        if `is_chief==True`, ignore otherwise.
      save_checkpoint_secs: The frequency, in seconds, that a checkpoint is
        saved using a default checkpoint saver. If `save_checkpoint_secs` is
        `None` (or not positive), the default checkpoint saver isn't used.
      save_summaries_steps: The frequency, in number of global steps, that the
        summaries are written to disk using a default summary saver. If neither
        `save_summaries_steps` nor `save_summaries_secs` is a positive value,
        the default summary saver isn't used.
      save_summaries_secs: The frequency, in secs, that the summaries are
        written to disk using a default summary saver. See
        `save_summaries_steps` for when the default summary saver is skipped.
      config: an instance of `tf.ConfigProto` proto used to configure the
        session. It's the `config` argument of constructor of `tf.Session`.
      stop_grace_period_secs: Number of seconds given to threads to stop after
        `close()` has been called.
      log_step_count_steps: The frequency, in number of global steps, that the
        global step/sec is logged.

    Returns:
      A `MonitoredSession` object.
    """
    scaffold = scaffold or Scaffold()

    if not is_chief:
        # Workers never receive the chief-only or default saver hooks.
        worker_creator = WorkerSessionCreator(
            scaffold=scaffold, master=master, config=config)
        return MonitoredSession(
            session_creator=worker_creator,
            hooks=hooks or [],
            stop_grace_period_secs=stop_grace_period_secs)

    # Hook order for the chief: chief-only hooks, default hooks, then `hooks`.
    chief_hooks = list(chief_only_hooks) if chief_only_hooks else []
    chief_creator = ChiefSessionCreator(
        scaffold=scaffold,
        checkpoint_dir=checkpoint_dir,
        master=master,
        config=config)

    if checkpoint_dir:
        step_counter = basic_session_run_hooks.StepCounterHook(
            output_dir=checkpoint_dir, every_n_steps=log_step_count_steps)
        chief_hooks.append(step_counter)

        # A summary saver is wanted if either interval is a positive number.
        wants_summaries = (
            (save_summaries_steps and save_summaries_steps > 0) or
            (save_summaries_secs and save_summaries_secs > 0))
        if wants_summaries:
            summary_saver = basic_session_run_hooks.SummarySaverHook(
                scaffold=scaffold,
                save_steps=save_summaries_steps,
                save_secs=save_summaries_secs,
                output_dir=checkpoint_dir)
            chief_hooks.append(summary_saver)

        if save_checkpoint_secs and save_checkpoint_secs > 0:
            checkpoint_saver = basic_session_run_hooks.CheckpointSaverHook(
                checkpoint_dir,
                save_secs=save_checkpoint_secs,
                scaffold=scaffold)
            chief_hooks.append(checkpoint_saver)

    if hooks:
        chief_hooks.extend(hooks)

    return MonitoredSession(
        session_creator=chief_creator,
        hooks=chief_hooks,
        stop_grace_period_secs=stop_grace_period_secs)
Exemplo n.º 11
0
def train(train_op,
          logdir,
          master='',
          is_chief=True,
          scaffold=None,
          hooks=None,
          chief_only_hooks=None,
          save_checkpoint_secs=600,
          save_summaries_steps=100,
          config=None):
    """Runs the training loop.

    The `hooks` list passed by the caller is never modified; chief-only and
    default hooks are added to an internal copy.

    Args:
      train_op: A `Tensor` that, when executed, will apply the gradients and
        return the loss value.
      logdir: The directory where the graph and checkpoints are saved.
      master: The URL of the master.
      is_chief: Specifies whether or not the training is being run by the
        primary replica during replica training.
      scaffold: An tf.train.Scaffold instance.
      hooks: List of `tf.train.SessionRunHook` callbacks which are run inside
        the training loop.
      chief_only_hooks: List of `tf.train.SessionRunHook` instances which are
        run inside the training loop for the chief trainer only.
      save_checkpoint_secs: The frequency, in seconds, that a checkpoint is
        saved using a default checkpoint saver. If `save_checkpoint_secs` is
        set to `None`, then the default checkpoint saver isn't used.
      save_summaries_steps: The frequency, in number of global steps, that the
        summaries are written to disk using a default summary saver. If
        `save_summaries_steps` is set to `None`, then the default summary saver
        isn't used.
      config: An instance of `tf.ConfigProto`.

    Returns:
      the value of the loss function after training, or `None` if no training
      step was run.

    Raises:
      ValueError: if this is the chief, `logdir` is `None`, and either
        `save_checkpoint_secs` or `save_summaries_steps` is set.
    """
    # TODO(nsilberman): move this logic into monitored_session.py
    scaffold = scaffold or monitored_session.Scaffold()

    # Copy instead of aliasing: appending to the caller's list would make the
    # default hooks accumulate across repeated `train` calls with one list.
    all_hooks = list(hooks) if hooks else []

    if is_chief:
        session_creator = monitored_session.ChiefSessionCreator(
            scaffold=scaffold,
            checkpoint_dir=logdir,
            master=master,
            config=config)

        if chief_only_hooks:
            all_hooks.extend(chief_only_hooks)

        all_hooks.append(
            basic_session_run_hooks.StepCounterHook(output_dir=logdir))

        if save_summaries_steps:
            if logdir is None:
                raise ValueError(
                    'logdir cannot be None when save_summaries_steps is set')
            all_hooks.append(
                basic_session_run_hooks.SummarySaverHook(
                    scaffold=scaffold,
                    save_steps=save_summaries_steps,
                    output_dir=logdir))

        if save_checkpoint_secs:
            if logdir is None:
                raise ValueError(
                    'logdir cannot be None when save_checkpoint_secs is set')
            all_hooks.append(
                basic_session_run_hooks.CheckpointSaverHook(
                    logdir, save_secs=save_checkpoint_secs, scaffold=scaffold))
    else:
        session_creator = monitored_session.WorkerSessionCreator(
            scaffold=scaffold, master=master, config=config)

    with monitored_session.MonitoredSession(session_creator=session_creator,
                                            hooks=all_hooks) as session:
        loss = None
        while not session.should_stop():
            loss = session.run(train_op)
    return loss
Exemplo n.º 12
0
def PartialRestoreSession(
        master='',  # pylint: disable=invalid-name
        is_chief=True,
        checkpoint_dir=None,
        restore_var_list=None,
        scaffold=None,
        hooks=None,
        chief_only_hooks=None,
        save_checkpoint_secs=600,
        save_summaries_steps=monitored_session.USE_DEFAULT,
        save_summaries_secs=monitored_session.USE_DEFAULT,
        config=None,
        stop_grace_period_secs=120,
        log_step_count_steps=100):
    """Creates a `MonitoredSession` for training.

    Supports partial restoration from checkpoints with parameter
    `restore_var_list`, by adding `CheckpointRestorerHook`.

    For a chief, this utility sets proper session initializer/restorer. It also
    creates hooks related to checkpoint and summary saving. For workers, this
    utility sets proper session creator which waits for the chief to
    initialize/restore. Please check `tf.train.MonitoredSession` for more
    information.

    Args:
      master: `String` the TensorFlow master to use.
      is_chief: If `True`, it will take care of initialization and recovery the
        underlying TensorFlow session. If `False`, it will wait on a chief to
        initialize or recover the TensorFlow session.
      checkpoint_dir: A string.  Optional path to a directory where to restore
        variables.
      restore_var_list: a list of variables, optional, if not all variables
        should be recovered from checkpoint. Useful when changing network
        structures during training, i.e., finetuning a pretrained model with
        new layers. When given, the session creator is built without a
        checkpoint directory and a `CheckpointRestorerHook` restores from
        `checkpoint_dir` instead.
      scaffold: A `Scaffold` used for gathering or building supportive ops. If
        not specified, a default one is created. It's used to finalize the
        graph.
      hooks: Optional list of `SessionRunHook` objects.
      chief_only_hooks: list of `SessionRunHook` objects. Activate these hooks
        if `is_chief==True`, ignore otherwise.
      save_checkpoint_secs: The frequency, in seconds, that a checkpoint is
        saved using a default checkpoint saver. If `save_checkpoint_secs` is
        set to `None`, then the default checkpoint saver isn't used.
      save_summaries_steps: The frequency, in number of global steps, that the
        summaries are written to disk using a default summary saver. If both
        `save_summaries_steps` and `save_summaries_secs` are set to `None`,
        then the default summary saver isn't used. Default 100.
      save_summaries_secs: The frequency, in secs, that the summaries are
        written to disk using a default summary saver.  If both
        `save_summaries_steps` and `save_summaries_secs` are set to `None`,
        then the default summary saver isn't used. Default not enabled.
      config: an instance of `tf.ConfigProto` proto used to configure the
        session. It's the `config` argument of constructor of `tf.Session`.
      stop_grace_period_secs: Number of seconds given to threads to stop after
        `close()` has been called.
      log_step_count_steps: The frequency, in number of global steps, that the
        global step/sec is logged.

    Returns:
      A `MonitoredSession` object.
    """
    # Resolve the USE_DEFAULT sentinels: with neither given, save every 100
    # steps; with only one given, the other is disabled.
    if (save_summaries_steps == monitored_session.USE_DEFAULT
            and save_summaries_secs == monitored_session.USE_DEFAULT):
        save_summaries_steps = 100
        save_summaries_secs = None
    elif save_summaries_secs == monitored_session.USE_DEFAULT:
        save_summaries_secs = None
    elif save_summaries_steps == monitored_session.USE_DEFAULT:
        save_summaries_steps = None

    scaffold = scaffold or monitored_session.Scaffold()
    if not is_chief:
        session_creator = monitored_session.WorkerSessionCreator(
            scaffold=scaffold, master=master, config=config)
        return monitored_session.MonitoredSession(
            session_creator=session_creator,
            hooks=hooks or [],
            stop_grace_period_secs=stop_grace_period_secs)

    all_hooks = []
    if chief_only_hooks:
        all_hooks.extend(chief_only_hooks)
    if restore_var_list is None:
        restore_checkpoint_dir = checkpoint_dir
    else:
        # Partial restore: keep the session creator from restoring everything
        # and let the restorer hook load only the requested variables.
        restore_checkpoint_dir = None
        all_hooks.append(
            CheckpointRestorerHook(checkpoint_dir, var_list=restore_var_list))
        all_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        # Materialize as a list: on Python 3 `filter` is a lazy iterator and
        # would be logged as "<filter object ...>" instead of the variables.
        missing_vars = [v for v in all_vars if v not in restore_var_list]
        logging.warning("MonitoredTrainingSession not restoring %s",
                        missing_vars)
    session_creator = monitored_session.ChiefSessionCreator(
        scaffold=scaffold,
        checkpoint_dir=restore_checkpoint_dir,
        master=master,
        config=config)

    if checkpoint_dir:
        all_hooks.append(
            basic_session_run_hooks.StepCounterHook(
                output_dir=checkpoint_dir, every_n_steps=log_step_count_steps))

        if (save_summaries_steps
                and save_summaries_steps > 0) or (save_summaries_secs
                                                  and save_summaries_secs > 0):
            all_hooks.append(
                basic_session_run_hooks.SummarySaverHook(
                    scaffold=scaffold,
                    save_steps=save_summaries_steps,
                    save_secs=save_summaries_secs,
                    output_dir=checkpoint_dir))
        if save_checkpoint_secs and save_checkpoint_secs > 0:
            all_hooks.append(
                basic_session_run_hooks.CheckpointSaverHook(
                    checkpoint_dir,
                    save_secs=save_checkpoint_secs,
                    scaffold=scaffold))

    if hooks:
        all_hooks.extend(hooks)
    return monitored_session.MonitoredSession(
        session_creator=session_creator,
        hooks=all_hooks,
        stop_grace_period_secs=stop_grace_period_secs)
Exemplo n.º 13
0
def train(args):
    """Build the multi-tower CIFAR-10 training graph and run the training loop.

    One model tower is built per GPU index in `range(args.num_gpus)`. The
    per-tower gradients are combined by `average_gradients`, applied by a
    single gradient-descent optimizer, and the resulting train op is run
    inside a `MonitoredSession` equipped with checkpoint, summary and
    stop-at-step hooks.

    Args:
      args: The command line arguments. The attributes read here are
        `batch_size`, `num_gpus`, `train_dir`, `log_device_placement`,
        `checkpoint_interval_steps`, `summary_interval_steps` and
        `train_max_steps`; `args` is also forwarded to `tower_loss`.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Step counter shared by the learning-rate schedule, the optimizer
        # and the moving averages.
        global_step = tf.contrib.framework.create_global_step()

        # Number of training steps between two learning-rate decays.
        batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                             args.batch_size)
        decay_steps = int(batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

        # Staircase exponential decay driven by the global step.
        learning_rate = tf.train.exponential_decay(
            cifar10.INITIAL_LEARNING_RATE,
            global_step,
            decay_steps,
            cifar10.LEARNING_RATE_DECAY_FACTOR,
            staircase=True)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)

        # Build one tower per GPU and collect its gradients. `loss` and
        # `summaries` are rebound on every iteration, so after the loop they
        # refer to the final tower only.
        per_tower_grads = []
        for gpu_index in xrange(args.num_gpus):
            device_name = '/gpu:%d' % gpu_index
            tower_name = '%s_%d' % (cifar10.TOWER_NAME, gpu_index)
            with tf.device(device_name), tf.name_scope(tower_name) as scope:
                loss = tower_loss(scope, args)

                # Later towers reuse the variables created so far.
                tf.get_variable_scope().reuse_variables()

                summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

                per_tower_grads.append(optimizer.compute_gradients(loss))

        # Synchronization point across all towers.
        averaged_grads = average_gradients(per_tower_grads)

        # Learning-rate scalar plus one histogram per available gradient.
        summaries.append(tf.summary.scalar('learning_rate', learning_rate))
        summaries.extend(
            tf.summary.histogram(variable.op.name + '/gradients', gradient)
            for gradient, variable in averaged_grads
            if gradient is not None)

        # Apply the combined gradients to the shared variables.
        apply_gradient_op = optimizer.apply_gradients(
            averaged_grads, global_step=global_step)

        # One histogram per trainable variable.
        summaries.extend(
            tf.summary.histogram(variable.op.name, variable)
            for variable in tf.trainable_variables())

        # Moving averages of all trainable variables. For why the scope is
        # re-entered with reuse=False, see:
        # https://github.com/carpedm20/DCGAN-tensorflow/issues/59
        with tf.variable_scope(tf.get_variable_scope(), reuse=False):
            moving_averages = tf.train.ExponentialMovingAverage(
                cifar10.MOVING_AVERAGE_DECAY, global_step)
            moving_averages_op = moving_averages.apply(
                tf.trainable_variables())

        # A single op that performs both updates.
        train_op = tf.group(apply_gradient_op, moving_averages_op)

        # The merged summary op is handed to the hooks through the scaffold.
        scaffold = monitored_session.Scaffold(
            summary_op=tf.summary.merge(summaries))

        # allow_soft_placement must be set to True to build towers on GPU, as
        # some of the ops do not have GPU implementations.
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=args.log_device_placement)
        session_creator = monitored_session.ChiefSessionCreator(
            scaffold,
            checkpoint_dir=args.train_dir,
            config=session_config)

        # Saves the model every `checkpoint_interval_steps` steps.
        checkpoint_hook = basic_session_run_hooks.CheckpointSaverHook(
            args.train_dir,
            checkpoint_basename=CHECKPOINT_BASENAME,
            save_steps=args.checkpoint_interval_steps,
            scaffold=scaffold)
        # Saves a summary every `summary_interval_steps` steps.
        summary_hook = basic_session_run_hooks.SummarySaverHook(
            save_steps=args.summary_interval_steps,
            output_dir=args.train_dir,
            scaffold=scaffold)
        # Requests a stop once `train_max_steps` is reached.
        stop_hook = basic_session_run_hooks.StopAtStepHook(
            last_step=args.train_max_steps)
        hooks = [checkpoint_hook, summary_hook, stop_hook]

        with monitored_session.MonitoredSession(
                session_creator=session_creator, hooks=hooks) as sess:
            while not sess.should_stop():
                step_started = time.time()
                _, loss_value, global_step_value = sess.run(
                    [train_op, loss, global_step])
                duration = time.time() - step_started

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # Progress is reported only on every tenth global step.
                if global_step_value % 10 != 0:
                    continue

                examples_per_sec = (args.batch_size * args.num_gpus /
                                    duration)
                sec_per_batch = duration / args.num_gpus

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                logging.info(format_str %
                             (datetime.now(), global_step_value, loss_value,
                              examples_per_sec, sec_per_batch))
Exemplo n.º 14
0
def MonitoredTrainingSession(master='',
                             is_chief=True,
                             checkpoint_dir=None,
                             scaffold=None,
                             hooks=None,
                             chief_only_hooks=None,
                             save_checkpoint_secs=USE_DEFAULT,
                             save_summaries_steps=USE_DEFAULT,
                             save_summaries_secs=USE_DEFAULT,
                             config=None,
                             stop_grace_period_secs=120,
                             log_step_count_steps=100,
                             save_checkpoint_steps=USE_DEFAULT,
                             summary_dir=None):
    """Create a `MonitoredSession` with summary and checkpoint hooks attached.

    Hooks are installed in this order: `chief_only_hooks` (only when
    `is_chief`), a `SummarySaverHook`, a `CheckpointSaverHook`, then `hooks`.

    Args:
      master: Forwarded to `ChiefSessionCreator`.
      is_chief: When false, `chief_only_hooks` are not installed.
      checkpoint_dir: Forwarded to `ChiefSessionCreator`, used as the
        directory of the `CheckpointSaverHook`, and used as the fallback for
        `summary_dir`.
      scaffold: Scaffold shared by the session creator and the hooks; a new
        `Scaffold()` is created when this is falsy.
      hooks: Optional hooks appended after the built-in ones.
      chief_only_hooks: Optional hooks installed first, only if `is_chief`.
      save_checkpoint_secs: Checkpoint interval in seconds. Becomes 600 when
        both checkpoint intervals are `USE_DEFAULT`, and `None` when only
        this one is.
      save_summaries_steps: Summary interval in steps. Becomes 100 when both
        summary intervals are `USE_DEFAULT`, and `None` when only this one is.
      save_summaries_secs: Summary interval in seconds. `USE_DEFAULT` is
        replaced by `None`.
      config: Forwarded to `ChiefSessionCreator`.
      stop_grace_period_secs: Forwarded to `MonitoredSession`.
      log_step_count_steps: Accepted but not read anywhere in this function.
      save_checkpoint_steps: Checkpoint interval in steps. `USE_DEFAULT` is
        replaced by `None`.
      summary_dir: Output directory of the `SummarySaverHook`; falls back to
        `checkpoint_dir` when falsy.

    Returns:
      The `MonitoredSession` built from the session creator and the hooks.
    """

    def _enabled(interval):
        # An interval counts only when it is set and strictly positive.
        return interval and interval > 0

    # Summary cadence: step-based saving every 100 steps is the default when
    # the caller specified neither interval; otherwise an unspecified
    # interval is simply switched off.
    if save_summaries_secs == USE_DEFAULT:
        if save_summaries_steps == USE_DEFAULT:
            save_summaries_steps = 100
        save_summaries_secs = None
    elif save_summaries_steps == USE_DEFAULT:
        save_summaries_steps = None

    # Checkpoint cadence: time-based saving every 600 seconds is the default
    # when the caller specified neither interval.
    if save_checkpoint_steps == USE_DEFAULT:
        if save_checkpoint_secs == USE_DEFAULT:
            save_checkpoint_secs = 600
        save_checkpoint_steps = None
    elif save_checkpoint_secs == USE_DEFAULT:
        save_checkpoint_secs = None

    if not scaffold:
        scaffold = Scaffold()

    if is_chief and chief_only_hooks:
        all_hooks = list(chief_only_hooks)
    else:
        all_hooks = []

    session_creator = ChiefSessionCreator(
        scaffold=scaffold,
        checkpoint_dir=checkpoint_dir,
        master=master,
        config=config)

    # Summaries are written next to the checkpoints unless a dedicated
    # directory was given.
    if not summary_dir:
        summary_dir = checkpoint_dir

    if summary_dir and (_enabled(save_summaries_steps)
                        or _enabled(save_summaries_secs)):
        summary_hook = basic_session_run_hooks.SummarySaverHook(
            scaffold=scaffold,
            save_steps=save_summaries_steps,
            save_secs=save_summaries_secs,
            output_dir=summary_dir)
        all_hooks.append(summary_hook)

    if checkpoint_dir and (_enabled(save_checkpoint_secs)
                           or _enabled(save_checkpoint_steps)):
        checkpoint_hook = basic_session_run_hooks.CheckpointSaverHook(
            checkpoint_dir,
            save_steps=save_checkpoint_steps,
            save_secs=save_checkpoint_secs,
            scaffold=scaffold)
        all_hooks.append(checkpoint_hook)

    if hooks:
        all_hooks.extend(hooks)

    hvd_info_rank0('all hooks {}'.format(all_hooks))
    return MonitoredSession(
        session_creator=session_creator,
        hooks=all_hooks,
        stop_grace_period_secs=stop_grace_period_secs)