def testCanAchieveZeroLoss(self):
    """Verifies that training drives the log loss to (near) zero."""
    logdir = os.path.join(self.get_temp_dir(), 'can_achieve_zero_loss')

    with ops.Graph().as_default():
        random_seed.set_random_seed(0)
        inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
        labels = constant_op.constant(self._labels, dtype=dtypes.float32)

        predictions = logistic_classifier(inputs)
        # Registers the loss in the losses collection; get_total_loss()
        # picks it up below.
        loss_ops.log_loss(predictions, labels)
        total_loss = loss_ops.get_total_loss()

        optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
        train_op = training.create_train_op(total_loss, optimizer)

        final_loss = training.train(
            train_op,
            logdir,
            hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=300)])
        self.assertIsNotNone(final_loss)
        self.assertLess(final_loss, .015)
# Example 2
    def _train_model(self, checkpoint_dir, num_steps):
        """Trains a simple classification model.

        Note that the data has been configured such that after around 300
        steps the model has memorized the dataset (i.e. we can expect 100%
        accuracy).

        Args:
          checkpoint_dir: The directory where the checkpoint is written to.
          num_steps: The number of steps to train for.
        """
        with ops.Graph().as_default():
            random_seed.set_random_seed(0)
            inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
            labels = constant_op.constant(self._labels, dtype=dtypes.float32)

            predictions = logistic_classifier(inputs)
            loss = loss_ops.log_loss(labels, predictions)

            optimizer = gradient_descent.GradientDescentOptimizer(
                learning_rate=1.0)
            train_op = training.create_train_op(loss, optimizer)

            # Runs until the StopAtStepHook fires; the final loss returned by
            # training.train is intentionally unused here.
            training.train(
                train_op,
                checkpoint_dir,
                hooks=[basic_session_run_hooks.StopAtStepHook(num_steps)])
# Example 3
  def test_stop_based_on_num_step(self):
    """StopAtStepHook(num_steps=N) stops N steps after session creation.

    The hook reads the global step in `after_create_session` (here 5), so
    the stop threshold becomes 5 + 10 = 15 regardless of how the global
    step jumps afterwards.
    """
    h = basic_session_run_hooks.StopAtStepHook(num_steps=10)

    with ops.Graph().as_default():
      global_step = variables.get_or_create_global_step()
      no_op = control_flow_ops.no_op()
      h.begin()
      with session_lib.Session() as sess:
        mon_sess = monitored_session._HookedSession(sess, [h])
        # Set the starting global step BEFORE the hook snapshots it.
        sess.run(state_ops.assign(global_step, 5))
        h.after_create_session(sess, None)
        mon_sess.run(no_op)
        self.assertFalse(mon_sess.should_stop())
        sess.run(state_ops.assign(global_step, 13))
        mon_sess.run(no_op)
        self.assertFalse(mon_sess.should_stop())
        sess.run(state_ops.assign(global_step, 14))
        mon_sess.run(no_op)
        self.assertFalse(mon_sess.should_stop())
        # Threshold (5 + 10 = 15) reached: the hook requests a stop.
        sess.run(state_ops.assign(global_step, 15))
        mon_sess.run(no_op)
        self.assertTrue(mon_sess.should_stop())
        # Even after forcibly clearing the stop flag, the next run re-asserts
        # the stop because the global step is already past the threshold.
        sess.run(state_ops.assign(global_step, 16))
        mon_sess._should_stop = False
        mon_sess.run(no_op)
        self.assertTrue(mon_sess.should_stop())
# Example 4
    def testTrainWithLocalVariable(self):
        """Training succeeds when the model depends on a local variable."""
        with ops.Graph().as_default():
            random_seed.set_random_seed(0)
            inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
            labels = constant_op.constant(self._labels, dtype=dtypes.float32)

            # A local (non-global) variable participating in the forward pass;
            # the training loop must still initialize it.
            local_multiplier = variables_lib.local_variable(1.0)
            predictions = logistic_classifier(inputs) * local_multiplier

            losses.log_loss(labels, predictions)
            total_loss = losses.get_total_loss()
            optimizer = gradient_descent.GradientDescentOptimizer(
                learning_rate=1.0)
            train_op = training.create_train_op(total_loss, optimizer)

            final_loss = training.train(
                train_op,
                None,
                hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=300)],
                save_summaries_steps=None,
                save_checkpoint_secs=None)
            self.assertIsNotNone(final_loss)
            self.assertLess(final_loss, .015)
    def testResumeTrainAchievesRoughlyTheSameLoss(self):
        """Resumed runs restore from checkpoints and keep the loss low.

        Trains for 300 steps, then "resumes" twice (1 and 5 extra steps);
        each resumed run restores from the checkpoints written into the
        shared logdir, so the loss must stay below the threshold.
        """
        number_of_steps = [300, 1, 5]
        logdir = os.path.join(self.get_temp_dir(), 'resume_train_same_loss')

        # enumerate() instead of range(len(...)): i doubles as the seed so
        # each resumed run builds a differently initialized graph; the
        # checkpoint restore overrides the fresh initialization.
        for i, steps in enumerate(number_of_steps):
            with ops.Graph().as_default():
                random_seed.set_random_seed(i)
                tf_inputs = constant_op.constant(self._inputs,
                                                 dtype=dtypes.float32)
                tf_labels = constant_op.constant(self._labels,
                                                 dtype=dtypes.float32)

                tf_predictions = logistic_classifier(tf_inputs)
                tf_loss_ops.log_loss = None  # placeholder removed below
                loss_ops.log_loss(tf_predictions, tf_labels)
                total_loss = loss_ops.get_total_loss()

                optimizer = gradient_descent.GradientDescentOptimizer(
                    learning_rate=1.0)

                train_op = training.create_train_op(total_loss, optimizer)

                saver = saver_lib.Saver()

                loss = training.train(
                    train_op,
                    logdir,
                    hooks=[
                        basic_session_run_hooks.StopAtStepHook(num_steps=steps),
                        basic_session_run_hooks.CheckpointSaverHook(
                            logdir, save_steps=50, saver=saver),
                    ])
                self.assertIsNotNone(loss)
                self.assertLess(loss, .015)
# Example 6
    def testTrainWithAlteredGradients(self):
        """Scaling gradients is equivalent to scaling the learning rate.

        Trains two models with the same learning rate but different gradient
        multipliers; the model with the larger effective learning rate
        (learning_rate * gradient_multiplier) must reach a lower loss.
        """
        multipliers = [1., 1000.]
        number_of_steps = 10
        learning_rate = 0.001

        def run_once(multiplier):
            # Builds a fresh graph and trains for `number_of_steps` steps.
            with ops.Graph().as_default():
                random_seed.set_random_seed(0)
                train_op = self.create_train_op(
                    learning_rate=learning_rate,
                    gradient_multiplier=multiplier)
                return training.train(
                    train_op,
                    None,
                    hooks=[
                        basic_session_run_hooks.StopAtStepHook(
                            num_steps=number_of_steps),
                    ],
                    save_checkpoint_secs=None,
                    save_summaries_steps=None)

        # First, the equivalently smaller learning rate: loss stays high.
        loss0 = run_once(multipliers[0])
        self.assertIsNotNone(loss0)
        self.assertGreater(loss0, .5)

        # Second, the equivalently larger learning rate: loss drops below .5.
        loss1 = run_once(multipliers[1])
        self.assertIsNotNone(loss1)
        self.assertLess(loss1, .5)

        # The larger effective learning rate must win.
        self.assertGreater(loss0, loss1)
# Example 7
    def _test_multiple_steps_helper(self, get_hooks_fn_fn):
        """Checks gan_train with multiple sub-steps per joint train step."""
        train_ops = self._gan_train_ops(
            generator_add=10, discriminator_add=100)
        train_steps = namedtuples.GANTrainSteps(
            generator_train_steps=3, discriminator_train_steps=4)
        final_step = train.gan_train(
            train_ops,
            get_hooks_fn=get_hooks_fn_fn(train_steps),
            logdir='',
            hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=1)])

        # One joint step = 3 generator runs (+10 each) plus 4 discriminator
        # runs (+100 each), starting from global step 1.
        self.assertTrue(np.isscalar(final_step))
        self.assertEqual(1 + 3 * 10 + 4 * 100, final_step)
# Example 8
    def _test_run_helper(self, create_gan_model_fn):
        """Trains a GAN model for two steps and checks the final step count."""
        random_seed.set_random_seed(1234)
        model = create_gan_model_fn()
        loss = train.gan_loss(model)

        gen_opt = gradient_descent.GradientDescentOptimizer(1.0)
        dis_opt = gradient_descent.GradientDescentOptimizer(1.0)
        train_ops = train.gan_train_ops(model, loss, gen_opt, dis_opt)

        final_step = train.gan_train(
            train_ops,
            logdir='',
            hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=2)])
        # gan_train returns the final global step as a plain scalar.
        self.assertTrue(np.isscalar(final_step))
        self.assertEqual(2, final_step)
# Example 9
    def test_patchgan(self, create_gan_model_fn):
        """Ensure that patch-based discriminators work end-to-end."""
        random_seed.set_random_seed(1234)
        model = create_gan_model_fn()
        loss = train.gan_loss(model)

        gen_opt = gradient_descent.GradientDescentOptimizer(1.0)
        dis_opt = gradient_descent.GradientDescentOptimizer(1.0)
        train_ops = train.gan_train_ops(model, loss, gen_opt, dis_opt)

        # Two training steps are enough to confirm the graph executes.
        final_step = train.gan_train(
            train_ops,
            logdir='',
            hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=2)])
        self.assertTrue(np.isscalar(final_step))
        self.assertEqual(2, final_step)
# Example 10
  def testTrainWithNoInitAssignCanAchieveZeroLoss(self):
    """A batchnorm classifier trains to low loss without an init assign."""
    graph = ops.Graph()
    with graph.as_default():
      random_seed.set_random_seed(0)
      inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
      labels = constant_op.constant(self._labels, dtype=dtypes.float32)

      predictions = batchnorm_classifier(inputs)
      # Registers the loss in the losses collection for get_total_loss().
      loss_ops.log_loss(predictions, labels)
      total_loss = loss_ops.get_total_loss()

      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
      train_op = training.create_train_op(total_loss, optimizer)

      final_loss = training.train(
          train_op,
          self._logdir,
          hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=300)])
      self.assertLess(final_loss, .1)
# Example 11
    def testMonitoredSessionStopAtStepHook(self):
        """Runs two steps on an IPU device inside a MonitoredSession.

        StopAtStepHook(num_steps=2) permits exactly the two sess.run calls
        made below; the test only checks the computed sums.
        """
        random_seed.set_random_seed(1)

        with ops.device("/device:IPU:0"):
            pa = array_ops.placeholder(np.float32, [2, 2], name="a")
            pb = array_ops.placeholder(np.float32, [2, 2], name="b")
            output = pa + pb

        # StopAtStepHook requires a global step tensor in the graph.
        with variable_scope.variable_scope('gs', use_resource=True):
            training_util.create_global_step()

        hook = basic_session_run_hooks.StopAtStepHook(num_steps=2)

        with ms.MonitoredSession(session_creator=ms.ChiefSessionCreator(),
                                 hooks=[hook]) as sess:

            fd = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
            result = sess.run(output, fd)
            self.assertAllClose(result, [[1., 2.], [6., 8.]])

            fd = {pa: [[0., 0.], [1., 1.]], pb: [[2., 1.], [4., 5.]]}
            result = sess.run(output, fd)
            self.assertAllClose(result, [[2., 1.], [5., 6.]])
# Example 12
    def _train_model(self, checkpoint_dir, num_steps):
        """Trains a simple classification model.

    Note that the data has been configured such that after around 300 steps,
    the model has memorized the dataset (e.g. we can expect 100% accuracy).

    Args:
      checkpoint_dir: The directory where the checkpoint is written to.
      num_steps: The number of steps to train for.
    """
        with ops.Graph().as_default():
            random_seed.set_random_seed(0)
            tf_inputs = constant_op.constant(self._inputs,
                                             dtype=dtypes.float32)
            tf_labels = constant_op.constant(self._labels,
                                             dtype=dtypes.float32)

            tf_predictions = logistic_classifier(tf_inputs)
            loss_op = losses.log_loss(labels=tf_labels,
                                      predictions=tf_predictions)

            optimizer = gradient_descent.GradientDescentOptimizer(
                learning_rate=1.0)
            train_op = optimizer.minimize(loss_op,
                                          training.get_or_create_global_step())

            # StopAtStepHook ends the loop after `num_steps` steps; the
            # MonitoredTrainingSession also writes checkpoints to
            # checkpoint_dir with its default hooks.
            with monitored_session.MonitoredTrainingSession(
                    checkpoint_dir=checkpoint_dir,
                    hooks=[basic_session_run_hooks.StopAtStepHook(num_steps)
                           ]) as session:
                loss = None
                while not session.should_stop():
                    _, loss = session.run([train_op, loss_op])

                # NOTE(review): plain `assert` is stripped under `python -O`;
                # acceptable in a test helper, but a raise would be sturdier.
                if num_steps >= 300:
                    assert loss < .015
# Example 13
  def testTrainWithInitFromCheckpoint(self):
    """Training can warm-start from a checkpoint via a Scaffold init_fn."""
    logdir1 = os.path.join(self.get_temp_dir(), 'tmp_logs1/')
    logdir2 = os.path.join(self.get_temp_dir(), 'tmp_logs2/')

    if gfile.Exists(logdir1):  # For running on jenkins.
      gfile.DeleteRecursively(logdir1)
    if gfile.Exists(logdir2):  # For running on jenkins.
      gfile.DeleteRecursively(logdir2)

    # First, train the model one step (make sure the error is high).
    with ops.Graph().as_default():
      random_seed.set_random_seed(0)
      train_op = self.create_train_op()
      saver = saver_lib.Saver()
      loss = training.train(
          train_op,
          logdir1,
          hooks=[
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir1, save_steps=1, saver=saver),
              basic_session_run_hooks.StopAtStepHook(num_steps=1),
          ],
          save_checkpoint_secs=None)
      self.assertGreater(loss, .5)

    # Next, train the model to convergence.
    with ops.Graph().as_default():
      random_seed.set_random_seed(1)
      train_op = self.create_train_op()
      saver = saver_lib.Saver()
      loss = training.train(
          train_op,
          logdir1,
          hooks=[
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir1, save_steps=1, saver=saver),
              basic_session_run_hooks.StopAtStepHook(num_steps=300),
          ],
          save_checkpoint_secs=None)
      self.assertIsNotNone(loss)
      self.assertLess(loss, .02)

    # Finally, advance the model a single step and validate that the loss is
    # still low.
    with ops.Graph().as_default():
      random_seed.set_random_seed(2)
      train_op = self.create_train_op()

      model_variables = variables_lib2.global_variables()
      # The step-300 checkpoint written by the convergence run above.
      model_path = os.path.join(logdir1, 'model.ckpt-300')

      assign_fn = variables_lib.assign_from_checkpoint_fn(model_path,
                                                          model_variables)

      # Scaffold's init_fn receives (scaffold, session); only the session is
      # needed to restore the variables.
      def init_fn(_, session):
        assign_fn(session)

      loss = training.train(
          train_op,
          logdir2,
          scaffold=monitored_session.Scaffold(init_fn=init_fn),
          hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=1)])

      self.assertIsNotNone(loss)
      self.assertLess(loss, .02)
# Example 14
def _monitored_train(graph,
                     output_dir,
                     train_op,
                     loss_op,
                     global_step_tensor=None,
                     init_op=None,
                     init_feed_dict=None,
                     init_fn=None,
                     log_every_steps=10,
                     supervisor_is_chief=True,
                     supervisor_master='',
                     supervisor_save_model_secs=600,
                     supervisor_save_model_steps=None,
                     keep_checkpoint_max=5,
                     supervisor_save_summaries_secs=None,
                     supervisor_save_summaries_steps=100,
                     feed_fn=None,
                     steps=None,
                     fail_on_nan_loss=True,
                     hooks=None,
                     max_steps=None):
  """Train a model via monitored_session.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on the
  model. The `loss_op` represents the objective function of the training. It is
  expected to increment the `global_step_tensor`, a scalar integer tensor
  counting training steps. This function uses `Supervisor` to initialize the
  graph (from a checkpoint if one is available in `output_dir`), write summaries
  defined in the graph, and write regular checkpoints as defined by
  `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or, if
  `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the
  program is terminated with exit code 1.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is extracted from the graph using the same logic as in `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      This feed dictionary will be used when `init_op` is evaluated.
    init_fn: Optional callable passed to Supervisor to initialize the model.
    log_every_steps: Output logs regularly. The logs contain timing data and the
      current loss. A `0` or negative value disables logging.
    supervisor_is_chief: Whether the current process is the chief supervisor in
      charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save checkpoints every this many seconds. Can
        not be specified with `supervisor_save_model_steps`.
    supervisor_save_model_steps: Save checkpoints every this many steps. Can not
        be specified with `supervisor_save_model_secs`.
    keep_checkpoint_max: The maximum number of recent checkpoint files to
      keep. As new files are created, older files are deleted. If None or 0,
      all checkpoint files are kept. This is simply passed as the max_to_keep
      arg to `tf.Saver` constructor.
    supervisor_save_summaries_secs: Save summaries every
      `supervisor_save_summaries_secs` seconds when training.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training. Exactly one of
      `supervisor_save_model_steps` and `supervisor_save_model_secs` should be
      specified, and the other should be None.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    hooks: List of `SessionRunHook` subclass instances. Used for callbacks
      inside the training loop.
    max_steps: Number of total steps for which to train model. If `None`,
      train forever. Two calls fit(steps=100) means 200 training iterations.
      On the other hand two calls of fit(max_steps=100) means, second call
      will not do any iteration since first call did all 100 steps.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `output_dir`, `train_op`, `loss_op`, or `global_step_tensor`
      is not provided. See `tf.contrib.framework.get_global_step` for how we
      look up the latter if not provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
      evaluates to `NaN`.
    ValueError: If both `steps` and `max_steps` are not `None`.
  """
  # ---- Argument validation -------------------------------------------------
  if (steps is not None) and (max_steps is not None):
    raise ValueError('Can not provide both steps and max_steps.')
  if not output_dir:
    raise ValueError('Output directory should be non-empty %s.' % output_dir)
  if train_op is None:
    raise ValueError('Missing train_op.')
  if loss_op is None:
    raise ValueError('Missing loss_op.')
  if hooks is None:
    hooks = []
  if not isinstance(hooks, list):
    raise ValueError('Hooks should be a list.')
  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
  if global_step_tensor is None:
    raise ValueError('No "global_step" was provided or found in the graph.')

  # If a previous run already saved a checkpoint at or beyond max_steps,
  # there is nothing left to do.
  if max_steps is not None:
    try:
      start_step = load_variable(output_dir, global_step_tensor.name)
      if max_steps <= start_step:
        logging.info('Skipping training since max_steps has already saved.')
        return None
    except:  # pylint: disable=bare-except
      # No readable checkpoint yet — fall through and train from scratch.
      pass

  # Adapted SessionRunHooks such as ExportMonitor depend on the
  # CheckpointSaverHook to be executed before they should be executed.
  # The `hooks` param comprises of deprecated monitor hooks
  # (such as ExportMonitor). Appending them after the basic_session_run_hooks.
  all_hooks = []
  with graph.as_default():
    all_hooks.append(basic_session_run_hooks.NanTensorHook(
        loss_op, fail_on_nan_loss=fail_on_nan_loss))
    if log_every_steps > 0:
      all_hooks.append(basic_session_run_hooks.LoggingTensorHook({
          'loss': loss_op.name,
          'step': global_step_tensor.name
      }, every_n_iter=log_every_steps))

    # V1-format sharded saver; build is deferred so the Scaffold can
    # finalize the graph first.
    def make_saver():
      return tf_saver.Saver(
          sharded=True, max_to_keep=keep_checkpoint_max, defer_build=True,
          write_version=saver_pb2.SaverDef.V1)

    scaffold = monitored_session.Scaffold(
        init_op=init_op,
        init_feed_dict=init_feed_dict,
        init_fn=init_fn,
        saver=monitored_session.Scaffold.get_or_default('saver',
                                                        ops.GraphKeys.SAVERS,
                                                        make_saver))

    # Only the chief writes summaries and checkpoints; workers just attach
    # to the master and wait for the chief to prepare the session.
    if not supervisor_is_chief:
      session_creator = monitored_session.WorkerSessionCreator(
          scaffold=scaffold,
          master=supervisor_master)
    else:
      session_creator = monitored_session.ChiefSessionCreator(
          scaffold=scaffold,
          checkpoint_dir=output_dir,
          master=supervisor_master)
      summary_writer = summary_io.SummaryWriterCache.get(output_dir)
      all_hooks.append(
          basic_session_run_hooks.StepCounterHook(
              summary_writer=summary_writer))
      all_hooks.append(
          basic_session_run_hooks.SummarySaverHook(
              save_secs=supervisor_save_summaries_secs,
              save_steps=supervisor_save_summaries_steps,
              summary_writer=summary_writer,
              scaffold=scaffold))
      if (supervisor_save_model_secs is not None
          or supervisor_save_model_steps is not None):
        all_hooks.append(
            basic_session_run_hooks.CheckpointSaverHook(
                output_dir,
                save_secs=supervisor_save_model_secs,
                save_steps=supervisor_save_model_steps,
                scaffold=scaffold))

    if steps is not None or max_steps is not None:
      all_hooks.append(basic_session_run_hooks.StopAtStepHook(steps, max_steps))
    # User-supplied (deprecated monitor) hooks run after the built-in ones.
    all_hooks.extend(hooks)

    with monitored_session.MonitoredSession(
        session_creator=session_creator,
        hooks=all_hooks) as super_sess:
      loss = None
      while not super_sess.should_stop():
        _, loss = super_sess.run([train_op, loss_op], feed_fn() if feed_fn else
                                 None)
    summary_io.SummaryWriterCache.clear()
    return loss
# Example 15
  def testTrainAllVarsHasLowerLossThanTrainSubsetOfVars(self):
    """Training all variables beats training only weights or only biases.

    Three runs share one logdir, so each later run resumes from the
    previous run's checkpoint (saved every step by CheckpointSaverHook).
    """
    logdir = os.path.join(self.get_temp_dir(), 'tmp_logs3/')
    if gfile.Exists(logdir):  # For running on jenkins.
      gfile.DeleteRecursively(logdir)

    # First, train only the weights of the model.
    with ops.Graph().as_default():
      random_seed.set_random_seed(0)
      total_loss = self.ModelLoss()
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
      weights = variables_lib.get_variables_by_name('weights')

      train_op = training.create_train_op(
          total_loss, optimizer, variables_to_train=weights)

      saver = saver_lib.Saver()
      loss = training.train(
          train_op,
          logdir,
          hooks=[
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir, save_steps=1, saver=saver),
              basic_session_run_hooks.StopAtStepHook(num_steps=200),
          ])
      # Partial training: loss improves but stays above the full-training bar.
      self.assertGreater(loss, .015)
      self.assertLess(loss, .05)

    # Next, train the biases of the model.
    with ops.Graph().as_default():
      random_seed.set_random_seed(1)
      total_loss = self.ModelLoss()
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
      biases = variables_lib.get_variables_by_name('biases')

      train_op = training.create_train_op(
          total_loss, optimizer, variables_to_train=biases)

      saver = saver_lib.Saver()
      loss = training.train(
          train_op,
          logdir,
          hooks=[
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir, save_steps=1, saver=saver),
              basic_session_run_hooks.StopAtStepHook(num_steps=300),
          ])
      self.assertGreater(loss, .015)
      self.assertLess(loss, .05)

    # Finally, train both weights and bias to get lower loss.
    with ops.Graph().as_default():
      random_seed.set_random_seed(2)
      total_loss = self.ModelLoss()
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

      train_op = training.create_train_op(total_loss, optimizer)
      saver = saver_lib.Saver()
      loss = training.train(
          train_op,
          logdir,
          hooks=[
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir, save_steps=1, saver=saver),
              basic_session_run_hooks.StopAtStepHook(num_steps=400),
          ])
      self.assertIsNotNone(loss)
      self.assertLess(loss, .015)
# Example 16
  def testTrainWithAlteredGradients(self):
    """A larger gradient multiplier acts like a larger learning rate."""
    # Use the same learning rate but different gradient multipliers
    # to train two models. Model with equivalently larger learning
    # rate (i.e., learning_rate * gradient_multiplier) has smaller
    # training loss.
    logdir1 = os.path.join(self.get_temp_dir(), 'tmp_logs6/')
    logdir2 = os.path.join(self.get_temp_dir(), 'tmp_logs7/')

    if gfile.Exists(logdir1):
      gfile.DeleteRecursively(logdir1)
    if gfile.Exists(logdir2):
      gfile.DeleteRecursively(logdir2)

    multipliers = [1., 1000.]
    number_of_steps = 10
    # Collects the final loss of each run for the cross-run comparison below.
    losses = []
    learning_rate = 0.001

    # First, train the model with equivalently smaller learning rate.
    with ops.Graph().as_default():
      random_seed.set_random_seed(0)
      train_op = self.create_train_op(
          learning_rate=learning_rate, gradient_multiplier=multipliers[0])

      saver = saver_lib.Saver()

      loss = training.train(
          train_op,
          logdir1,
          hooks=[
              basic_session_run_hooks.StopAtStepHook(num_steps=number_of_steps),
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir1, save_steps=50, saver=saver),
          ])

      losses.append(loss)
      self.assertGreater(loss, .5)

    # Second, train the model with equivalently larger learning rate.
    with ops.Graph().as_default():
      random_seed.set_random_seed(0)
      train_op = self.create_train_op(
          learning_rate=learning_rate, gradient_multiplier=multipliers[1])
      saver = saver_lib.Saver()

      loss = training.train(
          train_op,
          logdir2,
          hooks=[
              basic_session_run_hooks.StopAtStepHook(num_steps=number_of_steps),
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir2, save_steps=50, saver=saver),
          ])

      losses.append(loss)
      self.assertIsNotNone(loss)
      self.assertLess(loss, .5)

    # The loss of the model trained with larger learning rate should
    # be smaller.
    self.assertGreater(losses[0], losses[1])
# Example 17
def train(args):
    """Train CIFAR-10 for a number of steps.

  Args:
    args: The command line arguments.
  """

    with tf.Graph().as_default():

        # Create the global step
        global_step = tf.contrib.framework.create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs(args.data_dir,
                                                  args.batch_size,
                                                  args.use_fp16)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images, args.batch_size, args.use_fp16)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step, args.batch_size)

        scaffold = monitored_session.Scaffold()

        # Chief session creator restores from args.train_dir if a checkpoint
        # already exists there.
        session_creator = monitored_session.ChiefSessionCreator(
            scaffold,
            checkpoint_dir=args.train_dir,
            config=tf.ConfigProto(
                log_device_placement=args.log_device_placement))

        hooks = [
            # Hook to save the model every N steps and at the end.
            # NOTE(review): CHECKPOINT_BASENAME is a module-level constant
            # defined elsewhere in this file.
            basic_session_run_hooks.CheckpointSaverHook(
                args.train_dir,
                checkpoint_basename=CHECKPOINT_BASENAME,
                save_steps=args.checkpoint_interval_steps,
                scaffold=scaffold),

            # Hook to save a summary every N steps.
            basic_session_run_hooks.SummarySaverHook(
                save_steps=args.summary_interval_steps,
                output_dir=args.train_dir,
                scaffold=scaffold),

            # Hook to stop at step N.
            basic_session_run_hooks.StopAtStepHook(
                last_step=args.train_max_steps)
        ]

        # Start a new monitored session. This will automatically restart the
        # sessions if the parameter servers are preempted.
        with monitored_session.MonitoredSession(
                session_creator=session_creator, hooks=hooks) as sess:

            while not sess.should_stop():

                start_time = time.time()
                _, loss_value, global_step_value = sess.run(
                    [train_op, loss, global_step])
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # Log throughput every 10 global steps.
                if global_step_value % 10 == 0:
                    num_examples_per_step = args.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    logging.info(
                        ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                         'sec/batch)'), datetime.now(), global_step_value,
                        loss_value, examples_per_sec, sec_per_batch)
 def test_raise_in_both_last_step_and_num_steps(self):
     """StopAtStepHook rejects specifying both num_steps and last_step."""
     with self.assertRaises(ValueError):
         basic_session_run_hooks.StopAtStepHook(num_steps=10, last_step=20)
# Example 19
def train(args):
    """Train CIFAR-10 for a number of steps across multiple GPU towers.

    Builds the multi-tower CIFAR-10 training graph (one tower per GPU with
    shared variables), averages the per-tower gradients on the CPU, and runs
    the training loop inside a MonitoredSession with checkpoint/summary/stop
    hooks.

    Args:
      args: The command line arguments. Fields read here: batch_size,
        num_gpus, train_dir, checkpoint_interval_steps,
        summary_interval_steps, train_max_steps, log_device_placement.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create the global step.
        global_step = tf.contrib.framework.create_global_step()

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 args.batch_size)
        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)

        # Calculate the gradients for each model tower.
        tower_grads = []
        # `range` replaces the Python-2-only `xrange`: identical behavior,
        # and the code no longer raises NameError on Python 3.
        for i in range(args.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
                    # Calculate the loss for one tower of the CIFAR model. This function
                    # constructs the entire CIFAR model but shares the variables across
                    # all towers.
                    loss = tower_loss(scope, args)

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Retain the summaries from the final tower. (Rebound on
                    # every iteration on purpose: only the last tower's
                    # summaries are kept.)
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)

                    # Calculate the gradients for the batch of data on this CIFAR tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # To understand why the following line is necessary, see:
        # https://github.com/carpedm20/DCGAN-tensorflow/issues/59
        with tf.variable_scope(tf.get_variable_scope(), reuse=False):
            variable_averages = tf.train.ExponentialMovingAverage(
                cifar10.MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        scaffold = monitored_session.Scaffold(summary_op=summary_op)

        # allow_soft_placement must be set to True to build towers on GPU, as some
        # of the ops do not have GPU implementations.
        session_creator = monitored_session.ChiefSessionCreator(
            scaffold,
            checkpoint_dir=args.train_dir,
            config=tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=args.log_device_placement))

        hooks = [
            # Hook to save the model every N steps and at the end.
            basic_session_run_hooks.CheckpointSaverHook(
                args.train_dir,
                checkpoint_basename=CHECKPOINT_BASENAME,
                save_steps=args.checkpoint_interval_steps,
                scaffold=scaffold),

            # Hook to save a summary every N steps.
            basic_session_run_hooks.SummarySaverHook(
                save_steps=args.summary_interval_steps,
                output_dir=args.train_dir,
                scaffold=scaffold),

            # Hook to stop at step N.
            basic_session_run_hooks.StopAtStepHook(
                last_step=args.train_max_steps)
        ]

        # Start a new monitored session. This will automatically restart the
        # sessions if the parameter servers are preempted.
        with monitored_session.MonitoredSession(
                session_creator=session_creator, hooks=hooks) as sess:

            while not sess.should_stop():
                start_time = time.time()
                _, loss_value, global_step_value = sess.run(
                    [train_op, loss, global_step])
                duration = time.time() - start_time

                # NOTE(review): `assert` is stripped under `python -O`; if the
                # divergence check must always run, raise explicitly instead.
                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                if global_step_value % 10 == 0:
                    num_examples_per_step = args.batch_size * args.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / args.num_gpus

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    # Lazy %-args: the record is formatted only if this log
                    # level is actually emitted (avoids eager `%` formatting).
                    logging.info(format_str,
                                 datetime.now(), global_step_value,
                                 loss_value, examples_per_sec, sec_per_batch)