Example #1
 def testMovingAverageVariables(self):
   height, width = 3, 3
   with self.test_session():
     images = tf.random_uniform((5, height, width, 3), seed=1)
     ops.batch_norm(images, scale=True)
     moving_mean = tf.moving_average_variables()[0]
     moving_variance = tf.moving_average_variables()[1]
     self.assertEqual(moving_mean.op.name, 'BatchNorm/moving_mean')
     self.assertEqual(moving_variance.op.name, 'BatchNorm/moving_variance')
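The test above checks that ops.batch_norm registers its moving statistics in the collection returned by tf.moving_average_variables(). Below is a minimal, self-contained sketch of how that collection is populated in general; it is an illustration added here, assuming a TF 1.x graph-mode environment (under TF 2.x the same names are reachable via import tensorflow.compat.v1 as tf).

import tensorflow as tf

v = tf.Variable(10.0, name='v')                     # trainable by default
ema = tf.train.ExponentialMovingAverage(decay=0.9)
update = ema.apply([v])                             # creates v/ExponentialMovingAverage

# ema.apply() adds the original Variable `v` to the MOVING_AVERAGE_VARIABLES
# collection, which is exactly what tf.moving_average_variables() returns.
print([x.op.name for x in tf.moving_average_variables()])   # ['v']

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update)
    print(sess.run(ema.average(v)))                 # 10.0 == 10*0.9 + 10*0.1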
Example #2
 def testCreateVariablesWithoutCenterWithoutScale(self):
   height, width = 3, 3
   with self.test_session():
     images = tf.random_uniform((5, height, width, 3), seed=1)
     ops.batch_norm(images, center=False, scale=False)
     beta = variables.get_variables_by_name('beta')
     self.assertEqual(beta, [])
     gamma = variables.get_variables_by_name('gamma')
     self.assertEqual(gamma, [])
     moving_mean = tf.moving_average_variables()[0]
     moving_variance = tf.moving_average_variables()[1]
     self.assertEqual(moving_mean.op.name, 'BatchNorm/moving_mean')
     self.assertEqual(moving_variance.op.name, 'BatchNorm/moving_variance')
  def _CheckDecay(self, ema, actual_decay, dim):
    tens = _Repeat(10.0, dim)
    thirties = _Repeat(30.0, dim)
    var0 = tf.Variable(tens, name="v0")
    var1 = tf.Variable(thirties, name="v1")
    tf.initialize_all_variables().run()
    # Note that tensor2 is not a Variable but just a plain Tensor resulting
    # from the sum operation.
    tensor2 = var0 + var1
    update = ema.apply([var0, var1, tensor2])
    avg0 = ema.average(var0)
    avg1 = ema.average(var1)
    avg2 = ema.average(tensor2)

    self.assertItemsEqual([var0, var1], tf.moving_average_variables())

    self.assertFalse(avg0 in tf.trainable_variables())
    self.assertFalse(avg1 in tf.trainable_variables())
    self.assertFalse(avg2 in tf.trainable_variables())
    tf.initialize_all_variables().run()

    self.assertEqual("v0/ExponentialMovingAverage:0", avg0.name)
    self.assertEqual("v1/ExponentialMovingAverage:0", avg1.name)
    self.assertEqual("add/ExponentialMovingAverage:0", avg2.name)

    # Check initial values.
    self.assertAllClose(tens, var0.eval())
    self.assertAllClose(thirties, var1.eval())
    self.assertAllClose(_Repeat(10.0 + 30.0, dim), tensor2.eval())

    # Check that averages are initialized correctly.
    self.assertAllClose(tens, avg0.eval())
    self.assertAllClose(thirties, avg1.eval())
    # Note that averages of Tensors initialize to zeros_like, since no value
    # of the Tensor is known because the Op has not been run (yet).
    self.assertAllClose(_Repeat(0.0, dim), avg2.eval())

    # Update the averages and check.
    update.run()
    dk = actual_decay

    expected = _Repeat(10.0 * dk + 10.0 * (1 - dk), dim)
    self.assertAllClose(expected, avg0.eval())
    expected = _Repeat(30.0 * dk + 30.0 * (1 - dk), dim)
    self.assertAllClose(expected, avg1.eval())
    expected = _Repeat(0.0 * dk + (10.0 + 30.0) * (1 - dk), dim)
    self.assertAllClose(expected, avg2.eval())

    # Again, update the averages and check.
    update.run()
    expected = _Repeat((10.0 * dk + 10.0 * (1 - dk)) * dk + 10.0 * (1 - dk),
                       dim)
    self.assertAllClose(expected, avg0.eval())
    expected = _Repeat((30.0 * dk + 30.0 * (1 - dk)) * dk + 30.0 * (1 - dk),
                       dim)
    self.assertAllClose(expected, avg1.eval())
    expected = _Repeat(((0.0 * dk + (10.0 + 30.0) * (1 - dk)) * dk +
                        (10.0 + 30.0) * (1 - dk)),
                       dim)
    self.assertAllClose(expected, avg2.eval())
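For reference, the expected values asserted above follow the exponential-moving-average update avg := avg * decay + value * (1 - decay). Here is a plain-Python reproduction of that arithmetic, added as an illustration; the decay value 0.25 is an arbitrary stand-in for actual_decay.

def ema_step(avg, value, decay):
    return avg * decay + value * (1 - decay)

dk = 0.25                              # arbitrary stand-in for actual_decay
avg0, avg1, avg2 = 10.0, 30.0, 0.0     # shadow values right after ema.apply()

for _ in range(2):                     # the two update.run() calls above
    avg0 = ema_step(avg0, 10.0, dk)
    avg1 = ema_step(avg1, 30.0, dk)
    avg2 = ema_step(avg2, 10.0 + 30.0, dk)

print(avg0, avg1, avg2)                # 10.0, 30.0, and avg2 approaching 40.0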
Example #4
def get_other_op(global_step):
    batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    # Track the moving averages of all trainable variables
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
    variables_averages_op = variable_averages.apply(variables_to_average)

    batchnorm_updates_op = tf.group(*batchnorm_updates)
    return variables_averages_op, batchnorm_updates_op
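get_other_op only builds the two ops; they still have to be grouped with the gradient step. The sketch below shows that wiring end to end. It is an added illustration, not code from the same project: the toy model, the MOVING_AVERAGE_DECAY value, and the use of tf.layers.batch_normalization (chosen so that UPDATE_OPS is non-empty) are assumptions, and TF 1.x graph mode is assumed.

import tensorflow as tf

MOVING_AVERAGE_DECAY = 0.9999          # assumed value

x = tf.random_uniform((8, 4))
net = tf.layers.batch_normalization(tf.layers.dense(x, 4), training=True)
loss = tf.reduce_mean(tf.square(net))

global_step = tf.train.get_or_create_global_step()
opt = tf.train.GradientDescentOptimizer(0.1)
apply_gradient_op = opt.minimize(loss, global_step=global_step)

# Same pattern as get_other_op: collect the batch-norm statistic updates and
# an EMA over the trainable (plus moving-average) variables, then run them
# together with the gradient step as a single train op.
batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,
                                                      global_step)
variables_averages_op = variable_averages.apply(
    tf.trainable_variables() + tf.moving_average_variables())

train_op = tf.group(apply_gradient_op, variables_averages_op,
                    *batchnorm_updates)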
Example #5
  def create_init_fn_to_restore(self, master_checkpoint,
                                inception_checkpoint=None):
    """Creates an init operations to restore weights from various checkpoints.

    Args:
      master_checkpoint: path to a checkpoint which contains all weights for
        the whole model.
      inception_checkpoint: path to a checkpoint which contains weights for the
        inception part only.

    Returns:
      a function to run initialization ops.
    """
    all_assign_ops = []
    all_feed_dict = {}

    def assign_from_checkpoint(variables, checkpoint):
      logging.info('Request to restore %d weights from %s',
                   len(variables), checkpoint)
      if not variables:
        logging.error('Can\'t find any variables to restore.')
        sys.exit(1)
      assign_op, feed_dict = slim.assign_from_checkpoint(checkpoint, variables)
      all_assign_ops.append(assign_op)
      all_feed_dict.update(feed_dict)

    logging.info('variables_to_restore:\n%s' % utils.variables_to_restore().keys())
    logging.info('moving_average_variables:\n%s' % [v.op.name for v in tf.moving_average_variables()])
    logging.info('trainable_variables:\n%s' % [v.op.name for v in tf.trainable_variables()])
    if master_checkpoint:
      assign_from_checkpoint(utils.variables_to_restore(), master_checkpoint)

    if inception_checkpoint:
      variables = utils.variables_to_restore(
        'AttentionOcr_v1/conv_tower_fn/INCE', strip_scope=True)
      assign_from_checkpoint(variables, inception_checkpoint)

    def init_assign_fn(sess):
      logging.info('Restoring checkpoint(s)')
      sess.run(all_assign_ops, all_feed_dict)

    return init_assign_fn
Example #6
    def add_train_step(self):
        with tf.variable_scope('training'):
            loss = slim.losses.cross_entropy_loss(self.logits[0], self.ground_truth, label_smoothing=0.1, weight=1.0)
            loss_auxiliary = slim.losses.cross_entropy_loss(self.logits[1], self.ground_truth, label_smoothing=0.1, weight=0.4, scope='aux_loss')
            losses = [loss, loss_auxiliary]
            regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            total_loss = tf.add_n(losses + regularization_losses, name='total_loss')
            loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
            loss_averages_op = loss_averages.apply(losses + [total_loss])

            with tf.control_dependencies([loss_averages_op]):
                self.total_loss = tf.identity(total_loss)

            apply_gradient_op = self.optimizer.minimize(self.total_loss)

            variable_averages = tf.train.ExponentialMovingAverage(inception.MOVING_AVERAGE_DECAY, num_updates=None)
            variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
            variables_averages_op = variable_averages.apply(variables_to_average)
            batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            self.train_step = tf.group(apply_gradient_op, variables_averages_op, batchnorm_updates_op)
Example #7
    def build_graph(self, filenames, labels, subset, feed_hypes=None):

        hypes = self.hypes.copy()

        if feed_hypes:
            with tf.name_scope(None):
                for i in feed_hypes:
                    hypes[i] = tf.placeholder('float32', name=i)
                    hypes[i].set_shape([])

        with tf.name_scope('inputs'):

            filenames, labels = tf.train.slice_input_producer(
                tensor_list=[filenames, labels],
                capacity=hypes['batch_size'] * 2,
                shuffle=(subset == 'train'))

            filenames, labels = tf.train.batch(tensor_list=[filenames, labels],
                                               capacity=hypes['batch_size'] *
                                               2,
                                               batch_size=hypes['batch_size'])

            images0 = [
                tf.image.decode_jpeg(tf.read_file(i[0]), channels=3)
                for i in tf.split(0, hypes['batch_size'], filenames)
            ]

            images0 = [skin.util.square_pad(i) for i in images0]

            if subset == 'train':
                images0 = [tf.image.random_flip_left_right(i) for i in images0]
                images0 = [tf.image.random_flip_up_down(i) for i in images0]

            if hypes['spatial_transformer']:
                images = skin.util.spatial_tranform(images0,
                                                    hypes['batch_size'],
                                                    subset, hypes['loc_net'],
                                                    hypes['xform_reg'])
            else:
                images = tf.pack(
                    [tf.image.resize_images(i, 299, 299) for i in images0])

            with tf.name_scope(None):
                images = tf.identity(images, name='input')

        logits, logits_aux = inception_model.inference(
            images=(images - 128) / 128.,
            num_classes=len(self.labels),
            for_training=(subset == 'train'),
            restore_logits=(subset != 'train'))

        with tf.name_scope(None):
            logits = tf.identity(logits, name='logits')
        tf.histogram_summary('logits', logits)

        with tf.name_scope('loss'):

            batch_size, num_classes = logits.get_shape().as_list()

            labels_sparse = tf.sparse_to_dense(
                sparse_indices=tf.transpose(
                    tf.pack([tf.range(batch_size), labels])),
                output_shape=[batch_size, num_classes],
                sparse_values=np.ones(batch_size, dtype='float32'))

            loss = tf.nn.softmax_cross_entropy_with_logits(
                logits, labels_sparse)
            loss = tf.reduce_mean(loss, name='loss')

            loss_aux = tf.nn.softmax_cross_entropy_with_logits(
                logits_aux, labels_sparse)
            loss_aux = tf.reduce_mean(loss_aux, name='loss_aux')

            loss = 0.7 * loss + 0.3 * loss_aux

            tf.scalar_summary('loss', loss)

        fetches = {'loss': loss, 'filenames': filenames, 'logits': logits}

        def print_graph_ops():
            with open('/tmp/graph_ops.txt', 'w') as f:
                for op in tf.get_default_graph().get_operations():
                    f.write(op.type.ljust(35) + '\t' + op.name + '\n')

        if subset == 'train':

            reg_losses = tf.get_collection('regularization_losses')

            for i, j in enumerate(reg_losses):
                if 'loc_net' in j.name:
                    reg_losses[i] *= hypes['loc_net_reg']

            reg_loss = tf.add_n(reg_losses)
            tf.scalar_summary('reg_loss', reg_loss)

            with tf.variable_scope('reg_loss'):
                loss += reg_loss

            print_graph_ops()

            global_step = tf.Variable(0, name='global_step', trainable=False)

            opt = eval('tf.train.{}Optimizer'.format('Adam'))(
                learning_rate=hypes['learning_rate'],
                epsilon=hypes['epsilon'],
                beta1=hypes['beta1'],
                beta2=hypes['beta2'])

            grads = opt.compute_gradients(loss)
            apply_grads = opt.apply_gradients(grads, global_step)

            variable_averages = tf.train.ExponentialMovingAverage(
                hypes['variable_averages_decay'], global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            variables_averages_op = variable_averages.apply(
                variables_to_average)

            batchnorm_updates_op = tf.group(*tf.get_collection('_update_ops_'))

            train_op = tf.group(apply_grads, variables_averages_op,
                                batchnorm_updates_op)

            for grad, var in grads:
                tf.histogram_summary(var.op.name, var)
                try:
                    tf.histogram_summary(var.op.name + '/gradients', grad)
                except:
                    print(var.op.name)

            fetches.update({
                'reg_loss': reg_loss,
                'train_op': train_op,
                'global_step': global_step
            })

        else:

            print_graph_ops()

        return fetches
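One part of the loss section worth unpacking is the tf.sparse_to_dense call, which simply builds a one-hot label matrix. The added sketch below shows the equivalence on toy values; tf.stack is used where this pre-1.0 snippet uses tf.pack, and a TF 1.x session is assumed.

import tensorflow as tf

labels = tf.constant([2, 0, 1])
dense_via_sparse = tf.sparse_to_dense(
    sparse_indices=tf.transpose(tf.stack([tf.range(3), labels])),
    output_shape=[3, 3],
    sparse_values=1.0)
dense_via_one_hot = tf.one_hot(labels, depth=3)

with tf.Session() as sess:
    a, b = sess.run([dense_via_sparse, dense_via_one_hot])
    print(a)    # the same 3x3 one-hot matrix
    print(b)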
Example #8
def inception_model_fn(features, labels, mode, params):
    """Inception v3 model using Estimator API."""
    num_classes = FLAGS.num_classes
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    is_eval = (mode == tf.estimator.ModeKeys.EVAL)

    if isinstance(features, dict):
        features = features['feature']

    features = tensor_transform_fn(features, params['input_perm'])

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_network():
        if FLAGS.precision == 'bfloat16':
            with contrib_tpu.bfloat16_scope():
                logits, end_points = inception.inception_v3(
                    features, num_classes, is_training=is_training)
            logits = tf.cast(logits, tf.float32)
        elif FLAGS.precision == 'float32':
            logits, end_points = inception.inception_v3(
                features, num_classes, is_training=is_training)
        return logits, end_points

    if FLAGS.clear_update_collections:
        # updates_collections must be set to None in order to use fused batchnorm
        with arg_scope(
                inception.inception_v3_arg_scope(
                    weight_decay=0.0,
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON,
                    updates_collections=None)):
            logits, end_points = build_network()
    else:
        with arg_scope(
                inception.inception_v3_arg_scope(
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON)):
            logits, end_points = build_network()

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not FLAGS.use_tpu):
        with tf.control_dependencies([
                tf.Print(predictions['classes'], [predictions['classes']],
                         summarize=FLAGS.eval_batch_size,
                         message='prediction: ')
        ]):
            labels = tf.Print(labels, [labels],
                              summarize=FLAGS.eval_batch_size,
                              message='label: ')

    one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

    if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                        logits=tf.cast(end_points['AuxLogits'],
                                                       tf.float32),
                                        weights=0.4,
                                        label_smoothing=0.1,
                                        scope='aux_loss')

    tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                    logits=logits,
                                    weights=1.0,
                                    label_smoothing=0.1)

    losses = tf.add_n(tf.losses.get_losses())
    l2_loss = []
    for v in tf.trainable_variables():
        if 'BatchNorm' not in v.name and 'weights' in v.name:
            l2_loss.append(tf.nn.l2_loss(v))
    loss = losses + WEIGHT_DECAY * tf.add_n(l2_loss)

    initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
    if FLAGS.use_learning_rate_warmup:
        # Adjust initial learning rate to match final warmup rate
        warmup_decay = FLAGS.learning_rate_decay**(
            (FLAGS.warmup_epochs + FLAGS.cold_epochs) /
            FLAGS.learning_rate_decay_epochs)
        adj_initial_learning_rate = initial_learning_rate * warmup_decay

    final_learning_rate = 0.0001 * initial_learning_rate

    host_call = None
    train_op = None
    if is_training:
        batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
        global_step = tf.train.get_or_create_global_step()
        current_epoch = tf.cast(
            (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=int(FLAGS.learning_rate_decay_epochs *
                            batches_per_epoch),
            decay_rate=FLAGS.learning_rate_decay,
            staircase=True)

        if FLAGS.use_learning_rate_warmup:
            wlr = 0.1 * adj_initial_learning_rate
            wlr_height = tf.cast(
                0.9 * adj_initial_learning_rate /
                (FLAGS.warmup_epochs + FLAGS.learning_rate_decay_epochs - 1),
                tf.float32)
            epoch_offset = tf.cast(FLAGS.cold_epochs - 1, tf.int32)
            exp_decay_start = (FLAGS.warmup_epochs + FLAGS.cold_epochs +
                               FLAGS.learning_rate_decay_epochs)
            lin_inc_lr = tf.add(
                wlr,
                tf.multiply(
                    tf.cast(tf.subtract(current_epoch, epoch_offset),
                            tf.float32), wlr_height))
            learning_rate = tf.where(
                tf.greater_equal(current_epoch, FLAGS.cold_epochs),
                (tf.where(tf.greater_equal(current_epoch, exp_decay_start),
                          learning_rate, lin_inc_lr)), wlr)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate,
                                   final_learning_rate,
                                   name='learning_rate')

        if FLAGS.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif FLAGS.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=0.9)
        elif FLAGS.optimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  RMSPROP_DECAY,
                                                  momentum=RMSPROP_MOMENTUM,
                                                  epsilon=RMSPROP_EPSILON)
        else:
            tf.logging.fatal('Unknown optimizer:', FLAGS.optimizer)

        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        if FLAGS.moving_average:
            ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY,
                                                    num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            with tf.control_dependencies([train_op
                                          ]), tf.name_scope('moving_average'):
                train_op = ema.apply(variables_to_average)

        # To log the loss, current learning rate, and epoch for Tensorboard, the
        # summary op needs to be run on the host CPU via host_call. host_call
        # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
        # dimension. These Tensors are implicitly concatenated to
        # [params['batch_size']].
        gs_t = tf.reshape(global_step, [1])
        loss_t = tf.reshape(loss, [1])
        lr_t = tf.reshape(learning_rate, [1])
        ce_t = tf.reshape(current_epoch, [1])

        if not FLAGS.skip_host_call:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide them as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the second
        element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step.
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
                gs = gs[0]
                with summary.create_file_writer(FLAGS.model_dir).as_default():
                    with summary.always_record_summaries():
                        summary.scalar('loss', tf.reduce_mean(loss), step=gs)
                        summary.scalar('learning_rate',
                                       tf.reduce_mean(lr),
                                       step=gs)
                        summary.scalar('current_epoch',
                                       tf.reduce_mean(ce),
                                       step=gs)

                        return summary.all_summary_ops()

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    eval_metrics = None
    if is_eval:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide them as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch, ]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'accuracy': top_1_accuracy,
                'accuracy@5': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op,
                                        host_call=host_call,
                                        eval_metrics=eval_metrics)
def train(target, dataset, cluster_spec):
  """Train Inception on a dataset for a number of steps."""
  # The numbers of workers and parameter servers are inferred from the worker
  # and ps host strings.
  num_workers = len(cluster_spec.as_dict()['worker'])
  num_parameter_servers = len(cluster_spec.as_dict()['ps'])
  # If no value is given, num_replicas_to_aggregate defaults to the number of
  # workers.
  if FLAGS.num_replicas_to_aggregate == -1:
    num_replicas_to_aggregate = num_workers
  else:
    num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

  # Both should be greater than 0 in a distributed training.
  assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and '
                                                         'num_parameter_servers'
                                                         ' must be > 0.')

  # Choose worker 0 as the chief. Note that any worker could be the chief
  # but there should be only one chief.
  is_chief = (FLAGS.task_id == 0)

  # Ops are assigned to worker by default.
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    # Variables and its related init/assign ops are assigned to ps.
    with slim.scopes.arg_scope(
        [slim.variables.variable, slim.variables.global_step],
        device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
      # Create a variable to count the number of train() calls. This equals the
      # number of updates applied to the variables.
      global_step = slim.variables.global_step()

      # Calculate the learning rate schedule.
      num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                               FLAGS.batch_size)
      # Decay steps need to be divided by the number of replicas to aggregate.
      decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                        num_replicas_to_aggregate)

      # Decay the learning rate exponentially based on the number of steps.
      lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                      global_step,
                                      decay_steps,
                                      FLAGS.learning_rate_decay_factor,
                                      staircase=True)
      # Add a summary to track the learning rate.
      tf.scalar_summary('learning_rate', lr)

      # Create an optimizer that performs gradient descent.
      opt = tf.train.RMSPropOptimizer(lr,
                                      RMSPROP_DECAY,
                                      momentum=RMSPROP_MOMENTUM,
                                      epsilon=RMSPROP_EPSILON)

      images, labels = image_processing.distorted_inputs(
          dataset,
          batch_size=FLAGS.batch_size,
          num_preprocess_threads=FLAGS.num_preprocess_threads)

      # Number of classes in the Dataset label set plus 1.
      # Label 0 is reserved for an (unused) background class.
      num_classes = dataset.num_classes() + 1
      logits = inception.inference(images, num_classes, for_training=True)
      # Add classification loss.
      inception.loss(logits, labels)

      # Gather all of the losses including regularization losses.
      losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
      losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)

      total_loss = tf.add_n(losses, name='total_loss')

      if is_chief:
        # Compute the moving average of all individual losses and the
        # total loss.
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_averages_op = loss_averages.apply(losses + [total_loss])

        # Attach a scalar summary to all individual losses and the total loss;
        # do the same for the averaged version of the losses.
        for l in losses + [total_loss]:
          loss_name = l.op.name
          # Name each loss as '(raw)' and name the moving average version of the
          # loss as the original loss name.
          tf.scalar_summary(loss_name + ' (raw)', l)
          tf.scalar_summary(loss_name, loss_averages.average(l))

        # Add dependency to compute loss_averages.
        with tf.control_dependencies([loss_averages_op]):
          total_loss = tf.identity(total_loss)

      # Track the moving averages of all trainable variables.
      # Note that we maintain a 'double-average' of the BatchNormalization
      # global statistics.
      # This is not needed when the number of replicas is small, but it is
      # important for synchronous distributed training with tens of workers/replicas.
      exp_moving_averager = tf.train.ExponentialMovingAverage(
          inception.MOVING_AVERAGE_DECAY, global_step)

      variables_to_average = (
          tf.trainable_variables() + tf.moving_average_variables())

      # Add histograms for model variables.
      for var in variables_to_average:
        tf.histogram_summary(var.op.name, var)

      # Create synchronous replica optimizer.
      opt = tf.train.SyncReplicasOptimizer(
          opt,
          replicas_to_aggregate=num_replicas_to_aggregate,
          replica_id=FLAGS.task_id,
          total_num_replicas=num_workers,
          variable_averages=exp_moving_averager,
          variables_to_average=variables_to_average)

      batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
      assert batchnorm_updates, 'Batchnorm updates are missing'
      batchnorm_updates_op = tf.group(*batchnorm_updates)
      # Add dependency to compute batchnorm_updates.
      with tf.control_dependencies([batchnorm_updates_op]):
        total_loss = tf.identity(total_loss)

      # Compute gradients with respect to the loss.
      grads = opt.compute_gradients(total_loss)

      # Add histograms for gradients.
      for grad, var in grads:
        if grad is not None:
          tf.histogram_summary(var.op.name + '/gradients', grad)

      apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

      with tf.control_dependencies([apply_gradients_op]):
        train_op = tf.identity(total_loss, name='train_op')

      # Get the chief queue_runners, init_tokens and clean_up_op, which are used
      # to synchronize replicas.
      # More details can be found in sync_replicas_optimizer.
      chief_queue_runners = [opt.get_chief_queue_runner()]
      init_tokens_op = opt.get_init_tokens_op()
      clean_up_op = opt.get_clean_up_op()

      # Create a saver.
      saver = tf.train.Saver()

      # Build the summary operation based on the TF collection of Summaries.
      summary_op = tf.merge_all_summaries()

      # Build an initialization operation to run below.
      init_op = tf.initialize_all_variables()

      # We run the summaries in the same thread as the training operations by
      # passing in None for summary_op to avoid a summary_thread being started.
      # Running summaries and training operations in parallel could run out of
      # GPU memory.
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=FLAGS.train_dir,
                               init_op=init_op,
                               summary_op=None,
                               global_step=global_step,
                               saver=saver,
                               save_model_secs=FLAGS.save_interval_secs)

      tf.logging.info('%s Supervisor' % datetime.now())

      sess_config = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement)

      # Get a session.
      sess = sv.prepare_or_wait_for_session(target, config=sess_config)

      # Start the queue runners.
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
      sv.start_queue_runners(sess, queue_runners)
      tf.logging.info('Started %d queues for processing input data.',
                      len(queue_runners))

      if is_chief:
        sv.start_queue_runners(sess, chief_queue_runners)
        sess.run(init_tokens_op)

      # Train, checking for NaNs. Concurrently run the summary operation at a
      # specified interval. Note that the summary_op and train_op never run
      # simultaneously in order to prevent running out of GPU memory.
      next_summary_time = time.time() + FLAGS.save_summaries_secs
      while not sv.should_stop():
        try:
          start_time = time.time()
          loss_value, step = sess.run([train_op, global_step])
          assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
          if step > FLAGS.max_steps:
            break
          duration = time.time() - start_time

          if step % 30 == 0:
            examples_per_sec = FLAGS.batch_size / float(duration)
            format_str = ('Worker %d: %s: step %d, loss = %.2f '
                          '(%.1f examples/sec; %.3f sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

          # Determine if the summary_op should be run on the chief worker.
          if is_chief and next_summary_time < time.time():
            tf.logging.info('Running Summary operation on the chief.')
            summary_str = sess.run(summary_op)
            sv.summary_computed(sess, summary_str)
            tf.logging.info('Finished running Summary operation.')

            # Determine the next time for running the summary.
            next_summary_time += FLAGS.save_summaries_secs
        except:
          if is_chief:
            tf.logging.info('About to execute sync_clean_up_op!')
            sess.run(clean_up_op)
          raise

      # Stop the supervisor.  This also waits for service threads to finish.
      sv.stop()

      # Save after the training ends.
      if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
def main(argv=None):
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    tf.logging.info('PS hosts are: %s' % ps_hosts)
    tf.logging.info('Worker hosts are: %s' % worker_hosts)
    cluster_spec = tf.train.ClusterSpec({
        'ps': ps_hosts,
        'worker': worker_hosts
    })
    server = tf.train.Server({
        'ps': ps_hosts,
        'worker': worker_hosts
    },
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id,
                             protocol=FLAGS.protocol)

    sspManager = SspManager(len(worker_hosts), 5)
    if FLAGS.job_name == 'ps':
        if FLAGS.task_id == 0:
            rpcServer = sspManager.create_rpc_server(ps_hosts[0].split(':')[0])
            rpcServer.serve()
        server.join()

    time.sleep(5)
    rpcClient = sspManager.create_rpc_client(ps_hosts[0].split(':')[0])

    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if not tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.MakeDirs(FLAGS.train_dir)

    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])

    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with slim.scopes.arg_scope(
            [slim.variables.variable, slim.variables.global_step],
                device=slim.variables.VariableDeviceChooser(
                    num_parameter_servers)):
            '''Prepare Input'''
            global_step = slim.variables.global_step()
            batch_size = tf.placeholder(dtype=tf.int32,
                                        shape=(),
                                        name='batch_size')
            images, labels = image_processing.distorted_inputs(
                dataset,
                batch_size,
                num_preprocess_threads=FLAGS.num_preprocess_threads)
            num_classes = dataset.num_classes() + 1
            '''Inference'''
            logits = inception.inference(images,
                                         num_classes,
                                         for_training=True)
            '''Loss'''
            inception.loss(logits, labels, batch_size)
            losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
            losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            total_loss = tf.add_n(losses, name='total_loss')
            if is_chief:
                loss_averages = tf.train.ExponentialMovingAverage(0.9,
                                                                  name='avg')
                loss_averages_op = loss_averages.apply(losses + [total_loss])
                with tf.control_dependencies([loss_averages_op]):
                    total_loss = tf.identity(total_loss)
            '''Optimizer'''
            exp_moving_averager = tf.train.ExponentialMovingAverage(
                inception.MOVING_AVERAGE_DECAY, global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                     FLAGS.batch_size)
            decay_steps = int(num_batches_per_epoch *
                              FLAGS.num_epochs_per_decay / num_workers)
            lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                            global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            opt = tf.train.RMSPropOptimizer(lr,
                                            RMSPROP_DECAY,
                                            momentum=RMSPROP_MOMENTUM,
                                            epsilon=RMSPROP_EPSILON)
            '''Train Operation'''
            batchnorm_updates = tf.get_collection(
                slim.ops.UPDATE_OPS_COLLECTION)
            assert batchnorm_updates, 'Batchnorm updates are missing'
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            with tf.control_dependencies([batchnorm_updates_op]):
                total_loss = tf.identity(total_loss)
            naive_grads = opt.compute_gradients(total_loss)
            grads = [(tf.scalar_mul(
                tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var)
                     for grad, var in naive_grads]
            apply_gradients_op = opt.apply_gradients(grads,
                                                     global_step=global_step)
            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(total_loss, name='train_op')
            '''Supervisor and Session'''
            saver = tf.train.Saver()
            init_op = tf.global_variables_initializer()
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=init_op,
                                     summary_op=None,
                                     global_step=global_step,
                                     recovery_wait_secs=1,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_interval_secs)
            tf.logging.info('%s Supervisor' % datetime.now())
            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            '''Start Training'''
            sv.start_queue_runners(sess, queue_runners)
            tf.logging.info('Started %d queues for processing input data.',
                            len(queue_runners))

            batch_size_num = FLAGS.batch_size
            for step in range(FLAGS.max_steps):
                start_time = time.time()
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                loss_value, gs = sess.run(
                    [train_op, global_step],
                    feed_dict={batch_size: batch_size_num},
                    options=run_options,
                    run_metadata=run_metadata)

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                duration = time.time() - start_time
                examples_per_sec = batch_size_num / float(duration)
                sec_per_batch = float(duration)
                format_str = (
                    "time: " + str(time.time()) +
                    '; %s: step %d (gs %d), loss= %.2f (%.1f samples/s; %.3f s/batch)'
                )
                tf.logging.info(format_str %
                                (datetime.now(), step, gs, loss_value,
                                 examples_per_sec, sec_per_batch))
                rpcClient.check_staleness(FLAGS.task_id, step)
Example #11
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.histogram_summary(var.op.name, var))

    # Track the moving averages of all trainable variables.
    # Note that we maintain a "double-average" of the BatchNormalization
    # global statistics. This is more complicated than need be, but we employ
    # this for backward-compatibility with our previous models.
    variable_averages = tf.train.ExponentialMovingAverage(
        inception.MOVING_AVERAGE_DECAY, global_step)

    # Another possibility is to use tf.slim.get_variables().
    variables_to_average = (tf.trainable_variables() +
                            tf.moving_average_variables())
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all updates to into a single train op.
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    train_op = tf.group(apply_gradient_op, variables_averages_op,
                        batchnorm_updates_op)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation from the last tower summaries.
    summary_op = tf.merge_summary(summaries)

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()
Example #12
def model_fn(features, labels, mode, params):
    """Mobilenet v1 model using Estimator API."""
    num_classes = params['num_classes']
    training_active = (mode == tf.estimator.ModeKeys.TRAIN)
    eval_active = (mode == tf.estimator.ModeKeys.EVAL)

    if isinstance(features, dict):
        features = features['feature']

    features = supervised_images.tensor_transform_fn(features,
                                                     params['input_perm'])

    if params['clear_update_collections']:
        # updates_collections must be set to None in order to use fused batchnorm
        with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()):
            logits, end_points = mobilenet_v1.mobilenet_v1(
                features,
                num_classes,
                is_training=training_active,
                depth_multiplier=params['depth_multiplier'])
    else:
        with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()):
            logits, end_points = mobilenet_v1.mobilenet_v1(
                features,
                num_classes,
                is_training=training_active,
                depth_multiplier=params['depth_multiplier'])

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not params['use_tpu']):
        with tf.control_dependencies([
                tf.Print(predictions['classes'], [predictions['classes']],
                         summarize=params['eval_batch_size'],
                         message='prediction: ')
        ]):
            labels = tf.Print(labels, [labels],
                              summarize=params['eval_batch_size'],
                              message='label: ')

    one_hot_labels = tf.one_hot(labels, params['num_classes'], dtype=tf.int32)

    tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                    logits=logits,
                                    weights=1.0,
                                    label_smoothing=0.1)
    loss = tf.losses.get_total_loss(add_regularization_losses=True)

    initial_learning_rate = params['learning_rate'] * params['train_batch_size'] / 256  # pylint: disable=line-too-long
    final_learning_rate = 0.0001 * initial_learning_rate

    train_op = None
    if training_active:
        batches_per_epoch = params['num_train_images'] // params[
            'train_batch_size']
        global_step = tf.train.get_or_create_global_step()

        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=params['learning_rate_decay_epochs'] *
            batches_per_epoch,
            decay_rate=params['learning_rate_decay'],
            staircase=True)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate,
                                   final_learning_rate,
                                   name='learning_rate')

        if params['optimizer'] == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif params['optimizer'] == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=0.9)
        elif params['optimizer'] == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  RMSPROP_DECAY,
                                                  momentum=RMSPROP_MOMENTUM,
                                                  epsilon=RMSPROP_EPSILON)
        else:
            tf.logging.fatal('Unknown optimizer:', params['optimizer'])

        if params['use_tpu']:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        if params['moving_average']:
            ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY,
                                                    num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            with tf.control_dependencies([train_op
                                          ]), tf.name_scope('moving_average'):
                train_op = ema.apply(variables_to_average)

    eval_metrics = None
    if eval_active:

        def metric_fn(labels, predictions):
            accuracy = tf.metrics.accuracy(
                labels, tf.argmax(input=predictions, axis=1))
            return {'accuracy': accuracy}

        if params['use_logits']:
            eval_predictions = logits
        else:
            eval_predictions = end_points['Predictions']

        eval_metrics = (metric_fn, [labels, eval_predictions])

    return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                           loss=loss,
                                           train_op=train_op,
                                           eval_metrics=eval_metrics)
Example #13
    def resnet_model_fn(features, labels, mode, params):
        """Returns the model function."""
        global_step = tf.train.get_global_step()

        feature = features['feature']
        labels = labels['label']
        one_hot_labels = model_utils.get_label(labels,
                                               params,
                                               bird_num_classes,
                                               batch_size=params['batch_size'])

        def get_logits():
            """Return the logits."""
            end_points, aux_logits = None, None
            if FLAGS.model_type == 'resnet':
                avg_pool = model.resnet_v1_model(feature, labels, mode, params)
            else:
                assert False
            name = 'final_dense_dst'
            with tf.variable_scope('target_CLS'):
                logits = tf.layers.dense(
                    inputs=avg_pool,
                    units=bird_num_classes,
                    kernel_initializer=tf.random_normal_initializer(
                        stddev=.01),
                    name=name)
                if end_points is not None:
                    aux_pool = end_points['AuxLogits_Pool']
                    aux_logits = tf.layers.dense(
                        inputs=aux_pool,
                        units=bird_num_classes,
                        kernel_initializer=tf.random_normal_initializer(
                            stddev=.001),
                        name='Aux{}'.format(name))
            return logits, aux_logits, end_points

        logits, _, _ = get_logits()
        logits = tf.cast(logits, tf.float32)

        if FLAGS.model_type == 'resnet':
            dst_loss = tf.losses.softmax_cross_entropy(
                logits=logits,
                weights=1.,
                onehot_labels=one_hot_labels,
                label_smoothing=params['label_smoothing'])
            dst_l2_loss = FLAGS.weight_decay * tf.add_n([
                tf.nn.l2_loss(v) for v in tf.trainable_variables()
                if 'batch_normalization' not in v.name
            ])
            loss = dst_loss + dst_l2_loss

        train_op = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            cur_finetune_step = tf.train.get_global_step()
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                if FLAGS.model_type == 'resnet':
                    finetune_learning_rate = rampcosine()
                else:
                    finetune_learning_rate = rampcosine()
                if FLAGS.optimizer == 'momentum':
                    optimizer = tf.train.MomentumOptimizer(
                        learning_rate=finetune_learning_rate,
                        momentum=params['momentum'],
                        use_nesterov=True)
                elif FLAGS.optimizer == 'RMS':
                    optimizer = tf.train.RMSPropOptimizer(
                        finetune_learning_rate,
                        RMSPROP_DECAY,
                        momentum=RMSPROP_MOMENTUM,
                        epsilon=RMSPROP_EPSILON)
                elif FLAGS.optimizer == 'adam':
                    optimizer = tf.train.AdamOptimizer(finetune_learning_rate)

                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer,
                    replicas_to_aggregate=FLAGS.sync_replicas,
                    total_num_replicas=run_config.num_worker_replicas)
                train_op = tf.contrib.training.create_train_op(loss, optimizer)
                with tf.variable_scope('finetune'):
                    train_op = optimizer.minimize(loss, cur_finetune_step)
                if FLAGS.moving_average:
                    ema = tf.train.ExponentialMovingAverage(
                        decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
                    variables_to_average = (tf.trainable_variables() +
                                            tf.moving_average_variables())
                    with tf.control_dependencies([train_op]):
                        with tf.name_scope('moving_average'):
                            train_op = ema.apply(variables_to_average)
        else:
            train_op = None

        batch_size = params['batch_size']  # pylint: disable=unused-variable
        eval_metrics = None
        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metrics = model_utils.metric_fn(labels, logits)

        if mode == tf.estimator.ModeKeys.TRAIN:
            with tf.control_dependencies([train_op]):
                tf.summary.scalar('classifier/finetune_loss', loss)
                tf.summary.scalar('classifier/finetune_lr',
                                  finetune_learning_rate)
        else:
            train_op = None

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=train_op,
            eval_metric_ops=eval_metrics,
        )
def train():
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print ('PS hosts are: %s' % ps_hosts)
  print ('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server(
      {'ps': ps_hosts, 'worker': worker_hosts},
      job_name = FLAGS.job_name,
      task_index=FLAGS.task_id)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  """Train CIFAR-10 for a number of steps."""
  cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
  device_setter = tf.train.replica_device_setter(cluster=cluster)
  with tf.device(device_setter):
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    tf.scalar_summary('learning_rate', lr)
    opt = tf.train.GradientDescentOptimizer(lr)


    # Track the moving averages of all trainable variables.
    exp_moving_averager = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    variables_to_average = (
        tf.trainable_variables() + tf.moving_average_variables())

    opt = tf.train.SyncReplicasOptimizer(
        opt,
        replicas_to_aggregate=len(worker_hosts),
        replica_id=FLAGS.task_id,
        total_num_replicas=len(worker_hosts),
        variable_averages=exp_moving_averager,
        variables_to_average=variables_to_average)


    # Compute gradients with respect to the loss.
    grads = opt.compute_gradients(loss)

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        tf.histogram_summary(var.op.name + '/gradients', grad)

    apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

    with tf.control_dependencies([apply_gradients_op]):
      train_op = tf.identity(loss, name='train_op')


    chief_queue_runners = [opt.get_chief_queue_runner()]
    init_tokens_op = opt.get_init_tokens_op()

    saver = tf.train.Saver()
    # We run the summaries in the same thread as the training operations by
    # passing in None for summary_op to avoid a summary_thread being started.
    # Running summaries and training operations in parallel could run out of
    # GPU memory.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=FLAGS.train_dir,
                             init_op=tf.initialize_all_variables(),
                             summary_op=tf.merge_all_summaries(),
                             global_step=global_step,
                             saver=saver,
                             save_model_secs=60)

    tf.logging.info('%s Supervisor' % datetime.now())

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)

    print ("Before session init")
    # Get a session.
    sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
    print ("Before session init done")

    # Start the queue runners.
    queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
    sv.start_queue_runners(sess, queue_runners)
    print ('Started %d queues for processing input data.' % len(queue_runners))

    sv.start_queue_runners(sess, chief_queue_runners)
    sess.run(init_tokens_op)

    print ('Start training')
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, gs = sess.run([train_op, loss, global_step])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, gs, loss_value,
                             examples_per_sec, sec_per_batch))

    if is_chief:
      saver.save(sess,
                 os.path.join(FLAGS.train_dir, 'model.ckpt'),
                 global_step=global_step)
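A companion evaluation script for the job above typically restores the shadow values maintained by exp_moving_averager rather than the raw weights. A minimal sketch of that restore step, assuming the same graph-building code and checkpoint layout (the names checkpoint_dir and moving_average_decay are illustrative, not taken from the original):

import tensorflow as tf

def restore_averaged_weights(sess, checkpoint_dir, moving_average_decay):
    # Build a Saver whose restore map points each variable at the name of its
    # ExponentialMovingAverage shadow, so the averaged values written by the
    # training job are loaded instead of the raw weights.
    ema = tf.train.ExponentialMovingAverage(moving_average_decay)
    saver = tf.train.Saver(ema.variables_to_restore())

    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
    return saver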
Пример #15
0
    def test_restore_ema(self):

        # Create 100 phony x, y data points in NumPy, y = x * 0.1 + 0.3
        x_data = np.random.rand(100).astype(np.float32)
        y_data = x_data * 0.1 + 0.3

        # Try to find values for W and b that compute y_data = W * x_data + b
        # (We know that W should be 0.1 and b 0.3, but TensorFlow will
        # figure that out for us.)
        W = tf.Variable(tf.random_uniform([1], -1.0, 1.0), name='W')
        b = tf.Variable(tf.zeros([1]), name='b')
        y = W * x_data + b

        # Minimize the mean squared errors.
        loss = tf.reduce_mean(tf.square(y - y_data))
        optimizer = tf.train.GradientDescentOptimizer(0.5)
        opt_op = optimizer.minimize(loss)

        # Track the moving averages of all trainable variables.
        ema = tf.train.ExponentialMovingAverage(decay=0.9999)
        averages_op = ema.apply(tf.trainable_variables())
        with tf.control_dependencies([opt_op]):
            train_op = tf.group(averages_op)

        # Before starting, initialize the variables.  We will 'run' this first.
        init = tf.global_variables_initializer()

        saver = tf.train.Saver(tf.trainable_variables())

        # Launch the graph.
        sess = tf.Session()
        sess.run(init)

        # Fit the line.
        for _ in range(201):
            sess.run(train_op)

        w_reference = sess.run('W/ExponentialMovingAverage:0')
        b_reference = sess.run('b/ExponentialMovingAverage:0')

        saver.save(sess, os.path.join(self.tmp_dir, "model_ex1"))

        tf.reset_default_graph()

        tf.train.import_meta_graph(os.path.join(self.tmp_dir,
                                                "model_ex1.meta"))
        sess = tf.Session()

        print('------------------------------------------------------')
        for var in tf.global_variables():
            print('all variables: ' + var.op.name)
        for var in tf.trainable_variables():
            print('normal variable: ' + var.op.name)
        for var in tf.moving_average_variables():
            print('ema variable: ' + var.op.name)
        print('------------------------------------------------------')

        mode = 1
        restore_vars = {}
        if mode == 0:
            ema = tf.train.ExponentialMovingAverage(1.0)
            for var in tf.trainable_variables():
                print('%s: %s' % (ema.average_name(var), var.op.name))
                restore_vars[ema.average_name(var)] = var
        elif mode == 1:
            for var in tf.trainable_variables():
                ema_name = var.op.name + '/ExponentialMovingAverage'
                print('%s: %s' % (ema_name, var.op.name))
                restore_vars[ema_name] = var

        saver = tf.train.Saver(restore_vars, name='ema_restore')

        saver.restore(sess, os.path.join(self.tmp_dir, "model_ex1"))

        w_restored = sess.run('W:0')
        b_restored = sess.run('b:0')

        self.assertAlmostEqual(
            w_reference, w_restored,
            msg='Restored model does not use the EMA-filtered weight')
        self.assertAlmostEqual(
            b_reference, b_restored,
            msg='Restored model does not use the EMA-filtered bias')
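Both restore modes above assemble the name map by hand. The same map can also be obtained from ExponentialMovingAverage.variables_to_restore(), which defaults to tf.moving_average_variables() plus the trainable variables when no list is passed. A hedged sketch of a hypothetical third mode that could replace the mode 0/1 branches in the test above (not part of the original):

        # Hypothetical mode 2: let the EMA object build the restore map.
        # variables_to_restore() maps '<var>/ExponentialMovingAverage' names
        # to the corresponding variables, so restoring loads the averaged
        # values into W and b directly.
        ema = tf.train.ExponentialMovingAverage(decay=0.9999)
        restore_vars = ema.variables_to_restore(tf.trainable_variables())
        saver = tf.train.Saver(restore_vars, name='ema_restore')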
def train(*args, **kwargs):
    # Get all necessary parameters from kwargs
    try:
        # Get model graph
        my_model_graph = kwargs['model_graph']
    except KeyError:
        logging.error('(model_graph) was not provided!')
        raise KeyError('(model_graph) was not provided!')
    try:
        # Get loss operations
        my_loss = kwargs['loss']
    except KeyError:
        logging.error('(loss) was not provided!')
        raise KeyError('(loss) was not provided!')

    try:
        # Get metric operations
        my_metric_ops = kwargs['metrics']
    except KeyError:
        my_metric_ops = None

    # Build the summary operation based on the TF collection of Summaries.
    if 'output_dir' not in kwargs or not kwargs['output_dir']:
        kwargs['output_dir'] = 'output_dir/train_dir/%s' % datetime.now(
        ).strftime('%Y_%m_%d_%H.%M')
    logging.info('Saving training results to: {}'.format(
        kwargs['output_dir']))

    # Add train iterator
    train_iter = kwargs['train_iter']
    train_iter.initialize()
    train_iter.load_img_lst()
    train_data = train_iter.data_batch()
    train_label = train_iter.label_batch()
    # Add validation iterator
    valid_iter = kwargs['valid_iter']
    valid_iter.initialize()
    valid_iter.load_img_lst()
    valid_data = valid_iter.data_batch()
    valid_label = valid_iter.label_batch()

    # Define global step
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)

    test_image_to_log = tf.placeholder(
        tf.uint8,
        [40, kwargs['image_shape'][-3], kwargs['image_shape'][-2], 3])
    log_image_test = tf.summary.image("Test examples",
                                      test_image_to_log,
                                      max_outputs=40)

    train_image_to_log = tf.placeholder(
        tf.uint8,
        [40, kwargs['image_shape'][-3], kwargs['image_shape'][-2], 3])
    log_image_train = tf.summary.image("Train examples",
                                       train_image_to_log,
                                       max_outputs=40)

    # Select optimizer
    lr = kwargs['learning_rate']
    if kwargs['optimizer'] == 'GradientDescentOptimizer':
        opt = tf.train.GradientDescentOptimizer(kwargs['learning_rate'])
    elif kwargs['optimizer'] == 'MomentumOptimizer':
        opt = tf.train.MomentumOptimizer(kwargs['learning_rate'],
                                         kwargs['momentum'])
    elif kwargs['optimizer'] == 'AdamOptimizer':
        opt = tf.train.AdamOptimizer(kwargs['learning_rate'])
    elif kwargs['optimizer'] == 'AdadeltaOptimizer':
        opt = tf.train.AdadeltaOptimizer(kwargs['learning_rate'],
                                         kwargs['rho'])
    elif kwargs['optimizer'] == 'RMSPropOptimizer':
        decay_steps = int(kwargs['tr_num_examples'] / kwargs['batch_size'] *
                          kwargs['num_epochs_per_decay'])
        lr = tf.train.exponential_decay(kwargs['learning_rate'],
                                        global_step,
                                        decay_steps,
                                        kwargs['learning_rate_decay_factor'],
                                        staircase=True)
        opt = tf.train.RMSPropOptimizer(lr,
                                        kwargs['RMSPROP_DECAY'],
                                        momentum=kwargs['momentum'],
                                        epsilon=kwargs['RMSPROP_EPSILON'])
    else:
        logging.error('Hyperparameter "optimizer" was not provided!')
        raise KeyError('Hyperparameter "optimizer" was not provided!')
    logging.info('Selected Optimizer: {}'.format(kwargs['optimizer']))

    gpu_id = kwargs['gpus']
    if not isinstance(gpu_id, list):
        gpu_id = [gpu_id]
    gpu_id = gpu_id[0]

    with tf.device('/gpu:%d' % gpu_id):
        logging.info('Training on gpu:{}'.format(gpu_id))
        # Get endpoint / or get tensor from session.graph
        out_, train_eps_ = my_model_graph(train_data,
                                          restore_logits=False,
                                          is_training=True,
                                          reuse=None,
                                          scope=kwargs['model_name'],
                                          **kwargs)

        # Add loss operation
        loss_op = my_loss(out_, train_label, 'train', **kwargs)
        # train_metric_ops = tf.group(*[m(out_, train_label, 'train', **kwargs)
        #        for m in my_metric_ops])

        # Add loss-averages for training
        tr_loss_averages_op = metrics.add_loss_averages(
            tf.get_collection('train'), 'train_summaries')

        # Add learning rate to summary
        train_summaries = [tf.summary.scalar('learning_rate', lr)]
        train_summaries += tf.get_collection('train_summaries')

        # Calculate and apply selected gradients
        if kwargs['train_scopes']:
            ws = []
            # Find all parameters in the train scopes
            for tr_scope in kwargs['train_scopes']:
                logging.info('Add to training endpoints: {}'.format(tr_scope))
                with tf.variable_scope(tr_scope, reuse=True) as scope:
                    w_names = [
                        '/'.join(i.name.split('/')[1:])[:-2] for i in
                        tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES,
                                          scope=scope.name)
                    ]
                    ws += [tf.get_variable(w_name) for w_name in w_names]
                    for w_name in w_names:
                        logging.info('({})-parameter: {}'.format(
                            tr_scope, w_name))

            # Compute gradients for the selected parameters
            grads = opt.compute_gradients(loss_op, ws)
            apply_gradient_op = opt.apply_gradients(grads,
                                                    global_step=global_step)
        else:
            # Update all parameters
            logging.info('Adding all parameters to training endpoints')
            grads = opt.compute_gradients(loss_op)
            apply_gradient_op = opt.apply_gradients(grads,
                                                    global_step=global_step)

        # Get batchnorm moving mean and variance updates
        if 'UPDATE_OPS_COLLECTION' in kwargs:
            logging.debug('add batchnorm updates')
            batchnorm_updates = tf.get_collection(
                kwargs['UPDATE_OPS_COLLECTION'])
            batchnorm_updates_op = tf.group(*batchnorm_updates)

        # Add histograms for gradients.
        #for grad, var in grads:
        #    if grad is not None:
        #        train_summaries.append(
        #            tf.histogram_summary(var.op.name + '/gradients', grad))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than it needs to be, but
        # we keep it for backward compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            kwargs['MOVING_AVERAGE_DECAY'], global_step)

        # Update moving averages of all parameters
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)
        # Group all updates
        if 'UPDATE_OPS_COLLECTION' in kwargs:
            logging.debug('batchnorm updates in train_op')
            train_op = tf.group(apply_gradient_op, variables_averages_op,
                                batchnorm_updates_op)
        else:
            logging.debug('no batchnorm updates in train_op')
            train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Add evaluation graph after training step
        test_out_, _ = my_model_graph(valid_data,
                                      restore_logits=False,
                                      is_training=False,
                                      reuse=True,
                                      scope=kwargs['model_name'],
                                      **kwargs)

        # Add validation metrics and averages
        test_loss_op = my_loss(test_out_, valid_label, 'validation', **kwargs)
        # test_metric_ops = tf.group(*([m(test_out_, valid_label, 'validation', **kwargs)
        #        for m in my_metric_ops]))
        if my_metric_ops is not None:
            test_metric_ops_list = my_metric_ops(test_out_, valid_label,
                                                 'validation', **kwargs)

        # Add loss-averages for validation
        va_loss_averages_op = metrics.add_loss_averages(
            tf.get_collection('validation'), 'validation_summaries')

        validation_summaries = tf.get_collection('validation_summaries')

    # Build summary operation
    train_summary_op = tf.summary.merge(train_summaries)
    validation_summary_op = tf.summary.merge(validation_summaries)
    # summary_op = tf.merge_all_summaries()
    # Build an initialization operation to run below.
    init_op = tf.initialize_all_variables()

    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=kwargs['gpu_fraction'])

    # Define a session
    sess = tf.Session(config=tf.ConfigProto(
        gpu_options=gpu_options,
        allow_soft_placement=True,
        log_device_placement=kwargs['log_device_placement']))

    # initialize all variables
    sess.run(init_op)

    # restore checkpoint and create saver
    if kwargs['pretrained_checkpoint_dir']:
        ckpt = tf.train.get_checkpoint_state(
            kwargs['pretrained_checkpoint_dir'])
        ignore_missing_vars = True
        print(
            '----------------\nrestoring checkpoint: {} ignore_missing_vars={}'
            .format(ckpt.model_checkpoint_path, ignore_missing_vars))
        init_fn, _ = restore_checkpoint(
            sess,
            ckpt.model_checkpoint_path,
            var_list=tf.all_variables(),
            ignore_missing_vars=ignore_missing_vars,
            reshape_variables=False)
        init_fn(sess)
        print(
            'checkpoint restored: {} ignoring missing vars={}\n------------------------'
            .format(ckpt.model_checkpoint_path, ignore_missing_vars))
    # else:

    saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)

    # Start the queue runners.
    coord = None
    if train_iter.need_queue_runners() or valid_iter.need_queue_runners():
        logging.debug('Create coordinator, start queue runners...')
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    summary_writer = tf.summary.FileWriter(kwargs['output_dir'],
                                           graph=sess.graph)

    # Placeholders for the training and validation loss summaries; these are
    # written manually at a chosen rate, not for every batch or every epoch.
    tr_summary_op_train, tr_loss_placeholder = create_summary_op(
        loss_op.name.split(':')[0])  # loss_op.name
    if my_metric_ops is None:
        va_summary_op_train, va_loss_placeholder = create_summary_op(
            test_loss_op.name.split(':')[0])  # test_loss_op.name
    else:
        va_summary_op_train_list, va_loss_placeholder_list = [], []
        for cnt, m in enumerate(test_metric_ops_list):
            if cnt == 10:
                va_summary_op_train, va_loss_placeholder = create_summary_op(
                    'va/' + test_loss_op.name.split(':')[0] + '_average_' +
                    str(cnt))  # test_loss_op.name
            else:
                va_summary_op_train, va_loss_placeholder = create_summary_op(
                    'va/' + test_loss_op.name.split(':')[0] + '_channelid_' +
                    str(cnt))  # test_loss_op.name

            va_summary_op_train_list.append(va_summary_op_train)
            va_loss_placeholder_list.append(va_loss_placeholder)

    for step in range(kwargs['max_steps']):

        epoch_start = time.time()
        # Training step
        if step % 1 == 0:
            num_iter = int(
                math.ceil(
                    float(kwargs['tr_num_examples']) / kwargs['batch_size']))
            train_step = 0
            train_loss = []
            start_time = time.time()
            while train_step < num_iter and not should_stop(coord):
                train_iter.read_batch()
                train_data_feed = train_iter.get_data_batch()
                train_label_feed = train_iter.get_label_batch()
                if train_data_feed is not None and train_label_feed is not None:
                    # Merge data and label dicts
                    train_data.update(train_label)
                    train_data_feed.update(train_label_feed)
                    data_keys = train_data.keys()
                    data_feed_keys = train_data_feed.keys()
                    assert data_keys == data_feed_keys
                    train_loss_, _ = sess.run([loss_op, train_op],
                                              feed_dict={
                                                  train_data[k]:
                                                  train_data_feed[k]
                                                  for k in data_keys
                                              })
                    '''
                    print(train_data_feed['labels'].shape)
                    print(tmp_out['predictions'])
                    print(tmp_out['predictions'].shape)
                    print(train_loss_)
                    print(train_loss_.shape)
                    sys.exit()
                    '''
                else:
                    train_loss_, _ = sess.run([loss_op, train_op])
                assert not np.isnan(
                    train_loss_), 'Model diverged with training-loss = NaN'
                train_loss += [train_loss_]
                train_step += 1

            mean_loss_tr = np.mean(train_loss)

            summary_str = sess.run(
                tr_summary_op_train,
                feed_dict={tr_loss_placeholder: mean_loss_tr})
            summary_writer.add_summary(summary_str,
                                       (step * kwargs['tr_num_examples']))

            duration = time.time() - start_time
            examples_per_sec = kwargs['batch_size'] / float(duration)
            format_str = ('Epoch %d, tr_loss = %.5f (%.1f examples/sec; %.3f '
                          'sec/epoch)')
            logging.info(format_str %
                         (step, mean_loss_tr, examples_per_sec, duration))

        # Evaluation step
        evaluation_step = 1 if 'evaluation_step' not in kwargs else kwargs[
            'evaluation_step']
        if step % evaluation_step == 0:
            # sess.run(batchnorm_updates_op)
            num_iter = int(
                math.ceil(
                    float(kwargs['va_num_examples']) / kwargs['batch_size']))
            test_step = 0
            test_loss = []
            start_time = time.time()
            while test_step < num_iter and not should_stop(coord):
                valid_iter.read_batch()
                valid_data_feed = valid_iter.get_data_batch()
                valid_label_feed = valid_iter.get_label_batch()
                if valid_data_feed is not None and valid_label_feed is not None:
                    # Merge data and label dicts
                    valid_data.update(valid_label)
                    valid_data_feed.update(valid_label_feed)
                    data_keys = valid_data.keys()
                    data_feed_keys = valid_data_feed.keys()
                    assert data_keys == data_feed_keys
                    if my_metric_ops is None:
                        test_loss_ = sess.run([test_loss_op],
                                              feed_dict={
                                                  valid_data[k]:
                                                  valid_data_feed[k]
                                                  for k in data_keys
                                              })
                        assert not np.isnan(
                            test_loss_
                        ), 'Model diverged with validation-loss = NaN'
                    else:
                        test_loss_ = sess.run(test_metric_ops_list,
                                              feed_dict={
                                                  valid_data[k]:
                                                  valid_data_feed[k]
                                                  for k in data_keys
                                              })
                else:
                    if my_metric_ops is None:
                        test_loss_ = sess.run([test_loss_op])
                        assert not np.isnan(
                            test_loss_
                        ), 'Model diverged with validation-loss = NaN'
                    else:
                        test_loss_ = sess.run(test_metric_ops_list)

                test_loss += [test_loss_]
                test_step += 1

            if my_metric_ops is None:
                mean_loss_va = np.mean(test_loss)

                summary_str = sess.run(
                    va_summary_op_train,
                    feed_dict={va_loss_placeholder: mean_loss_va})
                summary_writer.add_summary(summary_str,
                                           (step * kwargs['tr_num_examples']))

                duration = time.time() - start_time
                examples_per_sec = kwargs['batch_size'] / float(duration)
                format_test_str = (
                    'Epoch %d, va_loss = %.5f (%.1f examples/sec, %.3f '
                    'sec/epoch)')
                logging.info(format_test_str %
                             (step, mean_loss_va, examples_per_sec, duration))
            else:
                test_loss = np.array(test_loss)
                test_loss = np.mean(test_loss, axis=0)

                for cnt, l in enumerate(test_loss):
                    summary_str = sess.run(
                        va_summary_op_train_list[cnt],
                        feed_dict={va_loss_placeholder_list[cnt]: l})
                    summary_writer.add_summary(
                        summary_str, (step * kwargs['tr_num_examples']))

                duration = time.time() - start_time
                examples_per_sec = kwargs['batch_size'] / float(duration)
                format_test_str = (
                    'Epoch %d, va_loss_total = %.5f (%.1f examples/sec, %.3f '
                    'sec/epoch)')
                logging.info(format_test_str %
                             (step, test_loss[0], examples_per_sec, duration))

        # IMAGE SUMMARIES
        summary_step = 1 if 'summary_step' not in kwargs else kwargs[
            'summary_step']
        if step % summary_step == 0:
            logging.debug('Add summary string...')
            if train_data_feed is not None and train_label_feed is not None:
                # Run all output operations and summary ops

                out_.update({'train_summary_op': train_summary_op})
                out = sess.run(out_,
                               feed_dict={
                                   train_data[k]: train_data_feed[k]
                                   for k in data_keys
                               })
                # summary_str = out['train_summary_op']

                # Add image summaries
                if 'train_image_summary' in kwargs:
                    out.update(train_data_feed)
                    out.update({'step': step, 'mode': 'train'})
                    img_logs = kwargs['train_image_summary'](kwargs, **out)

                    list_of_log_images = []
                    for train_output_to_log, name in img_logs:
                        list_of_log_images.append(train_output_to_log)

                    feed = {train_image_to_log: np.array(list_of_log_images)}
                    train_image_summary_str = sess.run(log_image_train,
                                                       feed_dict=feed)
                    summary_writer.add_summary(train_image_summary_str)

            else:
                # print("should not happen")
                # sys.exit()
                # Run all output operations and summary ops
                out_.update({'train_summary_op': train_summary_op})
                # Add input data and labels to this run
                out_.update(train_data)
                out_.update(train_label)
                out = sess.run(out_)

                if 'train_image_summary' in kwargs:
                    # Add current step and mode to image summary function input
                    out.update({'step': step, 'mode': 'train'})
                    img_logs = kwargs['train_image_summary'](kwargs, **out)

                    list_of_log_images = []
                    for train_output_to_log, name in img_logs:
                        list_of_log_images.append(train_output_to_log)

                    feed = {train_image_to_log: np.array(list_of_log_images)}
                    train_image_summary_str = sess.run(log_image_train,
                                                       feed_dict=feed)
                    summary_writer.add_summary(train_image_summary_str)
                    '''
                    for train_output_to_log, name in img_logs:
                        feed = {
                            test_image_to_log: train_output_to_log, log_image_name: name}
                        train_image_summary_str = sess.run(
                            log_image, feed_dict=feed)
                        summary_writer.add_summary(train_image_summary_str)
                    '''
            if valid_data_feed is not None and valid_label_feed is not None:

                test_out_.update(
                    {'validation_summary_op': validation_summary_op})
                out = sess.run(test_out_,
                               feed_dict={
                                   valid_data[k]: valid_data_feed[k]
                                   for k in data_keys
                               })

                # Add image summaries
                if 'validation_image_summary' in kwargs:
                    out.update(valid_data_feed)
                    out.update({'step': step, 'mode': 'validation'})
                    img_logs = kwargs['validation_image_summary'](kwargs,
                                                                  **out)

                    list_of_log_images = []
                    for test_output_to_log, name in img_logs:
                        list_of_log_images.append(test_output_to_log)

                    feed = {test_image_to_log: np.array(list_of_log_images)}
                    test_image_summary_str = sess.run(log_image_test,
                                                      feed_dict=feed)
                    summary_writer.add_summary(test_image_summary_str)

            else:
                # print("should not happen")
                # sys.exit()
                # Run all output operations and the summary ops
                test_out_.update(
                    {'validation_summary_op': validation_summary_op})
                # Add input data and labels to summary run
                test_out_.update(valid_data)
                test_out_.update(valid_label)
                out = sess.run(test_out_)

                if 'validation_image_summary' in kwargs:
                    out.update({'step': step, 'mode': 'validation'})
                    img_logs = kwargs['validation_image_summary'](kwargs,
                                                                  **out)

                    list_of_log_images = []
                    for test_output_to_log, name in img_logs:
                        list_of_log_images.append(test_output_to_log)

                    feed = {test_image_to_log: np.array(list_of_log_images)}
                    test_image_summary_str = sess.run(log_image_test,
                                                      feed_dict=feed)
                    summary_writer.add_summary(test_image_summary_str)

        # Save the model checkpoint periodically.
        save_step = 1 if 'save_step' not in kwargs else kwargs['save_step']
        if (step % save_step == 0 or
            (step + 1) == kwargs['max_steps']) and step != 0:
            logging.info('Saving checkpoint to: {}, step: {}'.format(
                kwargs['output_dir'], step))
            checkpoint_path = os.path.join(kwargs['output_dir'],
                                           'new-model.ckpt')
            saver.save(sess, checkpoint_path, global_step=global_step)

        logging.info('Time per Epoch: {}'.format(time.time() - epoch_start))

    # Join threads and close session
    if train_iter.need_queue_runners() or valid_iter.need_queue_runners():
        logging.debug('request coordinator stop, joining threads...')
        coord.request_stop()
        coord.join(threads)
        sess.close()
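The helpers should_stop and create_summary_op used in the loop above are not part of this listing; the loop only needs them to (a) check a possibly missing coordinator and (b) return a scalar summary op driven by a placeholder. A minimal sketch of what they could look like (an assumption about their behaviour, not the original implementation):

import tensorflow as tf

def should_stop(coord):
    # A missing coordinator (no queue runners in use) means "never stop".
    return coord is not None and coord.should_stop()

def create_summary_op(tag):
    # Scalar placeholder fed from Python (e.g. with a mean loss over one
    # epoch) plus the summary op that records it under `tag`.
    loss_placeholder = tf.placeholder(tf.float32, shape=[])
    summary_op = tf.summary.scalar(tag, loss_placeholder)
    return summary_op, loss_placeholder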
Пример #17
0
def batch_normalization(
        x,
        training,
        name="batch_normalization",
        decay=0.99,
        epsilon=1e-5,
        global_norm=True):
    # Get input shape as python list.
    shape = x.get_shape().as_list()

    if global_norm:
        # Channel-wise statistics.
        size = shape[-1:]
        axes = list(range(len(shape)-1))
        keep_dims = False
    else:
        # Pixel-wise statistics.
        size = [1] + shape[1:]
        axes = [0]
        keep_dims = True

    with tf.variable_scope(name):
        beta = tf.get_variable(
            name="beta",
            shape=size,
            initializer=tf.constant_initializer(0.0),
        )
        gamma = tf.get_variable(
            name="gamma",
            shape=size,
            initializer=tf.random_normal_initializer(1.0, 0.02),
        )
        moving_mean = tf.get_variable(
            name="moving_mean",
            shape=size,
            initializer=tf.constant_initializer(0.0),
            trainable=False,
        )
        moving_var = tf.get_variable(
            name="moving_var",
            shape=size,
            initializer=tf.constant_initializer(1.0),
            trainable=False,
        )

        # Add moving vars to the tf collection.
        # The list of moving vars can be obtained with
        # tf.moving_average_variables().
        if moving_mean not in tf.moving_average_variables():
            collection = tf.GraphKeys.MOVING_AVERAGE_VARIABLES
            tf.add_to_collection(collection, moving_mean)
            tf.add_to_collection(collection, moving_var)

        def train_mode():
            # execute at training time
            batch_mean, batch_var = tf.nn.moments(
                                        x,
                                        axes=axes,
                                        keep_dims=keep_dims,
                                    )
            update_mean = tf.assign_sub(
                moving_mean, (1-decay) * (moving_mean-batch_mean)
            )
            update_var = tf.assign_sub(
                moving_var, (1-decay) * (moving_var-batch_var)
            )

            # Automatically update global means and variances.
            with tf.control_dependencies([update_mean, update_var]):
                return tf.nn.batch_normalization(
                            x, batch_mean, batch_var, beta, gamma, epsilon)

        def test_mode():
            # execute at test time
            return tf.nn.batch_normalization(
                       x, moving_mean, moving_var, beta, gamma, epsilon)

        return tf.cond(training, train_mode, test_mode)
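A short usage sketch for the layer above, showing that the moving statistics it registers end up in tf.moving_average_variables() (the input shape, scope name and placeholder setup here are illustrative):

import tensorflow as tf

# Boolean tensor switching between the train_mode and test_mode branches of
# the tf.cond inside batch_normalization.
is_training = tf.placeholder(tf.bool, shape=[], name="is_training")
x = tf.placeholder(tf.float32, shape=[None, 32, 32, 16], name="x")

y = batch_normalization(x, is_training, name="bn1")

# "bn1/moving_mean" and "bn1/moving_var" were added to
# tf.GraphKeys.MOVING_AVERAGE_VARIABLES by the layer, so both are listed here.
for var in tf.moving_average_variables():
    print(var.op.name)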
Пример #18
0
def main():
    """Train on dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = 100
        num_epochs_per_decay = 5
        decay_steps = int(num_batches_per_epoch * num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.AdamOptimizer(lr)

        # Set the number of preprocessing threads
        num_preprocess_threads = FLAGS.num_preprocess_threads

        with h5py.File("../data/training_celeba_FaceTracker.h5") as hf:
            _images_train = hf["training_data"][:10]
            _landmarks_train = hf["training_landmarks"][:10]
            mean_landmarks = hf["mean_landmarks"][:]

        with h5py.File("../data/validation_celeba_FaceTracker.h5") as hf:
            _images_val = hf["validation_data"][:10]
            _landmarks_val = hf["validation_landmarks"][:10]

        # Load the mean vector and std of (true_landmark - perturbed_landmark)
        try:
            delta_mean = np.load("../data/delta_mean.npy")
            delta_std = np.load("../data/delta_std.npy")
        except IOError:
            delta_mean, delta_std = get_perturbation_statistics()

        image_shape = _images_train[0].shape
        lms_shape = _landmarks_train[0].shape

        def get_random_sample():
            idx = np.random.randint(0, len(_images_train))
            shape = _landmarks_train[idx].astype("float32")
            initial_shape = sample_perturbation(shape, mean_landmarks).astype("float32")
            # plt.imshow(_images_train[idx][:, :, 0], cmap="gray")
            # plt.scatter(shape[:, 0], shape[:, 1], c="g")
            # plt.scatter(initial_shape[:, 0], initial_shape[:, 1], c="r")
            # plt.show()
            # plt.clf()
            # plt.close()
            return _images_train[idx].astype("float32"), shape, initial_shape

        image, shape, initial_shape = tf.py_func(get_random_sample, [],
                                                 [tf.float32, tf.float32, tf.float32], name="random_sample_train")
        image.set_shape(image_shape)
        shape.set_shape(lms_shape)
        initial_shape.set_shape(lms_shape)

        images, lms, inits = tf.train.batch([image, shape, initial_shape],
                                            FLAGS.batch_size,
                                            dynamic_pad=False,
                                            capacity=1000,
                                            enqueue_many=False,
                                            num_threads=num_preprocess_threads,
                                            name='train_img_batch')

        def get_random_sample_val():
            idx = np.random.randint(0, len(_images_val))
            shape = _landmarks_val[idx].astype("float32")
            initial_shape = sample_perturbation(shape, mean_landmarks).astype("float32")
            return _images_val[idx].astype("float32"), shape, initial_shape

        image_val, shape_val, initial_shape_val = tf.py_func(get_random_sample_val, [],
                                                             [tf.float32, tf.float32, tf.float32],
                                                             name="random_sample_val")
        image_val.set_shape(image_shape)
        shape_val.set_shape(lms_shape)
        initial_shape_val.set_shape(lms_shape)

        images_val, lms_val, inits_val = tf.train.batch([image_val, shape_val, initial_shape_val],
                                                        FLAGS.batch_size,
                                                        dynamic_pad=False,
                                                        capacity=1000,
                                                        enqueue_many=False,
                                                        num_threads=num_preprocess_threads,
                                                        name='val_img_batch')

        print('Defining model...')
        with tf.device(FLAGS.train_device):
            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, "")
            with tf.variable_scope("scopernn") as scopernn:
                predictions, dxs, _ = models.model(images, inits, is_training=True)
                scopernn.reuse_variables()
                predictions_val, dxs_val, _ = models.model(images_val, inits_val, is_training=False)

            total_loss_train = 0
            total_loss_val = 0
            list_train_loss, list_val_loss = [], []

            loss_weights = [1, 1, 1, 1]
            with tf.name_scope("Error_train"):
                for i, dx in enumerate(dxs):
                    loss_norm, loss = models.normalized_rmse(inits, dx, lms, delta_mean, delta_std)
                    tf.histogram_summary('errors', loss)
                    list_train_loss.append(loss)
                    total_loss_train += loss_norm * loss_weights[i]
                    summaries.append(tf.scalar_summary('losses_train/step_{}'.format(i),
                                                       loss))
            with tf.name_scope("Error_val"):
                for i, dx in enumerate(dxs_val):
                    loss_norm_val, loss_val = models.normalized_rmse(inits_val, dx, lms_val, delta_mean, delta_std)
                    tf.histogram_summary('errors', loss_val)
                    list_val_loss.append(loss_val)
                    total_loss_val += loss_norm_val * loss_weights[i]
                    summaries.append(tf.scalar_summary('losses_val/step_{}'.format(i),
                                                       loss_val))

            # Calculate the gradients for the batch of data
            grads = opt.compute_gradients(total_loss_train)

        summaries.append(tf.scalar_summary('losses/total_train', total_loss_train))
        summaries.append(tf.scalar_summary('losses/total_val', total_loss_val))

        gt_images_val, = tf.py_func(utils.batch_draw_landmarks_green, [images_val, lms_val],
                                    [tf.float32], name="gt_img_visu")
        init_images_val, = tf.py_func(utils.batch_draw_landmarks_red, [images_val, inits_val],
                                      [tf.float32], name="init_img_visu")
        pred_images_val, = tf.py_func(utils.batch_draw_landmarks_green,
                                      [images_val, predictions_val], [tf.float32], name="pred_img_visu")

        summary = tf.image_summary('images_val',
                                   tf.concat(2, [gt_images_val, init_images_val, pred_images_val]),
                                   max_images=8)
        summaries.append(tf.histogram_summary('dx_train', predictions - inits))
        summaries.append(tf.histogram_summary('dx_val', predictions_val - inits_val))

        summaries.append(summary)

        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                              "")

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(tf.histogram_summary(var.op.name +
                                                      '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than it needs to be, but
        # we keep it for backward compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        # NOTE: Currently we are not using batchnorm in MDM.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)
        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        print('Initializing variables...')
        sess.run(init)
        print('Initialized variables.')

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        #################
        # APP
        #################
        cap = cv2.VideoCapture(0)

        mode = 0  # detect
        shape = []
        init_shape = []
        print('\n\nPRESS q/Q to QUIT\n')

        while True:
            # Capture frame-by-frame
            ret, frame = cap.read()
            if ret is True:
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

                # detect face with Haar cascade
                if mode == 0:
                    faces = face_cascade.detectMultiScale(gray, 1.1, 3)
                    if len(faces) == 0:
                        continue

                # Face detection successful. Start tracking!

                if mode == 0:
                    init_shape = compute_init_shape(mean_landmarks, faces)
                else:
                    # Need to realign init_shape with mean shape
                    a,b,tx,ty = utils.CalcSimT(np.ravel(mean_landmarks, order='F'), init_shape.ravel('F'))
                    init_shape = utils.SimT(np.ravel(mean_landmarks, order='F'), a, b, tx, ty)
                    init_shape = np.reshape(init_shape, (5, 2), order='F')

                leyex, leyey = init_shape[0]
                reyex, reyey = init_shape[1]

                ax, bx = 44. / (reyex - leyex), 44. * (1 - leyex / (reyex - leyex))
                ay, by = 44. / (reyex - leyex), 44. * (1 - leyey / (reyex - leyex))

                # # # # Format image to 128 x 128 and rescale the init shape
                gray_cropped, init_cropped = format_img(gray, init_shape.copy())
                gcc = gray_cropped.copy()

                gray_cropped = gray_cropped.reshape((1, 128, 128, 1)).astype(np.float32)
                init_cropped = init_cropped.reshape((1, 5, 2))

                # import matplotlib.pylab as plt
                # # # img = cv2.imread("000091.jpg", 0)
                # # # img = img[40:170, 40:150]
                # # # img = cv2.resize(img, (128, 128), interpolation=cv2.INTER_AREA)
                # # # gray_cropped = img.reshape((1, 128, 128, 1)) / 255.
                # bla = _images_train[4].reshape((1, 128, 128, 1))

                # plt.imshow(bla[0, :, :, 0], cmap="gray")
                # plt.scatter(init_cropped[0, :, 0], init_cropped[0, :, 1])
                # plt.scatter(_landmarks_train[4, :, 0], _landmarks_train[4, :, 1], color="green", s=40)
                # preds = sess.run(predictions_val, feed_dict={images_val:bla,
                #                                              inits_val:init_cropped})
                # plt.scatter(preds[0, :, 0], preds[0, :, 1], color="red")
                # plt.show()
                # raw_input()

                preds = sess.run(predictions_val, feed_dict={images_val:gray_cropped,
                                                             inits_val:init_cropped})
                preds = preds[0]
                # # # Convert preds to the big image scale
                preds[:,0] = (preds[:, 0] - bx) / (ax)
                preds[:,1] = (preds[:, 1] - by) / (ay)

                mode = 1

                # if len(faces) != 0:
                #     for (x, y, w, h) in faces:
                #         cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

                # if init_shape.shape[0] > 0:
                #     for k in range(5):
                #         cv2.circle(frame,(int(init_shape[k,0]),int(init_shape[k,1])),2,(0,0,255),-1)

                # if init_cropped != []:
                #     for k in range(5):
                #         cv2.circle(gcc,(int(init_cropped[0][k,0]),int(init_cropped[0][k,1])),2,(0,0,255),-1)
                #         # cv2.circle(gcc,(int(preds[k,0]),int(preds[k,1])),2,(0,0,255),-1)

            # if len(faces) == 0:
            #     return shape, head_pose, score, mode, track_time

                if mode != 0:
                    generate_overlay(frame,
                                     preds,
                                     init_shape)
                    cv2.imshow('Face Tracker (q/Q: Quit)', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
                    # for next frame
                    init_shape = preds
                else:
                    cv2.imshow('Face Tracker (q/Q: Quit)', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break

            else:

                break

        # When everything done, release the capture
        cap.release()
        cv2.destroyAllWindows()
Пример #19
0
    def _make_graph(self):
        self.logger.info("Generating training graph on {} GPUs ...".format(
            self.cfg.nr_gpus))

        weights_initializer = slim.xavier_initializer()
        biases_initializer = tf.constant_initializer(0.)
        biases_regularizer = tf.no_regularizer
        weights_regularizer = tf.contrib.layers.l2_regularizer(
            self.cfg.weight_decay)

        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(self.cfg.nr_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('tower_%d' % i) as name_scope:
                        # Force all Variables to reside on the CPU.
                        with slim.arg_scope(
                            [slim.model_variable, slim.variable],
                                device='/device:CPU:0'):
                            with slim.arg_scope([slim.conv2d, slim.conv2d_in_plane, \
                                                 slim.conv2d_transpose, slim.separable_conv2d,
                                                 slim.fully_connected],
                                                weights_regularizer=weights_regularizer,
                                                biases_regularizer=biases_regularizer,
                                                weights_initializer=weights_initializer,
                                                biases_initializer=biases_initializer):
                                # loss over single GPU
                                self.net.make_network(is_train=True)
                                if i == self.cfg.nr_gpus - 1:
                                    loss = self.net.get_loss(include_wd=True)
                                else:
                                    loss = self.net.get_loss()
                                self._input_list.append(self.net.get_inputs())

                        tf.get_variable_scope().reuse_variables()

                        if i == 0:
                            if self.cfg.nr_gpus > 1 and self.cfg.bn_train is True:
                                self.logger.warning(
                                    "BN is calculated only on single GPU.")
                            extra_update_ops = tf.get_collection(
                                tf.GraphKeys.UPDATE_OPS, name_scope)
                            with tf.control_dependencies(extra_update_ops):
                                grads = self._optimizer.compute_gradients(loss)
                        else:
                            grads = self._optimizer.compute_gradients(loss)
                        final_grads = []
                        with tf.variable_scope('Gradient_Mult') as scope:
                            for grad, var in grads:
                                scale = 1.
                                if self.cfg.double_bias and '/biases:' in var.name:
                                    scale *= 2.
                                if not np.allclose(scale, 1.):
                                    grad = tf.multiply(grad, scale)
                                final_grads.append((grad, var))
                        tower_grads.append(final_grads)

        if len(tower_grads) > 1:
            grads = sum_gradients(tower_grads)
        else:
            grads = tower_grads[0]

        if False:
            variable_averages = tf.train.ExponentialMovingAverage(0.9999)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            variables_averages_op = variable_averages.apply(
                variables_to_average)

            apply_gradient_op = self._optimizer.apply_gradients(grads)
            train_op = tf.group(apply_gradient_op, variables_averages_op,
                                *extra_update_ops)
        else:
            apply_gradient_op = self._optimizer.apply_gradients(grads)
            train_op = tf.group(apply_gradient_op, *extra_update_ops)

        return train_op
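sum_gradients above is an external helper that merges the per-tower gradient lists into one list of (gradient, variable) pairs. A sketch of the usual pattern for such a helper (an assumption, not necessarily the author's version):

import tensorflow as tf

def sum_gradients(tower_grads):
    # tower_grads: one entry per GPU, each a list of (gradient, variable)
    # pairs from compute_gradients. Variables are shared across towers, so
    # the k-th pair of every tower refers to the same variable.
    summed = []
    for grads_and_vars in zip(*tower_grads):
        var = grads_and_vars[0][1]
        grads = [g for g, _ in grads_and_vars if g is not None]
        summed.append((tf.add_n(grads) if grads else None, var))
    return summed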
Пример #20
0
def train(scope=''):
    """Train on dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        train_dirs = FLAGS.datasets.split(':')

        # Calculate the learning rate schedule.
        num_batches_per_epoch = 100
        num_epochs_per_decay = 5
        decay_steps = int(num_batches_per_epoch * num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.AdamOptimizer(lr)

        # Override the number of preprocessing threads to account for the increased
        # number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads

        _images, _shapes, _reference_shape, pca_model = \
            data_provider.load_images(train_dirs)

        reference_shape = tf.constant(_reference_shape,
                                      dtype=tf.float32,
                                      name='reference_shape')

        image_shape = _images[0].shape
        lms_shape = _shapes[0].points.shape

        def get_random_sample(rotation_stddev=10):
            idx = np.random.randint(low=0, high=len(_images))
            im = menpo.image.Image(_images[idx].transpose(2, 0, 1), copy=False)
            lms = _shapes[idx]
            im.landmarks['PTS'] = lms
            if np.random.rand() < .5:
                im = utils.mirror_image(im)

            if np.random.rand() < .5:
                theta = np.random.normal(scale=rotation_stddev)
                rot = menpo.transform.rotate_ccw_about_centre(lms, theta)
                im = im.warp_to_shape(im.shape, rot)

            pixels = im.pixels.transpose(1, 2, 0).astype('float32')
            shape = im.landmarks['PTS'].lms.points.astype('float32')
            return pixels, shape

        image, shape = tf.py_func(get_random_sample, [],
                                  [tf.float32, tf.float32])

        initial_shape = data_provider.random_shape(shape, reference_shape,
                                                   pca_model)
        image.set_shape(image_shape)
        shape.set_shape(lms_shape)
        initial_shape.set_shape(lms_shape)

        image = data_provider.distort_color(image)

        images, lms, inits = tf.train.batch([image, shape, initial_shape],
                                            FLAGS.batch_size,
                                            dynamic_pad=False,
                                            capacity=5000,
                                            enqueue_many=False,
                                            num_threads=num_preprocess_threads,
                                            name='batch')
        print('Defining model...')
        with tf.device(FLAGS.train_device):
            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
            predictions, dxs, _ = mdm_model.model(images, inits)

            total_loss = 0

            for i, dx in enumerate(dxs):
                norm_error = mdm_model.normalized_rmse(dx + inits, lms)
                tf.histogram_summary('errors', norm_error)
                loss = tf.reduce_mean(norm_error)
                total_loss += loss
                summaries.append(tf.scalar_summary('losses/step_{}'.format(i),
                                                   loss))

            # Calculate the gradients for the batch of data
            grads = opt.compute_gradients(total_loss)

        summaries.append(tf.scalar_summary('losses/total', total_loss))
        pred_images, = tf.py_func(utils.batch_draw_landmarks,
                                  [images, predictions], [tf.float32])
        gt_images, = tf.py_func(utils.batch_draw_landmarks, [images, lms],
                                [tf.float32])

        summary = tf.image_summary('images',
                                   tf.concat(2, [gt_images, pred_images]),
                                   max_images=5)
        summaries.append(tf.histogram_summary('dx', predictions - inits))

        summaries.append(summary)

        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                              scope)

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(tf.histogram_summary(var.op.name +
                                                      '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than need be but we employ
        # this for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)
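        # Each run of this op updates the shadow copies as
        #   shadow = decay * shadow + (1 - decay) * variable,
        # where the effective decay is min(MOVING_AVERAGE_DECAY,
        # (1 + step) / (10 + step)) because global_step was passed as
        # num_updates above.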

        # Group all updates into a single train op.
        # NOTE: Currently we are not using batchnorm in MDM.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)
        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        print('Initializing variables...')
        sess.run(init)
        print('Initialized variables.')

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        print('Starting training...')
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, total_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            if step % 10 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 50 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
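        # At evaluation time the shadow (averaged) weights are usually the ones
        # restored. A minimal sketch, assuming the same MOVING_AVERAGE_DECAY and
        # a checkpoint written by the saver above:
        #   ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
        #   eval_saver = tf.train.Saver(ema.variables_to_restore())
        #   eval_saver.restore(sess, checkpoint_path)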
Example #21
0
    def __init__(self):
        """
        Initialize the bisenetv2 trainer
        """
        # define solver params and dataset
        self._carla_io = carla_tf_io.CarlaTfIO()
        self._train_dataset = self._carla_io.train_dataset_reader
        self._steps_per_epoch = len(self._train_dataset)

        self._model_name = CFG.MODEL.MODEL_NAME

        self._train_epoch_nums = CFG.TRAIN.EPOCH_NUMS
        self._batch_size = CFG.TRAIN.BATCH_SIZE
        self._snapshot_epoch = CFG.TRAIN.SNAPSHOT_EPOCH
        self._model_save_dir = ops.join(CFG.TRAIN.MODEL_SAVE_DIR,
                                        self._model_name)
        self._tboard_save_dir = ops.join(CFG.TRAIN.TBOARD_SAVE_DIR,
                                         self._model_name)
        self._enable_miou = CFG.TRAIN.COMPUTE_MIOU.ENABLE
        if self._enable_miou:
            self._record_miou_epoch = CFG.TRAIN.COMPUTE_MIOU.EPOCH
        self._input_tensor_size = [
            int(tmp / 2) for tmp in CFG.AUG.TRAIN_CROP_SIZE
        ]

        self._init_learning_rate = CFG.SOLVER.LR
        self._moving_ave_decay = CFG.SOLVER.MOVING_AVE_DECAY
        self._momentum = CFG.SOLVER.MOMENTUM
        self._lr_polynimal_decay_power = CFG.SOLVER.LR_POLYNOMIAL_POWER
        self._optimizer_mode = CFG.SOLVER.OPTIMIZER.lower()

        if CFG.TRAIN.RESTORE_FROM_SNAPSHOT.ENABLE:
            self._initial_weight = CFG.TRAIN.RESTORE_FROM_SNAPSHOT.SNAPSHOT_PATH
        else:
            self._initial_weight = None
        if CFG.TRAIN.WARM_UP.ENABLE:
            self._warmup_epoches = CFG.TRAIN.WARM_UP.EPOCH_NUMS
            self._warmup_init_learning_rate = self._init_learning_rate / 1000.0
        else:
            self._warmup_epoches = 0

        # define tensorflow session
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.GPU.GPU_MEMORY_FRACTION
        sess_config.gpu_options.allow_growth = CFG.GPU.TF_ALLOW_GROWTH
        sess_config.gpu_options.allocator_type = 'BFC'
        self._sess = tf.Session(config=sess_config)

        # define graph input tensor
        with tf.variable_scope(name_or_scope='graph_input_node'):
            self._input_src_image, self._input_label_image = self._train_dataset.next_batch(
                batch_size=self._batch_size)

        # define model loss
        self._model = bisenet_v2.BiseNetV2(phase='train', cfg=CFG)
        loss_set = self._model.compute_loss(
            input_tensor=self._input_src_image,
            label_tensor=self._input_label_image,
            name='BiseNetV2',
            reuse=False)
        self._prediciton = self._model.inference(
            input_tensor=self._input_src_image, name='BiseNetV2', reuse=True)
        self._loss = loss_set['total_loss']
        self._l2_loss = loss_set['l2_loss']

        # define miou
        if self._enable_miou:
            with tf.variable_scope('miou'):
                pred = tf.reshape(self._prediciton, [
                    -1,
                ])
                gt = tf.reshape(self._input_label_image, [
                    -1,
                ])
                indices = tf.squeeze(
                    tf.where(tf.less_equal(gt, CFG.DATASET.NUM_CLASSES - 1)),
                    1)
                gt = tf.gather(gt, indices)
                pred = tf.gather(pred, indices)
                self._miou, self._miou_update_op = tf.metrics.mean_iou(
                    labels=gt,
                    predictions=pred,
                    num_classes=CFG.DATASET.NUM_CLASSES)

        # define learning rate
        with tf.variable_scope('learning_rate'):
            self._global_step = tf.Variable(1.0,
                                            dtype=tf.float32,
                                            trainable=False,
                                            name='global_step')
            warmup_steps = tf.constant(self._warmup_epoches *
                                       self._steps_per_epoch,
                                       dtype=tf.float32,
                                       name='warmup_steps')
            train_steps = tf.constant(self._train_epoch_nums *
                                      self._steps_per_epoch,
                                      dtype=tf.float32,
                                      name='train_steps')
            self._learn_rate = tf.cond(
                pred=self._global_step < warmup_steps,
                true_fn=lambda: self._compute_warmup_lr(
                    warmup_steps=warmup_steps, name='warmup_lr'),
                false_fn=lambda: tf.train.polynomial_decay(
                    learning_rate=self._init_learning_rate,
                    global_step=self._global_step,
                    decay_steps=train_steps,
                    end_learning_rate=0.000001,
                    power=self._lr_polynimal_decay_power))
            self._learn_rate = tf.identity(self._learn_rate, 'lr')
            global_step_update = tf.assign_add(self._global_step, 1.0)

        # define moving average op
        with tf.variable_scope(name_or_scope='moving_avg'):
            if CFG.TRAIN.FREEZE_BN.ENABLE:
                train_var_list = [
                    v for v in tf.trainable_variables()
                    if 'beta' not in v.name and 'gamma' not in v.name
                ]
            else:
                train_var_list = tf.trainable_variables()
            moving_ave_op = tf.train.ExponentialMovingAverage(
                self._moving_ave_decay).apply(train_var_list +
                                              tf.moving_average_variables())
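            # Including tf.moving_average_variables() means the batch-norm
            # moving_mean / moving_variance statistics are themselves smoothed
            # by this EMA, in addition to the trainable weights.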

        # define training op
        with tf.variable_scope(name_or_scope='train_step'):
            if CFG.TRAIN.FREEZE_BN.ENABLE:
                train_var_list = [
                    v for v in tf.trainable_variables()
                    if 'beta' not in v.name and 'gamma' not in v.name
                ]
            else:
                train_var_list = tf.trainable_variables()
            if self._optimizer_mode == 'sgd':
                optimizer = tf.train.MomentumOptimizer(
                    learning_rate=self._learn_rate, momentum=self._momentum)
            elif self._optimizer_mode == 'adam':
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self._learn_rate, )
            else:
                raise ValueError('Not support optimizer: {:s}'.format(
                    self._optimizer_mode))
            optimize_op = optimizer.minimize(self._loss,
                                             var_list=train_var_list)
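            # The nested control dependencies below make train_op (a no_op)
            # depend on the UPDATE_OPS (batch-norm statistics), the optimizer
            # step, the global-step increment and the moving-average op, so a
            # single run of train_op executes all of them.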
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                with tf.control_dependencies([optimize_op,
                                              global_step_update]):
                    with tf.control_dependencies([moving_ave_op]):
                        self._train_op = tf.no_op()

        # define saver and loader
        with tf.variable_scope('loader_and_saver'):
            self._net_var = [
                vv for vv in tf.global_variables() if 'lr' not in vv.name
            ]
            self._loader = tf.train.Saver(self._net_var)
            self._saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        # define summary
        with tf.variable_scope('summary'):
            summary_merge_list = [
                tf.summary.scalar("learn_rate", self._learn_rate),
                tf.summary.scalar("total", self._loss),
                tf.summary.scalar('l2_loss', self._l2_loss)
            ]
            if self._enable_miou:
                with tf.control_dependencies([self._miou_update_op]):
                    summary_merge_list_with_miou = [
                        tf.summary.scalar("learn_rate", self._learn_rate),
                        tf.summary.scalar("total", self._loss),
                        tf.summary.scalar('l2_loss', self._l2_loss),
                        tf.summary.scalar('miou', self._miou)
                    ]
                    self._write_summary_op_with_miou = tf.summary.merge(
                        summary_merge_list_with_miou)
            if ops.exists(self._tboard_save_dir):
                shutil.rmtree(self._tboard_save_dir)
            os.makedirs(self._tboard_save_dir, exist_ok=True)
            model_params_file_save_path = ops.join(
                self._tboard_save_dir, CFG.TRAIN.MODEL_PARAMS_CONFIG_FILE_NAME)
            with open(model_params_file_save_path, 'w',
                      encoding='utf-8') as f_obj:
                CFG.dump_to_json_file(f_obj)
            self._write_summary_op = tf.summary.merge(summary_merge_list)
            self._summary_writer = tf.summary.FileWriter(
                self._tboard_save_dir, graph=self._sess.graph)

        LOG.info('Initialize carla bisenetv2 trainer complete')
    def test_restore_ema(self):
        
        # Create 100 phony x, y data points in NumPy, y = x * 0.1 + 0.3
        x_data = np.random.rand(100).astype(np.float32)
        y_data = x_data * 0.1 + 0.3
        
        # Try to find values for W and b that compute y_data = W * x_data + b
        # (We know that W should be 0.1 and b 0.3, but TensorFlow will
        # figure that out for us.)
        W = tf.Variable(tf.random_uniform([1], -1.0, 1.0), name='W')
        b = tf.Variable(tf.zeros([1]), name='b')
        y = W * x_data + b
        
        # Minimize the mean squared errors.
        loss = tf.reduce_mean(tf.square(y - y_data))
        optimizer = tf.train.GradientDescentOptimizer(0.5)
        opt_op = optimizer.minimize(loss)

        # Track the moving averages of all trainable variables.
        ema = tf.train.ExponentialMovingAverage(decay=0.9999)
        averages_op = ema.apply(tf.trainable_variables())
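        # apply() creates shadow variables named 'W/ExponentialMovingAverage'
        # and 'b/ExponentialMovingAverage'; the restore step below maps those
        # checkpoint entries back onto W and b.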
        with tf.control_dependencies([opt_op]):
            train_op = tf.group(averages_op)
  
        # Before starting, initialize the variables.  We will 'run' this first.
        init = tf.global_variables_initializer()

        # Save all variables so the EMA shadow variables end up in the
        # checkpoint; they are what the restore step below looks up by name.
        saver = tf.train.Saver(tf.global_variables())
        
        # Launch the graph.
        sess = tf.Session()
        sess.run(init)
        
        # Fit the line.
        for _ in range(201):
            sess.run(train_op)
        
        w_reference = sess.run('W/ExponentialMovingAverage:0')
        b_reference = sess.run('b/ExponentialMovingAverage:0')
        
        saver.save(sess, os.path.join(self.tmp_dir, "model_ex1"))
                
        tf.reset_default_graph()

        tf.train.import_meta_graph(os.path.join(self.tmp_dir, "model_ex1.meta"))
        sess = tf.Session()
        
        print('------------------------------------------------------')
        for var in tf.global_variables():
            print('all variables: ' + var.op.name)
        for var in tf.trainable_variables():
            print('normal variable: ' + var.op.name)
        for var in tf.moving_average_variables():
            print('ema variable: ' + var.op.name)
        print('------------------------------------------------------')

        mode = 1
        restore_vars = {}
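        # Both modes build a {shadow_name: variable} map: mode 0 asks the EMA
        # object for the shadow name via ema.average_name(), mode 1 constructs
        # '<var>/ExponentialMovingAverage' by hand. Restoring with that map
        # loads the averaged values into the plain W and b variables.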
        if mode == 0:
            ema = tf.train.ExponentialMovingAverage(1.0)
            for var in tf.trainable_variables():
                print('%s: %s' % (ema.average_name(var), var.op.name))
                restore_vars[ema.average_name(var)] = var
        elif mode == 1:
            for var in tf.trainable_variables():
                ema_name = var.op.name + '/ExponentialMovingAverage'
                print('%s: %s' % (ema_name, var.op.name))
                restore_vars[ema_name] = var
            
        saver = tf.train.Saver(restore_vars, name='ema_restore')
        
        saver.restore(sess, os.path.join(self.tmp_dir, "model_ex1"))
        
        w_restored = sess.run('W:0')
        b_restored = sess.run('b:0')
        
        self.assertAlmostEqual(w_reference, w_restored, msg='Restored model does not use the EMA filtered weight')
        self.assertAlmostEqual(b_reference, b_restored, msg='Restored model does not use the EMA filtered bias')
Example #23
0
def train_shadownet_multi_gpu(dataset_dir, weights_path, char_dict_path,
                              ord_map_dict_path):
    """

    :param dataset_dir:
    :param weights_path:
    :param char_dict_path:
    :param ord_map_dict_path:
    :return:
    """
    # prepare dataset information
    train_dataset = shadownet_data_feed_pipline.CrnnDataFeeder(
        dataset_dir=dataset_dir,
        char_dict_path=char_dict_path,
        ord_map_dict_path=ord_map_dict_path,
        flags='train')
    val_dataset = shadownet_data_feed_pipline.CrnnDataFeeder(
        dataset_dir=dataset_dir,
        char_dict_path=char_dict_path,
        ord_map_dict_path=ord_map_dict_path,
        flags='val')
    train_images, train_labels, train_images_paths = train_dataset.inputs(
        batch_size=CFG.TRAIN.BATCH_SIZE)
    val_images, val_labels, val_images_paths = val_dataset.inputs(
        batch_size=CFG.TRAIN.BATCH_SIZE)

    # set crnn net
    shadownet = crnn_net.ShadowNet(phase='train',
                                   hidden_nums=CFG.ARCH.HIDDEN_UNITS,
                                   layers_nums=CFG.ARCH.HIDDEN_LAYERS,
                                   num_classes=CFG.ARCH.NUM_CLASSES)
    shadownet_val = crnn_net.ShadowNet(phase='test',
                                       hidden_nums=CFG.ARCH.HIDDEN_UNITS,
                                       layers_nums=CFG.ARCH.HIDDEN_LAYERS,
                                       num_classes=CFG.ARCH.NUM_CLASSES)

    # set average container
    tower_grads = []
    train_tower_loss = []
    val_tower_loss = []
    batchnorm_updates = None
    train_summary_op_updates = None

    # set lr
    global_step = tf.Variable(0, name='global_step', trainable=False)
    learning_rate = tf.train.exponential_decay(
        learning_rate=CFG.TRAIN.LEARNING_RATE,
        global_step=global_step,
        decay_steps=CFG.TRAIN.LR_DECAY_STEPS,
        decay_rate=CFG.TRAIN.LR_DECAY_RATE,
        staircase=CFG.TRAIN.LR_STAIRCASE)

    # set up optimizer
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=0.9)

    # set distributed train op
    with tf.variable_scope(tf.get_variable_scope()):
        is_network_initialized = False
        for i in range(CFG.TRAIN.GPU_NUM):
            with tf.device('/gpu:{:d}'.format(i)):
                with tf.name_scope('tower_{:d}'.format(i)) as _:
                    train_loss, grads = compute_net_gradients(
                        train_images,
                        train_labels,
                        shadownet,
                        optimizer,
                        is_net_first_initialized=is_network_initialized)

                    is_network_initialized = True

                    # Only use the mean and variance from the first GPU tower to update the parameters
                    # TODO implement batch normalization for distributed device ([email protected])
                    if i == 0:
                        batchnorm_updates = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS)
                        train_summary_op_updates = tf.get_collection(
                            tf.GraphKeys.SUMMARIES)

                    tower_grads.append(grads)
                    train_tower_loss.append(train_loss)
                with tf.name_scope('validation_{:d}'.format(i)) as _:
                    val_loss, _ = compute_net_gradients(
                        val_images,
                        val_labels,
                        shadownet_val,
                        optimizer,
                        is_net_first_initialized=is_network_initialized)
                    val_tower_loss.append(val_loss)

    grads = average_gradients(tower_grads)
    avg_train_loss = tf.reduce_mean(train_tower_loss)
    avg_val_loss = tf.reduce_mean(val_tower_loss)

    # Track the moving averages of all trainable variables
    variable_averages = tf.train.ExponentialMovingAverage(
        CFG.TRAIN.MOVING_AVERAGE_DECAY, num_updates=global_step)
    variables_to_average = tf.trainable_variables(
    ) + tf.moving_average_variables()
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all the ops needed for training
    batchnorm_updates_op = tf.group(*batchnorm_updates)
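    # Note that batchnorm_updates was collected from tower 0 only (see the
    # TODO above), so batch-norm statistics are refreshed from the first GPU.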
    apply_gradient_op = optimizer.apply_gradients(grads,
                                                  global_step=global_step)
    train_op = tf.group(apply_gradient_op, variables_averages_op,
                        batchnorm_updates_op)

    # set tensorflow summary
    tboard_save_path = 'tboard/crnn_syn90k_multi_gpu'
    os.makedirs(tboard_save_path, exist_ok=True)

    summary_writer = tf.summary.FileWriter(tboard_save_path)

    avg_train_loss_scalar = tf.summary.scalar(name='average_train_loss',
                                              tensor=avg_train_loss)
    avg_val_loss_scalar = tf.summary.scalar(name='average_val_loss',
                                            tensor=avg_val_loss)
    learning_rate_scalar = tf.summary.scalar(name='learning_rate_scalar',
                                             tensor=learning_rate)
    train_merge_summary_op = tf.summary.merge(
        [avg_train_loss_scalar, learning_rate_scalar] +
        train_summary_op_updates)
    val_merge_summary_op = tf.summary.merge([avg_val_loss_scalar])

    # set tensorflow saver
    saver = tf.train.Saver()
    model_save_dir = 'model/crnn_syn90k_multi_gpu'
    os.makedirs(model_save_dir, exist_ok=True)
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                     time.localtime(time.time()))
    model_name = 'shadownet_{:s}.ckpt'.format(str(train_start_time))
    model_save_path = ops.join(model_save_dir, model_name)

    # set sess config
    sess_config = tf.ConfigProto(device_count={'GPU': CFG.TRAIN.GPU_NUM},
                                 allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.TRAIN.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'

    # Set the training parameters
    train_epochs = CFG.TRAIN.EPOCHS

    logger.info('Global configuration is as follows:')
    logger.info(CFG)

    sess = tf.Session(config=sess_config)

    summary_writer.add_graph(sess.graph)

    with sess.as_default():

        tf.train.write_graph(
            graph_or_graph_def=sess.graph,
            logdir='',
            name='{:s}/shadownet_model.pb'.format(model_save_dir))

        if weights_path is None:
            logger.info('Training from scratch')
            init = tf.global_variables_initializer()
            sess.run(init)
        else:
            logger.info('Restore model from last model checkpoint {:s}'.format(
                weights_path))
            saver.restore(sess=sess, save_path=weights_path)

        train_cost_time_mean = []
        val_cost_time_mean = []

        for epoch in range(train_epochs):

            # training part
            t_start = time.time()

            _, train_loss_value, train_summary, lr = \
                sess.run(fetches=[train_op,
                                  avg_train_loss,
                                  train_merge_summary_op,
                                  learning_rate])

            if math.isnan(train_loss_value):
                raise ValueError('Train loss is nan')

            cost_time = time.time() - t_start
            train_cost_time_mean.append(cost_time)

            summary_writer.add_summary(summary=train_summary,
                                       global_step=epoch)

            # validation part
            t_start_val = time.time()

            val_loss_value, val_summary = \
                sess.run(fetches=[avg_val_loss,
                                  val_merge_summary_op])

            summary_writer.add_summary(val_summary, global_step=epoch)

            cost_time_val = time.time() - t_start_val
            val_cost_time_mean.append(cost_time_val)

            if epoch % CFG.TRAIN.DISPLAY_STEP == 0:
                logger.info('Epoch_Train: {:d} total_loss= {:6f} '
                            'lr= {:6f} mean_cost_time= {:5f}s '.format(
                                epoch + 1, train_loss_value, lr,
                                np.mean(train_cost_time_mean)))
                train_cost_time_mean.clear()

            if epoch % CFG.TRAIN.VAL_DISPLAY_STEP == 0:
                logger.info('Epoch_Val: {:d} total_loss= {:6f} '
                            ' mean_cost_time= {:5f}s '.format(
                                epoch + 1, val_loss_value,
                                np.mean(val_cost_time_mean)))
                val_cost_time_mean.clear()

            if epoch % 5000 == 0:
                saver.save(sess=sess,
                           save_path=model_save_path,
                           global_step=epoch)
    sess.close()

    return
def train(retrain=False, retrain_list=None):
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        if not retrain:
            train_op = cifar10.train(loss, global_step)
        else:
            # retrain_list is assumed to carry the names of the layers to retrain.
            if len(retrain_list) == 1:
                train_op = cifar10.train(loss, global_step, ["softmax_linear"])
            else:
                train_op = cifar10.train(loss, global_step,
                                         ["softmax_linear", "local4"])

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        ### RETRAINING START

        if FLAGS.retrain:
            if FLAGS.debug:
                print(
                    "GLOBAL ============================================================================="
                )
                for v in tf.all_variables():
                    print(v.name)
                print(
                    "TRAINABLE ============================================================================="
                )
                for v in tf.trainable_variables():
                    print(v.name)
                print(
                    "MOVING AVERAGES ============================================================================="
                )
                for v in tf.moving_average_variables():
                    print(v.name)
            variables_to_restore = [
                v for v in tf.global_variables()
                if not v.name.split('/')[0] in retrain_list
            ]
            variables_to_initialize = [
                v for v in tf.global_variables()
                if v.name.split('/')[0] in retrain_list
            ]
            if FLAGS.debug:
                print(
                    "RESTORE ============================================================================="
                )
                for v in variables_to_restore:
                    print(v.name)
                print(
                    "INITIALIZE ============================================================================="
                )
                for v in variables_to_initialize:
                    print(v.name)
            saver_retrain = tf.train.Saver(variables_to_restore)
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                print('Yikes! No checkpoint file found at %s to retrain :-(' %
                      (FLAGS.checkpoint_dir))
                return
            # Build an initialization operation to run below.
            init = tf.variables_initializer(variables_to_initialize)
        else:
            # Build an initialization operation to run below.
            init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        if FLAGS.retrain:
            # Restores from checkpoint
            saver_retrain.restore(sess, ckpt.model_checkpoint_path)

        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        if FLAGS.print_params:
            print(tf.all_variables()[2].name)
            print(tf.all_variables()[2].eval(session=sess))
            print(tf.all_variables()[9].name)
            print(tf.all_variables()[9].eval(session=sess))
            print(tf.all_variables()[10].name)
            print(tf.all_variables()[10].eval(session=sess))
            print("-------------------------------------------")

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
                if FLAGS.print_params:
                    print(tf.all_variables()[2].name)
                    print(tf.all_variables()[2].eval(session=sess))
                    print(tf.all_variables()[9].name)
                    print(tf.all_variables()[9].eval(session=sess))
                    print(tf.all_variables()[10].name)
                    print(tf.all_variables()[10].eval(session=sess))
                    print("-------------------------------------------")
            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Example #25
0
  def model_fn(self, features, labels, mode, params):
    """Build the model based on features, labels, and mode.

    Args:
      features: The features dictionary containing the data Tensor
        and the number of examples.
      labels: The labels Tensor resulting from calling the model.
      mode: A string indicating the training mode.
      params: A dictionary of hyperparameters.

    Returns:
      A tf.estimator.EstimatorSpec.
    """
    del params
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    eval_active = (mode == tf.estimator.ModeKeys.EVAL)
    is_predict = (mode == tf.estimator.ModeKeys.PREDICT)
    features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC
    loss, logits = self._build_network(features, labels, mode)

    if is_predict:
      predictions = {'logits': logits}
      if self.hparams.use_tpu:
        return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                               predictions=predictions)
      else:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions)
    host_call = None
    train_op = None

    if is_training:
      global_step = tf.train.get_or_create_global_step()
      gs_t = tf.reshape(tf.cast(global_step, tf.int32), [1])

      # Setup learning rate schedule
      learning_rate = self._build_learning_rate_schedule(global_step)

      # Setup optimizer.
      optimizer = self._build_optimizer(learning_rate)

      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      with tf.control_dependencies(update_ops):
        train_op = self._build_train_op(optimizer, loss,
                                        global_step=global_step)
      if self.hparams.moving_average_decay > 0:
        ema = tf.train.ExponentialMovingAverage(
            decay=self.hparams.moving_average_decay, num_updates=global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        with tf.control_dependencies([train_op]):
          with tf.name_scope('moving_average'):
            train_op = ema.apply(variables_to_average)
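            # ema.apply() returns the op that refreshes the shadow variables;
            # chaining it behind the optimizer step makes it the final train_op.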

      lr_t = tf.reshape(learning_rate, [1])
      host_call = None
      if self.hparams.enable_hostcall:
        def host_call_fn(gs, lr):
          # Outfeed supports int32 but global_step is expected to be int64.
          gs = tf.cast(tf.reduce_mean(gs), tf.int64)
          with tf.contrib.summary.create_file_writer(
              self.model_dir).as_default():
            with tf.contrib.summary.always_record_summaries():
              tf.contrib.summary.scalar('learning_rate', tf.reduce_mean(lr),
                                        step=gs)
              return tf.contrib.summary.all_summary_ops()
        host_call = (host_call_fn, [gs_t, lr_t])

    eval_metrics = None
    eval_metric_ops = None
    if eval_active:
      def metric_fn(labels, logits):
        """Evaluation metric fn. Performed on CPU, do not reference TPU ops."""
        # Outfeed supports int32 but global_step is expected to be int64.
        predictions = tf.argmax(logits, axis=1)
        categorical_labels = labels
        top_1_accuracy = tf.metrics.accuracy(categorical_labels, predictions)
        in_top_5 = tf.cast(tf.nn.in_top_k(logits, categorical_labels, 5),
                           tf.float32)
        top_5_accuracy = tf.metrics.mean(in_top_5)

        return {
            'top_1_accuracy': top_1_accuracy,
            'top_5_accuracy': top_5_accuracy,
        }

      eval_metrics = (metric_fn, [labels, logits])
      eval_metric_ops = metric_fn(labels, logits)

    if self.hparams.use_tpu:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode, loss=loss, train_op=train_op,
          host_call=host_call, eval_metrics=eval_metrics)
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, train_op=train_op,
        eval_metric_ops=eval_metric_ops)
Example #26
0
def train(scope=''):
    """Train on dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        # Global steps
        tf_global_step = tf.get_variable(
            'GlobalStep', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        # Learning rate
        tf_lr = tf.train.exponential_decay(g_config['learning_rate'],
                                           tf_global_step,
                                           g_config['learning_rate_step'],
                                           g_config['learning_rate_decay'],
                                           staircase=True,
                                           name='LearningRate')
        tf.summary.scalar('learning_rate', tf_lr)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.AdamOptimizer(tf_lr)

        data_provider.prepare_images(g_config['train_dataset'].split(':'),
                                     num_patches=g_config['num_patches'],
                                     verbose=True)
        path_base = Path(g_config['train_dataset'].split(':')[0]).parent.parent
        _mean_shape = mio.import_pickle(path_base / 'reference_shape.pkl')
        with Path(path_base / 'meta.txt').open('r') as ifs:
            _image_shape = [int(x) for x in ifs.read().split(' ')]
        assert (isinstance(_mean_shape, np.ndarray))
        _pca_shapes = []
        _pca_bbs = []
        for item in tf.io.tf_record_iterator(str(path_base / 'pca.bin')):
            example = tf.train.Example()
            example.ParseFromString(item)
            _pca_shape = np.array(example.features.feature['pca/shape'].
                                  float_list.value).reshape((-1, 2))
            _pca_bb = np.array(
                example.features.feature['pca/bb'].float_list.value).reshape(
                    (-1, 2))
            _pca_shapes.append(PointCloud(_pca_shape))
            _pca_bbs.append(PointCloud(_pca_bb))
        _pca_model = detect.create_generator(_pca_shapes, _pca_bbs)
        assert (_mean_shape.shape[0] == g_config['num_patches'])

        tf_mean_shape = tf.constant(_mean_shape,
                                    dtype=tf.float32,
                                    name='MeanShape')

        def decode_feature(serialized):
            feature = {
                'train/image': tf.FixedLenFeature([], tf.string),
                'train/shape': tf.VarLenFeature(tf.float32),
            }
            features = tf.parse_single_example(serialized, features=feature)
            decoded_image = tf.decode_raw(features['train/image'], tf.float32)
            decoded_image = tf.reshape(decoded_image, _image_shape)
            decoded_shape = tf.sparse.to_dense(features['train/shape'])
            decoded_shape = tf.reshape(decoded_shape,
                                       (g_config['num_patches'], 2))
            return decoded_image, decoded_shape

        def get_random_sample(image, shape, rotation_stddev=10):
            # Read a random image with landmarks and bb
            image = menpo.image.Image(image.transpose((2, 0, 1)), copy=False)
            image.landmarks['PTS'] = PointCloud(shape)

            if np.random.rand() < .5:
                image = utils.mirror_image(image)
            if np.random.rand() < .5:
                theta = np.random.normal(scale=rotation_stddev)
                rot = menpo.transform.rotate_ccw_about_centre(
                    image.landmarks['PTS'], theta)
                image = image.warp_to_shape(image.shape, rot)
            bb = image.landmarks['PTS'].bounding_box().points
            miny, minx = np.min(bb, 0)
            maxy, maxx = np.max(bb, 0)
            bbsize = max(maxx - minx, maxy - miny)
            center = [(miny + maxy) / 2., (minx + maxx) / 2.]
            image.landmarks['bb'] = PointCloud([
                [center[0] - bbsize * 0.5, center[1] - bbsize * 0.5],
                [center[0] + bbsize * 0.5, center[1] + bbsize * 0.5],
            ]).bounding_box()
            proportion = float(np.random.rand() / 3)
            image = image.crop_to_landmarks_proportion(proportion, group='bb')
            image = image.resize((112, 112))

            random_image = image.pixels.transpose(1, 2, 0).astype('float32')
            random_shape = image.landmarks['PTS'].points.astype('float32')
            return random_image, random_shape

        def get_init_shape(image, shape, mean_shape):
            def norm(x):
                return tf.sqrt(
                    tf.reduce_sum(tf.square(x - tf.reduce_mean(x, 0))))

            with tf.name_scope('align_shape_to_bb', values=[mean_shape]):
                min_xy = tf.reduce_min(mean_shape, 0)
                max_xy = tf.reduce_max(mean_shape, 0)
                min_x, min_y = min_xy[0], min_xy[1]
                max_x, max_y = max_xy[0], max_xy[1]
                mean_shape_bb = tf.stack([[min_x, min_y], [max_x, min_y],
                                          [max_x, max_y], [min_x, max_y]])
                bb = tf.stack([[0.0, 0.0], [112.0, 0.0], [112.0, 112.0],
                               [0.0, 112.0]])
                ratio = norm(bb) / norm(mean_shape_bb)
                initial_shape = tf.add(
                    (mean_shape - tf.reduce_mean(mean_shape_bb, 0)) * ratio,
                    tf.reduce_mean(bb, 0),
                    name='initial_shape')
                initial_shape.set_shape(tf_mean_shape.get_shape())
            return image, shape, initial_shape

        def distort_color(image, shape, init_shape):
            return data_provider.distort_color(image), shape, init_shape

        with tf.name_scope('DataProvider', values=[tf_mean_shape]):
            tf_dataset = tf.data.TFRecordDataset(
                [str(path_base / 'train.bin')])
            tf_dataset = tf_dataset.repeat()
            tf_dataset = tf_dataset.map(decode_feature)
            tf_dataset = tf_dataset.map(lambda x, y: tf.py_func(
                get_random_sample, [x, y], [tf.float32, tf.float32],
                stateful=True,
                name='RandomSample'))
            tf_dataset = tf_dataset.map(
                partial(get_init_shape, mean_shape=tf_mean_shape))
            tf_dataset = tf_dataset.map(distort_color)
            tf_dataset = tf_dataset.batch(g_config['batch_size'], True)
            tf_dataset = tf_dataset.prefetch(7500)
            tf_iterator = tf_dataset.make_one_shot_iterator()
            tf_images, tf_shapes, tf_initial_shapes = tf_iterator.get_next(
                name='Batch')
            tf_images.set_shape([g_config['batch_size'], 112, 112, 3])
            tf_shapes.set_shape([g_config['batch_size'], 73, 2])
            tf_initial_shapes.set_shape([g_config['batch_size'], 73, 2])

        print('Defining model...')
        with tf.device(g_config['train_device']):
            tf_model = mdm_model.MDMModel(
                tf_images,
                tf_shapes,
                tf_initial_shapes,
                batch_size=g_config['batch_size'],
                num_iterations=g_config['num_iterations'],
                num_patches=g_config['num_patches'],
                patch_shape=(g_config['patch_size'], g_config['patch_size']),
                num_channels=3)
            with tf.name_scope('Losses',
                               values=[tf_model.prediction, tf_shapes]):
                tf_norm_error = tf_model.normalized_rmse(
                    tf_model.prediction, tf_shapes)
                tf_loss = tf.reduce_mean(tf_norm_error)
            tf.summary.scalar('losses/total', tf_loss)
            # Calculate the gradients for the batch of data
            tf_grads = opt.compute_gradients(tf_loss)
        tf.summary.histogram('dx', tf_model.prediction - tf_shapes)

        bn_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope)

        # Add histograms for gradients.
        for grad, var in tf_grads:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)

        # Apply the gradients to adjust the shared variables.
        with tf.name_scope('Optimizer', values=[tf_grads, tf_global_step]):
            apply_gradient_op = opt.apply_gradients(tf_grads,
                                                    global_step=tf_global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than need be but we employ
        # this for backward-compatibility with our previous models.
        with tf.name_scope('MovingAverage', values=[tf_global_step]):
            variable_averages = tf.train.ExponentialMovingAverage(
                g_config['MOVING_AVERAGE_DECAY'], tf_global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            variables_averages_op = variable_averages.apply(
                variables_to_average)

        # Group all updates into a single train op.
        bn_updates_op = tf.group(*bn_updates, name='BNGroup')
        train_op = tf.group(apply_gradient_op,
                            variables_averages_op,
                            bn_updates_op,
                            name='TrainGroup')

        # Create a saver.
        saver = tf.train.Saver()

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge_all()
        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()
        print('Initializing variables...')
        sess.run(init)
        print('Initialized variables.')

        start_step = 0
        ckpt = tf.train.get_checkpoint_state(g_config['train_dir'])
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Assuming model_checkpoint_path looks something like:
            #   /ckpt/train/model.ckpt-0,
            # extract global_step from it.
            start_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) + 1
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), g_config['train_dir']))

        summary_writer = tf.summary.FileWriter(g_config['train_dir'],
                                               sess.graph)

        print('Starting training...')
        for step in range(start_step, g_config['max_steps']):
            start_time = time.time()
            _, loss_value = sess.run([train_op, tf_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 100 == 0:
                examples_per_sec = g_config['batch_size'] / float(duration)
                format_str = (
                    '%s: step %d, loss = %.4f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            if step % 200 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == g_config['max_steps']:
                checkpoint_path = os.path.join(g_config['train_dir'],
                                               'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Example #27
0
def Test_Mixup_ResNet():
    image_w, image_h, image_c = [32, 32, 3]
    path_mom = "/scratch/mixup"
    path_son_all = os.listdir(path_mom)
    path_selec = [
        v for v in path_son_all if 'Iter' in v and '_preactivation' not in v
    ]
    print(path_selec)
    MOVING_AVERAGE_DECAY = 0.999
    NUM_CLASS = 10
    TEST_PATH = ["/scratch/mixup/eval.tfrecords"]
    batch_size = 1
    num_image = 10000
    with tf.Graph().as_default():
        images_test = tf.placeholder(tf.float32,
                                     [batch_size, image_w, image_h, image_c])
        labels_test = tf.placeholder(tf.int64, [batch_size])
        phase_train = tf.placeholder(tf.bool, shape=None)
        Selec_Layer_Index = tf.placeholder(tf.int64)
        input_lambda_tensor = tf.placeholder(tf.float32, shape=[1])
        image_batch_te, label_batch_te = Read_Data_from_Record(
            TEST_PATH, batch_size, augmentation=False, shuffle=False)
        logits, target = build_tower_basic(images_test, labels_test,
                                           input_lambda_tensor, NUM_CLASS,
                                           phase_train, Selec_Layer_Index)
        Data_Error_Loss, Init_Accu, Reweight_Accu = Calc_Loss(
            logits, labels_test, target, input_lambda_tensor)
        var_train = tf.trainable_variables()
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY)
        variable_averages.apply(var_train)
        variables_to_restore = variable_averages.variables_to_restore(
            tf.moving_average_variables())
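        # variables_to_restore() yields a {checkpoint_name: variable} map, so
        # the Saver below loads the EMA shadow values stored in the checkpoint
        # into the live variables for evaluation.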
        saver = tf.train.Saver(variables_to_restore)
        for single_ckpt in path_selec:
            ckpt_dir = os.path.join(path_mom, single_ckpt)
            print("\n================================")
            with tf.Session() as sess:
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    print("restore parameter from ",
                          ckpt.model_checkpoint_path)
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord)
                Total_Test_Stat = np.zeros([num_image, 2])  # loss, init_accu
                print(
                    "-------------------------Start Training-------------------------------------"
                )
                for step in range(num_image):
                    selected_layer_index = 3
                    input_lambda = 1
                    #print("----------------Input lambda is--------", input_lambda)
                    image_test_batch, label_test_batch = sess.run(
                        [image_batch_te, label_batch_te])
                    feed_dict = {
                        images_test: image_test_batch,
                        labels_test: label_test_batch,
                        phase_train: False,
                        Selec_Layer_Index: selected_layer_index,
                        input_lambda_tensor: [input_lambda]
                    }
                    Data_Error_Loss_Train, Init_Accu_Train = sess.run(
                        [Data_Error_Loss, Init_Accu], feed_dict=feed_dict)
                    Total_Test_Stat[step, :] = [
                        Data_Error_Loss_Train, Init_Accu_Train
                    ]
                print("-------Test Error, Test Accu------------",
                      np.mean(Total_Test_Stat, axis=0))
                coord.request_stop()
                coord.join(threads)
Example #28
0
def model_fn(features, labels, mode, params):
  """Mobilenet v1 model using Estimator API."""
  num_classes = FLAGS.num_classes
  training_active = (mode == tf.estimator.ModeKeys.TRAIN)
  eval_active = (mode == tf.estimator.ModeKeys.EVAL)

  features = tensor_transform_fn(features, params['input_perm'])

  if FLAGS.clear_update_collections:
    # updates_collections must be set to None in order to use fused batchnorm
    with tf.variable_scope('cg', custom_getter=get_custom_getter()):
      with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()):
        logits, end_points = mobilenet_v1.mobilenet_v1(
          features,
          num_classes,
          is_training=training_active,
          depth_multiplier=FLAGS.depth_multiplier)
        logits = tf.cast(logits, tf.float32)
  else:
    with tf.variable_scope('cg', custom_getter=get_custom_getter()):
      with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()):
        logits, end_points = mobilenet_v1.mobilenet_v1(
          features,
          num_classes,
          is_training=training_active,
          depth_multiplier=FLAGS.depth_multiplier)
        logits = tf.cast(logits, tf.float32)
  predictions = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
      not FLAGS.use_tpu):
    with tf.control_dependencies([
        tf.Print(
            predictions['classes'], [predictions['classes']],
            summarize=FLAGS.eval_batch_size,
            message='prediction: ')
    ]):
      labels = tf.Print(
          labels, [labels], summarize=FLAGS.eval_batch_size, message='label: ')

  one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

  cross_entropy = tf.losses.softmax_cross_entropy(
      onehot_labels=one_hot_labels,
      logits=logits,
      weights=1.0,
      label_smoothing=0.1)
#  loss = tf.losses.get_total_loss(add_regularization_losses=True)
  loss = cross_entropy + 1e-4 * tf.add_n(
      [tf.nn.l2_loss(v) for v in tf.trainable_variables()
       if 'batch_normalization' not in v.name])

  initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
  final_learning_rate = 0.0001 * initial_learning_rate

  train_op = None
  if training_active:
    batches_per_epoch = _NUM_TRAIN_IMAGES // FLAGS.train_batch_size
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.train.exponential_decay(
        learning_rate=initial_learning_rate,
        global_step=global_step,
        decay_steps=FLAGS.learning_rate_decay_epochs * batches_per_epoch,
        decay_rate=FLAGS.learning_rate_decay,
        staircase=True)

    # Set a minimum boundary for the learning rate.
    learning_rate = tf.maximum(
        learning_rate, final_learning_rate, name='learning_rate')

    if FLAGS.optimizer == 'sgd':
      tf.logging.info('Using SGD optimizer')
      optimizer = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate)
    elif FLAGS.optimizer == 'momentum':
      tf.logging.info('Using Momentum optimizer')
      optimizer = tf.train.MomentumOptimizer(
          learning_rate=learning_rate, momentum=0.9)
    elif FLAGS.optimizer == 'RMS':
      tf.logging.info('Using RMS optimizer')
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          RMSPROP_DECAY,
          momentum=RMSPROP_MOMENTUM,
          epsilon=RMSPROP_EPSILON)
    else:
      tf.logging.fatal('Unknown optimizer:', FLAGS.optimizer)

    if FLAGS.use_tpu:
      optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step=global_step)
    if FLAGS.moving_average:
      ema = tf.train.ExponentialMovingAverage(
          decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
      variables_to_average = (tf.trainable_variables() +
                              tf.moving_average_variables())
      with tf.control_dependencies([train_op]), tf.name_scope('moving_average'):
        train_op = ema.apply(variables_to_average)

  eval_metrics = None
  if eval_active:
    def metric_fn(labels, predictions):
      accuracy = tf.metrics.accuracy(labels, tf.argmax(
          input=predictions, axis=1))
      return {'accuracy': accuracy}

    if FLAGS.use_logits:
      eval_predictions = logits
    else:
      eval_predictions = end_points['Predictions']

    eval_metrics = (metric_fn, [labels, eval_predictions])

  param_stats = tf.profiler.profile(
    tf.get_default_graph(),
    options=ProfileOptionBuilder.trainable_variables_parameter())
  fl_stats = tf.profiler.profile(
    tf.get_default_graph(),
    options=tf.profiler.ProfileOptionBuilder.float_operation())
  return tpu_estimator.TPUEstimatorSpec(
      mode=mode, loss=loss, train_op=train_op, eval_metrics=eval_metrics)
Example #29
0
def train():
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print('PS hosts are: %s' % ps_hosts)
    print('Worker hosts are: %s' % worker_hosts)
    server = tf.train.Server({
        'ps': ps_hosts,
        'worker': worker_hosts
    },
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id)
    if FLAGS.job_name == 'ps':
        server.join()
    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

    device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts))
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        partitioner = tf.fixed_size_partitioner(len(ps_hosts), axis=0)
        with tf.variable_scope('partitioned_space', partitioner=partitioner):
            with tf.device(device_setter):
                global_step = tf.Variable(0, trainable=False)
                decay_steps = 50000 * 350.0 / FLAGS.batch_size
                batch_size = tf.placeholder(dtype=tf.int32,
                                            shape=(),
                                            name='batch_size')
                images, labels = cifar10.distorted_inputs(batch_size)
                inputs = tf.reshape(images, [-1, _HEIGHT, _WIDTH, _DEPTH])
                labels = tf.one_hot(labels, 10, 1, 0)

                # network_fn = nets_factory.get_network_fn('alexnet_v2', num_classes=10)
                # (logits, _) = network_fn(inputs)
                # with slim.arg_scope(alexnet.alexnet_v2_arg_scope(weight_decay=0.0)):
                (logits, _) = alexnet.alexnet_v2(inputs,
                                                 num_classes=10,
                                                 is_training=True)

                cross_entropy = tf.losses.softmax_cross_entropy(
                    logits=logits, onehot_labels=labels)
                loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
                    [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

                # Decay the learning rate exponentially based on the number of steps.
                lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE *
                                                len(worker_hosts),
                                                global_step,
                                                decay_steps,
                                                LEARNING_RATE_DECAY_FACTOR,
                                                staircase=True)
                opt = tf.train.GradientDescentOptimizer(lr)
                # Track the moving averages of all trainable variables.
                exp_moving_averager = tf.train.ExponentialMovingAverage(
                    MOVING_AVERAGE_DECAY, global_step)
                variables_to_average = (tf.trainable_variables() +
                                        tf.moving_average_variables())
                opt = tf.train.SyncReplicasOptimizer(
                    opt,
                    replicas_to_aggregate=len(worker_hosts),
                    total_num_replicas=len(worker_hosts),
                    variable_averages=exp_moving_averager,
                    variables_to_average=variables_to_average)
                naive_grads = opt.compute_gradients(loss)
                grads = [(tf.scalar_mul(
                    tf.cast(batch_size / FLAGS.batch_size, tf.float32),
                    grad), var) for grad, var in naive_grads]
                apply_gradients_op = opt.apply_gradients(
                    grads, global_step=global_step)
                with tf.control_dependencies([apply_gradients_op]):
                    train_op = tf.identity(loss, name='train_op')

                chief_queue_runners = [opt.get_chief_queue_runner()]
                init_tokens_op = opt.get_init_tokens_op()
                saver = tf.train.Saver()
                sv = tf.train.Supervisor(is_chief=is_chief,
                                         logdir=FLAGS.train_dir,
                                         init_op=tf.group(
                                             tf.global_variables_initializer(),
                                             tf.local_variables_initializer()),
                                         summary_op=None,
                                         global_step=global_step,
                                         saver=saver,
                                         recovery_wait_secs=1,
                                         save_model_secs=60)

                tf.logging.info('%s Supervisor' % datetime.now())
                sess_config = tf.ConfigProto(
                    allow_soft_placement=True,
                    log_device_placement=FLAGS.log_device_placement)
                sess_config.gpu_options.allow_growth = True
                sess = sv.prepare_or_wait_for_session(server.target,
                                                      config=sess_config)
                queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
                sv.start_queue_runners(sess, queue_runners)

                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)
                """Train CIFAR-10 for a number of steps."""
                time0 = time.time()
                batch_size_num = FLAGS.batch_size
                for step in range(FLAGS.max_steps):
                    start_time = time.time()
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size_num
                    decay_steps_num = int(num_batches_per_epoch *
                                          NUM_EPOCHS_PER_DECAY)
                    _, loss_value, gs = sess.run(
                        [train_op, loss, global_step],
                        feed_dict={batch_size: batch_size_num},
                        options=run_options,
                        run_metadata=run_metadata)
                    b = time.time()

                    if step % 1 == 0:
                        duration = time.time() - start_time
                        num_examples_per_step = batch_size_num
                        examples_per_sec = num_examples_per_step / duration
                        sec_per_batch = float(duration)
                        format_str = (
                            "time: " + str(time.time()) +
                            '; %s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                        )
                        tf.logging.info(format_str %
                                        (datetime.now(), step, gs, loss_value,
                                         examples_per_sec, sec_per_batch))
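
The example above shards variables across parameter servers with tf.fixed_size_partitioner inside a variable scope. A standalone, hedged illustration of what that partitioner does to a single variable (the shapes and scope name here are illustrative, not from the cluster setup above):

import tensorflow as tf

partitioner = tf.fixed_size_partitioner(num_shards=2, axis=0)
with tf.variable_scope('partitioned_space', partitioner=partitioner):
    # A [4, 3] variable is stored as two [2, 3] shards; combined with a
    # replica device setter, the shards land on different parameter servers.
    v = tf.get_variable('w', shape=[4, 3])
# Iterating over the resulting PartitionedVariable yields the shard variables.
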
Example #30
0
 def begin(self):
     """ Create restoring operations before the graph been finalized. """
     ema_variables = tf.moving_average_variables()
     self._restore_ops = [
         tf.assign(x, self._ema.average(x)) for x in ema_variables
     ]
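
The begin() method above only builds the assign ops; in practice it belongs to a tf.train.SessionRunHook that also runs them once the session is ready. A self-contained sketch of such a hook (the class name and constructor are assumptions, not part of the snippet):

import tensorflow as tf

class LoadEMAHook(tf.train.SessionRunHook):
  """Hypothetical hook that overwrites variables with their EMA shadow values."""

  def __init__(self, ema):
    # 'ema' is a tf.train.ExponentialMovingAverage whose apply() was called on
    # the training variables, so ema.average(x) returns an existing shadow.
    self._ema = ema
    self._restore_ops = []

  def begin(self):
    # Build the assign ops before the graph is finalized.
    ema_variables = tf.moving_average_variables()
    self._restore_ops = [
        tf.assign(x, self._ema.average(x)) for x in ema_variables
    ]

  def after_create_session(self, session, coord):
    # Run the assigns once the session (and any checkpoint restore) is ready.
    session.run(self._restore_ops)
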
Example #31
0
  def model_fn(self, features, labels, mode, params):
    """Build the model based on features, labels, and mode.

    Args:
      features: The features dictionary containing the data Tensor
        and the number of examples.
      labels: The labels Tensor resulting from calling the model.
      mode: A string indicating the training mode.
      params: A dictionary of hyperparameters.

    Returns:
      A tf.estimator.EstimatorSpec.
    """
    del params
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    if is_training:
      features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC
    total_loss, outputs = self._build_network(features, labels, mode)

    devices = cluster_utils.get_pipeline_devices(FLAGS.pipeline_device_num)
    slice_num = len(devices)
    micro_batch_num = FLAGS.micro_batch_num
    losses = []
    all_outputs = []
    losses.append(total_loss)
    all_outputs.append(outputs)
    layer_grads = [[[] for i in xrange(slice_num)] for j in xrange(micro_batch_num)]
    layer_vars = [[] for i in xrange(slice_num)]
    remained_vars = tf.trainable_variables()
    ys = losses[0]
    prev_grads=None
    # layers-1 ~ 1 compute grads
    for i in xrange(slice_num - 1, 0, -1):
      vars_i = [v for v in remained_vars if v.device==devices[i]]
      remained_vars = [v for v in remained_vars if v not in vars_i]
      prev_y = all_outputs[0][i-1]
      prev_y = prev_y if isinstance(prev_y, list) else [prev_y]
      num_tensors = len(prev_y)
      y_grads = tf.gradients(ys=ys, xs=prev_y+vars_i, grad_ys=prev_grads, colocate_gradients_with_ops=True)
      ys = prev_y
      prev_grads = y_grads[0:num_tensors]
      grads_i = y_grads[num_tensors:]
      layer_grads[0][i] = [g for g in grads_i if g is not None]
      layer_vars[i] = [v for (g, v) in zip(grads_i, vars_i) if g is not None]
    # layer 0 compute grads
    grads_0 = tf.gradients(ys=ys, xs=remained_vars, grad_ys=prev_grads, colocate_gradients_with_ops=True)
    layer_grads[0][0] = [g for g in grads_0 if g is not None]
    layer_vars[0] = [v for (g, v) in zip(grads_0, remained_vars) if g is not None]

    # other micro_batch_num
    for j in xrange(1, micro_batch_num):
      dep_outputs = []
      for i in xrange(slice_num):
        dep_outputs.append(all_outputs[j-1][i] if i+j < 2*slice_num-1 else layer_grads[i+j-2*slice_num+1][i])
      loss, outputs = self._build_network(features, labels, mode, dep_outputs=dep_outputs)
      losses.append(loss)
      all_outputs.append(outputs)
      ys = losses[j]
      prev_grads=None
      for i in xrange(slice_num - 1, 0, -1):
        prev_y = all_outputs[j][i-1]
        prev_y = prev_y if isinstance(prev_y, list) else [prev_y]
        num_tensors = len(prev_y)
        y_grads = tf.gradients(ys=ys, xs=prev_y+layer_vars[i], grad_ys=prev_grads, colocate_gradients_with_ops=True)
        ys = prev_y
        prev_grads = y_grads[0:num_tensors]
        grads_i = y_grads[num_tensors:]
        layer_grads[j][i] = [g for g in grads_i if g is not None]
      grads_0 = tf.gradients(ys=ys, xs=layer_vars[0], grad_ys=prev_grads, colocate_gradients_with_ops=True)
      layer_grads[j][0] = [g for g in grads_0 if g is not None]

    grads_set = []
    vars_set = []
    for i in xrange(slice_num):
      for j in xrange(len(layer_grads[0][i])):
        grad_i_set = [layer_grads[m][i][j] for m in range(micro_batch_num)]
        #print (grad_i_set)
        if micro_batch_num == 1:
          with tf.device(grad_i_set[0].device):
            acc_grads = grad_i_set[0]
        else:
          with tf.control_dependencies(grad_i_set), tf.device(grad_i_set[0].device): # replica
            if isinstance(grad_i_set[0], tf.IndexedSlices):
              acc_grads = tf.add_n(grad_i_set)
            else:
              acc_grads = tf.accumulate_n(grad_i_set)
        grads_set.append(acc_grads)
        vars_set.append(layer_vars[i][j])
    grads_and_vars = zip(grads_set, vars_set)
#######################

    train_op = None

    if is_training:
      global_step = tf.train.get_or_create_global_step()
      gs_t = tf.reshape(tf.cast(global_step, tf.int32), [1])

      # Setup learning rate schedule
      learning_rate = self._build_learning_rate_schedule(global_step)

      # Setup optimizer.
      optimizer = self._build_optimizer(learning_rate)

      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      with tf.control_dependencies(None): # original is update_ops
        train_op = self._build_train_op(optimizer, grads_and_vars,
                                        global_step=global_step)

      if self.hparams.moving_average_decay > 0:
        ema = tf.train.ExponentialMovingAverage(
            decay=self.hparams.moving_average_decay, num_updates=global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        with tf.control_dependencies([train_op]):
          with tf.name_scope('moving_average'):
            train_op = ema.apply(variables_to_average)

      lr_t = tf.reshape(learning_rate, [1])
      host_call = None
      if self.hparams.enable_hostcall:
        def host_call_fn(gs, lr):
          # Outfeed supports int32 but global_step is expected to be int64.
          gs = tf.cast(tf.reduce_mean(gs), tf.int64)
          with tf.contrib.summary.create_file_writer(
              self.model_dir).as_default():
            with tf.contrib.summary.always_record_summaries():
              tf.contrib.summary.scalar('learning_rate', tf.reduce_mean(lr),
                                        step=gs)
              return tf.contrib.summary.all_summary_ops()
        host_call = (host_call_fn, [gs_t, lr_t])

    return tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op)
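
The accumulation step in the loop above sums each variable's gradient over the micro-batches, falling back to tf.add_n for IndexedSlices (sparse gradients) because tf.accumulate_n only accepts dense tensors. A stripped-down sketch of just that choice, with an illustrative helper name outside the pipeline machinery:

import tensorflow as tf

def accumulate_micro_batch_grads(grad_list):
  """Illustrative helper: sum one variable's gradients across micro-batches."""
  if len(grad_list) == 1:
    return grad_list[0]
  if isinstance(grad_list[0], tf.IndexedSlices):
    # accumulate_n does not support IndexedSlices, so fall back to add_n.
    return tf.add_n(grad_list)
  # Dense gradients can use accumulate_n.
  return tf.accumulate_n(grad_list)
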
Example #32
0
def main(argv=None):
    # load config file and setup
    params = {}
    config = configparser.ConfigParser()
    config_file = "configurations/mv2_cpm.cfg"
    if argv is not None and len(argv) > 1:
        config_file = argv[1]
    config.read(config_file)
    for key in config.options("Train"):
        params[key] = eval(config.get("Train", key))

    os.environ['CUDA_VISIBLE_DEVICES'] = params['visible_devices']

    gpus_index = params['visible_devices'].split(",")
    params['gpus'] = len(gpus_index)

    if not os.path.exists(params['modelpath']):
        os.makedirs(params['modelpath'])
    if not os.path.exists(params['logpath']):
        os.makedirs(params['logpath'])

    src.dataloaders.dataset.set_config(params)
    set_network_input_wh(params['input_width'], params['input_height'])
    if argv is not None and len(argv) > 1 and config_file == argv[1]:
        set_network_scale(params['scale'])
    else:
        ## For the hourglass model the last layer outputs a 32 times smaller output
        ## which is upsampled later
        ## TODO : Understand the architecture and make necessary changes
        ##  For now work with a scale value of 4 for the Hourglass model
        set_network_scale(4)

    ## Train on cpus for MAC
    gpus = 'gpus'
    if platform.system() == 'Darwin':
        gpus = 'cpu'
    training_name = '{}_batch-{}_lr-{}_{}-{}_{}x{}_{}'.format(
        params['model'], params['batchsize'], params['lr'], gpus,
        params['gpus'], params['input_width'], params['input_height'],
        config_file.replace("/", "-").replace(".cfg", ""))

    ## Processing for CPU
    ## Obtaining the dataset pipeline from dataloaders.datasets
    ## Define the learning rate and optimizer function

    with tf.Graph().as_default(), tf.device("/cpu:0"):
        input_image, input_heat = get_input(params['batchsize'],
                                            params['max_epoch'],
                                            is_train=True)
        valid_input_image, valid_input_heat = get_input(params['batchsize'],
                                                        params['max_epoch'],
                                                        is_train=False)

        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(float(params['lr']),
                                                   global_step,
                                                   decay_steps=10000,
                                                   decay_rate=float(
                                                       params['decay_rate']),
                                                   staircase=True)
        opt = tf.train.AdamOptimizer(learning_rate, epsilon=1e-8)
        tower_grads = []
        reuse_variable = False

        if platform.system() == 'Darwin':
            # cpu (mac only)
            with tf.device("/cpu:0"):
                with tf.name_scope("CPU_0"):
                    loss, last_heat_loss, pred_heat = get_loss_and_output(
                        params['model'], params['batchsize'], input_image,
                        input_heat, reuse_variable, params['scale'])
                    reuse_variable = True
                    grads = opt.compute_gradients(loss)
                    tower_grads.append(grads)

                    valid_loss, valid_last_heat_loss, valid_pred_heat = get_loss_and_output(
                        params['model'], params['batchsize'],
                        valid_input_image, valid_input_heat, reuse_variable,
                        params['scale'])
        else:
            # multiple gpus
            for i in range(params['gpus']):
                with tf.device("/gpu:%d" % i):
                    with tf.name_scope("GPU_%d" % i):
                        loss, last_heat_loss, pred_heat = get_loss_and_output(
                            params['model'], params['batchsize'], input_image,
                            input_heat, reuse_variable, params['scale'])
                        reuse_variable = True
                        grads = opt.compute_gradients(loss)
                        tower_grads.append(grads)

                        valid_loss, valid_last_heat_loss, valid_pred_heat = get_loss_and_output(
                            params['model'], params['batchsize'],
                            valid_input_image, valid_input_heat,
                            reuse_variable, params['scale'])

        grads = average_gradients(tower_grads)
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram("gradients_on_average/%s" % var.op.name,
                                     grad)

        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        ## Update model parameters based on moving averages rather than final values
        ## for better performance.

        MOVING_AVERAGE_DECAY = 0.99
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variable_to_average = (tf.trainable_variables() +
                               tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variable_to_average)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = tf.group(apply_gradient_op, variables_averages_op)

        saver = tf.train.Saver(max_to_keep=100)

        tf.summary.scalar("learning_rate", learning_rate)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("loss_lastlayer_heat", last_heat_loss)
        summary_merge_op = tf.summary.merge_all()

        pred_result_image = tf.placeholder(
            tf.float32, shape=[params['batchsize'], 480, 640, 3])
        pred_result_summary = tf.summary.image("pred_result_image",
                                               pred_result_image,
                                               params['batchsize'])

        init = tf.global_variables_initializer()
        config = tf.ConfigProto()
        # occupy gpu gracefully
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            init.run()

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            summary_writer = tf.summary.FileWriter(
                os.path.join(params['logpath'], training_name), sess.graph)
            total_step_num = params['num_train_samples'] * params[
                'max_epoch'] // (params['batchsize'] * params['gpus'])
            print("Start training...")
            for step in range(total_step_num):
                start_time = time.time()
                _, loss_value, lh_loss, in_image, in_heat, p_heat = sess.run([
                    train_op, loss, last_heat_loss, input_image, input_heat,
                    pred_heat
                ])
                duration = time.time() - start_time

                if step != 0 and step % params[
                        'per_update_tensorboard_step'] == 0:
                    # False will speed up the training time.
                    if params['pred_image_on_tensorboard'] is True:

                        valid_loss_value, valid_lh_loss, valid_in_image, valid_in_heat, valid_p_heat = sess.run(
                            [
                                valid_loss, valid_last_heat_loss,
                                valid_input_image, valid_input_heat,
                                valid_pred_heat
                            ])

                        ## TODO: Check why for the third iteration only 12 images are passed in validation batch
                        result = []
                        for index in range(params['batchsize']):
                            r = CocoPose.display_image(
                                valid_in_image[index, :, :, :],
                                valid_in_heat[index, :, :, :],
                                valid_p_heat[index, :, :, :], True)
                            result.append(r.astype(np.float32))

                        comparison_of_pred_result = sess.run(
                            pred_result_summary,
                            feed_dict={pred_result_image: np.array(result)})
                        summary_writer.add_summary(comparison_of_pred_result,
                                                   step)

                    # print train info
                    num_examples_per_step = params['batchsize'] * params['gpus']
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / params['gpus']
                    format_str = (
                        '%s: step %d, loss = %.2f, last_heat_loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                    )
                    print(format_str %
                          (datetime.now(), step, loss_value, lh_loss,
                           examples_per_sec, sec_per_batch))

                    # tensorboard visualization
                    merge_op = sess.run(summary_merge_op)
                    summary_writer.add_summary(merge_op, step)

                # save model
                if step % params['per_saved_model_step'] == 0:
                    checkpoint_path = os.path.join(params['modelpath'],
                                                   training_name, 'model')
                    saver.save(sess, checkpoint_path, global_step=step)
            coord.request_stop()
            coord.join(threads)
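
The average_gradients helper called above is not defined in this snippet; a typical tower-gradient averaging implementation, in the style of the TensorFlow multi-GPU CIFAR-10 example, is sketched below as an assumption rather than the exact helper used here:

import tensorflow as tf

def average_gradients(tower_grads):
    """Sketch: average each variable's gradient across all towers.

    tower_grads is a list (one entry per tower) of lists of
    (gradient, variable) pairs from opt.compute_gradients().
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars: ((grad0_tower0, var0), ..., (grad0_towerN, var0))
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
        if not grads:
            continue
        grad = tf.reduce_mean(tf.concat(grads, axis=0), 0)
        # Variables are shared across towers, so the first tower's variable suffices.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads
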
Example #33
0
def train(target, dataset, cluster_spec):
  """Train Inception on a dataset for a number of steps."""
  # Number of workers and parameter servers are inferred from the workers and ps
  # hosts string.
  num_workers = len(cluster_spec.as_dict()['worker'])
  num_parameter_servers = len(cluster_spec.as_dict()['ps'])
  # If no value is given, num_replicas_to_aggregate defaults to be the number of
  # workers.
  if FLAGS.num_replicas_to_aggregate == -1:
    num_replicas_to_aggregate = num_workers
  else:
    num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

  # Both should be greater than 0 in a distributed training.
  assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and '
                                                         'num_parameter_servers'
                                                         ' must be > 0.')

  # Choose worker 0 as the chief. Note that any worker could be the chief
  # but there should be only one chief.
  is_chief = (FLAGS.task_id == 0)

  #batchSizeManager = BatchSizeManager(32, 4)

  # Ops are assigned to worker by default.
  tf.logging.info('num_parameter_servers: %d', num_parameter_servers)
  partitioner = tf.fixed_size_partitioner(num_parameter_servers, 0)

  device_setter = tf.train.replica_device_setter(ps_tasks=num_parameter_servers)
  slim = tf.contrib.slim
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
   with tf.variable_scope('root', partitioner=partitioner):
    # Variables and its related init/assign ops are assigned to ps.
#    with slim.arg_scope(
#        [slim.variables.variable, slim.variables.global_step],
#        device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
    with tf.device(device_setter):
#	partitioner=partitioner):
      # Create a variable to count the number of train() calls. This equals the
      # number of updates applied to the variables.
#      global_step = slim.variables.global_step()
      global_step = tf.Variable(0, trainable=False)

      # Calculate the learning rate schedule.

      batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size')
      num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                               FLAGS.batch_size)
      # Decay steps need to be divided by the number of replicas to aggregate.
      decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                        num_replicas_to_aggregate)

      # Decay the learning rate exponentially based on the number of steps.
      lr = tf.train.exponential_decay(FLAGS.initial_learning_rate*num_workers,
                                      global_step,
                                      decay_steps,
                                      FLAGS.learning_rate_decay_factor,
                                      staircase=True)
      # Add a summary to track the learning rate.
#      tf.summary.scalar('learning_rate', lr)

      # Create an optimizer that performs gradient descent.
      opt = tf.train.RMSPropOptimizer(lr,
                                      RMSPROP_DECAY,
                                      momentum=RMSPROP_MOMENTUM,
                                      epsilon=RMSPROP_EPSILON)

      images, labels = image_processing.distorted_inputs(
          dataset,
          batch_size,
          num_preprocess_threads=FLAGS.num_preprocess_threads)
      print(images.get_shape())
      print(labels.get_shape())

      # Number of classes in the Dataset label set plus 1.
      # Label 0 is reserved for an (unused) background class.
#      num_classes = dataset.num_classes() + 1
      num_classes = dataset.num_classes()
      print(num_classes)
#      logits = inception.inference(images, num_classes, for_training=True)
      network_fn = nets_factory.get_network_fn('inception_v3',num_classes=num_classes) 
      (logits,_) = network_fn(images)
      print(logits.get_shape())
      # Add classification loss.
#      inception.loss(logits, labels, batch_size)

      # Gather all of the losses including regularization losses.
      labels = tf.one_hot(labels, 1000, 1, 0)
      cross_entropy = tf.losses.softmax_cross_entropy(
          logits=logits, 
          onehot_labels=labels)
#      losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
#      losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
      losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
      total_loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
          [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

#      total_loss = tf.add_n(losses, name='total_loss')

      if is_chief:
        # Compute the moving average of all individual losses and the
        # total loss.
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_averages_op = loss_averages.apply(losses + [total_loss])

        # Attach a scalar summmary to all individual losses and the total loss;
        # do the same for the averaged version of the losses.
#        for l in losses + [total_loss]:
#          loss_name = l.op.name
          # Name each loss as '(raw)' and name the moving average version of the
          # loss as the original loss name.
#          tf.summary.scalar(loss_name + ' (raw)', l)
#          tf.summary.scalar(loss_name, loss_averages.average(l))

        # Add dependency to compute loss_averages.
        with tf.control_dependencies([loss_averages_op]):
          total_loss = tf.identity(total_loss)

      # Track the moving averages of all trainable variables.
      # Note that we maintain a 'double-average' of the BatchNormalization
      # global statistics.
      # This is not needed when the number of replicas are small but important
      # for synchronous distributed training with tens of workers/replicas.
      exp_moving_averager = tf.train.ExponentialMovingAverage(
          MOVING_AVERAGE_DECAY, global_step)

      variables_to_average = (
          tf.trainable_variables() + tf.moving_average_variables())

      # Add histograms for model variables.
#      for var in variables_to_average:
#        tf.summary.histogram(var.op.name, var)

      # Create synchronous replica optimizer.
      opt = tf.train.SyncReplicasOptimizer(
          opt,
          replicas_to_aggregate=num_replicas_to_aggregate,
          total_num_replicas=num_workers,
          variable_averages=exp_moving_averager,
          variables_to_average=variables_to_average)

#      batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
#      assert batchnorm_updates, 'Batchnorm updates are missing'
#      batchnorm_updates_op = tf.group(*batchnorm_updates)
#      # Add dependency to compute batchnorm_updates.
#      with tf.control_dependencies([batchnorm_updates_op]):
#        total_loss = tf.identity(total_loss)

      # Compute gradients with respect to the loss.
      # grads = opt.compute_gradients(total_loss)
      grads0 = opt.compute_gradients(total_loss) 
      grads = [(tf.scalar_mul(tf.cast(batch_size/FLAGS.batch_size, tf.float32), grad), var) for grad, var in grads0]

      # Add histograms for gradients.
#      for grad, var in grads:
#        if grad is not None:
#          tf.summary.histogram(var.op.name + '/gradients', grad)

      apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

      with tf.control_dependencies([apply_gradients_op]):
        train_op = tf.identity(total_loss, name='train_op')

      # Get chief queue_runners and init_tokens, which is used to synchronize
      # replicas. More details can be found in SyncReplicasOptimizer.
      chief_queue_runners = [opt.get_chief_queue_runner()]
      init_tokens_op = opt.get_init_tokens_op()

      # Create a saver.
      saver = tf.train.Saver()

      # Build the summary operation based on the TF collection of Summaries.
#      summary_op = tf.summary.merge_all()

      # Build an initialization operation to run below.
      init_op = tf.global_variables_initializer()

      # We run the summaries in the same thread as the training operations by
      # passing in None for summary_op to avoid a summary_thread being started.
      # Running summaries and training operations in parallel could run out of
      # GPU memory.
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=FLAGS.train_dir,
                               init_op=init_op,
                               summary_op=None,
                               global_step=global_step,
                               recovery_wait_secs=1,
                               saver=None,
                               save_model_secs=FLAGS.save_interval_secs)

      tf.logging.info('%s Supervisor' % datetime.now())

      sess_config = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement)

      # Get a session.
      sess = sv.prepare_or_wait_for_session(target, config=sess_config)

      # Start the queue runners.
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
      sv.start_queue_runners(sess, queue_runners)
      tf.logging.info('Started %d queues for processing input data.',
                      len(queue_runners))

      if is_chief:
        sv.start_queue_runners(sess, chief_queue_runners)
        sess.run(init_tokens_op)

      # Train, checking for Nans. Concurrently run the summary operation at a
      # specified interval. Note that the summary_op and train_op never run
      # simultaneously in order to prevent running out of GPU memory.
#      next_summary_time = time.time() + FLAGS.save_summaries_secs
      step = 0
      time0 = time.time()
      batch_size_num = 1
      while not sv.should_stop():
        try:
          start_time = time.time()

          batch_size_num = 32
          batch_size_num = 2 * int(step / 5) + 16
#	   batch_size_num = int((int(step)/3*10)) % 100000 + 1
#          if step < 5:
#            batch_size_num = 32 
#          batch_size_num = (batch_size_num ) % 64 + 1
#          else:
#            batch_size_num = 80

          run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
          run_metadata = tf.RunMetadata()

          my_images, loss_value, step = sess.run([images, train_op, global_step], feed_dict={batch_size: batch_size_num}, options=run_options, run_metadata=run_metadata)
          b = time.time()
#          assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
          if step > FLAGS.max_steps:
            break
          duration = time.time() - start_time
          thread = threading2.Thread(target=get_computation_time,
                                     name="get_computation_time",
                                     args=(run_metadata.step_stats, step))
          thread.start()
#          tl = timeline.Timeline(run_metadata.step_stats)
#          last_batch_time = tl.get_local_step_duration('sync_token_q_Dequeue')
          c0 = time.time()
#          batch_size_num = batchSizeManager.dictate_new_batch_size(FLAGS.task_id, last_batch_time)
#          batch_size_num = rpcClient.update_batch_size(FLAGS.task_id, last_batch_time, available_cpu, available_memory, step, batch_size_num) 
#          ctf = tl.generate_chrome_trace_format()
#          with open("timeline.json", 'a') as f:
#            f.write(ctf)

          if step % 1 == 0:
            examples_per_sec = FLAGS.batch_size / float(duration)
            c = time.time()
            tf.logging.info("time statistics" + " - train_time: " + str(b-start_time) + " - get_batch_time: " + str(c0-b) + " - get_bs_time:  " + str(c-c0) + " - accum_time: " + str(c-time0) + " - batch_size: " + str(batch_size_num))
            format_str = ('Worker %d: %s: step %d, loss = %.2f'
                          '(%.1f examples/sec; %.3f  sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

          # Determine if the summary_op should be run on the chief worker.
#          if is_chief and next_summary_time < time.time():
#            tf.logging.info('Running Summary operation on the chief.')
#            summary_str = sess.run(summary_op)
#            sv.summary_computed(sess, summary_str)
#            tf.logging.info('Finished running Summary operation.')

            # Determine the next time for running the summary.
#            next_summary_time += FLAGS.save_summaries_secs
        except:
          if is_chief:
            tf.logging.info('Chief got exception while running!')
          raise

      # Stop the supervisor.  This also waits for service threads to finish.
      sv.stop()
Example #34
0
def train(dataset):
  """Train on dataset for a number of steps."""
  # sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Create a variable to count the number of train() calls. This equals the
    # number of batches processed * FLAGS.num_gpus.
    tf.set_random_seed(time.time())
    tf.set_random_seed(198918)
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)

    bits_ph = []
    for i in range(18):
        bits_ph.append(tf.placeholder(tf.int32))

    nm = norm_monitor.norm_monitor(FLAGS.digits, len(bits_ph), FLAGS.rel_res, FLAGS.interval, FLAGS.stride)
    if FLAGS.layerinfo_file:
      assert tf.gfile.Exists(FLAGS.layerinfo_file)
      tmp = pickle.load(open(FLAGS.layerinfo_file,'rb'))
      nm.set_layerinfo(tmp[-1])
      print("Restore layerinfo")
      print(nm.get_layerinfo())

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (dataset.num_examples_per_epoch() / FLAGS.batch_size)
    decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)
    print("num_batches_per_epoch: {}".format(num_batches_per_epoch))
    print("use bitpack: {}".format(FLAGS.use_bitpack))
    print("learning rate: {}".format(FLAGS.initial_learning_rate))
    print("produce trace: {}".format(FLAGS.profile))
    print("digits: {}".format(FLAGS.digits))
    print("rel_res: {}".format(FLAGS.rel_res))
    print("interval: {}".format(FLAGS.interval))
    print("stride: {}".format(FLAGS.stride))

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                    global_step,
                                    decay_steps,
                                    FLAGS.learning_rate_decay_factor,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY, momentum=RMSPROP_MOMENTUM, epsilon=RMSPROP_EPSILON)

    # Get images and labels for ImageNet and split the batch across GPUs.
    assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
        'Batch size must be divisible by number of GPUs')
    split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

    # Override the number of preprocessing threads to account for the increased
    # number of GPU towers.
    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    images, labels = image_processing.distorted_inputs(
        dataset,
        num_preprocess_threads=num_preprocess_threads)

    input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Number of classes in the Dataset label set plus 1.
    # Label 0 is reserved for an (unused) background class.
    num_classes = dataset.num_classes() + 1

     # Split the batch of images and labels for towers.
    images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images)
    labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels)

    # Calculate the gradients for each model tower.
    tower_norms  = []
    tower_grads  = []
    tower_preds_1  = []
    tower_preds_5  = []
    tower_losses = []

    reuse_variables = None
    for i in range(FLAGS.num_gpus):
      with tf.device('/gpu:%d' % i):
        with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope:
          # Force all Variables to reside on the CPU.
          # Calculate the loss for one tower of the ImageNet model. This
          # function constructs the entire ImageNet model but shares the
          # variables across all towers.
          #print(images_splits[i])
          #print(labels_splits[i])
          loss, norms, logits_split = _tower_loss(images_splits[i], labels_splits[i], num_classes, scope, reuse_variables, bits_ph)
          top_1_correct = tf.nn.in_top_k(logits_split, labels_splits[i], 1)
          top_5_correct = tf.nn.in_top_k(logits_split, labels_splits[i], 5)
          # Reuse variables for the next tower.
          reuse_variables = True

          # Retain the summaries from the final tower.
          summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

          # Retain the Batch Normalization updates operations only from the
          # final tower. Ideally, we should grab the updates from all towers
          # but these stats accumulate extremely fast so we can ignore the
          # other stats from the other towers without significant detriment.
          #batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION, scope)
          batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

          # Calculate the gradients for the batch of data on this ImageNet
          # tower.
          grads = opt.compute_gradients(loss)

          # Keep track of the gradients across all towers.
          tower_grads.append(grads)
          tower_norms.append(norms)
          tower_preds_1.append(tf.reduce_sum(tf.cast(top_1_correct, tf.int32)))
          tower_preds_5.append(tf.reduce_sum(tf.cast(top_5_correct, tf.int32)))
          tower_losses.append(loss)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = _average_gradients(tower_grads)

    top_1_sum = tf.add_n(tower_preds_1)
    top_5_sum = tf.add_n(tower_preds_5)
    losses_sum = tf.add_n(tower_losses)
    # Add a summaries for the input processing and global_step.
    summaries.extend(input_summaries)

    # Add a summary to track the learning rate.
    summaries.append(tf.summary.scalar('learning_rate', lr))

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        summaries.append(
            tf.summary.histogram(var.op.name + '/gradients', grad))

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.summary.histogram(var.op.name, var))

    # Track the moving averages of all trainable variables.
    # Note that we maintain a "double-average" of the BatchNormalization
    # global statistics. This is more complicated than need be but we employ
    # this for backward-compatibility with our previous models.
    variable_averages = tf.train.ExponentialMovingAverage(
        inception.MOVING_AVERAGE_DECAY, global_step)

    # Another possibility is to use tf.slim.get_variables().
    variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all updates to into a single train op.
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    train_op = tf.group(apply_gradient_op, variables_averages_op, 
            batchnorm_updates_op) 

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

    # Build the summary operation from the last tower summaries.
    summary_op = tf.summary.merge(summaries)

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. allow_soft_placement must be set to
    # True to build towers on GPU, as some of the ops do not have GPU
    # implementations.
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    if FLAGS.pretrained_model_checkpoint_path:
      assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
      #variables_to_restore = tf.get_collection(slim.variables.VARIABLES_TO_RESTORE)
      restorer = tf.train.Saver(tf.global_variables(), max_to_keep=100)
      restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
      print('%s: Pre-trained model restored from %s' %
            (datetime.now(), FLAGS.pretrained_model_checkpoint_path))
    #for v in tf.all_variables():
    #  print("%s %s %s %s" % (v.name, v.get_shape(), v.dtype, v.device))
    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(
        FLAGS.train_dir,
        graph=sess.graph)

    bits_dict = dict()
    #run_metadata = tf.RunMetadata()
    elapse = []

    #gweights = []
    glayerinfo = []
    #wnp_name = 'weights_norm_{}_{}_{}_{}_{}_{}_{}.dat'.format(9, 2048, 0, FLAGS.digits, FLAGS.stride, FLAGS.interval, FLAGS.use_bitpack)
    lip_name = 'layerinfo_{}_{}_{}_{}_{}_{}_{}.dat'.format(9, 4096, 0, FLAGS.digits, FLAGS.stride, FLAGS.interval, FLAGS.use_bitpack)

    for step in range(FLAGS.max_steps):
      run_metadata = tf.RunMetadata()
      start_time = time.time()
      info = nm.get_layerinfo()
      for i, bits in enumerate(bits_ph):
        bits_dict[bits] = info[i][0]
      if FLAGS.profile is False:
        _, loss_value, norms, top_1, top_5 = sess.run([train_op, losses_sum, tower_norms, top_1_sum, top_5_sum], feed_dict=bits_dict)
      else:
        _, loss_value, norms = sess.run([train_op, loss, tower_norms], 
                                 feed_dict=bits_dict, 
                                 options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), 
                                 run_metadata=run_metadata)
        top_1 = 5
        top_5 = 25

      nm.adjust_digits(norms)
      duration = time.time() - start_time
      #gweights.append(norms)
      #glayerinfo.append(copy.deepcopy(nm.get_layerinfo()))
      elapse.append(duration)

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        glayerinfo.append(copy.deepcopy(nm.get_layerinfo()))
        # Print layerinfo
        print(info)
        examples_per_sec = FLAGS.batch_size / float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch) elapse %.5f s top_1 %.5f top_5 %.5f')
        pred_1 = top_1 / (FLAGS.batch_size*FLAGS.num_gpus)
        pred_5 = top_5 / (FLAGS.batch_size*FLAGS.num_gpus)
        print(format_str % (datetime.now(), step, loss_value, examples_per_sec, duration, sum(elapse), pred_1, pred_5))
        sys.stdout.flush()
        tl = timeline.Timeline(run_metadata.step_stats)
        if FLAGS.profile is True:
          if FLAGS.use_bitpack is False:
            trace_file = tf.gfile.Open(name='timeline%03d.json' % step, mode='w')
          else:
            trace_file = tf.gfile.Open(name='bitpack_timeline%03d.json' % step, mode='w')
          trace_file.write(tl.generate_chrome_trace_format(show_memory=True))

      if step % 100 == 0:
        summary_str = sess.run(summary_op, feed_dict=bits_dict)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 4000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)

  glayerinfo.append(copy.deepcopy(nm.get_layerinfo()))
  #pickle.dump(gweights, open(wnp_name,'wb'))
  pickle.dump(glayerinfo, open(lip_name,'wb'))
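
Both this example and the distributed training example before it collect run_metadata with a full trace and dump it through tensorflow.python.client.timeline. A minimal sketch of profiling a single step; the session, train_op, and output file name are placeholders supplied by the caller, not names from the code above:

import tensorflow as tf
from tensorflow.python.client import timeline

def profile_one_step(sess, train_op, trace_path='timeline_step.json'):
  """Hedged sketch: run one step with full tracing and dump a chrome://tracing file."""
  run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
  run_metadata = tf.RunMetadata()
  sess.run(train_op, options=run_options, run_metadata=run_metadata)
  # Convert the collected step stats into a Chrome trace JSON file.
  tl = timeline.Timeline(run_metadata.step_stats)
  with tf.gfile.Open(trace_path, 'w') as trace_file:
    trace_file.write(tl.generate_chrome_trace_format(show_memory=True))
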
Example #35
0
def train():
    assert FLAGS.job_name in ['ps', 'worker'], 'job_name must be ps or worker'

    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')

    tf.logging.info('PS hosts are %s ' % ps_hosts)
    tf.logging.info('Worker hosts are %s ' % worker_hosts)

    cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts,
                                         'worker': worker_hosts})

    server = tf.train.Server(cluster_spec, job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        server.join()
    else:

        """Train Inception on a dataset for a number of steps."""
        # Number of workers and parameter servers are inferred from the workers and ps
        # hosts string.
        num_workers = len(cluster_spec.as_dict()['worker'])
        num_parameter_servers = len(cluster_spec.as_dict()['ps'])
        # If no value is given, num_replicas_to_aggregate defaults to be the number of
        # workers.
        if FLAGS.num_replicas_to_aggregate == -1:
            num_replicas_to_aggregate = num_workers
        else:
            num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

        # Both should be greater than 0 in a distributed training.
        assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and '
                                                               'num_parameter_servers'
                                                               ' must be > 0.')
        # Choose worker 0 as the chief. Note that any worker could be the chief
        # but there should be only one chief.
        is_chief = (FLAGS.task_id == 0)

        # Ops are assigned to worker by default.
        with tf.device(tf.train.replica_device_setter(worker_device='/job:worker/task:%d' % FLAGS.task_id,
                       cluster=cluster_spec)):
            # Variables and its related init/assign ops are assigned to ps.
            # with slim.scopes.arg_scope(
            # [slim.variables.variable, slim.variables.global_step],
            # device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
            # Create a variable to count the number of train() calls. This equals the
            # number of updates applied to the variables.
            #global_step = slim.variables.global_step()
            global_step = tf.Variable(0, name='global_step', trainable=False)
            num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
            decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

            # Decay the learning rate exponentially based on the number of steps.
            lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                            global_step,
                                            decay_steps,
                                            LEARNING_RATE_DECAY_FACTOR,
                                            staircase=True)
            tf.scalar_summary('learning_rate', lr)
            opt = tf.train.GradientDescentOptimizer(lr)

            images, labels = image_two_stream.distorted_inputs()
            logits = image_two_stream.inference_final(images)
            total_loss = image_two_stream.loss(logits, labels)

            # train_op = image.train(loss, global_step)

            if is_chief:
                loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
                losses = tf.get_collection('losses')
                loss_averages_op = loss_averages.apply(losses + [total_loss])

                for l in losses + [total_loss]:
                    # Name each loss as '(raw)' and name the moving average version of the loss
                    # as the original loss name.
                    tf.scalar_summary(l.op.name + ' (raw)', l)
                    tf.scalar_summary(l.op.name, loss_averages.average(l))
                with tf.control_dependencies([loss_averages_op]):
                    total_loss = tf.identity(total_loss)

            variable_averages = tf.train.ExponentialMovingAverage(
                MOVING_AVERAGE_DECAY, global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())

            for var in variables_to_average:
                tf.histogram_summary(var.op.name, var)

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                replica_id=FLAGS.task_id,
                total_num_replicas=num_workers,
                variable_averages=variable_averages,
                variables_to_average=variables_to_average)

            #batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
            #assert batchnorm_updates, 'Batchnorm updates are missing'
           # batchnorm_updates_op = tf.group(*batchnorm_updates)
            ## Add dependency to compute batchnorm_updates.
            #with tf.control_dependencies([batchnorm_updates_op]):
             #   total_loss = tf.identity(total_loss)

            # Compute gradients with respect to the loss.
            grads = opt.compute_gradients(total_loss)

            # Add histograms for gradients.
            for grad, var in grads:
                if grad is not None:
                    tf.histogram_summary(var.op.name + '/gradients', grad)

            apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(total_loss, name='train_op')

            # Get chief queue_runners, init_tokens and clean_up_op, which is used to
            # synchronize replicas.
            # More details can be found in sync_replicas_optimizer.
            chief_queue_runners = [opt.get_chief_queue_runner()]
            init_tokens_op = opt.get_init_tokens_op()
            clean_up_op = opt.get_clean_up_op()

            # Create a saver.
            saver = tf.train.Saver()

            # Build the summary operation based on the TF collection of Summaries.
            summary_op = tf.merge_all_summaries()

            # Build an initialization operation to run below.
            init_op = tf.initialize_all_variables()

            # We run the summaries in the same thread as the training operations by
            # passing in None for summary_op to avoid a summary_thread being started.
            # Running summaries and training operations in parallel could run out of
            # GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=init_op,
                                     summary_op=None,
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_interval_secs)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)

            # Get a session.
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            tf.logging.info('Started %d queues for processing input data.',
                            len(queue_runners))

            if is_chief:
                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)

            # Train, checking for Nans. Concurrently run the summary operation at a
            # specified interval. Note that the summary_op and train_op never run
            # simultaneously in order to prevent running out of GPU memory.
            next_summary_time = time.time() + FLAGS.save_summaries_secs
            while not sv.should_stop():
                try:
                    start_time = time.time()
                    loss_value, step = sess.run([train_op, global_step])
                    assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
                    if step > FLAGS.max_steps:
                        break
                    duration = time.time() - start_time

                    if step % 30 == 0:
                        examples_per_sec = FLAGS.batch_size / float(duration)
                        format_str = ('Worker %d: %s: step %d, loss = %.2f'
                                      '(%.1f examples/sec; %.3f  sec/batch)')
                        tf.logging.info(format_str %
                                        (FLAGS.task_id, datetime.now(), step, loss_value,
                                         examples_per_sec, duration))

                    # Determine if the summary_op should be run on the chief worker.
                    if is_chief and next_summary_time < time.time():
                        tf.logging.info('Running Summary operation on the chief.')
                        summary_str = sess.run(summary_op)
                        sv.summary_computed(sess, summary_str)
                        tf.logging.info('Finished running Summary operation.')

                        # Determine the next time for running the summary.
                        next_summary_time += FLAGS.save_summaries_secs
                except:
                    if is_chief:
                        tf.logging.info('About to execute sync_clean_up_op!')
                        sess.run(clean_up_op)
                    raise

            # Stop the supervisor.  This also waits for service threads to finish.
            sv.stop()

            # Save after the training ends.
            if is_chief:
                saver.save(sess,
                           os.path.join(FLAGS.train_dir, 'model.ckpt'),
                           global_step=global_step)
            print("end")
Example #36
0
def train(dataset):
  """Train on dataset for a number of steps."""
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Create a variable to count the number of train() calls. This equals the
    # number of batches processed * FLAGS.num_gpus.
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                             FLAGS.batch_size)
    decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                    global_step,
                                    decay_steps,
                                    FLAGS.learning_rate_decay_factor,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY,
                                    momentum=RMSPROP_MOMENTUM,
                                    epsilon=RMSPROP_EPSILON)

    # Get images and labels for ImageNet and split the batch across GPUs.
    assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
        'Batch size must be divisible by number of GPUs')
    split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

    # Override the number of preprocessing threads to account for the increased
    # number of GPU towers.
    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    images, labels = image_processing.distorted_inputs(
        dataset,
        num_preprocess_threads=num_preprocess_threads)

    input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Number of classes in the Dataset label set plus 1.
    # Label 0 is reserved for an (unused) background class.
    num_classes = dataset.num_classes() + 1

    # Split the batch of images and labels for towers.
    images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images)
    labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels)

    # Calculate the gradients for each model tower.
    tower_grads = []
    reuse_variables = None
    for i in range(FLAGS.num_gpus):
      with tf.device('/gpu:%d' % i):
        with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope:
          # Force all Variables to reside on the CPU.
          with slim.arg_scope([slim.variables.variable], device='/cpu:0'):
            # Calculate the loss for one tower of the ImageNet model. This
            # function constructs the entire ImageNet model but shares the
            # variables across all towers.
            loss = _tower_loss(images_splits[i], labels_splits[i], num_classes,
                               scope, reuse_variables)

          # Reuse variables for the next tower.
          reuse_variables = True

          # Retain the summaries from the final tower.
          summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

          # Retain the Batch Normalization update operations only from the
          # final tower. Ideally, we should grab the updates from all towers
          # but these stats accumulate extremely fast so we can ignore the
          # other stats from the other towers without significant detriment.
          batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                                scope)

          # Calculate the gradients for the batch of data on this ImageNet
          # tower.
          grads = opt.compute_gradients(loss)

          # Keep track of the gradients across all towers.
          tower_grads.append(grads)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = _average_gradients(tower_grads)

    # Add summaries for the input processing and global_step.
    summaries.extend(input_summaries)

    # Add a summary to track the learning rate.
    summaries.append(tf.summary.scalar('learning_rate', lr))

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        summaries.append(
            tf.summary.histogram(var.op.name + '/gradients', grad))

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.summary.histogram(var.op.name, var))

    # Track the moving averages of all trainable variables.
    # Note that we maintain a "double-average" of the BatchNormalization
    # global statistics. This is more complicated than it needs to be, but we
    # keep it for backward compatibility with our previous models.
    variable_averages = tf.train.ExponentialMovingAverage(
        inception.MOVING_AVERAGE_DECAY, global_step)

    # Another possibility is to use tf.slim.get_variables().
    variables_to_average = (tf.trainable_variables() +
                            tf.moving_average_variables())
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all updates into a single train op.
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    train_op = tf.group(apply_gradient_op, variables_averages_op,
                        batchnorm_updates_op)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation from the last tower summaries.
    summary_op = tf.summary.merge(summaries)

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. allow_soft_placement must be set to
    # True to build towers on GPU, as some of the ops do not have GPU
    # implementations.
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    if FLAGS.pretrained_model_checkpoint_path:
      assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
      variables_to_restore = tf.get_collection(
          slim.variables.VARIABLES_TO_RESTORE)
      restorer = tf.train.Saver(variables_to_restore)
      restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
      print('%s: Pre-trained model restored from %s' %
            (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(
        FLAGS.train_dir,
        graph=sess.graph)

    for step in range(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        examples_per_sec = FLAGS.batch_size / float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, duration))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
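`_tower_loss` and `_average_gradients` are defined elsewhere in this script. The gradient-averaging helper, which implements the synchronization point mentioned in the comments above, is commonly written along the lines of the TensorFlow multi-GPU CIFAR-10 tutorial; a sketch:

import tensorflow as tf

def _average_gradients(tower_grads):
    """Average gradients over towers.

    tower_grads is a list (one entry per tower) of lists of (gradient, variable)
    pairs as returned by opt.compute_gradients(). The variables are shared
    across towers, so only the gradients need to be averaged.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad_gpu0, var), (grad_gpu1, var), ...) for one variable.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads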
Пример #37
0
def main(_):
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        dataset = ImagenetData(subset=FLAGS.subset)
        assert dataset.data_files()
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples_per_epoch() / FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        learning_rate = tf.train.exponential_decay(FLAGS.learning_rate,
                                    global_step,
                                    decay_steps,
                                    FLAGS.learning_rate_decay_factor,
                                    staircase=True)

        tf.summary.scalar('lr', learning_rate)

        is_training = tf.placeholder(tf.bool)

        #opt = tf.train.AdamOptimizer(learning_rate)
        opt = tf.train.RMSPropOptimizer(learning_rate, RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        with tf.name_scope("create_inputs"):
            #if tf.gfile.Exists(FLAGS.SNAPSHOT_DIR):
            #    tf.gfile.DeleteRecursively(FLAGS.SNAPSHOT_DIR)
            #tf.gfile.MakeDirs(FLAGS.SNAPSHOT_DIR)

            # Get images and labels for ImageNet and split the batch across GPUs.
            assert FLAGS.batch_size % FLAGS.gpu_nums == 0, ('Batch size must be divisible by number of GPUs')
            split_batch_size = int(FLAGS.batch_size / FLAGS.gpu_nums)

            # Override the number of preprocessing threads to account for the increased
            # number of GPU towers.
            num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.gpu_nums
            images, labels = image_processing.distorted_inputs(dataset, num_preprocess_threads=num_preprocess_threads)
            #tf.summary.image('images', images, max_outputs = 10)

            images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.gpu_nums, value=images)
            labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.gpu_nums, value=tf.one_hot(indices = labels, depth = FLAGS.num_classes))

        multi_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(FLAGS.gpu_nums):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ('ImageNet', i)) as scope:

                        graph = Model_Graph(num_class = FLAGS.num_classes, is_training = is_training)

                        model = graph._build_defaut_graph(images = images_splits[i])

                        # Top-1 accuracy
                        top1acc = tf.reduce_mean(tf.cast(tf.nn.in_top_k(model.logits, tf.argmax(labels_splits[i], axis=1), 1), tf.float32))
                        # Top-n accuracy
                        topnacc = tf.reduce_mean(tf.cast(tf.nn.in_top_k(model.logits, tf.argmax(labels_splits[i], axis=1), FLAGS.top_k), tf.float32))

                        tf.summary.scalar('top1acc_{}'.format(i), top1acc)
                        tf.summary.scalar('topkacc_{}'.format(i), topnacc)

                        all_trainable = [v for v in tf.trainable_variables()]

                        loss = tf.nn.softmax_cross_entropy_with_logits(logits=model.logits, labels=labels_splits[i])

                        l2_losses = [FLAGS.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'weights' in v.name]
                        reduced_loss = tf.reduce_mean(loss) + tf.add_n(l2_losses)

                        tf.summary.scalar('loss_{}'.format(i), reduced_loss)

                        tf.get_variable_scope().reuse_variables()

                        #batchnorm_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope)
                        batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                        grads = opt.compute_gradients(reduced_loss, all_trainable)
                        multi_grads.append(grads)

        grads = average_gradients(multi_grads)

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than it needs to be, but we
        # keep it for backward compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(FLAGS.MOVING_AVERAGE_DECAY, global_step)

        variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(opt.apply_gradients(grads, global_step), variables_averages_op, batchnorm_updates_op)

        #grads_value = list(zip(grads, all_trainable))
        #for grad, var in grads_value:
        #    tf.summary.histogram(var.name + '/gradient', grad)

        summary_op = tf.summary.merge_all()

        # Set up tf session and initialize variables. 
        config = tf.ConfigProto()
        config.allow_soft_placement=True
        sess = tf.Session(config=config)
        init = tf.global_variables_initializer()

        sess.run(init)

        # Saver for storing checkpoints of the model.
        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=2)

        restore_var = [v for v in tf.trainable_variables()]+[v for v in tf.global_variables() if 'moving_mean' in v.name or 'moving_variance' in v.name or 'global_step' in v.name]

        ckpt = tf.train.get_checkpoint_state(FLAGS.SNAPSHOT_DIR)
        if ckpt and ckpt.model_checkpoint_path:
            loader = tf.train.Saver(var_list=restore_var)
            load(loader, sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found.')
            load_step = 0


        summary_writer = tf.summary.FileWriter(FLAGS.SNAPSHOT_DIR, graph=sess.graph)

        # Iterate over training steps.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)

        for step in range(FLAGS.num_steps):
            start_time = time.time()

            feed_dict = {is_training: True}
            if step % 50000 == 0 and step != 0:
                loss_value, _ = sess.run([reduced_loss, train_op], feed_dict=feed_dict)
                save(saver, sess, FLAGS.SNAPSHOT_DIR, step)
            elif step % 100 == 0:
                summary_str, loss_value, _ = sess.run([summary_op, reduced_loss, train_op], feed_dict=feed_dict)
                duration = time.time() - start_time
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()
                print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(step, loss_value, duration))
            else:
                loss_value, _ = sess.run([reduced_loss, train_op], feed_dict=feed_dict)

        coord.request_stop()
        coord.join(threads)
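Because `variable_averages.apply()` is part of the train op, the checkpoints written above contain both the raw variables and their `.../ExponentialMovingAverage` shadows. A hedged sketch of how an evaluation script could load the averaged values in place of the raw weights, assuming the evaluation graph has already been built and that the decay and checkpoint path match a run like the one above:

import tensorflow as tf

MOVING_AVERAGE_DECAY = 0.9999              # assumed to match the training run
checkpoint_path = '/path/to/model.ckpt'    # placeholder checkpoint path

# variables_to_restore() maps each shadow name to the corresponding live
# variable, so restoring swaps the averaged weights in for evaluation.
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)

with tf.Session() as sess:
    saver.restore(sess, checkpoint_path)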
Пример #38
0
def main(argv=None):
    # Place the simple operations on the CPU; only the neural network training runs on the GPUs.
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Define the basic training pipeline.
        x, y_ = get_input()
        regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
        
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
        learning_rate = tf.train.exponential_decay(
            LEARNING_RATE_BASE, global_step, 60000 / BATCH_SIZE, LEARNING_RATE_DECAY)       
        
        opt = tf.train.GradientDescentOptimizer(learning_rate)
        
        tower_grads = []
        reuse_variables = False
        # Run the network's optimization on the different GPUs.
        for i in range(N_GPU):
                # Pin this tower's optimization to a single GPU.
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('GPU_%d' % i) as scope:
                    cur_loss = get_loss(x, y_, regularizer, scope, reuse_variables)
                    reuse_variables = True
                    grads = opt.compute_gradients(cur_loss)
                    tower_grads.append(grads)
        
        # Average the gradients across towers.
        grads = average_gradients(tower_grads)
        for grad, var in grads:
            if grad is not None:
                tf.histogram_summary('gradients_on_average/%s' % var.op.name, grad)

        # Update the shared parameters with the averaged gradients.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        for var in tf.trainable_variables():
            tf.histogram_summary(var.op.name, var)

        # Maintain moving averages of the variables.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)
        # Each training step updates both the variables and their moving averages.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()        
        init = tf.initialize_all_variables()
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=True)) as sess:
            # Initialize all variables and start the input queues.
            init.run()
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.train.SummaryWriter(MODEL_SAVE_PATH, sess.graph)

            for step in range(TRAINING_STEPS):
                # Run one training step and time it.
                start_time = time.time()
                _, loss_value = sess.run([train_op, cur_loss])
                duration = time.time() - start_time
                
                # Periodically report the current training progress and measure the training speed.
                if step != 0 and step % 10 == 0:
                    # Number of training examples processed in this step.
                    num_examples_per_step = BATCH_SIZE * N_GPU
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / N_GPU
    
                    # Print the training status.
                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
                    
                    # Visualize the training process with TensorBoard.
                    summary = sess.run(summary_op)
                    summary_writer.add_summary(summary, step)
    
                # Periodically save the current model.
                if step % 1000 == 0 or (step + 1) == TRAINING_STEPS:
                    checkpoint_path = os.path.join(MODEL_SAVE_PATH, MODEL_NAME)
                    saver.save(sess, checkpoint_path, global_step=step)
        
            coord.request_stop()
            coord.join(threads)
        
if __name__ == '__main__':
    tf.app.run()
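`get_input`, `get_loss`, and `average_gradients` are defined elsewhere in this example. As an illustration only, a hypothetical sketch of the tower-loss helper it expects, with `inference` standing in for the forward pass that is not shown, could be:

import tensorflow as tf

def get_loss(x, y_, regularizer, scope, reuse_variables=None):
    # Build one tower of the model, reusing the shared weights after the first tower.
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
        y = inference(x, regularizer)   # hypothetical forward pass, not shown here
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=y_))
    # Regularization losses that the forward pass added under this tower's scope.
    regularization_loss = tf.add_n(tf.get_collection('losses', scope))
    return cross_entropy + regularization_loss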
Пример #39
0
def train():
    import multiprocessing as mp
    mp.set_start_method('spawn', force=True)
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.TRAIN.GPU_LIST
    gpus = list(range(len(cfg.TRAIN.GPU_LIST.split(','))))
    num_gpus = len(gpus)

    restore_from_original_checkpoint = True
    checkpoint_path = cfg.TRAIN.LOG_DIR + COMMON_POSTFIX
    if not tf.io.gfile.exists(checkpoint_path):
        tf.io.gfile.makedirs(checkpoint_path)
    else:
        restore_from_original_checkpoint = False

    register_coco(os.path.expanduser(cfg.DATA.BASEDIR))
    data_iter = get_train_dataflow(batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU *
                                   num_gpus)
    ds = tf.data.Dataset.from_generator(
        lambda: map(
            lambda x: tuple([
                x[k]
                for k in ['images', 'gt_boxes', 'gt_labels', 'orig_gt_counts']
            ]), data_iter), (tf.float32, tf.float32, tf.int64, tf.int32),
        (tf.TensorShape([None, None, None, 3]), tf.TensorShape([
            None, None, 4
        ]), tf.TensorShape([None, None]), tf.TensorShape([
            None,
        ])))
    ds = ds.prefetch(buffer_size=128)
    ds = ds.make_one_shot_iterator()
    images, gt_boxes, gt_labels, orig_gt_counts = ds.get_next()

    if cfg.BACKBONE.DATA_FORMAT == 'channels_first':
        images = tf.transpose(images, [0, 3, 1, 2])  # NHWC --> NCHW

    # build optimizers
    global_step = tf.train.get_or_create_global_step()
    learning_rate = warmup_lr_schedule(init_learning_rate=cfg.TRAIN.BASE_LR,
                                       global_step=global_step,
                                       warmup_step=cfg.TRAIN.WARMUP_STEP)
    opt = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)

    sess_config = tf.ConfigProto()
    sess_config.allow_soft_placement = True
    sess_config.log_device_placement = False
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)

    if num_gpus > 1:

        base_inputs_list = [
            tf.split(value, num_or_size_splits=num_gpus, axis=0)
            for value in [images, gt_boxes, gt_labels, orig_gt_counts]
        ]

        tower_grads = []
        total_loss_dict = {
            'cls_loss': tf.constant(0.),
            'reg_loss': tf.constant(0.),
            'centerness_loss': tf.constant(0.)
        }
        for i, gpu_id in enumerate(gpus):
            with tf.device('/gpu:%d' % gpu_id):
                with tf.name_scope('model_%d' % gpu_id) as scope:
                    net_inputs = [input[i] for input in base_inputs_list]
                    tower_loss_dict = tower_loss_func(net_inputs,
                                                      reuse=(gpu_id > 0))
                    batch_norm_updates = tf.get_collection(
                        tf.GraphKeys.UPDATE_OPS, scope)

                    tower_loss = tf.add_n(
                        [v for k, v in tower_loss_dict.items()])

                    for k, v in tower_loss_dict.items():
                        total_loss_dict[k] += v

                    if i == num_gpus - 1:
                        wd_loss = regularize_cost('.*/kernel',
                                                  l2_regularizer(
                                                      cfg.TRAIN.WEIGHT_DECAY),
                                                  name='wd_cost')
                        tower_loss = tower_loss + wd_loss

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        if cfg.FCOS.VISUALIZATION:
                            with tf.device('/cpu:0'):
                                with tf.name_scope('loss-summaries'):
                                    for k, v in tower_loss_dict.items():
                                        summaries.append(
                                            tf.summary.scalar(k, v))

                    grads = opt.compute_gradients(tower_loss)
                    tower_grads.append(grads)

        grads = average_gradients(tower_grads)
        for k, v in total_loss_dict.items():
            total_loss_dict[k] = v / tf.cast(num_gpus, tf.float32)
        average_total_loss = tf.add_n([v for k, v in total_loss_dict.items()] +
                                      [wd_loss])
    else:
        net_inputs = [images, gt_boxes, gt_labels, orig_gt_counts]
        tower_loss_dict = tower_loss_func(net_inputs)
        batch_norm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        wd_loss = regularize_cost('.*/kernel',
                                  l2_regularizer(cfg.TRAIN.WEIGHT_DECAY),
                                  name='wd_cost')
        average_total_loss = tf.add_n([v for k, v in tower_loss_dict.items()] +
                                      [wd_loss])
        grads = opt.compute_gradients(average_total_loss)
        total_loss_dict = tower_loss_dict

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        if cfg.FCOS.VISUALIZATION:
            with tf.device('/cpu:0'):
                with tf.name_scope('loss-summaries'):
                    for k, v in tower_loss_dict.items():
                        summaries.append(tf.summary.scalar(k, v))

    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
    summaries.append(tf.summary.scalar('learning_rate', learning_rate))

    # add histograms for gradients
    for grad, var in grads:
        # print(grad, var)
        if grad is not None:
            summaries.append(
                tf.summary.histogram(var.op.name + '/gradients', grad))

    # add histograms for trainable variables
    for var in tf.trainable_variables():
        summaries.append(tf.summary.histogram(var.op.name, var))

    variable_averages = tf.train.ExponentialMovingAverage(
        cfg.TRAIN.MOVING_AVERAGE_DECAY, num_updates=global_step)
    variable_averages_op = variable_averages.apply(tf.trainable_variables())

    all_global_vars = []
    for var in tf.global_variables():
        all_global_vars.append(var.name + '\n')
        # print(var.name, var.shape)
    with open('all_global_vars.txt', 'w') as fp:
        fp.writelines(all_global_vars)

    all_trainable_vars = []
    for var in tf.trainable_variables():
        all_trainable_vars.append(var.name + '\n')
    with open('all_trainable_vars.txt', 'w') as fp:
        fp.writelines(all_trainable_vars)

    all_moving_average_vars = []
    for var in tf.moving_average_variables():
        all_moving_average_vars.append(var.name + '\n')
    with open('all_moving_average_variables.txt', 'w') as fp:
        fp.writelines(all_moving_average_vars)

    # batch norm updates
    batch_norm_updates_op = tf.group(*batch_norm_updates)
    with tf.control_dependencies(
        [apply_gradient_op, variable_averages_op, batch_norm_updates_op]):
        train_op = tf.no_op(name='train_op')

    saver = tf.train.Saver(tf.global_variables())
    summary_op = tf.summary.merge(summaries)
    summary_writer = tf.summary.FileWriter(checkpoint_path,
                                           tf.get_default_graph())

    init_op = tf.group(
        [tf.global_variables_initializer(),
         tf.local_variables_initializer()])
    sess.run(init_op)

    if False:
        print('load weights ...')
        ckpt_params = dict(np.load('MSRA-R50.npz'))
        assign_ops = []
        all_variables = []
        for var in tf.global_variables():
            dst_name = var.name
            all_variables.append(dst_name + '\n')
            if 'resnet50' in dst_name:
                src_name = dst_name.replace('resnet50/', ''). \
                    replace('conv2d/kernel:0', 'W') \
                    .replace('conv2d/bias:0', 'b') \
                    .replace('batch_normalization/gamma:0', 'gamma') \
                    .replace('batch_normalization/beta:0', 'beta') \
                    .replace('batch_normalization/moving_mean:0', 'mean/EMA') \
                    .replace('batch_normalization/moving_variance:0', 'variance/EMA') \
                    .replace('kernel:0', 'W').replace('bias:0', 'b')
                if 'batch_normalization' in dst_name:
                    src_name = src_name.replace('res', 'bn')
                    if 'conv1' in src_name:
                        src_name = 'bn_' + src_name

                if src_name == 'fc1000/W':
                    print('{} --> {} {}'.format('fc1000/W', dst_name,
                                                var.shape))
                    assign_ops.append(
                        tf.assign(
                            var, np.reshape(ckpt_params[src_name],
                                            [2048, 1000])))
                    continue
                if src_name in ckpt_params:
                    print('{} --> {} {}'.format(src_name, dst_name, var.shape))
                    assign_ops.append(tf.assign(var, ckpt_params[src_name]))
        print('load weights done.')
        with open('all_vars.txt', 'w') as fp:
            fp.writelines(all_variables)
        all_update_ops = []
        for op in tf.get_collection(tf.GraphKeys.UPDATE_OPS):
            all_update_ops.append(op.name + '\n')
        with open('all_update_ops.txt', 'w') as fp:
            fp.writelines(all_update_ops)
        sess.run(assign_ops)
    else:
        if False:
            all_vars = []
            restore_var_dict = {}
            for var in tf.global_variables():
                all_vars.append(var.name + '\n')
                if 'rpn' not in var.name and 'rcnn' not in var.name and 'global_step' not in var.name and \
                        'Momentum' not in var.name and 'ExponentialMovingAverage' not in var.name:
                    restore_var_dict[var.name.replace(':0', '')] = var
            with open('all_vars.txt', 'w') as fp:
                fp.writelines(all_vars)
            restorer = tf.train.Saver(var_list=restore_var_dict)
            restorer.restore(sess, cfg.BACKBONE.CHECKPOINT_PATH)
        else:
            if restore_from_original_checkpoint:
                # restore from official ResNet checkpoint
                all_vars = []
                restore_var_dict = {}
                for var in tf.global_variables():
                    all_vars.append(var.name + '\n')
                    if 'rpn' not in var.name and 'rcnn' not in var.name and 'fpn' not in var.name \
                        and 'fcos' not in var.name \
                            and 'global_step' not in var.name and \
                            'Momentum' not in var.name and 'ExponentialMovingAverage' not in var.name:
                        restore_var_dict[var.name.replace('resnet50/',
                                                          '').replace(
                                                              ':0', '')] = var
                        print(var.name, var.shape)
                with open('all_vars.txt', 'w') as fp:
                    fp.writelines(all_vars)
                restore_vars_names = [
                    k + '\n' for k in restore_var_dict.keys()
                ]
                with open('all_restore_vars.txt', 'w') as fp:
                    fp.writelines(restore_vars_names)
                restorer = tf.train.Saver(var_list=restore_var_dict)
                restorer.restore(sess, cfg.BACKBONE.CHECKPOINT_PATH)
            else:
                all_vars = []
                restore_var_dict = {}
                for var in tf.global_variables():
                    all_vars.append(var.name + '\n')
                    restore_var_dict[var.name.replace(':0', '')] = var
                with open('all_vars.txt', 'w') as fp:
                    fp.writelines(all_vars)
                # restore from local checkpoint
                restorer = tf.train.Saver(tf.global_variables())
                try:
                    restorer.restore(
                        sess, tf.train.latest_checkpoint(checkpoint_path))
                except:
                    pass

    # record all ops
    all_operations = []
    for op in sess.graph.get_operations():
        all_operations.append(op.name + '\n')
    with open('all_ops.txt', 'w') as fp:
        fp.writelines(all_operations)

    loss_names = ['cls_loss', 'reg_loss', 'centerness_loss']
    sess2run = list()
    sess2run.append(train_op)
    sess2run.append(learning_rate)
    sess2run.append(average_total_loss)
    sess2run.append(wd_loss)
    sess2run.extend([total_loss_dict[k] for k in loss_names])

    print('begin training ...')
    step = sess.run(global_step)
    step0 = step
    start = time.time()
    for step in range(step, cfg.TRAIN.MAX_STEPS):

        if step % cfg.TRAIN.SAVE_SUMMARY_STEPS == 0:

            _, lr_, tl_, wd_loss_, \
            cls_loss_, reg_loss_, centerness_loss_, \
            summary_str = sess.run(sess2run + [summary_op])

            avg_time_per_step = (time.time() -
                                 start) / cfg.TRAIN.SAVE_SUMMARY_STEPS
            avg_examples_per_second = (cfg.TRAIN.SAVE_SUMMARY_STEPS * cfg.TRAIN.BATCH_SIZE_PER_GPU * num_gpus) \
                                      / (time.time() - start)
            start = time.time()
            print('Step {:06d}, LR: {:.6f} LOSS: {:.4f}, '
                  'CLS: {:.4f}, BOX: {:.4f}, CET: {:.4f}, wd: {:.4f}, '
                  '{:.2f} s/step, {:.2f} samples/s'.format(
                      step, lr_, tl_, cls_loss_, reg_loss_, centerness_loss_,
                      wd_loss_, avg_time_per_step, avg_examples_per_second))

            summary_writer.add_summary(summary_str, global_step=step)
        else:
            sess.run(train_op)

        if step % 1000 == 0:
            saver.save(sess, checkpoint_path + '/model.ckpt', global_step=step)

        # profile the graph execution
        if 1510 <= (step - step0) <= 1520:
            from tensorflow.python.client import timeline
            options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
            sess.run(train_op, options=options, run_metadata=run_metadata)
            fetched_timeline = timeline.Timeline(run_metadata.step_stats)
            chrome_trace = fetched_timeline.generate_chrome_trace_format()
            with open('{}/timeline_step{}.json'.format(checkpoint_path, step),
                      'w') as fp:
                fp.write(chrome_trace)
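`warmup_lr_schedule`, `tower_loss_func`, and the load/save helpers come from the surrounding project and are not reproduced here. As an illustration only, a minimal linear warm-up schedule of the kind the call above suggests could look like:

import tensorflow as tf

def warmup_lr_schedule(init_learning_rate, global_step, warmup_step):
    # Ramp the learning rate linearly from 0 to init_learning_rate over
    # warmup_step steps, then hold it constant; the original project may
    # decay it further after warm-up.
    step = tf.cast(global_step, tf.float32)
    warmup_lr = init_learning_rate * step / float(warmup_step)
    return tf.minimum(warmup_lr, init_learning_rate)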
Пример #40
0
def train(dataset):
  """Train on dataset for a number of steps."""
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Create a variable to count the number of train() calls. This equals the
    # number of batches processed * FLAGS.num_gpus.
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                             FLAGS.batch_size)
    decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                    global_step,
                                    decay_steps,
                                    FLAGS.learning_rate_decay_factor,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY,
                                    momentum=RMSPROP_MOMENTUM,
                                    epsilon=RMSPROP_EPSILON)

    # Get images and labels for ImageNet and split the batch across GPUs.
    assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
        'Batch size must be divisible by number of GPUs')
    split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

    # Override the number of preprocessing threads to account for the increased
    # number of GPU towers.
    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    images, labels = image_processing.distorted_inputs(
        dataset,
        num_preprocess_threads=num_preprocess_threads)

    input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Number of classes in the Dataset label set plus 1.
    # Label 0 is reserved for an (unused) background class.
    num_classes = dataset.num_classes() + 1

    # Split the batch of images and labels for towers.
    images_splits = tf.split(0, FLAGS.num_gpus, images)
    labels_splits = tf.split(0, FLAGS.num_gpus, labels)

    # Calculate the gradients for each model tower.
    tower_grads = []
    reuse_variables = None
    for i in xrange(FLAGS.num_gpus):
      with tf.device('/gpu:%d' % i):
        with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope:
          # Force all Variables to reside on the CPU.
          with slim.arg_scope([slim.variables.variable], device='/cpu:0'):
            # Calculate the loss for one tower of the ImageNet model. This
            # function constructs the entire ImageNet model but shares the
            # variables across all towers.
            loss = _tower_loss(images_splits[i], labels_splits[i], num_classes,
                               scope, reuse_variables)

          # Reuse variables for the next tower.
          reuse_variables = True

          # Retain the summaries from the final tower.
          summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

          # Retain the Batch Normalization update operations only from the
          # final tower. Ideally, we should grab the updates from all towers
          # but these stats accumulate extremely fast so we can ignore the
          # other stats from the other towers without significant detriment.
          batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                                scope)

          # Calculate the gradients for the batch of data on this ImageNet
          # tower.
          grads = opt.compute_gradients(loss)

          # Keep track of the gradients across all towers.
          tower_grads.append(grads)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = _average_gradients(tower_grads)

    # Add summaries for the input processing and global_step.
    summaries.extend(input_summaries)

    # Add a summary to track the learning rate.
    summaries.append(tf.scalar_summary('learning_rate', lr))

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        summaries.append(
            tf.histogram_summary(var.op.name + '/gradients', grad))

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.histogram_summary(var.op.name, var))

    # Track the moving averages of all trainable variables.
    # Note that we maintain a "double-average" of the BatchNormalization
    # global statistics. This is more complicated than it needs to be, but we
    # keep it for backward compatibility with our previous models.
    variable_averages = tf.train.ExponentialMovingAverage(
        inception.MOVING_AVERAGE_DECAY, global_step)

    # Another possibility is to use tf.slim.get_variables().
    variables_to_average = (tf.trainable_variables() +
                            tf.moving_average_variables())
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all updates into a single train op.
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    train_op = tf.group(apply_gradient_op, variables_averages_op,
                        batchnorm_updates_op)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation from the last tower summaries.
    summary_op = tf.merge_summary(summaries)

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph. allow_soft_placement must be set to
    # True to build towers on GPU, as some of the ops do not have GPU
    # implementations.
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    if FLAGS.pretrained_model_checkpoint_path:
      assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
      variables_to_restore = tf.get_collection(
          slim.variables.VARIABLES_TO_RESTORE)
      restorer = tf.train.Saver(variables_to_restore)
      restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
      print('%s: Pre-trained model restored from %s' %
            (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(
        FLAGS.train_dir,
        graph_def=sess.graph.as_graph_def(add_shapes=True))

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        examples_per_sec = FLAGS.batch_size / float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, duration))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
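This variant is written against the pre-1.0 TensorFlow API. The deprecated symbols it uses map onto the 1.x names seen in the earlier examples roughly as follows:

# Pre-1.0 symbol                          TF 1.x equivalent
# tf.scalar_summary(tag, value)       ->  tf.summary.scalar(name, value)
# tf.histogram_summary(tag, values)   ->  tf.summary.histogram(name, values)
# tf.merge_summary(summaries)         ->  tf.summary.merge(summaries)
# tf.train.SummaryWriter(logdir)      ->  tf.summary.FileWriter(logdir)
# tf.all_variables()                  ->  tf.global_variables()
# tf.initialize_all_variables()       ->  tf.global_variables_initializer()
# tf.split(axis, num_splits, value)   ->  tf.split(value, num_or_size_splits, axis)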
Пример #41
0
def inception_model_fn(features, labels, mode, params):
    """Inception v4 model using Estimator API."""
    num_classes = FLAGS.num_classes
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    features = tensor_transform_fn(features, params['model_transpose_dims'])

    if FLAGS.clear_update_collections:
        with arg_scope(
                inception.inception_v4_arg_scope(
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON,
                    updates_collections=None)):
            logits, end_points = inception.inception_v4(
                features, num_classes, is_training=is_training)
    else:
        with arg_scope(
                inception.inception_v4_arg_scope(
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON)):
            logits, end_points = inception.inception_v4(
                features, num_classes, is_training=is_training)

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not FLAGS.use_tpu):
        with tf.control_dependencies([
                tf.Print(predictions['classes'], [predictions['classes']],
                         summarize=FLAGS.eval_batch_size,
                         message='prediction: ')
        ]):
            labels = tf.Print(labels, [labels],
                              summarize=FLAGS.eval_batch_size,
                              message='label: ')

    one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)
    if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                        logits=end_points['AuxLogits'],
                                        weights=0.4,
                                        label_smoothing=0.1,
                                        scope='aux_loss')

    tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                    logits=logits,
                                    weights=1.0,
                                    label_smoothing=0.1)
    loss = tf.losses.get_total_loss(add_regularization_losses=True)

    initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
    # Adjust the initial learning rate for warmup
    initial_learning_rate /= (
        FLAGS.learning_rate_decay**((FLAGS.warmup_epochs + FLAGS.cold_epochs) /
                                    FLAGS.learning_rate_decay_epochs))
    final_learning_rate = 0.0001 * initial_learning_rate

    train_op = None
    if is_training:
        batches_per_epoch = _NUM_TRAIN_IMAGES // FLAGS.train_batch_size
        global_step = tf.train.get_or_create_global_step()
        cur_epoch = tf.cast(
            (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

        clr = FLAGS.cold_learning_rate
        wlr = initial_learning_rate / (FLAGS.warmup_epochs + FLAGS.cold_epochs)
        learning_rate = tf.where(
            tf.greater_equal(cur_epoch, FLAGS.cold_epochs), (tf.where(
                tf.greater_equal(cur_epoch,
                                 FLAGS.warmup_epochs + FLAGS.cold_epochs),
                tf.train.exponential_decay(
                    learning_rate=initial_learning_rate,
                    global_step=global_step,
                    decay_steps=FLAGS.learning_rate_decay_epochs *
                    batches_per_epoch,
                    decay_rate=FLAGS.learning_rate_decay,
                    staircase=True),
                tf.multiply(tf.cast(cur_epoch, tf.float32), wlr))), clr)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate,
                                   final_learning_rate,
                                   name='learning_rate')

        if FLAGS.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif FLAGS.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=0.9)
        elif FLAGS.optimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  RMSPROP_DECAY,
                                                  momentum=RMSPROP_MOMENTUM,
                                                  epsilon=RMSPROP_EPSILON)
        else:
            tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)

        if FLAGS.use_tpu:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        if FLAGS.moving_average:
            ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY,
                                                    num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            with tf.control_dependencies([train_op
                                          ]), tf.name_scope('moving_average'):
                train_op = ema.apply(variables_to_average)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, predictions):
            accuracy = tf.metrics.accuracy(
                labels, tf.argmax(input=predictions, axis=1))
            return {'accuracy': accuracy}

        if FLAGS.use_logits:
            eval_predictions = logits
        else:
            eval_predictions = end_points['Predictions']

        eval_metrics = (metric_fn, [labels, eval_predictions])

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metrics=eval_metrics)
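The `clear_update_collections` branch above controls where batch norm keeps its moving-average update ops. When they stay in `tf.GraphKeys.UPDATE_OPS` (the default), the train op must depend on them explicitly, as this example does; a minimal sketch of that pattern in isolation:

import tensorflow as tf

# Batch norm with training=True registers its moving-average updates in
# tf.GraphKeys.UPDATE_OPS; without the control dependency below, the moving
# statistics would never be updated during training.
x = tf.placeholder(tf.float32, [None, 8])
labels = tf.placeholder(tf.int32, [None])
net = tf.layers.batch_normalization(x, training=True)
logits = tf.layers.dense(net, 2)
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)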
Пример #42
0
    def build_graph(self, filenames, labels, subset, feed_hypes=None):

        hypes = self.hypes.copy()

        if feed_hypes:
            with tf.name_scope(None):
                for i in feed_hypes:
                    hypes[i] = tf.placeholder("float32", name=i)
                    hypes[i].set_shape([])

        with tf.name_scope("inputs"):

            filenames, labels = tf.train.slice_input_producer(
                tensor_list=[filenames, labels], capacity=hypes["batch_size"] * 2, shuffle=(subset == "train")
            )

            filenames, labels = tf.train.batch(
                tensor_list=[filenames, labels], capacity=hypes["batch_size"] * 2, batch_size=hypes["batch_size"]
            )

            images0 = [
                tf.image.decode_jpeg(tf.read_file(i[0]), channels=3)
                for i in tf.split(0, hypes["batch_size"], filenames)
            ]

            images0 = [skin.util.square_pad(i) for i in images0]

            if subset == "train":
                images0 = [tf.image.random_flip_left_right(i) for i in images0]
                images0 = [tf.image.random_flip_up_down(i) for i in images0]

            if hypes["spatial_transformer"]:
                images = skin.util.spatial_tranform(
                    images0, hypes["batch_size"], subset, hypes["loc_net"], hypes["xform_reg"]
                )
            else:
                images = tf.pack([tf.image.resize_images(i, 299, 299) for i in images0])

            with tf.name_scope(None):
                images = tf.identity(images, name="input")

        logits, logits_aux = inception_model.inference(
            images=(images - 128) / 128.0,
            num_classes=len(self.labels),
            for_training=(subset == "train"),
            restore_logits=(subset != "train"),
        )

        with tf.name_scope(None):
            logits = tf.identity(logits, name="logits")
        tf.histogram_summary("logits", logits)

        with tf.name_scope("loss"):

            batch_size, num_classes = logits.get_shape().as_list()

            labels_sparse = tf.sparse_to_dense(
                sparse_indices=tf.transpose(tf.pack([tf.range(batch_size), labels])),
                output_shape=[batch_size, num_classes],
                sparse_values=np.ones(batch_size, dtype="float32"),
            )

            loss = tf.nn.softmax_cross_entropy_with_logits(logits, labels_sparse)
            loss = tf.reduce_mean(loss, name="loss")

            loss_aux = tf.nn.softmax_cross_entropy_with_logits(logits_aux, labels_sparse)
            loss_aux = tf.reduce_mean(loss_aux, name="loss_aux")

            loss = 0.7 * loss + 0.3 * loss_aux

            tf.scalar_summary("loss", loss)

        fetches = {"loss": loss, "filenames": filenames, "logits": logits}

        def print_graph_ops():
            with open("/tmp/graph_ops.txt", "w") as f:
                for op in tf.get_default_graph().get_operations():
                    f.write(op.type.ljust(35) + "\t" + op.name + "\n")

        if subset == "train":

            reg_losses = tf.get_collection("regularization_losses")

            for i, j in enumerate(reg_losses):
                if "loc_net" in j.name:
                    reg_losses[i] *= hypes["loc_net_reg"]

            reg_loss = tf.add_n(reg_losses)
            tf.scalar_summary("reg_loss", reg_loss)

            with tf.variable_scope("reg_loss"):
                loss += reg_loss

            print_graph_ops()

            global_step = tf.Variable(0, name="global_step", trainable=False)

            opt = eval("tf.train.{}Optimizer".format("Adam"))(
                learning_rate=hypes["learning_rate"],
                epsilon=hypes["epsilon"],
                beta1=hypes["beta1"],
                beta2=hypes["beta2"],
            )

            grads = opt.compute_gradients(loss)
            apply_grads = opt.apply_gradients(grads, global_step)

            variable_averages = tf.train.ExponentialMovingAverage(hypes["variable_averages_decay"], global_step)
            variables_to_average = tf.trainable_variables() + tf.moving_average_variables()
            variables_averages_op = variable_averages.apply(variables_to_average)

            batchnorm_updates_op = tf.group(*tf.get_collection("_update_ops_"))

            train_op = tf.group(apply_grads, variables_averages_op, batchnorm_updates_op)

            for grad, var in grads:
                tf.histogram_summary(var.op.name, var)
                try:
                    tf.histogram_summary(var.op.name + "/gradients", grad)
                except:
                    print(var.op.name)

            fetches.update({"reg_loss": reg_loss, "train_op": train_op, "global_step": global_step})

        else:

            print_graph_ops()

        return fetches
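Several of these examples pass `global_step` as the second (`num_updates`) argument of `ExponentialMovingAverage`, as this one does alongside `hypes["variable_averages_decay"]`. With `num_updates` set, the decay actually applied at each step is documented as `min(decay, (1 + num_updates) / (10 + num_updates))`, so the averages track the raw values closely early in training:

def effective_ema_decay(decay, num_updates):
    # Decay used by tf.train.ExponentialMovingAverage when num_updates is given.
    return min(decay, (1.0 + num_updates) / (10.0 + num_updates))

print(effective_ema_decay(0.9999, 10))       # 0.55   -> early steps barely smoothed
print(effective_ema_decay(0.9999, 100000))   # 0.9999 -> full smoothing later on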
Пример #43
0
def train(scope=''):
    """Train on dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        train_dirs = FLAGS.datasets.split(':')

        # Calculate the learning rate schedule.
        decay_steps = 15000

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.AdamOptimizer(lr)

        # Override the number of preprocessing threads to account for the increased
        # number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads

        _images, _shapes, _reference_shape, pca_model = \
            data_provider.load_images(train_dirs)

        reference_shape = tf.constant(_reference_shape,
                                      dtype=tf.float32,
                                      name='reference_shape')

        image_shape = _images[0].shape
        lms_shape = _shapes[0].points.shape

        def get_random_sample(rotation_stddev=10):
            idx = np.random.randint(low=0, high=len(_images))
            im = menpo.image.Image(_images[idx].transpose(2, 0, 1), copy=False)
            lms = _shapes[idx]
            im.landmarks['PTS'] = lms

            if np.random.rand() < .5:
                im = utils.mirror_image(im)

            if np.random.rand() < .5:
                theta = np.random.normal(scale=rotation_stddev)
                rot = menpo.transform.rotate_ccw_about_centre(lms, theta)
                im = im.warp_to_shape(im.shape, rot)

            pixels = im.pixels.transpose(1, 2, 0).astype('float32')
            shape = im.landmarks['PTS'].lms.points.astype('float32')
            return pixels, shape

        image, shape = tf.py_func(get_random_sample, [],
                                  [tf.float32, tf.float32], stateful=True)

        initial_shape = data_provider.random_shape(shape, reference_shape,
                                                   pca_model)
        image.set_shape(image_shape)
        shape.set_shape(lms_shape)
        initial_shape.set_shape(lms_shape)

        image = data_provider.distort_color(image)

        images, lms, inits = tf.train.batch([image, shape, initial_shape],
                                            FLAGS.batch_size,
                                            dynamic_pad=False,
                                            capacity=5000,
                                            enqueue_many=False,
                                            num_threads=num_preprocess_threads,
                                            name='batch')
        print('Defining model...')
        with tf.device(FLAGS.train_device):
            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
            predictions, dxs, _ = mdm_model.model(
                images, inits, patch_shape=(FLAGS.patch_size, FLAGS.patch_size))

            total_loss = 0

            for i, dx in enumerate(dxs):
                norm_error = mdm_model.normalized_rmse(dx + inits, lms)
                tf.summary.histogram('errors', norm_error)
                loss = tf.reduce_mean(norm_error)
                total_loss += loss
                summaries.append(tf.summary.scalar('losses/step_{}'.format(i),
                                                   loss))

            # Calculate the gradients for the batch of data
            grads = opt.compute_gradients(total_loss)

        summaries.append(tf.summary.scalar('losses/total', total_loss))
        pred_images, = tf.py_func(utils.batch_draw_landmarks,
                                  [images, predictions], [tf.float32])
        gt_images, = tf.py_func(utils.batch_draw_landmarks, [images, lms],
                                [tf.float32])

        summary = tf.summary.image('images',
                                   tf.concat([gt_images, pred_images], 2),
                                   max_outputs=5)
        summaries.append(tf.summary.histogram('dx', predictions - inits))

        summaries.append(summary)

        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                              scope)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(tf.summary.histogram(var.op.name +
                                                      '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than it needs to be, but we
        # keep it for backward compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        # NOTE: Currently we are not using batchnorm in MDM.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)
        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        print('Initializing variables...')
        sess.run(init)
        print('Initialized variables.')

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir)

        print('Starting training...')
        for step in range(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, total_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            if step % 20 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 50 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)