def get_train_op(loss, params):
    """Generate training operation that updates variables based on loss."""
    with tf.variable_scope("get_train_op"):
        mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                     value=params.learning_rate_warmup_steps)
        learning_rate = get_learning_rate(params.learning_rate,
                                          params.hidden_size,
                                          params.learning_rate_warmup_steps)
        log_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
        learning_rate = tf_mlperf_log.log_deferred(op=learning_rate,
                                                   log_id=log_id,
                                                   every_n=100)

        # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
        # than the TF core Adam optimizer.
        mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                     value=mlperf_log.LAZY_ADAM)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                                     value=params.optimizer_adam_beta1)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                                     value=params.optimizer_adam_beta2)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                                     value=params.optimizer_adam_epsilon)
        optimizer = tf.contrib.opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params.optimizer_adam_beta1,
            beta2=params.optimizer_adam_beta2,
            epsilon=params.optimizer_adam_epsilon)

        # Calculate and apply gradients using LazyAdamOptimizer.
        global_step = tf.train.get_global_step()
        tvars = tf.trainable_variables()
        gradients = optimizer.compute_gradients(
            loss, tvars, colocate_gradients_with_ops=True)
        train_op = optimizer.apply_gradients(gradients,
                                             global_step=global_step,
                                             name="train")

        # Save gradient norm to TensorBoard.
        tf.summary.scalar("global_norm/gradient_norm",
                          tf.global_norm(list(zip(*gradients))[0]))

        return train_op
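
The get_train_op above calls a get_learning_rate helper that is not included in this listing. Below is a minimal sketch, assuming the standard Transformer schedule (scale by hidden_size ** -0.5, linear warmup, then reciprocal square-root decay); the actual helper may differ in details.

def get_learning_rate(learning_rate, hidden_size, learning_rate_warmup_steps):
    """Compute the learning rate with linear warmup and rsqrt decay (assumed schedule)."""
    with tf.name_scope("learning_rate"):
        warmup_steps = tf.to_float(learning_rate_warmup_steps)
        step = tf.to_float(tf.train.get_or_create_global_step())

        # Scale the base rate by the model dimension, as in the Transformer paper.
        learning_rate *= (hidden_size ** -0.5)
        # Linear warmup, then reciprocal square-root decay.
        learning_rate *= tf.minimum(1.0, step / warmup_steps)
        learning_rate *= tf.rsqrt(tf.maximum(step, warmup_steps))
        return learning_rate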
Example #2
def get_train_op(loss, params):
    """Generate training operation that updates variables based on loss."""
    with tf.compat.v1.variable_scope("get_train_op"):
        mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                     value=params.learning_rate_warmup_steps)
        learning_rate = get_learning_rate(params.learning_rate,
                                          params.hidden_size,
                                          params.learning_rate_warmup_steps)
        log_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
        learning_rate = tf_mlperf_log.log_deferred(op=learning_rate,
                                                   log_id=log_id,
                                                   every_n=100)

        # Create optimizer. The original code used LazyAdamOptimizer from TF
        # contrib (faster than the core Adam optimizer); see the notes below on
        # why plain Adam is used here.
        mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                     value=mlperf_log.LAZY_ADAM)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                                     value=params.optimizer_adam_beta1)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                                     value=params.optimizer_adam_beta2)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                                     value=params.optimizer_adam_epsilon)
        # Use the v1 optimizer (from tensorflow.python.training).
        # The optimizer v2 version of the code is below (commented out).
        # Optimizer v1 does not have a LazyAdam optimizer
        # (it was in contrib, which is now deprecated).
        optimizer = adam.AdamOptimizer(learning_rate,
                                       beta1=params.optimizer_adam_beta1,
                                       beta2=params.optimizer_adam_beta2,
                                       epsilon=params.optimizer_adam_epsilon)

        # Calculate and apply gradients using the Adam optimizer.
        global_step = tf.compat.v1.train.get_global_step()
        tvars = tf.compat.v1.trainable_variables()
        grads_and_vars = optimizer.compute_gradients(loss, tvars)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step,
                                             name="train")
        # Save gradient norm to TensorBoard.
        tf.compat.v1.summary.scalar(
            "global_norm/gradient_norm",
            tf.linalg.global_norm(list(zip(*grads_and_vars))[0]))
        # The alternative below uses the tfa (tensorflow_addons) optimizer,
        # which in turn uses optimizer_v2 (from tf.python.keras.optimizer_v2).
        # optimizer_v2 warns that the global step is not updated, because its
        # apply_gradients() does not accept a global_step argument; the global
        # step is therefore incremented explicitly and grouped with the train op.
        # To use LazyAdam from tensorflow-addons, enable the following block
        # and remove the optimizer v1 code above.
        # Currently the v1 and v2 optimizers take about the same time.
        '''
        optimizer = tfa.optimizers.LazyAdam(
            learning_rate,
            beta_1=params.optimizer_adam_beta1,
            beta_2=params.optimizer_adam_beta2,
            epsilon=params.optimizer_adam_epsilon)

        # Calculate and apply gradients using LazyAdam.
        global_step = tf.compat.v1.train.get_global_step()
        tvars = tf.compat.v1.trainable_variables()
        tvars = tvars[0:len(tvars) - 1]
        gradients = optimizer.get_gradients(loss, tvars)
        grads_and_vars = zip(gradients, tvars)
        train_op = optimizer.apply_gradients(grads_and_vars)
        # Save gradient norm to TensorBoard.
        tf.compat.v1.summary.scalar(
            "global_norm/gradient_norm",
            tf.compat.v1.linalg.global_norm(list(gradients)))
        update_global_step = tf.compat.v1.assign(
            global_step, global_step + 1, name="update_global_step")
        train_op = tf.compat.v1.group(train_op, [update_global_step])
        '''
        return train_op
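
For context, here is a hedged sketch of how a get_train_op like the ones above is typically consumed by a TF1-style Estimator model_fn. build_model and compute_loss are illustrative placeholders, not part of the original code.

def model_fn(features, labels, mode, params):
    # build_model and compute_loss are assumed placeholders for the actual
    # model and loss construction in the surrounding code base.
    logits = build_model(features, training=(mode == tf.estimator.ModeKeys.TRAIN))
    loss = compute_loss(logits, labels)

    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = get_train_op(loss, params)

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)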
Example #3
def resnet_model_fn(features,
                    labels,
                    mode,
                    model_class,
                    resnet_size,
                    weight_decay,
                    learning_rate_fn,
                    momentum,
                    data_format,
                    version,
                    loss_scale,
                    loss_filter_fn=None,
                    dtype=resnet_model.DEFAULT_DTYPE,
                    label_smoothing=0.0,
                    enable_lars=False):
    """Shared functionality for different resnet model_fns.

  Initializes the ResnetModel representing the model layers
  and uses that model to build the necessary EstimatorSpecs for
  the `mode` in question. For training, this means building losses,
  the optimizer, and the train op that get passed into the EstimatorSpec.
  For evaluation and prediction, the EstimatorSpec is returned without
  a train op, but with the necessary parameters for the given mode.

  Args:
    features: tensor representing input images
    labels: tensor representing class labels for all input images
    mode: current estimator mode; should be one of
      `tf.estimator.ModeKeys.TRAIN`, `EVAL`, or `PREDICT`
    model_class: a class representing a TensorFlow model that has a __call__
      function. We assume here that this is a subclass of ResnetModel.
    resnet_size: A single integer for the size of the ResNet model.
    weight_decay: weight decay loss rate used to regularize learned variables.
    learning_rate_fn: function that returns the current learning rate given
      the current global_step
    momentum: momentum term used for optimization
    data_format: Input format ('channels_last', 'channels_first', or None).
      If set to None, the format is dependent on whether a GPU is available.
    version: Integer representing which version of the ResNet network to use.
      See README for details. Valid values: [1, 2]
    loss_scale: The factor to scale the loss for numerical stability. A detailed
      summary is present in the arg parser help text.
    loss_filter_fn: function that takes a string variable name and returns
      True if the var should be included in loss calculation, and False
      otherwise. If None, batch_normalization variables will be excluded
      from the loss.
    dtype: the TensorFlow dtype to use for calculations.
    label_smoothing: If greater than 0, apply label smoothing with this factor
      to the cross-entropy loss.
    enable_lars: If True, use the LARS optimizer for training instead of
      SGD with momentum.

  Returns:
    EstimatorSpec parameterized according to the input params and the
    current mode.
  """

    # Generate a summary node for the images
    tf.summary.image('images', features, max_outputs=6)

    # Check that features/images have the same data type as the one used for calculations.
    assert features.dtype == dtype

    features = tf.cast(features, dtype)

    model = model_class(resnet_size, data_format, version=version, dtype=dtype)

    logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)

    # This acts as a no-op if the logits are already in fp32 (provided logits are
    # not a SparseTensor). If dtype is low precision, logits must be cast to
    # fp32 for numerical stability.
    logits = tf.cast(logits, tf.float32)

    num_examples_metric = tf_mlperf_log.sum_metric(tensor=tf.shape(logits)[0],
                                                   name=_NUM_EXAMPLES_NAME)

    predictions = {
        'classes': tf.argmax(logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Return the predictions and the specification for serving a SavedModel
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'predict': tf.estimator.export.PredictOutput(predictions)
            })

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_LOSS_FN,
                            value=mlperf_log.CCE)

    if label_smoothing != 0.0:
        one_hot_labels = tf.one_hot(labels, 1001)
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits,
            onehot_labels=one_hot_labels,
            label_smoothing=label_smoothing)
    else:
        cross_entropy = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                               labels=labels)

    # Create a tensor named cross_entropy for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy')
    tf.summary.scalar('cross_entropy', cross_entropy)

    # If no loss_filter_fn is passed, assume we want the default behavior,
    # which is that batch_normalization variables are excluded from loss.
    def exclude_batch_norm(name):
        return 'batch_normalization' not in name

    loss_filter_fn = loss_filter_fn or exclude_batch_norm

    mlperf_log.resnet_print(key=mlperf_log.MODEL_EXCLUDE_BN_FROM_L2,
                            value=not loss_filter_fn('batch_normalization'))

    # Add weight decay to the loss.
    mlperf_log.resnet_print(key=mlperf_log.MODEL_L2_REGULARIZATION,
                            value=weight_decay)
    l2_loss = weight_decay * tf.add_n(
        # loss is computed using fp32 for numerical stability.
        [
            tf.nn.l2_loss(tf.cast(v, tf.float32))
            for v in tf.trainable_variables() if loss_filter_fn(v.name)
        ])
    tf.summary.scalar('l2_loss', l2_loss)
    loss = cross_entropy + l2_loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()

        learning_rate = learning_rate_fn(global_step)

        log_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
        learning_rate = tf_mlperf_log.log_deferred(op=learning_rate,
                                                   log_id=log_id,
                                                   every_n=100)

        # Create a tensor named learning_rate for logging purposes
        tf.identity(learning_rate, name='learning_rate')
        tf.summary.scalar('learning_rate', learning_rate)

        mlperf_log.resnet_print(key=mlperf_log.OPT_NAME,
                                value=mlperf_log.SGD_WITH_MOMENTUM)
        mlperf_log.resnet_print(key=mlperf_log.OPT_MOMENTUM, value=momentum)

        if enable_lars:
            optimizer = tf.contrib.opt.LARSOptimizer(
                learning_rate,
                momentum=momentum,
                weight_decay=weight_decay,
                skip_list=['batch_normalization', 'bias'])
        else:
            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=momentum)

        if loss_scale != 1:
            # When computing fp16 gradients, often intermediate tensor values are
            # so small, they underflow to 0. To avoid this, we multiply the loss by
            # loss_scale to make these tensor values loss_scale times bigger.
            scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale)

            # Once the gradient computation is complete we can scale the gradients
            # back to the correct scale before passing them to the optimizer.
            unscaled_grad_vars = [(grad / loss_scale,
                                   var) if grad is not None else (grad, var)
                                  for grad, var in scaled_grad_vars]
            minimize_op = optimizer.apply_gradients(unscaled_grad_vars,
                                                    global_step)
        else:
            minimize_op = optimizer.minimize(loss, global_step)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops, num_examples_metric[1])
    else:
        train_op = None

    accuracy = tf.metrics.accuracy(labels, predictions['classes'])
    accuracy_top_5 = tf.metrics.mean(
        tf.nn.in_top_k(predictions=logits,
                       targets=labels,
                       k=5,
                       name='top_5_op'))

    metrics = {
        'accuracy': accuracy,
        'accuracy_top_5': accuracy_top_5,
        _NUM_EXAMPLES_NAME: num_examples_metric
    }

    # Create a tensor named train_accuracy for logging purposes
    tf.identity(accuracy[1], name='train_accuracy')
    tf.identity(accuracy_top_5[1], name='train_accuracy_top_5')
    tf.summary.scalar('train_accuracy', accuracy[1])
    tf.summary.scalar('train_accuracy_top_5', accuracy_top_5[1])

    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=metrics)
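
A minimal sketch of how a model_fn with this signature is usually bound to concrete values and handed to an Estimator. ImagenetModel, learning_rate_fn, and the hyperparameter values are assumed placeholders rather than code from this listing.

def imagenet_model_fn(features, labels, mode, params):
    # ImagenetModel and learning_rate_fn are assumed placeholders; the real
    # values come from the surrounding code base.
    return resnet_model_fn(
        features=features,
        labels=labels,
        mode=mode,
        model_class=ImagenetModel,
        resnet_size=50,
        weight_decay=1e-4,
        learning_rate_fn=learning_rate_fn,
        momentum=0.9,
        data_format=None,
        version=1,
        loss_scale=1)

classifier = tf.estimator.Estimator(model_fn=imagenet_model_fn,
                                    model_dir="/tmp/resnet_model")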
Example #4
    def log_deferred_tensor_value(self, key, tensor_value, stack_offset=2,
                                  every_n=1, first_n=None):
        """Log the value of a tensor once the graph has been run (deferred logging)."""
        log_id = self._log_fn(key, stack_offset=stack_offset, deferred=True)
        return tf_mlperf_log.log_deferred(op=tensor_value, log_id=log_id,
                                          every_n=every_n, first_n=first_n)
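
A hedged usage sketch of the method above, mirroring how the earlier examples defer the learning-rate log. logger stands for an instance of the (unshown) class that owns this method.

# `logger` is an assumed instance of the class defining log_deferred_tensor_value;
# `learning_rate` is any scalar tensor to be logged every 100 steps.
learning_rate = logger.log_deferred_tensor_value(key=mlperf_log.OPT_LR,
                                                 tensor_value=learning_rate,
                                                 every_n=100)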