예제 #1
0
 def test_policy_scope(self):
     """Checks the global policy name inside and after nested policy scopes."""
     # The policy expected outside of any scope depends on whether the V2
     # dtype behavior is enabled.
     v2_enabled = base_layer_utils.v2_dtype_behavior_enabled()
     expected_default = 'float32' if v2_enabled else '_infer'

     def current_name():
         return mp_policy.global_policy().name

     with mp_policy.policy_scope('mixed_float16'):
         self.assertEqual(current_name(), 'mixed_float16')
         with mp_policy.policy_scope('_infer'):
             self.assertEqual(current_name(), '_infer')
         # Back in the outer scope, the outer policy must be visible again.
         self.assertEqual(current_name(), 'mixed_float16')
     # Outside of every scope, the default policy is expected.
     self.assertEqual(current_name(), expected_default)
예제 #2
0
 def test_global_policy(self):
     """Checks that set_policy() changes what global_policy() returns."""
     # The policy expected before anything is set depends on whether the V2
     # dtype behavior is enabled.
     v2_enabled = base_layer_utils.v2_dtype_behavior_enabled()
     expected_default = 'float32' if v2_enabled else '_infer'
     self.assertEqual(mp_policy.global_policy().name, expected_default)
     try:
         mp_policy.set_policy('mixed_float16')
         self.assertEqual(mp_policy.global_policy().name, 'mixed_float16')

         # Policies are not associated with a graph, so the same policy must
         # be reported while another graph is the default one.
         other_graph = ops.Graph()
         with other_graph.as_default():
             self.assertEqual(mp_policy.global_policy().name, 'mixed_float16')

         mp_policy.set_policy('_infer')
         self.assertEqual(mp_policy.global_policy().name, '_infer')

         # A Policy instance is accepted too and must be returned as-is.
         bfloat16_policy = mp_policy.Policy('mixed_bfloat16')
         mp_policy.set_policy(bfloat16_policy)
         self.assertIs(mp_policy.global_policy(), bfloat16_policy)
     finally:
         # Always clear the policy set above, even if an assertion failed.
         # NOTE(review): presumably set_policy(None) restores the default
         # policy -- confirm against mp_policy.
         mp_policy.set_policy(None)
예제 #3
0
    def _test_mixed_precision(self, task_type, task_id, num_gpus):
        """Tests mixed precision works with the CollectiveAllReduceStrategy.

        This tests:
          1. Variables are in float32, by running with a small enough learning
             rate that if the variables are float16, their values wouldn't
             change when gradients are applied.
          2. The loss scale is doubled if there are no NaNs.
          3. The loss scale is halved if the first worker has a NaN, even if
             the other workers do not have NaNs.

        Args:
          task_type: A string, such as "worker", indicating the type of the
            replica.
          task_id: Zero-indexed ID of the task.
          num_gpus: The number of GPUs to use.
        """
        strategy, master_target, config = self._get_test_object(
            task_type, task_id, num_gpus)
        # The caller is expected to have set the mixed_float16 policy.
        self.assertEqual(policy.global_policy().name, 'mixed_float16')

        # "First worker" means worker 0, or a run where neither a task type
        # nor a task id was given.
        is_first_worker = (
            (task_type == 'worker' and task_id == 0)
            or (task_type is None and task_id is None))

        graph = ops.Graph()
        with graph.as_default(), self.cached_session(
                config=config, target=master_target) as sess:
            # Factor applied to the first worker's loss only. Setting it to
            # NaN later gives the first worker NaN loss and gradients while
            # the other workers' losses and gradients stay finite.
            first_worker_loss_factor = variables.Variable(
                1., dtype='float16', trainable=False)
            with strategy.scope():
                multiply_layer = mp_test_util.MultiplyLayer(
                    assert_type=dtypes.float16, input_shape=(1, ))
                model = sequential.Sequential([multiply_layer])
                loss_scale = loss_scale_module.DynamicLossScale(
                    2**10, increment_period=1)

                def model_fn():
                    """Builds the per-replica training op."""
                    inputs = np.ones((1, 1))
                    loss = model(inputs, training=True)
                    if is_first_worker:
                        loss *= first_worker_loss_factor

                    # The learning rate is so small that applying it to a
                    # float16 variable would leave the variable unchanged.
                    # A changed variable therefore shows that the update was
                    # applied to the float32 variable instead.
                    base_optimizer = gradient_descent.GradientDescentOptimizer(
                        2**-14)
                    scaled_optimizer = (
                        loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
                            base_optimizer, loss_scale))
                    global_step = training_util.get_or_create_global_step()
                    return scaled_optimizer.minimize(loss, global_step)

                per_replica_op = strategy.extended.call_for_each_replica(
                    model_fn)
                local_ops = strategy.experimental_local_results(per_replica_op)
                train_op = strategy.group(local_ops)

            sess.run(variables.global_variables_initializer())
            sess.run(train_op)

            (weight, ) = model.trainable_weights
            # The variable starts at 1. Each worker's gradient is 2 ** -14
            # (the learning rate), and every worker's gradient is subtracted
            # from the variable.
            expected = 1 - strategy.num_replicas_in_sync * 2**-14
            self.assertEqual(sess.run(weight), expected)
            # The loss scale should double, as all gradients are finite.
            self.assertEqual(sess.run(loss_scale()), 2**11)

            # Give the first worker NaN loss and gradients.
            nan_assign = first_worker_loss_factor.assign(float('NaN'))
            sess.run(nan_assign)
            sess.run(train_op)
            # The variable should not change, since the first worker had NaN.
            self.assertEqual(sess.run(weight), expected)
            # The loss scale should halve due to the NaN.
            self.assertEqual(sess.run(loss_scale()), 2**10)
예제 #4
0
def compare_results(results_with_ds,
                    results_without_ds,
                    distribution,
                    testcase,
                    partial_last_batch=None):
    """Compares results of model compiled with/without distribution strategy.

    Every key of `results_with_ds` is looked up in both mappings and the two
    values are compared with `testcase.assertAllClose`. The input mappings are
    not modified: values that need adjusting before the comparison are copied
    first, so the function can safely be called more than once on the same
    results.

    Args:
      results_with_ds: Mapping from result name to the value obtained with a
        distribution strategy.
      results_without_ds: Mapping with the same keys holding the values
        obtained without a distribution strategy.
      distribution: The distribution strategy that was used.
      testcase: Test case object providing `assertAllClose`.
      partial_last_batch: None, or a string describing the partial last batch
        mode. Any non-None value excludes loss values from the comparison;
        'train_and_eval' additionally relaxes the tolerance.
    """
    if policy.global_policy().compute_dtype in ('float16', 'bfloat16'):
        default_tolerance = 1e-2
        relaxed_tolerance = 1e-2
    elif partial_last_batch == 'train_and_eval':
        # We relax the tolerance a lot in the partial last batch case as
        #   1. the examples in uneven batches may have different weights when
        #      applying the gradients in the distributed case.
        #   2. TF Keras and TF Keras DS have different ways to handle the case
        #      when training with epochs > 1 with numpy inputs. In TF Keras,
        #      every epoch may have a partial batch. While in TF Keras DS, as
        #      we convert numpy inputs into dataset, it will do a repeat()
        #      first and calculate steps_per_epoch, so it will at most have
        #      one partial batch. This makes the 1-CPU result even different.
        default_tolerance = 1e-3
        relaxed_tolerance = 1e-3
    else:
        default_tolerance = 1e-5
        relaxed_tolerance = 1e-4

    # See b/119257215 for more details. DS tests run on GPU could have larger
    # variance than tests on CPU. GPU availability does not depend on the
    # key, so it is queried once instead of once per key.
    gpu_available = test_util.is_gpu_available()
    relaxed_key_prefixes = ('weights_1', 'weights_2', 'predict_result')

    for key in sorted(results_with_ds):
        if (key.startswith('training_history')
                and isinstance(distribution, (tpu_strategy.TPUStrategy,
                                              tpu_strategy.TPUStrategyV1))
                and distribution.extended.steps_per_run > 1):
            # TODO(b/119894254): Enable this test for all cases once the
            # underlying bug is fixed.
            continue

        if gpu_available and key.startswith(relaxed_key_prefixes):
            tolerance = relaxed_tolerance
        else:
            tolerance = default_tolerance

        with_ds = results_with_ds[key]
        without_ds = results_without_ds[key]

        # We don't compare the loss as loss is currently not computed as
        # metric in Keras, the loss value is inaccurate for last partial batch
        # due to more weights for the last batch samples. The adjustments are
        # made on local copies so the caller's results stay untouched.
        if partial_last_batch is not None:
            if key.startswith('eval_result'):
                # Drop the leading element (the loss, per the note above).
                with_ds = with_ds[1:]
                without_ds = without_ds[1:]
            if key.startswith('training_history'):
                # Neutralize 'val_loss' on both sides so it always matches.
                with_ds = dict(with_ds, val_loss=0)
                without_ds = dict(without_ds, val_loss=0)

        testcase.assertAllClose(with_ds,
                                without_ds,
                                atol=tolerance,
                                rtol=tolerance,
                                msg='Fail to assert {}.'.format(key))
def compare_results(results_with_ds,
                    results_without_ds,
                    distribution,
                    testcase,
                    partial_last_batch=None):
  """Compares results of model compiled with/without distribution strategy.

  Every key of `results_with_ds` is looked up in both mappings and the two
  values are compared with `testcase.assertAllClose`. The input mappings are
  not modified: values that need adjusting before the comparison are copied
  first, so the function can safely be called more than once on the same
  results.

  Args:
    results_with_ds: Mapping from result name to the value obtained with a
      distribution strategy.
    results_without_ds: Mapping with the same keys holding the values obtained
      without a distribution strategy.
    distribution: The distribution strategy that was used.
    testcase: Test case object providing `assertAllClose`.
    partial_last_batch: None, or a string describing the partial last batch
      mode. Any non-None value excludes loss values from the comparison;
      'train_and_eval' additionally relaxes the tolerance.
  """
  if policy.global_policy().compute_dtype in ('float16', 'bfloat16'):
    default_tolerance = 1e-2
    relaxed_tolerance = 1e-2
  elif partial_last_batch == 'train_and_eval':
    # We relax the tolerance a lot in the partial last batch case as
    #   1. the examples in uneven batches may have different weights when
    #      applying the gradients in the distributed case.
    #   2. TF Keras and TF Keras DS have different ways to handle the case when
    #      training with epochs > 1 with numpy inputs. In TF Keras, every epoch
    #      may have a partial batch. While in TF Keras DS, as we convert
    #      numpy inputs into dataset, it will do a repeat() first and calculate
    #      steps_per_epoch, so it will at most have one partial batch. This
    #      makes the 1-CPU result even different.
    default_tolerance = 1e-3
    relaxed_tolerance = 1e-3
  else:
    default_tolerance = 1e-5
    relaxed_tolerance = 1e-4

  # TODO(b/119257215): For MirroredStrategy, weights are not exactly the same,
  # so use larger tolerance for now. Predict should be related to weights.
  # Also for CentralStorageStrategy and OneDeviceStrategy which is observed in
  # b/172956754.
  # The strategy type does not depend on the key, so it is checked once here
  # instead of once per key.
  uses_relaxed_strategy = isinstance(
      distribution,
      (mirrored_strategy.MirroredStrategy,
       mirrored_strategy.MirroredStrategyV1,
       central_storage_strategy.CentralStorageStrategy,
       central_storage_strategy.CentralStorageStrategyV1,
       one_device_strategy.OneDeviceStrategy,
       one_device_strategy.OneDeviceStrategyV1,
       distribute_lib._DefaultDistributionStrategy))  # pylint: disable=protected-access
  relaxed_key_prefixes = ('weights_1', 'weights_2', 'predict_result')

  for key in sorted(results_with_ds):
    if (key.startswith('training_history') and
        isinstance(distribution,
                   (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)) and
        distribution.extended.steps_per_run > 1):
      # TODO(b/119894254): Enable this test for all cases once the
      # underlying bug is fixed.
      continue

    if uses_relaxed_strategy and key.startswith(relaxed_key_prefixes):
      tolerance = relaxed_tolerance
    else:
      tolerance = default_tolerance

    with_ds = results_with_ds[key]
    without_ds = results_without_ds[key]

    # We don't compare the loss as loss is currently not computed as metric
    # in Keras, the loss value is inaccurate for last partial batch due to
    # more weights for the last batch samples. The adjustments are made on
    # local copies so the caller's results stay untouched.
    if partial_last_batch is not None:
      if key.startswith('eval_result'):
        # Drop the leading element (the loss, per the note above).
        with_ds = with_ds[1:]
        without_ds = without_ds[1:]
      if key.startswith('training_history'):
        # Neutralize 'val_loss' on both sides so it always matches.
        with_ds = dict(with_ds, val_loss=0)
        without_ds = dict(without_ds, val_loss=0)

    testcase.assertAllClose(
        with_ds,
        without_ds,
        atol=tolerance,
        rtol=tolerance,
        msg='Fail to assert {}.'.format(key))