Example #1
0
  def test_fixed_loss_scaling(self, strategy_fn, cloning=True):
    # Note: We do not test mixed precision in this method, only loss scaling.
    loss_scale = 8.
    batch_size = 4
    with strategy_fn().scope():
      x = layers.Input(shape=(1,), batch_size=batch_size)
      layer = AddLayer()
      y = layer(x)

      # The gradient of 'y' at this point is 1. With loss scaling, the gradient
      # is 'loss_scale'. We divide by the batch size since the loss is averaged
      # across batch elements.
      expected_gradient = loss_scale / batch_size
      identity_with_grad_check_fn = (
          mp_test_util.create_identity_with_grad_check_fn([expected_gradient]))
      y = core.Lambda(identity_with_grad_check_fn)(y)
      model = models.Model(inputs=x, outputs=y)

      def loss_fn(y_true, y_pred):
        del y_true
        return math_ops.reduce_mean(y_pred)

      opt = gradient_descent.SGD(1.)
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
      model.compile(opt, loss=loss_fn, cloning=cloning)

    self.assertEqual(backend.eval(layer.v), 1)
    x = np.ones((batch_size, 1))
    y = np.ones((batch_size, 1))
    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size)
    model.fit(dataset)
    # The variable starts at 1 and should have a gradient of 1 subtracted
    # from it.
    expected = 0
    self.assertEqual(backend.eval(layer.v), expected)
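For reference, the expected-gradient arithmetic this test relies on can be reproduced outside the test harness. Below is a minimal sketch using only the public TensorFlow 2.x API (tf.GradientTape rather than the test's helpers; the names are illustrative): scaling the mean loss by loss_scale=8 over a batch of 4 gives a per-element gradient of 2 at the layer output, and an unscaled gradient of 1 on the weight, which is why one SGD step with learning rate 1 moves the weight from 1 to 0.

import tensorflow as tf

loss_scale = 8.0
batch_size = 4

v = tf.Variable(1.0)                 # stands in for AddLayer's weight
x = tf.ones((batch_size, 1))

with tf.GradientTape(persistent=True) as tape:
  y = x + v                          # AddLayer: identity plus a scalar weight
  tape.watch(y)
  loss = tf.reduce_mean(y)           # averaged across batch elements
  scaled_loss = loss * loss_scale    # what a loss-scale optimizer differentiates

# Per-element gradient at the layer output: loss_scale / batch_size = 2, the
# value `expected_gradient` is set to above.
print(tape.gradient(scaled_loss, y))  # a (4, 1) tensor of 2s

# After unscaling, the gradient on the weight is 1, so one SGD step with
# learning rate 1 takes it from 1 to 0, matching the final assertion.
print(float(tape.gradient(scaled_loss, v) / loss_scale))  # 1.0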
Example #2
0
  def testFixedLossScaleAppliedToLossWithGetGradients(self):
    var = variables.Variable([2.0])
    opt = gradient_descent.GradientDescentOptimizer(1.0)
    loss_scale = 10.
    opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(opt, loss_scale)
    grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(loss_scale)
    loss = grad_check_fn(var)
    run_op = get_gradients(opt, loss, [var])
    self.evaluate(variables.global_variables_initializer())
    # This will cause an assertion to run, as
    # mp_test_util.create_identity_with_grad_check_fn added an assertion op.
    self.evaluate(run_op)
Example #3
0
  def testFixedLossScaleAppliedToLossWithGetGradients(self):
    var = variables.Variable([2.0])
    opt = gradient_descent.SGD(1.0)
    loss_scale = 10.
    opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
    grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
        loss_scale)
    loss = grad_check_fn(var)
    run_op = opt.get_gradients(loss, [var])
    self.evaluate(variables.global_variables_initializer())
    # This will cause an assertion to run, as
    # mp_test_util.create_identity_with_grad_check_fn added an assertion op.
    self.evaluate(run_op)
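mp_test_util.create_identity_with_grad_check_fn is only used in these examples, never shown. Conceptually it returns an identity function whose backward pass asserts the value of the incoming gradient, which is roughly what the tf.custom_gradient sketch below does (an approximation for illustration, not the real helper).

import tensorflow as tf

def make_identity_with_grad_check(expected_gradient):
  """Identity whose backward pass asserts the incoming gradient value."""

  @tf.custom_gradient
  def identity_with_grad_check(x):
    def grad(dy):
      # The assertion runs whenever the gradient is evaluated, which is what
      # "This will cause an assertion to run" refers to in the tests above.
      assert_op = tf.debugging.assert_equal(
          dy, tf.cast(expected_gradient, dy.dtype))
      with tf.control_dependencies([assert_op]):
        return tf.identity(dy)
    return tf.identity(x), grad

  return identity_with_grad_check

# Usage mirroring the test: with a fixed loss scale of 10, the gradient that
# reaches this point should be exactly 10.
var = tf.Variable([2.0])
check_fn = make_identity_with_grad_check(10.0)
with tf.GradientTape() as tape:
  scaled_loss = check_fn(var) * 10.0  # stand-in for the loss scaled by 10
tape.gradient(scaled_loss, [var])     # evaluating the gradient runs the check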
Example #4
0
    def test_fixed_loss_scaling(self, strategy_fn, cloning=True):
        if testing_utils.should_run_distributed():
            self.skipTest('b/137397816')
        # Note: We do not test mixed precision in this method, only loss scaling.
        if not self._is_strategy_supported(strategy_fn):
            return
        loss_scale = 8.
        batch_size = 4
        with strategy_fn().scope():
            x = layers.Input(shape=(1, ), batch_size=batch_size)
            layer = AddLayer()
            y = layer(x)

            # The gradient of 'y' at this point is 1. With loss scaling, the gradient
            # is 'loss_scale'. We divide by the batch size since the loss is averaged
            # across batch elements.
            expected_gradient = loss_scale / batch_size
            identity_with_grad_check_fn = (
                mp_test_util.create_identity_with_grad_check_fn(
                    [expected_gradient]))
            y = core.Lambda(identity_with_grad_check_fn)(y)
            model = models.Model(inputs=x, outputs=y)

            def loss_fn(y_true, y_pred):
                del y_true
                return math_ops.reduce_mean(y_pred)

            opt = gradient_descent.SGD(1.)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            model.compile(
                opt,
                loss=loss_fn,
                cloning=cloning,
                run_eagerly=testing_utils.should_run_eagerly(),
                run_distributed=testing_utils.should_run_distributed())

        self.assertEqual(backend.eval(layer.v), 1)
        x = np.ones((batch_size, 1))
        y = np.ones((batch_size, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices(
            (x, y)).batch(batch_size)
        model.fit(dataset)
        # The variable starts at 1 and should have a gradient of 1 subtracted
        # from it.
        expected = 0
        self.assertEqual(backend.eval(layer.v), expected)
Example #5
0
  def _run_fn_with_grad_check(self, strategy, var, opt, expected_grad):
    grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
        expected_grad)
    loss = lambda: grad_check_fn(var) / strategy.num_replicas_in_sync
    return lambda: opt.minimize(loss, var_list=[var])
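The division by strategy.num_replicas_in_sync is there because per-replica gradients are summed when they are aggregated: each replica contributes expected_grad / num_replicas, so the aggregated gradient the grad-check helper sees is exactly expected_grad. The arithmetic, spelled out in plain Python (no strategy required; the numbers are illustrative):

num_replicas = 2      # e.g. a two-GPU MirroredStrategy
expected_grad = 4.0

per_replica_grad = expected_grad / num_replicas    # what each replica computes
aggregated_grad = per_replica_grad * num_replicas  # what the optimizer applies
assert aggregated_grad == expected_grad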
Example #6
0
    def test_dynamic_loss_scaling(self, strategy_fn, cloning=True):
        if testing_utils.should_run_distributed():
            self.skipTest('b/137397816')
        if not self._is_strategy_supported(strategy_fn):
            return
        strategy = strategy_fn()
        initial_loss_scale = 2.
        batch_size = 4
        expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                             dtype=dtypes.float16)
        # If this variable is set to True, the model below will have NaN gradients
        have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
        with strategy.scope():
            with policy.policy_scope(policy.Policy('infer_float32_vars')):
                x = layers.Input(shape=(1, ),
                                 batch_size=batch_size,
                                 dtype=dtypes.float16)
                layer = AddLayer(assert_type=dtypes.float16)
                y = layer(x)
                identity_with_nan_grads = (
                    mp_test_util.create_identity_with_nan_gradients_fn(
                        have_nan_gradients))
                y = core.Lambda(identity_with_nan_grads)(y)
                identity_with_grad_check_fn = (
                    mp_test_util.create_identity_with_grad_check_fn(
                        expected_dtype=dtypes.float16,
                        expected_gradient=expected_gradient))
                y = core.Lambda(identity_with_grad_check_fn)(y)
                y = math_ops.cast(y, dtypes.float32)
                model = models.Model(inputs=x, outputs=y)

                def loss_fn(y_true, y_pred):
                    del y_true
                    return math_ops.reduce_mean(y_pred)

                opt = gradient_descent.SGD(1.)
                loss_scale = loss_scale_module.DynamicLossScale(
                    initial_loss_scale=initial_loss_scale, increment_period=2)
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
                model.compile(
                    opt,
                    loss=loss_fn,
                    cloning=cloning,
                    run_eagerly=testing_utils.should_run_eagerly(),
                    run_distributed=testing_utils.should_run_distributed())

        self.assertEqual(backend.eval(layer.v), 1)
        x = np.ones((batch_size, 1))
        y = np.ones((batch_size, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices(
            (x, y)).batch(batch_size)
        model.fit(dataset)
        # The variable starts at 1 and has a gradient of 1, so it will go down
        # by 1 each step.
        self.assertEqual(backend.eval(layer.v), 0)

        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -1)

        # There have been two steps without NaNs, so the loss scale will double
        backend.set_value(expected_gradient,
                          backend.get_value(expected_gradient * 2))
        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -2)

        # Next test with NaN gradients.
        backend.set_value(have_nan_gradients, True)
        model.fit(dataset)
        # Variable should not be updated
        self.assertEqual(backend.eval(layer.v), -2)

        # Test with finite gradients again
        backend.set_value(have_nan_gradients, False)
        # The loss scale will be halved due to the NaNs, so the gradient will also
        # be halved
        backend.set_value(expected_gradient,
                          backend.get_value(expected_gradient / 2))
        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -3)
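The doubling and halving tracked by the assertions above come from the dynamic loss scale's update rule: after increment_period consecutive steps with finite gradients the scale is doubled, and on any step with non-finite gradients the scale is halved and the variable update is skipped. A pure-Python simplification of that rule, assuming increment_period=2 and a multiplier of 2 as in this test (not the real DynamicLossScale implementation):

class ToyDynamicLossScale:
  """Simplified model of dynamic loss scaling, for illustration only."""

  def __init__(self, initial_loss_scale=2., increment_period=2, multiplier=2.):
    self.loss_scale = initial_loss_scale
    self.increment_period = increment_period
    self.multiplier = multiplier
    self.good_steps = 0

  def update(self, grads_are_finite):
    """Returns True if the variable update should be applied this step."""
    if grads_are_finite:
      self.good_steps += 1
      if self.good_steps >= self.increment_period:
        self.loss_scale *= self.multiplier  # double after N finite steps
        self.good_steps = 0
      return True
    self.loss_scale /= self.multiplier      # halve on inf/NaN gradients
    self.good_steps = 0
    return False                            # skip the variable update

scale = ToyDynamicLossScale(initial_loss_scale=2.)
assert scale.update(True) and scale.loss_scale == 2.       # step 1
assert scale.update(True) and scale.loss_scale == 4.       # step 2: doubled
assert scale.update(True) and scale.loss_scale == 4.       # step 3
assert not scale.update(False) and scale.loss_scale == 2.  # NaN step: halved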
Example #7
0
    def test_advanced_model(self, strategy_fn, use_loss_scaling=False):
        if testing_utils.should_run_distributed():
            self.skipTest('b/137397816')
        # The advanced model tests mixed-precision-related features that would occur
        # in a resnet50 model. It tests a model that has:
        #  * Multiple layers, some of which use auto-cast variables and some of
        #    which do not.
        #  * Regularization on some variables and not others.
        #  * A fixed loss scale (if use_loss_scaling is True)

        if not self._is_strategy_supported(strategy_fn):
            return
        strategy = strategy_fn()
        if use_loss_scaling:
            loss_scale = 8.
        learning_rate = 2**-14

        with strategy.scope():
            with policy.policy_scope(policy.Policy('infer_float32_vars')):
                x = layers.Input(shape=(1, ),
                                 batch_size=2,
                                 dtype=dtypes.float16)
                layer1 = AddLayer(assert_type=dtypes.float16,
                                  regularizer=IdentityRegularizer(),
                                  use_operator=True)
                layer2 = AddLayerWithoutAutoCast(assert_type=dtypes.float16,
                                                 use_operator=True)
                layer3 = AddLayer(assert_type=dtypes.float16,
                                  use_operator=False)
                layer4 = AddLayerWithoutAutoCast(
                    assert_type=dtypes.float16,
                    regularizer=IdentityRegularizer(),
                    use_operator=False)
                y = layer1(x)
                y = layer2(y)
                y = layer3(y)
                y = layer4(y)
                if use_loss_scaling:
                    # The gradient of 'y' at this point is 1. With loss scaling, the
                    # gradient is 'loss_scale'. We divide by the batch size of 2 since the
                    # loss is averaged across batch elements.
                    expected_gradient = loss_scale / 2
                    identity_with_grad_check_fn = (
                        mp_test_util.create_identity_with_grad_check_fn(
                            expected_dtype=dtypes.float16,
                            expected_gradient=[expected_gradient]))
                    y = core.Lambda(identity_with_grad_check_fn)(y)
                y = math_ops.cast(y, dtypes.float32)
                model = models.Model(inputs=x, outputs=y)

                def loss_fn(y_true, y_pred):
                    self.assertEqual(y_true.dtype, dtypes.float32)
                    self.assertEqual(y_pred.dtype, dtypes.float32)
                    return math_ops.reduce_mean(y_pred)

                opt = gradient_descent.SGD(learning_rate)
                if use_loss_scaling:
                    opt = loss_scale_optimizer.LossScaleOptimizer(
                        opt, loss_scale)
                model.compile(
                    opt,
                    loss=loss_fn,
                    run_eagerly=testing_utils.should_run_eagerly(),
                    run_distributed=testing_utils.should_run_distributed())

        x = np.ones((2, 1))
        y = np.ones((2, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2)
        model.fit(dataset)
        for layer in (layer1, layer2, layer3, layer4):
            if layer.losses:
                # Layer has weight regularizer
                self.assertEqual(backend.eval(layer.v), 1 - 2 * learning_rate)
            else:
                # Layer does not have weight regularizer
                self.assertEqual(backend.eval(layer.v), 1 - learning_rate)
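The two expected values in the final loop follow from a single SGD step: the data loss gives every layer weight a gradient of 1, and IdentityRegularizer (assumed here to add the raw weight value to the loss, as its name suggests) contributes another gradient of 1 on the regularized weights. A standalone sketch of the same arithmetic using the public tf.keras API (illustrative names, not the test's layers):

import tensorflow as tf

learning_rate = 2 ** -14
x = tf.ones((2, 1))

v_regularized = tf.Variable(1.0)  # weight with identity regularization
v_plain = tf.Variable(1.0)        # unregularized weight

with tf.GradientTape() as tape:
  y = x + v_regularized + v_plain
  loss = tf.reduce_mean(y) + v_regularized  # identity regularization term

grads = tape.gradient(loss, [v_regularized, v_plain])  # [2.0, 1.0]

opt = tf.keras.optimizers.SGD(learning_rate)
opt.apply_gradients(zip(grads, [v_regularized, v_plain]))
print(float(v_regularized))  # 1 - 2 * learning_rate
print(float(v_plain))        # 1 - learning_rate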
Example #8
0
  def _run_fn_with_grad_check(self, strategy, var, opt, expected_grad):
    grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
        expected_grad)
    loss = lambda: grad_check_fn(var) / strategy.num_replicas_in_sync
    return lambda: opt.minimize(loss, var_list=[var])
Example #9
0
    def test_dynamic_loss_scaling(self,
                                  strategy_fn,
                                  pass_loss_scale_to_policy=False,
                                  get_config=False):
        strategy = strategy_fn()
        initial_loss_scale = 2.
        batch_size = 4
        loss_scale = loss_scale_module.DynamicLossScale(
            initial_loss_scale=initial_loss_scale, increment_period=2)
        expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                             dtype=dtypes.float16)
        # If this variable is set to True, the model below will have NaN gradients
        have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
        with strategy.scope():
            opt = gradient_descent.SGD(1.)
            if pass_loss_scale_to_policy:
                p = policy.Policy('mixed_float16', loss_scale=loss_scale)
            else:
                p = policy.Policy('mixed_float16', loss_scale=None)
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            with policy.policy_scope(p):
                x = layers.Input(shape=(1, ),
                                 batch_size=batch_size,
                                 dtype=dtypes.float16)
                layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16)
                y = layer(x)
                identity_with_nan_grads = (
                    mp_test_util.create_identity_with_nan_gradients_fn(
                        have_nan_gradients))
                y = core.Lambda(identity_with_nan_grads)(y)
                identity_with_grad_check_fn = (
                    mp_test_util.create_identity_with_grad_check_fn(
                        expected_dtype=dtypes.float16,
                        expected_gradient=expected_gradient))
                y = core.Lambda(identity_with_grad_check_fn)(y)
                model = models.Model(inputs=x, outputs=y)
                if get_config:
                    config = model.get_config()
                    model = model.__class__.from_config(
                        config,
                        custom_objects={
                            'MultiplyLayer': mp_test_util.MultiplyLayer
                        })
                    (layer, ) = (
                        layer for layer in model.layers
                        if isinstance(layer, mp_test_util.MultiplyLayer))

                def loss_fn(y_true, y_pred):
                    del y_true
                    return math_ops.reduce_mean(y_pred)

                model.compile(opt,
                              loss=loss_fn,
                              run_eagerly=testing_utils.should_run_eagerly())

        self.assertEqual(backend.eval(layer.v), 1)
        x = np.ones((batch_size, 1))
        y = np.ones((batch_size, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices(
            (x, y)).batch(batch_size)
        model.fit(dataset)
        # The variable starts at 1 and has a gradient of 1, so it will go down
        # by 1 each step.
        self.assertEqual(backend.eval(layer.v), 0)

        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -1)

        # There have been two steps without NaNs, so the loss scale will double
        backend.set_value(expected_gradient,
                          backend.get_value(expected_gradient * 2))
        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -2)

        # Next test with NaN gradients.
        backend.set_value(have_nan_gradients, True)
        model.fit(dataset)
        # Variable should not be updated
        self.assertEqual(backend.eval(layer.v), -2)

        # Test with finite gradients again
        backend.set_value(have_nan_gradients, False)
        # The loss scale will be halved due to the NaNs, so the gradient will also
        # be halved
        backend.set_value(expected_gradient,
                          backend.get_value(expected_gradient / 2))
        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -3)
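The get_config branch round-trips the model through its config, which creates brand-new layer objects; that is why the test re-binds layer by searching model.layers with isinstance afterwards. The same pattern with the public tf.keras API and a hypothetical stand-in for MultiplyLayer:

import tensorflow as tf

class Scale(tf.keras.layers.Layer):
  """Toy custom layer standing in for mp_test_util.MultiplyLayer."""

  def build(self, input_shape):
    self.v = self.add_weight(name='v', shape=(), initializer='ones')

  def call(self, inputs):
    return inputs * self.v

inputs = tf.keras.Input(shape=(1,))
outputs = Scale()(inputs)
model = tf.keras.Model(inputs, outputs)

# Serialize and rebuild; custom layers must be passed via custom_objects.
config = model.get_config()
rebuilt = tf.keras.Model.from_config(config, custom_objects={'Scale': Scale})

# The rebuilt model contains a *new* Scale instance, so re-find it the same
# way the test does.
(layer,) = [l for l in rebuilt.layers if isinstance(l, Scale)]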
Example #10
0
  def test_dynamic_loss_scaling(self, strategy_fn, cloning=True):
    strategy = strategy_fn()
    initial_loss_scale = 2.
    batch_size = 4
    expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                         dtype=dtypes.float16)
    # If this variable is set to True, the model below will have NaN gradients
    have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
    with strategy.scope():
      with policy.policy_scope(policy.Policy('infer_float32_vars')):
        x = layers.Input(shape=(1,), batch_size=batch_size,
                         dtype=dtypes.float16)
        layer = AddLayer(assert_type=dtypes.float16)
        y = layer(x)
        identity_with_nan_grads = (
            mp_test_util.create_identity_with_nan_gradients_fn(
                have_nan_gradients))
        y = core.Lambda(identity_with_nan_grads)(y)
        identity_with_grad_check_fn = (
            mp_test_util.create_identity_with_grad_check_fn(
                expected_dtype=dtypes.float16,
                expected_gradient=expected_gradient))
        y = core.Lambda(identity_with_grad_check_fn)(y)
        y = math_ops.cast(y, dtypes.float32)
        model = models.Model(inputs=x, outputs=y)

        def loss_fn(y_true, y_pred):
          del y_true
          return math_ops.reduce_mean(y_pred)

        opt = gradient_descent.SGD(1.)
        loss_scale = loss_scale_module.DynamicLossScale(
            initial_loss_scale=initial_loss_scale, increment_period=2)
        opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
        model.compile(opt, loss=loss_fn, cloning=cloning)

    self.assertEqual(backend.eval(layer.v), 1)
    x = np.ones((batch_size, 1))
    y = np.ones((batch_size, 1))
    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size)
    model.fit(dataset)
    # The variable starts at 1 and has a gradient of 1, so it will go down by 1
    # each step.
    self.assertEqual(backend.eval(layer.v), 0)

    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -1)

    # There have been two steps without NaNs, so the loss scale will double
    backend.set_value(expected_gradient,
                      backend.get_value(expected_gradient * 2))
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -2)

    # Next test with NaN gradients.
    backend.set_value(have_nan_gradients, True)
    model.fit(dataset)
    # Variable should not be updated
    self.assertEqual(backend.eval(layer.v), -2)

    # Test with finite gradients again
    backend.set_value(have_nan_gradients, False)
    # The loss scale will be halved due to the NaNs, so the gradient will also
    # be halved
    backend.set_value(expected_gradient,
                      backend.get_value(expected_gradient / 2))
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -3)
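create_identity_with_nan_gradients_fn is likewise only used, never shown: conceptually it is an identity whose backward pass emits NaNs while a boolean flag variable is True, which is what lets these tests toggle loss-scale halving on and off between fit() calls. A rough sketch of the idea (illustration only, not the real helper):

import tensorflow as tf

def make_identity_with_nan_gradients(have_nan_gradients):
  """`have_nan_gradients` is a boolean tf.Variable toggled by the caller."""

  @tf.custom_gradient
  def identity_with_nan_gradients(x):
    def grad(dy):
      return tf.cond(
          have_nan_gradients,
          lambda: dy * float('nan'),  # poison the gradients
          lambda: dy)                 # pass them through unchanged
    return tf.identity(x), grad

  return identity_with_nan_gradients

have_nan_gradients = tf.Variable(False)
fn = make_identity_with_nan_gradients(have_nan_gradients)

x = tf.Variable([1.0])
with tf.GradientTape() as tape:
  y = fn(x)
print(tape.gradient(y, x))  # [1.]

have_nan_gradients.assign(True)
with tf.GradientTape() as tape:
  y = fn(x)
print(tape.gradient(y, x))  # [nan]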
Example #11
0
  def test_advanced_model(self, strategy_fn, use_loss_scaling=False):

    # The advanced model tests mixed-precision-related features that would occur
    # in a resnet50 model. It tests a model that has:
    #  * Multiple layers, some of which use auto-cast variables and some of
    #    which do not.
    #  * Regularization on some variables and not others.
    #  * A fixed loss scale (if use_loss_scaling is True)

    strategy = strategy_fn()
    if use_loss_scaling:
      loss_scale = 8.
    learning_rate = 2 ** -14

    with strategy.scope():
      with policy.policy_scope(policy.Policy('infer_float32_vars')):
        x = layers.Input(shape=(1,), batch_size=2, dtype=dtypes.float16)
        layer1 = AddLayer(assert_type=dtypes.float16,
                          regularizer=IdentityRegularizer(), use_operator=True)
        layer2 = AddLayerWithoutAutoCast(assert_type=dtypes.float16,
                                         use_operator=True)
        layer3 = AddLayer(assert_type=dtypes.float16, use_operator=False)
        layer4 = AddLayerWithoutAutoCast(assert_type=dtypes.float16,
                                         regularizer=IdentityRegularizer(),
                                         use_operator=False)
        y = layer1(x)
        y = layer2(y)
        y = layer3(y)
        y = layer4(y)
        if use_loss_scaling:
          # The gradient of 'y' at this point is 1. With loss scaling, the
          # gradient is 'loss_scale'. We divide by the batch size of 2 since the
          # loss is averaged across batch elements.
          expected_gradient = loss_scale / 2
          identity_with_grad_check_fn = (
              mp_test_util.create_identity_with_grad_check_fn(
                  expected_dtype=dtypes.float16,
                  expected_gradient=[expected_gradient]))
          y = core.Lambda(identity_with_grad_check_fn)(y)
        y = math_ops.cast(y, dtypes.float32)
        model = models.Model(inputs=x, outputs=y)

        def loss_fn(y_true, y_pred):
          self.assertEqual(y_true.dtype, dtypes.float32)
          self.assertEqual(y_pred.dtype, dtypes.float32)
          return math_ops.reduce_mean(y_pred)

        opt = gradient_descent.SGD(learning_rate)
        if use_loss_scaling:
          opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
        model.compile(opt, loss=loss_fn)

    x = np.ones((2, 1))
    y = np.ones((2, 1))
    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2)
    model.fit(dataset)
    for layer in (layer1, layer2, layer3, layer4):
      if layer.losses:
        # Layer has weight regularizer
        self.assertEqual(backend.eval(layer.v), 1 - 2 * learning_rate)
      else:
        # Layer does not have weight regularizer
        self.assertEqual(backend.eval(layer.v), 1 - learning_rate)