def testDynamicLossScaleWithSlots(self, strategy_fn):
  """Checks dynamic loss scaling with an optimizer that owns slot variables.

  Two training steps are run with a momentum optimizer wrapped in a
  MixedPrecisionLossScaleOptimizer. After each step both the variable value
  and the current loss scale are verified.

  Args:
    strategy_fn: Zero-argument callable returning the distribution strategy
      under whose scope the test body runs.
  """
  with strategy_fn().scope() as strategy:
    var = variables.Variable([1.0, 2.0])
    # An SGD optimizer with momentum has slot variables.
    inner_opt = momentum.MomentumOptimizer(1.0, momentum=1.)
    initial_loss_scale = 2.
    dynamic_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=initial_loss_scale,
        increment_period=1,
        multiplier=4)
    opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
        inner_opt, dynamic_scale)

    def loss():
      return var / strategy.num_replicas_in_sync

    def run_fn():
      return opt.minimize(loss, var_list=[var])

    first_step = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(first_step)
    # The momentum accumulator starts at 0 and the gradient is 1. The
    # accumulator is incremented by the gradient, so it is now 1. Then the
    # variable is subtracted by the accumulator, so the variable is subtracted
    # by 1.
    self.assertAllClose([0.0, 1.0], self.evaluate(var))
    scale_after_first = self.evaluate(opt._loss_scale())
    self.assertEqual(scale_after_first, initial_loss_scale * 4)

    second_step = strategy.experimental_run(run_fn)
    self._run_if_in_graph_mode(second_step)
    # The momentum accumulator was 1 before this step and the gradient is 1.
    # The accumulator is incremented by the gradient, so it is now 2. Then the
    # variable is subtracted by the accumulator, so the variable is subtracted
    # by 2.
    self.assertAllClose([-2., -1.], self.evaluate(var))
    scale_after_second = self.evaluate(opt._loss_scale())
    self.assertEqual(scale_after_second, initial_loss_scale * 16)
def test_optimizer_errors(self):
  """Checks that invalid optimizers are rejected by the graph rewrite.

  Three kinds of bad input are passed to
  `enable_mixed_precision_graph_rewrite`: a non-optimizer value, an optimizer
  already wrapped in a V1 MixedPrecisionLossScaleOptimizer, and an optimizer
  already wrapped in a V2 LossScaleOptimizer. Each must raise a ValueError
  with the expected message and must leave the 'auto_mixed_precision'
  optimizer option unset.
  """

  def assert_rewrite_not_enabled():
    # A rejected optimizer must not have turned on the graph rewrite.
    self.assertFalse(config.get_optimizer_experimental_options()
                     .get('auto_mixed_precision', False))

  # Case 1: something that is not an optimizer at all.
  opt = 1
  if tf2.enabled():
    expected_regex = ('"opt" must be an instance of a '
                      'tf.keras.optimizers.Optimizer, but got')
  else:
    expected_regex = ('"opt" must be an instance of a tf.train.Optimizer or '
                      'a tf.keras.optimizers.Optimizer, but got')
  # `assertRaisesRegex` is used instead of the deprecated `assertRaisesRegexp`
  # alias, which newer Python versions no longer provide.
  with self.assertRaisesRegex(ValueError, expected_regex):
    enable_mixed_precision_graph_rewrite(opt)
  assert_rewrite_not_enabled()

  # Case 2: a V1 optimizer that is already wrapped.
  opt = gradient_descent_v1.GradientDescentOptimizer(1.0)
  opt = loss_scale_optimizer_v1.MixedPrecisionLossScaleOptimizer(opt,
                                                                 'dynamic')
  with self.assertRaisesRegex(ValueError,
                              '"opt" must not already be an instance of a '
                              'MixedPrecisionLossScaleOptimizer.'):
    enable_mixed_precision_graph_rewrite(opt)
  assert_rewrite_not_enabled()

  # Case 3: a V2 (Keras) optimizer that is already wrapped.
  opt = gradient_descent_v2.SGD(1.0)
  opt = loss_scale_optimizer_v2.LossScaleOptimizer(opt, 'dynamic')
  with self.assertRaisesRegex(ValueError,
                              '"opt" must not already be an instance of a '
                              'LossScaleOptimizer.'):
    enable_mixed_precision_graph_rewrite(opt)
  assert_rewrite_not_enabled()
def testDynamicUpdate(self, strategy_fn):
  """Checks that the dynamic loss scale reacts to finite and NaN gradients.

  A step with finite gradients must update the variable and double the loss
  scale; a following step with NaN gradients must leave the variable
  untouched and halve the loss scale again.

  Args:
    strategy_fn: Zero-argument callable returning the distribution strategy
      under whose scope the test body runs.
  """
  with strategy_fn().scope() as strategy:
    var = variables.Variable([1.0, 2.0])
    sgd = gradient_descent.GradientDescentOptimizer(1.0)
    dynamic_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=2, increment_period=1, multiplier=2)
    opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
        sgd, dynamic_scale)

    # Test optimizer with finite gradients
    def finite_loss():
      return var * 2.0 / strategy.num_replicas_in_sync

    def finite_step():
      return opt.minimize(finite_loss, var_list=[var])

    run_op = strategy.experimental_run(finite_step)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # Gradient is 2, so variable will have 2 subtracted from it
    self.assertAllClose([-1.0, 0.0], self.evaluate(var))
    # Loss scale has doubled from 2 to 4
    self.assertEqual(4., self.evaluate(opt._loss_scale()))

    # Test optimizer with NaN gradients
    def nan_loss():
      return var * float('NaN')

    def nan_step():
      return opt.minimize(nan_loss, var_list=[var])

    run_op = strategy.experimental_run(nan_step)
    self._run_if_in_graph_mode(run_op)
    # Variable should not change from before, due to NaN gradients.
    self.assertAllClose(self.evaluate(var), [-1.0, 0.0])
    # Loss scale should halve due to NaN gradients.
    self.assertEqual(2., self.evaluate(opt._loss_scale()))
def testDynamicLossScale(self, strategy_fn):
  """Checks gradients and variable updates under a dynamic loss scale.

  The expected (scaled) gradient is held in a resource variable so that it can
  be reassigned between the first and the second training step, after the
  loss scale has doubled.

  Args:
    strategy_fn: Zero-argument callable returning the distribution strategy
      to test with.
  """
  strategy = strategy_fn()
  learning_rate = 2.
  num_replicas = strategy.num_replicas_in_sync
  expected_gradient = resource_variable_ops.ResourceVariable(
      learning_rate / num_replicas)
  with strategy.scope():
    var = variables.Variable([5.0])
    sgd = gradient_descent.GradientDescentOptimizer(learning_rate)
    loss_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=2, increment_period=1, multiplier=2)
    opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
        sgd, loss_scale)
    self.assertEqual(
        loss_scale.initial_loss_scale % strategy.num_replicas_in_sync, 0)

    run_fn = self._run_fn_with_grad_check(strategy, var, opt,
                                          expected_gradient)
    first_step = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(first_step)
    # The loss is the identity of the variable. Therefore the gradient is 1,
    # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
    self.assertAllClose([3.], self.evaluate(var))

    # Loss scale will be double, so the expected gradient is also doubled.
    doubled_gradient = 2 * learning_rate / strategy.num_replicas_in_sync
    self.evaluate(expected_gradient.assign(doubled_gradient))
    second_step = strategy.experimental_run(run_fn)
    self._run_if_in_graph_mode(second_step)
    # As before, the 2 is subtracted from the variable, making its new value
    # 1.
    self.assertAllClose([1.], self.evaluate(var))
def testFixedLossScaleAppliedToLossWithGetGradients(self):
  """Checks that a fixed loss scale is applied when gradients are requested.

  The loss is built through `create_identity_with_grad_check_fn`, so
  evaluating the result of `get_gradients` also runs the assertion op that the
  helper attached.
  """
  fixed_scale = 10.
  var = variables.Variable([2.0])
  sgd = gradient_descent.GradientDescentOptimizer(1.0)
  opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(sgd, fixed_scale)
  checked_identity = create_identity_with_grad_check_fn(fixed_scale)
  loss = checked_identity(var)
  grads_op = get_gradients(opt, loss, [var])
  self.evaluate(variables.global_variables_initializer())
  # This will cause an assertion to run, as
  # create_identity_with_grad_check_fn added an assertion op.
  self.evaluate(grads_op)
def _wrap_optimizer(opt, loss_scale, use_v1_behavior):
  """Wraps an optimizer with a LossScaleOptimizer.

  Args:
    opt: The optimizer to wrap. It is accepted when it is an instance of
      `optimizer.Optimizer`, or when a class named 'OptimizerV2' appears in
      the MRO of its class.
    loss_scale: Forwarded unchanged as the second constructor argument of the
      wrapping loss scale optimizer.
    use_v1_behavior: Only selects which error message is raised when `opt` is
      of an unsupported type.

  Returns:
    `opt` wrapped in a `MixedPrecisionLossScaleOptimizer` when it is an
    `optimizer.Optimizer`, or in a Keras `LossScaleOptimizer` when it is an
    OptimizerV2.

  Raises:
    ValueError: If `opt` is already wrapped in a loss scale optimizer, or if
      it is not one of the supported optimizer types.
  """
  if isinstance(opt, loss_scale_optimizer_v1.MixedPrecisionLossScaleOptimizer):
    raise ValueError('"opt" must not already be an instance of a '
                     'MixedPrecisionLossScaleOptimizer. '
                     '`enable_mixed_precision_graph_rewrite` will '
                     'automatically wrap the optimizer with a '
                     'MixedPrecisionLossScaleOptimizer.')
  # To avoid a circular dependency, we cannot depend on tf.keras. Because
  # LossScaleOptimizer is in Keras, we cannot use isinstance, so instead check
  # the class name.
  if opt.__class__.__name__ == 'LossScaleOptimizer':
    raise ValueError('"opt" must not already be an instance of a '
                     'LossScaleOptimizer. '
                     '`enable_mixed_precision_graph_rewrite` will '
                     'automatically wrap the optimizer with a '
                     'LossScaleOptimizer.')

  if isinstance(opt, optimizer.Optimizer):
    # For convenience, we allow the V2 version of this function to wrap the V1
    # optimizer, even though we do not document this.
    return loss_scale_optimizer_v1.MixedPrecisionLossScaleOptimizer(
        opt, loss_scale)

  # Because we cannot depend on tf.keras, we see if `opt` is an instance of the
  # Keras OptimizerV2 class by checking the subclass names.
  base_class_names = [cls.__name__ for cls in tf_inspect.getmro(opt.__class__)]
  is_optimizer_v2 = 'OptimizerV2' in base_class_names
  if is_optimizer_v2:
    # Because we cannot depend on tf.keras, we cannot unconditionally do this
    # import. But since `opt` is a Keras OptimizerV2, we know keras is
    # importable, so it is safe to do this import. (Technically, it's possible
    # to have a dependency on OptimizerV2 and not LossScaleOptimizer, but this
    # is not done in practice).
    from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as loss_scale_optimizer_v2  # pylint: disable=g-import-not-at-top
    return loss_scale_optimizer_v2.LossScaleOptimizer(opt, loss_scale)

  # `opt` is wrapped in a 1-tuple before formatting: a bare `% opt` would
  # raise TypeError (or print the wrong value) whenever `opt` is itself a
  # tuple, hiding the intended ValueError.
  if use_v1_behavior:
    raise ValueError(
        '"opt" must be an instance of a tf.train.Optimizer or a '
        'tf.keras.optimizers.Optimizer, but got: %s' % (opt,))
  else:
    raise ValueError('"opt" must be an instance of a '
                     'tf.keras.optimizers.Optimizer, but got: %s' % (opt,))
def model_fn():
  """Simple model to test mixed precision.

  Returns:
    The op returned by `minimize` on the loss-scale-wrapped optimizer.
  """
  inputs = np.ones((1, 1))
  loss = model(inputs, training=True)

  is_first_worker = task_type == 'worker' and task_id == 0
  has_no_task = task_type is None and task_id is None
  if is_first_worker or has_no_task:
    loss *= loss_multiplier_for_first_worker

  # Learning rate is small enough that if applied to a float16 variable,
  # the variable will not change. So this tests the learning rate is not
  # applied to a float16 value, but instead the float32 variable.
  sgd = gradient_descent.GradientDescentOptimizer(2 ** -14)
  wrapped_opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
      sgd, loss_scale)
  global_step = training_util.get_or_create_global_step()
  return wrapped_opt.minimize(loss, global_step)
def testFixedLossScaleAppliedToLossWithMinimize(self, strategy_fn):
  """Checks that a fixed loss scale is applied to the loss in `minimize`.

  Args:
    strategy_fn: Zero-argument callable returning the distribution strategy
      under whose scope the test body runs.
  """
  with strategy_fn().scope() as strategy:
    fixed_scale = 10.
    var = variables.Variable([5.0])
    sgd = gradient_descent.GradientDescentOptimizer(2.0)
    opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
        sgd, fixed_scale)
    # We need num_replicas_in_sync to divide loss_scale, otherwise loss_scale
    # / strategy.num_replicas_in_sync will not be exact, which could lead to
    # assertion failures due to rounding issues.
    self.assertEqual(fixed_scale % strategy.num_replicas_in_sync, 0)
    per_replica_scale = fixed_scale / strategy.num_replicas_in_sync
    run_fn = self._run_fn_with_grad_check(
        strategy, var, opt, per_replica_scale)
    train_step = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(train_step)
    # The loss is the identity of the variable. Therefore the gradient is 1,
    # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
    self.assertAllClose([3.], self.evaluate(var))
def testCheckpoint(self, strategy_fn):
  """Checks that the loss scale state round-trips through a checkpoint.

  One step is run and a checkpoint is saved; a second step changes the loss
  scale and the good-step counter; restoring the checkpoint must bring both
  back to the values they had when it was saved.

  Args:
    strategy_fn: Zero-argument callable returning the distribution strategy
      to test with.
  """
  strategy = strategy_fn()
  is_mirrored = isinstance(strategy, mirrored_strategy.MirroredStrategy)
  if is_mirrored and not context.executing_eagerly():
    # TODO(b/121381184): Enable running the test in this case.
    return

  with self.test_session(), strategy.scope():
    # Build and run a simple model.
    var = variables.Variable([2.0])
    loss_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=1., increment_period=2., multiplier=2.)
    inner_opt = momentum.MomentumOptimizer(1.0, momentum=1.)
    opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
        inner_opt, loss_scale)

    def run_fn():
      return opt.minimize(lambda: var + 1., var_list=[var])

    first_step = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self.evaluate(first_step)
    self.assertEqual(self.evaluate(loss_scale()), 1.)
    self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)

    # Save a checkpoint.
    checkpoint = trackable_utils.Checkpoint(optimizer=opt)
    ckpt_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
    save_path = checkpoint.save(ckpt_prefix)

    # Run model again.
    second_step = strategy.experimental_run(run_fn)
    self.evaluate(second_step)
    self.assertEqual(self.evaluate(loss_scale()), 2.)
    self.assertEqual(self.evaluate(loss_scale._num_good_steps), 0)

    # Load checkpoint and ensure loss scale is back to its original value.
    restore_status = checkpoint.restore(save_path)
    restore_status.assert_consumed()
    restore_status.run_restore_ops()
    self.assertEqual(self.evaluate(loss_scale()), 1.)
    self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
def testPassingNoneToLossScale(self):
  """Checks that `None` is rejected as the loss scale with a ValueError."""
  expected_message = r'loss_scale cannot be None'
  sgd = gradient_descent.GradientDescentOptimizer(1.0)
  with self.assertRaisesRegex(ValueError, expected_message):
    loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(sgd, None)