def testInvalidArgsWithFixedLossScale(self):
    """Tests that invalid argument combinations with dynamic=False raise."""
    opt = gradient_descent.SGD()
    # With a fixed (non-dynamic) loss scale, initial_scale is mandatory.
    with self.assertRaisesRegex(
            ValueError,
            '"initial_scale" must be specified if "dynamic" is False'):
        loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False)
    # dynamic_growth_steps only applies to dynamic loss scaling.
    with self.assertRaisesRegex(
            ValueError,
            '"dynamic_growth_steps" must be None if "dynamic" is '
            'False, but got: 2'):
        loss_scale_optimizer.LossScaleOptimizer(
            opt, dynamic=False, initial_scale=1, dynamic_growth_steps=2)
def testDynamicMustBeBool(self):
    """Passing a non-bool `dynamic` argument raises a TypeError."""
    inner = gradient_descent.SGD()
    expected_message = (
        '"dynamic" argument to LossScaleOptimizer.__init__ must be '
        "a bool, but got: 'dynamic'")
    with self.assertRaisesRegex(TypeError, expected_message):
        loss_scale_optimizer.LossScaleOptimizer(inner, 'dynamic')
def testSerializationWithBuiltInOptimizer(self, use_v1):
    """Tests serializing/deserializing a LossScaleOptimizer wrapping SGD.

    Args:
        use_v1: If True, build a LossScaleOptimizerV1 from a DynamicLossScale;
            otherwise build a V2 LossScaleOptimizer directly.
    """
    opt = gradient_descent.SGD(2., momentum=0.5)
    if use_v1:
        loss_scale = tf.mixed_precision.experimental.DynamicLossScale(
            initial_loss_scale=2., increment_period=3.)
        opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
    else:
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=2., dynamic_growth_steps=3.)
    config = optimizers.serialize(opt)
    opt = optimizers.deserialize(config)
    # Force hyperparameters to be created
    opt.lr  # pylint: disable=pointless-statement
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Hyperparameters and loss scale state must survive the round trip.
    self.assertEqual(self.evaluate(opt.lr), 2.)
    self.assertEqual(self.evaluate(opt.inner_optimizer.momentum), 0.5)
    self.assertEqual(self.evaluate(opt.loss_scale), 2.)
    self.assertEqual(opt.dynamic_growth_steps, 3.)
    # Bug fix: the original call passed a stray `4.` as assertTrue's `msg`
    # argument, which had no effect on the assertion.
    self.assertTrue(opt.dynamic)
    # Deserializing a LossScaleOptimizer always results in a V2
    # LossScaleOptimizer, even if serialized with a LossScaleOptimizerV1.
    self.assertAllEqual(type(opt), loss_scale_optimizer.LossScaleOptimizer)

    # Ensure the optimizer can be used
    var = tf.Variable([5.0])
    run_op = self._run_fn_with_grad_check(
        tf.distribute.get_strategy(), var, opt, 2)()
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    self.assertEqual(self.evaluate(var), [3.])
    self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
def testUnsupportedStrategy(self):
    """Loss scaling raises a clear error under CentralStorageStrategy."""
    strategy = tf.distribute.experimental.CentralStorageStrategy()
    expected_error = (
        'Loss scaling is not supported with the tf.distribute.Strategy: '
        'CentralStorageStrategy. Try using a different Strategy, e.g. a '
        'MirroredStrategy')
    # Constructing the optimizer inside the scope raises immediately.
    with strategy.scope(), self.assertRaisesRegex(ValueError, expected_error):
        loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
    # An optimizer created outside the scope raises when run inside it.
    opt = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
    with strategy.scope():
        var = tf.Variable(1.0)
        loss = lambda: var * 2.0
        run_fn = lambda: opt.minimize(loss, [var])
        with self.assertRaisesRegex(ValueError, expected_error):
            strategy.experimental_run(run_fn)
def testDynamicLossScaleWithSlots(self, strategy_fn):
    """Tests dynamic loss scaling with an optimizer that has slot variables."""
    strategy_obj = strategy_fn()
    if (isinstance(strategy_obj, tf.distribute.MirroredStrategy) and
            tf.compat.v1.control_flow_v2_enabled() and
            not tf.executing_eagerly()):
        self.skipTest('b/138667997')
    with strategy_obj.scope() as strategy:
        var = tf.Variable([1.0, 2.0])
        # An SGD optimizer with momentum has slot variables.
        opt = gradient_descent.SGD(1.0, momentum=1.)
        initial_scale = 2.
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=initial_scale, dynamic_growth_steps=1)
        loss = lambda: var / strategy.num_replicas_in_sync
        run_fn = lambda: opt.minimize(loss, var_list=[var])
        run_op = strategy.experimental_run(run_fn)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)
        # The momentum accumulator starts at 0 and the gradient is 1. The
        # accumulator is incremented by the gradient, so it is now 1. Then the
        # variable is subtracted by the accumulator, so the variable is
        # subtracted by 1.
        self.assertAllClose([0.0, 1.0], self.evaluate(var))
        # dynamic_growth_steps=1, so the scale doubles after one good step.
        self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 2)

        run_op = strategy.experimental_run(run_fn)
        self._run_if_in_graph_mode(run_op)
        # The momentum accumulator was 1 before this step and the gradient is
        # 1. The accumulator is incremented by the gradient, so it is now 2.
        # Then the variable is subtracted by the accumulator, so the variable
        # is subtracted by 2.
        self.assertAllClose([-2., -1.], self.evaluate(var))
        self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 4)

        # The inner optimizer's slot names are exposed on the wrapper.
        self.assertEqual(opt.get_slot_names(), ['momentum'])
def testApplyGradientsGetsUnwrappedTensors(self):
    # Tests that gradients passed to apply_gradients are not wrapped in a
    # DistributionStrategy wrapper, such as PerReplica, but instead are raw
    # Tensors. Optimizer subclasses that override apply_gradients() expect raw
    # Tensors, even though the base Optimizer can handle PerReplica gradients.

    outer_self = self

    class MyOptimizer(gradient_descent.SGD):
        """SGD subclass that asserts gradients arrive as raw Tensors."""

        def apply_gradients(self,
                            grads_and_vars,
                            name=None,
                            experimental_aggregate_gradients=True):
            # Each gradient must already be unwrapped at this point.
            for grad, _ in grads_and_vars:
                outer_self.assertIsInstance(grad, tf.Tensor)
            return super(MyOptimizer, self).apply_gradients(
                grads_and_vars, name, experimental_aggregate_gradients)

    with create_mirrored_strategy().scope() as strategy:
        var = tf.Variable([5.0])
        opt = MyOptimizer(learning_rate=1.0)
        opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                      initial_scale=1)
        loss = lambda: var * 2.0
        run_fn = lambda: opt.minimize(loss, [var])
        # Running the step triggers the assertions in apply_gradients.
        strategy.experimental_run(run_fn)
def testDynamicLossScaleDefaultValues(self):
    """Default dynamic loss scale is 2**15 with 2000 growth steps."""
    wrapped = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
    self.assertEqual(wrapped.initial_scale, 2 ** 15)
    self.assertEqual(wrapped.dynamic_growth_steps, 2000)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    # The live loss scale variable starts at the initial scale.
    self.assertEqual(self.evaluate(wrapped.loss_scale), 2 ** 15)
def testNanOnOneReplicaOnly(self):
    """A NaN gradient on a single replica must skip the variable update."""
    if not tf.test.is_gpu_available():
        self.skipTest('Test requires GPU')
    if (not tf.executing_eagerly() and
            not tf.compat.v1.control_flow_v2_enabled()):
        self.skipTest('b/181283011: GradientTape does not work properly with '
                      'V1 control flow, and opt.minimize uses GradientTape')
    with create_mirrored_strategy().scope() as strategy:
        var = tf.Variable([1.0, 2.0])
        opt = gradient_descent.SGD(1.0)
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=2, dynamic_growth_steps=2)

        def loss():
            rep_id = (tf.distribute.get_replica_context().
                      replica_id_in_sync_group)
            # The last element of last replica's gradient is NaN.
            return tf.compat.v1.cond(
                tf.constant(rep_id == 0), lambda: var * 2.,
                lambda: var * tf.constant([1., float('NaN')]))

        run_fn = lambda: opt.minimize(loss, var_list=[var])
        run_op = strategy.experimental_run(run_fn)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)
        # Variable should not change from before, due to NaN gradients.
        self.assertAllClose(self.evaluate(var), [1.0, 2.0])
        # Loss scale should half due to NaN gradients.
        self.assertEqual(1., self.evaluate(opt.loss_scale))
def testDynamicUpdate(self, strategy_fn):
    """Loss scale doubles on finite gradients and halves on NaN gradients."""
    with strategy_fn().scope() as strategy:
        var = tf.Variable([1.0, 2.0])
        opt = gradient_descent.SGD(1.0)
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=2, dynamic_growth_steps=1)

        # Test optimizer with finite gradients
        loss = lambda: var * 2.0 / strategy.num_replicas_in_sync
        run_fn = lambda: opt.minimize(loss, var_list=[var])
        run_op = strategy.experimental_run(run_fn)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)
        # Gradient is 2, so variable will have 2 subtracted from it
        self.assertAllClose([-1.0, 0.0], self.evaluate(var))
        # Loss scale has doubled from 2 to 4
        self.assertEqual(4., self.evaluate(opt.loss_scale))

        # Test optimizer with NaN gradients
        loss = lambda: var * float('NaN')
        run_fn = lambda: opt.minimize(loss, var_list=[var])
        run_op = strategy.experimental_run(run_fn)
        self._run_if_in_graph_mode(run_op)
        # Variable should not change from before, due to NaN gradients.
        self.assertAllClose(self.evaluate(var), [-1.0, 0.0])
        # Loss scale should half due to NaN gradients.
        self.assertEqual(2., self.evaluate(opt.loss_scale))
def testDynamicLossScale(self, strategy_fn):
    """End-to-end dynamic loss scaling: scale doubles after a good step."""
    strategy = strategy_fn()
    learning_rate = 2.
    expected_gradient = tf.Variable(learning_rate /
                                    strategy.num_replicas_in_sync)
    with strategy.scope():
        var = tf.Variable([5.0])
        opt = gradient_descent.SGD(learning_rate)
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=2, dynamic_growth_steps=1)
        # These attributes are plain Python scalars, not tensors.
        self.assertEqual(opt.initial_scale, 2.)
        self.assertIsInstance(opt.initial_scale, float)
        self.assertEqual(opt.dynamic_growth_steps, 1)
        self.assertIsInstance(opt.dynamic_growth_steps, int)

        self.assertEqual(opt.initial_scale % strategy.num_replicas_in_sync, 0)
        run_fn = self._run_fn_with_grad_check(strategy, var, opt,
                                              expected_gradient)
        run_op = strategy.experimental_run(run_fn)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)
        # The loss is the identity of the variable. Therefore the gradient is
        # 1, and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
        self.assertAllClose([3.], self.evaluate(var))

        # Loss scale will be double, so the expected gradient is also doubled.
        self.evaluate(
            expected_gradient.assign(2 * learning_rate /
                                     strategy.num_replicas_in_sync))
        run_op = strategy.experimental_run(run_fn)
        self._run_if_in_graph_mode(run_op)
        # As before, the 2 is subtracted from the variable, making it's new
        # value 1.
        self.assertAllClose([1.], self.evaluate(var))
def test_save_slot_variables_with_autocast_vars(self,
                                                strategy_fn,
                                                var_name='v'):
    """Slot variables of AutoCastVariables survive a save/load round trip."""
    p = policy.Policy('mixed_float16')
    with strategy_fn().scope(), policy.policy_scope(p):
        x = layers.Input(shape=(2,), batch_size=2)
        # Having a var_name other than 'v' tests that a fixed bug
        # (b/134713714) does not reoccur. The bug was that a crash would occur
        # when saving a checkpoint where an AutoCastVariable with a slot
        # variable would have a different name than the layer attribute's
        # name (layer.v in this case).
        layer = mp_test_util.MultiplyLayer(assert_type=tf.float16,
                                           var_name=var_name)
        y = layer(x)
        model = models.Model(inputs=x, outputs=y)
        # momentum=1. gives SGD a 'momentum' slot variable per weight.
        opt = gradient_descent.SGD(1., 1.)
        opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                      initial_scale=1)
        model.compile(
            optimizer=opt,
            loss='mse',
            run_eagerly=testing_utils.should_run_eagerly())

    model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
    weights_file = os.path.join(self.get_temp_dir(), 'weights')
    model.save_weights(weights_file)
    saved_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))

    # Another training step changes the slot value.
    model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
    new_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))
    self.assertNotEqual(new_slot, saved_slot)

    # Restoring the weights restores the saved slot value.
    model.load_weights(weights_file)
    restored_slot = backend.get_value(opt.get_slot(layer.v, 'momentum'))
    self.assertEqual(restored_slot, saved_slot)
def testIterations(self):
    """Setting `iterations` on the wrapper also updates the inner optimizer."""
    inner = gradient_descent.SGD(2.0)
    wrapper = loss_scale_optimizer.LossScaleOptimizer(
        inner, dynamic=False, initial_scale=10.)
    wrapper.iterations = 7
    # Both the wrapper and the wrapped optimizer observe the new count.
    self.assertEqual(wrapper.iterations, 7)
    self.assertEqual(inner.iterations, 7)
def test_save_model_with_dynamic_loss_scaling(self, strategy_fn, h5=False):
    """Saving/loading a model restores weights; loss scale state may reset."""
    # TODO(reedwm): Support and test saving model with a mixed_[b]float16
    # policy as well.
    strategy = strategy_fn()
    if (isinstance(strategy, tf.distribute.MirroredStrategy) and
            not tf.executing_eagerly()):
        # TODO(b/121381184): Enable running the test in this case.
        return

    # Create and run model.
    with strategy.scope():
        x = layers.Input(shape=(2,), batch_size=2, dtype=tf.float32)
        y = mp_test_util.MultiplyLayer()(x)
        model = models.Model(inputs=x, outputs=y)

        opt = gradient_descent.SGD(1.)
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=1., dynamic_growth_steps=2.)
        model.compile(
            optimizer=opt,
            loss='mse',
            run_eagerly=test_utils.should_run_eagerly())
    # Run for 3 steps (6 examples with a batch size of 2)
    model.fit(np.ones((6, 2)), np.zeros((6, 2)), batch_size=2)
    self.assertEqual(backend.get_value(opt.loss_scale), 2)
    self.assertEqual(backend.get_value(opt.dynamic_counter), 1)
    (weight,) = model.trainable_weights
    orig_weight = backend.get_value(weight)

    # Save model weights.
    save_path = os.path.join(self.get_temp_dir(), 'model')
    model.save(save_path, save_format='h5' if h5 else 'tf')

    # Run model again for 1 step (2 examples with a batch size of 2)
    model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
    new_weight = backend.get_value(weight)
    self.assertNotEqual(new_weight, orig_weight)
    self.assertEqual(backend.get_value(opt.loss_scale), 4)
    self.assertEqual(backend.get_value(opt.dynamic_counter), 0)

    # Load model weights and ensure loss scale weights are restored.
    model = save.load_model(
        save_path,
        custom_objects={'MultiplyLayer': mp_test_util.MultiplyLayer})
    (weight,) = model.trainable_weights
    loaded_weight = backend.get_value(weight)
    self.assertEqual(loaded_weight, orig_weight)
    # Currently the loss scale isn't always saved when the model is saved
    # with Model.save(). So we assert the loss scale either has the value
    # when it was saved, or the value it was initialized with.
    # TODO(reedwm): Always save/restore the loss scale with Model.save().
    self.assertIn(backend.get_value(model.optimizer.loss_scale), (1, 2))
    self.assertIn(backend.get_value(model.optimizer.dynamic_counter), (0, 1))

    # Test optimizer attributes and type
    self.assertEqual(model.optimizer.initial_scale, 1.)
    self.assertEqual(model.optimizer.dynamic_growth_steps, 2.)
    self.assertEqual(type(model.optimizer),
                     loss_scale_optimizer.LossScaleOptimizer)
def testDynamicAttrsWithFixedLossScale(self):
    """With a fixed loss scale, the dynamic-only attributes are None/False."""
    sgd = gradient_descent.SGD()
    fixed_opt = loss_scale_optimizer.LossScaleOptimizer(
        sgd, dynamic=False, initial_scale=2.)
    self.assertFalse(fixed_opt.dynamic)
    # Counter and growth steps exist only for dynamic loss scaling.
    self.assertIsNone(fixed_opt.dynamic_counter)
    self.assertIsNone(fixed_opt.dynamic_growth_steps)
def testGetConfigFixed(self, config_version):
    # Get a config from LossScaleOptimizer, LossScaleOptimizerV3, or the
    # LossScaleOptimizer from TF 2.3. Then restore the config into a
    # LossScaleOptimizer or LossScaleOptimizerV3
    if config_version == 'v2':
        opt = gradient_descent.SGD(2., momentum=0.5)
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, dynamic=False, initial_scale=2)
        config = opt.get_config()
        opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config)
    elif config_version == 'v3':
        opt = sgd_experimental.SGD(2., momentum=0.5)
        opt = loss_scale_optimizer.LossScaleOptimizerV3(
            opt, dynamic=False, initial_scale=2)
        config = opt.get_config()
        opt = loss_scale_optimizer.LossScaleOptimizerV3.from_config(config)
    else:
        self.assertEqual(config_version, 'tf2_3')
        # Hard-coded config in the format TF 2.3 produced, to test backwards
        # compatibility of from_config.
        config = {
            'optimizer': {
                'class_name': 'SGD',
                'config': {
                    'learning_rate': 2.0,
                    'momentum': 0.5,
                    'decay': 0.0,
                    'nesterov': False,
                    'name': 'SGD',
                }
            },
            'loss_scale': {
                'class_name': 'FixedLossScale',
                'config': {'loss_scale_value': 2.0}
            },
        }
        opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config)

    # Force hyperparameters to be created
    opt.learning_rate  # pylint: disable=pointless-statement
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Test attributes on the optimizer
    self.assertEqual(self.evaluate(opt.learning_rate), 2.)
    self.assertEqual(self.evaluate(opt.inner_optimizer.learning_rate), 2.)
    self.assertEqual(self._eval_if_tensor(opt.inner_optimizer.momentum), 0.5)
    self.assertEqual(self.evaluate(opt.loss_scale), 2.)
    self.assertEqual(opt.initial_scale, 2.)
    self.assertIsNone(opt.dynamic_growth_steps)
    self.assertIsNone(opt.dynamic_counter)
    self.assertFalse(opt.dynamic)

    # Ensure the optimizer can be used
    var = tf.Variable([5.0])
    run_op = self._run_fn_with_grad_check(
        tf.distribute.get_strategy(), var, opt, 2)()
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    self.assertEqual(self.evaluate(var), [3.])
def testDir(self):
    """dir() on the wrapper lists hyperparameters and its own attributes."""
    lso = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
    exposed = dir(lso)
    self.assertIn('learning_rate', exposed)  # Hyperparameter
    self.assertIn('lr', exposed)  # Hyperparameter
    self.assertIn('minimize', exposed)  # Attribute
    self.assertIn('loss_scale', exposed)  # Attribute
    # 'nesterov' lives only on the inner optimizer, not on the wrapper.
    self.assertNotIn('nesterov', exposed)
    self.assertIn('nesterov', dir(lso.inner_optimizer))
def test_optimizer_errors(self):
    """The graph rewrite rejects an optimizer already wrapped in an LSO."""
    opt = gradient_descent_v2.SGD(1.0)
    opt = loss_scale_optimizer_v2.LossScaleOptimizer(opt)
    with self.assertRaisesRegex(
            ValueError, '"opt" must not already be an instance of a '
            'LossScaleOptimizer.'):
        tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(opt)
    # The failed call must not have enabled the rewrite.
    self.assertFalse(tf.config.optimizer.get_experimental_options().get(
        'auto_mixed_precision', False))
def test_loss_scale_optimizer_overrides_policy_v1_loss_scale(self):
    """An explicit LossScaleOptimizer wins over the policy's loss scale."""
    with policy.policy_scope(policy.PolicyV1('float32', loss_scale=10.)):
        opt = gradient_descent.SGD(1.)
        opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False,
                                                      initial_scale=5.)
        x = layers.Input(shape=(1,))
        y = mp_test_util.MultiplyLayer()(x)
        model = models.Model(x, y)
        model.compile(opt, loss='mse')
        # The optimizer's initial_scale=5. is used, not the policy's 10.
        self.assertEqual(self.evaluate(model.optimizer.loss_scale), 5.)
def test_restore_old_loss_scale_checkpoint(self):
    """A TF 2.2-format LossScaleOptimizer checkpoint still loads correctly."""
    # Ensure a checkpoint from TF 2.2 can be loaded. The checkpoint format
    # of LossScaleOptimizer changed, but old checkpoints can still be loaded
    opt = gradient_descent.SGD(0.1, momentum=0.1)
    opt = loss_scale_optimizer.LossScaleOptimizer(opt)
    model = sequential.Sequential(
        [
            core.Dense(
                2,
            )
        ]
    )

    # The checkpoint and expected values were obtained from the program in
    # testdata/BUILD.
    ckpt_dir = os.path.join(
        flags.FLAGS["test_srcdir"].value,
        "org_keras/keras",
        "mixed_precision/testdata/lso_ckpt_tf2.2",
    )
    # ckpt_dir = test.test_src_dir_path(
    #     'python/keras/mixed_precision/testdata/lso_ckpt_tf2.2')
    model.load_weights(os.path.join(ckpt_dir, "ckpt"))
    model.compile(opt, "mse", run_eagerly=test_utils.should_run_eagerly())
    model(np.zeros((2, 2)))  # Create model weights
    opt._create_all_weights(model.weights)
    # Expected values were recorded by the checkpoint-generating program.
    expected_kernel = np.array(
        [[9.229685, 10.901115], [10.370763, 9.757362]]
    )
    expected_slot = np.array([[10.049943, 9.917691], [10.049943, 9.917691]])
    self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel)
    self.assertAllClose(
        self.evaluate(opt.get_slot(model.weights[0], "momentum")),
        expected_slot,
    )
    self.assertEqual(self.evaluate(opt.loss_scale), 32768)
    self.assertEqual(self.evaluate(opt.dynamic_counter), 1)

    # Check restoring works even after the model is compiled and the weights
    # have been created.
    model.fit(np.random.normal(size=(2, 2)), np.random.normal(size=(2, 2)))
    self.assertNotAllClose(self.evaluate(model.weights[0]), expected_kernel)
    self.assertNotAllClose(
        self.evaluate(opt.get_slot(model.weights[0], "momentum")),
        expected_slot,
    )
    model.load_weights(os.path.join(ckpt_dir, "ckpt"))
    self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel)
    self.assertAllClose(
        self.evaluate(opt.get_slot(model.weights[0], "momentum")),
        expected_slot,
    )
    self.assertEqual(self.evaluate(opt.loss_scale), 32768)
    self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
def testHyperParametersExposed(self):
    """Inner-optimizer hyperparameters are readable/writable on the wrapper."""
    with self.cached_session():
        opt = adam.Adam(learning_rate=1.0, beta_1=0.5, beta_2=0.9)
        lso = loss_scale_optimizer.LossScaleOptimizer(opt)
        # Force hyperparameters to be created
        opt.lr  # pylint: disable=pointless-statement
        self.evaluate(tf.compat.v1.global_variables_initializer())

        # The wrapper exposes the same underlying variables (assertIs).
        self.assertEqual(self.evaluate(lso.beta_1), 0.5)
        self.assertIsInstance(lso.beta_1, tf.Variable)
        self.assertEqual(self.evaluate(lso.lr), 1.0)
        self.assertIs(lso.lr, opt.lr)
        self.assertIs(lso.lr, lso.learning_rate)

        # Setting on the wrapper updates the inner optimizer, and vice versa.
        lso.beta_1 = 0.25
        self.assertEqual(self.evaluate(lso.beta_1), 0.25)
        self.assertEqual(self.evaluate(opt.beta_1), 0.25)
        self.assertIs(lso.beta_1, opt.beta_1)
        opt.beta_1 = 0.75
        self.assertEqual(self.evaluate(lso.beta_1), 0.75)
        self.assertEqual(self.evaluate(opt.beta_1), 0.75)
        self.assertIs(lso.beta_1, opt.beta_1)
        lso.lr = 2.0
        self.assertEqual(self.evaluate(lso.lr), 2.0)
        self.assertEqual(self.evaluate(lso.learning_rate), 2.0)
        self.assertEqual(self.evaluate(opt.lr), 2.0)
        self.assertEqual(self.evaluate(opt.learning_rate), 2.0)
        self.assertIs(lso.lr, opt.lr)

        # Test setting attribute that is both attribute on LossScaleOptimizer
        # and hyperparameter on wrapped optimizer.
        class MyOpt(gradient_descent.SGD):

            def __init__(self):
                super().__init__()
                self._set_hyper('loss_scale', 123.)

        opt = MyOpt()
        lso = loss_scale_optimizer.LossScaleOptimizer(opt)
        # 'loss_scale' is ambiguous (wrapper attribute vs hyperparameter),
        # so assigning it must fail.
        with self.assertRaises(AttributeError):
            lso.loss_scale = 2.
def testArbitraryAttributesNotExposed(self):
    """Non-hyperparameter attributes of the inner optimizer are hidden."""
    inner = gradient_descent.SGD()
    lso = loss_scale_optimizer.LossScaleOptimizer(inner)
    self.assertFalse(inner.nesterov)
    # Reading 'nesterov' through the wrapper fails: it is not forwarded.
    with self.assertRaisesRegex(
            AttributeError,
            "'LossScaleOptimizer' object has no attribute 'nesterov'"):
        lso.nesterov  # pylint: disable=pointless-statement

    # Assigning it creates a wrapper-local attribute only; the inner
    # optimizer is untouched.
    lso.nesterov = True
    self.assertTrue(lso.nesterov)
    self.assertFalse(inner.nesterov)
def testGetUnscaledGradients(self):
    """get_unscaled_gradients divides by the loss scale and keeps Nones."""
    opt = loss_scale_optimizer.LossScaleOptimizer(
        gradient_descent.SGD(2.0), dynamic=False, initial_scale=2)
    scaled = [
        tf.convert_to_tensor(3.),
        None,
        tf.convert_to_tensor(-4., dtype='float16'),
    ]
    unscaled = [
        self.evaluate(g) if g is not None else None
        for g in opt.get_unscaled_gradients(scaled)
    ]
    # Each gradient is divided by the loss scale of 2; None passes through.
    self.assertEqual([1.5, None, -2.], unscaled)
def testGetScaledLoss(self):
    """get_scaled_loss multiplies a loss (tensor or callable) by the scale."""
    opt = loss_scale_optimizer.LossScaleOptimizer(
        gradient_descent.SGD(2.0), dynamic=False, initial_scale=2.)
    # Float32 loss: 5 * 2 == 10, for both tensor and callable forms.
    loss_f32 = tf.convert_to_tensor(5.)
    self.assertEqual(10., self.evaluate(opt.get_scaled_loss(loss_f32)))
    self.assertEqual(
        10., self.evaluate(opt.get_scaled_loss(lambda: loss_f32)()))
    # Float16 loss is scaled the same way.
    loss_f16 = tf.convert_to_tensor(5., dtype='float16')
    self.assertEqual(10., self.evaluate(opt.get_scaled_loss(loss_f16)))
    self.assertEqual(
        10., self.evaluate(opt.get_scaled_loss(lambda: loss_f16)()))
def testGetUnscaledSparseGradients(self):
    """IndexedSlices gradients are unscaled and keep their sparse type."""
    opt = loss_scale_optimizer.LossScaleOptimizer(
        gradient_descent.SGD(2.0), dynamic=False, initial_scale=2)
    scaled = tf.IndexedSlices(
        tf.convert_to_tensor([[4., 2.], [8., 5.]]),
        tf.convert_to_tensor([1, 3], dtype='int32'),
        dense_shape=tf.convert_to_tensor([5, 2], dtype='int32'))
    unscaled, = opt.get_unscaled_gradients([scaled])
    self.assertIsInstance(unscaled, tf.IndexedSlices)
    # Only the values are divided by the loss scale of 2.
    self.assertAllEqual([[2., 1.], [4., 2.5]],
                        self.evaluate(unscaled.values))
def testFixedLossScaleAppliedToLossWithGetGradients(self):
    """get_gradients scales the loss by the fixed loss scale in graph mode."""
    with tf.Graph().as_default():
        var = tf.Variable([2.0])
        opt = gradient_descent.SGD(1.0)
        loss_scale = 10.
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, dynamic=False, initial_scale=loss_scale)
        grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
            loss_scale)
        loss = grad_check_fn(var)
        run_op = opt.get_gradients(loss, [var])
        self.evaluate(tf.compat.v1.global_variables_initializer())
        # This will cause an assertion to run, as
        # mp_test_util.create_identity_with_grad_check_fn added an assertion
        # op.
        self.evaluate(run_op)
def testWeightMethods(self):
    """weights/get_weights/set_weights/variables work through the wrapper."""
    with self.test_session():
        var = tf.Variable([1.0])
        opt = gradient_descent.SGD(1.0)
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=2., dynamic_growth_steps=1)
        run_op = opt.minimize(lambda: var * 2, [var])
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)

        self.assertLen(opt.weights, 1)  # The 'iterations' weight
        self.assertEqual(self.evaluate(opt.weights[0]), 1)
        self.assertEqual(opt.get_weights()[0], 1)
        self.assertEqual(self.evaluate(opt.variables()[0]), 1)
        # set_weights overwrites the iterations counter.
        opt.set_weights([np.array(2.)])
        self.assertEqual(self.evaluate(opt.variables()[0]), 2)
def testDynamicLossScaleWithFloat16Loss(self, strategy_fn):
    """Dynamic loss scaling works when the loss tensor is float16."""
    strategy = strategy_fn()
    learning_rate = 2.
    with strategy.scope():
        var = tf.Variable([5.0])
        opt = gradient_descent.SGD(learning_rate)
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=2, dynamic_growth_steps=1)

        def loss():
            # Cast to float16 to exercise scaling with a half-precision loss.
            return tf.cast(var / strategy.num_replicas_in_sync, 'float16')

        run_fn = lambda: opt.minimize(loss, var_list=[var])
        run_op = strategy.experimental_run(run_fn)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)
        # The loss is the identity of the variable. Therefore the gradient is
        # 1, and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
        self.assertAllClose([3.], self.evaluate(var))
def test_save_weights_with_dynamic_loss_scaling(self, strategy_fn):
    """Loss scale state is saved and restored with model weights."""
    strategy = strategy_fn()
    if (
        isinstance(strategy, tf.distribute.MirroredStrategy)
        and not tf.executing_eagerly()
    ):
        # TODO(b/121381184): Enable running the test in this case.
        return

    # Create and run model.
    with strategy.scope():
        x = layers.Input(shape=(2,), batch_size=2, dtype=tf.float32)
        y = mp_test_util.MultiplyLayer(assert_type=tf.float32)(x)
        model = models.Model(inputs=x, outputs=y)

        opt = gradient_descent.SGD(1.0)
        opt = loss_scale_optimizer.LossScaleOptimizer(
            opt, initial_scale=1.0, dynamic_growth_steps=2.0
        )
        model.compile(
            optimizer=opt,
            loss="mse",
            run_eagerly=test_utils.should_run_eagerly(),
        )
    # Run for 3 steps (6 examples with a batch size of 2)
    model.fit(np.zeros((6, 2)), np.zeros((6, 2)), batch_size=2)
    self.assertEqual(backend.get_value(opt.loss_scale), 2)
    self.assertEqual(backend.get_value(opt.dynamic_counter), 1)

    # Save model weights.
    save_prefix = os.path.join(self.get_temp_dir(), "ckpt")
    model.save_weights(save_prefix)

    # Run model again for 1 step (2 examples with a batch size of 2)
    model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2)
    self.assertEqual(backend.get_value(opt.loss_scale), 4)
    self.assertEqual(backend.get_value(opt.dynamic_counter), 0)

    # Load model weights and ensure loss scale weights are restored.
    model.load_weights(save_prefix)
    self.assertEqual(backend.get_value(opt.loss_scale), 2)
    self.assertEqual(backend.get_value(opt.dynamic_counter), 1)
def test_compile_wraps_with_loss_scale_optimizer(self):
    """Model.compile auto-wraps the optimizer under a mixed_float16 policy."""
    x = layers.Input(shape=(1,))
    y = mp_test_util.MultiplyLayer()(x)

    with policy.policy_scope("mixed_float16"):
        # Test optimizer is automatically wrapped with LSO
        model = models.Model(x, y)
        model.compile(gradient_descent.SGD(1.0), "mse")
        self.assertIsInstance(
            model.optimizer, loss_scale_optimizer.LossScaleOptimizer
        )
        self.assertEqual(
            backend.get_value(model.optimizer.learning_rate), 1.0
        )

        # Test optimizer specified as string is automatically wrapped in LSO
        model = models.Model(x, y)
        model.compile("sgd", "mse")
        self.assertIsInstance(
            model.optimizer, loss_scale_optimizer.LossScaleOptimizer
        )

        # Test if an LSO is passed, optimizer is not automatically wrapped
        # with another LSO
        model = models.Model(x, y)
        optimizer = loss_scale_optimizer.LossScaleOptimizer(
            gradient_descent.SGD(1.0), dynamic_growth_steps=2
        )
        model.compile(optimizer, "mse")
        self.assertIsInstance(
            model.optimizer, loss_scale_optimizer.LossScaleOptimizer
        )
        self.assertEqual(model.optimizer.dynamic_growth_steps, 2)

    with policy.policy_scope("mixed_bfloat16"):
        # Test mixed_bfloat16 models are not automatically wrapped with LSO
        model = models.Model(x, y)
        model.compile(gradient_descent.SGD(1.0), "mse")
        self.assertNotIsInstance(
            model.optimizer, loss_scale_optimizer.LossScaleOptimizer
        )
        self.assertIsInstance(model.optimizer, gradient_descent.SGD)
def testClipping(self, strategy_fn):
    """Gradient clipping composes with loss scaling; Inf grads still skip."""
    strategy = strategy_fn()
    learning_rate = 2.
    for clip_type in ('clipnorm', 'global_clipnorm', 'clipvalue'):
        with strategy.scope(), self.subTest(clip_type=clip_type):
            var = tf.Variable([5.0])
            opt = gradient_descent.SGD(learning_rate, **{clip_type: 2.0})
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2, dynamic_growth_steps=1)
            # Clipping attributes are forwarded by the wrapper.
            self.assertEqual(getattr(opt, clip_type), 2.0)
            self.assertEqual(
                opt.initial_scale % strategy.num_replicas_in_sync, 0)

            loss = lambda: var * 4 / strategy.num_replicas_in_sync
            run_fn = lambda: opt.minimize(loss, var_list=[var])

            # Test running with clipped gradients
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The gradient is 4 but is clipped to 2, so the variable will be
            # init_val - clipped_grad * lr == 5 - 2 * 2 == 1
            self.assertAllClose([1.], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale), 4)

            # Test changing the clip amount and running again
            setattr(opt, clip_type, 3.0)
            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # The gradient is 4 but is clipped to 3, so the variable will be
            # prev_var - clipped_grad * lr == 1 - 3 * 2 == -5
            self.assertAllClose([-5.], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale), 8)

            # Test Inf gradients are still skipped instead of being clipped
            loss = lambda: var * float('Inf')
            run_fn = lambda: opt.minimize(loss, var_list=[var])
            run_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(run_op)
            # Var does not change
            self.assertAllClose([-5.], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale), 4)