def from_config(cls, config, custom_objects=None):
  """Rebuilds an instance of `cls` from a serialized config dict.

  Two config layouts are accepted. When a 'loss_scale' entry is present the
  config is assumed to come from TF 2.3 or below; otherwise it is assumed to
  come from TF 2.4 or above and is converted into the keyword arguments that
  `cls.__init__` takes.

  Args:
    config: The config dict. It is copied first, so the caller's dict is not
      modified.
    custom_objects: Passed through to `optimizers.deserialize` when the
      wrapped optimizer is rebuilt.

  Returns:
    The result of `cls(**config)` on the converted config.

  Raises:
    ValueError: If the config holds a `DynamicLossScale` whose multiplier is
      not 2.
  """
  params = config.copy()  # Work on a copy; the entries below are rewritten.
  params['optimizer'] = optimizers.deserialize(
      params['optimizer'], custom_objects=custom_objects)

  if 'loss_scale' in params:
    # Older layout: the loss scale is stored as a serialized object.
    loss_scale = keras_loss_scale_module.deserialize(params['loss_scale'])
    params['loss_scale'] = loss_scale
    is_dynamic = isinstance(loss_scale, loss_scale_module.DynamicLossScale)
    if is_dynamic and loss_scale.multiplier != 2:
      raise ValueError('Cannot deserialize LossScaleOptimizer with a '
                       'DynamicLossScale whose multiplier is not 2. Got '
                       'DynamicLossScale: %s' % (loss_scale,))
    return cls(**params)

  # Newer layout: rebuild the loss scale object from its flattened fields.
  if params['dynamic']:
    loss_scale = loss_scale_module.DynamicLossScale(
        params['initial_scale'], params['dynamic_growth_steps'], multiplier=2)
  else:
    loss_scale = loss_scale_module.FixedLossScale(params['initial_scale'])
  params['loss_scale'] = loss_scale

  # These keys are not constructor arguments, so drop them before the call.
  for stale_key in ('dynamic', 'initial_scale', 'dynamic_growth_steps'):
    del params[stale_key]
  return cls(**params)
def test_non_persistent_tapes_error(self):
  """A non-persistent tape must reject a second `gradient` call."""
  x = variables.Variable(3.0)
  with lsgt.LossScaleGradientTape(loss_scale_module.FixedLossScale(32),
                                  persistent=False) as g:
    y = x * x
    z = y * y
  # The first call is allowed and uses up the non-persistent tape.
  g.gradient(z, x)
  # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
  # which no longer exists in Python 3.12.
  with self.assertRaisesRegex(RuntimeError, 'persistent'):
    g.gradient(y, x)
def test_fixed_scaling_no_change_non_finite_gradient(
    self, non_finite_term, is_non_finite):
  """A non-finite gradient must not change a fixed loss scale.

  Args:
    non_finite_term: Value multiplied into the input to produce the output.
    is_non_finite: Predicate applied to the evaluated gradient; it must
      return a truthy value.
  """
  fixed_scale = loss_scale_module.FixedLossScale(32)
  inp = constant_op.constant(1.0)
  with lsgt.LossScalingGradientTape(fixed_scale) as tape:
    tape.watch(inp)
    out = inp * non_finite_term
  grad = tape.gradient(out, inp)

  grad_value = self.evaluate(grad)
  self.assertTrue(is_non_finite(grad_value))
  # The scale has to read back as the value it was constructed with.
  scale_value = self.evaluate(fixed_scale())
  self.assertEqual(scale_value, 32.0)
def test_non_persistent_tapes_error(self):
  """A non-persistent tape must reject a second `gradient` call."""
  fixed_scale = loss_scale_module.FixedLossScale(32)
  inp = constant_op.constant(3.0)
  with lsgt.LossScaleGradientTape(fixed_scale, persistent=False) as tape:
    tape.watch(inp)
    squared = inp * inp
    fourth_power = squared * squared
  # The first call is allowed and uses up the non-persistent tape.
  tape.gradient(fourth_power, inp)
  with self.assertRaisesRegex(RuntimeError, 'persistent'):
    tape.gradient(squared, inp)
def get(identifier):
  """Get a loss scale object.

  Args:
    identifier: One of the following:
      * `None`, which is returned as-is.
      * A dict, which is passed to `deserialize`.
      * An int or float, which is wrapped in a `FixedLossScale`.
      * The string 'dynamic', which yields a default `DynamicLossScale`.
      * A `LossScale` instance, which is returned unchanged.

  Returns:
    A loss scale object, or `None` if `identifier` is `None`.

  Raises:
    ValueError: If `identifier` is none of the accepted forms.
  """
  if identifier is None:
    return None
  if isinstance(identifier, dict):
    return deserialize(identifier)
  if isinstance(identifier, (int, float)):
    return loss_scale_module.FixedLossScale(identifier)
  if identifier == 'dynamic':
    return loss_scale_module.DynamicLossScale()
  if isinstance(identifier, loss_scale_module.LossScale):
    return identifier
  # Format through a 1-tuple: with a bare `% identifier`, a tuple identifier
  # would be unpacked as the format arguments and raise TypeError instead of
  # the intended ValueError.
  raise ValueError('Could not interpret loss scale identifier: %s' %
                   (identifier,))
def test_fixed_scaling_no_change_non_finite_gradient(
    self, strategy_fn, non_finite_term):
  """A non-finite gradient must not change a fixed loss scale.

  Args:
    strategy_fn: Zero-argument callable; its result is handed to
      `self._run_with_strategy` together with the function under test.
    non_finite_term: Value multiplied into the input. `np.inf` selects the
      positive-infinity check, anything else the NaN check.
  """
  fixed_scale = loss_scale_module.FixedLossScale(32)

  def run_fn():
    inp = constant_op.constant(1.0)
    with lsgt.LossScaleGradientTape(fixed_scale) as tape:
      tape.watch(inp)
      out = inp * non_finite_term
    return tape.gradient(out, inp)

  grads = self._run_with_strategy(run_fn, strategy_fn())

  # Choose the predicate that matches the kind of non-finite value injected.
  if non_finite_term == np.inf:
    check_fn = np.isposinf
  else:
    check_fn = np.isnan

  for grad in grads:
    grad_value = self.evaluate(grad)
    self.assertTrue(check_fn(grad_value))
  self.assertEqual(self.evaluate(fixed_scale()), 32.0)
def test_jacobian_raises_error(self):
  """`jacobian` and `batch_jacobian` must raise NotImplementedError."""
  loss_scale = loss_scale_module.FixedLossScale(2.)

  # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
  # which no longer exists in Python 3.12.
  x = variables.Variable([1.0, 2.0])
  with lsgt.LossScaleGradientTape(loss_scale) as g:
    y = x * 2
  with self.assertRaisesRegex(
      NotImplementedError,
      'LossScaleGradientTape.jacobian is not yet implemented'):
    g.jacobian(y, x)

  # Same check for the batched variant, using a rank-2 variable.
  x = variables.Variable([[1.0, 2.0], [3.0, 4.0]])
  with lsgt.LossScaleGradientTape(loss_scale) as g:
    y = x * 2
  with self.assertRaisesRegex(
      NotImplementedError,
      'LossScaleGradientTape.batch_jacobian is not yet implemented'):
    g.batch_jacobian(y, x)
def test_basic(self):
  """`update` never changes a fixed loss scale and always says "apply"."""
  expected_scale = 1000
  fixed_scale = loss_scale_module.FixedLossScale(expected_scale)

  # Run the same checks with a finite gradient first, then with a NaN one.
  for grad_value in (0., float('NaN')):
    update_op, should_apply = fixed_scale.update(
        [constant_op.constant(grad_value)])
    self.evaluate(update_op)
    # should_apply should be a bool instead of a tensor, so that a tf.cond
    # does not have to be built in the graph by the caller.
    self.assertIsInstance(should_apply, bool)
    self.assertTrue(should_apply)
    self.assertEqual(expected_scale, self.evaluate(fixed_scale()))
def testPassingV1LossScale(self, strategy_fn):
  """`LossScaleOptimizerV1` accepts fixed and dynamic loss scale objects.

  Args:
    strategy_fn: Zero-argument callable returning the strategy under whose
      scope the variables and optimizers are created.
  """
  strategy = strategy_fn()
  learning_rate = 2.
  with strategy.scope():
    # --- Fixed loss scale -------------------------------------------------
    weights = variables.Variable([5.0])
    inner_opt = gradient_descent.SGD(learning_rate)
    v1_scale = tf_loss_scale_module.FixedLossScale(2.)
    lso = loss_scale_optimizer.LossScaleOptimizerV1(inner_opt, v1_scale)
    self.assertIsInstance(lso.loss_scale, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertEqual(self.evaluate(lso.loss_scale), 2)

    expected_grad = 2 / strategy.num_replicas_in_sync
    run_fn = self._run_fn_with_grad_check(
        strategy, weights, lso, expected_grad)
    run_op = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    # The loss is the identity of the variable. Therefore the gradient is 1,
    # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
    self.assertAllClose([3.], self.evaluate(weights))

    # --- Dynamic loss scale -----------------------------------------------
    weights = variables.Variable([5.0])
    inner_opt = gradient_descent.SGD(learning_rate)
    v1_scale = tf_loss_scale_module.DynamicLossScale(
        initial_loss_scale=4, increment_period=1, multiplier=2)
    v1_scale._current_loss_scale.assign(2)
    lso = loss_scale_optimizer.LossScaleOptimizerV1(inner_opt, v1_scale)
    self.assertEqual(lso.initial_scale, 4)
    self.assertEqual(lso.dynamic_growth_steps, 1)
    self.evaluate(variables.global_variables_initializer())
    # Current loss scale is not copied so loss scale is reinitialized to 4
    self.assertEqual(self.evaluate(lso.loss_scale), 4)
    counters = strategy.experimental_local_results(lso.dynamic_counter)
    for counter in counters:
      self.assertEqual(self.evaluate(counter), 0)

    expected_grad = 4 / strategy.num_replicas_in_sync
    run_fn = self._run_fn_with_grad_check(
        strategy, weights, lso, expected_grad)
    run_op = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(run_op)
    self.assertAllClose([3.], self.evaluate(weights))
def testHyperParametersExposed(self):
  """Hyperparameters of the wrapped optimizer are visible on the wrapper."""
  with self.cached_session():
    base_opt = adam.Adam(learning_rate=1.0, beta_1=0.5, beta_2=0.9)
    wrapper = loss_scale_optimizer.LossScaleOptimizer(base_opt, 'dynamic')
    # Force hyperparameters to be created
    base_opt.lr  # pylint: disable=pointless-statement
    self.evaluate(variables.global_variables_initializer())

    # Reads on the wrapper return the wrapped optimizer's own objects.
    self.assertEqual(self.evaluate(wrapper.beta_1), 0.5)
    self.assertIsInstance(wrapper.beta_1, variables.Variable)
    self.assertEqual(self.evaluate(wrapper.lr), 1.0)
    self.assertIs(wrapper.lr, base_opt.lr)
    self.assertIs(wrapper.lr, wrapper.learning_rate)

    # A write through the wrapper shows up on the wrapped optimizer.
    wrapper.beta_1 = 0.25
    self.assertEqual(self.evaluate(wrapper.beta_1), 0.25)
    self.assertEqual(self.evaluate(base_opt.beta_1), 0.25)
    self.assertIs(wrapper.beta_1, base_opt.beta_1)

    # A write on the wrapped optimizer shows up through the wrapper.
    base_opt.beta_1 = 0.75
    self.assertEqual(self.evaluate(wrapper.beta_1), 0.75)
    self.assertEqual(self.evaluate(base_opt.beta_1), 0.75)
    self.assertIs(wrapper.beta_1, base_opt.beta_1)

    # `lr` and `learning_rate` stay in step on both objects.
    wrapper.lr = 2.0
    self.assertEqual(self.evaluate(wrapper.lr), 2.0)
    self.assertEqual(self.evaluate(wrapper.learning_rate), 2.0)
    self.assertEqual(self.evaluate(base_opt.lr), 2.0)
    self.assertEqual(self.evaluate(base_opt.learning_rate), 2.0)
    self.assertIs(wrapper.lr, base_opt.lr)

    # Test setting attribute that is both attribute on LossScaleOptimizer and
    # hyperparameter on wrapped optimizer.
    class MyOpt(gradient_descent.SGD):

      def __init__(self):
        super().__init__()
        self._set_hyper('loss_scale', 123.)

    base_opt = MyOpt()
    wrapper = loss_scale_optimizer.LossScaleOptimizer(base_opt, 'dynamic')
    with self.assertRaises(AttributeError):
      wrapper.loss_scale = loss_scale_module.FixedLossScale(2.)
def _benchmark(self, gradient_type, num_gpus, mode, loss_scaling):
  """Benchmarks loss scaling.

  The model is a set of scalar variables whose sum is the loss. It is kept
  this small on purpose: the goal is to time loss scaling itself, not the
  model.

  Args:
    gradient_type: "optimizer" or "gradient_tape". How gradients are
      computed. "optimizer" uses Optimizer.minimize. "gradient_tape" uses
      GradientTape.gradient along with LossScaleOptimizer.get_scaled_loss and
      LossScaleOptimizer.get_unscaled_gradients.
    num_gpus: The number of GPUs to use. Must be at least 1.
    mode: "eager" or "tf_function". "tf_function" wraps all computations in
      a tf.function, while "eager" runs them eagerly.
    loss_scaling: "fixed", "dynamic", or None. The type of loss scaling to
      use. None disables loss scaling, which gives a baseline for judging
      how much slower loss scaling is.
  """
  scaling_label = loss_scaling or 'no_loss_scaling'
  benchmark_name = '%s_%d_GPU_%s_%s' % (
      gradient_type, num_gpus, mode, scaling_label)
  num_vars = 200
  num_warmup_iters = 1
  num_iters = 20

  with context.eager_mode(), _get_strategy(num_gpus).scope() as strategy:
    opt = adam.Adam()
    if loss_scaling == 'fixed':
      loss_scale = loss_scale_module.FixedLossScale(2.)
    elif loss_scaling == 'dynamic':
      # Use an increment period large enough to be effectively infinite, so
      # the loss scale never changes. Growing or shrinking the scale happens
      # rarely and costs little, so only the common steady-state case is
      # benchmarked.
      increment_period = 1000000
      loss_scale = loss_scale_module.DynamicLossScale(
          initial_loss_scale=2., increment_period=increment_period)
    else:
      assert loss_scaling is None
      loss_scale = None
    if loss_scale:
      opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

    # Scalar variables keep the real GPU work (multiplying variables,
    # dividing gradients, checking gradients for NaNs) to a minimum. Timing
    # that work is not very useful because little can be done to reduce it
    # (fusing the division with the NaN check would be one way). All other
    # overheads remain, such as all-reducing the `is_finite` values and
    # having a tf.cond or tf.while_loop based on whether gradients are NaNs.
    # Currently those overheads are much larger than the GPU work.
    var_list = [
        variables.Variable(value, dtype='float32')
        for value in range(num_vars)
    ]

    def get_loss():
      return math_ops.add_n(var_list)

    if gradient_type != 'gradient_tape':
      assert gradient_type == 'optimizer'

      def minimize_fn():
        return opt.minimize(get_loss, var_list)
    elif loss_scale is None:

      def minimize_fn():
        with backprop.GradientTape() as tape:
          loss = get_loss()
        grads = tape.gradient(loss, var_list)
        return opt.apply_gradients(zip(grads, var_list))
    else:

      def minimize_fn():
        with backprop.GradientTape() as tape:
          loss = get_loss()
          scaled_loss = opt.get_scaled_loss(loss)
        scaled_grads = tape.gradient(scaled_loss, var_list)
        grads = opt.get_unscaled_gradients(scaled_grads)
        return opt.apply_gradients(zip(grads, var_list))

    def run_fn():
      strategy.run(minimize_fn)

    if mode == 'tf_function':
      run_fn = def_function.function(run_fn)

    # Warm-up iterations run before the clock starts and are not timed.
    for _ in range(num_warmup_iters):
      run_fn()

    started = time.time()
    for _ in range(num_iters):
      run_fn()
    elapsed = time.time() - started
    self.report_benchmark(
        iters=num_iters, wall_time=elapsed / num_iters, name=benchmark_name)
def test_repr(self):
  """`repr` of a fixed loss scale shows the scale as a float."""
  fixed_scale = loss_scale_module.FixedLossScale(123)
  rendered = repr(fixed_scale)
  self.assertEqual(rendered, 'FixedLossScale(123.0)')
def test_call_type(self):
  """Calling a fixed loss scale returns an `ops.Tensor`."""
  fixed_scale = loss_scale_module.FixedLossScale(123)
  current_value = fixed_scale()
  self.assertIsInstance(current_value, ops.Tensor)