def strategy_and_input_combinations():
  """Returns the strategy x input-configuration test combinations.

  Non-TPU strategies are paired with every graph-mode input configuration
  plus one eager-mode configuration (no numpy input, no validation data).
  TPU strategies are paired with the graph-mode configurations only.

  Returns:
    The non-TPU combinations followed by the TPU combinations.
  """

  def graph_inputs():
    # Built fresh on every call so the two products never share objects.
    return combinations.combine(
        mode=['graph'],
        use_numpy=[True, False],
        use_validation_data=[True, False])

  non_tpu = combinations.times(
      combinations.combine(distribution=strategies_minus_tpu),
      graph_inputs() + combinations.combine(
          mode=['eager'], use_numpy=[False], use_validation_data=[False]))
  tpu_only = combinations.times(
      combinations.combine(distribution=tpu_strategies), graph_inputs())
  return non_tpu + tpu_only
def strategy_and_input_combinations():
  """Builds the list of (strategy, input configuration) test parameters.

  Strategies in `strategies_minus_tpu` are combined with all graph-mode
  input configurations and with a single eager-mode configuration;
  strategies in `tpu_strategies` are combined with graph mode only.

  Returns:
    Combinations for the non-TPU strategies, then those for TPU strategies.
  """

  def graph_mode_inputs():
    # A new list per call keeps the two products independent of each other.
    return combinations.combine(
        mode=['graph'],
        use_numpy=[True, False],
        use_validation_data=[True, False])

  for_other_strategies = combinations.times(
      combinations.combine(distribution=strategies_minus_tpu),
      graph_mode_inputs() + combinations.combine(
          mode=['eager'], use_numpy=[False], use_validation_data=[False]))
  for_tpu_strategies = combinations.times(
      combinations.combine(distribution=tpu_strategies),
      graph_mode_inputs())
  return for_other_strategies + for_tpu_strategies
class TestDistributionStrategyWithNormalizationLayer(
    test.TestCase, parameterized.TestCase):
  """Checks BatchNormalization output statistics under each strategy."""

  @combinations.generate(
      combinations.times(
          all_strategy_combinations(),
          combinations.combine(fused=[True, False])))
  def test_batchnorm_correctness(self, distribution, fused):
    """Fits a lone BatchNormalization layer and checks its output statistics.

    Once beta is subtracted and gamma divided out, the predictions are
    expected to have mean 0 and standard deviation 1, each within 0.1.
    """
    with self.cached_session():
      with distribution.scope():
        network = keras.models.Sequential()
        batchnorm = keras.layers.BatchNormalization(
            input_shape=(10,), momentum=0.8, fused=fused)
        network.add(batchnorm)
        network.compile(
            loss='mse',
            optimizer=gradient_descent.GradientDescentOptimizer(0.01))

      # Samples are centered on 5.0 with a standard deviation of 10.0.
      samples = np.random.normal(
          loc=5.0, scale=10.0, size=(1000, 10)).astype('float32')

      # The layer is trained to reproduce its own input.
      train_data = batch_wrapper(
          dataset_ops.Dataset.from_tensor_slices((samples, samples)).repeat(
              100),
          32, distribution)
      predict_data = batch_wrapper(
          dataset_ops.Dataset.from_tensor_slices(samples).repeat(100),
          32, distribution)

      network.fit(train_data, epochs=4, verbose=0, steps_per_epoch=10)
      normalized = network.predict(predict_data, steps=2)

      # Undo the learned shift and scale before checking the statistics.
      normalized -= keras.backend.eval(batchnorm.beta)
      normalized /= keras.backend.eval(batchnorm.gamma)
      np.testing.assert_allclose(normalized.mean(), 0.0, atol=1e-1)
      np.testing.assert_allclose(normalized.std(), 1.0, atol=1e-1)
class SingleLossStepTest(test.TestCase, parameterized.TestCase):
  """Trains the single-loss example through its step function."""

  @combinations.generate(
      combinations.times(
          combinations.distributions_and_v1_optimizers(),
          combinations.combine(mode=combinations.graph_and_eager_modes)))
  def testTrainNetwork(self, distribution, optimizer_fn):
    """Runs ten steps and checks |kernel + bias - 1| never increases."""
    with distribution.scope():
      step, dense = single_loss_example(
          optimizer_fn, distribution, use_bias=True)

      if not context.executing_eagerly():
        # Graph mode: build the step op once and wrap it in a callable.
        with self.test_session() as active_session:
          train_once = active_session.make_callable(step())
      else:
        train_once = step
      self.evaluate(variables.global_variables_initializer())

      kernels = []
      offsets = []
      for _ in range(10):
        train_once()
        kernels.append(self.evaluate(dense.kernel))
        offsets.append(self.evaluate(dense.bias))

      distance = abs(
          numpy.add(numpy.squeeze(kernels), numpy.squeeze(offsets)) - 1)
      self.assertTrue(
          all(later <= earlier
              for earlier, later in zip(distance, distance[1:])))
def test_times_variable_arguments(self):
  """Checks `times` with three arguments against the expected product.

  The expected list varies the first argument slowest and the last argument
  fastest. The result must also equal a single `combine` call that is given
  all three keys at once.
  """
  modes = combinations.combine(mode=["graph", "eager"])
  optimizers = combinations.combine(optimizer=["adam", "gd"])
  distributions = combinations.combine(distribution=["d1", "d2"])
  product = combinations.times(distributions, modes, optimizers)

  expected = [
      OrderedDict([("distribution", distribution), ("mode", mode),
                   ("optimizer", optimizer)])
      for distribution in ("d1", "d2")
      for mode in ("graph", "eager")
      for optimizer in ("adam", "gd")
  ]
  self.assertEqual(expected, product)

  all_at_once = combinations.combine(
      mode=["graph", "eager"],
      optimizer=["adam", "gd"],
      distribution=["d1", "d2"])
  self.assertEqual(all_at_once, product)
def test_combinations_for_embedding_model():
  """Returns test combinations for embedding models.

  Every strategy from `strategies_for_embedding_models()` is paired with the
  graph-mode test configurations followed by the eager-mode ones.
  """
  distributions = combinations.combine(
      distribution=strategies_for_embedding_models())
  configurations = (
      graph_mode_test_configuration() + eager_mode_test_configuration())
  return combinations.times(distributions, configurations)
def test_times_variable_arguments(self):
  """Verifies that `times` accepts more than two combination lists.

  The product of distribution, mode and optimizer must list every triple,
  with the distribution changing slowest and the optimizer fastest, and must
  match one `combine` call that receives all three keys.
  """
  by_mode = combinations.combine(mode=["graph", "eager"])
  by_optimizer = combinations.combine(optimizer=["adam", "gd"])
  by_distribution = combinations.combine(distribution=["d1", "d2"])
  actual = combinations.times(by_distribution, by_mode, by_optimizer)

  expected = []
  for distribution in ("d1", "d2"):
    for mode in ("graph", "eager"):
      for optimizer in ("adam", "gd"):
        expected.append(
            OrderedDict([("distribution", distribution), ("mode", mode),
                         ("optimizer", optimizer)]))
  self.assertEqual(expected, actual)

  self.assertEqual(
      combinations.combine(
          mode=["graph", "eager"],
          optimizer=["adam", "gd"],
          distribution=["d1", "d2"]),
      actual)
def test_combinations_with_tpu_strategies():
  """Returns the two TPU strategies paired with graph-mode configurations."""
  distributions = combinations.combine(distribution=[
      combinations.tpu_strategy,
      combinations.tpu_strategy_one_step,
  ])
  return combinations.times(distributions, graph_mode_test_configuration())
def test_combinations_with_tpu_strategies():
  """Builds graph-mode test combinations for the TPU strategies.

  Returns:
    `tpu_strategy` and `tpu_strategy_one_step`, each combined with every
    entry of `graph_mode_test_configuration()`.
  """
  tpu_distributions = combinations.combine(
      distribution=[
          combinations.tpu_strategy,
          combinations.tpu_strategy_one_step,
      ])
  graph_configurations = graph_mode_test_configuration()
  return combinations.times(tpu_distributions, graph_configurations)
def strategy_and_optimizer_combinations():
  """Returns every strategy combination paired with each v1 optimizer."""
  strategies = all_strategy_combinations()
  v1_optimizers = [
      combinations.adagrad_optimizer_v1_fn,
      combinations.adam_optimizer_v1_fn,
      combinations.gradient_descent_optimizer_v1_fn,
      combinations.rmsprop_optimizer_v1_fn,
  ]
  return combinations.times(
      strategies, combinations.combine(optimizer=v1_optimizers))
def strategy_and_optimizer_combinations():
  """Returns every strategy combination paired with each enabled optimizer.

  The Keras v2 Adagrad and RMSprop factories are intentionally left out; see
  the TODO below.
  """
  strategies = all_strategy_combinations()
  # TODO(b/122372746): Uncomment optimizers after they pass tests.
  enabled_optimizers = [
      combinations.adagrad_optimizer_v1_fn,
      # combinations.adagrad_optimizer_keras_v2_fn,
      combinations.adam_optimizer_v1_fn,
      combinations.adam_optimizer_keras_v2_fn,
      combinations.gradient_descent_optimizer_v1_fn,
      combinations.gradient_descent_optimizer_keras_v2_fn,
      combinations.rmsprop_optimizer_v1_fn,
      # combinations.rmsprop_optimizer_keras_v2_fn
  ]
  return combinations.times(
      strategies, combinations.combine(optimizer=enabled_optimizers))
class MonitorTest(test.TestCase, parameterized.TestCase):
  """Tests for `monitor_lib.Monitor` in graph and eager modes."""

  @combinations.generate(
      combinations.times(
          combinations.distributions_and_v1_optimizers(),
          combinations.combine(mode=combinations.graph_and_eager_modes)))
  def testTrainNetwork(self, distribution, optimizer_fn):
    """Checks the weight is no further from 1 after nine additional steps."""
    with distribution.scope():
      step, dense = single_loss_example(optimizer_fn, distribution)

      if not context.executing_eagerly():
        with self.test_session() as active_session:
          trainer = monitor_lib.Monitor(step, active_session)
      else:
        # No session is passed when running eagerly.
        trainer = monitor_lib.Monitor(step, None)

      def distance_from_one(variable):
        fetched = self.evaluate(distribution.fetch(variable))
        return abs(numpy.array(fetched) - 1)

      trainer.run_steps(1)

      self.assertEqual(1, len(dense.trainable_variables))
      weight = dense.trainable_variables[0]
      error_after_first_step = distance_from_one(weight)

      trainer.run_steps(9)
      error_after_last_step = distance_from_one(weight)
      self.assertGreaterEqual(error_after_first_step, error_after_last_step)

  def testPassingASessionInEager(self):
    """Expects a ValueError when a session is supplied in eager mode."""
    strategy = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
    step, _ = single_loss_example(
        lambda: gradient_descent.GradientDescentOptimizer(0.2), strategy)

    with session.Session() as graph_session, context.eager_mode():
      with self.assertRaisesRegexp(ValueError, "Should not provide"):
        monitor_lib.Monitor(step, graph_session)

  def testNotPassingASessionInGraph(self):
    """Expects a ValueError when no session is supplied in graph mode."""
    strategy = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
    step, _ = single_loss_example(
        lambda: gradient_descent.GradientDescentOptimizer(0.2), strategy)

    with context.graph_mode(), ops.Graph().as_default():
      with self.assertRaisesRegexp(ValueError, "Should provide"):
        monitor_lib.Monitor(step, session=None)
class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
  """Trains the minimize-loss example with the v2 optimizers."""

  @combinations.generate(
      combinations.times(
          combinations.distributions_and_v2_optimizers(),
          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
  def testTrainNetwork(self, distribution, optimizer_fn,
                       use_callable_loss=True):
    """Runs ten steps and checks |kernel + bias - 1| never increases."""
    with distribution.scope():
      model_fn, dataset_fn, dense = minimize_loss_example(
          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)

      distributed = distribution.distribute_dataset(dataset_fn)
      iterator = (
          distributed.make_one_shot_iterator()
          if context.executing_eagerly()
          else distributed.make_initializable_iterator())

      def run_step():
        per_replica = distribution.call_for_each_replica(
            model_fn, iterator.get_next(), run_concurrently=dense.built)
        return control_flow_ops.group(distribution.unwrap(per_replica))

      if not context.executing_eagerly():
        with self.cached_session() as active_session:
          # The initializable iterator must be initialized before first use.
          active_session.run(iterator.initializer)
          run_step = active_session.make_callable(run_step())
        self.evaluate(variables.global_variables_initializer())

      kernels = []
      offsets = []
      for _ in range(10):
        run_step()
        kernels.append(self.evaluate(dense.kernel))
        offsets.append(self.evaluate(dense.bias))

      distance = abs(
          numpy.add(numpy.squeeze(kernels), numpy.squeeze(offsets)) - 1)
      self.assertTrue(
          all(later <= earlier
              for earlier, later in zip(distance, distance[1:])))
def strategy_and_input_combinations():
  """Returns strategy x input-configuration x model test combinations.

  Every strategy in `all_strategies` is combined with both execution modes,
  both `use_numpy` values, both `use_validation_data` values and each of the
  three models (DNN, CNN, CNN with batch normalization).
  """

  def cnn_model_with_batch_norm(**kwargs):
    return _create_cnn_model(with_batch_norm=True, **kwargs)

  distributions = combinations.combine(distribution=all_strategies)
  input_configs = combinations.combine(
      mode=['graph', 'eager'],
      use_numpy=[True, False],
      use_validation_data=[True, False])
  models = combinations.combine(model_with_data=[
      ModelWithData('dnn', _create_dnn_model, _dnn_training_data),
      ModelWithData('cnn', _create_cnn_model, _cnn_training_data),
      ModelWithData(
          'cnn_batch_norm',
          cnn_model_with_batch_norm,
          _cnn_training_data,
          with_batch_norm=True),
  ])
  return combinations.times(distributions, input_configs, models)
def strategy_and_input_combinations():
  """Builds the full test matrix of strategies, input options and models.

  Returns:
    The product of all strategies, the mode / `use_numpy` /
    `use_validation_data` options, and the DNN, CNN and batch-norm CNN model
    descriptions.
  """

  def cnn_model_with_batch_norm(**kwargs):
    # Same CNN factory, with batch normalization always switched on.
    return _create_cnn_model(with_batch_norm=True, **kwargs)

  per_strategy = combinations.combine(distribution=all_strategies)
  per_input_option = combinations.combine(
      mode=['graph', 'eager'],
      use_numpy=[True, False],
      use_validation_data=[True, False])

  dnn = ModelWithData('dnn', _create_dnn_model, _dnn_training_data)
  cnn = ModelWithData('cnn', _create_cnn_model, _cnn_training_data)
  cnn_batch_norm = ModelWithData(
      'cnn_batch_norm',
      cnn_model_with_batch_norm,
      _cnn_training_data,
      with_batch_norm=True)
  per_model = combinations.combine(model_with_data=[dnn, cnn, cnn_batch_norm])

  return combinations.times(per_strategy, per_input_option, per_model)
def test_times(self):
  """Checks `times` applied to a concatenation of two `combine` results.

  Each distribution is expected to be paired, in order, with the two
  graph-mode entries and then the single eager-mode entry.
  """
  graph_losses = combinations.combine(
      mode=["graph"], loss=["callable", "tensor"])
  eager_losses = combinations.combine(mode=["eager"], loss=["callable"])
  distributions = combinations.combine(distribution=["d1", "d2"])
  product = combinations.times(distributions, graph_losses + eager_losses)

  loss_and_mode = [
      ("callable", "graph"),
      ("tensor", "graph"),
      ("callable", "eager"),
  ]
  expected = [
      OrderedDict([("distribution", distribution), ("loss", loss),
                   ("mode", mode)])
      for distribution in ("d1", "d2")
      for loss, mode in loss_and_mode
  ]
  self.assertEqual(expected, product)
class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
  """Training-step tests for the minimize-loss and batch-norm examples.

  Most tests follow the same sequence: build the example inside
  `distribution.scope()`, obtain an iterator, define `run_step`, initialize
  the strategy, turn `run_step` into a session callable in graph mode, run
  the variable initializer, execute steps, and finalize the strategy.
  """

  def _get_iterator(self, ds):
    """Returns an iterator over `ds`, initializing it in graph mode."""
    if context.executing_eagerly():
      iterator = ds.make_one_shot_iterator()
    else:
      iterator = ds.make_initializable_iterator()
      self.evaluate(iterator.initializer)
    return iterator

  @combinations.generate(
      combinations.times(
          combinations.distributions_and_v1_optimizers(),
          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
          + combinations.combine(mode=["eager"], use_callable_loss=[True])) +
      combinations.combine(
          distribution=[combinations.tpu_strategy],
          optimizer_fn=combinations.optimizers_v1,
          mode=["graph"],
          use_callable_loss=[True, False]))
  def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss):
    """Runs five two-iteration steps; |kernel + bias - 1| must not increase."""
    with distribution.scope():
      model_fn, dataset_fn, layer = minimize_loss_example(
          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)

      def step_fn(ctx, inputs):
        del ctx  # Unused
        return distribution.group(
            distribution.call_for_each_replica(model_fn, args=(inputs,)))

      iterator = self._get_iterator(
          distribution.distribute_dataset(dataset_fn))

      def run_step():
        return distribution.run_steps_on_dataset(
            step_fn, iterator, iterations=2).run_op

      self.evaluate(distribution.initialize())
      if not context.executing_eagerly():
        # Graph mode: build the op once and rebind `run_step` to a callable.
        with self.cached_session() as sess:
          run_step = sess.make_callable(run_step())
      self.evaluate(variables_lib.global_variables_initializer())

      weights, biases = [], []
      for _ in range(5):
        run_step()

        weights.append(self.evaluate(layer.kernel))
        biases.append(self.evaluate(layer.bias))

      self.evaluate(distribution.finalize())

      error = abs(
          numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
      self.assertTrue(is_not_increasing)

  @combinations.generate(
      combinations.times(
          combinations.distributions_and_v1_optimizers(),
          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
  def testTrainNetworkByCallForEachReplica(self, distribution, optimizer_fn,
                                           use_callable_loss):
    """Runs ten steps via `call_for_each_replica`; error must not increase."""
    with distribution.scope():
      model_fn, dataset_fn, layer = minimize_loss_example(
          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)

      iterator = self._get_iterator(
          distribution.distribute_dataset(dataset_fn))

      def run_step():
        return distribution.group(
            distribution.call_for_each_replica(
                model_fn, args=(iterator.get_next(),)))

      if not context.executing_eagerly():
        with self.cached_session() as sess:
          run_step = sess.make_callable(run_step())
      self.evaluate(variables_lib.global_variables_initializer())

      weights, biases = [], []
      for _ in range(10):
        run_step()

        weights.append(self.evaluate(layer.kernel))
        biases.append(self.evaluate(layer.bias))

      error = abs(
          numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
      self.assertTrue(is_not_increasing)

  @combinations.generate(
      combinations.times(
          combinations.distributions_and_v1_optimizers() +
          combinations.distributions_and_v2_optimizers(),
          combinations.combine(mode=["graph", "eager"])) +
      combinations.combine(
          distribution=[combinations.tpu_strategy],
          optimizer_fn=combinations.optimizers_v1 + combinations.optimizers_v2,
          mode=["graph"]))
  def testOptimizerInsideModelFn(self, distribution, optimizer_fn):
    """Checks the variable names created when the optimizer is built in model_fn.

    A variable creator records the name of every variable created during one
    step; the recorded set must equal the names expected for the optimizer and
    the number of parameter devices.
    """
    created_variables = []
    trainable_variables = []

    def appending_creator(next_creator, *args, **kwargs):
      """Creates the variable and records its name."""
      v = next_creator(*args, **kwargs)
      created_variables.append(v.name)
      if "trainable" in kwargs and kwargs["trainable"]:
        trainable_variables.append(v.name)
      return v

    # Creator scope needs to be set before it's used inside
    # `distribution.scope`.
    with variable_scope.variable_creator_scope(
        appending_creator), distribution.scope():
      model_fn, dataset_fn, layer = minimize_loss_example(
          optimizer_fn,
          use_bias=True,
          use_callable_loss=True,
          create_optimizer_inside_model_fn=True)

      def step_fn(ctx, inputs):
        del ctx  # Unused
        return distribution.group(
            distribution.call_for_each_replica(model_fn, args=(inputs,)))

      iterator = self._get_iterator(
          distribution.distribute_dataset(dataset_fn))

      def run_step():
        return distribution.run_steps_on_dataset(
            step_fn, iterator, iterations=1).run_op

      self.evaluate(distribution.initialize())
      if not context.executing_eagerly():
        with self.cached_session() as sess:
          run_step = sess.make_callable(run_step())
      self.evaluate(variables_lib.global_variables_initializer())

      run_step()

      self.evaluate(distribution.finalize())

      def get_expected_variables(optimizer_fn, num_parameter_devices):
        """Returns the expected variable names, including replica copies."""
        variables_map = {
            "GradientDescent": ["dense/kernel", "dense/bias"],
            "Adagrad": [
                "dense/kernel/Adagrad", "dense/kernel",
                "dense/bias/Adagrad", "dense/bias"
            ]
        }
        variables = variables_map[optimizer_fn().get_name()]
        # The comprehension is fully evaluated before `extend` mutates the
        # list, so only the base names receive "/replica_N" suffixes.
        variables.extend([
            v + "/replica_{}".format(replica)
            for v in variables
            for replica in range(1, num_parameter_devices)
        ])
        return set([v + ":0" for v in variables])

      self.assertEqual(
          get_expected_variables(optimizer_fn,
                                 len(distribution.parameter_devices)),
          set(created_variables))

  @combinations.generate(
      combinations.times(
          combinations.combine(momentum=[0.8, 0.9, 0.99], renorm=[False, True]),
          combinations.times(
              combinations.distributions_and_v1_optimizers(),
              combinations.combine(
                  mode=["graph", "eager"],
                  # TODO(isaprykin): Allow False here. Currently subsequent
                  # replicas will re-execute UPDATE_OPS of previous replicas.
                  update_ops_in_cross_replica_mode=[True])) +
          combinations.combine(
              distribution=[combinations.tpu_strategy],
              optimizer_fn=combinations.optimizers_v1,
              mode=["graph"],
              update_ops_in_cross_replica_mode=[False])))
  def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
                                    renorm, update_ops_in_cross_replica_mode):
    """Verifies that moving mean updates are reduced across replicas."""
    with distribution.scope():
      num_replicas = distribution.num_replicas_in_sync
      model_fn, dataset_fn, batchnorm = batchnorm_example(
          optimizer_fn,
          batch_per_epoch=num_replicas,
          momentum=momentum,
          renorm=renorm,
          update_ops_in_replica_mode=not update_ops_in_cross_replica_mode
      )

      def step_fn(ctx, inputs):
        del ctx  # Unused
        fetches = distribution.unwrap(
            distribution.call_for_each_replica(model_fn, args=(inputs,)))
        if update_ops_in_cross_replica_mode:
          # Also run the ops collected under UPDATE_OPS in this mode.
          fetches += tuple(
              ops.get_collection(ops.GraphKeys.UPDATE_OPS))
        return control_flow_ops.group(fetches)

      iterator = self._get_iterator(
          distribution.distribute_dataset(dataset_fn))

      def run_step():
        return distribution.run_steps_on_dataset(
            step_fn, iterator, iterations=1).run_op

      self.evaluate(distribution.initialize())
      if not context.executing_eagerly():
        with self.cached_session() as sess:
          run_step = sess.make_callable(run_step())
      self.evaluate(variables_lib.global_variables_initializer())

      expected_moving_means = [0.] * 8

      def averaged_batch_mean(i):
        # Each batch has shape [16, 8] where the ith element in jth list is
        # (8 * j + i + replica_id * 100). So the batch mean in each replica is
        # (60 + i + replica_id * 100). So here comes its batch mean over all
        # replicas:
        return 60. + i + (num_replicas - 1.) / 2. * 100.

      for _ in range(10):
        run_step()
        moving_means = self.evaluate(batchnorm.moving_mean)

        # We make sure that the moving_mean is updated as if the sample mean is
        # calculated over all replicas.
        for i, expected_moving_mean in enumerate(
            expected_moving_means):
          expected_moving_means[i] -= (
              (expected_moving_mean - averaged_batch_mean(i)) *
              (1.0 - momentum))
          self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)

      self.evaluate(distribution.finalize())

  @combinations.generate(
      combinations.times(
          combinations.combine(
              optimizer_fn=[
                  combinations.gradient_descent_optimizer_v1_fn,
                  combinations.gradient_descent_optimizer_v2_fn
              ],
              loss_reduction=[
                  losses_impl.Reduction.SUM,
                  losses_impl.Reduction.MEAN,
                  losses_impl.Reduction.SUM_OVER_BATCH_SIZE,
                  losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS
              ]),
          combinations.times(
              combinations.combine(distribution=[
                  combinations.one_device_strategy,
                  combinations.mirrored_strategy_with_gpu_and_cpu,
                  combinations.mirrored_strategy_with_two_gpus,
                  combinations.core_mirrored_strategy_with_gpu_and_cpu,
                  combinations.core_mirrored_strategy_with_two_gpus
              ]),
              combinations.combine(
                  mode=["graph"], use_callable_loss=[True, False]) +
              combinations.combine(mode=["eager"], use_callable_loss=[True])) +
          combinations.combine(
              distribution=[combinations.tpu_strategy],
              mode=["graph"],
              use_callable_loss=[True, False])))
  def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
                    use_callable_loss):
    """Checks the weight after one step for sum versus mean loss reductions."""
    with distribution.scope():
      all_vars = []

      def model_fn(inputs):
        x, y = inputs

        def loss_fn():
          # Use fixed initialization to make the steps deterministic.
          w = variable_scope.get_variable("w", initializer=[[2.]])
          all_vars.append(w)
          predict = math_ops.matmul(x, w)
          return losses_impl.mean_squared_error(
              y, predict, reduction=loss_reduction)

        optimizer = optimizer_fn()  # GradientDescent with 0.2 learning rate

        if use_callable_loss:
          return optimizer.minimize(loss_fn)
        else:
          return optimizer.minimize(loss_fn())

      def dataset_fn():
        features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
        labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
        return dataset_ops.Dataset.zip((features, labels)).repeat()

      def step_fn(ctx, inputs):
        del ctx  # Unused
        return distribution.group(
            distribution.call_for_each_replica(model_fn, args=(inputs,)))

      iterator = self._get_iterator(
          distribution.distribute_dataset(dataset_fn))

      def run_step():
        return distribution.run_steps_on_dataset(
            step_fn, iterator, iterations=1).run_op

      self.evaluate(distribution.initialize())
      if not context.executing_eagerly():
        with self.cached_session() as sess:
          run_step = sess.make_callable(run_step())
      self.evaluate(variables_lib.global_variables_initializer())

      run_step()

      # Every recorded variable must be the very same object.
      v = all_vars[0]
      self.assertTrue(all(v is vi for vi in all_vars[1:]))
      weight = numpy.squeeze(self.evaluate(v))
      # Our model is:
      #   predict = x * w
      #   loss = (predict - y)^2
      #   dloss/dpredict = 2*(predict - y)
      #   dloss/dw = 2 * x^T @ (predict - y)
      # For our batch size of 2, assuming sum loss reduction:
      #   x = [2, 7]
      #   y = [6, 21]
      #   w_initial = 2
      #   predict = [4, 14]
      #   predict - y = [-2, -7]
      #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
      # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
      # with sum loss reduction, or 10.6 with mean.
      if loss_reduction == losses_impl.Reduction.SUM:
        # Note that the "distribution.num_replicas_in_sync" factor will go away
        # once we split the input across replicas, instead of pulling a complete
        # batch of input per replica.
        self.assertNear(weight, 2 + 21.2 * distribution.num_replicas_in_sync,
                        0.0001)
      else:
        # One of the mean loss reductions.
        self.assertNear(weight, 2 + 10.6, 0.0001)

      self.evaluate(distribution.finalize())

  @combinations.generate(
      combinations.times(combinations.distributions_and_v1_optimizers(),
                         combinations.combine(mode=["graph", "eager"]),
                         combinations.combine(is_tpu=[False])) +
      combinations.combine(
          distribution=[combinations.tpu_strategy],
          optimizer_fn=combinations.optimizers_v1,
          mode=["graph"],
          is_tpu=[True]))
  def testRunStepsWithOutputContext(self, distribution, optimizer_fn, is_tpu):
    """Checks outputs recorded on the context by `run_steps_on_dataset`.

    Verifies the non-tensor output, the dtype/shape of each last-step output,
    and that both the loss and |kernel + bias - 1| never increase over five
    steps. `is_tpu` is supplied by the combinations but not read in the body.
    """
    with distribution.scope():

      def dataset_fn():
        dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
        # TODO(priyag): batch with drop_remainder=True causes shapes to be
        # fully defined for TPU. Remove this when XLA supports dynamic shapes.
        return dataset.batch(batch_size=1, drop_remainder=True)

      optimizer = optimizer_fn()
      layer = core.Dense(1, use_bias=True)

      key1 = "foo"
      value1 = "bar"

      def model_fn(output_context, x):
        """A very simple model written by the user."""

        def loss_fn():
          y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
          return y * y

        train_op = optimizer.minimize(loss_fn)
        loss = loss_fn()
        output_context.set_last_step_output(
            name="replica_loss_reduced",
            output=loss,
            reduce_op=reduce_util.ReduceOp.MEAN)
        output_context.set_non_tensor_output(key1, value1)
        return (train_op, loss)

      def step_fn(output_context, inputs):
        (train_op, loss) = distribution.call_for_each_replica(
            model_fn, args=(output_context, inputs))
        output_context.set_last_step_output(
            name="cross_replica_loss_reduced",
            output=loss,
            reduce_op=reduce_util.ReduceOp.MEAN)
        output_context.set_last_step_output(
            name="cross_replica_loss_not_reduced",
            output=loss)
        return distribution.group(train_op)

      iterator = self._get_iterator(
          distribution.distribute_dataset(dataset_fn))

      def run_step():
        initial_loss = lambda: constant_op.constant(1e7)
        # Initial values corresponding to reduced losses are just single
        # tensors. But for non reduced losses, we need to have initial
        # values that are of the same structure as non reduced losses. In
        # MirroredStrategy, this will be a list of losses, in TPUStrategy
        # it will be single tensor. Using `broadcast` followed by `unwrap`
        # gives us the desired initial value structure.
        initial_loop_values = {
            "replica_loss_reduced": initial_loss(),
            "cross_replica_loss_reduced": initial_loss(),
            "cross_replica_loss_not_reduced":
            distribution.unwrap(distribution.broadcast(initial_loss()))
        }
        ctx = distribution.run_steps_on_dataset(
            step_fn, iterator, iterations=2,
            initial_loop_values=initial_loop_values)

        self.assertEqual({key1: (value1,)}, ctx.non_tensor_outputs)
        self._verify_loss_output(
            initial_loss(),
            loss_output=ctx.last_step_outputs["replica_loss_reduced"],
            reduced=True, distribution=distribution)
        self._verify_loss_output(
            initial_loss(),
            loss_output=ctx.
            last_step_outputs["cross_replica_loss_reduced"],
            reduced=True, distribution=distribution)
        self._verify_loss_output(
            initial_loss(),
            loss_output=ctx.
            last_step_outputs["cross_replica_loss_not_reduced"],
            reduced=False, distribution=distribution)
        return (ctx.run_op, ctx.last_step_outputs["replica_loss_reduced"])

      self.evaluate(distribution.initialize())
      if not context.executing_eagerly():
        with self.cached_session() as sess:
          run_step = sess.make_callable(run_step())
      self.evaluate(variables_lib.global_variables_initializer())

      weights, biases, losses = [], [], []
      for _ in range(5):
        _, loss = run_step()
        losses.append(loss)
        weights.append(self.evaluate(layer.kernel))
        biases.append(self.evaluate(layer.bias))

      self.evaluate(distribution.finalize())

      loss_is_not_increasing = all(y <= x for x, y in zip(losses, losses[1:]))
      self.assertTrue(loss_is_not_increasing)

      error = abs(
          numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
      error_is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
      self.assertTrue(error_is_not_increasing)

  def _verify_loss_output(self, initial_loss, loss_output, reduced,
                          distribution):
    """Asserts `loss_output` matches `initial_loss` in dtype and shape.

    Args:
      initial_loss: Tensor whose dtype and shape are the reference.
      loss_output: The last-step output to check.
      reduced: Whether `loss_output` was recorded with a reduce op. If so it
        must unwrap to exactly one value; otherwise it must unwrap to one
        value per replica and is mean-reduced before the comparison.
      distribution: The strategy used to unwrap / reduce `loss_output`.
    """
    if not reduced:
      self.assertLen(distribution.unwrap(loss_output),
                     distribution.num_replicas_in_sync)
      loss_tensor = distribution.reduce(reduce_util.ReduceOp.MEAN, loss_output)
    else:
      unwrapped_output = distribution.unwrap(loss_output)
      self.assertLen(unwrapped_output, 1)
      loss_tensor = unwrapped_output[0]
    self.assertEqual(initial_loss.dtype, loss_tensor.dtype)
    self.assertEqual(initial_loss.shape, loss_tensor.shape)
class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
  """Training-step tests written against the tower-based strategy API.

  Each test builds its example inside `distribution.scope()`, distributes the
  dataset, defines `run_step` around `call_for_each_tower`, and in graph mode
  rebinds `run_step` to a session callable before running it.
  """

  @combinations.generate(
      combinations.times(
          combinations.distributions_and_v1_optimizers(),
          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
  def testTrainNetwork(self, distribution, optimizer_fn,
                       use_callable_loss=True):
    """Runs ten steps and checks |kernel + bias - 1| never increases."""
    with distribution.scope():
      model_fn, dataset, layer = minimize_loss_example(
          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)

      iterator = distribution.distribute_dataset(dataset)

      def run_step():
        return distribution.group(
            distribution.call_for_each_tower(
                model_fn, iterator.get_next(), run_concurrently=layer.built))

      if not context.executing_eagerly():
        # Graph mode: build the op once and rebind `run_step` to a callable.
        with self.test_session() as sess:
          run_step = sess.make_callable(run_step())
        self.evaluate(variables_lib.global_variables_initializer())

      weights, biases = [], []
      for _ in range(10):
        run_step()

        weights.append(self.evaluate(distribution.fetch(layer.kernel)))
        biases.append(self.evaluate(distribution.fetch(layer.bias)))

      error = abs(
          numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
      self.assertTrue(is_not_increasing)

  @combinations.generate(
      combinations.times(
          combinations.distributions_and_v1_optimizers() +
          combinations.distributions_and_v2_optimizers(),
          combinations.combine(mode=["graph", "eager"])))
  def testOptimizerInsideModelFn(self, distribution, optimizer_fn):
    """Checks the variable names created when the optimizer is built in model_fn.

    A variable creator records the name of every variable created during one
    step; the recorded set must equal the names expected for the optimizer and
    the number of parameter devices.
    """
    created_variables = []
    trainable_variables = []

    def appending_creator(next_creator, *args, **kwargs):
      """Creates the variable and records its name."""
      v = next_creator(*args, **kwargs)
      created_variables.append(v.name)
      if "trainable" in kwargs and kwargs["trainable"]:
        trainable_variables.append(v.name)
      return v

    # Creator scope needs to be set before it's used inside
    # `distribution.scope`.
    with variable_scope.variable_creator_scope(
        appending_creator), distribution.scope():
      model_fn, dataset, layer = minimize_loss_example(
          optimizer_fn,
          use_bias=True,
          use_callable_loss=True,
          create_optimizer_inside_model_fn=True)

      iterator = distribution.distribute_dataset(dataset)

      def run_step():
        return distribution.group(
            distribution.call_for_each_tower(
                model_fn, iterator.get_next(), run_concurrently=layer.built))

      if not context.executing_eagerly():
        with self.test_session() as sess:
          run_step = sess.make_callable(run_step())
        self.evaluate(variables_lib.global_variables_initializer())

      run_step()

      def get_expected_variables(optimizer_fn, num_parameter_devices):
        """Returns the expected variable names, including replica copies."""
        variables_map = {
            "GradientDescent": ["dense/kernel", "dense/bias"],
            "Adam": [
                "dense/kernel", "dense/bias", "beta1_power", "beta2_power",
                "dense/kernel/Adam", "dense/kernel/Adam_1",
                "dense/bias/Adam", "dense/bias/Adam_1"
            ]
        }
        variables = variables_map[optimizer_fn().get_name()]
        # The comprehension is fully evaluated before `extend` mutates the
        # list, so only the base names receive "/replica_N" suffixes.
        variables.extend([
            v + "/replica_{}".format(replica)
            for v in variables
            for replica in range(1, num_parameter_devices)
        ])
        return set([v + ":0" for v in variables])

      self.assertEqual(
          get_expected_variables(optimizer_fn,
                                 len(distribution.parameter_devices)),
          set(created_variables))

  @combinations.generate(
      combinations.times(
          combinations.distributions_and_v1_optimizers(),
          combinations.combine(
              mode=["graph", "eager"],
              momentum=[0.8, 0.9, 0.99],
              renorm=[False, True])))
  def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
                                    renorm):
    """Verifies that moving mean updates are reduced across towers."""
    with distribution.scope():
      num_towers = len(distribution.worker_devices)
      model_fn, dataset, batchnorm = batchnorm_example(
          optimizer_fn,
          batch_per_epoch=num_towers,
          momentum=momentum,
          renorm=renorm)

      # Disable prefetching since that makes the specific input on each device
      # to be non deterministic, and this test relies on specific input being
      # on each device.
      if isinstance(distribution, mirrored_strategy.MirroredStrategy):
        distribution._prefetch_on_device = False
      iterator = distribution.distribute_dataset(dataset)

      def run_step():
        # Group the per-tower results together with the UPDATE_OPS collection.
        return control_flow_ops.group(
            distribution.unwrap(
                distribution.call_for_each_tower(
                    model_fn,
                    iterator.get_next(),
                    run_concurrently=batchnorm.built)) +
            ops.get_collection(ops.GraphKeys.UPDATE_OPS))

      if not context.executing_eagerly():
        with self.test_session() as sess:
          run_step = sess.make_callable(run_step())
        self.evaluate(variables_lib.global_variables_initializer())

      expected_moving_means = [0.] * 8

      def averaged_batch_mean(i):
        # Each batch has shape [16, 8] where the ith element in jth list is
        # (8 * j + i + tower_id * 100). So the batch mean in each tower is
        # (60 + i + tower_id * 100). So here comes its batch mean over all
        # towers:
        return 60. + i + (num_towers - 1.) / 2. * 100.

      for _ in range(10):
        run_step()
        moving_means = self.evaluate(
            distribution.fetch(batchnorm.moving_mean))

        # We make sure that the moving_mean is updated as if the sample mean is
        # calculated over all towers.
        for i, expected_moving_mean in enumerate(
            expected_moving_means):
          expected_moving_means[i] -= (
              (expected_moving_mean - averaged_batch_mean(i)) *
              (1.0 - momentum))
          self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)

  @combinations.generate(
      combinations.times(
          combinations.combine(
              distribution=[
                  combinations.one_device_strategy,
                  combinations.mirrored_strategy_with_gpu_and_cpu,
                  combinations.mirrored_strategy_with_two_gpus
              ],
              optimizer_fn=[
                  combinations.gradient_descent_optimizer_v1_fn,
                  combinations.gradient_descent_optimizer_v2_fn
              ],
              loss_reduction=[
                  losses_impl.Reduction.SUM,
                  losses_impl.Reduction.MEAN,
                  losses_impl.Reduction.SUM_OVER_BATCH_SIZE,
                  losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS
              ]),
          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
  def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
                    use_callable_loss):
    """Checks the weight after one step for sum versus mean loss reductions."""
    with distribution.scope():
      all_vars = []

      def model_fn(x, y):

        def loss_fn():
          # Use fixed initialization to make the steps deterministic.
          w = variable_scope.get_variable("w", initializer=[[2.]])
          all_vars.append(w)
          predict = math_ops.matmul(x, w)
          return losses_impl.mean_squared_error(
              y, predict, reduction=loss_reduction)

        optimizer = optimizer_fn()  # GradientDescent with 0.2 learning rate

        if use_callable_loss:
          return optimizer.minimize(loss_fn)
        else:
          return optimizer.minimize(loss_fn())

      features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
      labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
      dataset = dataset_ops.Dataset.zip((features, labels)).repeat()
      iterator = distribution.distribute_dataset(dataset)

      def run_step():
        return distribution.group(
            distribution.call_for_each_tower(
                model_fn, *iterator.get_next(), run_concurrently=False))

      if not context.executing_eagerly():
        with self.test_session() as sess:
          run_step = sess.make_callable(run_step())
        self.evaluate(variables_lib.global_variables_initializer())

      run_step()

      # One variable is recorded per tower, and all must be the same object.
      self.assertEqual(distribution.num_towers, len(all_vars))
      v = all_vars[0]
      self.assertTrue(all([v is vi for vi in all_vars[1:]]))
      weight = numpy.squeeze(self.evaluate(distribution.fetch(v)))
      # Our model is:
      #   predict = x * w
      #   loss = (predict - y)^2
      #   dloss/dpredict = 2*(predict - y)
      #   dloss/dw = 2 * x^T @ (predict - y)
      # For our batch size of 2, assuming sum loss reduction:
      #   x = [2, 7]
      #   y = [6, 21]
      #   w_initial = 2
      #   predict = [4, 14]
      #   predict - y = [-2, -7]
      #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
      # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
      # with sum loss reduction, or 10.6 with mean.
      if loss_reduction == losses_impl.Reduction.SUM:
        # Note that the "distribution.num_towers" factor will go away once
        # we split the input across towers, instead of pulling a complete
        # batch of input per tower.
        self.assertNear(weight, 2 + 21.2 * distribution.num_towers, 0.0001)
      else:
        # One of the mean loss reductions.
        self.assertNear(weight, 2 + 10.6, 0.0001)
def test_overlapping_keys(self):
  """Expects `times` to raise ValueError when its inputs share keys."""
  graph_losses = combinations.combine(
      mode=["graph"], loss=["callable", "tensor"])
  eager_losses = combinations.combine(mode=["eager"], loss=["callable"])

  # Both inputs define "mode" and "loss", so their product is rejected.
  with self.assertRaisesRegexp(ValueError, ".*Keys.+overlap.+"):
    combinations.times(graph_losses, eager_losses)
def all_strategy_and_input_config_combinations():
  """Returns every strategy paired with each eager and graph configuration.

  The eager-mode configurations come first within each strategy, followed by
  the graph-mode ones.
  """
  distributions = combinations.combine(distribution=all_strategies)
  configurations = (
      eager_mode_test_configuration() + graph_mode_test_configuration())
  return combinations.times(distributions, configurations)
def all_strategy_and_input_config_combinations():
  """Returns every strategy paired with each input configuration.

  The input configurations cover both execution modes, both `use_numpy`
  values and both `use_validation_data` values.
  """
  distributions = combinations.combine(distribution=all_strategies)
  input_configs = combinations.combine(
      mode=['graph', 'eager'],
      use_numpy=[True, False],
      use_validation_data=[True, False])
  return combinations.times(distributions, input_configs)
def all_strategy_and_input_config_combinations():
  """Builds the product of all strategies and all test configurations.

  Returns:
    Each strategy in `all_strategies` combined with the eager-mode test
    configurations and then the graph-mode test configurations.
  """
  per_strategy = combinations.combine(distribution=all_strategies)
  per_configuration = eager_mode_test_configuration()
  per_configuration = per_configuration + graph_mode_test_configuration()
  return combinations.times(per_strategy, per_configuration)