def test_times_variable_arguments(self):
  c1 = combinations.combine(mode=["graph", "eager"])
  c2 = combinations.combine(optimizer=["adam", "gd"])
  c3 = combinations.combine(distribution=["d1", "d2"])
  c4 = combinations.times(c3, c1, c2)
  self.assertEqual([
      OrderedDict([("distribution", "d1"), ("mode", "graph"),
                   ("optimizer", "adam")]),
      OrderedDict([("distribution", "d1"), ("mode", "graph"),
                   ("optimizer", "gd")]),
      OrderedDict([("distribution", "d1"), ("mode", "eager"),
                   ("optimizer", "adam")]),
      OrderedDict([("distribution", "d1"), ("mode", "eager"),
                   ("optimizer", "gd")]),
      OrderedDict([("distribution", "d2"), ("mode", "graph"),
                   ("optimizer", "adam")]),
      OrderedDict([("distribution", "d2"), ("mode", "graph"),
                   ("optimizer", "gd")]),
      OrderedDict([("distribution", "d2"), ("mode", "eager"),
                   ("optimizer", "adam")]),
      OrderedDict([("distribution", "d2"), ("mode", "eager"),
                   ("optimizer", "gd")])
  ], c4)
  self.assertEqual(
      combinations.combine(
          mode=["graph", "eager"],
          optimizer=["adam", "gd"],
          distribution=["d1", "d2"]), c4)
def test_add(self):
  self.assertEqual(
      [{"a": 1}, {"a": 2}, {"b": 2}, {"b": 3}],
      combinations.combine(a=[1, 2]) + combinations.combine(b=[2, 3]))
def strategy_and_input_combinations():
  return (
      combinations.times(
          combinations.combine(distribution=strategies_minus_tpu),
          combinations.combine(
              mode=['graph'],
              use_numpy=[True, False],
              use_validation_data=[True, False]) +
          combinations.combine(
              mode=['eager'], use_numpy=[False],
              use_validation_data=[False])) +
      combinations.times(
          combinations.combine(distribution=tpu_strategies),
          combinations.combine(
              mode=['graph'],
              use_numpy=[True, False],
              use_validation_data=[True, False])))
def tpu_combinations():
  return combinations.combine(
      distribution=[
          strategy_combinations.tpu_strategy_one_step,
          strategy_combinations.tpu_strategy
      ],
      mode=["graph"])
def test_arguments_sorted(self):
  self.assertEqual([
      OrderedDict([("aa", 1), ("ab", 2)]),
      OrderedDict([("aa", 1), ("ab", 3)]),
      OrderedDict([("aa", 2), ("ab", 2)]),
      OrderedDict([("aa", 2), ("ab", 3)])
  ], combinations.combine(ab=[2, 3], aa=[1, 2]))
def test_combinations_for_embedding_model():
  return (
      combinations.times(
          combinations.combine(
              distribution=strategies_for_embedding_models()),
          (graph_mode_test_configuration() +
           eager_mode_test_configuration())))
def all_strategy_minus_default_and_tpu_combinations():
  return combinations.combine(
      distribution=[
          one_device_strategy, one_device_strategy_gpu,
          mirrored_strategy_with_gpu_and_cpu, mirrored_strategy_with_two_gpus
      ],
      mode=["graph", "eager"])
def test_combinations_for_stateful_embedding_model():
  return (
      combinations.combine(
          distribution=strategies_for_stateful_embedding_model(),
          mode='graph',
          use_numpy=False,
          use_validation_data=False))
def test_combine_single_parameter(self):
  self.assertEqual(
      [{"a": 1, "b": 2}, {"a": 2, "b": 2}],
      combinations.combine(a=[1, 2], b=2))
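# A minimal sketch of what `combinations.combine` appears to compute, based
# only on the tests in this section (it is not the TensorFlow
# implementation): the cartesian product of the keyword-argument values,
# with scalar values treated as single-element lists and keys emitted in
# sorted order. `itertools` and `OrderedDict` are needed only for this
# illustration.
import itertools
from collections import OrderedDict


def combine_sketch(**kwargs):
  """Cartesian product of keyword args, mirroring the observed behavior."""
  keys = sorted(kwargs)
  value_lists = [
      kwargs[k] if isinstance(kwargs[k], list) else [kwargs[k]] for k in keys
  ]
  return [
      OrderedDict(zip(keys, values))
      for values in itertools.product(*value_lists)
  ]


# combine_sketch(a=[1, 2], b=2) ==
#     [OrderedDict([("a", 1), ("b", 2)]), OrderedDict([("a", 2), ("b", 2)])]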
def distributions_and_v1_optimizers():
  """A common set of combinations with DistributionStrategies and Optimizers."""
  return combinations.combine(
      distribution=[
          one_device_strategy,
          mirrored_strategy_with_gpu_and_cpu,
          mirrored_strategy_with_two_gpus,
      ],
      optimizer_fn=optimizers_v1)
def all_combinations():
  return combinations.combine(
      distribution=[
          strategy_combinations.default_strategy,
          strategy_combinations.one_device_strategy,
          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      mode=["graph"])
def distributions_and_v2_optimizers():
  """DistributionStrategies and V2 Optimizers."""
  return combinations.combine(
      distribution=[
          strategy_combinations.one_device_strategy,
          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      optimizer_fn=optimizers_v2)
def test_combinations_with_tpu_strategies():
  tpu_strategies = [
      strategy_combinations.tpu_strategy,
      strategy_combinations.tpu_strategy_one_step
  ]
  return (
      combinations.times(
          combinations.combine(distribution=tpu_strategies),
          graph_mode_test_configuration()))
def test_times(self):
  c1 = combinations.combine(mode=["graph"], loss=["callable", "tensor"])
  c2 = combinations.combine(mode=["eager"], loss=["callable"])
  c3 = combinations.combine(distribution=["d1", "d2"])
  c4 = combinations.times(c3, c1 + c2)
  self.assertEqual([
      OrderedDict([("distribution", "d1"), ("loss", "callable"),
                   ("mode", "graph")]),
      OrderedDict([("distribution", "d1"), ("loss", "tensor"),
                   ("mode", "graph")]),
      OrderedDict([("distribution", "d1"), ("loss", "callable"),
                   ("mode", "eager")]),
      OrderedDict([("distribution", "d2"), ("loss", "callable"),
                   ("mode", "graph")]),
      OrderedDict([("distribution", "d2"), ("loss", "tensor"),
                   ("mode", "graph")]),
      OrderedDict([("distribution", "d2"), ("loss", "callable"),
                   ("mode", "eager")])
  ], c4)
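# A companion sketch for `combinations.times`, again inferred from the tests
# above rather than taken from the real implementation: the cartesian
# product of the given combination lists, where each result merges one
# OrderedDict from every operand (the tests assume the operands' keys are
# disjoint). `+` on combination lists is plain list concatenation, as
# test_add shows.
from collections import OrderedDict  # as in the combine sketch above


def times_sketch(*combined):
  """Cartesian product of combination lists, merging one dict per operand."""
  result = combined[0]
  for operand in combined[1:]:
    result = [
        OrderedDict(list(old.items()) + list(new.items()))
        for old in result
        for new in operand
    ]
  return result


# times_sketch(c3, c1 + c2) reproduces the expected list in test_times above.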
def generate_callback_test_function(custom_callable):
  """Generic template for callback tests using mnist synthetic dataset."""

  @combinations.generate(
      combinations.combine(
          mode=['graph'],
          strategy_cls=[collective_strategy.CollectiveAllReduceStrategy],
          required_gpus=[0, 1]))
  def test_template(self, strategy_cls):
    num_workers = 2
    num_epoch = 2

    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
    self._barrier = dc._Barrier(2)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside of a thread."""
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        strategy = get_strategy_object(strategy_cls)
        batch_size = 64
        steps = 2
        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        with strategy.scope():
          model = _get_model((28, 28, 1))
        custom_callable(
            model,
            self,
            train_ds,
            num_epoch,
            steps,
            strategy,
            saving_filepath=kwargs['saving_filepath'])

    # Pass saving_filepath from the parent thread to ensure every worker has
    # the same filepath to save.
    saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint.h5')
    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn, cluster_spec, saving_filepath=saving_filepath)
    if os.path.exists(saving_filepath):
      os.remove(saving_filepath)

    threads_to_join = []
    strategy = get_strategy_object(strategy_cls)
    if strategy.extended.experimental_between_graph:
      for ts in threads.values():
        threads_to_join.extend(ts)
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)

  return test_template
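# Hypothetical usage of the template above (the names below are illustrative,
# not from the source): the returned test method is attached to a test class
# that provides run_multiple_tasks_in_threads and friends, while
# `custom_callable` carries the actual fit/checkpoint assertions.
def _fit_and_check_checkpoint(model, test_obj, train_ds, num_epoch, steps,
                              strategy, saving_filepath):
  # e.g. fit with a ModelCheckpoint callback and assert the file was written.
  pass


class MyCallbackTest(test_base.IndependentWorkerTestBase,
                     parameterized.TestCase):
  test_checkpoint_callback = generate_callback_test_function(
      _fit_and_check_checkpoint)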
def strategy_and_optimizer_combinations():
  return combinations.times(
      all_strategy_combinations(),
      combinations.combine(optimizer=[
          strategy_combinations.adagrad_optimizer_v1_fn,
          strategy_combinations.adagrad_optimizer_keras_v2_fn,
          strategy_combinations.adam_optimizer_v1_fn,
          strategy_combinations.adam_optimizer_keras_v2_fn,
          strategy_combinations.gradient_descent_optimizer_v1_fn,
          strategy_combinations.gradient_descent_optimizer_keras_v2_fn,
          strategy_combinations.rmsprop_optimizer_v1_fn,
          strategy_combinations.rmsprop_optimizer_keras_v2_fn
      ]))
def test_combine(self):
  self.assertEqual(
      [{"a": 1, "b": 2}, {"a": 1, "b": 3}, {"a": 2, "b": 2},
       {"a": 2, "b": 3}],
      combinations.combine(a=[1, 2], b=[2, 3]))
def all_strategy_and_eager_plus_graph():
  return combinations.times(
      combinations.combine(distribution=contrib_mirrored_strategies),
      combinations.combine(mode=["eager", "graph"]))
    with self.name_scope:
      self._layers = [
          keras.layers.Dense(4, name="dense"),
      ]

  @module.Module.with_name_scope
  def __call__(self, x):
    for layer in self._layers:
      x = layer(x)
    return x


@combinations.generate(
    combinations.combine(
        distribution=(strategy_combinations.all_strategies +
                      strategy_combinations.multiworker_strategies),
        mode=["eager"]))
class KerasModelsTest(test.TestCase, parameterized.TestCase):

  def test_single_keras_layer_run(self, distribution):
    dataset = _get_dataset()
    input_iterator = iter(
        distribution.experimental_distribute_dataset(dataset))

    with distribution.scope():
      model = keras.layers.Dense(4, name="dense")

    @def_function.function
    def train_step(iterator):

      def step_fn(inputs):
def all_strategy_combinations_with_graph_mode():
  return combinations.combine(
      distribution=keras_correctness_test_base.all_strategies,
      mode=['graph'])
def graph_mode_test_configuration():
  return combinations.combine(
      mode='graph',
      use_numpy=[True, False],
      use_validation_data=[True, False])
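# For reference, the combine semantics exercised earlier expand the call
# above into four configurations (keys in sorted order), starting with
#   OrderedDict([('mode', 'graph'), ('use_numpy', True),
#                ('use_validation_data', True)])
# followed by the other three True/False pairings of use_numpy and
# use_validation_data.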
class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase):

  def _assign_replica_local(self, v, new):
    for var, n in zip(v, new):
      with ops.device(var.device):
        self.evaluate(var.assign(n))

  def _save_return_saver(self, sess, var):
    saver = saver_lib.Saver(var_list=[var])
    test_dir = self.get_temp_dir()
    prefix = os.path.join(test_dir, "ckpt")
    return saver.save(sess, prefix), saver

  def _save(self, sess, var):
    save_path, _ = self._save_return_saver(sess, var)
    return save_path

  config = config_pb2.ConfigProto()
  config.allow_soft_placement = True

  @test_util.run_in_graph_and_eager_modes(config=config)
  def testProperties(self):
    if context.num_gpus() < 1 and context.executing_eagerly():
      self.skipTest("A GPU is not available for this test in eager mode.")
    v, replica_local = _make_replica_local(
        variable_scope.VariableAggregation.SUM)

    self.assertEqual(v[0].constraint, replica_local.constraint)
    self.assertEqual(v[0].name, replica_local.name)
    self.assertEqual(v[0].dtype, replica_local.dtype)
    self.assertEqual(v[0].shape, replica_local.shape)
    self.assertEqual(variable_scope.VariableAggregation.SUM,
                     replica_local.aggregation)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_gpu_and_cpu
          ],
          mode=["eager"]))
  def testCanPassToDefFun(self, distribution):

    @def_function.function
    def add1(x):
      return x + 1.

    with distribution.scope():
      v = variables_lib.Variable(
          1.,
          aggregation=variables_lib.VariableAggregation.MEAN,
          synchronization=variables_lib.VariableSynchronization.ON_READ)

    self.assertEqual(2., self.evaluate(add1(v)))

  @combinations.generate(mirrored_and_tpu_strategy_combinations())
  def testTensorConversion(self, distribution):
    with context.graph_mode():
      _, replica_local = _make_replica_local(
          variable_scope.VariableAggregation.SUM, distribution)
      converted = ops.convert_to_tensor(replica_local, as_ref=False)
      self.assertIsInstance(converted, ops.Tensor)
      self.assertEqual(converted.dtype, replica_local.dtype)

      converted = ops.convert_to_tensor(replica_local, as_ref=True)
      # Resource variables are converted to tensors as well when as_ref is
      # True.
      self.assertIsInstance(converted, ops.Tensor)
      self.assertEqual(converted.dtype, replica_local.dtype)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
              strategy_combinations.mirrored_strategy_with_two_gpus_no_merge_call,
              strategy_combinations.tpu_strategy,
              strategy_combinations.tpu_strategy_packed_var,
          ],
          mode=["eager"]))
  def testValueInCrossReplicaContext(self, distribution):
    value_list, replica_local = _make_replica_local(
        variable_scope.VariableAggregation.ONLY_FIRST_REPLICA, distribution)

    self.assertIsInstance(replica_local.value(), ops.Tensor)
    self.assertEqual(self.evaluate(replica_local.value()),
                     self.evaluate(value_list[0].value()))

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
              strategy_combinations.tpu_strategy_packed_var,
          ],
          mode=["eager"]))
  def testValueInDefaultReplicaContext(self, distribution):
    with distribution.scope():
      v1 = variables_lib.Variable(
          0.0,
          aggregation=variables_lib.VariableAggregation.SUM,
          synchronization=variables_lib.VariableSynchronization.ON_READ)
      v2 = variables_lib.Variable(
          0.0,
          aggregation=variables_lib.VariableAggregation.SUM,
          synchronization=variables_lib.VariableSynchronization.ON_READ)

    @def_function.function
    def replica_fn():
      v1.assign_add(1.0)
      v2.assign_add(2.0)

    distribution.run(replica_fn)
    sum_v = v1 + v2
    self.assertEqual(sum_v, 6.0)

  @combinations.generate(mirrored_and_tpu_strategy_combinations())
  def testSaveAndRestoreReplicaLocalSumOneGraph(self, distribution):
    with self.cached_session() as sess:
      v, replica_local = _make_replica_local(
          variable_scope.VariableAggregation.SUM, distribution)

      # Overwrite the initial values.
      self._assign_replica_local(v, [3., 4.])

      with distribution.scope():
        # Saves the current value of v[0] + v[1], 7.
        save_path, saver = self._save_return_saver(sess, replica_local)

        # Change the values between save and restore.
        self._assign_replica_local(v, [5., 6.])

        # Restores the saved value of 7. which gets divided equally
        # between the variables.
        saver.restore(sess, save_path)
        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))

  @combinations.generate(mirrored_and_tpu_strategy_combinations())
  def testSaveAndRestoreReplicaLocalMeanOneGraph(self, distribution):
    if context.num_gpus() < 1 and context.executing_eagerly():
      self.skipTest("A GPU is not available for this test in eager mode.")

    with self.cached_session() as sess:
      v, replica_local = _make_replica_local(
          variable_scope.VariableAggregation.MEAN, distribution)

      # Overwrite the initial values.
      self._assign_replica_local(v, [3., 4.])

      with distribution.scope():
        # Saves the current value of (v[0] + v[1])/2, 3.5.
        save_path, saver = self._save_return_saver(sess, replica_local)

        # Change the values between save and restore.
        self._assign_replica_local(v, [5., 6.])

        # Restores the saved value of 3.5 to both variables.
        saver.restore(sess, save_path)
        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))

  def _save_replica_local_mean(self, distribution):
    """Save variables with mirroring, returns save_path."""
    with self.session(graph=ops.Graph()) as sess:
      v, replica_local = _make_replica_local(
          variable_scope.VariableAggregation.MEAN, distribution)

      # Overwrite the initial values.
      self._assign_replica_local(v, [3., 4.])

      with distribution.scope():
        # Saves the current value of (v[0] + v[1])/2, 3.5
        save_path = self._save(sess, replica_local)

        # Change the values between save and restore.
        self._assign_replica_local(v, [5., 6.])
    return save_path

  def _save_replica_local_sum(self, distribution):
    """Save variables with mirroring, returns save_path."""
    with self.session(graph=ops.Graph()) as sess:
      v, replica_local = _make_replica_local(
          variable_scope.VariableAggregation.SUM, distribution)

      # Overwrite the initial values.
      self._assign_replica_local(v, [1.5, 2.])

      with distribution.scope():
        # Saves the current value of v[0] + v[1], 3.5
        save_path = self._save(sess, replica_local)

        # Change the values between save and restore.
        self._assign_replica_local(v, [5., 6.])
    return save_path

  def _save_normal(self):
    """Save variables without mirroring, returns save_path."""
    with self.session(graph=ops.Graph()) as sess:
      var = variable_scope.get_variable(
          name="v", initializer=1., use_resource=True)

      # Overwrite the initial value.
      self.evaluate(var.assign(3.5))

      # Saves the current value of var, 3.5.
      save_path = self._save(sess, var)

      # Change the values between save and restore.
      self.evaluate(var.assign(5.))
    return save_path

  def _restore_normal(self, save_path):
    """Restore to variables without mirroring in a fresh graph."""
    with self.session(graph=ops.Graph()) as sess:
      var = variable_scope.get_variable(
          name="v", initializer=7., use_resource=True)

      # Overwrite the initial value.
      self.evaluate(var.assign(8.))

      # Restores the saved value of 3.5 to `var`.
      saver = saver_lib.Saver(var_list=[var])
      saver.restore(sess, save_path)
      self.assertEqual(3.5, self.evaluate(var))

  def _restore_replica_local_mean(self, save_path, distribution):
    """Restore to variables with mirroring in a fresh graph."""
    with self.session(graph=ops.Graph()) as sess:
      v, replica_local = _make_replica_local(
          variable_scope.VariableAggregation.MEAN, distribution)

      # Overwrite the initial values.
      self._assign_replica_local(v, [7., 8.])

      with distribution.scope():
        # Restores the saved value of 3.5 to both variables.
        saver = saver_lib.Saver(var_list=[replica_local])
        saver.restore(sess, save_path)
        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))

  def _restore_replica_local_sum(self, save_path, distribution):
    """Restore to variables with mirroring in a fresh graph."""
    with self.session(graph=ops.Graph()) as sess:
      v, replica_local = _make_replica_local(
          variable_scope.VariableAggregation.SUM, distribution)

      # Overwrite the initial values.
      self._assign_replica_local(v, [7., 8.])

      with distribution.scope():
        # Restores the saved value of 3.5, divided equally between the
        # variables.
        saver = saver_lib.Saver(var_list=[replica_local])
        saver.restore(sess, save_path)
        self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]]))

  @combinations.generate(mirrored_and_tpu_strategy_combinations())
  def testSaveReplicaLocalRestoreReplicaLocalMean(self, distribution):
    save_path = self._save_replica_local_mean(distribution)
    self._restore_replica_local_mean(save_path, distribution)

  @combinations.generate(mirrored_and_tpu_strategy_combinations())
  def testSaveReplicaLocalRestoreReplicaLocalSum(self, distribution):
    save_path = self._save_replica_local_sum(distribution)
    self._restore_replica_local_sum(save_path, distribution)

  @combinations.generate(mirrored_and_tpu_strategy_combinations())
  def testSaveReplicaLocalMeanRestoreNormal(self, distribution):
    save_path = self._save_replica_local_mean(distribution)
    self._restore_normal(save_path)

  @combinations.generate(mirrored_and_tpu_strategy_combinations())
  def testSaveReplicaLocalSumRestoreNormal(self, distribution):
    save_path = self._save_replica_local_sum(distribution)
    self._restore_normal(save_path)

  @combinations.generate(mirrored_and_tpu_strategy_combinations())
  def testSaveNormalRestoreReplicaLocalMean(self, distribution):
    save_path = self._save_normal()
    self._restore_replica_local_mean(save_path, distribution)

  @combinations.generate(mirrored_and_tpu_strategy_combinations())
  def testSaveNormalRestoreReplicaLocalSum(self, distribution):
    save_path = self._save_normal()
    self._restore_replica_local_sum(save_path, distribution)
class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      {
          'use_separable_conv': True,
          'build_anchor_boxes': True,
          'is_training': False,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': True,
          'is_training': False,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': False,
          'is_training': False,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': False,
          'is_training': True,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': True,
          'is_training': True,
          'has_att_heads': True
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': True,
          'is_training': False,
          'has_att_heads': True
      },
  )
  def test_build_model(self, use_separable_conv, build_anchor_boxes,
                       is_training, has_att_heads):
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    fpn_num_filters = 256
    head_num_convs = 4
    head_num_filters = 256
    num_anchors_per_location = num_scales * len(aspect_ratios)
    image_size = 384
    images = np.random.rand(2, image_size, image_size, 3)
    image_shape = np.array([[image_size, image_size],
                            [image_size, image_size]])

    if build_anchor_boxes:
      anchor_boxes = anchor.Anchor(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size,
          image_size=(image_size, image_size)).multilevel_boxes
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
    else:
      anchor_boxes = None

    if has_att_heads:
      attribute_heads = [dict(name='depth', type='regression', size=1)]
    else:
      attribute_heads = None

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level,
        num_filters=fpn_num_filters,
        use_separable_conv=use_separable_conv)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        attribute_heads=attribute_heads,
        num_anchors_per_location=num_anchors_per_location,
        use_separable_conv=use_separable_conv,
        num_convs=head_num_convs,
        num_filters=head_num_filters)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)

    _ = model(images, image_shape, anchor_boxes, training=is_training)

  @combinations.generate(
      combinations.combine(
          strategy=[
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          image_size=[(128, 128)],
          training=[True, False],
          has_att_heads=[True, False],
          output_intermediate_features=[True, False],
      ))
  def test_forward(self, strategy, image_size, training, has_att_heads,
                   output_intermediate_features):
    """Test for creation of a R50-FPN RetinaNet."""
    tf.keras.backend.set_image_data_format('channels_last')
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)
    images = np.random.rand(2, image_size[0], image_size[1], 3)
    image_shape = np.array([[image_size[0], image_size[1]],
                            [image_size[0], image_size[1]]])
    with strategy.scope():
      anchor_gen = anchor.build_anchor_generator(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=3)
      anchor_boxes = anchor_gen(image_size)
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
      backbone = resnet.ResNet(model_id=50)
      decoder = fpn.FPN(
          input_specs=backbone.output_specs,
          min_level=min_level,
          max_level=max_level)
      if has_att_heads:
        attribute_heads = [dict(name='depth', type='regression', size=1)]
      else:
        attribute_heads = None
      head = dense_prediction_heads.RetinaNetHead(
          min_level=min_level,
          max_level=max_level,
          num_classes=num_classes,
          attribute_heads=attribute_heads,
          num_anchors_per_location=num_anchors_per_location)
      generator = detection_generator.MultilevelDetectionGenerator(
          max_num_detections=10, nms_version='v1')
      model = retinanet_model.RetinaNetModel(
          backbone=backbone,
          decoder=decoder,
          head=head,
          detection_generator=generator)

      model_outputs = model(
          images,
          image_shape,
          anchor_boxes,
          output_intermediate_features=output_intermediate_features,
          training=training)

    if training:
      cls_outputs = model_outputs['cls_outputs']
      box_outputs = model_outputs['box_outputs']
      for level in range(min_level, max_level + 1):
        self.assertIn(str(level), cls_outputs)
        self.assertIn(str(level), box_outputs)
        self.assertAllEqual([
            2, image_size[0] // 2**level, image_size[1] // 2**level,
            num_classes * num_anchors_per_location
        ], cls_outputs[str(level)].numpy().shape)
        self.assertAllEqual([
            2, image_size[0] // 2**level, image_size[1] // 2**level,
            4 * num_anchors_per_location
        ], box_outputs[str(level)].numpy().shape)
        if has_att_heads:
          att_outputs = model_outputs['attribute_outputs']
          for att in att_outputs.values():
            self.assertAllEqual([
                2, image_size[0] // 2**level, image_size[1] // 2**level,
                1 * num_anchors_per_location
            ], att[str(level)].numpy().shape)
    else:
      self.assertIn('detection_boxes', model_outputs)
      self.assertIn('detection_scores', model_outputs)
      self.assertIn('detection_classes', model_outputs)
      self.assertIn('num_detections', model_outputs)
      self.assertAllEqual([2, 10, 4],
                          model_outputs['detection_boxes'].numpy().shape)
      self.assertAllEqual([2, 10],
                          model_outputs['detection_scores'].numpy().shape)
      self.assertAllEqual([2, 10],
                          model_outputs['detection_classes'].numpy().shape)
      self.assertAllEqual([2],
                          model_outputs['num_detections'].numpy().shape)
      if has_att_heads:
        self.assertIn('detection_attributes', model_outputs)
        self.assertAllEqual(
            [2, 10, 1],
            model_outputs['detection_attributes']['depth'].numpy().shape)
    if output_intermediate_features:
      for l in range(2, 6):
        self.assertIn('backbone_{}'.format(l), model_outputs)
        self.assertAllEqual([
            2, image_size[0] // 2**l, image_size[1] // 2**l,
            backbone.output_specs[str(l)].as_list()[-1]
        ], model_outputs['backbone_{}'.format(l)].numpy().shape)
      for l in range(min_level, max_level + 1):
        self.assertIn('decoder_{}'.format(l), model_outputs)
        self.assertAllEqual([
            2, image_size[0] // 2**l, image_size[1] // 2**l,
            decoder.output_specs[str(l)].as_list()[-1]
        ], model_outputs['decoder_{}'.format(l)].numpy().shape)

  def test_serialize_deserialize(self):
    """Validate the network can be serialized and deserialized."""
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        num_anchors_per_location=num_anchors_per_location)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=3)

    config = model.get_config()
    new_model = retinanet_model.RetinaNetModel.from_config(config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the
    # old.
    self.assertAllEqual(model.get_config(), new_model.get_config())
def all_strategy_combinations():
  return combinations.combine(
      distribution=[
          strategy_combinations.default_strategy,
          strategy_combinations.cloud_tpu_strategy,
          strategy_combinations.one_device_strategy_gpu,
      ])
class OpCancellationTest(test.TestCase, parameterized.TestCase):

  def setUp(self):
    _setup_context()
    super().setUp()

  @combinations.generate(
      combinations.times(
          combinations.combine(
              collective_op=[
                  combinations.NamedObject('all_reduce',
                                           CollectiveOpsV1.all_reduce),
                  combinations.NamedObject('all_reduce_v2',
                                           CollectiveOpsV2.all_reduce),
                  combinations.NamedObject('all_gather',
                                           CollectiveOpsV1.all_gather),
                  combinations.NamedObject('all_gather_v2',
                                           CollectiveOpsV2.all_gather),
              ],
              mode='eager'), device_combination))
  def testOpErrorNotAbortIfNoCollective(self, collective_op, device,
                                        communication):
    # Do not abort if there are no active collective ops. There could be
    # exceptions like EOF which we expect users to catch; aborting collective
    # ops on all op errors would interfere with this workflow.
    dev0 = '/device:%s:0' % device
    dev1 = '/device:%s:1' % device
    group_size = 2
    group_key = 100
    instance_key = 100
    dataset = dataset_ops.Dataset.from_tensors([1.])

    @def_function.function
    def collective_fn(in_tensor):
      for device in [dev0, dev1]:
        with ops.device(device):
          collective_op(
              in_tensor,
              group_size,
              group_key,
              instance_key,
              communication_hint=communication)

    @def_function.function
    def f():
      iterator = iter(dataset)
      collective_fn(next(iterator))
      # This next(iterator) should raise EOF.
      collective_fn(next(iterator))

    with self.assertRaises(errors.OutOfRangeError):
      f()
    collective_fn(constant_op.constant([1.]))

  @combinations.generate(
      combinations.times(
          combinations.combine(
              collective_op=[
                  combinations.NamedObject('all_reduce',
                                           CollectiveOpsV1.all_reduce),
                  combinations.NamedObject('all_gather',
                                           CollectiveOpsV1.all_gather),
              ],
              mode='eager'), device_combination))
  def testOpErrorAbortWithCollective(self, collective_op, device,
                                     communication):
    # Abort v1 collective ops if there are active collective ops at the time
    # of an op error. This is due to the inability to cancel collective ops,
    # and op errors may cause running collective ops to hang.
    dev0 = '/device:%s:0' % device
    group_size = 2
    group_key = 100
    instance_key = 100
    in_tensor = constant_op.constant([1.])
    # Make the dataset sleep a while so that the collective is being executed
    # when the EOF happens.
    dataset = dataset_ops.Dataset.from_tensors([1.]).apply(
        dataset_testing.sleep(sleep_microseconds=200))

    @def_function.function
    def f():
      # Launch a collective op that won't be able to finish, to test abortion
      # when other ops error.
      with ops.device(dev0):
        ret = collective_op(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            communication_hint=communication)
      iterator = iter(dataset)
      next(iterator)
      # This should raise EOF.
      next(iterator)
      return ret

    with self.assertRaises(errors.OutOfRangeError):
      f()
    # Now that collective ops are aborted, subsequent collective ops should
    # fail with the previous error.
    with self.assertRaises(errors.CancelledError):
      with ops.device(dev0):
        collective_op(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            communication_hint=communication)

  @combinations.generate(
      combinations.times(
          combinations.combine(
              collective_op=[
                  combinations.NamedObject('all_reduce_v2',
                                           CollectiveOpsV2.all_reduce),
                  combinations.NamedObject('all_gather_v2',
                                           CollectiveOpsV2.all_gather),
              ],
              mode='eager'), device_combination))
  def testOpErrorNotAbortWithCollective(self, collective_op, device,
                                        communication):
    # Do not abort v2 collective ops even if there are active collective ops
    # at the time of an op error. We rely on cancellation to terminate active
    # collective ops.
    dev0 = '/device:%s:0' % device
    dev1 = '/device:%s:1' % device
    group_size = 2
    group_key = 100
    instance_key = 100
    in_tensor = constant_op.constant([1.])

    @def_function.function
    def collective_fn():
      for device in [dev0, dev1]:
        with ops.device(device):
          collective_op(
              in_tensor,
              group_size,
              group_key,
              instance_key,
              communication_hint=communication)

    # Local params resolution cannot be cancelled yet, so we perform a normal
    # collective so that the group is resolved.
    collective_fn()

    # Make the dataset sleep a while so that the collective is being executed
    # when the EOF happens.
    dataset = dataset_ops.Dataset.from_tensors([1.]).apply(
        dataset_testing.sleep(sleep_microseconds=200))

    @def_function.function
    def f():
      # Launch a collective op that won't be able to finish, to test
      # cancellation when other ops error.
      with ops.device(dev0):
        ret = collective_op(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            communication_hint=communication)
      iterator = iter(dataset)
      next(iterator)
      # This should raise EOF.
      next(iterator)
      return ret

    with self.assertRaises(errors.OutOfRangeError):
      f()
    # Collective ops shouldn't be aborted, and new collectives should be able
    # to proceed.
    collective_fn()

  @combinations.generate(
      combinations.times(
          combinations.combine(
              collective_op=[
                  combinations.NamedObject('all_reduce_v2',
                                           CollectiveOpsV2.all_reduce),
                  combinations.NamedObject('all_gather_v2',
                                           CollectiveOpsV2.all_gather),
              ],
              mode='eager'), device_combination))
  def testCancelDuringParamResolution(self, collective_op, device,
                                      communication):
    dev0 = '/device:%s:0' % device
    dev1 = '/device:%s:1' % device
    group_size = 2
    group_key = 100
    instance_key = 100
    in_tensor = constant_op.constant([1.])
    t1_cancellation_manager = cancellation.CancellationManager()
    t2_cancellation_manager = cancellation.CancellationManager()

    @def_function.function
    def _collective_fn(x):
      # Run an assertion to crash one of the two function executions running
      # collectives. We explicitly cancel the other in response.
      assert_op = check_ops.assert_equal(x, in_tensor)
      with ops.control_dependencies([assert_op]):
        return collective_op(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            communication_hint=communication)

    collective_concrete = _collective_fn.get_concrete_function(in_tensor)

    finish_mu = threading.Lock()
    finishes = 0

    def _placement_wrapper(device, x, my_cancellation, other_cancellation):
      try:
        with ops.device(device):
          cancelable_collective = my_cancellation.get_cancelable_function(
              collective_concrete)
          return cancelable_collective(x)
      except errors.InvalidArgumentError:
        # `assert_equal` failed for this execution of the function. The other
        # function would deadlock without cancellation.
        other_cancellation.start_cancel()
      except errors.CancelledError:
        pass
      nonlocal finishes
      with finish_mu:
        finishes += 1

    t1 = threading.Thread(
        target=_placement_wrapper,
        args=(dev0, constant_op.constant([1.]), t1_cancellation_manager,
              t2_cancellation_manager))
    t2 = threading.Thread(
        target=_placement_wrapper,
        # Will cause the assertion to fail.
        args=(dev1, constant_op.constant([2.]), t2_cancellation_manager,
              t1_cancellation_manager))
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    self.assertEqual(finishes, 2)
class DefaultDistributionStrategyTest(test.TestCase, parameterized.TestCase):

  def testMergeCall(self):
    _assert_in_default_state(self)

    def merge_fn(dist, s):
      self.assertIs(ds_context._get_default_strategy(), dist)
      self.assertIs(None, ds_context.get_replica_context())
      self.assertIs(dist, ds_context.get_cross_replica_context())
      self.assertTrue(ds_context.in_cross_replica_context())
      self.assertIs(dist, ds_context.get_strategy())
      self.assertFalse(ds_context.has_strategy())
      return "foo_" + s

    replica_ctx = ds_context.get_replica_context()
    self.assertIs(ds_context._get_default_replica_context(), replica_ctx)
    self.assertEqual("foo_bar",
                     replica_ctx.merge_call(merge_fn, args=("bar",)))
    _assert_in_default_state(self)

  def testScopeMostlyNoOp(self):
    _assert_in_default_state(self)

    test_strategy = _TestStrategy2()
    with test_strategy.scope():
      variable_scope.variable(1.0, name="before")

    default_strategy = ds_context._get_default_strategy()
    scope = default_strategy.scope()
    with scope:
      _assert_in_default_state(self)

      with test_strategy.scope():
        with self.assertRaisesRegexp(
            RuntimeError, "Mixing different tf.distribute.Strategy objects"):
          variable_scope.variable(1.0, name="error")

      with scope:
        _assert_in_default_state(self)

        with test_strategy.scope():
          with self.assertRaisesRegexp(
              RuntimeError,
              "Mixing different tf.distribute.Strategy objects"):
            variable_scope.variable(1.0, name="also_error")

      _assert_in_default_state(self)

    _assert_in_default_state(self)
    with test_strategy.scope():
      variable_scope.variable(1.0, name="after")

  def testExperimentalRunV2(self):
    default_strategy = ds_context._get_default_strategy()
    dataset = dataset_ops.Dataset.range(10).batch(2)
    iterator = default_strategy.extended._make_dataset_iterator(dataset)
    next_val = iterator.get_next()

    def train_step(input_data):
      return input_data

    for _ in range(2):
      default_strategy.experimental_run_v2(train_step, args=(next_val,))

  @combinations.generate(combinations.combine(mode=["graph", "eager"]))
  def testDistributedDatasets(self):
    default_strategy = ds_context._get_default_strategy()
    if context.executing_eagerly():
      dataset_fn = lambda _: dataset_ops.DatasetV2.range(10).batch(2)
      dist_dataset = default_strategy.experimental_distribute_dataset(
          dataset_fn(distribute_lib.InputContext()))
      next_val = next(iter(dist_dataset))
    else:
      dataset_fn = lambda _: dataset_ops.DatasetV1.range(10).batch(2)
      dist_dataset = default_strategy.experimental_distribute_dataset(
          dataset_fn(distribute_lib.InputContext()))
      iterator = dist_dataset.make_initializable_iterator()
      self.evaluate(iterator.initializer)
      next_val = iterator.get_next()
    self.assertAllEqual([0, 1], self.evaluate(next_val))

  @combinations.generate(combinations.combine(mode=["graph", "eager"]))
  def testDistributedDatasetsFromFunction(self):
    default_strategy = ds_context._get_default_strategy()
    if context.executing_eagerly():
      dataset_fn = lambda _: dataset_ops.DatasetV2.range(10).batch(2)
      dist_dataset_from_func = \
          default_strategy.experimental_distribute_datasets_from_function(
              dataset_fn)
      next_val = next(iter(dist_dataset_from_func))
      self.assertAllEqual([0, 1], self.evaluate(next_val))
    else:
      dataset_fn = lambda _: dataset_ops.DatasetV2.range(10).batch(2)
      with self.assertRaisesRegexp(RuntimeError,
                                   "only supported when eager execution is "
                                   "enabled"):
        dist_dataset_from_func = \
            default_strategy.experimental_distribute_datasets_from_function(
                dataset_fn)
                                         instance_key, *args, **kwargs)

  @staticmethod
  def broadcast_recv(shape, dtype, group_size, group_key, instance_key,
                     *args, **kwargs):
    group_size = array_ops.identity(group_size)
    group_key = array_ops.identity(group_key)
    instance_key = array_ops.identity(instance_key)
    shape = array_ops.identity(shape)
    return _collective_ops.broadcast_recv_v2(shape, dtype, group_size,
                                             group_key, instance_key, *args,
                                             **kwargs)


device_combination = (
    combinations.combine(device='CPU', communication='RING', required_gpus=0)
    + combinations.combine(
        device='GPU', communication=['RING', 'NCCL'], required_gpus=2))

collective_op_combinations = combinations.combine(collective_op=[
    combinations.NamedObject('all_reduce', CollectiveOpsV1.all_reduce),
    combinations.NamedObject('all_reduce_v2', CollectiveOpsV2.all_reduce),
    combinations.NamedObject('all_gather', CollectiveOpsV1.all_gather),
    combinations.NamedObject('all_gather_v2', CollectiveOpsV2.all_gather)
])


@combinations.generate(
    combinations.times(
        combinations.combine(collective_ops=[
            combinations.NamedObject('v1', CollectiveOpsV1),
class ResNetTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      (128, 50, 4, 8),
      (128, 101, 4, 8),
      (128, 50, 4, 16),
      (128, 101, 4, 16),
  )
  def test_network_creation(self, input_size, model_id,
                            endpoint_filter_scale, output_stride):
    """Test creation of ResNet models."""
    tf.keras.backend.set_image_data_format('channels_last')

    network = resnet_deeplab.DilatedResNet(
        model_id=model_id, output_stride=output_stride)
    inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
    endpoints = network(inputs)
    print(endpoints)
    self.assertAllEqual([
        1, input_size / output_stride, input_size / output_stride,
        512 * endpoint_filter_scale
    ], endpoints[str(int(np.math.log2(output_stride)))].shape.as_list())

  @parameterized.parameters(
      ('v0', None, 0.0),
      ('v1', None, 0.0),
      ('v1', 0.25, 0.0),
      ('v1', 0.25, 0.2),
  )
  def test_network_features(self, stem_type, se_ratio,
                            init_stochastic_depth_rate):
    """Test additional features of ResNet models."""
    input_size = 128
    model_id = 50
    endpoint_filter_scale = 4
    output_stride = 8
    tf.keras.backend.set_image_data_format('channels_last')

    network = resnet_deeplab.DilatedResNet(
        model_id=model_id,
        output_stride=output_stride,
        stem_type=stem_type,
        se_ratio=se_ratio,
        init_stochastic_depth_rate=init_stochastic_depth_rate)
    inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
    endpoints = network(inputs)
    print(endpoints)
    self.assertAllEqual([
        1, input_size / output_stride, input_size / output_stride,
        512 * endpoint_filter_scale
    ], endpoints[str(int(np.math.log2(output_stride)))].shape.as_list())

  @combinations.generate(
      combinations.combine(
          strategy=[
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          use_sync_bn=[False, True],
      ))
  def test_sync_bn_multiple_devices(self, strategy, use_sync_bn):
    """Test for sync bn on TPU and GPU devices."""
    inputs = np.random.rand(64, 128, 128, 3)

    tf.keras.backend.set_image_data_format('channels_last')

    with strategy.scope():
      network = resnet_deeplab.DilatedResNet(
          model_id=50, output_stride=8, use_sync_bn=use_sync_bn)
      _ = network(inputs)

  @parameterized.parameters(1, 3, 4)
  def test_input_specs(self, input_dim):
    """Test different input feature dimensions."""
    tf.keras.backend.set_image_data_format('channels_last')

    input_specs = tf.keras.layers.InputSpec(
        shape=[None, None, None, input_dim])
    network = resnet_deeplab.DilatedResNet(
        model_id=50, output_stride=8, input_specs=input_specs)

    inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1)
    _ = network(inputs)

  def test_serialize_deserialize(self):
    # Create a network object that sets all of its config options.
    kwargs = dict(
        model_id=50,
        output_stride=8,
        stem_type='v0',
        se_ratio=0.25,
        init_stochastic_depth_rate=0.2,
        use_sync_bn=False,
        activation='relu',
        norm_momentum=0.99,
        norm_epsilon=0.001,
        kernel_initializer='VarianceScaling',
        kernel_regularizer=None,
        bias_regularizer=None,
    )
    network = resnet_deeplab.DilatedResNet(**kwargs)

    expected_config = dict(kwargs)
    self.assertEqual(network.get_config(), expected_config)

    # Create another network object from the first object's config.
    new_network = resnet_deeplab.DilatedResNet.from_config(
        network.get_config())

    # Validate that the config can be forced to JSON.
    _ = new_network.to_json()

    # If the serialization was successful, the new config should match the
    # old.
    self.assertAllEqual(network.get_config(), new_network.get_config())
def strategy_minus_tpu_and_input_config_combinations_eager():
  return (combinations.times(
      combinations.combine(
          distribution=strategy_combinations.strategies_minus_tpu),
      eager_mode_test_configuration()))
def strategy_minus_tpu_combinations():
  return combinations.combine(
      distribution=strategies_minus_tpu, mode=['graph', 'eager'])
class KerasMultiWorkerOptimizerTest(test_base.IndependentWorkerTestBase,
                                    parameterized.TestCase):

  def run_optimizer_comparison_with_simple_bias_model(
      self, strategy_cls, optimizer_class_1, optimizer_class_2):

    def get_input_datasets():
      # Simple training input.
      train_input = [[1]] * 16
      train_label = [[0]] * 16
      ds = dataset_ops.Dataset.from_tensor_slices((train_input, train_label))
      ds = maybe_shard_dataset(ds)
      # TODO(rchao): Investigate to figure out the reason for having 8 workers
      # instead of 2 as expected.
      return ds.batch(8, drop_remainder=True)

    def get_simple_bias_model():

      class Bias(base_layer.Layer):

        def build(self, input_shape):
          self.bias = self.add_variable('bias', (1,), initializer='zeros')

        def call(self, inputs):
          return inputs + self.bias

      model = sequential.Sequential()
      model.add(Bias(input_shape=(1,)))
      return model

    self._lock = threading.Lock()
    cluster_spec = test_base.create_cluster_spec(num_workers=2)
    self._barrier = dc._Barrier(2)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside a thread."""
      # TODO(rchao): Refactor to abstract the common boilerplate out.
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        model = get_simple_bias_model()
        initial_weights = model.get_weights()

        def _get_model_results(optimizer, initial_weights):
          # Clear the Keras session to reset device assignment.
          keras.backend._SESSION.session = None
          strategy = get_strategy_object(strategy_cls)

          with strategy.scope():
            train_ds = get_input_datasets()
            model = get_simple_bias_model()
            model.set_weights(initial_weights)
            model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

          return {
              'trained_loss_and_accuracy':
                  model.fit(x=train_ds, epochs=20).history,
              'trained_weights':
                  model.get_weights(),
          }

        results1 = _get_model_results(optimizer_class_1(0.01),
                                      initial_weights)
        results2 = _get_model_results(optimizer_class_2(0.01),
                                      initial_weights)

        for key in results1:
          self.assertAllClose(
              results1[key],
              results2[key],
              atol=1e-5,
              rtol=1e-5,
              msg='Failed to assert {}'.format(key))

    threads = self.run_multiple_tasks_in_threads(_independent_worker_fn,
                                                 cluster_spec)
    threads_to_join = []
    strategy = get_strategy_object(strategy_cls)
    if strategy.extended.experimental_between_graph:
      for ts in threads.values():
        threads_to_join.extend(ts)
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)

  @combinations.generate(
      combinations.combine(
          mode=['graph'],
          strategy_cls=[collective_strategy.CollectiveAllReduceStrategy],
          required_gpus=[0, 1]))
  def test_sgd_optimizer_v1_v2_comparison(self, strategy_cls):
    self.run_optimizer_comparison_with_simple_bias_model(
        strategy_cls, gradient_descent.SGD,
        gradient_descent_v1.GradientDescentOptimizer)

  @combinations.generate(
      combinations.combine(
          mode=['graph'],
          strategy_cls=[collective_strategy.CollectiveAllReduceStrategy],
          required_gpus=[0, 1]))
  def test_rmsprop_optimizer_v1_v2_comparison(self, strategy_cls):
    self.skipTest('There is an issue in collective ops (b/127700538) that '
                  'prevents us from running this test with rmsprop '
                  'optimizers.')
    self.run_optimizer_comparison_with_simple_bias_model(
        strategy_cls, rmsprop.RMSprop, rmsprop_v1.RMSPropOptimizer)
class MultiWorkerTutorialTest(parameterized.TestCase, test.TestCase):
  """Test multi-worker training flow demo'ed in go/multi-worker-with-keras."""

  @contextlib.contextmanager
  def skip_fetch_failure_exception(self):
    try:
      yield
    except zipfile.BadZipfile:
      self.skipTest('Data loading error: Bad magic number for file header.')
    except Exception as e:  # pylint: disable=broad-except
      if 'URL fetch failure' in str(e):
        self.skipTest('URL fetch error not considered failure of the test.')
      else:
        raise

  @combinations.generate(
      combinations.combine(
          mode=['eager'],
          shard_policy=[None] + list(distribute_options.AutoShardPolicy)))
  def testMultiWorkerTutorial(self, mode, shard_policy):
    """Test multi-worker training flow demo'ed in go/multi-worker-with-keras.

    This test should be kept in sync with the code samples in
    go/multi-worker-with-keras.

    Args:
      mode: Runtime mode.
      shard_policy: None or any of tf.data.experimental.AutoShardPolicy for
        testing.
    """
    if shard_policy is distribute_options.AutoShardPolicy.FILE:
      self.skipTest('TensorSliceDataset is not shardable with FILE policy.')

    def mnist_dataset(batch_size):
      with self.skip_fetch_failure_exception():
        (x_train, y_train), _ = mnist.load_data()
      # The `x` arrays are in uint8 and have values in the range [0, 255].
      # We need to convert them to float32 with values in the range [0, 1].
      x_train = x_train / np.float32(255)
      y_train = y_train.astype(np.int64)
      train_dataset = dataset_ops.DatasetV2.from_tensor_slices(
          (x_train, y_train)).shuffle(60000).repeat().batch(batch_size)
      return train_dataset

    def build_and_compile_cnn_model():
      model = keras.Sequential([
          keras.layers.Input(shape=(28, 28)),
          keras.layers.Reshape(target_shape=(28, 28, 1)),
          keras.layers.Conv2D(32, 3, activation='relu'),
          keras.layers.Flatten(),
          keras.layers.Dense(128, activation='relu'),
          keras.layers.Dense(10)
      ])
      model.compile(
          loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
          optimizer=gradient_descent.SGD(learning_rate=0.001),
          metrics=['accuracy'])
      return model

    per_worker_batch_size = 64

    single_worker_dataset = mnist_dataset(per_worker_batch_size)
    single_worker_model = build_and_compile_cnn_model()
    single_worker_model.fit(
        single_worker_dataset, epochs=3, steps_per_epoch=70)

    num_workers = 4

    def proc_func(model_path, checkpoint_dir):
      global_batch_size = per_worker_batch_size * num_workers
      strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
      with strategy.scope():
        multi_worker_model = build_and_compile_cnn_model()

      callbacks = [
          keras.callbacks.ModelCheckpoint(
              filepath=os.path.join(self.get_temp_dir(), 'checkpoint'))
      ]

      multi_worker_dataset = mnist_dataset(global_batch_size)
      if shard_policy:
        options = dataset_ops.Options()
        options.experimental_distribute.auto_shard_policy = shard_policy
        multi_worker_dataset = multi_worker_dataset.with_options(options)

      multi_worker_model.fit(
          multi_worker_dataset,
          epochs=2,
          steps_per_epoch=20,
          callbacks=callbacks)

      def _is_chief(task_type, task_id):
        return task_type is None or task_type == 'chief' or (
            task_type == 'worker' and task_id == 0)

      def _get_temp_dir(dirpath, task_id):
        base_dirpath = 'workertemp_' + str(task_id)
        temp_dir = os.path.join(dirpath, base_dirpath)
        file_io.recursive_create_dir_v2(temp_dir)
        return temp_dir

      def write_filepath(filepath, task_type, task_id):
        dirpath = os.path.dirname(filepath)
        base = os.path.basename(filepath)
        if not _is_chief(task_type, task_id):
          dirpath = _get_temp_dir(dirpath, task_id)
        return os.path.join(dirpath, base)

      task_type, task_id = (strategy.cluster_resolver.task_type,
                            strategy.cluster_resolver.task_id)
      write_model_path = write_filepath(model_path, task_type, task_id)

      multi_worker_model.save(write_model_path)
      if not _is_chief(task_type, task_id):
        file_io.delete_recursively_v2(os.path.dirname(write_model_path))

      # Make sure chief finishes saving before non-chief's assertions.
      multi_process_runner.barrier().wait()

      if not file_io.file_exists(model_path):
        raise RuntimeError()
      if file_io.file_exists(write_model_path) != _is_chief(
          task_type, task_id):
        raise RuntimeError()

      loaded_model = keras.saving.save.load_model(model_path)
      loaded_model.fit(multi_worker_dataset, epochs=2, steps_per_epoch=20)

      checkpoint = tracking_util.Checkpoint(model=multi_worker_model)
      write_checkpoint_dir = write_filepath(checkpoint_dir, task_type,
                                            task_id)
      checkpoint_manager = checkpoint_management.CheckpointManager(
          checkpoint, directory=write_checkpoint_dir, max_to_keep=1)

      checkpoint_manager.save()
      if not _is_chief(task_type, task_id):
        file_io.delete_recursively_v2(write_checkpoint_dir)

      # Make sure chief finishes saving before non-chief's assertions.
      multi_process_runner.barrier().wait()

      if not file_io.file_exists(checkpoint_dir):
        raise RuntimeError()
      if file_io.file_exists(write_checkpoint_dir) != _is_chief(
          task_type, task_id):
        raise RuntimeError()

      latest_checkpoint = checkpoint_management.latest_checkpoint(
          checkpoint_dir)
      checkpoint.restore(latest_checkpoint)
      multi_worker_model.fit(
          multi_worker_dataset, epochs=2, steps_per_epoch=20)

      logging.info('testMultiWorkerTutorial successfully ends')

    model_path = os.path.join(self.get_temp_dir(), 'model.tf')
    checkpoint_dir = os.path.join(self.get_temp_dir(), 'ckpt')
    with test_util.skip_if_error(self, errors_impl.UnavailableError):
      mpr_result = multi_process_runner.run(
          proc_func,
          multi_worker_test_base.create_cluster_spec(num_workers=num_workers),
          args=(model_path, checkpoint_dir),
          list_stdout=True)

    self.assertTrue(
        any('testMultiWorkerTutorial successfully ends' in msg
            for msg in mpr_result.stdout))

    def extract_accuracy(worker_id, input_string):
      match = re.match(
          r'\[worker\-{}\].*accuracy: (\d+\.\d+).*'.format(worker_id),
          input_string)
      return None if match is None else float(match.group(1))

    for worker_id in range(num_workers):
      accu_result = nest.map_structure(
          lambda x: extract_accuracy(worker_id, x),  # pylint: disable=cell-var-from-loop
          mpr_result.stdout)
      self.assertTrue(
          any(accu_result),
          'Every worker is supposed to have an accuracy result.')
class TestEstimatorDistributionStrategy(tf.test.TestCase,
                                        parameterized.TestCase):

  def setUp(self):
    super(TestEstimatorDistributionStrategy, self).setUp()
    strategy_combinations.set_virtual_cpus_to_at_least(3)
    self._base_dir = os.path.join(self.get_temp_dir(),
                                  'keras_to_estimator_strategy_test')
    tf.compat.v1.gfile.MakeDirs(self._base_dir)
    self._config = run_config_lib.RunConfig(
        tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir)

  def tearDown(self):
    super(TestEstimatorDistributionStrategy, self).tearDown()
    tf.compat.v1.summary.FileWriterCache.clear()
    if os.path.isdir(self._base_dir):
      tf.compat.v1.gfile.DeleteRecursively(self._base_dir)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2,
          ],
          mode=['graph'],
          cloning=[True, False]))
  def test_train_functional_with_distribution_strategy(
      self, distribution, cloning):
    keras_model = simple_functional_model()
    keras_model.compile(
        loss='categorical_crossentropy',
        metrics=[keras.metrics.CategoricalAccuracy()],
        optimizer=rmsprop_keras.RMSprop(learning_rate=0.01),
        cloning=cloning)
    config = run_config_lib.RunConfig(
        tf_random_seed=_RANDOM_SEED,
        model_dir=self._base_dir,
        train_distribute=distribution,
        eval_distribute=distribution)
    with self.cached_session():
      est_keras = keras_lib.model_to_estimator(
          keras_model=keras_model, config=config)
      before_eval_results = est_keras.evaluate(
          input_fn=get_ds_test_input_fn, steps=1)
      est_keras.train(input_fn=get_ds_train_input_fn,
                      steps=_TRAIN_SIZE / 16)
      after_eval_results = est_keras.evaluate(
          input_fn=get_ds_test_input_fn, steps=1)
      self.assertLess(after_eval_results['loss'],
                      before_eval_results['loss'])

    tf.compat.v1.summary.FileWriterCache.clear()
    tf.compat.v1.gfile.DeleteRecursively(self._config.model_dir)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2,
          ],
          mode=['graph'],
          cloning=[True, False]))
  def test_train_sequential_with_distribution_strategy(
      self, distribution, cloning):
    keras_model = simple_sequential_model()
    keras_model.compile(
        loss='categorical_crossentropy',
        metrics=[keras.metrics.CategoricalAccuracy()],
        optimizer=rmsprop_keras.RMSprop(learning_rate=0.01),
        cloning=cloning)
    config = run_config_lib.RunConfig(
        tf_random_seed=_RANDOM_SEED,
        model_dir=self._base_dir,
        train_distribute=distribution)
    with self.cached_session():
      est_keras = keras_lib.model_to_estimator(
          keras_model=keras_model, config=config)
      before_eval_results = est_keras.evaluate(
          input_fn=get_ds_test_input_fn, steps=1)
      est_keras.train(input_fn=get_ds_train_input_fn,
                      steps=_TRAIN_SIZE / 16)
      after_eval_results = est_keras.evaluate(
          input_fn=get_ds_test_input_fn, steps=1)
      self.assertLess(after_eval_results['loss'],
                      before_eval_results['loss'])

    tf.compat.v1.summary.FileWriterCache.clear()
    tf.compat.v1.gfile.DeleteRecursively(self._config.model_dir)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2,
          ],
          mode=['graph']))
  def test_multi_inputs_multi_outputs_with_input_fn_as_dict(
      self, distribution):
    train_data, test_data = get_multi_inputs_multi_outputs_data()

    def train_input_fn():
      input_dict = {
          'input_a': train_data['input_a'],
          'input_b': train_data['input_b'],
          'input_m': train_data['input_m'].astype(np.str)
      }
      output_dict = {
          'dense_2': train_data['output_c'],
          'dense_3': train_data['output_d']
      }
      return tf.compat.v1.data.Dataset.from_tensor_slices(
          (input_dict, output_dict)).batch(16)

    def eval_input_fn():
      input_dict = {
          'input_a': test_data['input_a'],
          'input_b': test_data['input_b'],
          'input_m': test_data['input_m'].astype(np.str)
      }
      output_dict = {
          'dense_2': test_data['output_c'],
          'dense_3': test_data['output_d']
      }
      return tf.compat.v1.data.Dataset.from_tensor_slices(
          (input_dict, output_dict)).batch(16)

    self.do_test_multi_inputs_multi_outputs_with_input_fn(
        distribution, train_input_fn, eval_input_fn)

  def do_test_multi_inputs_multi_outputs_with_input_fn(
      self, distribution, train_input_fn, eval_input_fn):
    config = run_config_lib.RunConfig(
        tf_random_seed=_RANDOM_SEED,
        model_dir=self._base_dir,
        train_distribute=distribution)
    with self.cached_session():
      model = multi_inputs_multi_outputs_model()
      est_keras = keras_lib.model_to_estimator(
          keras_model=model, config=config)
      baseline_eval_results = est_keras.evaluate(
          input_fn=eval_input_fn, steps=1)
      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
      eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
      self.assertLess(eval_results['loss'],
                      baseline_eval_results['loss'])
def tpu_strategy_combinations():
  return combinations.combine(distribution=tpu_strategies, mode=["graph"])
class LossUtilitiesTest(test_lib.TestCase, parameterized.TestCase):

  def setUp(self):
    strategy_combinations.set_virtual_cpus_to_at_least(3)
    super(LossUtilitiesTest, self).setUp()

  def testComputeAverageLossGlobalBatchSize(self):
    per_example_loss = [1, 2, 3, 4, 5]
    loss = nn_impl.compute_average_loss(
        per_example_loss, global_batch_size=10)
    self.assertEqual(self.evaluate(loss), 1.5)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testComputeAverageLossDefaultGlobalBatchSize(self, distribution):
    # Without strategy - num replicas = 1
    per_example_loss = constant_op.constant([2.5, 6.2, 5.])
    loss = nn_impl.compute_average_loss(per_example_loss)
    self.assertAllClose(self.evaluate(loss), (2.5 + 6.2 + 5.) / 3)

    # With strategy - num replicas = 2
    with distribution.scope():
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.compute_average_loss, args=(per_example_loss,))
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertAllClose(self.evaluate(loss), (2.5 + 6.2 + 5.) / 3)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testComputeAverageLossSampleWeights(self, distribution):
    with distribution.scope():
      # Scalar sample weight
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.compute_average_loss,
          args=([2., 4., 6.],),
          kwargs={"sample_weight": 2})
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertAllClose(self.evaluate(loss), (2. + 4. + 6.) * 2. / 3)

      # Per-example sample weight
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.compute_average_loss,
          args=([2., 4., 6.],),
          kwargs={"sample_weight": [0.3, 0.5, 0.2]})
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertAllClose(
          self.evaluate(loss), (2. * 0.3 + 4. * 0.5 + 6. * 0.2) / 3)

      # Time-step sample weight
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.compute_average_loss,
          args=([[2., 0.5], [4., 1.]],),
          kwargs={"sample_weight": [[0.3, 0.7], [0.2, 0.8]]})
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertAllClose(
          self.evaluate(loss),
          (2. * 0.3 + 0.5 * 0.7 + 4. * 0.2 + 1. * 0.8) / 2)

  def testComputeAverageLossInvalidSampleWeights(self):
    with self.assertRaisesRegexp(
        (ValueError, errors_impl.InvalidArgumentError),
        (r"Incompatible shapes: \[3\] vs. \[2\]|"
         "Dimensions must be equal")):
      nn_impl.compute_average_loss(
          [2.5, 6.2, 5.], sample_weight=[0.2, 0.8], global_batch_size=10)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testComputeAverageLossDtype(self, distribution):
    with distribution.scope():
      per_example_loss = constant_op.constant([2., 4., 6.],
                                              dtype=dtypes.float64)
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.compute_average_loss,
          args=(per_example_loss,),
          kwargs={"sample_weight": 2})
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertEqual(loss.dtype, dtypes.float64)

  def testComputeAverageLossInvalidRank(self):
    per_example_loss = constant_op.constant(2)

    # Static rank
    with self.assertRaisesRegex(
        ValueError, "Invalid value passed for `per_example_loss`. "
        "Expected a tensor with at least rank 1,"):
      nn_impl.compute_average_loss(per_example_loss)

    with context.graph_mode():
      # Dynamic rank
      per_example_loss = array_ops.placeholder(dtype=dtypes.float32)
      loss = nn_impl.compute_average_loss(per_example_loss)

      with self.cached_session() as sess:
        with self.assertRaisesRegex(
            errors.InvalidArgumentError,
            "Invalid value passed for `per_example_loss`. "
            "Expected a tensor with at least rank 1."):
          sess.run(loss, {per_example_loss: 2})

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testComputeAverageLossInCrossReplicaContext(self, distribution):
    with distribution.scope():
      with self.assertRaisesRegex(
          RuntimeError,
          "You are calling `compute_average_loss` in cross replica context"):
        nn_impl.compute_average_loss([2, 3])

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testScaleRegularizationLoss(self, distribution):
    # Without strategy - num replicas = 1
    reg_losses = constant_op.constant([2.5, 6.2, 5.])
    loss = nn_impl.scale_regularization_loss(reg_losses)
    self.assertAllClose(self.evaluate(loss), (2.5 + 6.2 + 5.))

    # With strategy - num replicas = 2
    with distribution.scope():
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.scale_regularization_loss, args=(reg_losses,))
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertAllClose(self.evaluate(loss), (2.5 + 6.2 + 5.))

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testScaleRegularizationLossInCrossReplicaContext(self, distribution):
    with distribution.scope():
      with self.assertRaisesRegex(
          RuntimeError, "You are calling `scale_regularization_loss` in "
          "cross replica context"):
        nn_impl.scale_regularization_loss([2, 3])
group_key = array_ops.identity(group_key) instance_key = array_ops.identity(instance_key) return _collective_ops.all_reduce_v2(t, group_size, group_key, instance_key, *args, **kwargs) @staticmethod def all_gather(t, group_size, group_key, instance_key, *args, **kwargs): group_size = array_ops.identity(group_size) group_key = array_ops.identity(group_key) instance_key = array_ops.identity(instance_key) return _collective_ops.all_gather_v2(t, group_size, group_key, instance_key, *args, **kwargs) device_combination = ( combinations.combine(device='CPU', communication='RING', required_gpus=0) + combinations.combine( device='GPU', communication=['RING', 'NCCL'], required_gpus=2)) @combinations.generate( combinations.times( combinations.combine( collective_ops=[ combinations.NamedObject('v1', CollectiveOpsV1), combinations.NamedObject('v2', CollectiveOpsV2) ], mode='eager'), device_combination)) class CollectiveOpsTest(test.TestCase, parameterized.TestCase): def setUp(self):
from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variables from tensorflow.python.platform import test from tensorflow.python.util import nest @combinations.generate( combinations.combine(strategy=[ strategy_combinations.multi_worker_mirrored_2x1_cpu, strategy_combinations.multi_worker_mirrored_2x1_gpu, ] + strategy_combinations.all_strategies, mode=['eager'])) class StrategyTest(test.TestCase, parameterized.TestCase): def testCaptureReplicaId(self, strategy): m = {} @def_function.function def f(): return ds_context.get_replica_context().replica_id_in_sync_group @def_function.function def g(): # Make g() a stateful function so it's traced twice. if m.get('v', None) is None: m['v'] = variables.Variable(0.)
from absl.testing import parameterized import numpy as np from tensorflow.python.distribute import combinations from tensorflow.python.distribute import strategy_combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.layers import normalization from tensorflow.python.ops import array_ops from tensorflow.python.ops import variables from tensorflow.python.ops.losses import losses from tensorflow.python.platform import test from tensorflow.python.training import gradient_descent all_combinations = combinations.combine( distribution=[ strategy_combinations.one_device_strategy, ], mode=["graph"]) class NormalizationTest(test.TestCase, parameterized.TestCase): @combinations.generate( combinations.times(all_combinations, combinations.combine(fused=[True, False]))) def testBNWithZeroBatchInput(self, distribution, fused): with distribution.scope(), self.cached_session() as sess: bn_list = [] inputs = np.random.random((0, 4, 4, 3)) + 100 targets = np.random.random((0, 4, 4, 3)) inputs_placeholder = array_ops.placeholder( dtype=dtypes.float32, shape=[None, 4, 4, 3])
class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase): @combinations.generate( combinations.combine(mode=['eager'], file_format=['h5', 'tf'], save_weights_only=[True, False])) def test_model_checkpoint_saves_on_chief_but_not_otherwise( self, file_format, mode, save_weights_only): def proc_model_checkpoint_saves_on_chief_but_not_otherwise( test_obj, file_format): model, saving_filepath, train_ds, steps = _model_setup( test_obj, file_format) num_epoch = 2 extension = os.path.splitext(saving_filepath)[1] # Incorporate type/index information and thread id in saving_filepath to # ensure every worker has a unique path. Note that in a normal use case the # saving_filepath would be the same for all workers, but we use different # ones here just to test that the chief saves the checkpoint and non-chief # workers don't. saving_filepath = os.path.join( test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' % (test_base.get_task_type(), test_base.get_task_index(), extension)) # The saving_filepath shouldn't exist at the beginning (as it's unique). test_obj.assertFalse(checkpoint_exists(saving_filepath)) model.fit(x=train_ds, epochs=num_epoch, steps_per_epoch=steps, validation_data=train_ds, validation_steps=steps, callbacks=[ callbacks.ModelCheckpoint( filepath=saving_filepath, save_weights_only=save_weights_only) ]) # If it's chief, the model should be saved; if not, the model shouldn't. test_obj.assertEqual(checkpoint_exists(saving_filepath), test_base.is_chief()) # If it's chief, the model should be saved (`write_filepath` should # simply return `saving_filepath`); if not, i.e. for non-chief workers, # the temporary path generated by `write_filepath` should no longer # contain the checkpoint that has been deleted. test_obj.assertEqual( checkpoint_exists( distributed_file_utils.write_filepath( saving_filepath, model._distribution_strategy)), test_base.is_chief()) multi_process_runner.run( proc_model_checkpoint_saves_on_chief_but_not_otherwise, cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self, file_format)) @combinations.generate(combinations.combine(mode=['eager'])) def test_model_checkpoint_works_with_same_file_path(self, mode): def proc_model_checkpoint_works_with_same_file_path( test_obj, saving_filepath): model, _, train_ds, steps = _model_setup(test_obj, file_format='') num_epoch = 2 # The saving_filepath shouldn't exist at the beginning (as it's unique). test_obj.assertFalse(file_io.file_exists(saving_filepath)) model.fit(x=train_ds, epochs=num_epoch, steps_per_epoch=steps, callbacks=[ callbacks.ModelCheckpoint(filepath=saving_filepath) ]) test_obj.assertTrue(file_io.file_exists(saving_filepath)) saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint') multi_process_runner.run( proc_model_checkpoint_works_with_same_file_path, cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self, saving_filepath)) @combinations.generate(combinations.combine(mode=['eager'])) def test_backupandrestore_checkpoint_works_with_interruption(self, mode): class InterruptingCallback(callbacks.Callback): def on_epoch_begin(self, epoch, logs=None): if epoch == 2: raise RuntimeError('Interrupting!') class AssertCallback(callbacks.Callback): def on_epoch_begin(self, epoch, logs=None): # The interruption happened on epoch 2 as specified in # InterruptingCallback, so the initial epoch after restart will begin # at 2. 
assert epoch > 1 def proc_model_checkpoint_works_with_same_file_path( test_obj, saving_filepath): model, _, train_ds, steps = _model_setup(test_obj, file_format='') num_epoch = 4 # The saving_filepath shouldn't exist at the beginning (as it's unique). test_obj.assertFalse(file_io.file_exists(saving_filepath)) bar_dir = os.path.join(os.path.dirname(saving_filepath), 'backup') try: model.fit( x=train_ds, epochs=num_epoch, steps_per_epoch=steps, callbacks=[ callbacks.ModelCheckpoint(filepath=saving_filepath), callbacks.BackupAndRestore(backup_dir=bar_dir), InterruptingCallback() ]) except RuntimeError as e: if 'Interrupting!' not in str(e): raise multi_process_runner.barrier().wait() backup_filepath = os.path.join(bar_dir, 'checkpoint') test_obj.assertTrue(file_io.file_exists(backup_filepath)) test_obj.assertTrue(file_io.file_exists(saving_filepath)) model.fit(x=train_ds, epochs=num_epoch, steps_per_epoch=steps, callbacks=[ callbacks.ModelCheckpoint(filepath=saving_filepath), callbacks.BackupAndRestore(backup_dir=bar_dir), AssertCallback() ]) multi_process_runner.barrier().wait() test_obj.assertFalse(file_io.file_exists(backup_filepath)) test_obj.assertTrue(file_io.file_exists(saving_filepath)) saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint') multi_process_runner.run( proc_model_checkpoint_works_with_same_file_path, cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self, saving_filepath)) @combinations.generate(combinations.combine(mode=['eager'])) def test_tensorboard_saves_on_chief_but_not_otherwise(self, mode): def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj): model, _, train_ds, steps = _model_setup(test_obj, file_format='') num_epoch = 2 # Incorporate type/index information and thread id in saving_filepath to # ensure every worker has a unique path. Note that in a normal use case the # saving_filepath would be the same for all workers, but we use different # ones here just to test that the chief saves summaries and non-chief # workers don't. saving_filepath = os.path.join( test_obj.get_temp_dir(), 'logfile_%s_%d' % (test_base.get_task_type(), test_base.get_task_index())) # The saving_filepath shouldn't exist at the beginning (as it's unique). test_obj.assertFalse(file_io.file_exists(saving_filepath)) model.fit( x=train_ds, epochs=num_epoch, steps_per_epoch=steps, callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)]) # If it's chief, the summaries should be saved in the filepath; if not, # the directory should be empty (although created). Using # `file_io.list_directory()` since the directory may be created at this # point. test_obj.assertEqual(bool(file_io.list_directory(saving_filepath)), test_base.is_chief()) multi_process_runner.run( proc_tensorboard_saves_on_chief_but_not_otherwise, cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self, )) @combinations.generate(combinations.combine(mode=['eager'])) def test_tensorboard_can_still_save_to_temp_even_if_it_exists(self, mode): def proc_tensorboard_can_still_save_to_temp_even_if_it_exists( test_obj): model, _, train_ds, steps = _model_setup(test_obj, file_format='') num_epoch = 2 saving_filepath = os.path.join( test_obj.get_temp_dir(), 'logfile_%s' % (test_base.get_task_type())) saving_filepath_for_temp = os.path.join(saving_filepath, 'workertemp_1') os.mkdir(saving_filepath) os.mkdir(saving_filepath_for_temp) # Verifies that even if `saving_filepath_for_temp` exists, TensorBoard # can still save to the temporary directory. 
test_obj.assertTrue(file_io.file_exists(saving_filepath_for_temp)) model.fit( x=train_ds, epochs=num_epoch, steps_per_epoch=steps, callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)]) multi_process_runner.run( proc_tensorboard_can_still_save_to_temp_even_if_it_exists, cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self, )) @combinations.generate(combinations.combine(mode=['eager'])) def test_tensorboard_works_with_same_file_path(self, mode): def proc_tensorboard_works_with_same_file_path(test_obj, saving_filepath): model, _, train_ds, steps = _model_setup(test_obj, file_format='') num_epoch = 2 # The saving_filepath shouldn't exist at the beginning (as it's unique). test_obj.assertFalse(file_io.file_exists(saving_filepath)) multi_process_runner.barrier().wait() model.fit( x=train_ds, epochs=num_epoch, steps_per_epoch=steps, callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)]) multi_process_runner.barrier().wait() test_obj.assertTrue(file_io.list_directory(saving_filepath)) saving_filepath = os.path.join(self.get_temp_dir(), 'logfile') multi_process_runner.run( proc_tensorboard_works_with_same_file_path, cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self, saving_filepath)) @combinations.generate(combinations.combine(mode=['eager'])) def test_early_stopping(self, mode): def proc_early_stopping(test_obj): class EpochCounterCallback(callbacks.Callback): def on_epoch_begin(self, epoch, logs): self.last_epoch = epoch model, _, train_ds, steps = _model_setup(test_obj, file_format='') epoch_counter_cbk = EpochCounterCallback() cbks = [ callbacks.EarlyStopping(monitor='loss', min_delta=0.05, patience=1, verbose=1), epoch_counter_cbk ] # Empirically, `model.fit()` is expected to terminate around the 22nd # epoch. Assert that it stops before the 50th epoch to avoid flakiness # and keep the test predictable. model.fit(x=train_ds, epochs=100, steps_per_epoch=steps, callbacks=cbks) test_obj.assertLess(epoch_counter_cbk.last_epoch, 50) multi_process_runner.run( proc_early_stopping, cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self, ))
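The chief-versus-worker assertions above all hinge on one convention: every worker is handed the same save path, but only the chief actually writes there, while non-chief workers are redirected to a scratch directory (the `workertemp_1` directory created in the test above, and the behavior the earlier `distributed_file_utils.write_filepath` check relies on). A minimal sketch of that redirection, with a hypothetical helper name:

import os

def redirect_save_path(saving_filepath, is_chief, task_id):
  """Chief keeps the real path; workers get a unique scratch sibling."""
  if is_chief:
    return saving_filepath
  dirname, base = os.path.split(saving_filepath)
  # Hypothetical naming, mirroring the 'workertemp_1' convention above.
  return os.path.join(dirname, 'workertemp_%d' % task_id, base)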
def eager_mode_test_configuration(): return combinations.combine(mode='eager', use_numpy=False, use_validation_data=False)
def all_strategy_and_input_config_combinations(): return (combinations.times( combinations.combine(distribution=all_strategies, experimental_run_tf_function=[True, False]), eager_mode_test_configuration() + graph_mode_test_configuration()))
def all_strategy_combinations_with_graph_mode(): return (combinations.combine( distribution=keras_correctness_test_base.all_strategies, mode=['graph'], cloning=[True, False]))
def all_strategy_combinations_with_graph_mode(): return (combinations.combine( distribution=keras_correctness_test_base.all_strategies, mode=['graph'], run_distributed=[True, False]))
class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase): @combinations.generate( combinations.combine( mode=['eager'], file_format=['h5', 'tf'], save_weights_only=[True, False])) def test_model_checkpoint_saves_on_chief_but_not_otherwise( self, file_format, mode, save_weights_only): def proc_model_checkpoint_saves_on_chief_but_not_otherwise( test_obj, file_format): model, saving_filepath, train_ds, steps = _model_setup( test_obj, file_format) num_epoch = 2 extension = os.path.splitext(saving_filepath)[1] # Incorporate type/index information and thread id in saving_filepath to # ensure every worker has a unique path. Note that in a normal use case the # saving_filepath would be the same for all workers, but we use different # ones here just to test that the chief saves the checkpoint and non-chief # workers don't. saving_filepath = os.path.join( test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' % (test_base.get_task_type(), test_base.get_task_index(), extension)) # The saving_filepath shouldn't exist at the beginning (as it's unique). test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath)) model.fit( x=train_ds, epochs=num_epoch, steps_per_epoch=steps, callbacks=[ callbacks.ModelCheckpoint( filepath=saving_filepath, save_weights_only=save_weights_only) ]) # If it's chief, the model should be saved; if not, the model shouldn't. test_obj.assertEqual( training_state.checkpoint_exists(saving_filepath), test_base.is_chief()) # TODO(b/141948186): Remove this `with` block once b/141948186 is resolved. with multi_process_runner_util.try_run_and_except_connection_error(self): multi_process_runner.run( proc_model_checkpoint_saves_on_chief_but_not_otherwise, cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self, file_format)) @combinations.generate(combinations.combine(mode=['eager'])) def test_tensorboard_saves_on_chief_but_not_otherwise(self, mode): def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj): model, _, train_ds, steps = _model_setup(test_obj, file_format='') num_epoch = 2 # Incorporate type/index information and thread id in saving_filepath to # ensure every worker has a unique path. Note that in a normal use case the # saving_filepath would be the same for all workers, but we use different # ones here just to test that the chief saves summaries and non-chief # workers don't. saving_filepath = os.path.join( test_obj.get_temp_dir(), 'logfile_%s_%d' % (test_base.get_task_type(), test_base.get_task_index())) # The saving_filepath shouldn't exist at the beginning (as it's unique). test_obj.assertFalse(file_io.file_exists(saving_filepath)) model.fit( x=train_ds, epochs=num_epoch, steps_per_epoch=steps, callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)]) # If it's chief, the summaries should be saved in the filepath; if not, # the directory should be empty (although created). Using # `file_io.list_directory()` since the directory may be created at this # point. test_obj.assertEqual( bool(file_io.list_directory(saving_filepath)), test_base.is_chief()) # TODO(b/141948186): Remove this `with` block once b/141948186 is resolved. 
with multi_process_runner_util.try_run_and_except_connection_error(self): multi_process_runner.run( proc_tensorboard_saves_on_chief_but_not_otherwise, cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self,)) @combinations.generate(combinations.combine(mode=['eager'])) def test_tensorboard_can_still_save_to_temp_even_if_it_exists(self, mode): def proc_tensorboard_can_still_save_to_temp_even_if_it_exists(test_obj): model, _, train_ds, steps = _model_setup(test_obj, file_format='') num_epoch = 2 saving_filepath = os.path.join(test_obj.get_temp_dir(), 'logfile_%s' % (test_base.get_task_type())) saving_filepath_for_temp = os.path.join(saving_filepath, 'workertemp_1') os.mkdir(saving_filepath) os.mkdir(saving_filepath_for_temp) # Verifies that even if `saving_filepath_for_temp` exists, tensorboard # can still save to temporary directory. test_obj.assertTrue(file_io.file_exists(saving_filepath_for_temp)) model.fit( x=train_ds, epochs=num_epoch, steps_per_epoch=steps, callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)]) # TODO(b/141948186): Remove this `with` block once b/141948186 is resolved. with multi_process_runner_util.try_run_and_except_connection_error(self): multi_process_runner.run( proc_tensorboard_can_still_save_to_temp_even_if_it_exists, cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self,))
context.LogicalDeviceConfiguration(64), context.LogicalDeviceConfiguration(64), ]) collective_all_reduce_strategy.CollectiveAllReduceStrategy( cluster_resolver=resolver) # Since we create two logical GPUs out of the last GPU, there should be one # more logical GPU than physical GPUs. self.assertLen(tf_config.list_logical_devices('GPU'), len(gpus) + 1) context._reset_context() # pylint: disable=protected-access @combinations.generate( combinations.combine( strategy=[ strategy_combinations.multi_worker_mirrored_2x1_cpu, strategy_combinations.multi_worker_mirrored_2x1_gpu, strategy_combinations.multi_worker_mirrored_2x2_gpu, ], mode=['eager'])) class CollectiveAllReduceStrategyV2Test(test.TestCase, parameterized.TestCase): def test_replica_id_in_sync_group(self, strategy): def replica_fn(): replica_ctx = distribution_strategy_context.get_replica_context() return replica_ctx.replica_id_in_sync_group, replica_ctx._replica_id results = test_util.gather(strategy, strategy.run(replica_fn)) self.assertAllEqual(list(range(strategy.extended._num_replicas_in_sync)), results[0].numpy()) self.assertAllEqual(
from __future__ import absolute_import from __future__ import division from __future__ import print_function from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import combinations from tensorflow.python.distribute import strategy_combinations from tensorflow.python.distribute import strategy_test_lib from tensorflow.python.eager import context from tensorflow.python.eager import test @combinations.generate( combinations.combine( distribution=[ strategy_combinations.one_device_strategy, strategy_combinations.one_device_strategy_gpu ], mode=["eager", "graph"])) class OneDeviceStrategyTest( strategy_test_lib.DistributionTestBase, strategy_test_lib.OneDeviceDistributionTestBase): def testMinimizeLoss(self, distribution): if context.executing_eagerly(): self._test_minimize_loss_eager(distribution) else: self._test_minimize_loss_graph(distribution) def testReplicaId(self, distribution): self._test_replica_id(distribution)
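Because mode=["eager", "graph"] runs every test method once per execution mode, tests that need different code paths branch on the active context, exactly as `testMinimizeLoss` does above. A hedged sketch of that dispatch pattern (the helper name is illustrative):

from tensorflow.python.eager import context

def run_mode_aware(eager_fn, graph_fn):
  # The combinations framework sets up the execution mode before the test
  # body runs, so the active context tells us which path to exercise.
  if context.executing_eagerly():
    return eager_fn()
  return graph_fn()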
class DistributedCollectiveAllReduceStrategyTest( CollectiveAllReduceStrategyTestBase, strategy_test_lib.DistributionTestBase, parameterized.TestCase): @classmethod def setUpClass(cls): """Create a local cluster with 3 workers.""" cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=0) @combinations.generate(combinations.combine(mode=['graph'])) def test_num_replicas_in_sync(self): distribution, _, _ = create_test_objects( cluster_spec=self._cluster_spec, task_type='worker', task_id=0, num_gpus=2) num_workers = len(self._cluster_spec.get('chief', []) + self._cluster_spec.get('worker', [])) self.assertEqual(2 * num_workers, distribution.num_replicas_in_sync) @combinations.generate(combinations.combine( mode=['graph'], prefetch_to_device=[None, True])) def test_prefetch_to_device_dataset(self, prefetch_to_device): distribution, _, _ = self._get_test_object( task_type='worker', task_id=0, num_gpus=2) if prefetch_to_device is None: input_options = None else: input_options = distribute_lib.InputOptions( experimental_prefetch_to_device=prefetch_to_device) dataset = dataset_ops.Dataset.range(100) dataset = dataset.batch(distribution.num_replicas_in_sync) dataset = distribution.experimental_distribute_dataset( dataset, options=input_options) if isinstance(dataset, input_lib.DistributedDatasetV1): item = dataset.make_initializable_iterator().get_next() else: self.skipTest('unsupported test combination') device_types = { tf_device.DeviceSpec.from_string(tensor.device).device_type for tensor in item.values} self.assertAllEqual(list(device_types), ['GPU']) @combinations.generate(combinations.combine(mode=['graph'])) def test_prefetch_to_host_dataset(self): distribution, _, _ = self._get_test_object( task_type='worker', task_id=0, num_gpus=2) input_options = distribute_lib.InputOptions( experimental_prefetch_to_device=False) dataset = dataset_ops.Dataset.range(100) dataset = dataset.batch(distribution.num_replicas_in_sync) dataset = distribution.experimental_distribute_dataset( dataset, options=input_options) if isinstance(dataset, input_lib.DistributedDatasetV1): item = dataset.make_initializable_iterator().get_next() else: self.skipTest('unsupported test combination') device_types = { tf_device.DeviceSpec.from_string(tensor.device).device_type for tensor in item.values} self.assertAllEqual(list(device_types), ['CPU']) @combinations.generate( combinations.combine(mode=['graph'], required_gpus=[0, 1, 2])) def testMinimizeLossGraph(self, required_gpus): self._run_between_graph_clients(self._test_minimize_loss_graph, self._cluster_spec, required_gpus) @combinations.generate( combinations.combine(mode=['graph'], required_gpus=[0, 1, 2])) def testVariableInitialization(self, required_gpus): self._run_between_graph_clients( self._test_variable_initialization, self._cluster_spec, num_gpus=required_gpus) @combinations.generate( combinations.combine( mode=['graph'], required_gpus=[0, 1, 2], use_dataset=[True, False])) def testMakeInputFnIterator(self, required_gpus, use_dataset): def _worker_fn(task_type, task_id, required_gpus): if use_dataset: fn = lambda: dataset_ops.Dataset.range(20) else: def fn(): dataset = dataset_ops.Dataset.range(20) it = dataset_ops.make_one_shot_iterator(dataset) return it.get_next # We use CPU as the device when required_gpus = 0 devices_per_worker = max(1, required_gpus) expected_values = [[i+j for j in range(devices_per_worker)] for i in range(0, 20, devices_per_worker)] input_fn = self._input_fn_to_test_input_context( fn, 
expected_num_replicas_in_sync=3*devices_per_worker, expected_num_input_pipelines=3, expected_input_pipeline_id=task_id) self._test_input_fn_iterator( task_type, task_id, required_gpus, input_fn, expected_values, test_reinitialize=use_dataset, ignore_order=not use_dataset) self._run_between_graph_clients(_worker_fn, self._cluster_spec, required_gpus) @combinations.generate(combinations.combine(mode=['graph'])) def testUpdateConfigProto(self): strategy, _, _ = self._get_test_object( task_type='worker', task_id=1, num_gpus=2) config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden']) rewrite_options = config_proto.graph_options.rewrite_options rewrite_options.scoped_allocator_opts.enable_op.append('to_be_removed') new_config = strategy.update_config_proto(config_proto) # Verify group leader self.assertEqual('/job:worker/replica:0/task:0', new_config.experimental.collective_group_leader) # Verify device filters. self.assertEqual(['/job:worker/task:1'], new_config.device_filters) # Verify rewrite options. new_rewrite_options = new_config.graph_options.rewrite_options self.assertEqual(rewriter_config_pb2.RewriterConfig.ON, new_rewrite_options.scoped_allocator_optimization) self.assertEqual(['CollectiveReduce'], new_rewrite_options.scoped_allocator_opts.enable_op)
class TrainLibTest(tf.test.TestCase, parameterized.TestCase): def setUp(self): super().setUp() self._test_config = { 'trainer': { 'checkpoint_interval': 10, 'steps_per_loop': 10, 'summary_interval': 10, 'train_steps': 10, 'validation_steps': 5, 'validation_interval': 10, 'continuous_eval_timeout': 1, 'optimizer_config': { 'optimizer': { 'type': 'sgd', }, 'learning_rate': { 'type': 'constant' } } }, } @combinations.generate( combinations.combine( distribution_strategy=[ strategy_combinations.default_strategy, strategy_combinations.cloud_tpu_strategy, strategy_combinations.one_device_strategy_gpu, ], mode='eager', flag_mode=['train', 'eval', 'train_and_eval'])) def test_end_to_end(self, distribution_strategy, flag_mode): model_dir = self.get_temp_dir() experiment_config = configs.MultiTaskExperimentConfig( task=configs.MultiTaskConfig( task_routines=( configs.TaskRoutine( task_name='foo', task_config=test_utils.FooConfig()), configs.TaskRoutine( task_name='bar', task_config=test_utils.BarConfig())))) experiment_config = params_dict.override_params_dict( experiment_config, self._test_config, is_strict=False) with distribution_strategy.scope(): test_multitask = multitask.MultiTask.from_config(experiment_config.task) model = test_utils.MockMultiTaskModel() train_lib.run_experiment( distribution_strategy=distribution_strategy, task=test_multitask, model=model, mode=flag_mode, params=experiment_config, model_dir=model_dir) @combinations.generate( combinations.combine( distribution_strategy=[ strategy_combinations.default_strategy, strategy_combinations.cloud_tpu_strategy, strategy_combinations.one_device_strategy_gpu, ], mode='eager', flag_mode=['train', 'eval', 'train_and_eval'])) def test_end_to_end_multi_eval(self, distribution_strategy, flag_mode): model_dir = self.get_temp_dir() experiment_config = configs.MultiEvalExperimentConfig( task=test_utils.FooConfig(), eval_tasks=(configs.TaskRoutine( task_name='foo', task_config=test_utils.FooConfig(), eval_steps=2), configs.TaskRoutine( task_name='bar', task_config=test_utils.BarConfig(), eval_steps=3))) experiment_config = params_dict.override_params_dict( experiment_config, self._test_config, is_strict=False) with distribution_strategy.scope(): train_task = task_factory.get_task(experiment_config.task) eval_tasks = [ task_factory.get_task(config.task_config, name=config.task_name) for config in experiment_config.eval_tasks ] train_lib.run_experiment_with_multitask_eval( distribution_strategy=distribution_strategy, train_task=train_task, eval_tasks=eval_tasks, mode=flag_mode, params=experiment_config, model_dir=model_dir)
class LocalCollectiveAllReduceStrategy( CollectiveAllReduceStrategyTestBase, strategy_test_lib.DistributionTestBase, strategy_test_lib.TwoDeviceDistributionTestBase, parameterized.TestCase): @combinations.generate( combinations.combine(mode=['graph', 'eager'], required_gpus=[2, 4])) def testMinimizeLoss(self, required_gpus): # Collective ops don't support a strategy with only one device. if context.executing_eagerly(): strategy, _, _ = self._get_test_object(None, None, required_gpus) self._test_minimize_loss_eager(strategy) else: self._test_minimize_loss_graph(None, None, required_gpus) @combinations.generate( combinations.combine( mode=['graph'], required_gpus=2, use_dataset=[True, False])) def testMakeInputFnIterator(self, required_gpus, use_dataset): if use_dataset: fn = lambda: dataset_ops.Dataset.range(5 * required_gpus) else: def fn(): dataset = dataset_ops.Dataset.range(5 * required_gpus) it = dataset_ops.make_one_shot_iterator(dataset) return it.get_next expected_values = [ range(i, i + required_gpus) for i in range(0, 10, required_gpus) ] input_fn = self._input_fn_to_test_input_context( fn, expected_num_replicas_in_sync=required_gpus, expected_num_input_pipelines=1, expected_input_pipeline_id=0) self._test_input_fn_iterator( None, None, required_gpus, input_fn, expected_values, test_reinitialize=use_dataset, ignore_order=not use_dataset) @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2)) def testAllReduceSum(self, required_gpus): distribution, target, config = self._get_test_object( None, None, num_gpus=required_gpus) with self.cached_session(config=config, target=target): self._test_all_reduce_sum(distribution) @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2)) def testAllReduceSumGradients(self, required_gpus): distribution, target, config = self._get_test_object( None, None, num_gpus=required_gpus) with self.cached_session(config=config, target=target): self._test_all_reduce_sum_gradients(distribution) @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2)) def testAllReduceSumGradientTape(self, required_gpus): distribution, target, config = self._get_test_object( None, None, num_gpus=required_gpus) with self.cached_session(config=config, target=target): self._test_all_reduce_sum_gradient_tape(distribution) @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2)) def testAllReduceMean(self, required_gpus): distribution, target, config = self._get_test_object( None, None, num_gpus=required_gpus) with self.cached_session(config=config, target=target): self._test_all_reduce_mean(distribution) @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2)) def testAllReduceMeanGradients(self, required_gpus): distribution, target, config = self._get_test_object( None, None, num_gpus=required_gpus) with self.cached_session(config=config, target=target): self._test_all_reduce_mean_gradients(distribution) @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2)) def testAllReduceMeanGradientTape(self, required_gpus): distribution, target, config = self._get_test_object( None, None, num_gpus=required_gpus) with self.cached_session(config=config, target=target): self._test_all_reduce_mean_gradient_tape(distribution) @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2)) def testNumpyDataset(self, required_gpus): strategy, target, config = self._get_test_object( None, None, num_gpus=required_gpus) self._test_numpy_dataset( strategy, 
session=self.cached_session(config=config, target=target))
def strategy_for_numpy_input_combinations(): return combinations.combine( distribution=strategies_minus_tpu + tpu_strategies, mode=['graph'])
class ParameterServerStrategyTest( ParameterServerStrategyTestBase, strategy_test_lib.DistributionTestBase, strategy_test_lib.TwoDeviceDistributionTestBase, parameterized.TestCase): @classmethod def setUpClass(cls): cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=2) cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0] @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def test_num_replicas_in_sync(self, use_core_strategy): strategy, _, _ = create_test_objects( num_gpus=2, use_core_strategy=use_core_strategy) # All the devices on a given worker are in sync, so the number of replicas # in sync equals the number of GPUs on each worker. self.assertEqual(2, strategy.num_replicas_in_sync) @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testDeviceAssignmentLocalCPU(self, use_core_strategy): strategy, _, _ = create_test_objects( num_gpus=0, use_core_strategy=use_core_strategy) self._test_device_assignment_local(strategy, compute_device='CPU', variable_device='CPU', num_gpus=0) @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testDeviceAssignmentLocalOneGPU(self, use_core_strategy): strategy, _, _ = create_test_objects( num_gpus=1, use_core_strategy=use_core_strategy) self._test_device_assignment_local(strategy, compute_device='GPU', variable_device='GPU', num_gpus=1) @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testDeviceAssignmentLocalTwoGPUs(self, use_core_strategy): strategy, _, _ = create_test_objects( num_gpus=2, use_core_strategy=use_core_strategy) self._test_device_assignment_local(strategy, compute_device='GPU', variable_device='CPU', num_gpus=2) @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) def testDeviceAssignmentDistributed(self, num_gpus, use_core_strategy): self._test_device_assignment_distributed( 'worker', 1, num_gpus, use_core_strategy=use_core_strategy) @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) def testDeviceAssignmentDistributedEnablePartitioner( self, num_gpus, use_core_strategy): self._test_device_assignment_distributed_enable_partitioner( 'worker', 1, num_gpus, use_core_strategy=use_core_strategy) @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testSimpleBetweenGraph(self, use_core_strategy): self._run_between_graph_clients(self._test_simple_increment, self._cluster_spec, context.num_gpus(), use_core_strategy=use_core_strategy) @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) def testLocalSimpleIncrement(self, num_gpus, use_core_strategy): self._test_simple_increment(None, 0, num_gpus, use_core_strategy) @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) def testMinimizeLossGraphDistributed(self, num_gpus, use_core_strategy): self._run_between_graph_clients(self._test_minimize_loss_graph, self._cluster_spec, num_gpus, use_core_strategy=use_core_strategy) @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) def testMinimizeLossGraphLocal(self, num_gpus, use_core_strategy): self._test_minimize_loss_graph(None, None, num_gpus, use_core_strategy) # TODO(priyag): Refactor 
this and other multi worker tests. @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1, use_core_strategy=[True, False], use_dataset=[True, False])) def testMakeInputFnIteratorDistributed(self, num_gpus, use_core_strategy, use_dataset): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') if use_dataset: fn = lambda: dataset_ops.Dataset.range(100) else: def fn(): dataset = dataset_ops.Dataset.range(100) it = dataset.make_one_shot_iterator() return it.get_next expected_values = [[i + j for j in range(num_gpus)] for i in range(0, 100, num_gpus)] input_fn = self._input_fn_to_test_input_context( fn, expected_num_replicas_in_sync=num_gpus, expected_num_input_pipelines=3, expected_input_pipeline_id=1) # because task_id = 1 self._test_input_fn_iterator('worker', 1, num_gpus, input_fn, expected_values, test_reinitialize=use_dataset, ignore_order=not use_dataset, use_core_strategy=use_core_strategy) @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1, use_core_strategy=[True, False], use_dataset=[True, False])) def testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy, use_dataset): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') if use_dataset: fn = lambda: dataset_ops.Dataset.range(100) else: def fn(): dataset = dataset_ops.Dataset.range(100) it = dataset.make_one_shot_iterator() return it.get_next expected_values = [[i + j for j in range(num_gpus)] for i in range(0, 100, num_gpus)] input_fn = self._input_fn_to_test_input_context( fn, expected_num_replicas_in_sync=num_gpus, expected_num_input_pipelines=1, expected_input_pipeline_id=0 ) # only one worker and pipeline for local. self._test_input_fn_iterator(None, None, num_gpus, input_fn, expected_values, test_reinitialize=use_dataset, ignore_order=not use_dataset, use_core_strategy=use_core_strategy) @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testGlobalStepUpdate(self, use_core_strategy): strategy, _, _ = create_test_objects( use_core_strategy=use_core_strategy) self._test_global_step_update(strategy) @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testUpdateConfigProtoMultiWorker(self, use_core_strategy): strategy, _, _ = create_test_objects( cluster_spec=self._cluster_spec, task_type='worker', task_id=1, num_gpus=2, use_core_strategy=use_core_strategy) config_proto = config_pb2.ConfigProto( device_filters=['to_be_overridden']) new_config = strategy.update_config_proto(config_proto) # Verify device filters. 
self.assertEqual(['/job:worker/task:1', '/job:ps'], new_config.device_filters) # Verify isolate_session_state self.assertFalse(new_config.isolate_session_state) @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testUpdateConfigProtoLocal(self, use_core_strategy): strategy, _, _ = create_test_objects( num_gpus=2, use_core_strategy=use_core_strategy) config_proto = config_pb2.ConfigProto() new_config = strategy.update_config_proto(config_proto) # Verify isolate_session_state self.assertTrue(new_config.isolate_session_state) @combinations.generate(combinations.combine(required_gpus=[2])) def testAllReduceSum(self): distribution = parameter_server_strategy.ParameterServerStrategy( num_gpus_per_worker=2) self._test_all_reduce_sum(distribution) @combinations.generate(combinations.combine(required_gpus=[2])) def testAllReduceSumGradients(self): distribution = parameter_server_strategy.ParameterServerStrategy( num_gpus_per_worker=2) self._test_all_reduce_sum_gradients(distribution) @combinations.generate(combinations.combine(required_gpus=[2])) def testAllReduceSumGradientTape(self): distribution = parameter_server_strategy.ParameterServerStrategy( num_gpus_per_worker=2) self._test_all_reduce_sum_gradient_tape(distribution) @combinations.generate(combinations.combine(required_gpus=[2])) def testAllReduceMean(self): distribution = parameter_server_strategy.ParameterServerStrategy( num_gpus_per_worker=2) self._test_all_reduce_mean(distribution) @combinations.generate(combinations.combine(required_gpus=[2])) def testAllReduceMeanGradients(self): distribution = parameter_server_strategy.ParameterServerStrategy( num_gpus_per_worker=2) self._test_all_reduce_mean_gradients(distribution) @combinations.generate(combinations.combine(required_gpus=[2])) def testAllReduceMeanGradientTape(self): distribution = parameter_server_strategy.ParameterServerStrategy( num_gpus_per_worker=2) self._test_all_reduce_mean_gradient_tape(distribution) def testTrainableVariables(self): distribution = parameter_server_strategy.ParameterServerStrategy() self._test_trainable_variable(distribution)
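As a compact restatement of the local device-assignment cases exercised by the testDeviceAssignmentLocal* methods above (an illustrative reference only, not an API guarantee):

# num_gpus -> expected placement under ParameterServerStrategy (local).
EXPECTED_LOCAL_PLACEMENT = {
    0: {'compute_device': 'CPU', 'variable_device': 'CPU'},
    1: {'compute_device': 'GPU', 'variable_device': 'GPU'},
    2: {'compute_device': 'GPU', 'variable_device': 'CPU'},  # vars on CPU
}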
def strategy_minus_tpu_combinations(): return combinations.combine( distribution=strategies_minus_tpu, mode=["graph", "eager"])
class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, parameterized.TestCase): @classmethod def setUpClass(cls): cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=2, has_chief=True) cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0] @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testSimpleBetweenGraph(self, use_core_strategy): self._run_between_graph_clients(self._test_simple_increment, self._cluster_spec, context.num_gpus(), use_core_strategy=use_core_strategy) @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) def testMinimizeLossGraph(self, num_gpus, use_core_strategy): self._run_between_graph_clients(self._test_minimize_loss_graph, self._cluster_spec, num_gpus, use_core_strategy=use_core_strategy) @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testGlobalStepIsWrappedOnTwoGPUs(self, use_core_strategy): strategy, _, _ = create_test_objects( num_gpus=2, use_core_strategy=use_core_strategy) with ops.Graph().as_default(), strategy.scope(): created_step = training_util.create_global_step() get_step = training_util.get_global_step() self.assertEqual( created_step, get_step, msg=('created_step %s type %s vs. get_step %s type %s' % (id(created_step), created_step.__class__.__name__, id(get_step), get_step.__class__.__name__))) self.assertIs(values.AggregatingVariable, type(created_step)) self.assertIs(values.AggregatingVariable, type(get_step)) self.assertIs(strategy, created_step.distribute_strategy) @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testGlobalStepIsNotWrappedOnOneGPU(self, use_core_strategy): strategy, _, _ = create_test_objects( num_gpus=1, use_core_strategy=use_core_strategy) with ops.Graph().as_default(), strategy.scope(): created_step = training_util.create_global_step() get_step = training_util.get_global_step() self.assertEqual( created_step, get_step, msg=('created_step %s type %s vs. get_step %s type %s' % (id(created_step), created_step.__class__.__name__, id(get_step), get_step.__class__.__name__))) self.assertIs(resource_variable_ops.ResourceVariable, type(created_step)) self.assertIs(resource_variable_ops.ResourceVariable, type(get_step)) # All variables have a `_distribute_strategy` attribute. Only variable # subclasses in distribution strategy expose it publicly. self.assertFalse(hasattr(strategy, 'distribute_strategy')) self.assertIs(strategy, created_step._distribute_strategy) @combinations.generate( combinations.combine(mode=['graph'], use_core_strategy=[True, False])) def testValueContainer(self, use_core_strategy): strategy, _, _ = create_test_objects( num_gpus=2, use_core_strategy=use_core_strategy) with ops.Graph().as_default(), strategy.scope(): def f(): with backprop.GradientTape() as tape: v = variable_scope.get_variable('v', initializer=10.0) _ = v * v v, = tape.watched_variables() w = strategy.extended.value_container(v) self.assertIs(values.AggregatingVariable, type(w)) strategy.extended.call_for_each_replica(f)
class FactoryTest(tf.test.TestCase, parameterized.TestCase): @combinations.generate( combinations.combine(model_id=[18, 34, 50, 101, 152], )) def test_resnet_creation(self, model_id): """Test creation of ResNet models.""" network = backbones.ResNet(model_id=model_id, se_ratio=0.0, norm_momentum=0.99, norm_epsilon=1e-5) backbone_config = backbones_cfg.Backbone(type='resnet', resnet=backbones_cfg.ResNet( model_id=model_id, se_ratio=0.0)) norm_activation_config = common_cfg.NormActivation(norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) factory_network = factory.build_backbone( input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), backbone_config=backbone_config, norm_activation_config=norm_activation_config) network_config = network.get_config() factory_network_config = factory_network.get_config() self.assertEqual(network_config, factory_network_config) @combinations.generate( combinations.combine( model_id=['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'], se_ratio=[0.0, 0.25], )) def test_efficientnet_creation(self, model_id, se_ratio): """Test creation of EfficientNet models.""" network = backbones.EfficientNet(model_id=model_id, se_ratio=se_ratio, norm_momentum=0.99, norm_epsilon=1e-5) backbone_config = backbones_cfg.Backbone( type='efficientnet', efficientnet=backbones_cfg.EfficientNet(model_id=model_id, se_ratio=se_ratio)) norm_activation_config = common_cfg.NormActivation(norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) factory_network = factory.build_backbone( input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), backbone_config=backbone_config, norm_activation_config=norm_activation_config) network_config = network.get_config() factory_network_config = factory_network.get_config() self.assertEqual(network_config, factory_network_config) @combinations.generate( combinations.combine( model_id=[ 'MobileNetV1', 'MobileNetV2', 'MobileNetV3Large', 'MobileNetV3Small', 'MobileNetV3EdgeTPU' ], filter_size_scale=[1.0, 0.75], )) def test_mobilenet_creation(self, model_id, filter_size_scale): """Test creation of Mobilenet models.""" network = backbones.MobileNet(model_id=model_id, filter_size_scale=filter_size_scale, norm_momentum=0.99, norm_epsilon=1e-5) backbone_config = backbones_cfg.Backbone( type='mobilenet', mobilenet=backbones_cfg.MobileNet( model_id=model_id, filter_size_scale=filter_size_scale)) norm_activation_config = common_cfg.NormActivation(norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) factory_network = factory.build_backbone( input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), backbone_config=backbone_config, norm_activation_config=norm_activation_config) network_config = network.get_config() factory_network_config = factory_network.get_config() self.assertEqual(network_config, factory_network_config) @combinations.generate(combinations.combine(model_id=['49'], )) def test_spinenet_creation(self, model_id): """Test creation of SpineNet models.""" input_size = 128 min_level = 3 max_level = 7 input_specs = tf.keras.layers.InputSpec( shape=[None, input_size, input_size, 3]) network = backbones.SpineNet(input_specs=input_specs, min_level=min_level, max_level=max_level, norm_momentum=0.99, norm_epsilon=1e-5) backbone_config = backbones_cfg.Backbone( type='spinenet', spinenet=backbones_cfg.SpineNet(model_id=model_id)) norm_activation_config = common_cfg.NormActivation(norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) factory_network = factory.build_backbone( input_specs=tf.keras.layers.InputSpec( 
shape=[None, input_size, input_size, 3]), backbone_config=backbone_config, norm_activation_config=norm_activation_config) network_config = network.get_config() factory_network_config = factory_network.get_config() self.assertEqual(network_config, factory_network_config) @combinations.generate(combinations.combine(model_id=[38, 56, 104], )) def test_revnet_creation(self, model_id): """Test creation of RevNet models.""" network = backbones.RevNet(model_id=model_id, norm_momentum=0.99, norm_epsilon=1e-5) backbone_config = backbones_cfg.Backbone( type='revnet', revnet=backbones_cfg.RevNet(model_id=model_id)) norm_activation_config = common_cfg.NormActivation(norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) factory_network = factory.build_backbone( input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), backbone_config=backbone_config, norm_activation_config=norm_activation_config) network_config = network.get_config() factory_network_config = factory_network.get_config() self.assertEqual(network_config, factory_network_config) @combinations.generate(combinations.combine(model_type=['resnet_3d'], )) def test_resnet_3d_creation(self, model_type): """Test creation of ResNet 3D models.""" backbone_cfg = backbones_3d_cfg.Backbone3D(type=model_type).get() temporal_strides = [] temporal_kernel_sizes = [] for block_spec in backbone_cfg.block_specs: temporal_strides.append(block_spec.temporal_strides) temporal_kernel_sizes.append(block_spec.temporal_kernel_sizes) _ = backbones.ResNet3D(model_id=backbone_cfg.model_id, temporal_strides=temporal_strides, temporal_kernel_sizes=temporal_kernel_sizes, norm_momentum=0.99, norm_epsilon=1e-5)
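Every factory test above repeats the same shape: build a backbone directly, build it again through `factory.build_backbone`, and compare `get_config()`. A hedged generic form of that check is sketched below; the helper name is hypothetical, and it assumes the module-level `factory` import the tests themselves use.

import tensorflow as tf

def assert_factory_matches_direct(test_obj, direct_network, backbone_config,
                                  norm_activation_config, input_specs=None):
  if input_specs is None:
    input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
  factory_network = factory.build_backbone(
      input_specs=input_specs,
      backbone_config=backbone_config,
      norm_activation_config=norm_activation_config)
  # Identical get_config() output implies the factory wired up the same
  # architecture and hyperparameters.
  test_obj.assertEqual(direct_network.get_config(),
                       factory_network.get_config())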
class CollectiveOpsTest(test.TestCase, parameterized.TestCase): def setUp(self): super().setUp() # Enabling collectives can be done in "setUpClass", but requires using # different collective_keys in different tests as collectives are reused # across tests. Always resetting collective ops before each test offers # better test isolation. global_mpr_1p.runner.run(enable_collective_ops) global_mpr_2p.runner.run(enable_collective_ops) def make_collective(self, num_processes, gpu_per_process, communication): """Returns collectives and other info to be used in tests. Args: num_processes: an integer indicating the number of processes that participate in the collective. gpu_per_process: number of GPUs (0 if no GPUs) used by each process. communication: one of `CollectiveCommunication`. Returns: A tuple of (collective, devices, task_id) where collective is an instance of `CollectiveAllReduce`, devices is a list of local devices (str) attached to the current process, and task_id is the id of the current task. """ cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver() devices = [ "/job:worker/replica:0/task:%d/device:CPU:0" % cluster_resolver.task_id ] if gpu_per_process > 0: devices = [ "/job:worker/replica:0/task:%d/device:GPU:%d" % (cluster_resolver.task_id, i) for i in range(gpu_per_process) ] group_size = num_processes * len(devices) collective = cross_device_ops_lib.CollectiveAllReduce( devices=devices, group_size=group_size, communication=communication) return collective, devices, cluster_resolver.task_id def as_list(self, value): """A utility to convert a `Mirrored`, `Tensor` or `IndexedSlices` to a list. The reason it exists is to provide a uniform view of the returned values of "reduce" calls, especially across tf.function boundaries. Returning `Mirrored` from a tf.function will only evaluate the primary value, which causes the collective ops on non-primary devices to be pruned and eventually leads to hanging. Args: value: the value to convert, can be one of `Mirrored`, `Tensor` and `IndexedSlices`. Returns: A list of `Tensor` or `IndexedSlices`. """ if isinstance(value, ops.Tensor): return [value] elif isinstance(value, indexed_slices.IndexedSlices): return [value] elif isinstance(value, value_lib.Mirrored): return value.values else: raise ValueError("unwrap: unsupported input type: %s" % type(value)) RunOptions = collections.namedtuple( # pylint: disable=invalid-name "RunOptions", [ "mode", # A list of str from ["eager", "func_graph"] "num_processes", "gpus_per_process", "reduce_op", "communication", ]) RunOptions.__new__.__defaults__ = (["eager", "func_graph"], 2, 0, ReduceOp.SUM, CollectiveCommunication.AUTO) def reduce_and_verify(self, inputs, expect, options): """Reduce the given `inputs` and verify the output matches `expect`. Args: inputs: a list of `Tensor` or `IndexedSlices`, where the i-th value will be fed to the i-th replica. expect: a `Tensor` or `IndexedSlices`. This should be the expected value for one replica. options: a `RunOptions` instance. 
""" def replica_fn(): collective, devices, pid = self.make_collective(options.num_processes, options.gpus_per_process, options.communication) def reduce_fn(): value_fn = lambda device_idx: inputs[pid * len(devices) + device_idx] per_replica_value = make_per_replica_value(value_fn, devices) reduced_values = collective.reduce(options.reduce_op, per_replica_value, per_replica_value) reduced_values = self.as_list(reduced_values) self.assertAllEqual(devices, [v.device for v in reduced_values]) return [ops.convert_to_tensor(v) for v in reduced_values] per_replica_expect = [ops.convert_to_tensor(expect)] * len(devices) if "eager" in options.mode: got = reduce_fn() self.assertAllClose(got, per_replica_expect) if "func_graph" in options.mode: got = def_function.function(reduce_fn)() self.assertAllClose(got, per_replica_expect) get_global_mpr(options.num_processes).run(replica_fn) def batch_reduce_and_verify(self, inputs, expect, options): """Batch reduce the given `inputs` and verify the output matches `expect`. Args: inputs: a 2-level nested list of `Tensor` or `IndexedSlices`, where i-th value will be fed to i-th replica. expect: a list of `Tensor` or `IndexedSlices`. This should be the expected value for one replica. options: a `RunOpotions` instance. """ def replica_fn(): collective, devices, pid = self.make_collective(options.num_processes, options.gpus_per_process, options.communication) def batch_reduce_fn(): batch_size = len(inputs[0]) value_dst_pairs = [] for i in range(batch_size): def value_fn(device_idx, idx=i): return inputs[pid * len(devices) + device_idx][idx] per_replica_value = make_per_replica_value(value_fn, devices) value_dst_pairs.append((per_replica_value, per_replica_value)) reduced_values = collective.batch_reduce(options.reduce_op, value_dst_pairs) reduced_values = [self.as_list(v) for v in reduced_values] for v in reduced_values: self.assertAllEqual(devices, [t.device for t in v]) return nest.map_structure(ops.convert_to_tensor, reduced_values) per_replica_expect = nest.map_structure( lambda x: [ops.convert_to_tensor(x)] * len(devices), expect) if "eager" in options.mode: got = batch_reduce_fn() self.assertAllClose(got, per_replica_expect) if "func_graph" in options.mode: got = def_function.function(batch_reduce_fn)() self.assertAllClose(got, per_replica_expect) get_global_mpr(options.num_processes).run(replica_fn) @combinations.generate( combinations.combine( num_processes=[1, 2], required_gpus=[0, 1, 2], communication=[ # NCCL is only used for batch reduce, so we are not including # NCCL combination here. CollectiveCommunication.AUTO, CollectiveCommunication.RING ], reduce_op=[ReduceOp.SUM, ReduceOp.MEAN])) def testAllReduceDense(self, num_processes, required_gpus, communication, reduce_op): options = self.RunOptions( num_processes=num_processes, gpus_per_process=required_gpus, reduce_op=reduce_op, communication=communication) group_size = options.num_processes * (options.gpus_per_process or 1) inputs_data = [1.0, 2.0, 3.0, 4.0] inputs = inputs_data[0:group_size] if group_size == 1: expect = 1.0 if group_size == 2: expect = 3.0 if reduce_op == ReduceOp.SUM else 1.5 elif group_size == 4: expect = 10.0 if reduce_op == ReduceOp.SUM else 2.5 self.reduce_and_verify(inputs, expect, options) @combinations.generate( combinations.combine( num_processes=[1, 2], required_gpus=[0, 1, 2], communication=[ # NCCL is only used for batch reduce, so we are not including # NCCL combination here. 
CollectiveCommunication.AUTO, CollectiveCommunication.RING ], # TODO(b/166682130): add MEAN reduce once the bug is fixed. reduce_op=ReduceOp.SUM)) def testAllReduceSparse(self, num_processes, required_gpus, communication, reduce_op): options = self.RunOptions( mode=["func_graph"], # Sparse reduce is not supported in eager. num_processes=num_processes, gpus_per_process=required_gpus, reduce_op=reduce_op, communication=communication) group_size = options.num_processes * (options.gpus_per_process or 1) inputs_data = [ indexed_slices.IndexedSlicesValue( values=[[1.], [2.]], indices=[0, 1], dense_shape=[10, 1]), indexed_slices.IndexedSlicesValue( values=[[3.], [4.]], indices=[1, 2], dense_shape=[10, 1]), indexed_slices.IndexedSlicesValue( values=[[5.], [6.]], indices=[7, 8], dense_shape=[10, 1]), indexed_slices.IndexedSlicesValue( values=[[7.], [8.]], indices=[3, 2], dense_shape=[10, 1]), ] inputs = inputs_data[0:group_size] if group_size == 1: expect = indexed_slices.IndexedSlices( values=[[1.], [2.]], indices=[0, 1], dense_shape=[10, 1]) elif group_size == 2: expect = indexed_slices.IndexedSlices( values=[[1.], [2.], [3.], [4.]], indices=[0, 1, 1, 2], dense_shape=[10, 1]) elif group_size == 4: expect = indexed_slices.IndexedSlices( values=[[1.], [2.], [3.], [4.], [5.], [6.], [7.], [8.]], indices=[0, 1, 1, 2, 7, 8, 3, 2], dense_shape=[10, 1]) self.reduce_and_verify(inputs, expect, options) def testAllReduceSparseVariableLength(self): # One device per process, 2 processes, 2 replicas in total. inputs = [ indexed_slices.IndexedSlicesValue( values=[[1.]], indices=[0], dense_shape=[10, 1]), indexed_slices.IndexedSlicesValue( values=[[2.], [3.], [4.]], indices=[0, 1, 2], dense_shape=[10, 1]), ] expect = indexed_slices.IndexedSlices( values=[[1.], [2.], [3.], [4.]], indices=[0, 0, 1, 2], dense_shape=[10, 1]) self.reduce_and_verify( inputs, expect, self.RunOptions( mode=["func_graph"], # Sparse reduce is not supported in eager. num_processes=2, reduce_op=ReduceOp.SUM)) @combinations.generate( combinations.combine( num_processes=[1, 2], required_gpus=[0, 1, 2], communication=[ CollectiveCommunication.AUTO, CollectiveCommunication.RING, CollectiveCommunication.NCCL ], reduce_op=[ReduceOp.SUM, ReduceOp.MEAN])) def testBatchAllReduceDense(self, num_processes, required_gpus, communication, reduce_op): if required_gpus == 0 and communication == CollectiveCommunication.NCCL: self.skipTest("Skip CPU + NCCL combination") if num_processes == 2 and communication == CollectiveCommunication.NCCL: self.skipTest("Skip NCCL + 2 processes combination. NCCL requires " "physical GPUs for every process.") options = self.RunOptions( num_processes=num_processes, gpus_per_process=required_gpus, reduce_op=reduce_op, communication=communication) group_size = options.num_processes * (options.gpus_per_process or 1) inputs_data = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]] inputs = inputs_data[0:group_size] if group_size == 1: expect = [1.0, 2.0] if group_size == 2: expect = [4.0, 6.0] if reduce_op == ReduceOp.SUM else [2.0, 3.0] elif group_size == 4: expect = [16.0, 20.0] if reduce_op == ReduceOp.SUM else [4.0, 5.0] self.batch_reduce_and_verify(inputs, expect, options) @combinations.generate( combinations.combine( num_processes=[1, 2], required_gpus=[0, 1, 2], communication=[ CollectiveCommunication.AUTO, CollectiveCommunication.RING, CollectiveCommunication.NCCL, ], # TODO(b/166682130): add MEAN reduce once the bug is fixed. 
          reduce_op=ReduceOp.SUM))
  def testBatchAllReduceSparse(self, num_processes, required_gpus,
                               communication, reduce_op):
    if required_gpus == 0 and communication == CollectiveCommunication.NCCL:
      self.skipTest("Skip CPU + NCCL combination")
    if num_processes == 2 and communication == CollectiveCommunication.NCCL:
      self.skipTest("Skip NCCL + 2 processes combination. NCCL requires "
                    "physical GPUs for every process.")

    options = self.RunOptions(
        mode=["func_graph"],  # Sparse reduce is not supported in eager.
        num_processes=num_processes,
        gpus_per_process=required_gpus,
        reduce_op=reduce_op,
        communication=communication)
    group_size = options.num_processes * (options.gpus_per_process or 1)

    inputs_data = ([
        indexed_slices.IndexedSlicesValue(
            values=[[1.], [2.]], indices=[0, 1], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[3.], [4.]], indices=[1, 2], dense_shape=[5, 1])
    ], [
        indexed_slices.IndexedSlicesValue(
            values=[[5.], [6.]], indices=[1, 2], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[7.], [8.]], indices=[0, 1], dense_shape=[5, 1])
    ], [
        indexed_slices.IndexedSlicesValue(
            values=[[9.], [10.]], indices=[3, 4], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[11.], [12.]], indices=[3, 4], dense_shape=[5, 1])
    ], [
        indexed_slices.IndexedSlicesValue(
            values=[[13.], [14.]], indices=[8, 9], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[15.], [16.]], indices=[3, 4], dense_shape=[5, 1])
    ])
    inputs = inputs_data[0:group_size]

    if group_size == 1:
      expect = [
          indexed_slices.IndexedSlices(
              values=[[1.], [2.]], indices=[0, 1], dense_shape=[10, 1]),
          indexed_slices.IndexedSlices(
              values=[[3.], [4.]], indices=[1, 2], dense_shape=[5, 1])
      ]
    elif group_size == 2:
      expect = [
          indexed_slices.IndexedSlices(
              values=[[1.], [2.], [5.], [6.]],
              indices=[0, 1, 1, 2],
              dense_shape=[10, 1]),
          indexed_slices.IndexedSlices(
              values=[[3.], [4.], [7.], [8.]],
              indices=[1, 2, 0, 1],
              dense_shape=[5, 1])
      ]
    elif group_size == 4:
      expect = [
          indexed_slices.IndexedSlices(
              values=[[1.], [2.], [5.], [6.], [9.], [10.], [13.], [14.]],
              indices=[0, 1, 1, 2, 3, 4, 8, 9],
              dense_shape=[10, 1]),
          indexed_slices.IndexedSlices(
              values=[[3.], [4.], [7.], [8.], [11.], [12.], [15.], [16.]],
              indices=[1, 2, 0, 1, 3, 4, 3, 4],
              dense_shape=[5, 1])
      ]

    self.batch_reduce_and_verify(inputs, expect, options)

  @combinations.generate(
      combinations.combine(
          num_processes=[1, 2],
          required_gpus=[0, 1, 2],
          axis=[0, 1, 2],
          func_mode=["eager", "func_graph"],
          communication=[
              CollectiveCommunication.NCCL, CollectiveCommunication.AUTO,
              CollectiveCommunication.RING
          ]))
  def testAllGatherSameShape(self, num_processes, required_gpus, communication,
                             func_mode, axis):

    def replica_fn():
      collective, devices, _ = self.make_collective(num_processes,
                                                    required_gpus,
                                                    communication)
      value = constant_op.constant([[[1, 2], [1, 2]]], dtype=dtypes.float32)

      def gather_fn():
        value_fn = lambda device_idx: value
        per_replica_value = make_per_replica_value(value_fn, devices)
        gathered_values = collective._gather(
            per_replica_value, per_replica_value, axis=axis)
        gathered_values = self.as_list(gathered_values)
        self.assertAllEqual(devices, [v.device for v in gathered_values])
        return [ops.convert_to_tensor(v) for v in gathered_values]

      group_size = num_processes * (required_gpus or 1)
      expect = array_ops.concat([value] * group_size, axis=axis)
      per_replica_expect = [ops.convert_to_tensor(expect)] * len(devices)

      if func_mode == "eager":
        result = gather_fn()
        self.assertAllClose(result, per_replica_expect)
      if func_mode == "func_graph":
        result = def_function.function(gather_fn)()
        self.assertAllClose(result, per_replica_expect)

    get_global_mpr(num_processes).run(replica_fn)
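  # Illustrative note (a sketch, not part of the original tests): the verify
  # helpers above broadcast `expect` to every local device, so with
  # group_size=2 and ReduceOp.SUM, reduce_and_verify([1.0, 2.0], 3.0, options)
  # checks that every replica ends up holding 3.0. For the all-gather test,
  # the gathered axis grows by a factor of group_size: a [1, 2, 2] `value`
  # gathered by two replicas has shape [2, 2, 2] for axis=0, [1, 4, 2] for
  # axis=1, and [1, 2, 4] for axis=2, matching
  # array_ops.concat([value] * 2, axis=axis).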
def all_strategy_and_input_config_combinations():
  return (combinations.times(
      combinations.combine(
          distribution=all_strategies, cloning=[True, False]),
      eager_mode_test_configuration() + graph_mode_test_configuration()))
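# Illustrative note (a sketch, not part of the original code): because
# `combinations.times` forms a cross product, the function above yields
# len(all_strategies) * 2 (for `cloning`) * the number of eager- plus
# graph-mode configurations, each entry carrying one value per argument.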
    _, replica_local = _make_replica_local(
        variable_scope.VariableAggregation.SUM)
    converted = ops.internal_convert_to_tensor(replica_local, as_ref=False)
    self.assertIsInstance(converted, ops.Tensor)
    self.assertEqual(converted.dtype, replica_local.dtype)

    converted = ops.internal_convert_to_tensor(replica_local, as_ref=True)
    # Resource variables are converted to tensors as well when as_ref is True.
    self.assertIsInstance(converted, ops.Tensor)
    self.assertEqual(converted.dtype, replica_local.dtype)


@combinations.generate(
    combinations.combine(
        distribution=[
            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
        ],
        mode=["graph", "eager"]))
class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase):

  def _assign_replica_local(self, devices, v, new):
    for d, var, n in zip(devices, v, new):
      with ops.device(d):
        self.evaluate(var.assign(n))

  def _save_return_saver(self, sess, var):
    saver = saver_lib.Saver(var_list=[var])
    test_dir = self.get_temp_dir()
    prefix = os.path.join(test_dir, "ckpt")
    return saver.save(sess, prefix), saver
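  # Hypothetical usage sketch for the helpers above (values are made up, not
  # taken from one of the tests): given the per-device components `v` of a
  # SyncOnRead variable on `devices` and a session `sess`,
  #   self._assign_replica_local(devices, v, [3., 4.])
  #   save_path, saver = self._save_return_saver(sess, v)
  #   saver.restore(sess, save_path)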
class KerasModelsTest(test.TestCase, parameterized.TestCase):

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.all_strategies, mode=["eager"]))
  def test_single_keras_layer_experimental_run(self, distribution):
    dataset = self._get_dataset()
    input_iterator = iter(
        distribution.experimental_distribute_dataset(dataset))

    with distribution.scope():
      model = keras.layers.Dense(4, name="dense")

    @def_function.function
    def train_step(iterator):

      def step_fn(inputs):
        images, targets = inputs
        with backprop.GradientTape() as tape:
          outputs = model(images)
          loss = math_ops.reduce_sum(outputs - targets)
        grads = tape.gradient(loss, model.variables)
        return grads

      outputs = distribution.experimental_run_v2(
          step_fn, args=(next(iterator),))
      return nest.map_structure(distribution.experimental_local_results,
                                outputs)

    train_step(input_iterator)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.all_strategies, mode=["eager"]))
  def test_keras_model_creation_experimental_run(self, distribution):
    dataset = self._get_dataset()
    input_iterator = iter(
        distribution.experimental_distribute_dataset(dataset))

    with distribution.scope():
      model = self._get_model()

    @def_function.function
    def train_step(iterator):

      def step_fn(inputs):
        images, targets = inputs
        with backprop.GradientTape() as tape:
          outputs = model(images)
          loss = math_ops.reduce_sum(outputs - targets)
        grads = tape.gradient(loss, model.variables)
        return grads

      outputs = distribution.experimental_run_v2(
          step_fn, args=(next(iterator),))
      return nest.map_structure(distribution.experimental_local_results,
                                outputs)

    train_step(input_iterator)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.all_strategies, mode=["eager"]))
  def test_keras_model_optimizer_experimental_run(self, distribution):
    dataset = self._get_dataset()
    input_iterator = iter(
        distribution.experimental_distribute_dataset(dataset))

    with distribution.scope():
      model = self._get_model()
      optimizer = keras.optimizer_v2.rmsprop.RMSprop()

    @def_function.function
    def train_step(iterator):

      def step_fn(inputs):
        images, targets = inputs
        with backprop.GradientTape() as tape:
          outputs = model(images)
          loss = math_ops.reduce_sum(outputs - targets)
        grads = tape.gradient(loss, model.variables)
        optimizer.apply_gradients(zip(grads, model.variables))
        return loss

      outputs = distribution.experimental_run_v2(
          step_fn, args=(next(iterator),))
      return nest.map_structure(distribution.experimental_local_results,
                                outputs)

    train_step(input_iterator)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.all_strategies, mode=["eager"]))
  def test_keras_subclass_model_optimizer_experimental_run(self, distribution):

    def get_subclass_model():

      class KerasSubclassModel(keras.Model):

        def __init__(self):
          super(KerasSubclassModel, self).__init__()
          self.l = keras.layers.Dense(4, name="dense")

        def call(self, x):
          return self.l(x)

      return KerasSubclassModel()

    dataset = self._get_dataset()
    input_iterator = iter(
        distribution.experimental_distribute_dataset(dataset))

    with distribution.scope():
      model = get_subclass_model()
      optimizer = keras.optimizer_v2.rmsprop.RMSprop()

    @def_function.function
    def train_step(iterator):

      def step_fn(inputs):
        images, targets = inputs
        with backprop.GradientTape() as tape:
          outputs = model(images)
          loss = math_ops.reduce_sum(outputs - targets)
        grads = tape.gradient(loss, model.variables)
        optimizer.apply_gradients(zip(grads, model.variables))
        return loss

      outputs = distribution.experimental_run_v2(
          step_fn, args=(next(iterator),))
      return nest.map_structure(distribution.experimental_local_results,
                                outputs)

    train_step(input_iterator)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.all_strategies, mode=["eager"]))
  def test_keras_model_optimizer_experimental_run_loop(self, distribution):
    dataset = self._get_dataset()
    input_iterator = iter(
        distribution.experimental_distribute_dataset(dataset))

    with distribution.scope():
      model = self._get_model()
      optimizer = keras.optimizer_v2.rmsprop.RMSprop()

    @def_function.function
    def train_step(iterator):

      def step_fn(inputs):
        images, targets = inputs
        with backprop.GradientTape() as tape:
          outputs = model(images)
          loss = math_ops.reduce_sum(outputs - targets)
        grads = tape.gradient(loss, model.variables)
        optimizer.apply_gradients(zip(grads, model.variables))
        return loss

      for _ in range(5):
        distribution.experimental_run_v2(step_fn, args=(next(iterator),))

    train_step(input_iterator)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.all_strategies, mode=["eager"]))
  def test_lstm(self, distribution):
    batch_size = 32

    def create_lstm_model():
      model = keras.models.Sequential()
      # We only have LSTM variables, so we can detect no-gradient issues more
      # easily.
      model.add(
          keras.layers.LSTM(1, return_sequences=False, input_shape=(10, 1)))
      return model

    def create_lstm_data():
      seq_length = 10
      x_train = np.random.rand(batch_size, seq_length, 1).astype("float32")
      y_train = np.random.rand(batch_size, 1).astype("float32")
      return x_train, y_train

    x, y = create_lstm_data()
    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
    dataset = dataset.batch(batch_size, drop_remainder=True)
    input_iterator = iter(
        distribution.experimental_distribute_dataset(dataset))

    with distribution.scope():
      model = create_lstm_model()
      optimizer = keras.optimizer_v2.gradient_descent.SGD()

    @def_function.function
    def train_step(input_iterator):

      def step_fn(inputs):
        inps, targ = inputs
        with backprop.GradientTape() as tape:
          output = model(inps)
          loss = math_ops.reduce_mean(
              keras.losses.binary_crossentropy(
                  y_true=targ, y_pred=output, from_logits=False))
        grads = tape.gradient(loss, model.variables)
        optimizer.apply_gradients(zip(grads, model.variables))
        return loss

      outputs = distribution.experimental_run_v2(
          step_fn, args=(next(input_iterator),))
      return distribution.experimental_local_results(outputs)

    train_step(input_iterator)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.all_strategies, mode=["eager"]))
  def test_nested_tf_functions(self, distribution):
    # This test builds two computations with Keras layers, one with a nested
    # tf.function and the other without. We run both computations
    # independently on models with the same weights, and make sure the
    # variables are still the same after one training step.
    inputs = np.random.random((10, 3)).astype(np.float32)
    targets = np.ones((10, 4), dtype=np.float32)
    dataset = dataset_ops.Dataset.from_tensor_slices(
        (inputs, targets)).repeat()
    dataset = dataset.batch(10, drop_remainder=True)
    input_iterator = iter(
        distribution.experimental_distribute_dataset(dataset))

    def get_model():
      x = keras.layers.Input(shape=(3,), name="input")
      y = keras.layers.Dense(4, name="dense")(x)
      model = keras.Model(x, y)
      return model

    with distribution.scope():
      model = get_model()
      optimizer = keras.optimizer_v2.gradient_descent.SGD(0.1, momentum=0.01)
      weights_file = os.path.join(self.get_temp_dir(), ".h5")
      model.save_weights(weights_file)
      model2 = get_model()
      model2.load_weights(weights_file)

    # Make sure model and model2 variables are in sync when initialized.
    for model_v, model2_v in zip(model.variables, model2.variables):
      self.assertAllClose(model_v.numpy(), model2_v.numpy())

    def compute_loss(images, targets):
      outputs = model(images)
      return math_ops.reduce_sum(outputs - targets)

    @def_function.function
    def train_step_without_nested_tf_function(inputs):

      def step_fn(inputs):
        images, targets = inputs
        with backprop.GradientTape() as tape:
          loss = compute_loss(images, targets)
        grads = tape.gradient(loss, model.variables)
        optimizer.apply_gradients(zip(grads, model.variables))

      distribution.experimental_run_v2(step_fn, args=(inputs,))

    @def_function.function
    def compute_loss2(images, targets):
      outputs = model2(images)
      return math_ops.reduce_sum(outputs - targets)

    @def_function.function
    def train_step_with_nested_tf_function(inputs):

      def step_fn(inputs):
        images, targets = inputs
        with backprop.GradientTape() as tape:
          loss = compute_loss2(images, targets)
        grads = tape.gradient(loss, model2.variables)
        optimizer.apply_gradients(zip(grads, model2.variables))

      distribution.experimental_run_v2(step_fn, args=(inputs,))

    inputs = next(input_iterator)

    train_step_without_nested_tf_function(inputs)
    train_step_with_nested_tf_function(inputs)

    # Make sure model and model2 variables are still in sync.
    for model_v, model2_v in zip(model.variables, model2.variables):
      self.assertAllClose(model_v.numpy(), model2_v.numpy())

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.all_strategies, mode=["eager"]))
  def test_customized_tf_module_experimental_run(self, distribution):
    dataset = self._get_dataset()
    input_iterator = iter(
        distribution.experimental_distribute_dataset(dataset))

    with distribution.scope():
      model = CustomModel()

    @def_function.function
    def train_step(iterator):

      def step_fn(inputs):
        images, targets = inputs
        with backprop.GradientTape() as tape:
          outputs = model(images)
          loss = math_ops.reduce_sum(outputs - targets)
        grads = tape.gradient(loss, model.variables)
        return grads

      outputs = distribution.experimental_run_v2(
          step_fn, args=(next(iterator),))
      return nest.map_structure(distribution.experimental_local_results,
                                outputs)

    train_step(input_iterator)

  def _get_dataset(self):
    inputs = np.zeros((10, 3), dtype=np.float32)
    targets = np.zeros((10, 4), dtype=np.float32)
    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
    dataset = dataset.repeat(100)
    dataset = dataset.batch(10, drop_remainder=True)
    return dataset

  def _get_model(self):
    x = keras.layers.Input(shape=(3,), name="input")
    y = keras.layers.Dense(4, name="dense")(x)
    model = keras.Model(x, y)
    return model
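  # Note on the tests above (descriptive, not part of the original file): they
  # share one pattern: create the model and optimizer under
  # `distribution.scope()`, express the per-replica computation as a
  # `step_fn`, dispatch it with `distribution.experimental_run_v2` inside a
  # `def_function.function`, and unpack per-replica outputs with
  # `experimental_local_results`.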
class DistributedValuesTest(test.TestCase, parameterized.TestCase):

  @combinations.generate(
      combinations.combine(
          distribution=(strategy_combinations.all_strategies_minus_default +
                        strategy_combinations.multiworker_strategies),
          mode=["eager"]))
  def testMakeDistributedValueFromTensor(self, distribution):
    if not tf2.enabled():
      self.skipTest("Only V2 is supported.")
    single_value = constant_op.constant(1)

    def value_fn(ctx):
      del ctx
      return single_value

    distributed_values = (
        distribution.experimental_distribute_values_from_function(value_fn))
    self.assertAllEqual(
        ds_test_util.gather(distribution, distributed_values),
        constant_op.constant(1., shape=(distribution.num_replicas_in_sync,)))

  @combinations.generate(
      combinations.combine(
          distribution=(strategy_combinations.all_strategies_minus_default +
                        strategy_combinations.multiworker_strategies),
          mode=["eager"]))
  def testMakeDistributedValueSingleNumpyArrayConstant(self, distribution):
    if not tf2.enabled():
      self.skipTest("Only V2 is supported.")
    array_value = np.array([1., 2., 3.])

    def value_fn(ctx):
      del ctx
      return array_value

    distributed_values = (
        distribution.experimental_distribute_values_from_function(value_fn))
    self.assertAllEqual(
        ds_test_util.gather(distribution, distributed_values).numpy(),
        [[1., 2., 3.]] * distribution.num_replicas_in_sync)

  @combinations.generate(
      combinations.combine(
          distribution=(strategy_combinations.all_strategies_minus_default +
                        strategy_combinations.multiworker_strategies),
          mode=["eager"]))
  def testMakeDistributedValueTupleConstant(self, distribution):
    if not tf2.enabled():
      self.skipTest("Only V2 is supported.")
    tuple_value = (1., 2., 3.)

    def value_fn(ctx):
      del ctx
      return tuple_value

    distributed_values = (
        distribution.experimental_distribute_values_from_function(value_fn))
    distributed_values = ds_test_util.gather(distribution, distributed_values)

    # Expected output for 2 replicas:
    # ([1.0, 1.0], [2.0, 2.0], [3.0, 3.0])
    expected = tuple([v for i in range(distribution.num_replicas_in_sync)]
                     for v in tuple_value)
    self.assertAllEqual(distributed_values, expected)

  @combinations.generate(
      combinations.combine(
          distribution=(strategy_combinations.all_strategies_minus_default +
                        strategy_combinations.multiworker_strategies),
          mode=["eager"]))
  def testMakeDistributedValueNestedStructurePerReplica(self, distribution):
    if not tf2.enabled():
      self.skipTest("Only V2 is supported.")
    tuple_value = (1., 2., 3.)

    def value_fn(ctx):
      per_replica = []
      for val in tuple_value:
        per_replica.append(val * ctx.replica_id_in_sync_group)
      return tuple(per_replica)

    distributed_values = (
        distribution.experimental_distribute_values_from_function(value_fn))
    distributed_values = ds_test_util.gather(distribution, distributed_values)

    # Expected output for 2 replicas:
    # ([0.0, 1.0], [0.0, 2.0], [0.0, 3.0])
    expected = tuple(
        [v * i for i in range(distribution.num_replicas_in_sync)]
        for v in tuple_value)
    self.assertAllEqual(distributed_values, expected)

  # NOTE(priyag): Cannot test this with MultiWorkerMirroredStrategy because
  # collective ops do not support SparseTensors.
  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.all_strategies_minus_default,
          mode=["eager"]))
  def testMakeDistributedValueSparseTensor(self, distribution):
    if not tf2.enabled():
      self.skipTest("Only V2 is supported.")

    def value_fn(ctx):
      del ctx
      return sparse_tensor.SparseTensor(
          indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])

    distributed_values = (
        distribution.experimental_distribute_values_from_function(value_fn))
    local_results = distribution.experimental_local_results(distributed_values)
    for i in range(distribution.num_replicas_in_sync):
      self.assertAllEqual(
          sparse_ops.sparse_tensor_to_dense(local_results[i]),
          [[1, 0, 0, 0], [0, 0, 2, 0], [0, 0, 0, 0]])

  @combinations.generate(
      combinations.combine(
          distribution=(strategy_combinations.all_strategies_minus_default +
                        strategy_combinations.multiworker_strategies),
          mode=["eager"]))
  def testMakeDistributedValueExtractFromArray(self, distribution):
    if not tf2.enabled():
      self.skipTest("Only V2 is supported.")
    multiple_values = range(distribution.num_replicas_in_sync)

    def value_fn(ctx):
      return multiple_values[ctx.replica_id_in_sync_group]

    distributed_values = (
        distribution.experimental_distribute_values_from_function(value_fn))
    distributed_values = ds_test_util.gather(distribution, distributed_values)
    expected = range(distribution.num_replicas_in_sync)
    self.assertAllEqual(distributed_values, expected)

  @combinations.generate(
      combinations.combine(
          distribution=(strategy_combinations.all_strategies_minus_default +
                        strategy_combinations.multiworker_strategies),
          mode=["eager"]))
  def testMakeDistributedValueAndRun(self, distribution):
    if not tf2.enabled():
      self.skipTest("Only V2 is supported.")

    @def_function.function
    def run():
      multiple_values = range(distribution.num_replicas_in_sync)

      def value_fn(ctx):
        return multiple_values[ctx.replica_id_in_sync_group]

      distributed_values = (
          distribution.experimental_distribute_values_from_function(value_fn))

      def computation(x):
        return math_ops.square(x)

      outputs = ds_test_util.gather(
          distribution,
          distribution.run(computation, args=(distributed_values,)))
      return outputs

    results = run()

    expected = [i**2 for i in range(distribution.num_replicas_in_sync)]
    self.assertAllEqual(results, expected)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
              strategy_combinations
              .mirrored_strategy_with_two_gpus_no_merge_call,
              strategy_combinations.tpu_strategy,
              strategy_combinations.tpu_strategy_packed_var,
              strategy_combinations.central_storage_strategy_with_two_gpus,
          ] + strategy_combinations.multiworker_strategies,
          mode=["eager"]))
  def testMakeDistributedValueDefaultDevicePlacement(self, distribution):
    if not tf2.enabled():
      self.skipTest("Only V2 is supported.")

    def value_fn(ctx):
      del ctx
      return constant_op.constant(1.0)

    distributed_values = (
        distribution.experimental_distribute_values_from_function(value_fn))
    default_device = array_ops.identity(constant_op.constant(1.0)).device
    for i in range(len(distribution.extended.worker_devices)):
      self.assertAllEqual(distributed_values._values[i].device, default_device)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
              strategy_combinations
              .mirrored_strategy_with_two_gpus_no_merge_call,
              strategy_combinations.tpu_strategy,
              strategy_combinations.tpu_strategy_packed_var,
              strategy_combinations.central_storage_strategy_with_two_gpus,
          ] + strategy_combinations.multiworker_strategies,
          mode=["eager"],
          op_type=[constant_op.constant, array_ops.identity]))
  def testMakeDistributedValueExplicitDevicePlacement(self, distribution,
                                                      op_type):
    if not tf2.enabled():
      self.skipTest("Only V2 is supported.")
    worker_devices = distribution.extended.worker_devices

    def value_fn(ctx):
      # In a multi-client setup, `worker_devices` contains only the devices
      # local to this worker.
      worker_device_id = ctx.replica_id_in_sync_group % len(worker_devices)
      with ops.device(worker_devices[worker_device_id]):
        return op_type(1.0)

    distributed_values = (
        distribution.experimental_distribute_values_from_function(value_fn))
    for i in range(len(distribution.extended.worker_devices)):
      self.assertAllEqual(distributed_values._values[i].device,
                          worker_devices[i])
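  # Note on the explicit-placement test above (descriptive, not part of the
  # original file): `value_fn` pins replica i's tensor to local device
  # i % len(worker_devices), so in a single-client run component i of the
  # distributed value lives on worker device i, which is what the final
  # assertions verify.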