def test_sequential_experimental_runs(self):
    """Feeds the result of an all-core run into a single-core run."""
    resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(resolver)
    topology = tpu_strategy_util.initialize_tpu_system(resolver)

    # Two-replica strategy: computation replicated to all cores.
    all_cores_assignment = device_assignment_lib.DeviceAssignment.build(
        topology, num_replicas=2)
    all_cores_strategy = tpu_lib.TPUStrategy(
        resolver, device_assignment=all_cores_assignment)

    # Single-replica strategy pinned to the first core.
    single_core_assignment = device_assignment_lib.DeviceAssignment.build(
        topology, num_replicas=1)
    single_core_strategy = tpu_lib.TPUStrategy(
        resolver, device_assignment=single_core_assignment)

    def computation(x):
        return math_ops.square(x)

    @def_function.function
    def train_step():
        # Run on every core first, then chain replica 0's local result into
        # the single-core strategy.
        squared = all_cores_strategy.experimental_local_results(
            all_cores_strategy.run(computation, args=([2., 2.],)))
        return single_core_strategy.run(computation, args=([squared[0]],))

    # square(square(2)) == 16 for both elements of the input vector.
    self.assertAllEqual([[16., 16.]], train_step())
def test_worker_devices_on_subset_cores(self, enable_packed_var):
    """Checks worker_devices for strategies restricted to a single core each."""
    resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(resolver)
    topology = tpu_strategy_util.initialize_tpu_system(resolver)

    # One strategy pinned to the first core via the build() helper...
    assignment_core0 = device_assignment_lib.DeviceAssignment.build(
        topology, num_replicas=1)
    strategy_core0 = tpu_lib.TPUStrategy(
        resolver, device_assignment=assignment_core0)
    strategy_core0._enable_packed_variable_in_eager_mode = enable_packed_var

    # ...and one pinned to the second core via an explicit core assignment.
    assignment_core1 = device_assignment_lib.DeviceAssignment(
        topology, [[[0, 0, 0, 1]]])
    strategy_core1 = tpu_lib.TPUStrategy(
        resolver, device_assignment=assignment_core1)
    strategy_core1._enable_packed_variable_in_eager_mode = enable_packed_var

    # Each strategy should see exactly one worker device, on its own core.
    self.assertLen(strategy_core0.extended.worker_devices, 1)
    self.assertEndsWith(strategy_core0.extended.worker_devices[0],
                        "device:TPU:0")
    self.assertLen(strategy_core1.extended.worker_devices, 1)
    self.assertEndsWith(strategy_core1.extended.worker_devices[0],
                        "device:TPU:1")
def test_computation_on_subset_cores(self):
    """Runs the same step on all cores and on each single core in one tf.function."""
    resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(resolver)
    topology = tpu_strategy_util.initialize_tpu_system(resolver)

    all_core_strategy = tpu_lib.TPUStrategy(resolver)
    with all_core_strategy.scope():
        v = variables.Variable(
            0.0, aggregation=variables.VariableAggregation.MEAN)

    # Strategy restricted to the first core.
    first_core_strategy = tpu_lib.TPUStrategy(
        resolver,
        device_assignment=device_assignment_lib.DeviceAssignment.build(
            topology, num_replicas=1))

    # Strategy restricted to the second core.
    second_core_strategy = tpu_lib.TPUStrategy(
        resolver,
        device_assignment=device_assignment_lib.DeviceAssignment(
            topology, [[[0, 0, 0, 1]]]))

    @def_function.function
    def train_step():
        def step_fn():
            return v + 1.0

        all_core_strategy.run(step_fn)
        return (first_core_strategy.run(step_fn) +
                second_core_strategy.run(step_fn))

    # First call traces/compiles; assert on the second invocation.
    train_step()
    self.assertAllEqual(2., train_step())
def setUp(self):
    """Builds two TPU mid-level embedding objects plus a CPU mirror for checkpoint tests."""
    super(TPUEmbeddingCheckpointTest, self).setUp()
    self.resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project)
    remote.connect_to_cluster(self.resolver)
    tpu_strategy_util.initialize_tpu_system(self.resolver)
    self.strategy = tpu_strategy.TPUStrategy(self.resolver)
    # One table row per replica, so every replica owns exactly one row.
    self.num_rows = self.strategy.num_replicas_in_sync

    # These tests use two mid level API objects, initialized with different
    # values. These have the same sizes.
    with self.strategy.scope():
        self.first_mid_level_contents = np.ones((self.num_rows, 4))
        self.first_mid_level_optimizer = tpu_embedding_v2_utils.SGD(
            learning_rate=0.1)
        self.first_mid_level = self.build_mid_level(
            self.first_mid_level_contents, self.first_mid_level_optimizer)

        self.second_mid_level_contents = np.ones((self.num_rows, 4)) * 2
        self.second_mid_level_optimizer = tpu_embedding_v2_utils.SGD(
            learning_rate=0.1)
        # initialize_tpu_embedding=False: the TPU embedding engine was already
        # configured when the first object was built.
        # NOTE(review): presumed intent — confirm against build_mid_level.
        self.second_mid_level = self.build_mid_level(
            self.second_mid_level_contents,
            self.second_mid_level_optimizer,
            initialize_tpu_embedding=False)

    # CPU-side object sharing the second object's initial contents, built
    # outside the strategy scope.
    self.cpu_mid_level_optimizer = tpu_embedding_v2_utils.SGD(
        learning_rate=0.1)
    self.cpu_mid_level = self.build_mid_level(
        self.second_mid_level_contents, self.cpu_mid_level_optimizer)
def testSummaryWithCustomTrainingLoop(self):
    """Writes scalar summaries from inside strategy.run in a custom loop."""
    resolver = tpu_cluster_resolver.TPUClusterResolver('')
    tpu_strategy_util.initialize_tpu_system(resolver)
    strategy = tpu_strategy_lib.TPUStrategy(resolver)

    with strategy.scope():
        model = distribute_strategy_test.get_model()
        model.compile('sgd', 'mse')

    writer = summary_ops_v2.create_file_writer_v2(self.summary_dir)

    @def_function.function
    def custom_function(dataset):
        def _custom_step(features, labels):
            del labels
            logits = model(features)
            with summary_ops_v2.always_record_summaries(), writer.as_default():
                summary_ops_v2.scalar(
                    'logits', logits, step=model.optimizer.iterations)
            return logits

        iterator = iter(dataset)
        # next(iterator) yields a (features, labels) pair; the surrounding
        # parentheses are NOT a one-tuple, so the pair itself becomes the two
        # positional args of _custom_step.
        return strategy.unwrap(
            strategy.run(_custom_step, args=(next(iterator))))

    dataset = strategy.experimental_distribute_dataset(
        distribute_strategy_test.get_dataset(strategy))
    custom_function(dataset)
def testEagerTPUDistributionStrategy(self):
    """Checkpoints and restores a TPUStrategy-trained model across simulated restarts."""
    # Disabled; see the referenced bug.
    self.skipTest("b/121387144")
    num_training_steps = 10
    checkpoint_directory = self.get_temp_dir()
    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")

    def _train_fn(optimizer, model):
        input_value = constant_op.constant([[3.]])
        # `root` is resolved from the enclosing scope at call time (late
        # binding); it is rebound in the loop below before this runs.
        optimizer.minimize(functools.partial(model, input_value),
                           global_step=root.optimizer_step)

    # Simulate three process restarts; each restores the latest checkpoint.
    for training_continuation in range(3):
        # NOTE(review): constructed without a resolver — presumably relies on
        # default TPU discovery; confirm for this TF version.
        strategy = tpu_strategy.TPUStrategy()
        with strategy.scope():
            model = Subclassed()
            optimizer = adam_v1.AdamOptimizer(0.001)
            root = trackable_utils.Checkpoint(
                optimizer=optimizer, model=model,
                optimizer_step=training_util.get_or_create_global_step())
            root.restore(
                checkpoint_management.latest_checkpoint(
                    checkpoint_directory))
            for _ in range(num_training_steps):
                strategy.extended.call_for_each_replica(
                    functools.partial(_train_fn, optimizer, model))
            root.save(file_prefix=checkpoint_prefix)
            # The global step must accumulate across "restarts".
            self.assertEqual(
                (training_continuation + 1) * num_training_steps,
                root.optimizer_step.numpy())
def _create_tpu_strategy(): global _did_connect_to_cluster # These flags will be defined by tpu_test_wrapper.py. resolver = tpu_cluster_resolver.TPUClusterResolver( tpu=hasattr(FLAGS, "tpu") and FLAGS.tpu or "", zone=hasattr(FLAGS, "zone") and FLAGS.zone or None, project=hasattr(FLAGS, "project") and FLAGS.project or None, ) # Only connect once per process, rather than per test method. if hasattr(FLAGS, "tpu") and FLAGS.tpu and not _did_connect_to_cluster: remote.connect_to_cluster(resolver) _did_connect_to_cluster = True topology = tpu_strategy_util.initialize_tpu_system(resolver) device_assignment = None if use_single_core: device_assignment = device_assignment_lib.DeviceAssignment( topology, core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT) # Steps per run is only supported in TF 1.x if tf2.enabled(): return tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs) else: return tpu_lib.TPUStrategyV1(resolver, steps_per_run, device_assignment, **kwargs)
def get_tpu_strategy(enable_packed_var=False):
    """Connects to the TPU cluster and returns a freshly initialized TPUStrategy."""
    cluster_resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(cluster_resolver)
    tpu_strategy_util.initialize_tpu_system(cluster_resolver)
    result = tpu_lib.TPUStrategy(cluster_resolver)
    # Toggle packed-variable support (private knob used by these tests).
    result._enable_packed_variable_in_eager_mode = enable_packed_var
    return result
def _get_strategy(self):
    """Initializes the TPU system and returns a strategy, recording the replica count."""
    self.resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project)
    remote.connect_to_cluster(self.resolver)
    tpu_strategy_util.initialize_tpu_system(self.resolver)
    result = tpu_strategy.TPUStrategy(self.resolver)
    # Remember the replica count for later assertions.
    self.num_replicas = result.num_replicas_in_sync
    return result
def _get_strategy(self):
    """Pins the TPU software to the nightly build, then returns a TPUStrategy."""
    self.resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project)
    # Force a nightly TPU runtime before connecting. This pokes a private
    # client attribute, so guard on its presence.
    if hasattr(self.resolver, '_cloud_tpu_client'):
        client = self.resolver._cloud_tpu_client
        client.configure_tpu_version(version='nightly', restart_type='always')
    remote.connect_to_cluster(self.resolver)
    tpu_strategy_util.initialize_tpu_system(self.resolver)
    return tpu_strategy.TPUStrategy(self.resolver)
def get_tpu_strategy():
    """Resolves the TPU from flags, initializes it, and returns a TPUStrategy."""
    cluster = tpu_cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu,
        zone=FLAGS.zone,
        project=FLAGS.project,
    )
    remote.connect_to_cluster(cluster)
    tpu_strategy_util.initialize_tpu_system(cluster)
    return tpu_lib.TPUStrategy(cluster)
def get_strategy():
    """Connects to the TPU at $TPU_IP and returns a single-replica TPUStrategy.

    Returns:
      A TPUStrategy whose device assignment pins the computation to one
      replica occupying a single core (computation_shape [1, 1, 1, 1]).

    Raises:
      KeyError: if the TPU_IP environment variable is not set.
    """
    resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu="grpc://" + os.environ["TPU_IP"])
    remote.connect_to_cluster(resolver)
    topology = tpu_strategy_util.initialize_tpu_system(resolver)
    print("Device coordinates: ", topology.device_coordinates)
    # Fix: `tf.python` is not a public attribute of the installed tensorflow
    # package, so `tf.python.tpu.device_assignment...` fails with
    # AttributeError. Use the public DeviceAssignment API, which has the
    # same build() signature.
    device_assignment = tf.tpu.experimental.DeviceAssignment.build(
        topology, computation_shape=[1, 1, 1, 1], num_replicas=1)
    return tpu_strategy.TPUStrategy(
        resolver, device_assignment=device_assignment)
def _create_tpu_strategy():
    """Creates a TPUStrategy for tests, optionally pinned to a single core.

    Captures `use_single_core`, `steps_per_run` and `kwargs` from the
    enclosing scope (this is a nested factory function).
    """
    resolver = tpu_cluster_resolver.TPUClusterResolver("")
    topology = tpu_strategy_util.initialize_tpu_system(resolver)
    device_assignment = None
    if use_single_core:
        device_assignment = device_assignment_lib.DeviceAssignment(
            topology,
            core_assignment=device_assignment_lib.
            SINGLE_CORE_ASSIGNMENT)
    # NOTE(review): `steps_per_run` is normally a TPUStrategyV1-only argument;
    # confirm this tpu_lib.TPUStrategy accepts it in the TF version in use.
    strategy = tpu_lib.TPUStrategy(resolver,
                                   steps_per_run=steps_per_run,
                                   device_assignment=device_assignment,
                                   **kwargs)
    return strategy
def _create_tpu_strategy():
    """Returns a TPUStrategy (TF2) or TPUStrategyV1 (TF1) for testing."""
    resolver = tpu_cluster_resolver.TPUClusterResolver("")
    topology = tpu_strategy_util.initialize_tpu_system(resolver)

    assignment = None
    if use_single_core:
        assignment = device_assignment_lib.DeviceAssignment(
            topology,
            core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)

    if tf2.enabled():
        # Steps-per-run is a TF 1.x-only concept, so TF2 omits it.
        return tpu_lib.TPUStrategy(resolver, assignment, **kwargs)
    # TF1 path keeps the steps-per-run batching knob.
    return tpu_lib.TPUStrategyV1(resolver, steps_per_run, assignment, **kwargs)
def _create_tpu_strategy(): FLAGS = flags.FLAGS # pylint: disable=invalid-name global _did_connect_to_cluster global _topology try: # Attempt to locally discover the TPU. This will fail for Cloud TPU, in # which case we fall back to the values passed as flags. resolver = tpu_cluster_resolver.TPUClusterResolver() did_automatically_resolve = True except ValueError: did_automatically_resolve = False # These flags will be defined by tpu_test_wrapper.py. resolver = tpu_cluster_resolver.TPUClusterResolver( tpu=hasattr(FLAGS, "tpu") and FLAGS.tpu or "", zone=hasattr(FLAGS, "zone") and FLAGS.zone or None, project=hasattr(FLAGS, "project") and FLAGS.project or None, ) # Only connect once per process, rather than per test method. if not _did_connect_to_cluster: if getattr(FLAGS, "tpu", "") or did_automatically_resolve: remote.connect_to_cluster(resolver) _did_connect_to_cluster = True _topology = tpu_strategy_util.initialize_tpu_system(resolver) device_assignment = None if use_single_core: device_assignment = device_assignment_lib.DeviceAssignment( _topology, core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT) # Steps per run is only supported in TF 1.x if tf2.enabled(): strategy = tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs) else: strategy = tpu_lib.TPUStrategyV1(resolver, steps_per_run, device_assignment, **kwargs) strategy._enable_packed_variable_in_eager_mode = enable_packed_variable # pylint: disable=protected-access return strategy
def testV2SummaryWithKerasFit(self):
    """Verifies custom-layer summaries are logged every `update_freq` batches."""
    resolver = tpu_cluster_resolver.TPUClusterResolver('')
    tpu_strategy_util.initialize_tpu_system(resolver)
    strategy = tpu_strategy_lib.TPUStrategy(resolver)

    with strategy.scope():
        model = CustomModel()
        model.compile('sgd', 'mse')

    dataset = distribute_strategy_test.get_dataset(strategy)
    tensorboard_callback = callbacks.TensorBoard(
        self.summary_dir, update_freq=2)
    model.fit(
        dataset,
        steps_per_epoch=10,
        epochs=1,
        callbacks=[tensorboard_callback])

    scalar_tag = (
        'custom_model/layer_for_scalar_summary/custom_scalar_summary')
    histogram_tag = (
        'custom_model/layer_for_histogram_summary/custom_histogram_summary')
    tag_counts = {scalar_tag: 0, histogram_tag: 0}

    event_files = file_io.get_matching_files_v2(
        os.path.join(self.summary_dir, 'train', 'event*'))
    for event_file in event_files:
        for event in summary_iterator.summary_iterator(event_file):
            for value in event.summary.value:
                if value.tag in tag_counts:
                    tag_counts[value.tag] += 1

    # Since total of 10 steps are ran and summary ops should be invoked
    # every 2 batches, we should see total of 5 event logs per tag.
    self.assertEqual(tag_counts[histogram_tag], 5)
    self.assertEqual(tag_counts[scalar_tag], 5)
def get_tpu_strategy():
    """Connects to the TPU cluster, initializes it, and returns a TPUStrategy."""
    cluster_resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(cluster_resolver)
    tpu_strategy_util.initialize_tpu_system(cluster_resolver)
    return tpu_strategy_lib.TPUStrategy(cluster_resolver)
def test_cluster_resolver_available(self, enable_packed_var):
    """The strategy must surface the exact resolver object it was built with."""
    cluster_resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(cluster_resolver)
    tpu_strategy_util.initialize_tpu_system(cluster_resolver)
    strategy = tpu_lib.TPUStrategy(cluster_resolver)
    # Identity check, not equality: the very same object must come back.
    self.assertIs(strategy.cluster_resolver, cluster_resolver)