def proc_func():
  for i in range(5):
    logging.info('(logging) %s-%d, i: %d',
                 multi_worker_test_base.get_task_type(), self._worker_idx(), i)
    print(
        '(print) {}-{}, i: {}'.format(multi_worker_test_base.get_task_type(),
                                      self._worker_idx(), i),
        flush=True)
    time.sleep(1)
def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj):
  model, _, train_ds, steps = _model_setup(test_obj, file_format='')
  num_epoch = 2

  # Incorporate type/index information and thread id in saving_filepath to
  # ensure every worker has a unique path. Note that in normal use case the
  # saving_filepath will be the same for all workers, but we use different
  # ones here just to test out chief saves summaries but non-chief doesn't.
  saving_filepath = os.path.join(
      test_obj.get_temp_dir(), 'logfile_%s_%d' %
      (test_base.get_task_type(), test_base.get_task_index()))

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(file_io.file_exists(saving_filepath))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

  # If it's chief, the summaries should be saved in the filepath; if not,
  # the directory should be empty (although created). Using
  # `file_io.list_directory()` since the directory may be created at this
  # point.
  test_obj.assertEqual(
      bool(file_io.list_directory(saving_filepath)), test_base.is_chief())
def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
    model, test_obj, train_ds, num_epoch, steps, strategy, saving_filepath,
    **kwargs):
  extension = os.path.splitext(saving_filepath)[1]

  # Incorporate type/index information and thread id in saving_filepath to
  # ensure every worker has a unique path. Note that in normal use case the
  # saving_filepath will be the same for all workers, but we use different
  # ones here just to test out chief saves checkpoint but non-chief doesn't.
  saving_filepath = os.path.join(
      test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' %
      (test_base.get_task_type(), test_base.get_task_index(), extension))

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

  # If it's chief, the model should be saved; if not, the model shouldn't.
  test_obj.assertEqual(
      training_state.checkpoint_exists(saving_filepath), test_base.is_chief())
def testSimpleInputFromFnLastPartialBatch(self, strategy):

  def dataset_fn(input_context):
    global_batch_size = 8
    batch_size = input_context.get_per_replica_batch_size(global_batch_size)
    dataset = dataset_ops.DatasetV2.range(14).batch(
        batch_size, drop_remainder=False)
    return dataset.shard(input_context.num_input_pipelines,
                         input_context.input_pipeline_id)

  input_iterator = iter(
      strategy.distribute_datasets_from_function(dataset_fn))

  @def_function.function
  def run(input_iterator):
    return strategy.run(lambda x: x, args=(next(input_iterator),))

  # Let the complete batch go.
  run(input_iterator)

  # `result` is an incomplete batch.
  result = run(input_iterator)
  expected_data_on_worker = {'chief': [8, 9, 10, 11], 'worker': [12, 13]}
  self.assertAllEqual(
      expected_data_on_worker[multi_worker_test_base.get_task_type()],
      result.numpy())
def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
    model, test_obj, train_ds, num_epoch, steps, strategy, saving_filepath,
    **kwargs):
  # Incorporate type/index information and thread id in saving_filepath to
  # ensure every worker has a unique path. Note that in normal use case the
  # saving_filepath will be the same for all workers, but we use different
  # ones here just to test out chief saves checkpoint but non-chief doesn't.
  # TODO(b/134551335): Must save to hdf5 until bug with copying
  # MirroredVariables is resolved.
  saving_filepath = os.path.join(
      test_obj.get_temp_dir(), 'checkpoint_%s_%d.h5' %
      (test_base.get_task_type(), test_base.get_task_index()))

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(os.path.exists(saving_filepath))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

  # If it's chief, the model should be saved; if not, the model shouldn't.
  test_obj.assertEqual(os.path.exists(saving_filepath), test_base.is_chief())
def testGatherRaiseSparsePerReplicaMultiWorker(self, strategy, pure_eager):
  if strategy.num_replicas_in_sync != 2:
    self.skipTest('Test for two replicas.')
  dense_shape = [5, 2]
  if multi_worker_test_base.get_task_type() == 'chief':
    t0 = _make_indexed_slices(
        values=[[1., 2.]], indices=[2], dense_shape=dense_shape)
  if multi_worker_test_base.get_task_type() == 'worker':
    t0 = _make_indexed_slices(
        values=[[3., 4.], [5., 6.]], indices=[1, 3], dense_shape=dense_shape)

  def run(value):
    return strategy._gather(value, axis=0)

  with self.assertRaisesRegex(
      NotImplementedError,
      r'gather/all_gather does not support IndexedSlices'):
    if pure_eager:
      run(t0)
    else:
      def_function.function(run)(t0)
def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
    test_obj, file_format, save_weights_only=False):
  model, saving_filepath, train_ds, steps = _model_setup(test_obj, file_format)
  num_epoch = 2
  extension = os.path.splitext(saving_filepath)[1]

  # Incorporate type/index information and thread id in saving_filepath to
  # ensure every worker has a unique path. Note that in normal use case the
  # saving_filepath will be the same for all workers, but we use different
  # ones here just to test out chief saves checkpoint but non-chief doesn't.
  saving_filepath = os.path.join(
      test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' %
      (test_base.get_task_type(), test_base.get_task_index(), extension))

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      validation_data=train_ds,
      validation_steps=steps,
      callbacks=[
          callbacks.ModelCheckpoint(
              filepath=saving_filepath, save_weights_only=save_weights_only)
      ])

  # If it's chief, the model should be saved; if not, the model shouldn't.
  test_obj.assertEqual(
      training_state.checkpoint_exists(saving_filepath), test_base.is_chief())

  # If it's chief, the model should be saved (`write_filepath` should
  # simply return `saving_filepath`); if not, i.e. for non-chief workers,
  # the temporary path generated by `write_filepath` should no longer
  # contain the checkpoint that has been deleted.
  test_obj.assertEqual(
      training_state.checkpoint_exists(
          distributed_file_utils.write_filepath(saving_filepath,
                                                model._distribution_strategy)),
      test_base.is_chief())
def testSimpleInputFromDatasetLastPartialBatch(self, strategy):
  global_batch_size = 8
  dataset = dataset_ops.DatasetV2.range(14).batch(
      global_batch_size, drop_remainder=False)
  input_iterator = iter(strategy.experimental_distribute_dataset(dataset))

  @def_function.function
  def run(input_iterator):
    return strategy.run(lambda x: x, args=(next(input_iterator),))

  # Let the complete batch go.
  run(input_iterator)

  # `result` is an incomplete batch.
  result = run(input_iterator)

  expected_data_on_workers = {'chief': [8, 9, 10], 'worker': [11, 12, 13]}
  self.assertAllEqual(
      expected_data_on_workers[multi_worker_test_base.get_task_type()],
      result.numpy())
def proc_tensorboard_can_still_save_to_temp_even_if_it_exists(test_obj):
  model, _, train_ds, steps = _model_setup(test_obj, file_format='')
  num_epoch = 2

  saving_filepath = os.path.join(
      test_obj.get_temp_dir(), 'logfile_%s' % (test_base.get_task_type()))
  saving_filepath_for_temp = os.path.join(saving_filepath, 'workertemp_1')
  os.mkdir(saving_filepath)
  os.mkdir(saving_filepath_for_temp)

  # Verifies that even if `saving_filepath_for_temp` exists, tensorboard
  # can still save to temporary directory.
  test_obj.assertTrue(file_io.file_exists(saving_filepath_for_temp))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])
def testDatasetFromFunction(self, strategy):

  def dataset_fn(input_context):
    global_batch_size = 10
    batch_size = input_context.get_per_replica_batch_size(global_batch_size)
    d = dataset_ops.DatasetV2.range(100).repeat().batch(batch_size)
    return d.shard(input_context.num_input_pipelines,
                   input_context.input_pipeline_id)

  expected_sum_on_workers = {'chief': 10, 'worker': 35}
  input_iterator = iter(
      strategy.distribute_datasets_from_function(dataset_fn))

  @def_function.function
  def run(iterator):
    return strategy.experimental_local_results(iterator.get_next())

  result = run(input_iterator)
  sum_value = math_ops.reduce_sum(result)
  self.assertEqual(
      sum_value.numpy(),
      expected_sum_on_workers[multi_worker_test_base.get_task_type()])
def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
    model, test_obj, train_ds, num_epoch, steps, strategy, saving_filepath):
  # Incorporate type/index information and thread id in saving_filepath to
  # ensure every worker has a unique path. Note that in normal use case the
  # saving_filepath will be the same for all workers, but we use different
  # ones here just to test out chief saves checkpoint but non-chief doesn't.
  saving_filepath = os.path.join(
      test_obj.get_temp_dir(), 'checkpoint_%s_%d' %
      (test_base.get_task_type(), test_base.get_task_index()))

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(os.path.exists(saving_filepath))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

  # If it's chief, the model should be saved; if not, the model shouldn't.
  test_obj.assertEqual(os.path.exists(saving_filepath), test_base.is_chief())
def fn():
  for i in range(5):
    logging.info('%s-%d, i: %d', multi_worker_test_base.get_task_type(),
                 self._worker_idx(), i)
    time.sleep(1)
def wrapped_method(method_to_wrap, name, *arg, **kwargs):
  # Use lock to ensure += operation is thread-safe.
  with self._lock:
    self._task_dict[test_base.get_task_type()][
        test_base.get_task_index()][name] += 1
  method_to_wrap(*arg, **kwargs)
def proc_func_that_adds_task_type_in_return_data():
  return multi_worker_test_base.get_task_type()
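# Illustrative sketch only (an assumption, not part of the original snippets):
# callables like the one above are meant to be spawned on a simulated cluster
# by TensorFlow's internal test utilities. This assumes the
# `multi_process_runner.run` and `multi_worker_test_base.create_cluster_spec`
# helpers from `tensorflow.python.distribute`; the exact shape of the value
# returned by `run` has varied across TF versions.
from tensorflow.python.distribute import multi_process_runner
from tensorflow.python.distribute import multi_worker_test_base


def _example_launch():
  # Spawn one chief and two workers, each running the callable above in its
  # own process.
  cluster_spec = multi_worker_test_base.create_cluster_spec(
      has_chief=True, num_workers=2)
  result = multi_process_runner.run(
      proc_func_that_adds_task_type_in_return_data, cluster_spec)
  # In recent versions the per-task return values are collected on
  # `return_value`, e.g. ['chief', 'worker', 'worker'].
  return result.return_value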
def proc_func_expected_to_seg_fault():
  if multi_worker_test_base.get_task_type() == 'worker':
    time.sleep(10000)
  ctypes.string_at(0)  # Intentionally made seg fault.
def proc_func_expected_to_exit_with_20():
  if multi_worker_test_base.get_task_type() == 'worker':
    time.sleep(10000)
  sys.exit(20)
def proc_func():
  for i in range(50):
    logging.info('(logging) %s-%d, i: %d',
                 multi_worker_test_base.get_task_type(), self._worker_idx(), i)
    time.sleep(1)
def proc_func():
  time.sleep(1)
  if multi_worker_test_base.get_task_type() != 'chief':
    raise ValueError
def proc_func_that_adds_task_type_in_return_data(test_obj, val):
  test_obj.assertEqual(val, 3)
  return multi_worker_test_base.get_task_type()
def proc_func_that_adds_task_type_in_return_data(test_obj):
  test_obj.assertTrue(flags.FLAGS.test_flag == 3)
  return multi_worker_test_base.get_task_type()
def proc_func_that_adds_task_type_in_return_data(test_obj):
  multi_process_runner.add_return_data(multi_worker_test_base.get_task_type())
  test_obj.assertTrue(flags.FLAGS.test_flag == 3)