def testWrapClass(self): # Normally a mirrored value would be the same across devices, but # for a test it is convenient to be able to tell the values apart. result = values.regroup( { _device_str(0): _nested_value("1"), _device_str(1): _nested_value("2") }, values.Mirrored) self.assertIsInstance(result, tuple) self.assertEqual(3, len(result)) self._is_per_replica(result[0], ["a1", "a2"], values.Mirrored) self._is_per_replica(result[2], ["h1", "h2"], values.Mirrored) self.assertIsInstance(result[1], list) self.assertEqual(3, len(result[1])) self._is_per_replica(result[1][0], ["b1", "b2"], values.Mirrored) self._is_per_replica(result[1][2], ["g1", "g2"], values.Mirrored) self.assertIsInstance(result[1][1], dict) self.assertEqual(set(["c", "e"]), set(result[1][1].keys())) self._is_per_replica(result[1][1]["c"], ["d1", "d2"], values.Mirrored) self._is_per_replica(result[1][1]["e"], ["f1", "f2"], values.Mirrored) # Also test that we can undo the merge using select_device() self.assertEqual(_nested_value("1"), values.select_device(_device_str(0), result)) self.assertEqual(_nested_value("2"), values.select_device(_device_str(1), result)) # Values are marked as mirrored, so select_device_mirrored() is allowed. self.assertEqual(_nested_value("1"), values.select_device_mirrored(_device_str(0), result)) self.assertEqual(_nested_value("2"), values.select_device_mirrored(_device_str(1), result))
def _test_input_fn_iterator(self, iterator, devices, expected_values, sess=None): evaluate = lambda x: sess.run(x) if sess else self.evaluate(x) evaluate(iterator.initialize()) for expected_value in expected_values: next_element = iterator.get_next() computed_value = evaluate( [values.select_device(d, next_element) for d in devices]) self.assertEqual(expected_value, computed_value) with self.assertRaises(errors.OutOfRangeError): next_element = iterator.get_next() evaluate([values.select_device(d, next_element) for d in devices]) # After re-initializing the iterator, should be able to iterate again. evaluate(iterator.initialize()) for expected_value in expected_values: next_element = iterator.get_next() computed_value = evaluate( [values.select_device(d, next_element) for d in devices]) self.assertEqual(expected_value, computed_value)
def testWrapClass(self): # Normally a mirrored value would be the same across devices, but # for a test it is convenient to be able to tell the values apart. result = values.regroup({_device_str(0): _nested_value("1"), _device_str(1): _nested_value("2")}, values.Mirrored) self.assertIsInstance(result, tuple) self.assertEqual(3, len(result)) self._is_per_replica(result[0], ["a1", "a2"], values.Mirrored) self._is_per_replica(result[2], ["h1", "h2"], values.Mirrored) self.assertIsInstance(result[1], list) self.assertEqual(3, len(result[1])) self._is_per_replica(result[1][0], ["b1", "b2"], values.Mirrored) self._is_per_replica(result[1][2], ["g1", "g2"], values.Mirrored) self.assertIsInstance(result[1][1], dict) self.assertEqual(set(["c", "e"]), set(result[1][1].keys())) self._is_per_replica(result[1][1]["c"], ["d1", "d2"], values.Mirrored) self._is_per_replica(result[1][1]["e"], ["f1", "f2"], values.Mirrored) # Also test that we can undo the merge using select_device() self.assertEqual(_nested_value("1"), values.select_device(_device_str(0), result)) self.assertEqual(_nested_value("2"), values.select_device(_device_str(1), result)) # Values are marked as mirrored, so select_device_mirrored() is allowed. self.assertEqual(_nested_value("1"), values.select_device_mirrored(_device_str(0), result)) self.assertEqual(_nested_value("2"), values.select_device_mirrored(_device_str(1), result))
def _test_iterator(self, input_fn, worker_device_pairs, expected_values, sess=None): devices = nest.flatten([ds for _, ds in worker_device_pairs]) iterator = values.InputFunctionIterator(input_fn, worker_device_pairs) evaluate = lambda x: sess.run(x) if sess else self.evaluate(x) evaluate(iterator.initialize()) for expected_value in expected_values: next_element = iterator.get_next() computed_value = evaluate( [values.select_device(d, next_element) for d in devices]) self.assertEqual(expected_value, computed_value) with self.assertRaises(errors.OutOfRangeError): next_element = iterator.get_next() evaluate([values.select_device(d, next_element) for d in devices]) # After re-initializing the iterator, should be able to iterate again. evaluate(iterator.initialize()) for expected_value in expected_values: next_element = iterator.get_next() computed_value = evaluate( [values.select_device(d, next_element) for d in devices]) self.assertEqual(expected_value, computed_value)
def testNested(self): result = values.regroup({ _device_str(0): _nested_value("1"), _device_str(1): _nested_value("2") }) self.assertIsInstance(result, tuple) self.assertEqual(3, len(result)) self._is_per_replica(result[0], ["a1", "a2"]) self._is_per_replica(result[2], ["h1", "h2"]) self.assertIsInstance(result[1], list) self.assertEqual(3, len(result[1])) self._is_per_replica(result[1][0], ["b1", "b2"]) self._is_per_replica(result[1][2], ["g1", "g2"]) self.assertIsInstance(result[1][1], dict) self.assertEqual(set(["c", "e"]), set(result[1][1].keys())) self._is_per_replica(result[1][1]["c"], ["d1", "d2"]) self._is_per_replica(result[1][1]["e"], ["f1", "f2"]) # Also test that we can undo the merge using select_device() self.assertEqual(_nested_value("1"), values.select_device(_device_str(0), result)) self.assertEqual(_nested_value("2"), values.select_device(_device_str(1), result)) # select_device_mirrored() should fail due to non-mirrored values with self.assertRaises(TypeError): values.select_device_mirrored(_device_str(0), result) with self.assertRaises(TypeError): values.select_device_mirrored(_device_str(1), result)
def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn, expected_values): distribution, master_target, config = self._get_test_object( task_type, task_id, num_gpus) devices = distribution.worker_devices with ops.Graph().as_default(), \ self.cached_session(config=config, target=master_target) as sess: iterator = distribution.make_input_fn_iterator(input_fn) sess.run(iterator.initialize()) for expected_value in expected_values: next_element = iterator.get_next() computed_value = sess.run( [values.select_device(d, next_element) for d in devices]) self.assertEqual(expected_value, computed_value) with self.assertRaises(errors.OutOfRangeError): next_element = iterator.get_next() sess.run([values.select_device(d, next_element) for d in devices]) # After re-initializing the iterator, should be able to iterate again. sess.run(iterator.initialize()) for expected_value in expected_values: next_element = iterator.get_next() computed_value = sess.run( [values.select_device(d, next_element) for d in devices]) self.assertEqual(expected_value, computed_value)
def testNested(self): result = values.regroup({_device_str(0): _nested_value("1"), _device_str(1): _nested_value("2")}) self.assertIsInstance(result, tuple) self.assertEqual(3, len(result)) self._is_per_replica(result[0], ["a1", "a2"]) self._is_per_replica(result[2], ["h1", "h2"]) self.assertIsInstance(result[1], list) self.assertEqual(3, len(result[1])) self._is_per_replica(result[1][0], ["b1", "b2"]) self._is_per_replica(result[1][2], ["g1", "g2"]) self.assertIsInstance(result[1][1], dict) self.assertEqual(set(["c", "e"]), set(result[1][1].keys())) self._is_per_replica(result[1][1]["c"], ["d1", "d2"]) self._is_per_replica(result[1][1]["e"], ["f1", "f2"]) # Also test that we can undo the merge using select_device() self.assertEqual(_nested_value("1"), values.select_device(_device_str(0), result)) self.assertEqual(_nested_value("2"), values.select_device(_device_str(1), result)) # select_device_mirrored() should fail due to non-mirrored values with self.assertRaises(TypeError): values.select_device_mirrored(_device_str(0), result) with self.assertRaises(TypeError): values.select_device_mirrored(_device_str(1), result)
def _test_iterator(self, sess, iterator, devices, expected_values): next_element = iterator.get_next() for device in devices: v = values.select_device(device, next_element) # The `v` here can be a tuple. for element in nest.flatten(v): self.assertTrue(element.device in device) for expected_value in expected_values: actual = sess.run( [values.select_device(d, next_element) for d in devices]) self.assertEqual(expected_value, actual) with self.assertRaises(errors.OutOfRangeError): sess.run([values.select_device(d, next_element) for d in devices])
def testNamedTupleEstimatorSpec(self): with context.graph_mode(), ops.Graph().as_default(): created_estimator_specs = {} to_regroup = {} for device_id in range(3): spec = model_fn_lib.EstimatorSpec( mode=model_fn_lib.ModeKeys.TRAIN, loss=constant_op.constant(device_id / 2), train_op=array_ops.identity(constant_op.constant(device_id))) created_estimator_specs[device_id] = spec to_regroup[_device_str(device_id)] = spec merged_estimator_spec = values.regroup(to_regroup) self.assertTrue( isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec)) self.assertEqual(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode) for device_id in range(3): d = _device_str(device_id) self.assertEqual(created_estimator_specs[device_id].loss, merged_estimator_spec.loss.get(d)) self.assertEqual(created_estimator_specs[device_id].train_op, merged_estimator_spec.train_op.get(d)) # Scaffold is populated by `EstimatorSpec.__new__`. self.assertEqual(created_estimator_specs[device_id].scaffold, merged_estimator_spec.scaffold.get(d)) # Also test that we can undo the merge using select_device() self.assertEqual(created_estimator_specs[device_id], values.select_device(_device_str(device_id), merged_estimator_spec))
def testNamedTupleEstimatorSpec(self): with context.graph_mode(), ops.Graph().as_default(): created_estimator_specs = {} to_regroup = {} for device_id in range(3): spec = model_fn_lib.EstimatorSpec( mode=model_fn_lib.ModeKeys.TRAIN, loss=constant_op.constant(device_id / 2), train_op=array_ops.identity( constant_op.constant(device_id))) created_estimator_specs[device_id] = spec to_regroup[_device_str(device_id)] = spec merged_estimator_spec = values.regroup(to_regroup) self.assertTrue( isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec)) self.assertEquals(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode) for device_id in range(3): d = _device_str(device_id) self.assertEquals(created_estimator_specs[device_id].loss, merged_estimator_spec.loss.get(d)) self.assertEquals(created_estimator_specs[device_id].train_op, merged_estimator_spec.train_op.get(d)) # Scaffold is populated by `EstimatorSpec.__new__`. self.assertEquals(created_estimator_specs[device_id].scaffold, merged_estimator_spec.scaffold.get(d)) # Also test that we can undo the merge using select_device() self.assertEquals( created_estimator_specs[device_id], values.select_device(_device_str(device_id), merged_estimator_spec))
def _test_iterator(self, devices, dataset, expected_values): per_replica_dataset = values.PerReplicaDataset(dataset, devices) if context.executing_eagerly(): iterator = per_replica_dataset.make_one_shot_iterator() else: iterator = per_replica_dataset.make_initializable_iterator() self.evaluate([iterator.initializer]) for expected_value in expected_values: next_element = iterator.get_next() computed_value = self.evaluate( [values.select_device(d, next_element) for d in devices]) self.assertEqual(expected_value, computed_value) with self.assertRaises(errors.OutOfRangeError): next_element = iterator.get_next() self.evaluate( [values.select_device(d, next_element) for d in devices])
def _test_iterator(self, devices, dataset, expected_values): per_replica_dataset = values.PerReplicaDataset(dataset, devices) if context.executing_eagerly(): iterator = per_replica_dataset.make_one_shot_iterator() else: iterator = per_replica_dataset.make_initializable_iterator() self.evaluate([iterator.initializer]) for expected_value in expected_values: next_element = iterator.get_next() computed_value = self.evaluate( [values.select_device(d, next_element) for d in devices]) self.assertEqual(expected_value, computed_value) with self.assertRaises(errors.OutOfRangeError): next_element = iterator.get_next() self.evaluate([ values.select_device(d, next_element) for d in devices])
def testSameId(self): foo = object() result = values.regroup({_device_str(0): ("a", foo), _device_str(1): ("b", foo)}) self.assertIsInstance(result, tuple) self.assertEqual(2, len(result)) self._is_per_replica(result[0], ["a", "b"]) self.assertIs(foo, result[1]) # Test select_device(), should undo the merge done by regroup(). result_0 = values.select_device(_device_str(0), result) self.assertIsInstance(result_0, tuple) self.assertEqual(2, len(result_0)) self.assertEqual("a", result_0[0]) self.assertIs(foo, result_0[1]) result_1 = values.select_device(_device_str(1), result) self.assertIsInstance(result_1, tuple) self.assertEqual(2, len(result_1)) self.assertEqual("b", result_1[0]) self.assertIs(foo, result_1[1])
def _test_iterator(self, input_type, dataset_fn, worker_device_pairs, expected_values, sess=None, split_batch_by=None): devices = nest.flatten([ds for _, ds in worker_device_pairs]) if input_type == "input_fn": input_contexts = [ distribute_lib.InputContext() for _ in worker_device_pairs] input_fn = lambda _: dataset_fn() iterator = values.InputFunctionIterator(input_fn, worker_device_pairs, input_contexts) else: iterator = values.DatasetIterator(dataset_fn(), worker_device_pairs, split_batch_by) evaluate = lambda x: sess.run(x) if sess else self.evaluate(x) evaluate(control_flow_ops.group(iterator.initialize())) for expected_value in expected_values: next_element = iterator.get_next() computed_value = evaluate( [values.select_device(d, next_element) for d in devices]) self.assertAllEqual(expected_value, computed_value) with self.assertRaises(errors.OutOfRangeError): next_element = iterator.get_next() evaluate([values.select_device(d, next_element) for d in devices]) # After re-initializing the iterator, should be able to iterate again. evaluate(control_flow_ops.group(iterator.initialize())) for expected_value in expected_values: next_element = iterator.get_next() computed_value = evaluate( [values.select_device(d, next_element) for d in devices]) self.assertAllEqual(expected_value, computed_value)
def testOneDevice(self): result = values.regroup({_device_str(0): _nested_value("1")}) # On one device regroup() and select_device() are basically identity. self.assertEqual(_nested_value("1"), result) self.assertEqual(_nested_value("1"), values.select_device(_device_str(0), result)) # The one exception has to do with MirroredVariables. d = "/device:CPU:0" with ops.device(d): v = variable_scope.get_variable( name="v", initializer=1., use_resource=True) index = {d: v} mirrored = values.MirroredVariable(index, v, variable_scope.VariableAggregation.SUM) result = values.regroup(index) self.assertIs(mirrored, result)
def _call_for_each_replica(distribution, fn, args, kwargs): """Run `fn` in separate threads, once per replica/worker device. Args: distribution: the DistributionStrategy object. fn: function to run (will be run once per device, each in its own thread). args: positional arguments for `fn` kwargs: keyword arguments for `fn`. Returns: Merged return value of `fn` across all replicas. Raises: RuntimeError: If fn() calls get_replica_context().merge_call() a different number of times from the available devices. """ # TODO(josh11b): Add this option once we add synchronization to variable # creation. Until then, this is pretty unsafe to use. run_concurrently = False if not context.executing_eagerly(): # Needed for per-thread device, etc. contexts in graph mode. ops.get_default_graph().switch_to_thread_local() coord = coordinator.Coordinator( clean_stop_exception_types=(_RequestedStop, )) shared_variable_store = {} # TODO(isaprykin): Create these threads once instead of during every run() # call. threads = [] for index, d in enumerate(distribution.extended.worker_devices): variable_creator_fn = shared_variable_creator.make_fn( shared_variable_store, index) t = MirroredExtended._MirroredReplicaThread( # pylint: disable=protected-access distribution, coord, d, variable_creator_fn, fn, *values.select_device(d, args), **values.select_device(d, kwargs)) threads.append(t) for t in threads: t.start() # When `fn` starts `should_run` event is set on _MirroredReplicaThread # (`MRT`) threads. The execution waits until # `MRT.has_paused` is set, which indicates that either `fn` is # complete or a `get_replica_context().merge_call()` is called. If `fn` is # complete, then `MRT.done` is set to True. Otherwise, arguments # of `get_replica_context().merge_call` from all paused threads are grouped # and the `merge_fn` is performed. Results of the # `get_replica_context().merge_call` are then set to `MRT.merge_result`. # Each such `get_replica_context().merge_call` call returns the # `MRT.merge_result` for that thread when `MRT.should_run` event # is reset again. Execution of `fn` resumes. try: with coord.stop_on_exception(): all_done = False while not all_done and not coord.should_stop(): done = [] if run_concurrently: for t in threads: t.should_run.set() for t in threads: t.has_paused.wait() t.has_paused.clear() if coord.should_stop(): return None done.append(t.done) else: for t in threads: t.should_run.set() t.has_paused.wait() t.has_paused.clear() if coord.should_stop(): return None done.append(t.done) if coord.should_stop(): return None all_done = all(done) if not all_done: if any(done): raise RuntimeError( "Some replicas made a different number of " "replica_context().merge_call() calls.") # get_replica_context().merge_call() case merge_args = values.regroup( {t.device: t.merge_args for t in threads}) merge_kwargs = values.regroup( {t.device: t.merge_kwargs for t in threads}) # We capture the name_scope of the MRT when we call merge_fn # to ensure that if we have opened a name scope in the MRT, # it will be respected when executing the merge function. We only # capture the name_scope from the first MRT and assume it is # the same for all other MRTs. mtt_captured_name_scope = threads[0].captured_name_scope # Capture and merge the control dependencies from all the threads. mtt_captured_control_deps = set() for t in threads: mtt_captured_control_deps.update( t.captured_control_deps) with ops.name_scope(mtt_captured_name_scope),\ ops.control_dependencies(mtt_captured_control_deps): merge_result = threads[0].merge_fn( distribution, *merge_args, **merge_kwargs) for t in threads: t.merge_result = values.select_device( t.device, merge_result) finally: for t in threads: t.should_run.set() coord.join(threads) return values.regroup({t.device: t.main_result for t in threads})
def _call_for_each_replica(distribution, fn, args, kwargs): """Run `fn` in separate threads, once per replica/worker device. Args: distribution: the DistributionStrategy object. fn: function to run (will be run once per device, each in its own thread). args: positional arguments for `fn` kwargs: keyword arguments for `fn`. Returns: Merged return value of `fn` across all replicas. Raises: RuntimeError: If fn() calls get_replica_context().merge_call() a different number of times from the available devices. """ # TODO(josh11b): Add this option once we add synchronization to variable # creation. Until then, this is pretty unsafe to use. run_concurrently = False if not context.executing_eagerly(): # Needed for per-thread device, etc. contexts in graph mode. ops.get_default_graph().switch_to_thread_local() coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,)) shared_variable_store = {} # TODO(isaprykin): Create these threads once instead of during every run() # call. threads = [] for index, d in enumerate(distribution.extended.worker_devices): variable_creator_fn = shared_variable_creator.make_fn( shared_variable_store, index) t = MirroredExtended._MirroredReplicaThread( # pylint: disable=protected-access distribution, coord, d, variable_creator_fn, fn, *values.select_device(d, args), **values.select_device(d, kwargs)) threads.append(t) for t in threads: t.start() # When `fn` starts `should_run` event is set on _MirroredReplicaThread # (`MRT`) threads. The execution waits until # `MRT.has_paused` is set, which indicates that either `fn` is # complete or a `get_replica_context().merge_call()` is called. If `fn` is # complete, then `MRT.done` is set to True. Otherwise, arguments # of `get_replica_context().merge_call` from all paused threads are grouped # and the `merge_fn` is performed. Results of the # `get_replica_context().merge_call` are then set to `MRT.merge_result`. # Each such `get_replica_context().merge_call` call returns the # `MRT.merge_result` for that thread when `MRT.should_run` event # is reset again. Execution of `fn` resumes. try: with coord.stop_on_exception(): all_done = False while not all_done and not coord.should_stop(): done = [] if run_concurrently: for t in threads: t.should_run.set() for t in threads: t.has_paused.wait() t.has_paused.clear() if coord.should_stop(): return None done.append(t.done) else: for t in threads: t.should_run.set() t.has_paused.wait() t.has_paused.clear() if coord.should_stop(): return None done.append(t.done) if coord.should_stop(): return None all_done = all(done) if not all_done: if any(done): raise RuntimeError("Some replicas made a different number of " "replica_context().merge_call() calls.") # get_replica_context().merge_call() case merge_args = values.regroup({t.device: t.merge_args for t in threads}) merge_kwargs = values.regroup( {t.device: t.merge_kwargs for t in threads}) # We capture the name_scope of the MRT when we call merge_fn # to ensure that if we have opened a name scope in the MRT, # it will be respected when executing the merge function. We only # capture the name_scope from the first MRT and assume it is # the same for all other MRTs. mtt_captured_name_scope = threads[0].captured_name_scope with ops.name_scope(mtt_captured_name_scope): merge_result = threads[0].merge_fn(distribution, *merge_args, **merge_kwargs) for t in threads: t.merge_result = values.select_device(t.device, merge_result) finally: for t in threads: t.should_run.set() coord.join(threads) return values.regroup({t.device: t.main_result for t in threads})