def _configure(self,
               session_config=None,
               cluster_spec=None,
               task_type=None,
               task_id=None):
  del task_type, task_id

  if session_config:
    session_config.isolate_session_state = True

  if cluster_spec:
    self._initialize_multi_worker(self._num_gpus, cluster_spec)

  if self._cross_device_ops is None:
    if self._cluster_spec:
      # It currently cannot detect the topology of remote workers. So we
      # hard-code the multi-worker all-reduce algorithm for now.
      if len(self._workers) == 1:
        # The default is "nccl".
        self._cross_device_ops = (
            cross_device_ops_lib.AllReduceCrossDeviceOps())
      else:
        # The default is hierarchical reduce and broadcast.
        self._cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
            self._workers, self._num_gpus)
    else:
      self._cross_device_ops = cross_device_ops_lib.choose_the_best(
          self._devices, session_config=session_config)
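
# A minimal sketch (not part of the original source) that isolates the
# multi-worker branch of `_configure` above: a single worker falls back to the
# default NCCL-based all-reduce, while multiple workers get the hierarchical
# multi-worker all-reduce. `workers` and `num_gpus` are hypothetical inputs,
# and the import path is assumed to match the one used by this module.
from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib


def _select_multi_worker_cross_device_ops(workers, num_gpus):
  if len(workers) == 1:
    # Same default as the single-worker branch in `_configure`.
    return cross_device_ops_lib.AllReduceCrossDeviceOps()
  # Hierarchical reduce and broadcast across workers.
  return cross_device_ops_lib.MultiWorkerAllReduce(workers, num_gpus)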
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations +
                         allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    if isinstance(
        cross_device_ops._obj,  # pylint: disable=protected-access
        cross_device_ops_lib.AllReduceCrossDeviceOps
    ) and context.executing_eagerly():
      self.skipTest("b/149881884")
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    # Do not use nccl if there is any cpu device.
    self.assertIsInstance(
        cross_device_ops_lib.select_cross_device_ops(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Do not use nccl if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `select_cross_device_ops` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.select_cross_device_ops(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use nccl if the nccl kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.select_cross_device_ops(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Do not use nccl if the nccl kernel is not found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.select_cross_device_ops(devices),
          cross_device_ops_lib.ReductionToOneDevice)

  @combinations.generate(combinations.combine(
      mode=["graph", "eager"],
      required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica((t0, t1))
    result = cross_device_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices, with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps("ring"))
          ],
          batch_reduce=[True, False],
          mode=["graph", "eager"]))
  def testReduceDistributedVariable(self, distribution,
                                    cross_device_ops_instance, batch_reduce):
    with distribution.scope():
      v = variables.Variable(1.)
    if batch_reduce:
      result = cross_device_ops_instance.batch_reduce(
          reduce_util.ReduceOp.MEAN, [(v, v)])[0]
    else:
      result = cross_device_ops_instance.reduce(reduce_util.ReduceOp.MEAN, v,
                                                v)
    for v in result.values:
      self.assertIsInstance(v, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertAllEqual(self.evaluate(result.values), [1.0, 1.0])
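
# Sketch of a stand-in for the `_make_indexed_slices` helper referenced by the
# tests above (the real helper is defined elsewhere in this test file). This
# version uses only the public TensorFlow API and is an assumption, not the
# original implementation.
import tensorflow as tf


def make_indexed_slices(values, indices, dense_shape, device):
  # Build an IndexedSlices value whose component tensors live on `device`.
  with tf.device(device):
    return tf.IndexedSlices(
        values=tf.constant(values),
        indices=tf.constant(indices),
        dense_shape=tf.constant(dense_shape, dtype=tf.int64))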
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
          combinations.NamedObject(
              "HierarchicalCopyAggregateSmallTensors",
              cross_device_ops_lib.AllReduceCrossDeviceOps(
                  "hierarchical_copy", 0, 100, 10))
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations +
                         allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    # Do not use nccl if there is any cpu device.
    self.assertIsInstance(
        cross_device_ops_lib.choose_the_best(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Do not use nccl if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `choose_the_best` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.choose_the_best(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use nccl if the nccl kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Do not use nccl if the nccl kernel is not found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.ReductionToOneDevice)

  @combinations.generate(combinations.combine(
      mode=["graph", "eager"],
      required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica((t0, t1))
    result = cross_device_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices, with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)
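
# Usage sketch (public tf.distribute API, assumed to match current releases):
# the cross-device ops variants exercised in the combinations above can also be
# passed directly to MirroredStrategy, e.g. to force hierarchical copy with
# gradient packing on a two-GPU machine.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy(
    devices=["/gpu:0", "/gpu:1"],
    cross_device_ops=tf.distribute.HierarchicalCopyAllReduce(num_packs=8))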
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
  # TODO(yuefengz): decouple the num_gpus check from distribution in the
  # combinations module so that we can pass in devices instead of a
  # distribution strategy.
  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.accumulate_n)),
      ],
      distribution=[
          strategy_combinations.one_device_strategy,
          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      mode=["graph", "eager"])

  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
          combinations.NamedObject(
              "HierarchicalCopyAggregateSmallTensors",
              cross_device_ops_lib.AllReduceCrossDeviceOps(
                  "hierarchical_copy", 0, 100, 10))
      ],
      distribution=[
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations +
                         allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, distribution):
    with distribution.scope():
      self._testReductionAndBroadcast(cross_device_ops, distribution)

  def testChooseAlgorithm(self):
    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
                    [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
    self.assertIsInstance(result,
                          cross_device_ops_lib.AllReduceCrossDeviceOps)
    self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
    self.assertEqual(result._num_packs, 8)

    # If there are only 4 devices.
    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
    self.assertIsInstance(result,
                          cross_device_ops_lib.AllReduceCrossDeviceOps)
    self.assertEqual(result._all_reduce_alg, "nccl")
    self.assertEqual(result._num_packs, 1)

    # If device links contain each device itself.
    device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
                    [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
                    [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
    self.assertIsInstance(result,
                          cross_device_ops_lib.AllReduceCrossDeviceOps)
    self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
    self.assertEqual(result._num_packs, 8)

    # If the links are not DGX-1-like.
    device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
                    [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
    self.assertIsInstance(result,
                          cross_device_ops_lib.AllReduceCrossDeviceOps)
    self.assertEqual(result._all_reduce_alg, "nccl")
    self.assertEqual(result._num_packs, 1)

  @combinations.generate(
      combinations.combine(mode=["graph", "eager"], required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica(value_lib.ReplicaDeviceMap(devices),
                                       (t0, t1))
    result = cross_device_ops_lib._simple_reduce(per_replica, devices[0],
                                                 math_ops.add_n,
                                                 reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices, with and without duplicate indices.
    total_with_dups = _make_indexed_slices([[1., 2.], [3., 4.], [5., 6.]],
                                           [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices([[4., 6.], [5., 6.]], [1, 3],
                                              [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)