class MultiWorkerCrossTowerOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                   CrossTowerOpsTestBase):

  worker_devices = [
      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
  ]
  multi_worker_allreduce_combinations = combinations.combine(
      cross_tower_ops=[
          combinations.NamedObject(
              "MultiWorkerAllReduce",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReducePack",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReduceAggregation",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
          combinations.NamedObject(
              "MultiWorkerAllReduceMultipleSpecs",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2,
                  [("pscpu/pscpu", 2, 100), ("xring", 2, -1)], 0, 0, 0)),
      ],
      distribution=[
          combinations.NamedDistribution(
              "MirroredCPU",
              lambda: mirrored_strategy.MirroredStrategy(num_gpus=0),
              required_gpus=0),
          combinations.NamedDistribution(
              "Mirrored1GPU",
              lambda: mirrored_strategy.MirroredStrategy(num_gpus=1),
              required_gpus=1),
          combinations.NamedDistribution(
              "Mirrored2GPUs",
              lambda: mirrored_strategy.MirroredStrategy(num_gpus=2),
              required_gpus=2),
      ],
      mode=["graph"])

  @combinations.generate(multi_worker_allreduce_combinations)
  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
    distribution.configure(cluster_spec={
        "worker": [
            "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
        ]
    })
    with distribution.scope():
      self._testReductionAndBroadcast(cross_tower_ops, distribution)
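# A hedged reading of the MultiWorkerAllReduce positional arguments above,
# assuming the TF 1.x contrib signature (worker_devices, num_gpus_per_worker,
# all_reduce_spec, num_packs, agg_small_grads_max_bytes,
# agg_small_grads_max_group); the exact parameter names are an assumption.
# Each spec tuple reads (algorithm, shards, tensor limit), with -1 meaning
# "all remaining tensors", so "MultiWorkerAllReduceMultipleSpecs" routes the
# first 100 tensors through pscpu/pscpu and the rest through xring.
# A standalone construction sketch under those assumptions:
example_all_reduce = cross_tower_ops_lib.MultiWorkerAllReduce(
    ["/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"],
    2,                       # GPUs per worker
    ("pscpu/pscpu", 2, -1),  # one spec covering every tensor
    1,                       # pack tensors into a single bucket first
    0, 0)                    # small-gradient aggregation disabled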
def configure(self,
              session_config=None,
              cluster_spec=None,
              task_type=None,
              task_id=None):
  del task_type, task_id
  if session_config:
    session_config.isolate_session_state = True
  if cluster_spec:
    self._initialize_multi_worker(self._num_gpus, cluster_spec)

  if self._cross_tower_ops is None:
    if self._cluster_spec:
      # We currently cannot detect the topology of remote workers, so we
      # hard-code the multi-worker all-reduce algorithm for now.
      if len(self._workers) == 1:
        # The default is "nccl".
        self._cross_tower_ops = cross_tower_ops_lib.AllReduceCrossDeviceOps()
      else:
        # The default is hierarchical reduce and broadcast.
        self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce(
            self._workers, self._num_gpus)
    else:
      self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
          self._devices, session_config=session_config)
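# Usage sketch for the configure() above; a minimal sketch assuming the
# TF 1.x tf.contrib.distribute layout (the import path is an assumption,
# since the hunks here do not show it):
#   from tensorflow.contrib.distribute.python import mirrored_strategy
import tensorflow as tf

strategy = mirrored_strategy.MirroredStrategy(num_gpus=2)
strategy.configure(
    session_config=tf.ConfigProto(),
    cluster_spec={"worker": ["host0:2222", "host1:2222"]})
# Two workers in the cluster_spec, so the branch above hard-codes
# MultiWorkerAllReduce; with no cluster_spec it falls through to
# choose_the_best() on the local devices.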
def configure(self,
              session_config=None,
              cluster_spec=None,
              task_type=None,
              task_id=None):
  del cluster_spec, task_type, task_id
  if self._cross_tower_ops is None:
    if self._cluster_spec:
      self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce(
          self._workers, self._num_gpus)
    else:
      self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
          self._devices, session_config=session_config)
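# In this later revision cluster_spec is deleted rather than consumed, so
# self._cluster_spec is presumably populated at construction time instead and
# configure() only has to pick the cross-tower ops. Single-machine fallback
# sketch (device strings are illustrative; choose_the_best inspects the
# devices it is given):
local_ops = cross_tower_ops_lib.choose_the_best(
    ["/device:GPU:0", "/device:GPU:1"], session_config=None)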
class MultiWorkerCrossTowerOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                   CrossTowerOpsTestBase):

  worker_devices = [
      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
  ]
  multi_worker_allreduce_combinations = combinations.combine(
      cross_tower_ops=[
          combinations.NamedObject(
              "MultiWorkerAllReduce",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReducePack",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReduceAggregation",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
          combinations.NamedObject(
              "MultiWorkerAllReduceMultipleSpecs",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2,
                  [("pscpu/pscpu", 2, 100), ("xring", 2, -1)], 0, 0, 0)),
      ],
      distribution=[
          combinations.multi_worker_strategy_with_cpu,
          combinations.multi_worker_strategy_with_one_gpu,
          combinations.multi_worker_strategy_with_two_gpus
      ],
      mode=["graph"])

  @combinations.generate(multi_worker_allreduce_combinations)
  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
    with distribution.scope():
      self._testReductionAndBroadcast(cross_tower_ops, distribution)
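# combinations.combine() takes the cross product of its keyword lists, so the
# class above generates 4 cross_tower_ops x 3 distributions x 1 mode = 12
# graph-mode test cases. A toy sketch of the same mechanism (the exact return
# type is an implementation detail of combinations.py; one kwargs dict per
# case is the assumption here):
cases = combinations.combine(a=[1, 2], b=["x"], mode=["graph"])
# e.g. cases[0] -> {"a": 1, "b": "x", "mode": "graph"}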