def test_init_run_config_duplicate_distribute(self):
  """Checks that RunConfig rejects a strategy supplied in two places.

  For `train_distribute` and then `eval_distribute`, the same argument is
  passed both directly to `RunConfig` and inside `experimental_distribute`;
  each combination is expected to raise `ValueError`.
  """
  for duplicated_arg in ("train_distribute", "eval_distribute"):
    with self.assertRaises(ValueError):
      # Everything is built inside the assertion context, with a fresh
      # strategy instance for each of the two locations.
      direct_kwargs = {
          duplicated_arg: mirrored_strategy.CoreMirroredStrategy()
      }
      nested_config = DistributeConfig(
          **{duplicated_arg: mirrored_strategy.CoreMirroredStrategy()})
      run_config_lib.RunConfig(
          experimental_distribute=nested_config, **direct_kwargs)
def test_previously_unexpected_cluster_spec(self):
  """Builds a RunConfig while TF_CONFIG is set to TF_CONFIG_WITHOUT_TASK.

  There is no explicit assertion: the test passes as long as constructing
  the two-GPU `CoreMirroredStrategy` and the `RunConfig` does not raise.
  """
  patched_environ = {"TF_CONFIG": json.dumps(TF_CONFIG_WITHOUT_TASK)}
  with test.mock.patch.dict("os.environ", patched_environ):
    # The strategy is created while the environment is patched, exactly as
    # the RunConfig is.
    two_gpu_strategy = mirrored_strategy.CoreMirroredStrategy(
        ["/device:GPU:0", "/device:GPU:1"])
    distribute_config = DistributeConfig(train_distribute=two_gpu_strategy)
    run_config_lib.RunConfig(experimental_distribute=distribute_config)
def test_init_run_config_independent_worker(self):
  """Checks that INDEPENDENT_WORKER coordinator mode is selected.

  With TF_CONFIG set to TF_CONFIG_WITH_CHIEF and `train_distribute` given,
  the config's `_distribute_coordinator_mode` is expected to equal
  `dc.CoordinatorMode.INDEPENDENT_WORKER`.
  """
  chief_environ = {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}
  with test.mock.patch.dict("os.environ", chief_environ):
    strategy = mirrored_strategy.CoreMirroredStrategy()
    config = run_config_lib.RunConfig(train_distribute=strategy)
    expected_mode = dc.CoordinatorMode.INDEPENDENT_WORKER
    self.assertEqual(config._distribute_coordinator_mode, expected_mode)
def test_init_run_config_standalone_client(self):
  """Checks that STANDALONE_CLIENT coordinator mode is selected.

  `train_distribute` is passed directly and `remote_cluster` is supplied
  through `experimental_distribute`; the resulting config is expected to
  report `dc.CoordinatorMode.STANDALONE_CLIENT`.
  """
  # Note: this test does not patch TF_CONFIG itself.
  strategy = mirrored_strategy.CoreMirroredStrategy()
  remote_cluster_config = DistributeConfig(
      remote_cluster={"chief": ["fake_worker"]})
  config = run_config_lib.RunConfig(
      train_distribute=strategy,
      experimental_distribute=remote_cluster_config)

  expected_mode = dc.CoordinatorMode.STANDALONE_CLIENT
  self.assertEqual(config._distribute_coordinator_mode, expected_mode)
def test_should_run_distribute_coordinator(self):
  """Verifies the result of `should_run_distribute_coordinator`.

  Scenarios covered, in order:
    * default `RunConfig`, no TF_CONFIG patched: False.
    * default `RunConfig` with TF_CONFIG_WITH_CHIEF: False.
    * TF_CONFIG_WITH_CHIEF with `train_distribute` set: True; with only
      `eval_distribute` set: False.
    * TF_CONFIG_WITH_MASTER with `train_distribute` set: False.
  """
  should_run = dc_training.should_run_distribute_coordinator
  chief_environ = {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}
  master_environ = {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}

  # Local training: the distribute coordinator is not used.
  local_config = run_config_lib.RunConfig()
  self.assertFalse(should_run(local_config))

  # Without `train_distribute`, the coordinator is not used even though a
  # cluster is described in TF_CONFIG.
  with test.mock.patch.dict("os.environ", chief_environ):
    plain_config = run_config_lib.RunConfig()
    self.assertFalse(should_run(plain_config))

  # With TF_CONFIG present, only `train_distribute` turns the coordinator
  # on; `eval_distribute` alone does not.
  with test.mock.patch.dict("os.environ", chief_environ):
    train_strategy = mirrored_strategy.CoreMirroredStrategy(num_gpus=2)
    config_with_train_distribute = run_config_lib.RunConfig(
        experimental_distribute=DistributeConfig(
            train_distribute=train_strategy))

    eval_strategy = mirrored_strategy.CoreMirroredStrategy(num_gpus=2)
    config_with_eval_distribute = run_config_lib.RunConfig(
        experimental_distribute=DistributeConfig(
            eval_distribute=eval_strategy))

    self.assertTrue(should_run(config_with_train_distribute))
    self.assertFalse(should_run(config_with_eval_distribute))

  # A cluster that contains a master does not run the coordinator.
  with test.mock.patch.dict("os.environ", master_environ):
    master_strategy = mirrored_strategy.CoreMirroredStrategy(num_gpus=2)
    config = run_config_lib.RunConfig(
        experimental_distribute=DistributeConfig(
            train_distribute=master_strategy))
    self.assertFalse(should_run(config))
def test_init_run_config_none_distribute_coordinator_mode(self):
  """Checks the cases where no distribute coordinator mode is selected.

  Scenarios covered, in order:
    * no TF_CONFIG patched, `train_distribute` set, `init_run_config`
      invoked explicitly with an empty dict: mode is None.
    * TF_CONFIG_WITH_MASTER with `train_distribute` set: mode is None.
    * TF_CONFIG_WITH_CHIEF without `train_distribute`: the config has no
      `_distribute_coordinator_mode` attribute at all.
  """
  # Local training: the distribute coordinator is not used.
  local_strategy = mirrored_strategy.CoreMirroredStrategy()
  config = run_config_lib.RunConfig(train_distribute=local_strategy)
  dc_training.init_run_config(config, {})
  self.assertIsNone(config._distribute_coordinator_mode)

  # A cluster that contains a master does not run the coordinator.
  master_environ = {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}
  with test.mock.patch.dict("os.environ", master_environ):
    master_strategy = mirrored_strategy.CoreMirroredStrategy()
    config = run_config_lib.RunConfig(train_distribute=master_strategy)
    self.assertIsNone(config._distribute_coordinator_mode)

  # Without `train_distribute` the attribute is expected to be absent
  # rather than set to None.
  chief_environ = {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}
  with test.mock.patch.dict("os.environ", chief_environ):
    config = run_config_lib.RunConfig()
    has_mode_attr = hasattr(config, "_distribute_coordinator_mode")
    self.assertFalse(has_mode_attr)
lambda: mirrored_lib.MirroredStrategy(["/cpu:0"])) mirrored_strategy_with_one_gpu = NamedDistribution( "Mirrored1GPU", lambda: mirrored_lib.MirroredStrategy(["/gpu:0"]), required_gpus=1) mirrored_strategy_with_gpu_and_cpu = NamedDistribution( "MirroredCPUAndGPU", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]), required_gpus=1) mirrored_strategy_with_two_gpus = NamedDistribution( "Mirrored2GPUs", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]), required_gpus=2) core_mirrored_strategy_with_one_cpu = NamedDistribution( "CoreMirrored1CPU", lambda: mirrored_lib.CoreMirroredStrategy(["/cpu:0"])) core_mirrored_strategy_with_one_gpu = NamedDistribution( "CoreMirrored1GPU", lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0"]), required_gpus=1) core_mirrored_strategy_with_gpu_and_cpu = NamedDistribution( "CoreMirroredCPUAndGPU", lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/cpu:0"]), required_gpus=1) core_mirrored_strategy_with_two_gpus = NamedDistribution( "CoreMirrored2GPUs", lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/gpu:1"]), required_gpus=2) gradient_descent_optimizer_v1_fn = NamedObject(
class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                    CrossDeviceOpsTestBase):
  """Reduction/broadcast test over MultiWorkerAllReduce configurations.

  Every `MultiWorkerAllReduce` variant listed below is combined with every
  listed mirrored strategy, in graph mode only, and fed to
  `testReductionAndBroadcast`.
  """

  # Device prefixes of the two worker tasks handed to MultiWorkerAllReduce.
  worker_devices = [
      "/job:worker/replica:0/task:0",
      "/job:worker/replica:0/task:1",
  ]

  # NOTE(review): the meaning of the positional MultiWorkerAllReduce
  # arguments is not visible here; the entry names suggest the trailing
  # three integers control packing and small-gradient aggregation -- confirm
  # against the MultiWorkerAllReduce signature.
  multi_worker_allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "MultiWorkerAllReduce",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0),
          ),
          combinations.NamedObject(
              "MultiWorkerAllReducePack",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0),
          ),
          combinations.NamedObject(
              "MultiWorkerAllReduceAggregation",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10),
          ),
          combinations.NamedObject(
              "MultiWorkerAllReduceMultipleSpecs",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices,
                  2,
                  [("pscpu/pscpu", 2, 100), ("xring", 2, -1)],
                  0,
                  0,
                  0),
          ),
      ],
      distribution=[
          # Strategies are wrapped in lambdas, so they are only built when
          # the lambda is called.
          combinations.NamedDistribution(
              "MirroredCPU",
              lambda: mirrored_strategy.MirroredStrategy(
                  num_gpus_per_worker=0),
              required_gpus=0,
          ),
          combinations.NamedDistribution(
              "Mirrored1GPU",
              lambda: mirrored_strategy.MirroredStrategy(
                  num_gpus_per_worker=1),
              required_gpus=1,
          ),
          combinations.NamedDistribution(
              "Mirrored2GPUs",
              lambda: mirrored_strategy.MirroredStrategy(
                  num_gpus_per_worker=2),
              required_gpus=2,
          ),
          # pylint: disable=g-long-lambda
          combinations.NamedDistribution(
              "CoreMirroredCPU",
              lambda: mirrored_strategy.CoreMirroredStrategy(
                  ["/device:CPU:0"]),
              required_gpus=0,
          ),
          combinations.NamedDistribution(
              "CoreMirrored1GPU",
              lambda: mirrored_strategy.CoreMirroredStrategy(
                  ["/device:GPU:0"]),
              required_gpus=1,
          ),
          combinations.NamedDistribution(
              "CoreMirrored2GPUs",
              lambda: mirrored_strategy.CoreMirroredStrategy(
                  ["/device:GPU:0", "/device:GPU:1"]),
              required_gpus=2,
          ),
      ],
      mode=["graph"],
  )

  @combinations.generate(multi_worker_allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, distribution):
    """Configures a two-worker cluster and runs the shared check.

    Args:
      cross_device_ops: the cross-device ops object under test, taken from
        the generated combination.
      distribution: the strategy from the generated combination; it is
        configured with the cluster spec before its scope is entered.
    """
    two_worker_cluster = {
        "worker": [
            "/job:worker/replica:0/task:0",
            "/job:worker/replica:0/task:1",
        ]
    }
    distribution.configure(cluster_spec=two_worker_cluster)
    with distribution.scope():
      # `_testReductionAndBroadcast` is not defined in this class;
      # presumably it comes from CrossDeviceOpsTestBase -- verify.
      self._testReductionAndBroadcast(cross_device_ops, distribution)