Example #1
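  # Passing the same distribute option both directly on RunConfig and through
  # `experimental_distribute` is a duplicate and must raise ValueError.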
  def test_init_run_config_duplicate_distribute(self):
    with self.assertRaises(ValueError):
      run_config_lib.RunConfig(
          train_distribute=mirrored_strategy.CoreMirroredStrategy(),
          experimental_distribute=DistributeConfig(
              train_distribute=mirrored_strategy.CoreMirroredStrategy()))

    with self.assertRaises(ValueError):
      run_config_lib.RunConfig(
          eval_distribute=mirrored_strategy.CoreMirroredStrategy(),
          experimental_distribute=DistributeConfig(
              eval_distribute=mirrored_strategy.CoreMirroredStrategy()))
Example #2
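  # A RunConfig built under TF_CONFIG_WITHOUT_TASK (a cluster spec with no
  # task entry) is expected to construct without raising.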
  def test_previously_unexpected_cluster_spec(self):
    with test.mock.patch.dict(
        "os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITHOUT_TASK)}):
      run_config_lib.RunConfig(
          experimental_distribute=DistributeConfig(
              train_distribute=mirrored_strategy.CoreMirroredStrategy(
                  ["/device:GPU:0", "/device:GPU:1"])))
Example #3
  def test_init_run_config_independent_worker(self):
    # When `train_distribute` is specified and TF_CONFIG is detected, use
    # distribute coordinator with INDEPENDENT_WORKER mode.
    with test.mock.patch.dict("os.environ",
                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
      config = run_config_lib.RunConfig(
          train_distribute=mirrored_strategy.CoreMirroredStrategy())
    self.assertEqual(config._distribute_coordinator_mode,
                     dc.CoordinatorMode.INDEPENDENT_WORKER)
Example #4
  def test_init_run_config_standalone_client(self):
    # When `train_distribute` is specified, TF_CONFIG is detected and
    # `experimental.remote_cluster` is set, use distribute coordinator with
    # STANDALONE_CLIENT mode.
    config = run_config_lib.RunConfig(
        train_distribute=mirrored_strategy.CoreMirroredStrategy(),
        experimental_distribute=DistributeConfig(
            remote_cluster={"chief": ["fake_worker"]}))
    self.assertEqual(config._distribute_coordinator_mode,
                     dc.CoordinatorMode.STANDALONE_CLIENT)
Example #5
  def test_should_run_distribute_coordinator(self):
    """Tests that should_run_distribute_coordinator returns a correct value."""
    # We don't use distribute coordinator for local training.
    self.assertFalse(
        dc_training.should_run_distribute_coordinator(
            run_config_lib.RunConfig()))

    # When `train_distribute` is not specified, don't use distribute
    # coordinator.
    with test.mock.patch.dict("os.environ",
                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
      self.assertFalse(
          dc_training.should_run_distribute_coordinator(
              run_config_lib.RunConfig()))

    # When `train_distribute` is specified and TF_CONFIG is detected, use
    # distribute coordinator.
    with test.mock.patch.dict("os.environ",
                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
      config_with_train_distribute = run_config_lib.RunConfig(
          experimental_distribute=DistributeConfig(
              train_distribute=mirrored_strategy.CoreMirroredStrategy(
                  num_gpus=2)))
      config_with_eval_distribute = run_config_lib.RunConfig(
          experimental_distribute=DistributeConfig(
              eval_distribute=mirrored_strategy.CoreMirroredStrategy(
                  num_gpus=2)))
    self.assertTrue(
        dc_training.should_run_distribute_coordinator(
            config_with_train_distribute))
    self.assertFalse(
        dc_training.should_run_distribute_coordinator(
            config_with_eval_distribute))

    # With a master in the cluster, don't run distribute coordinator.
    with test.mock.patch.dict(
        "os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}):
      config = run_config_lib.RunConfig(
          experimental_distribute=DistributeConfig(
              train_distribute=mirrored_strategy.CoreMirroredStrategy(
                  num_gpus=2)))
    self.assertFalse(dc_training.should_run_distribute_coordinator(config))
Example #6
  def test_init_run_config_none_distribute_coordinator_mode(self):
    # We don't use distribute coordinator for local training.
    config = run_config_lib.RunConfig(
        train_distribute=mirrored_strategy.CoreMirroredStrategy())
    dc_training.init_run_config(config, {})
    self.assertIsNone(config._distribute_coordinator_mode)

    # With a master in the cluster, don't run distribute coordinator.
    with test.mock.patch.dict("os.environ",
                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}):
      config = run_config_lib.RunConfig(
          train_distribute=mirrored_strategy.CoreMirroredStrategy())
      self.assertIsNone(config._distribute_coordinator_mode)

    # When `train_distribute` is not specified, don't use distribute
    # coordinator.
    with test.mock.patch.dict("os.environ",
                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
      config = run_config_lib.RunConfig()
      self.assertFalse(hasattr(config, "_distribute_coordinator_mode"))
Example #7
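# Each NamedDistribution pairs a readable name with a lambda that builds the
# strategy lazily and, where relevant, the number of GPUs it requires.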
    lambda: mirrored_lib.MirroredStrategy(["/cpu:0"]))
mirrored_strategy_with_one_gpu = NamedDistribution(
    "Mirrored1GPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0"]),
    required_gpus=1)
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]),
    required_gpus=1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]),
    required_gpus=2)
core_mirrored_strategy_with_one_cpu = NamedDistribution(
    "CoreMirrored1CPU",
    lambda: mirrored_lib.CoreMirroredStrategy(["/cpu:0"]))
core_mirrored_strategy_with_one_gpu = NamedDistribution(
    "CoreMirrored1GPU",
    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0"]),
    required_gpus=1)
core_mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "CoreMirroredCPUAndGPU",
    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/cpu:0"]),
    required_gpus=1)
core_mirrored_strategy_with_two_gpus = NamedDistribution(
    "CoreMirrored2GPUs",
    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/gpu:1"]),
    required_gpus=2)


Example #8
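# Combines several MultiWorkerAllReduce configurations with MirroredStrategy
# and CoreMirroredStrategy variants, then runs the shared reduction/broadcast
# test across two workers in graph mode.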
class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                    CrossDeviceOpsTestBase):

  worker_devices = [
      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
  ]
  multi_worker_allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "MultiWorkerAllReduce",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReducePack",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReduceAggregation",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
          combinations.NamedObject(
              "MultiWorkerAllReduceMultipleSpecs",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                      ("xring", 2, -1)], 0, 0, 0)),
      ],
      distribution=[
          combinations.NamedDistribution(
              "MirroredCPU",
              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=0),
              required_gpus=0),
          combinations.NamedDistribution(
              "Mirrored1GPU",
              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=1),
              required_gpus=1),
          combinations.NamedDistribution(
              "Mirrored2GPUs",
              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=2),
              required_gpus=2),
          # pylint: disable=g-long-lambda
          combinations.NamedDistribution(
              "CoreMirroredCPU",
              lambda: mirrored_strategy.CoreMirroredStrategy(["/device:CPU:0"]),
              required_gpus=0),
          combinations.NamedDistribution(
              "CoreMirrored1GPU",
              lambda: mirrored_strategy.CoreMirroredStrategy(["/device:GPU:0"]),
              required_gpus=1),
          combinations.NamedDistribution(
              "CoreMirrored2GPUs",
              lambda: mirrored_strategy.CoreMirroredStrategy(
                  ["/device:GPU:0", "/device:GPU:1"]),
              required_gpus=2),
      ],
      mode=["graph"])

  @combinations.generate(multi_worker_allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, distribution):
    distribution.configure(
        cluster_spec={
            "worker": [
                "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
            ]
        })
    with distribution.scope():
      self._testReductionAndBroadcast(cross_device_ops, distribution)