    def select_strategy(self) -> Strategy:
        if isinstance(self.distributed_backend, Accelerator) and self.distributed_backend.strategy is not None:
            plugin = self.distributed_backend.strategy
        elif self.use_ddp2:
            plugin = DDP2Strategy(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment)
        elif self.use_ddp and self.use_deepspeed:
            plugin = DeepSpeedStrategy(
                cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
            )
        elif self.use_ddp:
            use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks()
            use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.detect()
            use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.detect()
            use_ddp_spawn = self._strategy_type == _StrategyType.DDP_SPAWN
            use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu
            use_tpu_spawn = self.use_tpu and self._strategy_type == _StrategyType.TPU_SPAWN
            use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.detect()
            use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.detect()
            use_ddp_cpu_slurm = use_ddp_cpu_spawn and self._is_slurm_managing_tasks()
            use_ddp_sharded = self._strategy_type == _StrategyType.DDP_SHARDED
            use_ddp_sharded_spawn = self._strategy_type == _StrategyType.DDP_SHARDED_SPAWN
            use_ddp_fully_sharded = self._strategy_type == _StrategyType.DDP_FULLY_SHARDED

            if use_tpu_spawn:
                ddp_strategy_cls = TPUSpawnStrategy
            elif use_ddp_sharded:
                ddp_strategy_cls = DDPShardedStrategy
            elif use_ddp_sharded_spawn:
                ddp_strategy_cls = DDPSpawnShardedStrategy
            elif (
                use_ddp_cpu_slurm
                or use_slurm_ddp
                or use_ddp_cpu_torch_elastic
                or use_torchelastic_ddp
                or use_kubeflow_ddp
                or use_ddp_cpu_kubeflow
            ):
                ddp_strategy_cls = DDPStrategy
            elif use_ddp_spawn or use_ddp_cpu_spawn:
                ddp_strategy_cls = DDPSpawnStrategy
            elif use_ddp_fully_sharded:
                ddp_strategy_cls = DDPFullyShardedStrategy
            else:
                ddp_strategy_cls = DDPStrategy

            plugin = ddp_strategy_cls(
                parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment
            )
        elif self.use_dp:
            plugin = DataParallelStrategy(parallel_devices=self.parallel_devices)
        elif self.use_horovod:
            plugin = HorovodStrategy(parallel_devices=self.parallel_devices)
        elif self.use_tpu and isinstance(self.tpu_cores, list):
            plugin = SingleTPUStrategy(self.tpu_id)
        elif self.use_ipu:
            plugin = IPUStrategy(parallel_devices=self.parallel_devices)
        else:
            single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
            plugin = SingleDeviceStrategy(device=single_gpu_ordinal if self.use_gpu else "cpu")
        return plugin
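The connector methods on this page are internal; users drive them through the Trainer constructor, whose arguments set the use_ddp / _strategy_type flags read above. A minimal sketch of the public entry point (assuming PyTorch Lightning >= 1.6, where the strategy argument exists):

from pytorch_lightning import Trainer

# On a 2-GPU machine this reaches the `self.use_ddp` branch above and
# instantiates a DDPStrategy with two parallel devices.
trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp")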
Example #2
    def select_training_type_plugin(self) -> TrainingTypePlugin:
        if self.use_ddp2:
            plugin = DDP2Plugin(
                parallel_devices=self.parallel_devices,
                num_nodes=self.num_nodes,
                cluster_environment=self.cluster_environment,
                sync_batchnorm=self.sync_batchnorm,
            )
        elif self.use_ddp and self.use_deepspeed:
            plugin = DeepSpeedPlugin(
                num_nodes=self.num_nodes,
                cluster_environment=self.select_cluster_environment(),
                parallel_devices=self.parallel_devices
            )
        elif self.use_ddp:
            use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
            use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.is_using_torchelastic()
            use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
            use_ddp_cpu_spawn = self.use_ddp and self.on_cpu
            use_tpu_spawn = self.on_tpu and self._distrib_type == DistributedType.TPU_SPAWN
            use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.is_using_torchelastic()
            use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
            use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
            use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN

            # TODO: decouple from TE
            # ddp script mode uses the same flags as TE
            if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
                use_torchelastic_ddp = False

            if use_tpu_spawn:
                ddp_plugin_cls = TPUSpawnPlugin
            elif use_ddp_sharded:
                ddp_plugin_cls = DDPShardedPlugin
            elif use_ddp_sharded_spawn:
                ddp_plugin_cls = DDPSpawnShardedPlugin
            elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp:
                ddp_plugin_cls = DDPPlugin
            elif use_ddp_spawn or use_ddp_cpu_spawn:
                ddp_plugin_cls = DDPSpawnPlugin
            else:
                ddp_plugin_cls = DDPPlugin

            plugin = ddp_plugin_cls(
                parallel_devices=self.parallel_devices,
                num_nodes=self.num_nodes,
                cluster_environment=self.cluster_environment,
                sync_batchnorm=self.sync_batchnorm,
            )
        elif self.use_dp:
            plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
        elif self.use_horovod:
            plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
        elif self.on_tpu and isinstance(self.tpu_cores, list):
            plugin = SingleTPUPlugin(self.tpu_id)
        else:
            single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
            plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu"))
        return plugin
Example #3
    def select_cluster_environment(self) -> ClusterEnvironment:
        if self._cluster_environment is not None:
            return self._cluster_environment
        if self.is_slurm_managing_tasks:
            env = SLURMEnvironment()
        elif TorchElasticEnvironment.is_using_torchelastic():
            env = TorchElasticEnvironment()
        else:
            env = LightningEnvironment()
        return env
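Because a user-supplied environment takes precedence (the _cluster_environment is not None branch), automatic detection can be bypassed entirely. A sketch of how a caller might force SLURM semantics, assuming the standard plugin-passing API:

from pytorch_lightning import Trainer
from pytorch_lightning.plugins.environments import SLURMEnvironment

# An explicit environment short-circuits the detection chain above:
# _cluster_environment is set from the plugins list and returned directly.
trainer = Trainer(plugins=[SLURMEnvironment()])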
Example #4
def test_detect_after_1_9_1():
    """Test the detection of a torchelastic environment configuration after 1.9.1."""
    with mock.patch.dict(os.environ, {}):
        assert not TorchElasticEnvironment.detect()

    with mock.patch.dict(
        os.environ,
        {
            "TORCHELASTIC_RUN_ID": "",
        },
    ):
        assert TorchElasticEnvironment.detect()
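The two assertions pin down the detection contract: after torchelastic 1.9.1, the mere presence of TORCHELASTIC_RUN_ID, even with an empty value, signals a torchelastic launch. A minimal sketch of a detect() consistent with this test (inferred from the test, not the library source):

import os

def detect() -> bool:
    # torchelastic >= 1.9.1 always exports TORCHELASTIC_RUN_ID, so presence
    # alone (the test sets it to an empty string) is sufficient evidence.
    return "TORCHELASTIC_RUN_ID" in os.environ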
Example #5
    def select_cluster_environment(self):
        if self.cluster_environment is not None:
            return self.cluster_environment
        if self.is_slurm_managing_tasks:
            env = SLURMEnvironment()
        elif self.is_using_torchelastic:
            env = TorchElasticEnvironment()
            # TODO: decouple DDP from TE
            #   maybe introduce a DefaultEnvironment?
            os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
        else:
            # TODO: maybe introduce a DefaultEnvironment?
            env = TorchElasticEnvironment()
        return env
Example #6
def test_default_attributes():
    """ Test the default attributes when no environment variables are set. """
    env = TorchElasticEnvironment()
    assert env.creates_children()
    assert env.master_address() == "127.0.0.1"
    assert env.master_port() == 12910
    assert env.world_size() is None
    with pytest.raises(KeyError):
        # local rank is required to be passed as env variable
        env.local_rank()
    assert env.node_rank() == 0
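Compare Example #16 below, which exercises the same defaults against a later API: creates_children() became the creates_processes_externally property, master_address()/master_port() became main_address/main_port, and world_size() went from returning None to raising KeyError when unset.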
Example #7
def test_detect_before_1_9_1():
    """Test the detection of a torchelastic environment configuration before 1.9.1."""
    with mock.patch.dict(os.environ, {}):
        assert not TorchElasticEnvironment.detect()

    with mock.patch.dict(
        os.environ,
        {
            "RANK": "",
            "GROUP_RANK": "",
            "LOCAL_RANK": "",
            "LOCAL_WORLD_SIZE": "",
        },
    ):
        assert TorchElasticEnvironment.detect()
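Before 1.9.1 there was no run id to key on, so detection has to require the full set of per-process rank variables at once. A sketch consistent with this test (the helper name is hypothetical):

import os

def detect_pre_1_9_1() -> bool:
    # All four variables are exported together by older torchelastic
    # launchers; requiring them jointly avoids false positives from a
    # stray RANK or LOCAL_RANK set by some other launcher.
    required = ("RANK", "GROUP_RANK", "LOCAL_RANK", "LOCAL_WORLD_SIZE")
    return all(var in os.environ for var in required)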
Example #8
    def select_cluster_environment(self) -> ClusterEnvironment:
        if self._cluster_environment is not None:
            return self._cluster_environment
        if self._is_slurm_managing_tasks():
            env = SLURMEnvironment()
            rank_zero_info("Multiprocessing is handled by SLURM.")
        elif TorchElasticEnvironment.is_using_torchelastic():
            env = TorchElasticEnvironment()
        elif KubeflowEnvironment.is_using_kubeflow():
            env = KubeflowEnvironment()
        elif LSFEnvironment.is_using_lsf():
            env = LSFEnvironment()
        else:
            env = LightningEnvironment()
        return env
Example #9
def environment_combinations():
    expected = dict(global_rank=3, local_rank=1, node_rank=1, world_size=4)
    # Lightning
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "NODE_RANK": "1",
        "WORLD_SIZE": "8"
    }
    environment = LightningEnvironment()
    yield environment, variables, expected
    # SLURM
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_LOCALID": "1",
        "SLURM_NODEID": "1",
        "SLURM_PROCID": "3",
        "SLURM_NTASKS": "4",
    }
    environment = SLURMEnvironment()
    yield environment, variables, expected
    # TorchElastic
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "GROUP_RANK": "1",
        "RANK": "3",
        "WORLD_SIZE": "4",
        "LOCAL_WORLD_SIZE": "2",
    }
    environment = TorchElasticEnvironment()
    yield environment, variables, expected
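One way these fixtures might be consumed (a sketch; the expected global_rank and world_size for LightningEnvironment are likely established via the rank setters during trainer setup, so only the properties derived directly from the variables are asserted here):

import os
from unittest import mock

def test_local_and_node_rank():
    # Patch each variable set into the environment and check the attributes
    # that each ClusterEnvironment reads straight from os.environ.
    for environment, variables, expected in environment_combinations():
        with mock.patch.dict(os.environ, variables):
            assert environment.local_rank() == expected["local_rank"]
            assert environment.node_rank() == expected["node_rank"]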
Example #10
    def _check_strategy_and_fallback(self) -> None:
        """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
        different choice depending on other parameters or the environment."""
        # current fallback and check logic only apply to user pass in str config and object config
        # TODO this logic should apply to both str and object config
        strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

        if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
            TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
        ):
            strategy_flag = "ddp"
        if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu":
            rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
            strategy_flag = "ddp"
        if (
            strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies()
            or isinstance(self._strategy_flag, DDPFullyShardedNativeStrategy)
        ) and self._accelerator_flag != "gpu":
            raise MisconfigurationException(
                f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
                "but GPU accelerator is not used."
            )

        if strategy_flag:
            self._strategy_flag = strategy_flag
Example #11
    def _check_strategy_and_fallback(self) -> None:
        """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
        different choice depending on other parameters or the environment."""
        # current fallback and check logic only apply to user pass in str config and object config
        # TODO this logic should apply to both str and object config
        strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

        if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
            TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
        ):
            strategy_flag = "ddp"
        if strategy_flag == "dp" and self._accelerator_flag == "cpu":
            rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
            strategy_flag = "ddp"
        if (
            strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies()
            or isinstance(self._strategy_flag, DDPFullyShardedNativeStrategy)
        ) and self._accelerator_flag not in ("cuda", "gpu"):
            raise MisconfigurationException(
                f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
                "but GPU accelerator is not used."
            )
        if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
            raise ValueError(
                f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
                f" platform. We recommend `Trainer(strategy='ddp_spawn')` instead."
            )
        if strategy_flag:
            self._strategy_flag = strategy_flag
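A hypothetical illustration of the first fallback: under a SLURM allocation (or a torchelastic/Kubeflow launch), the cluster launcher has already created the worker processes, so a "ddp_spawn" request is silently upgraded to "ddp":

from pytorch_lightning import Trainer

# Inside a SLURM job step, _check_strategy_and_fallback rewrites the flag,
# and trainer.strategy ends up a DDPStrategy rather than a DDPSpawnStrategy.
trainer = Trainer(accelerator="cpu", devices=2, strategy="ddp_spawn")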
Example #12
    def select_cluster_environment(self) -> ClusterEnvironment:
        if self._cluster_environment is not None:
            return self._cluster_environment
        if self.is_slurm_managing_tasks:
            env = SLURMEnvironment()
            # TODO: decouple DDP from SLURM
            #   refactor and let generic cluster env hold the information about who spawns the processes
            os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
        elif self.is_using_torchelastic:
            env = TorchElasticEnvironment()
            # TODO: decouple DDP from TE
            #   refactor and let generic cluster env hold the information about who spawns the processes
            os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
        else:
            # TODO: maybe introduce a DefaultEnvironment?
            env = TorchElasticEnvironment()
        return env
Example #13
def test_sync_batchnorm_ddp(tmpdir):
    seed_everything(234)
    set_random_master_port()

    # define datamodule and dataloader
    dm = MNISTDataModule()
    dm.prepare_data()
    dm.setup(stage=None)

    train_dataloader = dm.train_dataloader()
    model = SyncBNModule()

    bn_outputs = []

    # shuffle is false by default
    for batch_idx, batch in enumerate(train_dataloader):
        x, _ = batch

        _, out_bn = model.forward(x, batch_idx)
        bn_outputs.append(out_bn)

        # get 3 steps
        if batch_idx == 2:
            break

    bn_outputs = [x.cuda() for x in bn_outputs]

    # reset datamodule
    # batch-size = 16 because 2 GPUs in DDP
    dm = MNISTDataModule(batch_size=16, dist_sampler=True)
    dm.prepare_data()
    dm.setup(stage=None)

    model = SyncBNModule(gpu_count=2, bn_targets=bn_outputs)

    trainer = Trainer(gpus=2,
                      num_nodes=1,
                      accelerator='ddp_spawn',
                      max_epochs=1,
                      max_steps=3,
                      sync_batchnorm=True,
                      num_sanity_val_steps=0,
                      replace_sampler_ddp=False,
                      plugins=[
                          DDPSpawnPlugin(
                              parallel_devices=[
                                  torch.device("cuda", 0),
                                  torch.device("cuda", 1)
                              ],
                              num_nodes=1,
                              sync_batchnorm=True,
                              cluster_environment=TorchElasticEnvironment(),
                              find_unused_parameters=True)
                      ])

    trainer.fit(model, dm)
    assert trainer.state == TrainerState.FINISHED, "Sync batchnorm failing with DDP"
Example #14
def test_attributes_from_environment_variables():
    """ Test that the torchelastic cluster environment takes the attributes from the environment variables. """
    env = TorchElasticEnvironment()
    assert env.master_address() == "1.2.3.4"
    assert env.master_port() == 500
    assert env.world_size() == 20
    assert env.local_rank() == 2
    assert env.node_rank() == 3
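As scraped, this test (and the variant in Example #22) is missing the environment setup it depends on. A plausible reconstruction, assuming the standard TorchElastic variable names shown in Example #9, is a mock.patch.dict decorator:

import os
from unittest import mock

@mock.patch.dict(
    os.environ,
    {
        "MASTER_ADDR": "1.2.3.4",
        "MASTER_PORT": "500",
        "WORLD_SIZE": "20",
        "RANK": "1",
        "LOCAL_RANK": "2",
        "GROUP_RANK": "3",
    },
)
def test_attributes_from_environment_variables():
    ...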
Example #15
    def is_using_torchelastic(self) -> bool:
        """
        .. deprecated:: v1.3
            Will be removed in v1.5.0.

        Returns:
            ``True`` if the current process was launched using the torchelastic command.
        """
        rank_zero_deprecation(
            "The property `AcceleratorConnector.is_using_torchelastic` was deprecated in v1.3"
            " and will be removed in 1.5. Use `TorchElasticEnvironment.is_using_torchelastic()` instead."
        )
        return TorchElasticEnvironment.is_using_torchelastic()
Example #16
def test_default_attributes():
    """Test the default attributes when no environment variables are set."""
    env = TorchElasticEnvironment()
    assert env.creates_processes_externally
    assert env.main_address == "127.0.0.1"
    assert env.main_port == 12910
    with pytest.raises(KeyError):
        # world size is required to be passed as env variable
        env.world_size()
    with pytest.raises(KeyError):
        # local rank is required to be passed as env variable
        env.local_rank()
    assert env.node_rank() == 0
Example #17
def test_v1_4_0_deprecated_lightning_distributed_data_parallel(tmpdir):
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        gpus=2,
        accelerator="ddp_spawn",
        plugins=[
            CustomDDPPlugin(
                parallel_devices=[torch.device("cuda", 0), torch.device("cuda", 1)],
                cluster_environment=TorchElasticEnvironment(),
            )
        ]
    )
    trainer.fit(model)
Example #18
def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[int]]:
    """
    Parses the GPU ids given in the format as accepted by the
    :class:`~pytorch_lightning.trainer.Trainer`.

    Args:
        gpus: An int -1 or string '-1' indicates that all available GPUs should be used.
            A list of ints or a string containing a list of comma-separated integers
            indicates specific GPUs to use.
            An int 0 means that no GPUs should be used.
            Any int N > 0 indicates that GPUs [0..N) should be used.

    Returns:
        A list of GPUs to be used or ``None`` if no GPUs were requested.

    If no GPUs are available but the value of the gpus variable indicates a request for GPUs,
    a MisconfigurationException is raised.
    """
    # Check that gpus param is None, Int, String or List
    _check_data_type(gpus)

    # Handle the case when no gpus are requested
    if gpus is None or isinstance(gpus, int) and gpus == 0:
        return None

    if _compare_version("pytorch_lightning", operator.ge, "1.5") and isinstance(gpus, str) and gpus.strip() == "0":
        # TODO: in v1.5 combine this with the above if statement
        return None

    # We know the user requested GPUs, therefore if some of the
    # requested GPUs are not available an exception is thrown.
    gpus = _normalize_parse_gpu_string_input(gpus)
    gpus = _normalize_parse_gpu_input_to_list(gpus)
    if not gpus:
        raise MisconfigurationException("GPUs requested but none are available.")
    if TorchElasticEnvironment.is_using_torchelastic() and len(gpus) != 1 and len(_get_all_available_gpus()) == 1:
        # omit sanity check on torchelastic as by default it shows one visible GPU per process
        return gpus
    return _sanitize_gpu_ids(gpus)
Example #19
def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[int]]:
    """
    Parses the GPU ids given in the format as accepted by the
    :class:`~pytorch_lightning.trainer.Trainer`.

    Args:
        gpus: An int -1 or string '-1' indicates that all available GPUs should be used.
            A list of unique ints or a string containing a list of comma-separated unique integers
            indicates specific GPUs to use.
            An int 0 means that no GPUs should be used.
            Any int N > 0 indicates that GPUs [0..N) should be used.

    Returns:
        a list of gpus to be used or ``None`` if no GPUs were requested

    Raises:
        MisconfigurationException:
            If no GPUs are available but the value of gpus variable indicates request for GPUs
    """
    # Check that gpus param is None, Int, String or List
    _check_data_type(gpus)

    # Handle the case when no gpus are requested
    if gpus is None or (isinstance(gpus, int) and gpus == 0) or str(gpus).strip() in ("0", "[]"):
        return None

    # We know user requested GPUs therefore if some of the
    # requested GPUs are not available an exception is thrown.
    gpus = _normalize_parse_gpu_string_input(gpus)
    gpus = _normalize_parse_gpu_input_to_list(gpus)
    if not gpus:
        raise MisconfigurationException("GPUs requested but none are available.")
    if TorchElasticEnvironment.detect() and len(gpus) != 1 and len(_get_all_available_gpus()) == 1:
        # omit sanity check on torchelastic as by default shows one visible GPU per process
        return gpus

    # Check that gpus are unique. Duplicate gpus are not supported by the backend.
    _check_unique(gpus)

    return _sanitize_gpu_ids(gpus)
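For concreteness, the parsing rules translate as follows (illustrative values; assumes a machine where GPUs 0..3 are visible, since _sanitize_gpu_ids raises MisconfigurationException for unavailable ids):

parse_gpu_ids(None)   # None - no GPUs requested
parse_gpu_ids(0)      # None - zero GPUs requested
parse_gpu_ids(2)      # [0, 1] - the first two GPUs
parse_gpu_ids("1,3")  # [1, 3] - specific GPUs by id
parse_gpu_ids(-1)     # [0, 1, 2, 3] - all available GPUs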
Example #20
    def _check_strategy_and_fallback(self) -> None:
        """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
        different choice depending on other parameters or the environment."""
        # current fallback and check logic only apply to user pass in str config and object config
        # TODO this logic should apply to both str and object config
        strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

        if strategy_flag == "ddp_cpu":
            if _TPU_AVAILABLE:
                raise MisconfigurationException(
                    "`accelerator='ddp_cpu'` is not supported on TPU machines. "
                    "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810"
                )
            if self._devices_flag == 1 and self._num_nodes_flag > 1:
                strategy_flag = DDPStrategy.strategy_name
            else:
                strategy_flag = "ddp_spawn"
            if self._accelerator_flag == "gpu":
                rank_zero_warn(
                    "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs."
                )
                self._accelerator_flag = "cpu"
                self.accelerator = CPUAccelerator()
        if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
            TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
        ):
            strategy_flag = "ddp"
        if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu":
            rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
            strategy_flag = "ddp"

        if strategy_flag:
            self._strategy_flag = strategy_flag
Example #21
    def select_training_type_plugin(self) -> TrainingTypePlugin:
        if (
            isinstance(self.distributed_backend, Accelerator)
            and self.distributed_backend.training_type_plugin is not None
        ):
            plugin = self.distributed_backend.training_type_plugin
        elif self.use_ddp2:
            plugin = DDP2Plugin(
                parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment
            )
        elif self.use_ddp and self.use_deepspeed:
            plugin = DeepSpeedPlugin(
                cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
            )
        elif self.use_ddp:
            use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
            use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.is_using_torchelastic()
            use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.is_using_kubeflow()
            use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
            use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu
            use_tpu_spawn = self.use_tpu and self._distrib_type == DistributedType.TPU_SPAWN
            use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.is_using_torchelastic()
            use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.is_using_kubeflow()
            use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
            use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
            use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN
            use_ddp_fully_sharded = self._distrib_type == DistributedType.DDP_FULLY_SHARDED

            if use_tpu_spawn:
                ddp_plugin_cls = TPUSpawnPlugin
            elif use_ddp_sharded:
                ddp_plugin_cls = DDPShardedPlugin
            elif use_ddp_sharded_spawn:
                ddp_plugin_cls = DDPSpawnShardedPlugin
            elif (
                use_ddp_cpu_slurm
                or use_slurm_ddp
                or use_ddp_cpu_torch_elastic
                or use_torchelastic_ddp
                or use_kubeflow_ddp
                or use_ddp_cpu_kubeflow
            ):
                ddp_plugin_cls = DDPPlugin
            elif use_ddp_spawn or use_ddp_cpu_spawn:
                ddp_plugin_cls = DDPSpawnPlugin
            elif use_ddp_fully_sharded:
                ddp_plugin_cls = DDPFullyShardedPlugin
            else:
                ddp_plugin_cls = DDPPlugin

            plugin = ddp_plugin_cls(
                parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment
            )
        elif self.use_dp:
            plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
        elif self.use_horovod:
            plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
        elif self.use_tpu and isinstance(self.tpu_cores, list):
            plugin = SingleTPUPlugin(self.tpu_id)
        elif self.use_ipu:
            plugin = IPUPlugin(parallel_devices=self.parallel_devices)
        else:
            single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
            plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.use_gpu else "cpu"))
        return plugin
Example #22
def test_attributes_from_environment_variables(caplog):
    """Test that the torchelastic cluster environment takes the attributes from the environment variables."""
    env = TorchElasticEnvironment()
    assert env.main_address == "1.2.3.4"
    assert env.main_port == 500
    assert env.world_size() == 20
    assert env.global_rank() == 1
    assert env.local_rank() == 2
    assert env.node_rank() == 3
    # setter should be no-op
    with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"):
        env.set_global_rank(100)
    assert env.global_rank() == 1
    assert "setting global rank is not allowed" in caplog.text

    caplog.clear()

    with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"):
        env.set_world_size(100)
    assert env.world_size() == 20
    assert "setting world size is not allowed" in caplog.text