def select_strategy(self) -> Strategy:
        """Pick the strategy instance that matches the connector's current flags.

        The checks run in a fixed priority order and the first match wins: a
        strategy already attached to an ``Accelerator`` given as the distributed
        backend, then DDP2, DeepSpeed (on top of DDP), the DDP family, DP,
        Horovod, a single TPU (when ``tpu_cores`` is a list), IPU, and finally a
        single GPU or CPU device.

        Returns:
            The selected strategy object.
        """
        backend = self.distributed_backend
        if isinstance(backend, Accelerator) and backend.strategy is not None:
            return backend.strategy

        if self.use_ddp2:
            return DDP2Strategy(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment)

        if self.use_ddp and self.use_deepspeed:
            return DeepSpeedStrategy(
                cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
            )

        if self.use_ddp:
            # Which launcher, if any, is already managing the processes.
            slurm_managed = self._is_slurm_managing_tasks()
            under_torchelastic = TorchElasticEnvironment.detect()
            under_kubeflow = KubeflowEnvironment.detect()

            requested = self._strategy_type
            wants_spawn = requested == _StrategyType.DDP_SPAWN
            wants_cpu_spawn = wants_spawn and self.use_cpu
            wants_tpu_spawn = self.use_tpu and requested == _StrategyType.TPU_SPAWN

            # Detection is queried a second time for the CPU-spawn case.
            # NOTE(review): these look subsumed by the three flags above as long as
            # detection is stable between calls -- confirm before simplifying.
            cpu_spawn_under_torchelastic = wants_cpu_spawn and TorchElasticEnvironment.detect()
            cpu_spawn_under_kubeflow = wants_cpu_spawn and KubeflowEnvironment.detect()
            cpu_spawn_under_slurm = wants_cpu_spawn and self._is_slurm_managing_tasks()

            wants_sharded = requested == _StrategyType.DDP_SHARDED
            wants_sharded_spawn = requested == _StrategyType.DDP_SHARDED_SPAWN
            wants_fully_sharded = requested == _StrategyType.DDP_FULLY_SHARDED

            launched_externally = (
                cpu_spawn_under_slurm
                or slurm_managed
                or cpu_spawn_under_torchelastic
                or under_torchelastic
                or under_kubeflow
                or cpu_spawn_under_kubeflow
            )

            # Ordered by priority: the first truthy condition decides the class,
            # plain DDP is the fallback when nothing matches.
            priority_table = (
                (wants_tpu_spawn, TPUSpawnStrategy),
                (wants_sharded, DDPShardedStrategy),
                (wants_sharded_spawn, DDPSpawnShardedStrategy),
                (launched_externally, DDPStrategy),
                (wants_spawn or wants_cpu_spawn, DDPSpawnStrategy),
                (wants_fully_sharded, DDPFullyShardedStrategy),
            )
            chosen_cls = next((cls for applies, cls in priority_table if applies), DDPStrategy)

            return chosen_cls(
                parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment
            )

        if self.use_dp:
            return DataParallelStrategy(parallel_devices=self.parallel_devices)

        if self.use_horovod:
            return HorovodStrategy(parallel_devices=self.parallel_devices)

        if self.use_tpu and isinstance(self.tpu_cores, list):
            return SingleTPUStrategy(self.tpu_id)

        if self.use_ipu:
            return IPUStrategy(parallel_devices=self.parallel_devices)

        # Nothing distributed was requested: run on the root GPU, or on the CPU.
        root_gpu = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        return SingleDeviceStrategy(device=root_gpu if self.use_gpu else "cpu")
Пример #2
0
 def select_cluster_environment(self) -> ClusterEnvironment:
     """Return the cluster environment to use.

     An environment already stored in ``self._cluster_environment`` always
     wins. Otherwise the launchers are probed in order -- SLURM, TorchElastic,
     Kubeflow -- and ``LightningEnvironment`` is the fallback.
     """
     preset = self._cluster_environment
     if preset is not None:
         return preset

     if self.is_slurm_managing_tasks:
         return SLURMEnvironment()
     if TorchElasticEnvironment.is_using_torchelastic():
         return TorchElasticEnvironment()
     if KubeflowEnvironment.is_using_kubeflow():
         return KubeflowEnvironment()
     return LightningEnvironment()
Пример #3
0
 def select_cluster_environment(self) -> ClusterEnvironment:
     """Return the cluster environment to use.

     An environment already stored in ``self._cluster_environment`` always
     wins. Otherwise the launchers are probed in order -- SLURM, TorchElastic,
     Kubeflow, LSF -- and ``LightningEnvironment`` is the fallback. When SLURM
     is selected an informational message is emitted via ``rank_zero_info``.
     """
     preset = self._cluster_environment
     if preset is not None:
         return preset

     if self._is_slurm_managing_tasks():
         slurm_env = SLURMEnvironment()
         rank_zero_info("Multiprocessing is handled by SLURM.")
         return slurm_env
     if TorchElasticEnvironment.is_using_torchelastic():
         return TorchElasticEnvironment()
     if KubeflowEnvironment.is_using_kubeflow():
         return KubeflowEnvironment()
     if LSFEnvironment.is_using_lsf():
         return LSFEnvironment()
     return LightningEnvironment()
Пример #4
0
    def _check_strategy_and_fallback(self) -> None:
        """Apply fallbacks to a string strategy flag and reject unsupported combinations.

        A ``Strategy`` instance is treated as the empty string, so the string
        fallbacks never rewrite it; only the fully-sharded-native check also
        looks at the instance itself.

        Raises:
            MisconfigurationException: If a fully-sharded-native strategy is
                selected (by registered name or as an instance) while the
                accelerator flag is not ``"gpu"``.
        """
        # NOTE: the fallbacks below only act on string flags.
        # TODO: apply the same logic to Strategy objects as well.
        requested = self._strategy_flag
        resolved = "" if isinstance(requested, Strategy) else requested

        # Spawn-based DDP is replaced by plain DDP when an external launcher is detected.
        spawn_aliases = ("ddp_spawn", "ddp_spawn_find_unused_parameters_false")
        if resolved in spawn_aliases:
            launched_externally = (
                TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
            )
            if launched_externally:
                resolved = "ddp"

        # DP and DDP2 are replaced by DDP on CPU, with a warning.
        if resolved in ("dp", "ddp2") and self._accelerator_flag == "cpu":
            rank_zero_warn(f"{resolved!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
            resolved = "ddp"

        wants_native_fsdp = resolved in DDPFullyShardedNativeStrategy.get_registered_strategies() or isinstance(
            requested, DDPFullyShardedNativeStrategy
        )
        if wants_native_fsdp and self._accelerator_flag != "gpu":
            raise MisconfigurationException(
                f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
                "but GPU accelerator is not used."
            )

        # An empty value (Strategy instance or empty flag) leaves the stored flag untouched.
        if resolved:
            self._strategy_flag = resolved
    def _check_strategy_and_fallback(self) -> None:
        """Apply fallbacks to a string strategy flag and reject unsupported combinations.

        When ``self._strategy_flag`` is a ``Strategy`` instance the working
        value is the empty string, so none of the string comparisons match and
        the stored flag is left untouched; only the fully-sharded-native check
        also inspects the instance itself.

        Raises:
            MisconfigurationException: If a fully-sharded-native strategy is
                selected (by registered name or as an instance) while the
                accelerator flag is neither ``"cuda"`` nor ``"gpu"``.
            ValueError: If the flag is one of ``_DDP_FORK_ALIASES`` but
                ``"fork"`` is not among the start methods reported by
                ``torch.multiprocessing.get_all_start_methods()``.
        """
        # The fallbacks below only rewrite string flags.
        # TODO this logic should apply to both str and object config
        strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

        # Spawn-based DDP is replaced by plain DDP when an external launcher is detected.
        if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
            TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
        ):
            strategy_flag = "ddp"
        # DP is replaced by DDP on CPU, with a warning.
        if strategy_flag == "dp" and self._accelerator_flag == "cpu":
            rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
            strategy_flag = "ddp"
        if (
            strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies()
            or isinstance(self._strategy_flag, DDPFullyShardedNativeStrategy)
        ) and self._accelerator_flag not in ("cuda", "gpu"):
            raise MisconfigurationException(
                f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
                "but GPU accelerator is not used."
            )
        if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
            raise ValueError(
                f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
                " platform. We recommend `Trainer(strategy='ddp_spawn')` instead."
            )
        # An empty value (Strategy instance or empty flag) leaves the stored flag untouched.
        if strategy_flag:
            self._strategy_flag = strategy_flag
Пример #6
0
def test_default_attributes():
    """Check a ``KubeflowEnvironment`` when the variables it reads are missing.

    The environment must report that processes are created externally, every
    lookup backed by a required variable must raise ``KeyError``, and the local
    rank must be 0.

    NOTE(review): the set-up that removes the environment variables is not
    visible in this snippet.
    """
    env = KubeflowEnvironment()
    assert env.creates_processes_externally

    # Required variable -> the lookup that depends on it.
    required_lookups = (
        ("MASTER_ADDR", lambda: env.main_address),
        ("MASTER_PORT", lambda: env.main_port),
        ("WORLD_SIZE", lambda: env.world_size()),
        ("RANK", lambda: env.global_rank()),
    )
    for _variable, lookup in required_lookups:
        with pytest.raises(KeyError):
            lookup()

    assert env.local_rank() == 0
Пример #7
0
    def _check_strategy_and_fallback(self) -> None:
        """Apply fallbacks to a string strategy flag depending on devices and environment.

        A ``Strategy`` instance is treated as the empty string, so none of the
        string fallbacks rewrite it and the stored flag is left untouched.

        Raises:
            MisconfigurationException: If ``"ddp_cpu"`` is requested while
                ``_TPU_AVAILABLE`` is truthy.
        """
        # NOTE: the fallbacks below only act on string flags.
        # TODO: apply the same logic to Strategy objects as well.
        requested = self._strategy_flag
        resolved = "" if isinstance(requested, Strategy) else requested

        if resolved == "ddp_cpu":
            if _TPU_AVAILABLE:
                raise MisconfigurationException(
                    "`accelerator='ddp_cpu'` is not supported on TPU machines. "
                    "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810"
                )
            # One device on several nodes maps to DDP; anything else to spawn-based DDP.
            one_device_many_nodes = self._devices_flag == 1 and self._num_nodes_flag > 1
            resolved = DDPStrategy.strategy_name if one_device_many_nodes else "ddp_spawn"
            if self._accelerator_flag == "gpu":
                rank_zero_warn(
                    "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs."
                )
                # Force the CPU accelerator since "ddp_cpu" was asked for explicitly.
                self._accelerator_flag = "cpu"
                self.accelerator = CPUAccelerator()

        # Spawn-based DDP is replaced by plain DDP when an external launcher is detected.
        spawn_aliases = ("ddp_spawn", "ddp_spawn_find_unused_parameters_false")
        if resolved in spawn_aliases:
            launched_externally = (
                TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
            )
            if launched_externally:
                resolved = "ddp"

        # DP and DDP2 are replaced by DDP on CPU, with a warning.
        if resolved in ("dp", "ddp2") and self._accelerator_flag == "cpu":
            rank_zero_warn(f"{resolved!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
            resolved = "ddp"

        # An empty value (Strategy instance or empty flag) leaves the stored flag untouched.
        if resolved:
            self._strategy_flag = resolved
Пример #8
0
    def select_training_type_plugin(self) -> TrainingTypePlugin:
        """Pick the training-type plugin that matches the connector's current flags.

        The checks run in a fixed priority order and the first match wins: a
        plugin already attached to an ``Accelerator`` given as the distributed
        backend, then DDP2, DeepSpeed (on top of DDP), the DDP family, DP,
        Horovod, a single TPU (when ``tpu_cores`` is a list), IPU, and finally a
        single CUDA or CPU device.

        Returns:
            The selected training-type plugin.
        """
        backend = self.distributed_backend
        if isinstance(backend, Accelerator) and backend.training_type_plugin is not None:
            return backend.training_type_plugin

        if self.use_ddp2:
            return DDP2Plugin(
                parallel_devices=self.parallel_devices,
                cluster_environment=self.cluster_environment,
            )

        if self.use_ddp and self.use_deepspeed:
            return DeepSpeedPlugin(
                cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
            )

        if self.use_ddp:
            # Which launcher, if any, is already managing the processes.
            slurm_ddp = self.is_slurm_managing_tasks
            torchelastic_ddp = TorchElasticEnvironment.is_using_torchelastic()
            kubeflow_ddp = KubeflowEnvironment.is_using_kubeflow()

            distrib_type = self._distrib_type
            spawn_ddp = distrib_type == DistributedType.DDP_SPAWN
            # Any DDP run on CPU counts as CPU spawn here, whatever the distributed type.
            cpu_spawn_ddp = self.on_cpu
            tpu_spawn = self.on_tpu and distrib_type == DistributedType.TPU_SPAWN

            # The launchers are queried a second time for the CPU case.
            cpu_torchelastic = cpu_spawn_ddp and TorchElasticEnvironment.is_using_torchelastic()
            cpu_kubeflow = cpu_spawn_ddp and KubeflowEnvironment.is_using_kubeflow()
            cpu_slurm = cpu_spawn_ddp and self.is_slurm_managing_tasks

            sharded = distrib_type == DistributedType.DDP_SHARDED
            sharded_spawn = distrib_type == DistributedType.DDP_SHARDED_SPAWN
            fully_sharded = distrib_type == DistributedType.DDP_FULLY_SHARDED

            # TODO: decouple from TE
            # DDP script mode uses the same flags as TorchElastic, so the
            # TorchElastic flag is dropped when running inside a DDP subprocess.
            if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
                torchelastic_ddp = False

            launched_externally = (
                cpu_slurm or slurm_ddp or cpu_torchelastic or torchelastic_ddp or kubeflow_ddp or cpu_kubeflow
            )

            if tpu_spawn:
                chosen_cls = TPUSpawnPlugin
            elif sharded:
                chosen_cls = DDPShardedPlugin
            elif sharded_spawn:
                chosen_cls = DDPSpawnShardedPlugin
            elif launched_externally:
                chosen_cls = DDPPlugin
            elif spawn_ddp or cpu_spawn_ddp:
                chosen_cls = DDPSpawnPlugin
            elif fully_sharded:
                chosen_cls = DDPFullyShardedPlugin
            else:
                chosen_cls = DDPPlugin

            return chosen_cls(
                parallel_devices=self.parallel_devices,
                cluster_environment=self.cluster_environment,
            )

        if self.use_dp:
            return DataParallelPlugin(parallel_devices=self.parallel_devices)

        if self.use_horovod:
            return HorovodPlugin(parallel_devices=self.parallel_devices)

        if self.on_tpu and isinstance(self.tpu_cores, list):
            return SingleTPUPlugin(self.tpu_id)

        if self.on_ipu:
            return IPUPlugin(parallel_devices=self.parallel_devices)

        # Nothing distributed was requested: run on the root GPU, or on the CPU.
        root_gpu = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        device_spec = f"cuda:{root_gpu}" if self.on_gpu else "cpu"
        return SingleDevicePlugin(device=torch.device(device_spec))
Пример #9
0
def test_detect_kubeflow():
    """``KubeflowEnvironment.detect()`` must report a positive detection.

    NOTE(review): the environment set-up this relies on is not visible in this
    snippet.
    """
    detected = KubeflowEnvironment.detect()
    assert detected
Пример #10
0
def test_attributes_from_environment_variables(caplog):
    """Check the attributes reported by ``KubeflowEnvironment`` and that its setters are no-ops.

    The global-rank and world-size setters must leave the reported values
    unchanged and log a message saying that setting them is not allowed.

    NOTE(review): the expected values presumably come from environment
    variables patched by set-up that is not visible in this snippet.
    """
    env = KubeflowEnvironment()

    reported = (
        env.main_address,
        env.main_port,
        env.world_size(),
        env.global_rank(),
        env.local_rank(),
        env.node_rank(),
    )
    assert reported == ("1.2.3.4", 500, 20, 1, 0, 1)

    environments_logger = "pytorch_lightning.plugins.environments"

    def assert_setter_ignored(setter, getter, unchanged_value, expected_message):
        # The setter must change nothing and leave a debug-level log entry.
        with caplog.at_level(logging.DEBUG, logger=environments_logger):
            setter(100)
        assert getter() == unchanged_value
        assert expected_message in caplog.text

    assert_setter_ignored(env.set_global_rank, env.global_rank, 1, "setting global rank is not allowed")

    # Drop the captured records so the second check only sees its own message.
    caplog.clear()

    assert_setter_ignored(env.set_world_size, env.world_size, 20, "setting world size is not allowed")
Пример #11
0
def test_detect_torchelastic_over_kubeflow():
    """``KubeflowEnvironment.detect()`` must report no detection in this scenario.

    NOTE(review): judging by the test name, TorchElastic variables are
    presumably present as well; the set-up is not visible in this snippet.
    """
    detected = KubeflowEnvironment.detect()
    assert not detected
def test_is_using_kubeflow():
    """``KubeflowEnvironment.is_using_kubeflow()`` must return a truthy value.

    NOTE(review): the environment set-up this relies on is not visible in this
    snippet.
    """
    in_use = KubeflowEnvironment.is_using_kubeflow()
    assert in_use
def test_is_using_kubeflow_torchelastic():
    """``KubeflowEnvironment.is_using_kubeflow()`` must return a falsy value in this scenario.

    NOTE(review): judging by the test name, TorchElastic variables are
    presumably present; the set-up is not visible in this snippet.
    """
    in_use = KubeflowEnvironment.is_using_kubeflow()
    assert not in_use