def test_deepspeed_multigpu_single_file(tmpdir):
    """Test to ensure that DeepSpeed loads from a single file checkpoint."""
    model = BoringModel()
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
    trainer.fit(model)
    trainer.save_checkpoint(checkpoint_path)

    trainer = Trainer(
        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=1, fast_dev_run=True, precision=16
    )
    plugin = trainer.training_type_plugin
    assert isinstance(plugin, DeepSpeedPlugin)
    assert not plugin.load_full_weights
    with pytest.raises(MisconfigurationException, match="DeepSpeed was unable to load the checkpoint."):
        trainer.test(model, ckpt_path=checkpoint_path)

    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=[DeepSpeedPlugin(stage=3, load_full_weights=True)],
        gpus=1,
        fast_dev_run=True,
        precision=16,
    )
    plugin = trainer.training_type_plugin
    assert isinstance(plugin, DeepSpeedPlugin)
    assert plugin.load_full_weights
    trainer.test(model, ckpt_path=checkpoint_path)
def run_checkpoint_test(tmpdir: str, automatic_optimization: bool = True, accumulate_grad_batches: int = 2):
    """Train a parallel classification model with ZeRO Stage 3 and verify test accuracy from the best checkpoint."""
    seed_everything(1)
    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=10,
        plugins=[DeepSpeedPlugin(stage=3)],
        gpus=2,
        precision=16,
        accumulate_grad_batches=accumulate_grad_batches,
        callbacks=[ck],
    )
    trainer.fit(model, datamodule=dm)

    results = trainer.test(datamodule=dm)
    assert results[0]["test_acc"] > 0.7
    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]["test_acc"] > 0.7
    assert saved_results == results

    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    trainer = Trainer(default_root_dir=tmpdir, gpus=2, plugins=[DeepSpeedPlugin(stage=3)], precision=16)

    results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
    assert results[0]["test_acc"] > 0.7
def test_deepspeed_multigpu_stage_3_resume_training(tmpdir):
    """Test to ensure with Stage 3 and multiple GPUs that we can resume training."""
    initial_model = ModelParallelClassificationModel()
    dm = ClassifDataModule()

    ck = ModelCheckpoint(monitor="val_acc",
                         mode="max",
                         save_last=True,
                         save_top_k=-1)
    initial_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        plugins=DeepSpeedPlugin(stage=3),
        gpus=1,
        precision=16,
        callbacks=[ck],
    )
    initial_trainer.fit(initial_model, datamodule=dm)

    class TestCallback(Callback):
        def on_train_batch_start(self, trainer: Trainer,
                                 pl_module: LightningModule, batch: Any,
                                 batch_idx: int, dataloader_idx: int) -> None:
            original_deepspeed_plugin = initial_trainer.accelerator.training_type_plugin
            current_deepspeed_plugin = trainer.accelerator.training_type_plugin

            assert isinstance(original_deepspeed_plugin, DeepSpeedPlugin)
            assert isinstance(current_deepspeed_plugin, DeepSpeedPlugin)
            # assert optimizer states are correctly loaded
            original_optimizer_dict = original_deepspeed_plugin.deepspeed_engine.optimizer.state_dict()
            current_optimizer_dict = current_deepspeed_plugin.deepspeed_engine.optimizer.state_dict()
            for orig_tensor, current_tensor in zip(
                    original_optimizer_dict["fp32_flat_groups"],
                    current_optimizer_dict["fp32_flat_groups"]):
                assert torch.all(orig_tensor.eq(current_tensor))
            # assert model state is loaded correctly
            for current_param, initial_param in zip(
                    pl_module.parameters(), initial_model.parameters()):
                assert torch.equal(current_param.cpu(), initial_param.cpu())
            # assert epoch has correctly been restored
            assert trainer.current_epoch == 1

    model = ModelParallelClassificationModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        plugins=DeepSpeedPlugin(stage=3),
        gpus=1,
        precision=16,
        resume_from_checkpoint=ck.best_model_path,
        callbacks=TestCallback(),
    )
    trainer.fit(model, datamodule=dm)
def run_checkpoint_test(tmpdir: str,
                        save_full_weights: bool,
                        automatic_optimization: bool = True,
                        accumulate_grad_batches: int = 2):
    """Train a parallel classification model with ZeRO Stage 3 and verify results from the best checkpoint."""
    seed_everything(1)
    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc",
                         mode="max",
                         save_last=True,
                         save_top_k=-1)
    trainer = Trainer(default_root_dir=tmpdir,
                      progress_bar_refresh_rate=0,
                      max_epochs=10,
                      plugins=[
                          DeepSpeedPlugin(stage=3,
                                          save_full_weights=save_full_weights)
                      ],
                      gpus=2,
                      precision=16,
                      accumulate_grad_batches=accumulate_grad_batches,
                      callbacks=[ck])
    trainer.fit(model, datamodule=dm)

    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]['test_acc'] > 0.7
    assert saved_results == results

    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=10,
                      plugins=[
                          DeepSpeedPlugin(stage=3,
                                          save_full_weights=save_full_weights)
                      ],
                      gpus=2,
                      precision=16,
                      accumulate_grad_batches=2,
                      callbacks=[ck],
                      resume_from_checkpoint=ck.best_model_path)
    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    dm.predict_dataloader = dm.test_dataloader
    results = trainer.predict(datamodule=dm)
    assert results[-1] > 0.7
def test_deepspeed_with_invalid_config_path(tmpdir):
    """Test to ensure if we pass an invalid config path we throw an exception."""

    with pytest.raises(
        MisconfigurationException, match="You passed in a path to a DeepSpeed config but the path does not exist"
    ):
        DeepSpeedPlugin(config="invalid_path.json")
def test_deepspeed_setup_train_dataloader(tmpdir):
    """Test DeepSpeed works when setup is required to call in the DataModule."""

    class TestSetupIsCalledDataModule(LightningDataModule):
        def __init__(self):
            super().__init__()
            self._setup = False

        def setup(self, stage: Optional[str] = None) -> None:
            self._setup = True

        def train_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

        def val_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

        def test_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedPlugin(logging_level=logging.INFO),
        gpus=1,
        fast_dev_run=True,
    )
    dm = TestSetupIsCalledDataModule()
    with mock.patch("deepspeed.utils.logging.logger.warning", autospec=True) as mock_object:
        trainer.fit(model, datamodule=dm)
    assert any("Tried to infer the batch size" in str(arg) for arg in mock_object.call_args_list)
def _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
    """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""
    seed_everything(42)

    class VerificationCallback(Callback):
        def __init__(self):
            self.on_train_batch_start_called = False

        def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int) -> None:
            deepspeed_engine = trainer.training_type_plugin.model
            assert trainer.global_step == deepspeed_engine.global_steps
            self.on_train_batch_start_called = True

    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    verification_callback = VerificationCallback()
    trainer = Trainer(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        # TODO: this test fails with max_epochs >1 as there are leftover batches per epoch.
        # there's divergence in how Lightning handles the last batch of the epoch with how DeepSpeed does it.
        # we step the optimizers on the last batch but DeepSpeed keeps the accumulation for the next epoch
        max_epochs=1,
        strategy=DeepSpeedPlugin(stage=2, offload_optimizer=offload_optimizer),
        gpus=2,
        limit_train_batches=5,
        limit_val_batches=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[verification_callback],
    )
    assert trainer.limit_train_batches % trainer.accumulate_grad_batches != 0, "leftover batches should be tested"
    trainer.fit(model, datamodule=dm)
    assert verification_callback.on_train_batch_start_called
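# The ``offload_optimizer`` argument above is not a fixture; in the original suite it is
# supplied via parametrization. A plausible wrapper (the wrapper name and the concrete
# values are assumptions) would look like this:
@pytest.mark.parametrize("offload_optimizer", [False, True])
def test_deepspeed_stage_2_accumulated_grad_batches_offload(tmpdir, offload_optimizer):
    _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer)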
def test_deepspeed_config(tmpdir, deepspeed_zero_config):
    """Test to ensure deepspeed works correctly when passed a DeepSpeed config object including
    optimizers/schedulers and saves the model weights to load correctly."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.lr_schedules import WarmupLR
            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer

            assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert isinstance(trainer.lr_schedulers[0]["scheduler"], WarmupLR)

    model = BoringModel()
    trainer = Trainer(
        strategy=DeepSpeedPlugin(config=deepspeed_zero_config),
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
        precision=16,
        callbacks=[TestCB()],
    )

    trainer.fit(model)
    trainer.test(model)
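# ``deepspeed_config`` and ``deepspeed_zero_config`` are fixtures that are not shown in these
# snippets. A plausible reconstruction that satisfies the assertions above (the concrete values
# are assumptions; only the key layout follows the standard DeepSpeed config schema):
@pytest.fixture
def deepspeed_config():
    return {
        "optimizer": {"type": "SGD", "params": {"lr": 3e-5}},
        "scheduler": {
            "type": "WarmupLR",
            "params": {"last_batch_iteration": -1, "warmup_min_lr": 0, "warmup_max_lr": 3e-5, "warmup_num_steps": 100},
        },
    }


@pytest.fixture
def deepspeed_zero_config(deepspeed_config):
    return {**deepspeed_config, "zero_allow_untested_optimizer": True, "zero_optimization": {"stage": 2}}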
def test_deepspeed_run_configure_optimizers(tmpdir):
    """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), whilst using
    configure_optimizers for optimizers and schedulers."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer

            assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert isinstance(trainer.lr_schedulers[0]["scheduler"], torch.optim.lr_scheduler.StepLR)
            # check that the lr_scheduler config was preserved
            assert trainer.lr_schedulers[0]["name"] == "Sean"

    class TestModel(BoringModel):
        def configure_optimizers(self):
            [optimizer], [scheduler] = super().configure_optimizers()
            return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "name": "Sean"}}

    model = TestModel()
    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        strategy=DeepSpeedPlugin(),  # disable ZeRO so our optimizers are not wrapped
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
        precision=16,
        callbacks=[TestCB(), lr_monitor],
    )
    trainer.fit(model)

    assert lr_monitor.lrs == {"Sean": [0.1]}

    _assert_save_model_is_equal(model, tmpdir, trainer)
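# ``_assert_save_model_is_equal`` is a helper that is not included in these snippets. A simplified
# sketch of what it might do (an assumption; handling sharded ZeRO checkpoints would additionally
# require DeepSpeed's zero-to-fp32 conversion utilities): save a checkpoint through the trainer
# and compare the reloaded weights against the live model.
def _assert_save_model_is_equal(model, tmpdir, trainer):
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(checkpoint_path)
    trainer.accelerator.barrier()
    if trainer.is_global_zero:
        # Lightning stores the module weights under the ``state_dict`` key of the checkpoint.
        saved_state_dict = torch.load(checkpoint_path)["state_dict"]
        for name, param in model.named_parameters():
            saved_param = saved_state_dict[name].to(device=param.device, dtype=param.dtype)
            assert torch.equal(param, saved_param)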
def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(
        tmpdir, offload_optimizer):
    """
    Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works.
    """
    seed_everything(42)

    class VerificationCallback(Callback):
        def on_train_batch_start(self, trainer, pl_module: LightningModule,
                                 batch: Any, batch_idx: int,
                                 dataloader_idx: int) -> None:
            deepspeed_engine = trainer.training_type_plugin.model
            assert trainer.global_step == deepspeed_engine.global_steps

    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    trainer = Trainer(default_root_dir=tmpdir,
                      progress_bar_refresh_rate=0,
                      max_epochs=5,
                      plugins=[
                          DeepSpeedPlugin(stage=2,
                                          offload_optimizer=offload_optimizer)
                      ],
                      gpus=2,
                      limit_val_batches=2,
                      precision=16,
                      accumulate_grad_batches=2,
                      callbacks=[VerificationCallback()])
    trainer.fit(model, datamodule=dm)
def test_deepspeed_multigpu_test(tmpdir, deepspeed_config):
    """Test to ensure we can use DeepSpeed with just test using ZeRO Stage 3."""
    model = ModelParallelBoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
    )
    trainer.test(model)
def test_deepspeed_custom_precision_params(tmpdir):
    """
        Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes.
    """

    class TestModel(BoringModel):

        def on_train_start(self) -> None:
            assert self.trainer.training_type_plugin.config['fp16']['loss_scale'] == 10
            assert self.trainer.training_type_plugin.config['fp16']['initial_scale_power'] == 10
            assert self.trainer.training_type_plugin.config['fp16']['loss_scale_window'] == 10
            assert self.trainer.training_type_plugin.config['fp16']['hysteresis'] == 10
            assert self.trainer.training_type_plugin.config['fp16']['min_loss_scale'] == 10
            raise SystemExit()

    model = TestModel()
    trainer = Trainer(
        plugins=[
            DeepSpeedPlugin(
                loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10
            )
        ],
        precision=16,
        gpus=1
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)
def test_deepspeed_setup_train_dataloader(tmpdir):
    """Test DeepSpeed works when setup is required to call, and the user passes the batch size manually."""

    class TestSetupIsCalledDataModule(LightningDataModule):
        def __init__(self):
            super().__init__()
            self._setup = False

        def setup(self, stage: Optional[str] = None) -> None:
            self._setup = True

        def train_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

        def val_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

        def test_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=[DeepSpeedPlugin(logging_batch_size_per_gpu=32)],
        gpus=1,
        fast_dev_run=True,
    )
    trainer.fit(model, datamodule=TestSetupIsCalledDataModule())
    trainer.test(model)
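# ``dataset_cls`` and ``value`` in the next test are parametrized arguments rather than fixtures.
# The decorator below is a plausible reconstruction (the exact cases are an assumption, and
# ``RandomIterableDataset`` is assumed to be importable from the same helpers as ``RandomDataset``):
@pytest.mark.parametrize(
    ("dataset_cls", "value"),
    [(RandomDataset, "auto"), (RandomDataset, 10), (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)],
)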
def test_deepspeed_auto_batch_size_config_select(tmpdir, dataset_cls, value):
    """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes."""
    class TestModel(BoringModel):
        def train_dataloader(self):
            return DataLoader(dataset_cls(32, 64))

    class AssertCallback(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            assert isinstance(trainer.accelerator.training_type_plugin,
                              DeepSpeedPlugin)
            config = trainer.accelerator.training_type_plugin.config

            # int value overrides auto mode
            expected_value = value if isinstance(value, int) else 1
            if dataset_cls == RandomDataset:
                expected_value = (
                    pl_module.train_dataloader().batch_size if value == "auto" else value
                )

            assert config['train_micro_batch_size_per_gpu'] == expected_value
            raise SystemExit

    ck = AssertCallback()
    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        callbacks=ck,
        gpus=1,
        plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=value,
                                zero_optimization=False),
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)
def test_deepspeed_config(tmpdir, deepspeed_zero_config):
    """
    Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers
    and saves the model weights to load correctly.
    """
    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.lr_schedules import WarmupLR
            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer

            assert isinstance(trainer.optimizers[0],
                              FP16_DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert trainer.lr_schedulers == []  # DeepSpeed manages LR scheduler internally
            # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler
            assert isinstance(trainer.model.lr_scheduler, WarmupLR)

    model = BoringModel()
    trainer = Trainer(plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)],
                      default_root_dir=tmpdir,
                      gpus=1,
                      fast_dev_run=True,
                      precision=16,
                      callbacks=[TestCB()])

    trainer.fit(model)
    trainer.test(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)
def test_deepspeed_defaults(tmpdir):
    """
    Ensure that defaults are correctly set as a config for DeepSpeed if no arguments are passed.
    """
    plugin = DeepSpeedPlugin()
    assert plugin.config is not None
    assert isinstance(plugin.config["zero_optimization"], dict)
def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir):
    """Test to ensure with Stage 3 and multiple GPUs that we can resume from training, throwing a warning that the
    optimizer state and scheduler states cannot be restored."""
    dm = ClassifDataModule()
    model = BoringModel()
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
    trainer.fit(model)
    trainer.save_checkpoint(checkpoint_path)

    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        plugins=DeepSpeedPlugin(stage=3, load_full_weights=True),
        gpus=1,
        precision=16,
        resume_from_checkpoint=checkpoint_path,
    )
    with pytest.warns(
        UserWarning,
        match="A single checkpoint file has been given. This means optimizer states and "
        "scheduler states can not be restored. If you'd like to restore these states, you must "
        "provide a path to the originally saved DeepSpeed checkpoint.",
    ):
        trainer.fit(model, datamodule=dm)
def test_deepspeed_custom_precision_params(tmpdir):
    """Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes."""
    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            fp16_config = trainer.training_type_plugin.config['fp16']
            assert fp16_config['loss_scale'] == 10
            assert fp16_config['initial_scale_power'] == 10
            assert fp16_config['loss_scale_window'] == 10
            assert fp16_config['hysteresis'] == 10
            assert fp16_config['min_loss_scale'] == 10
            raise SystemExit()

    model = BoringModel()
    ds = DeepSpeedPlugin(loss_scale=10,
                         initial_scale_power=10,
                         loss_scale_window=10,
                         hysteresis=10,
                         min_loss_scale=10)
    trainer = Trainer(default_root_dir=tmpdir,
                      plugins=[ds],
                      precision=16,
                      gpus=1,
                      callbacks=[TestCB()])
    with pytest.raises(SystemExit):
        trainer.fit(model)
def test_deepspeed_run_configure_optimizers(tmpdir):
    """
        Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation),
        whilst using configure_optimizers for optimizers and schedulers.
    """
    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            assert isinstance(self.trainer.optimizers[0], torch.optim.SGD)
            assert self.trainer.lr_schedulers == []  # DeepSpeed manages LR scheduler internally
            # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler
            assert isinstance(self.trainer.model.lr_scheduler,
                              torch.optim.lr_scheduler.StepLR)

    model = TestModel()
    trainer = Trainer(
        plugins=DeepSpeedPlugin(zero_optimization=False),
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
    )

    trainer.fit(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)
def test_deepspeed_config(tmpdir, deepspeed_config):
    """
        Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers
        and saves the model weights to load correctly.
    """
    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            import deepspeed
            assert isinstance(self.trainer.optimizers[0], torch.optim.SGD)
            assert self.trainer.lr_schedulers == []  # DeepSpeed manages LR scheduler internally
            assert isinstance(self.trainer.model.optimizer, torch.optim.SGD)
            assert isinstance(self.trainer.model.lr_scheduler,
                              deepspeed.runtime.lr_schedules.WarmupLR)

    model = TestModel()
    trainer = Trainer(
        plugins=[DeepSpeedPlugin(config=deepspeed_config)],
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
    )

    trainer.fit(model)
    trainer.test(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)
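# ``mock_deepspeed_distributed`` and ``platform`` in the next test come from decorators rather than
# fixtures. A plausible reconstruction (assumed, not copied from the original suite) patches
# DeepSpeed's distributed init and parametrizes over the platform string:
@mock.patch("deepspeed.init_distributed", autospec=True)
@pytest.mark.parametrize("platform", ["Linux", "Windows"])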
def test_deepspeed_plugin_env_variables(mock_deepspeed_distributed, tmpdir,
                                        platform):
    """
    Test to ensure that we setup distributed communication using correctly.
    When using windows, ranks environment variables should not be set, and deepspeed should handle this.
    """
    trainer = Trainer(default_root_dir=tmpdir,
                      plugins=[DeepSpeedPlugin(stage=3)])
    plugin = trainer.training_type_plugin
    assert isinstance(plugin, DeepSpeedPlugin)
    with mock.patch("platform.system", return_value=platform) as mock_platform:
        plugin.init_ddp_connection()
    mock_deepspeed_distributed.assert_called()
    mock_platform.assert_called()
    if platform == "Windows":
        # assert no env variables have been set within the DeepSpeedPlugin
        assert all(k not in os.environ
                   for k in ("MASTER_PORT", "MASTER_ADDR", "RANK",
                             "WORLD_SIZE", "LOCAL_RANK"))
    else:
        assert os.environ["MASTER_ADDR"] == str(
            trainer.training_type_plugin.cluster_environment.master_address())
        assert os.environ["MASTER_PORT"] == str(
            trainer.training_type_plugin.cluster_environment.master_port())
        assert os.environ["RANK"] == str(
            trainer.training_type_plugin.global_rank)
        assert os.environ["WORLD_SIZE"] == str(
            trainer.training_type_plugin.world_size)
        assert os.environ["LOCAL_RANK"] == str(
            trainer.training_type_plugin.local_rank)
def test_deepspeed_assert_config_zero_offload_disabled(tmpdir,
                                                       deepspeed_zero_config):
    """Ensure if we use a config and turn off cpu_offload, that this is set to False within the config."""

    deepspeed_zero_config["zero_optimization"]["cpu_offload"] = False

    class TestCallback(Callback):
        def on_before_accelerator_backend_setup(self, trainer,
                                                pl_module) -> None:
            assert trainer.training_type_plugin.config["zero_optimization"][
                "cpu_offload"] is False
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)],
        precision=16,
        gpus=1,
        callbacks=[TestCallback()],
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)
def test_deepspeed_multigpu_partial_partition_parameters(tmpdir):
    """Test to ensure that a module that defines a layer inside the ``__init__`` and ``configure_sharded_model``
    correctly converts all parameters to float16 when ``precision=16`` and runs successfully."""
    class TestModel(ModelParallelBoringModel):
        def __init__(self):
            super().__init__()
            self.layer_2 = torch.nn.Linear(32, 32)

        def configure_sharded_model(self) -> None:
            self.layer = torch.nn.Linear(32, 2)

        def forward(self, x):
            x = self.layer_2(x)
            return self.layer(x)

        def on_train_epoch_start(self) -> None:
            assert all(x.dtype == torch.float16 for x in self.parameters())

    model = TestModel()
    trainer = Trainer(default_root_dir=tmpdir,
                      strategy=DeepSpeedPlugin(stage=3),
                      gpus=1,
                      fast_dev_run=True,
                      precision=16)
    trainer.fit(model)
def test_deepspeed_run_configure_optimizers(tmpdir):
    """
    Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation),
    whilst using configure_optimizers for optimizers and schedulers.
    """
    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer

            assert isinstance(trainer.optimizers[0],
                              FP16_DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert trainer.lr_schedulers == []  # DeepSpeed manages LR scheduler internally
            # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler
            assert isinstance(trainer.model.lr_scheduler,
                              torch.optim.lr_scheduler.StepLR)

    model = BoringModel()
    trainer = Trainer(
        plugins=DeepSpeedPlugin(),  # disable ZeRO so our optimizers are not wrapped
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
        precision=16,
        callbacks=[TestCB()])

    trainer.fit(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)
    def select_training_type_plugin(self) -> TrainingTypePlugin:
        if self.use_ddp2:
            plugin = DDP2Plugin(parallel_devices=self.parallel_devices,
                                cluster_environment=self.cluster_environment)
        elif self.use_ddp and self.use_deepspeed:
            plugin = DeepSpeedPlugin(
                num_nodes=self.num_nodes,
                cluster_environment=self.select_cluster_environment(),
                parallel_devices=self.parallel_devices)
        elif self.use_ddp:
            use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
            use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic
            use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
            use_ddp_cpu_spawn = self.use_ddp and self.on_cpu
            use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic
            use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
            use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
            use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN

            # TODO: decouple from TE
            # ddp script mode uses the same flags as TE
            if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
                use_torchelastic_ddp = False

            if self.on_tpu:
                ddp_plugin_cls = TPUSpawnPlugin
            elif use_ddp_sharded:
                ddp_plugin_cls = DDPShardedPlugin
            elif use_ddp_sharded_spawn:
                ddp_plugin_cls = DDPSpawnShardedPlugin
            elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp:
                ddp_plugin_cls = DDPPlugin
            elif use_ddp_spawn or use_ddp_cpu_spawn:
                ddp_plugin_cls = DDPSpawnPlugin
            else:
                ddp_plugin_cls = DDPPlugin

            plugin = ddp_plugin_cls(
                parallel_devices=self.parallel_devices,
                num_nodes=self.num_nodes,
                cluster_environment=self.cluster_environment,
                sync_batchnorm=self.sync_batchnorm,
            )
        elif self.use_dp:
            plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
        elif self.use_horovod:
            plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
        elif self.on_tpu:
            if isinstance(self.tpu_cores, list):
                plugin = SingleTPUPlugin(self.tpu_id)
            else:
                plugin = TPUSpawnPlugin(
                    parallel_devices=list(range(self.tpu_cores)))
        else:
            single_gpu_ordinal = device_parser.determine_root_gpu_device(
                self.parallel_device_ids)
            plugin = SingleDevicePlugin(device=torch.device(
                f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu"))
        return plugin
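# Minimal usage sketch tying the selection logic above back to the tests in this file (assumes a
# CUDA machine and the same 1.x-era Trainer API used throughout; the function name is hypothetical):
# the resolved training type plugin is what the tests assert against.
def _example_training_type_plugin_selection(tmpdir):
    trainer = Trainer(default_root_dir=tmpdir, gpus=2, precision=16, plugins=[DeepSpeedPlugin(stage=3)])
    assert isinstance(trainer.training_type_plugin, DeepSpeedPlugin)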
def test_deepspeed_with_env_path(tmpdir, monkeypatch, deepspeed_config):
    """Test to ensure if we pass an env variable, we load the config from the path."""
    config_path = os.path.join(tmpdir, "temp.json")
    with open(config_path, "w") as f:
        f.write(json.dumps(deepspeed_config))
    monkeypatch.setenv("PL_DEEPSPEED_CONFIG_PATH", config_path)
    plugin = DeepSpeedPlugin()
    assert plugin.config == deepspeed_config
def test_deepspeed_multigpu_no_schedulers(tmpdir):
    """Test to ensure ZeRO Stage 3 works with a parallel model and no schedulers."""
    model = ModelParallelBoringModelNoSchedulers()
    trainer = Trainer(
        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
    )
    trainer.fit(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)
def test_deepspeed_skip_backward_raises(tmpdir):
    """Test that returning ``None`` from ``training_step`` raises a ``MisconfigurationException`` with DeepSpeed."""

    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            return None

    model = TestModel()
    trainer = Trainer(default_root_dir=tmpdir, plugins=[DeepSpeedPlugin()], gpus=1, fast_dev_run=True, precision=16)
    with pytest.raises(MisconfigurationException, match="returning `None` .* is not supported"):
        trainer.fit(model)
def test_deepspeed_with_meta_device(tmpdir):
    """Test ZeRO Stage 3 with a model instantiated under ``init_meta_context``."""
    with init_meta_context():
        model = BoringModel()
    assert model.layer.weight.device.type == "meta"
    trainer = Trainer(
        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
    )
    trainer.fit(model)
    assert model.layer.weight.device.type == "cpu"
def test_lightning_model():
    """Test that DeepSpeed works with a simple LightningModule."""

    model = BoringModel()
    trainer = Trainer(strategy=DeepSpeedPlugin(),
                      max_epochs=1,
                      precision=16,
                      gpus=1)
    trainer.fit(model)
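# All of the GPU tests above assume at least one CUDA device and an importable ``deepspeed``
# package. A plain pytest guard expressing that requirement (a sketch, not the original suite's
# own skip helper) could decorate any of them:
import importlib.util

requires_deepspeed_gpu = pytest.mark.skipif(
    not torch.cuda.is_available() or importlib.util.find_spec("deepspeed") is None,
    reason="requires CUDA and an installed DeepSpeed",
)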