Example No. 1
def test_non_sequential_sampler_warning_is_raised_for_eval_dataloader(
        val_dl, warns):
    trainer = Trainer()
    model = BoringModel()
    trainer._data_connector.attach_data(model, val_dataloaders=val_dl)
    context = pytest.warns if warns else no_warning_call
    with context(
            PossibleUserWarning,
            match="recommended .* turn shuffling off for val/test/predict"):
        trainer._data_connector._reset_eval_dataloader(RunningStage.VALIDATING,
                                                       model)
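These snippets are shown without their imports, decorators, or parametrization; arguments such as val_dl, warns, overfit, and refresh_rate are supplied by @pytest.mark.parametrize decorators that the excerpts omit, and names tied to optional integrations (poptorch, IPUPlugin/IPUStrategy, ORTCallback, LightningDeepSpeedModule, and so on) come from their respective packages. A typical import header for running the remaining examples against a PyTorch Lightning release from the 1.5/1.6 era might look like the sketch below; module paths vary between versions, so treat it as an assumption rather than the original file headers.

# Assumed imports for the examples in this collection (paths as in PyTorch
# Lightning ~1.5/1.6 and its test suite; adjust for the version you have installed).
import os
import pickle

import pytest
import torch
from torch.utils.data import DataLoader

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.plugins import DeepSpeedPlugin, SingleDevicePlugin
from pytorch_lightning.trainer.states import RunningStage, TrainerFn
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.warnings import PossibleUserWarning  # available in newer releases (1.6+)
from tests.helpers.boring_model import BoringModel, RandomDataset  # Lightning test-suite helpers
from tests.helpers.utils import no_warning_call  # Lightning test-suite helper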
Example No. 2
def test_deepspeed_with_meta_device(tmpdir):
    with init_meta_context():
        model = BoringModel()
    assert model.layer.weight.device.type == "meta"
    trainer = Trainer(default_root_dir=tmpdir,
                      plugins=[DeepSpeedPlugin(stage=3)],
                      gpus=2,
                      fast_dev_run=True,
                      precision=16)
    trainer.fit(model)
    assert model.layer.weight.device.type == "cpu"
def test_deepspeed_gradient_clip_by_value(tmpdir):
    """Test to ensure that an exception is raised when using `gradient_clip_algorithm='value'`."""
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        gpus=1,
        strategy="deepspeed",
        gradient_clip_algorithm="value",
    )
    with pytest.raises(MisconfigurationException, match="does not support clipping gradients by value"):
        trainer.fit(model)
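Since DeepSpeed rejects clipping gradients by value, the supported path is clipping by norm. A minimal sketch of that configuration (an illustration assuming a GPU, a DeepSpeed install, and the imports above, not part of the original test file) is:

# Hypothetical counterpart to the test above: DeepSpeed supports clipping by norm,
# so this configuration is expected to run where the one above raises an exception.
def test_deepspeed_gradient_clip_by_norm_sketch(tmpdir):
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        gpus=1,
        strategy="deepspeed",
        precision=16,
        gradient_clip_val=0.5,            # clip gradients to a maximum norm of 0.5
        gradient_clip_algorithm="norm",   # the default algorithm; "value" is what gets rejected
        fast_dev_run=True,
    )
    trainer.fit(model)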
Example No. 4
def test_if_test_works_after_train(tmpdir):
    """Ensure that .test() works after .fit()"""

    # Train a model on TPU
    model = BoringModel()
    trainer = Trainer(max_epochs=1,
                      tpu_cores=8,
                      default_root_dir=tmpdir,
                      fast_dev_run=True)
    trainer.fit(model)
    assert len(trainer.test(model)) == 1
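TPU tests like this one are normally guarded so they only run where an XLA runtime is present; the original decorators are not shown in this excerpt. One way to express such a guard (an assumption, not the original code) is:

# Hypothetical guard: skip the test when torch_xla (and therefore a TPU runtime) is unavailable.
import pytest

torch_xla = pytest.importorskip("torch_xla")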
Example No. 5
def test_overfit_basic(tmpdir, overfit):
    """Tests that only training_step can be used."""

    model = BoringModel()

    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=1,
                      overfit_batches=overfit,
                      weights_summary=None)

    trainer.fit(model)
Example No. 6
def test_replication_factor(tmpdir):
    """Ensure if the user passes manual poptorch Options with custom parameters set, we set them correctly in the
    dataloaders."""

    plugin = IPUPlugin()
    trainer = Trainer(ipus=2,
                      default_root_dir=tmpdir,
                      fast_dev_run=True,
                      strategy=plugin)
    assert trainer.ipus == 2
    assert trainer.training_type_plugin.replication_factor == 2

    model = BoringModel()
    training_opts = poptorch.Options()
    inference_opts = poptorch.Options()
    training_opts.replicationFactor(8)
    inference_opts.replicationFactor(7)
    plugin = IPUPlugin(inference_opts=inference_opts,
                       training_opts=training_opts)

    trainer = Trainer(default_root_dir=tmpdir, ipus=1, strategy=plugin)
    trainer.optimizers = model.configure_optimizers()[0]
    plugin.model = model
    model.trainer = trainer
    trainer.state.fn = TrainerFn.FITTING
    trainer.training_type_plugin.pre_dispatch()

    trainer.state.stage = RunningStage.TRAINING
    assert trainer.training_type_plugin.replication_factor == 8
    trainer.state.stage = RunningStage.VALIDATING
    assert trainer.training_type_plugin.replication_factor == 7

    for fn, stage in (
        (TrainerFn.VALIDATING, RunningStage.VALIDATING),
        (TrainerFn.TESTING, RunningStage.TESTING),
        (TrainerFn.PREDICTING, RunningStage.PREDICTING),
    ):
        trainer.state.fn = fn
        trainer.state.stage = stage
        trainer.training_type_plugin.pre_dispatch()
        assert trainer.training_type_plugin.replication_factor == 7
def _assert_save_model_is_equal(model, tmpdir, trainer):
    checkpoint_path = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(checkpoint_path)
    # carry out the check only on rank 0
    if trainer.global_rank == 0:
        saved_model = BoringModel.load_from_checkpoint(checkpoint_path)
        saved_model = saved_model.float()
        model = model.float().cpu()
        # Assert model parameters are identical after loading
        for orig_param, trained_model_param in zip(model.parameters(),
                                                   saved_model.parameters()):
            assert torch.equal(orig_param, trained_model_param)
Example No. 8
def test_ddp_barrier_non_consecutive_device_ids(barrier_mock, tmpdir):
    """ Test correct usage of barriers when device ids do not start at 0 or are not consecutive. """
    model = BoringModel()
    gpus = [1, 3]
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_steps=1,
        gpus=gpus,
        accelerator="ddp",
    )
    trainer.fit(model)
    barrier_mock.assert_any_call(device_ids=[gpus[trainer.local_rank]])
Example No. 9
def test_different_accumulate_grad_batches_fails(tmpdir):
    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir,
                      accumulate_grad_batches={1: 2},
                      gpus=1,
                      strategy="deepspeed")
    with pytest.raises(
            MisconfigurationException,
            match="DeepSpeed currently does not support different `accumulate_grad_batches`",
    ):
        trainer.fit(model)
Example No. 10
def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir):
    """Test to ensure that resuming from checkpoint works when going from GPUs- > CPU."""
    model = BoringModel()
    trainer = Trainer(strategy="ddp_sharded_spawn",
                      accelerator="gpu",
                      devices=1,
                      fast_dev_run=True)

    trainer.fit(model)

    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(checkpoint_path)

    model = BoringModel()

    trainer = Trainer(strategy="ddp_sharded_spawn",
                      accelerator="cpu",
                      devices=2,
                      fast_dev_run=True)

    trainer.fit(model, ckpt_path=checkpoint_path)
def test_tqdm_progress_bar_progress_refresh(tmpdir, refresh_rate: int):
    """Test that the three progress bars get correctly updated when using different refresh rates."""

    model = BoringModel()

    class CurrentProgressBar(TQDMProgressBar):

        train_batches_seen = 0
        val_batches_seen = 0
        test_batches_seen = 0

        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
            super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx)
            self.train_batches_seen += 1

        def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
            super().on_validation_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx)
            self.val_batches_seen += 1

        def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
            super().on_test_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx)
            self.test_batches_seen += 1

    pbar = CurrentProgressBar(refresh_rate=refresh_rate)
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[pbar],
        limit_train_batches=1.0,
        num_sanity_val_steps=2,
        max_epochs=3,
    )
    assert trainer.progress_bar_callback.refresh_rate == refresh_rate

    trainer.fit(model)
    assert (
        pbar.train_batches_seen + pbar.val_batches_seen
        == 3 * pbar.main_progress_bar.total + trainer.num_sanity_val_steps
    )
    assert pbar.test_batches_seen == 0

    trainer.validate(model)
    assert (
        pbar.train_batches_seen + pbar.val_batches_seen
        == 3 * pbar.main_progress_bar.total + pbar.val_progress_bar.total + trainer.num_sanity_val_steps
    )
    assert pbar.test_batches_seen == 0

    trainer.test(model)
    assert (
        pbar.train_batches_seen + pbar.val_batches_seen
        == 3 * pbar.main_progress_bar.total + pbar.val_progress_bar.total + trainer.num_sanity_val_steps
    )
    assert pbar.test_batches_seen == pbar.test_progress_bar.total
Example No. 12
def test_if_test_works_after_train(tmpdir):
    """ Ensure that .test() works after .fit() """

    # Train a model on TPU
    model = BoringModel()
    trainer = Trainer(checkpoint_callback=True,
                      max_epochs=1,
                      tpu_cores=8,
                      default_root_dir=tmpdir)
    trainer.fit(model)

    assert trainer.test() == 1
def test_invalid_apex_sharded(tmpdir):
    """Test to ensure that we raise an error when we try to use apex and sharded."""

    model = BoringModel()
    with pytest.raises(MisconfigurationException,
                       match="Sharded Plugin is not supported with Apex AMP"):
        trainer = Trainer(fast_dev_run=True,
                          accelerator="ddp_sharded_spawn",
                          precision=16,
                          amp_backend="apex")

        trainer.fit(model)
def test_deepspeed_warn_train_dataloader_called(tmpdir):
    """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch
    size."""
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=[DeepSpeedPlugin()],
        gpus=1,
        fast_dev_run=True,
    )
    with pytest.warns(UserWarning, match="Inferring the batch size for internal deepspeed logging"):
        trainer.fit(model)
Example No. 15
def test_ort_callback_fails_no_model(tmpdir):
    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir,
                      fast_dev_run=True,
                      callbacks=ORTCallback())
    with pytest.raises(MisconfigurationException,
                       match="Torch ORT requires to wrap a single model"):
        trainer.fit(
            model,
            train_dataloader=torch.utils.data.DataLoader(DummyDataset()),
            val_dataloaders=torch.utils.data.DataLoader(DummyDataset()),
        )
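DummyDataset is not defined in this excerpt; a minimal stand-in compatible with the DataLoader calls above could look like the following (a hypothetical helper, not the original definition):

# Hypothetical stand-in for the undefined DummyDataset used above.
import torch
from torch.utils.data import Dataset


class DummyDataset(Dataset):
    def __init__(self, size: int = 32, length: int = 16):
        # `length` random samples, each a vector of `size` features
        self.data = torch.randn(length, size)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]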
Example No. 16
def test_tuner_with_distributed_strategies():
    """Test that an error is raised when tuner is used with multi-device strategy."""
    trainer = Trainer(auto_scale_batch_size=True,
                      devices=2,
                      strategy="ddp",
                      accelerator="cpu")
    model = BoringModel()

    with pytest.raises(
            MisconfigurationException,
            match=r"not supported with `Trainer\(strategy='ddp'\)`"):
        trainer.tune(model)
def test_tqdm_progress_bar_can_be_pickled():
    bar = TQDMProgressBar()
    trainer = Trainer(fast_dev_run=True, callbacks=[bar], max_steps=1)
    model = BoringModel()

    pickle.dumps(bar)
    trainer.fit(model)
    pickle.dumps(bar)
    trainer.test(model)
    pickle.dumps(bar)
    trainer.predict(model)
    pickle.dumps(bar)
Example No. 18
def test_has_len_all_rank():
    trainer = Trainer(fast_dev_run=True)
    model = BoringModel()

    with pytest.raises(
            MisconfigurationException,
            match="Total length of `Dataloader` across ranks is zero."):
        assert not has_len_all_ranks(DataLoader(RandomDataset(0, 0)),
                                     trainer.strategy, model)

    assert has_len_all_ranks(DataLoader(RandomDataset(1, 1)), trainer.strategy,
                             model)
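RandomDataset comes from the Lightning test helpers; a roughly equivalent definition (an approximation, not the exact helper) clarifies why RandomDataset(0, 0) yields a zero-length DataLoader:

# Approximate equivalent of the RandomDataset test helper: `size` is the feature
# dimension and `length` the number of samples, so RandomDataset(0, 0) is empty.
import torch
from torch.utils.data import Dataset


class RandomDataset(Dataset):
    def __init__(self, size: int, length: int):
        self.data = torch.randn(length, size)

    def __len__(self):
        return self.data.size(0)

    def __getitem__(self, index):
        return self.data[index]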
Example No. 19
def test_invalid_deepspeed_defaults_no_precision(tmpdir):
    """Test to ensure that using defaults, if precision is not set to 16, we throw an exception."""
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        plugins='deepspeed',
    )
    with pytest.raises(
        MisconfigurationException, match='To use DeepSpeed ZeRO Optimization, you must set precision=16.'
    ):
        trainer.fit(model)
Example No. 20
def test_ddp_sharded_plugin_test_multigpu(tmpdir):
    """
        Test to ensure we can use test without fit
    """
    model = BoringModel()
    trainer = Trainer(
        accelerator='ddp_sharded_spawn',
        gpus=2,
        fast_dev_run=True,
    )

    trainer.test(model)
Example No. 21
def train():
    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        precision=16,
        amp_backend='native',
        gpus=gpus,
        num_processes=num_processes,
        accelerator=ddp_backend,
        callbacks=[CB()],
    )
    trainer.fit(model)
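train() closes over names defined elsewhere in its original test module (gpus, num_processes, ddp_backend, and a CB callback class). A hypothetical way those names could be supplied before calling it, purely for illustration, is:

# Hypothetical setup for the free names used by train() above; the concrete
# values and the CB callback body are illustrative, not the original ones.
from pytorch_lightning import Callback


class CB(Callback):
    """Placeholder callback; the original CB's hooks and assertions are not shown."""


gpus = 2                    # assumes a 2-GPU machine
num_processes = 1           # CPU process count, used by CPU-based backends
ddp_backend = "ddp_spawn"   # the accelerator string passed to Trainer

train()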
Example No. 22
def test_ddp_sharded_plugin_checkpoint_multi_gpu(tmpdir):
    """
        Test to ensure that checkpoint is saved correctly when using multiple GPUs
    """
    model = BoringModel()
    trainer = Trainer(
        gpus=2,
        accelerator='ddp_sharded_spawn',
        fast_dev_run=True,
    )

    trainer.fit(model)

    checkpoint_path = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(checkpoint_path)
    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

    # Assert model parameters are identical after loading
    for ddp_param, shard_param in zip(model.parameters(),
                                      saved_model.parameters()):
        assert torch.equal(ddp_param.to("cpu"), shard_param)
Example No. 23
def test_progress_bar_totals(tmpdir):
    """Test that the progress finishes with the correct total steps processed."""

    model = BoringModel()

    trainer = Trainer(default_root_dir=tmpdir,
                      progress_bar_refresh_rate=1,
                      max_epochs=1)
    bar = trainer.progress_bar_callback
    assert 0 == bar.total_train_batches
    assert 0 == bar.total_val_batches
    assert 0 == bar.total_test_batches

    trainer.fit(model)

    # check main progress bar total
    n = bar.total_train_batches
    m = bar.total_val_batches
    assert len(trainer.train_dataloader) == n
    assert bar.main_progress_bar.total == n + m

    # check val progress bar total
    assert sum(len(loader) for loader in trainer.val_dataloaders) == m
    assert bar.val_progress_bar.total == m

    # main progress bar should have reached the end (train batches + val batches)
    assert bar.main_progress_bar.n == n + m
    assert bar.train_batch_idx == n

    # val progress bar should have reached the end
    assert bar.val_progress_bar.n == m
    assert bar.val_batch_idx == m

    # check that the test progress bar is off
    assert 0 == bar.total_test_batches
    assert bar.test_progress_bar is None

    trainer.validate(model)

    assert bar.val_progress_bar.total == m
    assert bar.val_progress_bar.n == m
    assert bar.val_batch_idx == m

    trainer.test(model)

    # check test progress bar total
    k = bar.total_test_batches
    assert sum(len(loader) for loader in trainer.test_dataloaders) == k
    assert bar.test_progress_bar.total == k

    # test progress bar should have reached the end
    assert bar.test_progress_bar.n == k
    assert bar.test_batch_idx == k
Example No. 24
def test_poptorch_models_at_different_stages(tmpdir):
    plugin = IPUStrategy()
    trainer = Trainer(default_root_dir=tmpdir, strategy=plugin, accelerator="ipu", devices=8)
    model = BoringModel()
    model.trainer = trainer
    plugin.model = model

    trainer.optimizers = model.configure_optimizers()[0]
    trainer.state.fn = TrainerFn.FITTING
    trainer.strategy.setup(trainer)
    assert list(trainer.strategy.poptorch_models) == [RunningStage.TRAINING, RunningStage.VALIDATING]

    for fn, stage in (
        (TrainerFn.VALIDATING, RunningStage.VALIDATING),
        (TrainerFn.TESTING, RunningStage.TESTING),
        (TrainerFn.PREDICTING, RunningStage.PREDICTING),
    ):
        trainer.state.fn = fn
        trainer.state.stage = stage
        trainer.strategy.setup(trainer)
        assert list(trainer.strategy.poptorch_models) == [stage]
def test_deepspeed_stage_3_save_warning(tmpdir):
    """
    Test to ensure that DeepSpeed Stage 3 gives a warning when saving.
    """
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
    )
    trainer.fit(model)
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    with pytest.warns(UserWarning, match="each worker will save a shard of the checkpoint within a directory."):
        trainer.save_checkpoint(checkpoint_path)
def _assert_save_model_is_equal(model, tmpdir, trainer):
    checkpoint_path = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(checkpoint_path)
    # carry out the check only on rank 0
    if trainer.global_rank == 0:
        saved_model = BoringModel.load_from_checkpoint(checkpoint_path)
        if model.dtype == torch.half:
            saved_model = saved_model.half()  # model is loaded in float32 as default, move it to float16
        model = model.cpu()
        # Assert model parameters are identical after loading
        for orig_param, trained_model_param in zip(model.parameters(), saved_model.parameters()):
            assert torch.equal(orig_param, trained_model_param)
Example No. 27
def test_has_len_all_rank():
    trainer = Trainer(fast_dev_run=True)
    model = BoringModel()

    with pytest.warns(
            UserWarning,
            match="Total length of `DataLoader` across ranks is zero."):
        assert has_len_all_ranks(DataLoader(RandomDataset(0, 0)),
                                 trainer.strategy, model)

    assert has_len_all_ranks(DataLoader(RandomDataset(1, 1)), trainer.strategy,
                             model)
def test_deepspeed_multigpu(tmpdir):
    """
    Test to ensure that DeepSpeed with multiple GPUs works.
    """
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
    )
    trainer.fit(model)
    trainer.test(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)
def test_checkpoint_plugin_called(tmpdir):
    """
    Ensure that the custom checkpoint IO plugin and torch checkpoint IO plugin is called when saving/loading.
    """
    checkpoint_plugin = CustomCheckpointIO()
    checkpoint_plugin = MagicMock(wraps=checkpoint_plugin,
                                  spec=CustomCheckpointIO)

    ck = ModelCheckpoint(dirpath=tmpdir, save_last=True)

    model = BoringModel()
    device = torch.device("cpu")
    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=SingleDevicePlugin(device, checkpoint_io=checkpoint_plugin),
        callbacks=ck,
        max_epochs=1,
    )
    trainer.fit(model)
    assert checkpoint_plugin.save_checkpoint.call_count == 3
    trainer.test(model, ckpt_path=ck.last_model_path)
    checkpoint_plugin.load_checkpoint.assert_called_with(tmpdir / "last.ckpt")

    checkpoint_plugin.reset_mock()
    ck = ModelCheckpoint(dirpath=tmpdir, save_last=True)

    model = BoringModel()
    device = torch.device("cpu")
    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=[SingleDevicePlugin(device), checkpoint_plugin],
        callbacks=ck,
        max_epochs=1,
    )
    trainer.fit(model)
    assert checkpoint_plugin.save_checkpoint.call_count == 3

    trainer.test(model, ckpt_path=ck.last_model_path)
    checkpoint_plugin.load_checkpoint.assert_called_once()
    checkpoint_plugin.load_checkpoint.assert_called_with(tmpdir / "last.ckpt")
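CustomCheckpointIO is not defined in this excerpt. A minimal plugin that keeps default behavior while letting the MagicMock above observe the calls could subclass TorchCheckpointIO and delegate to it; this is a sketch of one plausible definition, not the original:

# Hypothetical CustomCheckpointIO: a thin wrapper over TorchCheckpointIO so that
# behavior stays default while the wrapping mock records save/load/remove calls.
from pytorch_lightning.plugins import TorchCheckpointIO


class CustomCheckpointIO(TorchCheckpointIO):
    def save_checkpoint(self, checkpoint, path, *args, **kwargs):
        super().save_checkpoint(checkpoint, path, *args, **kwargs)

    def load_checkpoint(self, path, *args, **kwargs):
        return super().load_checkpoint(path, *args, **kwargs)

    def remove_checkpoint(self, path):
        super().remove_checkpoint(path)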
def test_deepspeed_lightning_module(tmpdir):
    """Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly."""

    model = BoringModel()
    module = LightningDeepSpeedModule(model, precision=16)

    module.half()
    assert module.dtype == torch.half
    assert model.dtype == torch.half

    module.to(torch.double)
    assert module.dtype == torch.double
    assert model.dtype == torch.double