예제 #1
0
def test_dp_test(tmpdir):
    tutils.set_random_master_port()

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()
    trainer = pl.Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        accelerator='dp',
    )
    trainer.fit(model, datamodule=dm)
    assert 'ckpt' in trainer.checkpoint_callback.best_model_path
    results = trainer.test(datamodule=dm)
    assert 'test_acc' in results[0]

    old_weights = model.layer_0.weight.clone().detach().cpu()

    results = trainer.test(model, datamodule=dm)
    assert 'test_acc' in results[0]

    # make sure weights didn't change
    new_weights = model.layer_0.weight.clone().detach().cpu()

    assert torch.all(torch.eq(old_weights, new_weights))
def test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset():
    """Make sure result logging works with DDP"""
    tutils.set_random_master_port()
    worldsize = 2
    mp.spawn(
        _test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset, args=(worldsize,), nprocs=worldsize
    )
예제 #3
0
def test_sync_batchnorm_ddp(tmpdir):
    seed_everything(234)
    set_random_master_port()

    # define datamodule and dataloader
    dm = MNISTDataModule()
    dm.prepare_data()
    dm.setup(stage=None)

    train_dataloader = dm.train_dataloader()
    model = SyncBNModule()

    bn_outputs = []

    # shuffle is false by default
    for batch_idx, batch in enumerate(train_dataloader):
        x, _ = batch

        _, out_bn = model.forward(x, batch_idx)
        bn_outputs.append(out_bn)

        # get 3 steps
        if batch_idx == 2:
            break

    bn_outputs = [x.cuda() for x in bn_outputs]

    # reset datamodule
    # batch-size = 16 because 2 GPUs in DDP
    dm = MNISTDataModule(batch_size=16, dist_sampler=True)
    dm.prepare_data()
    dm.setup(stage=None)

    model = SyncBNModule(gpu_count=2, bn_targets=bn_outputs)
    ddp = DDPSpawnPlugin(
        parallel_devices=[torch.device("cuda", 0),
                          torch.device("cuda", 1)],
        num_nodes=1,
        sync_batchnorm=True,
        cluster_environment=LightningEnvironment(),
        find_unused_parameters=True,
    )

    trainer = Trainer(
        default_root_dir=tmpdir,
        gpus=2,
        num_nodes=1,
        accelerator="ddp_spawn",
        max_epochs=1,
        max_steps=3,
        sync_batchnorm=True,
        num_sanity_val_steps=0,
        replace_sampler_ddp=False,
        plugins=[ddp],
    )

    trainer.fit(model, dm)
    assert trainer.state.finished, "Sync batchnorm failing with DDP"
예제 #4
0
def test_result_reduce_horovod(tmpdir):
    """Make sure result logging works with Horovod.

    This test mirrors tests/core/test_results.py::_ddp_test_fn
    """
    tutils.reset_seed()
    tutils.set_random_master_port()

    def hvd_test_fn():
        path_here = os.path.abspath(os.path.dirname(__file__))
        path_root = os.path.abspath(os.path.join(path_here, "..", ".."))
        sys.path.insert(0, os.path.abspath(path_root))

        class TestModel(BoringModel):
            def training_step(self, batch, batch_idx):
                self.training_step_called = True

                tensor = torch.tensor([1.0])
                self.log("test_tensor",
                         tensor,
                         sync_dist=True,
                         reduce_fx="sum",
                         on_step=True,
                         on_epoch=True)

                res = self._results

                # Check that `tensor` is summed across all ranks automatically
                assert (
                    res["test_tensor"].item() == hvd.size()
                ), "Result-Log does not work properly with Horovod and Tensors"

            def training_epoch_end(self, outputs) -> None:
                assert len(outputs) == 0

        model = TestModel()
        model.val_dataloader = None

        trainer = Trainer(
            default_root_dir=tmpdir,
            limit_train_batches=2,
            limit_val_batches=2,
            max_epochs=1,
            log_every_n_steps=1,
            weights_summary=None,
            logger=False,
        )

        trainer.fit(model)

    horovod.run(hvd_test_fn, np=2)
예제 #5
0
def test_multi_gpu_none_backend(tmpdir):
    """Make sure when using multiple GPUs the user can't use `distributed_backend = None`."""
    tutils.set_random_master_port()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        gpus=2,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, min_acc=0.20)
예제 #6
0
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        callbacks=[EarlyStopping()],
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        accelerator='ddp_spawn',
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model)
예제 #7
0
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        callbacks=[EarlyStopping(monitor='train_acc')],
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        accelerator='ddp_spawn',
    )

    dm = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(trainer_options, model, dm)
예제 #8
0
def run_test_from_config(trainer_options, on_gpu, check_size=True):
    """Trains the default model with the given config."""
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options['weights_save_path']
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    model = BoringModel()

    trainer = Trainer(**trainer_options)
    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = BoringModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        batch = next(iter(dataloader))
        pretrained_model(batch)

    # test HPC saving
    trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger)
    # test HPC loading
    checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(
        ckpt_path)
    trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu)

    if on_gpu:
        trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()
def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
    """Make sure DDP works with dataloaders passed to fit()"""
    tutils.set_random_master_port()

    model = BoringModel()
    fit_options = dict(train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader())

    trainer = Trainer(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        gpus=[0, 1],
        accelerator="ddp_spawn",
    )
    trainer.fit(model, **fit_options)
    assert trainer.state.finished, "DDP doesn't work with dataloaders passed to fit()."
예제 #10
0
def test_model_saves_on_multi_gpu(tmpdir):
    """Test that ONNX model saves on a distributed backend."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        accelerator="ddp_spawn",
        progress_bar_refresh_rate=0,
    )

    model = BoringModel()
    model.example_input_array = torch.randn(5, 32)

    tpipes.run_model_test(trainer_options, model, min_acc=0.08)

    file_path = os.path.join(tmpdir, "model.onnx")
    model.to_onnx(file_path)
    assert os.path.exists(file_path) is True