def test_train_client(tmpdir, start_ray_client_server_2_cpus, seed, num_slots):
    """Check that training through a Ray client connection changes weights.

    The actual verification is delegated to the shared ``train_test`` helper;
    this test only wires a CPU-only ``HorovodRayPlugin`` into the trainer.
    """
    # The fixture is expected to have put us in Ray client mode.
    assert ray.util.client.ray.is_connected()
    module = BoringModel()
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=False)
    train_test(get_trainer(tmpdir, plugins=[horovod_plugin]), module)
예제 #2
0
def test_distributed_sampler(tmpdir, ray_start_2_cpus):
    """Check that dataloaders get a ``DistributedSampler`` during fitting.

    Before training, the model's own train dataloader must not use a
    ``DistributedSampler``. A callback then inspects the samplers the trainer
    exposes at the start of the train, validation and test stages.
    """
    module = BoringModel()
    # Sanity check: the plain dataloader is not distributed to begin with.
    assert not isinstance(module.train_dataloader().sampler,
                          DistributedSampler)

    class DistributedSamplerCallback(Callback):
        """Asserts sampler type, shuffling, replica count and rank."""

        def on_train_start(self, trainer, pl_module):
            sampler = trainer.train_dataloader.sampler
            assert isinstance(sampler, DistributedSampler)
            # Only the training sampler is expected to shuffle.
            assert sampler.shuffle
            assert sampler.num_replicas == 2
            assert sampler.rank == trainer.global_rank

        def on_validation_start(self, trainer, pl_module):
            sampler = trainer.val_dataloaders[0].sampler
            assert isinstance(sampler, DistributedSampler)
            assert not sampler.shuffle
            assert sampler.num_replicas == 2
            assert sampler.rank == trainer.global_rank

        def on_test_start(self, trainer, pl_module):
            sampler = trainer.test_dataloaders[0].sampler
            assert isinstance(sampler, DistributedSampler)
            assert not sampler.shuffle
            assert sampler.num_replicas == 2
            assert sampler.rank == trainer.global_rank

    # Two workers, matching the ``num_replicas == 2`` checks above.
    ray_plugin = RayPlugin(num_workers=2)
    sampler_check = DistributedSamplerCallback()
    fitter = get_trainer(tmpdir, plugins=[ray_plugin],
                         callbacks=[sampler_check])
    fitter.fit(module)
예제 #3
0
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works.

    Connects to an already running Ray cluster (``"auto"``) and starts one
    ``RayPlugin`` worker per GPU the cluster currently reports as available.
    """
    ray.init("auto")
    # Ray reports resource quantities as floats; a worker count must be an
    # integer, so convert before handing it to the plugin.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    plugin = RayPlugin(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, plugins=[plugin], use_gpu=True)
    train_test(trainer, model)
예제 #4
0
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works.

    Connects to an already running Ray cluster (``"auto"``) and starts one
    ``RayAccelerator`` worker per GPU the cluster currently reports as
    available.
    """
    ray.init("auto")
    # Ray reports resource quantities as floats; a worker count must be an
    # integer, so convert before handing it to the accelerator.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, accelerator=accelerator, use_gpu=True)
    train_test(trainer, model)
예제 #5
0
 def _inner_train(config):
     """Fit a fresh ``BoringModel`` using trainer options from ``config``.

     ``dir``, ``use_gpu``, ``callbacks`` and ``accelerator`` are taken from
     the enclosing scope; every entry of ``config`` is forwarded to
     ``get_trainer`` as an extra keyword argument.
     """
     module = BoringModel()
     fitter = get_trainer(
         dir,
         use_gpu=use_gpu,
         callbacks=callbacks,
         accelerator=accelerator,
         **config)
     fitter.fit(module)
 def _inner_train(config):
     """Fit a fresh ``BoringModel`` using trainer options from ``config``.

     ``dir``, ``use_gpu``, ``callbacks`` and ``plugin`` are taken from the
     enclosing scope; every entry of ``config`` is forwarded to
     ``get_trainer`` as an extra keyword argument. ``checkpoint_callback``
     is passed as ``False``.
     """
     module = BoringModel()
     fitter = get_trainer(
         dir,
         use_gpu=use_gpu,
         callbacks=callbacks,
         plugins=[plugin],
         checkpoint_callback=False,
         **config)
     fitter.fit(module)
예제 #7
0
def test_unused_parameters(tmpdir, ray_start_2_cpus):
    """Check that ``find_unused_parameters=False`` reaches the wrapped model.

    The flag is given to ``RayPlugin``; a callback then asserts at train start
    that ``trainer.model.find_unused_parameters`` is exactly ``False``.
    """
    module = BoringModel()
    ray_plugin = RayPlugin(
        num_workers=2, use_gpu=False, find_unused_parameters=False)

    class UnusedParameterCallback(Callback):
        """Asserts the flag on ``trainer.model`` when training starts."""

        def on_train_start(self, trainer, pl_module):
            # Identity check: the value must be the literal False.
            assert trainer.model.find_unused_parameters is False

    flag_check = UnusedParameterCallback()
    fitter = get_trainer(tmpdir, plugins=[ray_plugin], callbacks=[flag_check])
    fitter.fit(module)
예제 #8
0
def test_actor_creation(tmpdir, ray_start_2_cpus, num_workers):
    """Check the number of Ray actors during training and their final state.

    While fitting, the count of entries in ``ray.actors()`` must equal
    ``num_workers``; once fitting has returned, every listed actor must be in
    the ``DEAD`` state.
    """
    module = BoringModel()

    def check_num_actor():
        assert len(ray.actors()) == num_workers

    # Replace the epoch-end hook on this instance with the actor-count check.
    module.on_epoch_end = check_num_actor
    ray_plugin = RayPlugin(num_workers=num_workers)
    fitter = get_trainer(tmpdir, plugins=[ray_plugin])
    fitter.fit(module)

    actor_infos = list(ray.actors().values())
    assert all(
        info["State"] == ray.gcs_utils.ActorTableData.DEAD
        for info in actor_infos)
예제 #9
0
def test_model_to_gpu(tmpdir, ray_start_2_gpus):
    """Check that the model's parameters live on a CUDA device.

    A callback inspects the first parameter of the Lightning module at the
    end of each epoch while training with two GPU ``RayPlugin`` workers.
    """
    module = BoringModel()

    class CheckGPUCallback(Callback):
        """Asserts that the first model parameter is a CUDA tensor."""

        def on_epoch_end(self, trainer, pl_module):
            first_param = next(pl_module.parameters())
            assert first_param.is_cuda

    ray_plugin = RayPlugin(num_workers=2, use_gpu=True)
    gpu_check = CheckGPUCallback()
    fitter = get_trainer(
        tmpdir, plugins=[ray_plugin], use_gpu=True, callbacks=[gpu_check])
    fitter.fit(module)
예제 #10
0
def test_predict(tmpdir, ray_start_2_cpus, seed, num_workers):
    """Check that a trained MNIST classifier is accurate on the test set.

    Training runs on CPU ``RayPlugin`` workers for one epoch capped at 20
    train batches; the accuracy check is delegated to ``predict_test``.
    """
    hparams = {
        "layer_1": 32,
        "layer_2": 32,
        "lr": 1e-2,
        "batch_size": 32,
    }

    classifier = LightningMNISTClassifier(hparams, tmpdir)
    datamodule = MNISTDataModule(
        data_dir=tmpdir, num_workers=1, batch_size=hparams["batch_size"])
    ray_plugin = RayPlugin(num_workers=num_workers, use_gpu=False)
    fitter = get_trainer(
        tmpdir, limit_train_batches=20, max_epochs=1, plugins=[ray_plugin])
    predict_test(fitter, classifier, datamodule)
def test_early_stop(tmpdir, ray_start_2_cpus):
    """Check that the early stopping callback halts training as expected.

    Training is allowed up to 500 epochs with ``EarlyStopping`` monitoring
    ``val_loss`` (patience 2). The best checkpoint is then reloaded and its
    ``val_epoch`` attribute must equal 2.
    """
    module = BoringModel()
    ray_accelerator = RayAccelerator(num_workers=1, use_gpu=False)
    stopper = EarlyStopping(monitor="val_loss", patience=2, verbose=True)
    fitter = get_trainer(
        tmpdir,
        max_epochs=500,
        accelerator=ray_accelerator,
        callbacks=[stopper],
        limit_train_batches=1.0,
        limit_val_batches=1.0,
        progress_bar_refresh_rate=1)
    fitter.fit(module)

    best_path = fitter.checkpoint_callback.best_model_path
    restored = BoringModel.load_from_checkpoint(best_path)
    # Report the observed epoch in the failure message.
    assert restored.val_epoch == 2, restored.val_epoch
예제 #12
0
def test_correct_devices(tmpdir, ray_start_2_gpus):
    """Tests if GPU devices are correctly set.

    A callback runs at the end of each epoch and checks that the trainer's
    root GPU is 0, that ``CUDA_VISIBLE_DEVICES`` equals the trainer's local
    rank, and that both the module's device index and the current CUDA
    device match the root GPU.
    """
    model = BoringModel()

    class CheckDevicesCallback(Callback):
        """Asserts the device layout seen by the running process."""

        def on_epoch_end(self, trainer, pl_module):
            assert trainer.root_gpu == 0
            # CUDA_VISIBLE_DEVICES is parsed as a single integer here.
            assert (int(os.environ["CUDA_VISIBLE_DEVICES"]) ==
                    trainer.local_rank)
            assert trainer.root_gpu == pl_module.device.index
            assert torch.cuda.current_device() == trainer.root_gpu

    plugin = RayPlugin(num_workers=2, use_gpu=True)
    # Pass the plugin as a list, like every other ``get_trainer`` call here.
    trainer = get_trainer(tmpdir,
                          plugins=[plugin],
                          use_gpu=True,
                          callbacks=[CheckDevicesCallback()])
    trainer.fit(model)
def test_predict_client(tmpdir, start_ray_client_server_2_cpus, seed,
                        num_slots):
    """Run ``predict_test`` with Horovod on Ray over a Ray client connection.

    An MNIST classifier is trained on CPU for one epoch capped at 20 train
    batches; the prediction check itself lives in ``predict_test``.
    """
    # The fixture is expected to have put us in Ray client mode.
    assert ray.util.client.ray.is_connected()
    hparams = {
        "layer_1": 32,
        "layer_2": 32,
        "lr": 1e-2,
        "batch_size": 32,
    }
    classifier = LightningMNISTClassifier(hparams, tmpdir)
    datamodule = MNISTDataModule(
        data_dir=tmpdir, num_workers=1, batch_size=hparams["batch_size"])
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=False)
    fitter = get_trainer(
        tmpdir,
        limit_train_batches=20,
        max_epochs=1,
        plugins=[horovod_plugin])
    predict_test(fitter, classifier, datamodule)
예제 #14
0
def test_predict_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Check that a GPU-trained MNIST classifier is accurate on the test set.

    Training uses ``HorovodRayAccelerator`` with GPUs for one epoch capped at
    10 train batches; the accuracy check is delegated to ``predict_test``.
    """
    hparams = {
        "layer_1": 32,
        "layer_2": 32,
        "lr": 1e-2,
        "batch_size": 32,
    }
    classifier = LightningMNISTClassifier(hparams, tmpdir)
    datamodule = MNISTDataModule(
        data_dir=tmpdir, num_workers=1, batch_size=hparams["batch_size"])
    horovod_accelerator = HorovodRayAccelerator(
        num_slots=num_slots, use_gpu=True)
    fitter = get_trainer(
        tmpdir,
        limit_train_batches=10,
        max_epochs=1,
        accelerator=horovod_accelerator,
        use_gpu=True)
    predict_test(fitter, classifier, datamodule)
예제 #15
0
def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Check that GPU training with ``HorovodRayAccelerator`` changes weights.

    The weight comparison is performed by the shared ``train_test`` helper.
    """
    module = BoringModel()
    horovod_accelerator = HorovodRayAccelerator(
        num_slots=num_slots, use_gpu=True)
    fitter = get_trainer(
        tmpdir, accelerator=horovod_accelerator, use_gpu=True)
    train_test(fitter, module)
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Check that a checkpoint written with ``RayAccelerator`` can be loaded.

    The checkpoint round trip is performed by the shared ``load_test`` helper.
    """
    module = BoringModel()
    ray_accelerator = RayAccelerator(num_workers=num_workers, use_gpu=False)
    load_test(get_trainer(tmpdir, accelerator=ray_accelerator), module)
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Check that training with ``RayAccelerator`` changes model weights.

    The weight comparison is performed by the shared ``train_test`` helper.
    """
    module = BoringModel()
    ray_accelerator = RayAccelerator(num_workers=num_workers)
    train_test(get_trainer(tmpdir, accelerator=ray_accelerator), module)
예제 #18
0
def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Check that GPU training with ``HorovodRayPlugin`` changes weights.

    The weight comparison is performed by the shared ``train_test`` helper.
    """
    module = BoringModel()
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=True)
    fitter = get_trainer(tmpdir, plugins=[horovod_plugin], use_gpu=True)
    train_test(fitter, module)
예제 #19
0
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Check that training with ``RayPlugin`` changes model weights.

    The weight comparison is performed by the shared ``train_test`` helper.
    """
    module = BoringModel()
    ray_plugin = RayPlugin(num_workers=num_workers)
    train_test(get_trainer(tmpdir, plugins=[ray_plugin]), module)
예제 #20
0
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Check that a checkpoint written with ``RayPlugin`` can be loaded.

    The checkpoint round trip is performed by the shared ``load_test`` helper.
    """
    module = BoringModel()
    ray_plugin = RayPlugin(num_workers=num_workers, use_gpu=False)
    load_test(get_trainer(tmpdir, plugins=[ray_plugin]), module)
예제 #21
0
def test_train_client(tmpdir, start_ray_client_server_2_cpus, num_workers):
    """Run ``train_test`` with ``RayPlugin`` over a Ray client connection."""
    # The fixture is expected to have put us in Ray client mode.
    assert ray.util.client.ray.is_connected()
    module = BoringModel()
    ray_plugin = RayPlugin(num_workers=num_workers)
    train_test(get_trainer(tmpdir, plugins=[ray_plugin]), module)
예제 #22
0
def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Check that a GPU ``HorovodRayPlugin`` checkpoint can be loaded.

    The checkpoint round trip is performed by the shared ``load_test`` helper.
    """
    module = BoringModel()
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=True)
    fitter = get_trainer(tmpdir, plugins=[horovod_plugin], use_gpu=True)
    load_test(fitter, module)
예제 #23
0
def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Check that a GPU ``HorovodRayAccelerator`` checkpoint can be loaded.

    The checkpoint round trip is performed by the shared ``load_test`` helper.
    """
    module = BoringModel()
    horovod_accelerator = HorovodRayAccelerator(
        num_slots=num_slots, use_gpu=True)
    fitter = get_trainer(
        tmpdir, accelerator=horovod_accelerator, use_gpu=True)
    load_test(fitter, module)