# Example 1
def test_distributed_sampler(tmpdir, ray_start_2_cpus):
    """Tests if distributed sampler is properly set."""
    model = BoringModel()
    # Outside of training, the dataloader keeps its default sampler.
    assert not isinstance(model.train_dataloader().sampler,
                          DistributedSampler)

    class DistributedSamplerCallback(Callback):
        @staticmethod
        def _check_sampler(trainer, sampler, expect_shuffle):
            # Assertions shared by the train/val/test hooks below.
            assert isinstance(sampler, DistributedSampler)
            assert sampler.shuffle == expect_shuffle
            assert sampler.num_replicas == 2
            assert sampler.rank == trainer.global_rank

        def on_train_start(self, trainer, pl_module):
            # The training sampler must shuffle.
            self._check_sampler(trainer, trainer.train_dataloader.sampler,
                                True)

        def on_validation_start(self, trainer, pl_module):
            # The validation sampler must not shuffle.
            self._check_sampler(trainer, trainer.val_dataloaders[0].sampler,
                                False)

        def on_test_start(self, trainer, pl_module):
            # The test sampler must not shuffle.
            self._check_sampler(trainer, trainer.test_dataloaders[0].sampler,
                                False)

    trainer = get_trainer(
        tmpdir,
        plugins=[RayPlugin(num_workers=2)],
        callbacks=[DistributedSamplerCallback()])
    trainer.fit(model)
# Example 2
def test_actor_creation(tmpdir, ray_start_2_cpus, num_workers):
    """Tests whether the appropriate number of training actors are created."""
    model = BoringModel()

    def check_num_actor():
        # While training runs there should be one Ray actor per worker.
        assert len(ray.actors()) == num_workers

    # Hook the check onto the model so it executes at every epoch end.
    model.on_epoch_end = check_num_actor
    trainer = get_trainer(
        tmpdir, plugins=[RayPlugin(num_workers=num_workers)])
    trainer.fit(model)
    # After fit() returns, every training actor must have been torn down.
    for actor_info in ray.actors().values():
        assert actor_info["State"] == ray.gcs_utils.ActorTableData.DEAD
def test_ddp_sharded_plugin_finetune(tmpdir, ray_start_2_cpus, seed):
    """Tests if we can save and restart training."""
    model = BoringModel()
    sharded_trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)], fast_dev_run=True)
    sharded_trainer.fit(model)

    # Persist the trained weights, then reload them into a fresh model.
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    sharded_trainer.save_checkpoint(checkpoint_path)
    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

    # Fine-tune the restored model with a plain (non-sharded) trainer.
    finetune_trainer = Trainer(fast_dev_run=True)
    finetune_trainer.fit(saved_model)
def test_early_stop(tmpdir, ray_start_2_cpus):
    """Tests if early stopping callback works correctly."""
    model = BoringModel()
    early_stop = EarlyStopping(monitor="val_loss", patience=2, verbose=True)
    trainer = get_trainer(
        tmpdir,
        max_epochs=500,
        accelerator=RayAccelerator(num_workers=1, use_gpu=False),
        callbacks=[early_stop],
        limit_train_batches=1.0,
        limit_val_batches=1.0,
        progress_bar_refresh_rate=1)
    trainer.fit(model)
    # With patience=2, training should have stopped after the second
    # validation epoch; reload the best checkpoint and verify.
    trained_model = BoringModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)
    assert trained_model.val_epoch == 2, trained_model.val_epoch
def test_train_client(tmpdir, start_ray_client_server_2_cpus, seed, num_slots):
    """Tests if training modifies model weights."""
    # The fixture connects via Ray Client; confirm the connection is live.
    assert ray.util.client.ray.is_connected()
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=False)
    trainer = get_trainer(tmpdir, plugins=[horovod_plugin])
    train_test(trainer, BoringModel())
# Example 6
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works.

    Connects to an existing Ray cluster and trains with one worker per
    available GPU.
    """
    ray.init("auto")
    # Ray reports resource counts as floats (e.g. 2.0); num_workers must
    # be an int or downstream integer iteration over workers breaks.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, accelerator=accelerator, use_gpu=True)
    train_test(trainer, model)
# Example 7
 def _inner_train(config):
     # Fit a fresh model, merging the tuning `config` into the trainer
     # arguments captured from the enclosing scope.
     trainer = get_trainer(
         dir,
         use_gpu=use_gpu,
         callbacks=callbacks,
         accelerator=accelerator,
         **config)
     trainer.fit(BoringModel())
# Example 8
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works.

    Connects to an existing Ray cluster and trains with one worker per
    available GPU.
    """
    ray.init("auto")
    # Ray reports resource counts as floats (e.g. 2.0); num_workers must
    # be an int or downstream integer iteration over workers breaks.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    plugin = RayPlugin(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, plugins=[plugin], use_gpu=True)
    train_test(trainer, model)
def test_ddp_sharded_plugin_checkpoint(tmpdir, ray_start_2_cpus, seed):
    """Tests if checkpoint is saved correctly."""
    model = BoringModel()
    trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)], fast_dev_run=True)
    trainer.fit(model)

    # Save a checkpoint and reload it into a fresh model instance.
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(checkpoint_path)
    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

    # The reloaded parameters must match the trained ones exactly.
    for trained_param, loaded_param in zip(model.parameters(),
                                           saved_model.parameters()):
        assert torch.equal(trained_param, loaded_param)
def test_ddp_sharded_plugin_test(tmpdir, ray_start_2_cpus, seed):
    """Tests if test works without fit."""
    # Calling .test() directly, with no prior .fit(), should succeed.
    trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)], fast_dev_run=True)
    trainer.test(BoringModel())
 def _inner_train(config):
     # Fit a fresh model, merging the tuning `config` into the plugin and
     # trainer arguments captured from the enclosing scope. Checkpointing
     # is disabled for these runs.
     trainer = get_trainer(
         dir,
         use_gpu=use_gpu,
         callbacks=callbacks,
         plugins=[plugin],
         checkpoint_callback=False,
         **config)
     trainer.fit(BoringModel())
def test_ddp_sharded_plugin_resume_from_checkpoint_downsize(
        tmpdir, ray_start_2_cpus, seed):
    """Tests if we can save and resume training with less workers."""
    trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)], fast_dev_run=True)
    trainer.fit(BoringModel())

    # Save a checkpoint from the two-worker run.
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(checkpoint_path)

    # Resume from that checkpoint with only a single worker.
    resume_trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=1)],
        fast_dev_run=True,
        resume_from_checkpoint=checkpoint_path)
    resume_trainer.fit(BoringModel())
# Example 13
def test_unused_parameters(tmpdir, ray_start_2_cpus):
    """Tests if find_unused_parameters is properly passed to model."""

    class UnusedParameterCallback(Callback):
        def on_train_start(self, trainer, pl_module):
            # The wrapped model must carry the flag we passed to the plugin.
            assert trainer.model.find_unused_parameters is False

    ray_plugin = RayPlugin(
        num_workers=2, use_gpu=False, find_unused_parameters=False)
    trainer = get_trainer(
        tmpdir, plugins=[ray_plugin], callbacks=[UnusedParameterCallback()])
    trainer.fit(BoringModel())
def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir, ray_start_2_cpus,
                                                   seed):
    """Tests if resuming from checkpoint works."""
    trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)], fast_dev_run=True)
    trainer.fit(BoringModel())

    # Save a checkpoint from the first run.
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(checkpoint_path)

    # Resume from that checkpoint with the same worker count.
    resume_trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)],
        fast_dev_run=True,
        resume_from_checkpoint=checkpoint_path)
    resume_trainer.fit(BoringModel())
# Example 15
def test_model_to_gpu(tmpdir, ray_start_2_gpus):
    """Tests if model is placed on CUDA device."""

    class CheckGPUCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            # During training, the parameters should live on a CUDA device.
            assert next(pl_module.parameters()).is_cuda

    trainer = get_trainer(
        tmpdir,
        plugins=[RayPlugin(num_workers=2, use_gpu=True)],
        use_gpu=True,
        callbacks=[CheckGPUCallback()])
    trainer.fit(BoringModel())
def test_ddp_choice_sharded(tmpdir, ray_start_2_cpus, seed):
    """Tests if sharded plugin is properly recognized."""

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            # Exit as soon as the plugin type has been verified.
            assert isinstance(trainer.accelerator.training_type_plugin,
                              RayShardedPlugin)
            raise SystemExit()

    trainer = Trainer(
        fast_dev_run=True,
        plugins=[RayShardedPlugin(num_workers=2)],
        callbacks=[CB()])
    # The callback raises SystemExit once the check has run.
    with pytest.raises(SystemExit):
        trainer.fit(BoringModel())
# Example 17
def test_correct_devices(tmpdir, ray_start_2_gpus):
    """Tests if GPU devices are correctly set."""
    model = BoringModel()

    class CheckDevicesCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            # Each worker sees only its own GPU, so the root GPU is 0.
            assert trainer.root_gpu == 0
            # CUDA_VISIBLE_DEVICES should isolate the worker's GPU by rank.
            assert int(os.environ["CUDA_VISIBLE_DEVICES"]) == \
                trainer.local_rank
            assert trainer.root_gpu == pl_module.device.index
            assert torch.cuda.current_device() == trainer.root_gpu

    plugin = RayPlugin(num_workers=2, use_gpu=True)
    # Wrap the plugin in a list, consistent with every other test here.
    trainer = get_trainer(tmpdir,
                          plugins=[plugin],
                          use_gpu=True,
                          callbacks=[CheckDevicesCallback()])
    trainer.fit(model)
# Example 18
def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if model checkpoint can be loaded."""
    horovod_accelerator = HorovodRayAccelerator(
        num_slots=num_slots, use_gpu=True)
    trainer = get_trainer(
        tmpdir, accelerator=horovod_accelerator, use_gpu=True)
    load_test(trainer, BoringModel())
# Example 19
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if model checkpoint can be loaded."""
    ray_plugin = RayPlugin(num_workers=num_workers, use_gpu=False)
    trainer = get_trainer(tmpdir, plugins=[ray_plugin])
    load_test(trainer, BoringModel())
# Example 20
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if training modifies model weights."""
    ray_plugin = RayPlugin(num_workers=num_workers)
    trainer = get_trainer(tmpdir, plugins=[ray_plugin])
    train_test(trainer, BoringModel())
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if model checkpoint can be loaded."""
    ray_accelerator = RayAccelerator(num_workers=num_workers, use_gpu=False)
    trainer = get_trainer(tmpdir, accelerator=ray_accelerator)
    load_test(trainer, BoringModel())
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if training modifies model weights."""
    ray_accelerator = RayAccelerator(num_workers=num_workers)
    trainer = get_trainer(tmpdir, accelerator=ray_accelerator)
    train_test(trainer, BoringModel())
# Example 23
def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if training modifies model weights."""
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=True)
    trainer = get_trainer(tmpdir, plugins=[horovod_plugin], use_gpu=True)
    train_test(trainer, BoringModel())
# Example 24
def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if model checkpoint can be loaded."""
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=True)
    trainer = get_trainer(tmpdir, plugins=[horovod_plugin], use_gpu=True)
    load_test(trainer, BoringModel())
# Example 25
def test_train_client(tmpdir, start_ray_client_server_2_cpus, num_workers):
    """Tests if training modifies model weights over a Ray Client session."""
    # The fixture connects via Ray Client; confirm the connection is live.
    assert ray.util.client.ray.is_connected()
    ray_plugin = RayPlugin(num_workers=num_workers)
    trainer = get_trainer(tmpdir, plugins=[ray_plugin])
    train_test(trainer, BoringModel())
# Example 26
def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if training modifies model weights."""
    horovod_accelerator = HorovodRayAccelerator(
        num_slots=num_slots, use_gpu=True)
    trainer = get_trainer(
        tmpdir, accelerator=horovod_accelerator, use_gpu=True)
    train_test(trainer, BoringModel())