Example #1
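These snippets are excerpted from the PyTorch Lightning test suite (roughly the v1.0-v1.1 era) and were extracted without their module-level imports. A plausible shared preamble is sketched below; exact import paths shifted between releases, so treat them as assumptions rather than verbatim upstream code.

import os
from unittest import mock

import pytest
import torch

from pytorch_lightning import Callback, Trainer, accelerators
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.cluster_environments import (
    ClusterEnvironment,
    SLURMEnvironment,
    TorchElasticEnvironment,
)
from pytorch_lightning.plugins.apex import ApexPlugin
from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
from pytorch_lightning.plugins.native_amp import NativeAMPPlugin
from pytorch_lightning.plugins.rpc_plugin import RPCPlugin
from pytorch_lightning.plugins.sharded_native_amp_plugin import ShardedNativeAMPPlugin
from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.base.boring_model import BoringModel
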
def test_eval_train_calls(test_train_mock, test_eval_mock, val_train_mock, val_eval_mock, tmpdir):
    """
    Tests that model.eval() and model.train() are called the expected number of times around the validation and test loops
    """
    model = BoringModel()
    model.validation_epoch_end = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=2,
        limit_val_batches=2,
        max_epochs=2,
        row_log_interval=1,
        weights_summary=None,
    )

    trainer.fit(model)
    trainer.test()

    # sanity + 2 epochs
    assert val_eval_mock.call_count == 3
    assert val_train_mock.call_count == 3

    # test is called only once
    assert test_eval_mock.call_count == 1
    assert test_train_mock.call_count == 1
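The four mock parameters above imply a stack of @mock.patch decorators that was stripped when this example was extracted, and the exact patch targets cannot be recovered from the snippet. A structural sketch follows; the dotted targets are hypothetical placeholders, not real attributes:

# Stacked @mock.patch decorators bind bottom-up: the lowest decorator
# supplies the first mock argument (test_train_mock). Targets below are
# placeholders only.
@mock.patch('some.module.val_eval_hook')    # -> val_eval_mock
@mock.patch('some.module.val_train_hook')   # -> val_train_mock
@mock.patch('some.module.test_eval_hook')   # -> test_eval_mock
@mock.patch('some.module.test_train_hook')  # -> test_train_mock
def test_eval_train_calls(test_train_mock, test_eval_mock, val_train_mock, val_eval_mock, tmpdir):
    ...
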
def test_ddp_sharded_plugin_resume_from_checkpoint_downsize_gpus(tmpdir):
    """
        Test to ensure that resuming from checkpoint works when downsizing the number of GPUs
    """
    model = BoringModel()
    trainer = Trainer(
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
        gpus=2,
    )

    trainer.fit(model)

    checkpoint_path = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(checkpoint_path)

    model = BoringModel()

    trainer = Trainer(accelerator='ddp_spawn',
                      plugins=[DDPShardedPlugin()],
                      fast_dev_run=True,
                      gpus=1,
                      resume_from_checkpoint=checkpoint_path)

    trainer.fit(model)

def test_accelerator_choice_ddp_cpu_custom_cluster(tmpdir):
    """
    Test that we choose the custom cluster even when SLURM or TE flags are around
    """
    class CustomCluster(ClusterEnvironment):
        def master_address(self):
            return 'asdf'

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp
            assert isinstance(trainer.accelerator_backend,
                              accelerators.DDPCPUSLURMBackend)
            assert isinstance(trainer.accelerator_backend.cluster_environment,
                              CustomCluster)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(plugins=[CustomCluster()],
                      fast_dev_run=True,
                      distributed_backend='ddp_cpu',
                      num_processes=1,
                      callbacks=[CB()])

    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_ddp_choice_sharded_amp(tmpdir, ddp_backend, gpus, num_processes):
    """
        Test to ensure that the sharded native AMP plugin is correctly chosen when using sharded training
    """
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend.ddp_plugin,
                              DDPShardedPlugin)
            assert isinstance(trainer.precision_connector.backend,
                              ShardedNativeAMPPlugin)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        gpus=gpus,
        precision=16,
        num_processes=num_processes,
        accelerator=ddp_backend,
        plugins=[DDPShardedPlugin()],
        callbacks=[CB()],
    )

    with pytest.raises(SystemExit):
        trainer.fit(model)
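This test, like the other tests in this collection that accept ddp_backend, gpus and num_processes, is parametrized in the upstream suite; the decorators were stripped during extraction. A plausible reconstruction, with the parameter values and the GPU skip marker as assumptions:

# Assumed parametrization and guard; adjust to the backends under test.
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@pytest.mark.parametrize(
    ["ddp_backend", "gpus", "num_processes"],
    [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)],
)
def test_ddp_choice_sharded_amp(tmpdir, ddp_backend, gpus, num_processes):
    ...
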
Example #5
def test_resume_training_on_cpu(tmpdir):
    """ Checks if training can be resumed from a saved checkpoint on CPU"""

    # Train a model on TPU
    model = BoringModel()
    trainer = Trainer(
        checkpoint_callback=True,
        max_epochs=1,
        tpu_cores=8,
    )
    trainer.fit(model)

    model_path = trainer.checkpoint_callback.best_model_path

    # Verify saved Tensors are on CPU
    ckpt = torch.load(model_path)
    weight_tensor = list(ckpt["state_dict"].values())[0]
    assert weight_tensor.device == torch.device("cpu")

    # Verify that training is resumed on CPU
    trainer = Trainer(resume_from_checkpoint=model_path,
                      checkpoint_callback=True,
                      max_epochs=1,
                      default_root_dir=tmpdir)
    result = trainer.fit(model)

    assert result == 1
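This test (and test_if_test_works_after_train further down) requests tpu_cores=8, so it only runs on an XLA-enabled machine; the upstream suite guards its TPU tests with a skip marker that was stripped here. A sketch, assuming the v1.0-era XLADeviceUtils helper:

# Assumed guard; the exact availability helper may differ by release.
from pytorch_lightning.utilities.xla_device_utils import XLADeviceUtils

TPU_AVAILABLE = XLADeviceUtils.tpu_device_exists()

@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
def test_resume_training_on_cpu(tmpdir):
    ...
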
def test_rpc_function_calls_ddp(tmpdir):
    model = BoringModel()
    plugin = CustomRPCPlugin()
    max_epochs = 2
    limit_train_batches = 2
    trainer = Trainer(limit_train_batches=limit_train_batches,
                      limit_val_batches=2,
                      max_epochs=max_epochs,
                      gpus=2,
                      distributed_backend='ddp',
                      plugins=[plugin])

    trainer.fit(model)
    if trainer.global_rank == 0:  # Main process
        assert plugin.rpc_save_model_count == max_epochs
        assert plugin.on_main_rpc_connect_count == 1
        assert plugin.worker_optimizer_step_count == max_epochs * limit_train_batches
        # Called once at init, then once per optimizer step
        assert plugin.is_main_rpc_process_count == 1 + plugin.worker_optimizer_step_count
        assert plugin.on_exit_rpc_process_count == 0
    else:  # Worker process
        assert plugin.rpc_save_model_count == max_epochs
        assert plugin.on_main_rpc_connect_count == 0
        # Never signaled by worker, only by main process
        assert plugin.worker_optimizer_step_count == 0
        # Called once at init, then once per optimizer step
        assert plugin.is_main_rpc_process_count == 1 + (max_epochs *
                                                        limit_train_batches)
        # Called at init
        assert plugin.on_exit_rpc_process_count == 1
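CustomRPCPlugin is not defined in this snippet. A minimal counting subclass consistent with the assertions above is sketched here; the hook names follow the v1.0/v1.1-era RPCPlugin API and should be treated as assumptions:

class CustomRPCPlugin(RPCPlugin):
    """Counts hook invocations so the test can assert where and how
    often each RPC hook fires (a sketch; hook names are assumptions)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.rpc_save_model_count = 0
        self.on_main_rpc_connect_count = 0
        self.worker_optimizer_step_count = 0
        self.is_main_rpc_process_count = 0
        self.on_exit_rpc_process_count = 0

    def on_main_rpc_connection(self, trainer):
        self.on_main_rpc_connect_count += 1

    def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module):
        self.rpc_save_model_count += 1

    def worker_optimizer_step(self, model, opt_idx, *args, **kwargs):
        self.worker_optimizer_step_count += 1

    def on_exit_rpc_process(self, trainer):
        self.on_exit_rpc_process_count += 1

    @property
    def is_main_rpc_process(self):
        self.is_main_rpc_process_count += 1
        return super().is_main_rpc_process
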
Example #7
def test_accelerator_choice_cpu(tmpdir):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend,
                              accelerators.CPUBackend)

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True, callbacks=[CB()])
    trainer.fit(model)

def test_ddp_sharded_plugin_finetune(tmpdir):
    """
        Test to ensure that we can save and restart training (simulate fine-tuning)
    """
    model = BoringModel()
    trainer = Trainer(
        gpus=2,
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )
    trainer.fit(model)

    checkpoint_path = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(checkpoint_path)
    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

    trainer = Trainer(fast_dev_run=True)
    trainer.fit(saved_model)
Example #9
def test_accelerator_choice_cpu(tmpdir):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend,
                              accelerators.CPUAccelerator)
            assert isinstance(trainer.accelerator_backend.cluster_environment,
                              TorchElasticEnvironment)

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True, callbacks=[CB()])
    trainer.fit(model)

def test_ddp_sharded_plugin_test(tmpdir):
    """
        Test to ensure that .test() can be used without calling .fit() first
    """
    model = BoringModel()
    trainer = Trainer(
        accelerator='ddp_cpu',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )

    trainer.test(model)

def test_ddp_sharded_plugin_checkpoint_cpu(tmpdir):
    """
        Test to ensure that checkpoint is saved correctly
    """
    model = BoringModel()
    trainer = Trainer(
        accelerator='ddp_cpu',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )

    trainer.fit(model)

    checkpoint_path = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(checkpoint_path)
    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

    # Assert model parameters are identical after loading
    for ddp_param, shard_param in zip(model.parameters(),
                                      saved_model.parameters()):
        assert torch.equal(ddp_param, shard_param)
Example #12
def test_if_test_works_after_train(tmpdir):
    """ Ensure that .test() works after .fit() """

    # Train a model on TPU
    model = BoringModel()
    trainer = Trainer(checkpoint_callback=True,
                      max_epochs=1,
                      tpu_cores=8,
                      default_root_dir=tmpdir)
    trainer.fit(model)

    assert trainer.test() == 1

def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir):
    """
        Test to ensure that resuming from checkpoint works when going from GPU to CPU
    """
    model = BoringModel()
    trainer = Trainer(accelerator='ddp_spawn',
                      plugins=[DDPShardedPlugin()],
                      gpus=1,
                      fast_dev_run=True)

    trainer.fit(model)

    checkpoint_path = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(checkpoint_path)

    model = BoringModel()

    trainer = Trainer(plugins=[DDPShardedPlugin()],
                      accelerator='ddp_cpu',
                      fast_dev_run=True,
                      resume_from_checkpoint=checkpoint_path)

    trainer.fit(model)
Example #14
def test_accelerator_choice_ddp_cpu(tmpdir):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend,
                              accelerators.DDPCPUSpawnBackend)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True,
                      distributed_backend='ddp_cpu',
                      callbacks=[CB()])

    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_overfit_basic(tmpdir, overfit):
    """
    Tests that fitting with overfit_batches set completes without error
    """

    model = BoringModel()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        overfit_batches=overfit,
        weights_summary=None,
    )

    trainer.fit(model)
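The overfit argument implies a stripped @pytest.mark.parametrize decorator; overfit_batches accepts both integer batch counts and fractions of the training set, so a plausible reconstruction is:

# Assumed values; any mix of ints and fractions exercises the flag.
@pytest.mark.parametrize('overfit', [1, 2, 0.1, 0.25, 1.0])
def test_overfit_basic(tmpdir, overfit):
    ...
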
def test_invalid_apex_sharded(tmpdir):
    """
        Test to ensure that an error is raised when trying to use Apex AMP together with the sharded plugin
    """

    model = BoringModel()
    with pytest.raises(MisconfigurationException,
                       match='Sharded Plugin is not supported with Apex AMP'):
        trainer = Trainer(fast_dev_run=True,
                          distributed_backend='ddp_spawn',
                          plugins=[DDPShardedPlugin()],
                          precision=16,
                          amp_backend='apex')

        trainer.fit(model)

def test_dist_backend_accelerator_mapping(tmpdir):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend,
                              accelerators.DDPCPUSLURMBackend)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True,
                      accelerator='ddp_cpu',
                      num_processes=1,
                      callbacks=[CB()])

    with pytest.raises(SystemExit):
        trainer.fit(model)
Example #18
def test_ddp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend.ddp_plugin,
                              DDPPlugin)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True,
                      gpus=gpus,
                      num_processes=num_processes,
                      distributed_backend=ddp_backend,
                      callbacks=[CB()])

    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_rpc_choice(tmpdir, ddp_backend, gpus, num_processes):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend.ddp_plugin,
                              RPCPlugin)
            raise RuntimeError('finished plugin check')

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True,
                      gpus=gpus,
                      num_processes=num_processes,
                      distributed_backend=ddp_backend,
                      callbacks=[CB()],
                      plugins=[RPCPlugin()])

    with pytest.raises(RuntimeError, match='finished plugin check'):
        trainer.fit(model)

def test_accelerator_choice_ddp_cpu(tmpdir):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp
            assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUSpawnAccelerator)
            assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        accelerator='ddp_cpu',
        callbacks=[CB()],
    )

    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.precision_connector.backend, ApexPlugin)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True,
                      precision=16,
                      amp_backend='apex',
                      gpus=gpus,
                      num_processes=num_processes,
                      distributed_backend=ddp_backend,
                      callbacks=[CB()])

    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_accelerator_choice_ddp_cpu_slurm(tmpdir):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp
            assert isinstance(trainer.accelerator_backend,
                              accelerators.DDPCPUSLURMBackend)
            assert isinstance(trainer.accelerator_backend.cluster_environment,
                              SLURMEnvironment)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True,
                      distributed_backend='ddp_cpu',
                      num_processes=1,
                      callbacks=[CB()])

    with pytest.raises(SystemExit):
        trainer.fit(model)
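The SLURM-flavored tests in this collection (this one, test_dist_backend_accelerator_mapping, and test_accelerator_choice_ddp_cpu_custom_cluster) only resolve to the SLURM backend because the upstream suite injects SLURM environment variables via a decorator that was stripped here. A plausible form, with the exact values as assumptions:

# Assumed environment; SLURM_NTASKS should match num_processes.
@mock.patch.dict(os.environ, {
    "SLURM_NTASKS": "1",
    "SLURM_JOB_NAME": "SOME_NAME",
    "SLURM_NODEID": "0",
    "SLURM_LOCALID": "0",
    "LOCAL_RANK": "0",
})
def test_accelerator_choice_ddp_cpu_slurm(tmpdir):
    ...
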
def test_accelerator_choice_ddp2_te(tmpdir):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp2
            assert isinstance(trainer.accelerator_backend,
                              accelerators.DDP2Backend)
            assert isinstance(trainer.accelerator_backend.cluster_environment,
                              TorchElasticEnvironment)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True,
                      distributed_backend='ddp2',
                      gpus=2,
                      callbacks=[CB()])

    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_ddp_choice_string_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend.ddp_plugin,
                              DDPShardedPlugin)
            raise RuntimeError('finished plugin check')

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        gpus=gpus,
        num_processes=num_processes,
        accelerator=ddp_backend,
        plugins='ddp_sharded',
        callbacks=[CB()],
    )

    with pytest.raises(RuntimeError, match='finished plugin check'):
        trainer.fit(model)

def test_custom_accelerator(tmpdir):
    class Accel(Accelerator):
        def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None:
            pass

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend, Accel)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        accelerator=Accel(),
        num_processes=1,
        callbacks=[CB()]
    )

    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_accelerator_choice_ddp2_te(tmpdir):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp2
            assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator)
            assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment)
            assert trainer.accelerator_backend.task_idx == 10
            assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        accelerator='ddp2',
        gpus=2,
        callbacks=[CB()],
    )

    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_accelerator_choice_ddp_cpu_te(tmpdir):
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp
            assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator)
            assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment)
            assert trainer.accelerator_backend.task_idx == 10
            assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx

            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        distributed_backend='ddp_cpu',
        num_processes=1,
        callbacks=[CB()]
    )

    with pytest.raises(SystemExit):
        trainer.fit(model)
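Likewise, the TorchElastic tests (the *_te variants and the two tests asserting TorchElasticEnvironment) can only observe task_idx == 10 because the upstream suite mocks the elastic launcher's environment variables; the stripped decorator plausibly looked like this, with the values reconstructed from the assertions:

# LOCAL_RANK "10" is what surfaces as task_idx == 10 in the callbacks.
@mock.patch.dict(os.environ, {
    "WORLD_SIZE": "2",
    "LOCAL_RANK": "10",
    "NODE_RANK": "0",
})
def test_accelerator_choice_ddp_cpu_te(tmpdir):
    ...
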
Example #28
def test_custom_required_plugins(tmpdir, ddp_backend, gpus, num_processes):
    """
    Test to ensure that if a plugin requires certain plugins to be added, they are added automatically
    """
    class RequiredPlugin(NativeAMPPlugin):
        """
        A custom AMP plugin that CustomPlugin requires by default.
        This lets us verify that the required plugin is added automatically when
        CustomPlugin is used, rather than requiring the user to pass it in manually.
        """

    class CustomPlugin(DDPPlugin):
        def required_plugins(self, amp_backend: AMPType,
                             trainer: Trainer) -> list:
            return [RequiredPlugin(trainer=trainer)]

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend.ddp_plugin,
                              CustomPlugin)
            assert isinstance(trainer.precision_connector.backend,
                              RequiredPlugin)
            raise RuntimeError('finished plugin check')

    model = BoringModel()
    with pytest.warns(
            UserWarning,
            match=f'plugin {type(CustomPlugin())} has added additional '
            f'required plugins as default: {[type(RequiredPlugin())]}*'):
        trainer = Trainer(
            fast_dev_run=True,
            gpus=gpus,
            num_processes=num_processes,
            distributed_backend=ddp_backend,
            plugins=[CustomPlugin()],
            callbacks=[CB()],
        )
    with pytest.raises(RuntimeError, match='finished plugin check'):
        trainer.fit(model)

def test_ddp_choice_sharded(tmpdir, ddp_backend, gpus, num_processes):
    """
        Test to ensure that the sharded DDP plugin is correctly chosen
    """
    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend.ddp_plugin,
                              DDPShardedPlugin)
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        gpus=gpus,
        num_processes=num_processes,
        distributed_backend=ddp_backend,
        plugins=[DDPShardedPlugin()],
        callbacks=[CB()],
    )

    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_ddp_choice_custom_ddp_cpu_custom_args(tmpdir, ddp_backend, gpus,
                                               num_processes):
    class MyDDP(DDPPlugin):
        pass

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert isinstance(trainer.accelerator_backend.ddp_plugin, MyDDP)
            raise RuntimeError('finished plugin check')

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        gpus=gpus,
        num_processes=num_processes,
        accelerator=ddp_backend,
        plugins=[MyDDP(broadcast_buffers=False, find_unused_parameters=True)],
        callbacks=[CB()],
    )

    with pytest.raises(RuntimeError, match='finished plugin check'):
        trainer.fit(model)