def test_eval_train_calls(test_train_mock, test_eval_mock, val_train_mock, val_eval_mock, tmpdir):
    """
    Check how often the injected train/eval mocks are called around validation and test runs.
    """
    boring_model = BoringModel()
    boring_model.validation_epoch_end = None

    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        limit_train_batches=2,
        limit_val_batches=2,
        max_epochs=2,
        row_log_interval=1,
        weights_summary=None,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(boring_model)
    trainer.test()

    expected_calls = (
        # validation: sanity check + 2 epochs
        (val_eval_mock, 3),
        (val_train_mock, 3),
        # test is called only once
        (test_eval_mock, 1),
        (test_train_mock, 1),
    )
    for patched, expected in expected_calls:
        assert patched.call_count == expected
def test_ddp_sharded_plugin_resume_from_checkpoint_downsize_gpus(tmpdir):
    """
    Fit with the sharded plugin on 2 GPUs, save a checkpoint, then resume from it on a single GPU.
    """
    initial_model = BoringModel()
    two_gpu_trainer = Trainer(
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
        gpus=2,
    )
    two_gpu_trainer.fit(initial_model)

    ckpt_file = os.path.join(tmpdir, 'model.pt')
    two_gpu_trainer.save_checkpoint(ckpt_file)

    # A fresh model and trainer pick the run back up with fewer devices.
    resumed_model = BoringModel()
    one_gpu_trainer = Trainer(
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
        gpus=1,
        resume_from_checkpoint=ckpt_file,
    )
    one_gpu_trainer.fit(resumed_model)
def test_accelerator_choice_ddp_cpu_custom_cluster(tmpdir):
    """
    A cluster environment passed as a plugin must be the one in use, even when SLURM or TE flags are around.
    """

    class CustomCluster(ClusterEnvironment):
        def master_address(self):
            return 'asdf'

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp
            backend = trainer.accelerator_backend
            assert isinstance(backend, accelerators.DDPCPUSLURMBackend)
            assert isinstance(backend.cluster_environment, CustomCluster)
            # Stop right after the checks; no actual training is needed.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        plugins=[CustomCluster()],
        fast_dev_run=True,
        distributed_backend='ddp_cpu',
        num_processes=1,
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_ddp_choice_sharded_amp(tmpdir, ddp_backend, gpus, num_processes):
    """
    With the sharded DDP plugin and 16-bit precision, the sharded native AMP plugin must be the precision backend.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            ddp_plugin = trainer.accelerator_backend.ddp_plugin
            assert isinstance(ddp_plugin, DDPShardedPlugin)
            amp_backend = trainer.precision_connector.backend
            assert isinstance(amp_backend, ShardedNativeAMPPlugin)
            # Abort the fit once the selection has been verified.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        gpus=gpus,
        precision=16,
        num_processes=num_processes,
        accelerator=ddp_backend,
        plugins=[DDPShardedPlugin()],
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_resume_training_on_cpu(tmpdir):
    """
    Checks if training can be resumed on CPU from a checkpoint saved by a TPU run.

    Both trainers use ``tmpdir`` as their root directory so that the
    checkpoints and logs of the first run do not end up in the current
    working directory.
    """
    # Train a model on TPU
    model = BoringModel()
    trainer = Trainer(
        checkpoint_callback=True,
        max_epochs=1,
        tpu_cores=8,
        default_root_dir=tmpdir,
    )
    trainer.fit(model)

    model_path = trainer.checkpoint_callback.best_model_path

    # Verify saved Tensors are on CPU
    ckpt = torch.load(model_path)
    weight_tensor = list(ckpt["state_dict"].values())[0]
    assert weight_tensor.device == torch.device("cpu")

    # Verify that training is resumed on CPU
    trainer = Trainer(
        resume_from_checkpoint=model_path,
        checkpoint_callback=True,
        max_epochs=1,
        default_root_dir=tmpdir,
    )
    result = trainer.fit(model)
    assert result == 1
def test_rpc_function_calls_ddp(tmpdir):
    """
    Fit with ``CustomRPCPlugin`` on 2 GPUs under DDP and check its call counters on main and worker ranks.
    """
    boring_model = BoringModel()
    plugin = CustomRPCPlugin()
    max_epochs = 2
    limit_train_batches = 2

    trainer = Trainer(
        limit_train_batches=limit_train_batches,
        limit_val_batches=2,
        max_epochs=max_epochs,
        gpus=2,
        distributed_backend='ddp',
        plugins=[plugin],
    )
    trainer.fit(boring_model)

    total_train_steps = max_epochs * limit_train_batches
    on_main_process = trainer.global_rank == 0

    if not on_main_process:
        # Worker process
        assert plugin.rpc_save_model_count == max_epochs
        assert plugin.on_main_rpc_connect_count == 0
        # Never signaled by worker, only by main process
        assert plugin.worker_optimizer_step_count == 0
        # Call once at init, and at optim step
        assert plugin.is_main_rpc_process_count == 1 + (total_train_steps)
        # Called at init
        assert plugin.on_exit_rpc_process_count == 1
    else:
        # Main process
        assert plugin.rpc_save_model_count == max_epochs
        assert plugin.on_main_rpc_connect_count == 1
        assert plugin.worker_optimizer_step_count == total_train_steps
        # Call once at init, and at optim step
        assert plugin.is_main_rpc_process_count == 1 + plugin.worker_optimizer_step_count
        assert plugin.on_exit_rpc_process_count == 0
def test_accelerator_choice_cpu(tmpdir):
    """
    A Trainer created without any device arguments should end up with the CPU backend.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            backend = trainer.accelerator_backend
            assert isinstance(backend, accelerators.CPUBackend)

    boring_model = BoringModel()
    checker = CB()
    trainer = Trainer(fast_dev_run=True, callbacks=[checker])
    trainer.fit(boring_model)
def test_ddp_sharded_plugin_finetune(tmpdir):
    """
    Save a checkpoint from a sharded run and fit the reloaded model again (simulates fine-tuning).
    """
    boring_model = BoringModel()
    sharded_trainer = Trainer(
        gpus=2,
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )
    sharded_trainer.fit(boring_model)

    ckpt_file = os.path.join(tmpdir, 'model.pt')
    sharded_trainer.save_checkpoint(ckpt_file)

    # Restart training from the stored weights with a plain trainer.
    reloaded_model = BoringModel.load_from_checkpoint(ckpt_file)
    finetune_trainer = Trainer(fast_dev_run=True)
    finetune_trainer.fit(reloaded_model)
def test_accelerator_choice_cpu(tmpdir):
    """
    A Trainer without device arguments should use the CPU accelerator with a TorchElastic cluster environment.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            backend = trainer.accelerator_backend
            assert isinstance(backend, accelerators.CPUAccelerator)
            assert isinstance(backend.cluster_environment, TorchElasticEnvironment)

    boring_model = BoringModel()
    checker = CB()
    trainer = Trainer(fast_dev_run=True, callbacks=[checker])
    trainer.fit(boring_model)
def test_ddp_sharded_plugin_test(tmpdir):
    """
    ``trainer.test`` must be usable with the sharded plugin without a preceding ``fit``.
    """
    boring_model = BoringModel()
    trainer_kwargs = dict(
        accelerator='ddp_cpu',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.test(boring_model)
def test_ddp_sharded_plugin_checkpoint_cpu(tmpdir):
    """
    A checkpoint saved after a sharded CPU run must restore the same parameters.
    """
    trained_model = BoringModel()
    trainer = Trainer(
        accelerator='ddp_cpu',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )
    trainer.fit(trained_model)

    ckpt_file = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(ckpt_file)
    reloaded_model = BoringModel.load_from_checkpoint(ckpt_file)

    # Compare the trained and reloaded parameters pairwise.
    param_pairs = zip(trained_model.parameters(), reloaded_model.parameters())
    for trained_param, reloaded_param in param_pairs:
        assert torch.equal(trained_param, reloaded_param)
def test_if_test_works_after_train(tmpdir):
    """
    ``.test()`` should return 1 when called after ``.fit()`` on a TPU trainer.
    """
    # Train a model on TPU
    boring_model = BoringModel()
    trainer_kwargs = dict(
        checkpoint_callback=True,
        max_epochs=1,
        tpu_cores=8,
        default_root_dir=tmpdir,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(boring_model)

    test_result = trainer.test()
    assert test_result == 1
def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir):
    """
    Test to ensure that resuming from checkpoint works when going from GPUs -> CPU
    """
    gpu_model = BoringModel()
    gpu_trainer = Trainer(
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        gpus=1,
        fast_dev_run=True,
    )
    gpu_trainer.fit(gpu_model)

    ckpt_file = os.path.join(tmpdir, 'model.pt')
    gpu_trainer.save_checkpoint(ckpt_file)

    # Resume with a fresh model on the CPU-based DDP accelerator.
    cpu_model = BoringModel()
    cpu_trainer = Trainer(
        plugins=[DDPShardedPlugin()],
        accelerator='ddp_cpu',
        fast_dev_run=True,
        resume_from_checkpoint=ckpt_file,
    )
    cpu_trainer.fit(cpu_model)
def test_accelerator_choice_ddp_cpu(tmpdir):
    """
    ``distributed_backend='ddp_cpu'`` should select the DDP CPU spawn backend.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            backend = trainer.accelerator_backend
            assert isinstance(backend, accelerators.DDPCPUSpawnBackend)
            # Abort the fit once the selection has been verified.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        distributed_backend='ddp_cpu',
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_overfit_basic(tmpdir, overfit):
    """
    Run a single-epoch fit with ``overfit_batches`` set to the supplied ``overfit`` value.
    """
    boring_model = BoringModel()
    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        overfit_batches=overfit,
        weights_summary=None,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(boring_model)
def test_invalid_apex_sharded(tmpdir):
    """
    Combining the sharded plugin with the apex AMP backend must raise a MisconfigurationException.
    """
    boring_model = BoringModel()
    expected_error = pytest.raises(
        MisconfigurationException,
        match='Sharded Plugin is not supported with Apex AMP',
    )

    # Construction and fit both run inside the context manager.
    with expected_error:
        trainer = Trainer(
            fast_dev_run=True,
            distributed_backend='ddp_spawn',
            plugins=[DDPShardedPlugin()],
            precision=16,
            amp_backend='apex',
        )
        trainer.fit(boring_model)
def test_dist_backend_accelerator_mapping(tmpdir):
    """
    Passing ``accelerator='ddp_cpu'`` with one process should yield the DDP CPU SLURM backend.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            backend = trainer.accelerator_backend
            assert isinstance(backend, accelerators.DDPCPUSLURMBackend)
            # Abort the fit once the selection has been verified.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        accelerator='ddp_cpu',
        num_processes=1,
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_ddp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
    """
    When no plugin is passed, the accelerator backend should carry a ``DDPPlugin``.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            ddp_plugin = trainer.accelerator_backend.ddp_plugin
            assert isinstance(ddp_plugin, DDPPlugin)
            # Abort the fit once the selection has been verified.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        gpus=gpus,
        num_processes=num_processes,
        distributed_backend=ddp_backend,
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_rpc_choice(tmpdir, ddp_backend, gpus, num_processes):
    """
    An ``RPCPlugin`` passed via ``plugins`` should become the backend's DDP plugin.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            ddp_plugin = trainer.accelerator_backend.ddp_plugin
            assert isinstance(ddp_plugin, RPCPlugin)
            # Signal that the check ran; the test expects exactly this error.
            raise RuntimeError('finished plugin check')

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        gpus=gpus,
        num_processes=num_processes,
        distributed_backend=ddp_backend,
        callbacks=[CB()],
        plugins=[RPCPlugin()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(RuntimeError, match='finished plugin check'):
        trainer.fit(boring_model)
def test_accelerator_choice_ddp_cpu(tmpdir):
    """
    ``accelerator='ddp_cpu'`` should give the DDP CPU spawn accelerator with a TorchElastic cluster environment.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp
            backend = trainer.accelerator_backend
            assert isinstance(backend, accelerators.DDPCPUSpawnAccelerator)
            assert isinstance(backend.cluster_environment, TorchElasticEnvironment)
            # Abort the fit once the selection has been verified.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        accelerator='ddp_cpu',
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
    """
    With ``precision=16`` and ``amp_backend='apex'`` the precision backend should be an ``ApexPlugin``.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            amp_backend = trainer.precision_connector.backend
            assert isinstance(amp_backend, ApexPlugin)
            # Abort the fit once the selection has been verified.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        precision=16,
        amp_backend='apex',
        gpus=gpus,
        num_processes=num_processes,
        distributed_backend=ddp_backend,
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_accelerator_choice_ddp_cpu_slurm(tmpdir):
    """
    ``ddp_cpu`` with one process should yield the DDP CPU SLURM backend and a SLURM cluster environment.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp
            backend = trainer.accelerator_backend
            assert isinstance(backend, accelerators.DDPCPUSLURMBackend)
            assert isinstance(backend.cluster_environment, SLURMEnvironment)
            # Abort the fit once the selection has been verified.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        distributed_backend='ddp_cpu',
        num_processes=1,
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_accelerator_choice_ddp2_te(tmpdir):
    """
    ``distributed_backend='ddp2'`` on 2 GPUs should give the DDP2 backend with a TorchElastic cluster environment.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp2
            backend = trainer.accelerator_backend
            assert isinstance(backend, accelerators.DDP2Backend)
            assert isinstance(backend.cluster_environment, TorchElasticEnvironment)
            # Abort the fit once the selection has been verified.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        distributed_backend='ddp2',
        gpus=2,
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_ddp_choice_string_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
    """
    Passing the string ``'ddp_sharded'`` as ``plugins`` should result in a ``DDPShardedPlugin``.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            ddp_plugin = trainer.accelerator_backend.ddp_plugin
            assert isinstance(ddp_plugin, DDPShardedPlugin)
            # Signal that the check ran; the test expects exactly this error.
            raise RuntimeError('finished plugin check')

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        gpus=gpus,
        num_processes=num_processes,
        accelerator=ddp_backend,
        plugins='ddp_sharded',
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(RuntimeError, match='finished plugin check'):
        trainer.fit(boring_model)
def test_custom_accelerator(tmpdir):
    """
    An ``Accelerator`` instance passed to the Trainer should be used as the accelerator backend.
    """

    class Accel(Accelerator):
        def init_ddp_connection(
            self,
            global_rank: int,
            world_size: int,
            is_slurm_managing_tasks: bool = True,
        ) -> None:
            pass

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            backend = trainer.accelerator_backend
            assert isinstance(backend, Accel)
            # Abort the fit once the selection has been verified.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        accelerator=Accel(),
        num_processes=1,
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_accelerator_choice_ddp2_te(tmpdir):
    """
    ``accelerator='ddp2'`` on 2 GPUs: check the DDP2 accelerator, its TorchElastic environment and task index.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp2
            backend = trainer.accelerator_backend
            assert isinstance(backend, accelerators.DDP2Accelerator)
            assert isinstance(backend.cluster_environment, TorchElasticEnvironment)
            assert backend.task_idx == 10
            # The environment's local rank and the task index must agree.
            assert backend.cluster_environment.local_rank() == backend.task_idx
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        accelerator='ddp2',
        gpus=2,
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_accelerator_choice_ddp_cpu_te(tmpdir):
    """
    ``ddp_cpu`` with one process: check the DDP CPU HPC accelerator, its TorchElastic environment and task index.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            assert trainer.use_ddp
            backend = trainer.accelerator_backend
            assert isinstance(backend, accelerators.DDPCPUHPCAccelerator)
            assert isinstance(backend.cluster_environment, TorchElasticEnvironment)
            assert backend.task_idx == 10
            # The environment's local rank and the task index must agree.
            assert backend.cluster_environment.local_rank() == backend.task_idx
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        distributed_backend='ddp_cpu',
        num_processes=1,
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_custom_required_plugins(tmpdir, ddp_backend, gpus, num_processes):
    """
    Plugins that a plugin declares via ``required_plugins`` should be added automatically, with a warning.
    """

    class RequiredPlugin(NativeAMPPlugin):
        """
        Custom AMP plugin that the DDP plugin below requires by default.
        Having CustomPlugin request it means the user does not have to
        pass it into the plugin list manually.
        """

    class CustomPlugin(DDPPlugin):
        def required_plugins(self, amp_backend: AMPType, trainer: Trainer) -> list:
            return [RequiredPlugin(trainer=trainer)]

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            ddp_plugin = trainer.accelerator_backend.ddp_plugin
            assert isinstance(ddp_plugin, CustomPlugin)
            amp_backend = trainer.precision_connector.backend
            assert isinstance(amp_backend, RequiredPlugin)
            # Signal that the check ran; the test expects exactly this error.
            raise RuntimeError('finished plugin check')

    boring_model = BoringModel()

    # NOTE(review): `match` is applied as a regular expression, so the list
    # repr's brackets act as a character class here - consider re.escape.
    expected_warning = (
        f'plugin {type(CustomPlugin())} has added additional '
        f'required plugins as default: {[type(RequiredPlugin())]}*'
    )
    with pytest.warns(UserWarning, match=expected_warning):
        trainer = Trainer(
            fast_dev_run=True,
            gpus=gpus,
            num_processes=num_processes,
            distributed_backend=ddp_backend,
            plugins=[CustomPlugin()],
            callbacks=[CB()],
        )

    with pytest.raises(RuntimeError, match='finished plugin check'):
        trainer.fit(boring_model)
def test_ddp_choice_sharded(tmpdir, ddp_backend, gpus, num_processes):
    """
    A ``DDPShardedPlugin`` passed via ``plugins`` should become the backend's DDP plugin.
    """

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            ddp_plugin = trainer.accelerator_backend.ddp_plugin
            assert isinstance(ddp_plugin, DDPShardedPlugin)
            # Abort the fit once the selection has been verified.
            raise SystemExit()

    boring_model = BoringModel()
    trainer_kwargs = dict(
        fast_dev_run=True,
        gpus=gpus,
        num_processes=num_processes,
        distributed_backend=ddp_backend,
        plugins=[DDPShardedPlugin()],
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(SystemExit):
        trainer.fit(boring_model)
def test_ddp_choice_custom_ddp_cpu_custom_args(tmpdir, ddp_backend, gpus, num_processes):
    """
    A ``DDPPlugin`` subclass built with custom keyword arguments should be kept as the backend's DDP plugin.
    """

    class MyDDP(DDPPlugin):
        pass

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            ddp_plugin = trainer.accelerator_backend.ddp_plugin
            assert isinstance(ddp_plugin, MyDDP)
            # Signal that the check ran; the test expects exactly this error.
            raise RuntimeError('finished plugin check')

    boring_model = BoringModel()
    custom_plugin = MyDDP(broadcast_buffers=False, find_unused_parameters=True)
    trainer_kwargs = dict(
        fast_dev_run=True,
        gpus=gpus,
        num_processes=num_processes,
        accelerator=ddp_backend,
        plugins=[custom_plugin],
        callbacks=[CB()],
    )
    trainer = Trainer(**trainer_kwargs)

    with pytest.raises(RuntimeError, match='finished plugin check'):
        trainer.fit(boring_model)