def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir):
    """
    Check that a checkpoint written by a sharded GPU run can be resumed on CPU.

    A first trainer fits on one GPU (``ddp_spawn``) and saves a checkpoint into ``tmpdir``;
    a second trainer then resumes from that file using ``ddp_cpu`` with two processes.
    """
    # Stage 1: train on GPU and persist the checkpoint.
    gpu_model = BoringModel()
    gpu_trainer = Trainer(
        gpus=1,
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )
    gpu_trainer.fit(gpu_model)

    ckpt_file = os.path.join(tmpdir, 'model.pt')
    gpu_trainer.save_checkpoint(ckpt_file)

    # Stage 2: a fresh model resumes from the saved file on CPU processes.
    cpu_model = BoringModel()
    cpu_trainer = Trainer(
        accelerator='ddp_cpu',
        num_processes=2,
        plugins=[DDPShardedPlugin()],
        resume_from_checkpoint=ckpt_file,
        fast_dev_run=True,
    )
    cpu_trainer.fit(cpu_model)
def test_ddp_sharded_plugin_resume_from_checkpoint_downsize_gpus(tmpdir):
    """
    Check that a checkpoint saved while training on two GPUs can be resumed on one GPU.

    Both runs use ``ddp_spawn`` with the sharded plugin; only the GPU count differs.
    """
    # Stage 1: fit with two GPUs and save a checkpoint into ``tmpdir``.
    two_gpu_model = BoringModel()
    two_gpu_trainer = Trainer(
        gpus=2,
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )
    two_gpu_trainer.fit(two_gpu_model)

    ckpt_file = os.path.join(tmpdir, 'model.pt')
    two_gpu_trainer.save_checkpoint(ckpt_file)

    # Stage 2: resume that checkpoint with a single GPU.
    one_gpu_model = BoringModel()
    one_gpu_trainer = Trainer(
        gpus=1,
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        resume_from_checkpoint=ckpt_file,
        fast_dev_run=True,
    )
    one_gpu_trainer.fit(one_gpu_model)
def test_ddp_choice_sharded_amp(tmpdir, ddp_backend, gpus, num_processes):
    """
    Check that 16-bit precision combined with the sharded plugin selects the sharded native AMP plugin.

    A callback inspects the trainer at ``on_fit_start`` and then aborts the run with
    ``SystemExit``, so no actual training takes place.
    """

    class CB(Callback):

        def on_fit_start(self, trainer, pl_module):
            # The DDP plugin attached to the accelerator backend must be the sharded one.
            chosen_ddp_plugin = trainer.accelerator_backend.ddp_plugin
            assert isinstance(chosen_ddp_plugin, DDPShardedPlugin)
            # The precision backend must be the sharded flavour of native AMP.
            chosen_precision_backend = trainer.precision_connector.backend
            assert isinstance(chosen_precision_backend, ShardedNativeAMPPlugin)
            # Stop here: the plugin selection is all this test cares about.
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        accelerator=ddp_backend,
        gpus=gpus,
        num_processes=num_processes,
        precision=16,
        plugins=[DDPShardedPlugin()],
        callbacks=[CB()],
        fast_dev_run=True,
    )

    # The callback's SystemExit is the expected way out of ``fit``.
    with pytest.raises(SystemExit):
        trainer.fit(model)
def test_ddp_sharded_plugin_correctness_one_gpu():
    """
    Run the plugin parity check for the sharded plugin on one GPU with ``ddp_spawn``.
    """
    parity_kwargs = dict(
        gpus=1,
        accelerator='ddp_spawn',
        plugin=DDPShardedPlugin(),
        model_cls=SeedTrainLoaderModel,
    )
    plugin_parity_test(**parity_kwargs)
def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None):
    """
    Run the plugin parity check for the sharded plugin using settings taken from ``args``.

    ``args`` must expose ``gpus``, ``precision`` and ``accelerator`` attributes.
    NOTE(review): the ``None`` default is dereferenced unconditionally, so ``args`` is
    presumably injected by a launcher/decorator not visible here -- confirm.
    """
    parity_kwargs = dict(
        gpus=args.gpus,
        precision=args.precision,
        accelerator=args.accelerator,
        plugin=DDPShardedPlugin(),
        model_cls=SeedTrainLoaderModel,
    )
    plugin_parity_test(**parity_kwargs)
def test_ddp_sharded_plugin_correctness_multi_gpu():
    """
    Run the plugin parity check for the sharded plugin on two GPUs with ``ddp_spawn``.
    """
    # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
    allowed_speed_diff = 0.25
    parity_kwargs = dict(
        gpus=2,
        accelerator='ddp_spawn',
        plugin=DDPShardedPlugin(),
        model_cls=SeedTrainLoaderModel,
        max_percent_speed_diff=allowed_speed_diff,
    )
    plugin_parity_test(**parity_kwargs)
def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir):
    """
    Run the plugin parity check on two GPUs with the manual-optimization model
    (multiple optimizers across multiple GPUs).
    """
    # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
    allowed_speed_diff = 0.25
    parity_kwargs = dict(
        plugin=DDPShardedPlugin(),
        gpus=2,
        accelerator='ddp_spawn',
        model_cls=SeedTrainLoaderManualModel,
        max_percent_speed_diff=allowed_speed_diff,
    )
    plugin_parity_test(**parity_kwargs)
def test_ddp_sharded_plugin_test_multigpu(tmpdir):
    """
    Check that ``Trainer.test`` can be called without a prior ``fit`` when the
    sharded plugin is used on two GPUs with ``ddp_spawn``.
    """
    boring_model = BoringModel()
    trainer_kwargs = dict(
        accelerator='ddp_spawn',
        gpus=2,
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )
    Trainer(**trainer_kwargs).test(boring_model)
def test_ddp_sharded_plugin_test(tmpdir):
    """
    Check that ``Trainer.test`` can be called without a prior ``fit`` when the
    sharded plugin is used with ``ddp_cpu`` and two processes.
    """
    boring_model = BoringModel()
    trainer_kwargs = dict(
        accelerator='ddp_cpu',
        num_processes=2,
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )
    Trainer(**trainer_kwargs).test(boring_model)
def test_invalid_apex_sharded(tmpdir):
    """
    Check that combining the sharded plugin with the Apex AMP backend is rejected
    with a ``MisconfigurationException``.
    """
    model = BoringModel()
    expect_misconfiguration = pytest.raises(
        MisconfigurationException,
        match='Sharded Plugin is not supported with Apex AMP',
    )

    # Construction and fitting both sit inside the context, so the error may
    # surface from either step.
    with expect_misconfiguration:
        apex_trainer = Trainer(
            accelerator='ddp_spawn',
            plugins=[DDPShardedPlugin()],
            precision=16,
            amp_backend='apex',
            fast_dev_run=True,
        )
        apex_trainer.fit(model)
def test_ddp_sharded_plugin_finetune(tmpdir):
    """
    Simulate fine-tuning: train with the sharded plugin on two GPUs, save a checkpoint,
    reload the model from it and fit again with a default trainer.
    """
    # Initial sharded training run.
    base_model = BoringModel()
    sharded_trainer = Trainer(
        accelerator='ddp_spawn',
        gpus=2,
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )
    sharded_trainer.fit(base_model)

    # Round-trip the weights through a checkpoint file in ``tmpdir``.
    ckpt_file = os.path.join(tmpdir, 'model.pt')
    sharded_trainer.save_checkpoint(ckpt_file)
    reloaded_model = BoringModel.load_from_checkpoint(ckpt_file)

    # Continue training the reloaded model without any plugin configured.
    finetune_trainer = Trainer(fast_dev_run=True)
    finetune_trainer.fit(reloaded_model)
def test_ddp_sharded_plugin_checkpoint_multi_gpu(tmpdir):
    """
    Test to ensure that checkpoint is saved correctly when using multiple GPUs.

    Trains a ``BoringModel`` with the sharded plugin on two GPUs (``ddp_spawn``),
    saves a checkpoint into ``tmpdir``, reloads the model from it and checks that
    the reloaded parameters match the trained ones exactly.
    """
    model = BoringModel()
    trainer = Trainer(
        gpus=2,
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )

    trainer.fit(model)

    checkpoint_path = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(checkpoint_path)
    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

    trained_params = list(model.parameters())
    loaded_params = list(saved_model.parameters())

    # zip() stops at the shorter input, so without this check a reloaded model
    # with missing (or extra) parameters would silently pass the loop below.
    assert len(trained_params) == len(loaded_params)

    # Assert model parameters are identical after loading
    for ddp_param, shard_param in zip(trained_params, loaded_params):
        assert torch.equal(ddp_param, shard_param)