def test_manual_poptorch_opts_train_grad_accum(tmpdir):
    """
    Ensure that if the user passes manual poptorch Options whose gradient accumulation differs
    from ``accumulate_grad_batches``, we warn and override the poptorch value.
    """
    model = IPUModel()
    inference_opts = poptorch.Options()
    inference_opts.Training.gradientAccumulation(1)
    training_opts = poptorch.Options()
    training_opts.Training.gradientAccumulation(2)

    trainer = Trainer(
        default_root_dir=tmpdir,
        ipus=1,
        fast_dev_run=True,
        accumulate_grad_batches=1,
        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts),
    )
    with pytest.warns(
        UserWarning,
        match="Training poptorch.Options set gradientAccumulation to 2. "
        "This is different to accumulate_grad_batches which was set to 1. "
        "To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
        "Setting poptorch.Options gradientAccumulation to 1",
    ):
        trainer.fit(model)
    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
    assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1

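# The tests in this file reference an `IPUModel` helper that is not defined in this
# excerpt. A minimal sketch of what such a helper could look like, assuming it is a
# plain BoringModel variant whose steps simply return the raw loss without logging
# (the exact definition in the original test suite may differ):
class IPUModel(BoringModel):

    def training_step(self, batch, batch_idx):
        output = self(batch)
        return self.loss(batch, output)

    def validation_step(self, batch, batch_idx):
        output = self(batch)
        return self.loss(batch, output)

    def test_step(self, batch, batch_idx):
        output = self(batch)
        return self.loss(batch, output)
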
def test_manual_poptorch_opts_inference_grad_accum(tmpdir):
    """
    Ensure that if the user passes manual poptorch Options and grad accumulation is set
    greater than 1 for inference, we warn and set it to 1.
    """
    model = IPUModel()
    inference_opts = poptorch.Options()
    inference_opts.Training.gradientAccumulation(4)
    training_opts = poptorch.Options()
    training_opts.Training.gradientAccumulation(1)

    trainer = Trainer(
        default_root_dir=tmpdir,
        ipus=1,
        fast_dev_run=True,
        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts),
    )
    with pytest.warns(
        UserWarning,
        match="Inference poptorch.Options should set gradientAccumulation to 1. "
        "Setting gradientAccumulation to 1 for inference options.",
    ):
        trainer.fit(model)
    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
    assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1

def test_manual_poptorch_opts_ipu_count(tmpdir):
    """
    Ensure that if the user passes manual poptorch Options and the number of IPUs does not match
    the Trainer flag, we warn and set it for the user.
    """
    manual_ipus = 1
    expected_ipus = 2
    model = IPUModel()
    inference_opts = poptorch.Options()
    inference_opts.replicationFactor(manual_ipus)
    training_opts = poptorch.Options()
    training_opts.replicationFactor(manual_ipus)

    trainer = Trainer(
        default_root_dir=tmpdir,
        ipus=expected_ipus,
        fast_dev_run=True,
        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts),
    )
    with pytest.warns(
        UserWarning,
        match=f"Manual poptorch.Options set replicationFactor to {manual_ipus} "
        f"which differs to the ipus={expected_ipus} flag passed to the Trainer. "
        f"Setting to {expected_ipus} in the poptorch.Options.",
    ):
        trainer.fit(model)
    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
    assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2
    assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2

def test_manual_poptorch_opts_custom(tmpdir):
    """
    Ensure that if the user passes manual poptorch Options with custom parameters set,
    we respect them in our poptorch options and the dataloaders.
    """
    model = IPUModel()
    training_opts = poptorch.Options()
    training_opts.deviceIterations(8)
    training_opts.replicationFactor(2)
    training_opts.Training.gradientAccumulation(2)

    inference_opts = poptorch.Options()
    inference_opts.deviceIterations(16)
    inference_opts.replicationFactor(1)
    inference_opts.Training.gradientAccumulation(1)

    class TestCallback(Callback):

        def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
            # ensure the dataloaders were correctly set up during training.
            plugin = trainer.accelerator.training_type_plugin
            assert isinstance(plugin, IPUPlugin)
            assert plugin.training_opts.replication_factor == 2
            assert plugin.inference_opts.replication_factor == 1

            val_dataloader = trainer.val_dataloaders[0]
            train_dataloader = trainer.train_dataloader
            assert isinstance(train_dataloader, CombinedLoader)
            train_dataloader = train_dataloader.loaders
            assert isinstance(val_dataloader, poptorch.DataLoader)
            assert isinstance(train_dataloader, poptorch.DataLoader)
            assert train_dataloader.options.replication_factor == 2
            assert val_dataloader.options.replication_factor == 1

    plugin = IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
    # ensure we default to the training options' replication factor
    assert plugin.replication_factor == 2

    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin, callbacks=TestCallback())
    trainer.fit(model)

    plugin = trainer.accelerator.training_type_plugin
    assert isinstance(plugin, IPUPlugin)

    training_opts = plugin.training_opts
    assert training_opts.device_iterations == 8
    assert training_opts.replication_factor == 2
    assert training_opts.Training.gradient_accumulation == 2

    inference_opts = plugin.inference_opts
    assert inference_opts.device_iterations == 16
    assert inference_opts.replication_factor == 1
    assert inference_opts.Training.gradient_accumulation == 1

def test_replication_factor(tmpdir):
    """
    Ensure the replication factor is derived from the Trainer flag or from manual poptorch Options,
    and is reported correctly for each running stage.
    """
    plugin = IPUPlugin()
    trainer = Trainer(ipus=2, default_root_dir=tmpdir, fast_dev_run=True, strategy=plugin)
    assert trainer.ipus == 2
    assert trainer.training_type_plugin.replication_factor == 2

    model = BoringModel()
    training_opts = poptorch.Options()
    inference_opts = poptorch.Options()
    training_opts.replicationFactor(8)
    inference_opts.replicationFactor(7)
    plugin = IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)

    trainer = Trainer(default_root_dir=tmpdir, ipus=1, strategy=plugin)
    trainer.optimizers = model.configure_optimizers()[0]
    plugin.model = model
    model.trainer = trainer
    trainer.state.fn = TrainerFn.FITTING
    trainer.training_type_plugin.pre_dispatch()

    trainer.state.stage = RunningStage.TRAINING
    assert trainer.training_type_plugin.replication_factor == 8
    trainer.state.stage = RunningStage.VALIDATING
    assert trainer.training_type_plugin.replication_factor == 7

    for fn, stage in (
        (TrainerFn.VALIDATING, RunningStage.VALIDATING),
        (TrainerFn.TESTING, RunningStage.TESTING),
        (TrainerFn.PREDICTING, RunningStage.PREDICTING),
    ):
        trainer.state.fn = fn
        trainer.state.stage = stage
        trainer.training_type_plugin.pre_dispatch()
        assert trainer.training_type_plugin.replication_factor == 7

def test_autoreport(tmpdir):
    """Ensure autoreport dumps to a file."""
    model = IPUModel()
    autoreport_path = os.path.join(tmpdir, 'report/')
    trainer = Trainer(
        default_root_dir=tmpdir,
        ipus=1,
        fast_dev_run=True,
        plugins=IPUPlugin(autoreport=True, autoreport_dir=autoreport_path),
    )
    trainer.fit(model)
    assert os.path.exists(autoreport_path)
    assert os.path.isfile(autoreport_path + 'profile.pop')

def test_manual_poptorch_opts(tmpdir):
    """Ensure if the user passes manual poptorch Options, we run with the correct object."""
    model = IPUModel()
    inference_opts = poptorch.Options()
    training_opts = poptorch.Options()

    trainer = Trainer(
        default_root_dir=tmpdir,
        ipus=1,
        fast_dev_run=True,
        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts),
    )
    trainer.fit(model)

    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
    assert trainer.accelerator.training_type_plugin.training_opts == training_opts
    assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts

def test_device_iterations_ipu_plugin(tmpdir):

    class TestCallback(Callback):

        def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
            assert trainer.accelerator.training_type_plugin.device_iterations == 2
            # assert device iterations has been set correctly within the poptorch options
            poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[RunningStage.TRAINING]
            assert poptorch_model._options.toDict()['device_iterations'] == 2
            raise SystemExit

    model = IPUModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        ipus=1,
        plugins=IPUPlugin(device_iterations=2),
        callbacks=TestCallback(),
    )
    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
    with pytest.raises(SystemExit):
        trainer.fit(model)

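# For orientation: `device_iterations` is one of several multipliers poptorch folds into
# how many samples each host step consumes. A rough sketch of that arithmetic
# (the helper below is illustrative only and not part of the API under test):
def _samples_per_host_step(batch_size, device_iterations, replication_factor=1, gradient_accumulation=1):
    """E.g. batch_size=4 with device_iterations=2 consumes 8 samples per host step."""
    return batch_size * device_iterations * replication_factor * gradient_accumulation
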
def test_poptorch_models_at_different_stages(tmpdir):
    plugin = IPUPlugin()
    trainer = Trainer(default_root_dir=tmpdir, strategy=plugin, ipus=8)
    model = BoringModel()
    model.trainer = trainer
    plugin.model = model

    trainer.optimizers = model.configure_optimizers()[0]
    trainer.state.fn = TrainerFn.FITTING
    trainer.training_type_plugin.pre_dispatch()
    assert list(trainer.training_type_plugin.poptorch_models) == [RunningStage.TRAINING, RunningStage.VALIDATING]

    for fn, stage in (
        (TrainerFn.VALIDATING, RunningStage.VALIDATING),
        (TrainerFn.TESTING, RunningStage.TESTING),
        (TrainerFn.PREDICTING, RunningStage.PREDICTING),
    ):
        trainer.state.fn = fn
        trainer.state.stage = stage
        trainer.training_type_plugin.pre_dispatch()
        assert list(trainer.training_type_plugin.poptorch_models) == [stage]

def test_pure_half_precision(tmpdir):

    class TestCallback(Callback):

        def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
            assert trainer.accelerator.model.precision == 16
            assert trainer.accelerator.training_type_plugin.convert_model_to_half
            for param in trainer.accelerator.model.parameters():
                assert param.dtype == torch.float16
            raise SystemExit

    model = IPUModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        ipus=1,
        precision=16,
        plugins=IPUPlugin(convert_model_to_half=True),
        callbacks=TestCallback(),
    )

    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
    assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin)
    assert trainer.accelerator.precision_plugin.precision == 16

    with pytest.raises(SystemExit):
        trainer.fit(model)

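# For orientation: `convert_model_to_half=True` amounts to casting the module's weights
# to float16 up front (roughly equivalent to calling `module.half()`), which is why the
# callback above can assert fp16 parameter dtypes. A standalone sketch of that
# expectation, independent of the plugin:
def test_half_cast_sketch():
    model = IPUModel().half()
    assert all(param.dtype == torch.float16 for param in model.parameters())
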
def select_training_type_plugin(self) -> TrainingTypePlugin:
    if (
        isinstance(self.distributed_backend, Accelerator)
        and self.distributed_backend.training_type_plugin is not None
    ):
        plugin = self.distributed_backend.training_type_plugin
    elif self.use_ddp2:
        plugin = DDP2Plugin(
            parallel_devices=self.parallel_devices,
            cluster_environment=self.cluster_environment,
        )
    elif self.use_ddp and self.use_deepspeed:
        plugin = DeepSpeedPlugin(
            cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
        )
    elif self.use_ddp:
        use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
        use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.is_using_torchelastic()
        use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.is_using_kubeflow()
        use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
        use_ddp_cpu_spawn = self.use_ddp and self.on_cpu
        use_tpu_spawn = self.on_tpu and self._distrib_type == DistributedType.TPU_SPAWN
        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.is_using_torchelastic()
        use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.is_using_kubeflow()
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
        use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
        use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN
        use_ddp_fully_sharded = self._distrib_type == DistributedType.DDP_FULLY_SHARDED

        # TODO: decouple from TE
        # ddp script mode uses the same flags as TE
        if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
            use_torchelastic_ddp = False

        if use_tpu_spawn:
            ddp_plugin_cls = TPUSpawnPlugin
        elif use_ddp_sharded:
            ddp_plugin_cls = DDPShardedPlugin
        elif use_ddp_sharded_spawn:
            ddp_plugin_cls = DDPSpawnShardedPlugin
        elif (
            use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp
            or use_kubeflow_ddp or use_ddp_cpu_kubeflow
        ):
            ddp_plugin_cls = DDPPlugin
        elif use_ddp_spawn or use_ddp_cpu_spawn:
            ddp_plugin_cls = DDPSpawnPlugin
        elif use_ddp_fully_sharded:
            ddp_plugin_cls = DDPFullyShardedPlugin
        else:
            ddp_plugin_cls = DDPPlugin

        plugin = ddp_plugin_cls(
            parallel_devices=self.parallel_devices,
            cluster_environment=self.cluster_environment,
        )
    elif self.use_dp:
        plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
    elif self.use_horovod:
        plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
    elif self.on_tpu and isinstance(self.tpu_cores, list):
        plugin = SingleTPUPlugin(self.tpu_id)
    elif self.on_ipu:
        plugin = IPUPlugin(parallel_devices=self.parallel_devices)
    else:
        single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu"))
    return plugin

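# Illustration of how a few common Trainer configurations resolve through the branches
# above (a sketch assuming default, non-cluster environments; not exhaustive):
#
#   Trainer(ipus=8)                      -> IPUPlugin
#   Trainer(tpu_cores=8)                 -> TPUSpawnPlugin
#   Trainer(gpus=2, accelerator="ddp")   -> DDPPlugin
#   Trainer()                            -> SingleDevicePlugin(torch.device("cpu"))
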
def test_no_warning_plugin(tmpdir):
    with pytest.warns(None) as record:
        Trainer(default_root_dir=tmpdir, plugins=IPUPlugin(training_opts=poptorch.Options()))
    assert len(record) == 0

def test_device_type_when_training_plugin_ipu_passed(tmpdir):
    trainer = Trainer(strategy=IPUPlugin(), ipus=8)
    assert isinstance(trainer.training_type_plugin, IPUPlugin)
    assert trainer._device_type == DeviceType.IPU
    assert isinstance(trainer.accelerator, IPUAccelerator)

def test_strategy_choice_ipu_plugin(tmpdir):
    trainer = Trainer(strategy=IPUPlugin(), accelerator="ipu", devices=8)
    assert isinstance(trainer.training_type_plugin, IPUPlugin)