Example #1
def test_manual_poptorch_opts_train_grad_accum(tmpdir):
    """
    Ensure if the user passes manual poptorch Options
    and grad accumulation differs to accumulate_grad_batches, we
    """

    model = IPUModel()
    inference_opts = poptorch.Options()
    inference_opts.Training.gradientAccumulation(1)

    training_opts = poptorch.Options()
    training_opts.Training.gradientAccumulation(2)

    trainer = Trainer(default_root_dir=tmpdir,
                      ipus=1,
                      fast_dev_run=True,
                      accumulate_grad_batches=1,
                      plugins=IPUPlugin(inference_opts=inference_opts,
                                        training_opts=training_opts))
    with pytest.warns(
            UserWarning,
            match=f"Training poptorch.Options set gradientAccumulation to {2}. "
            f"This is different to accumulate_grad_batches which was set to {1}. "
            f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
            f"Setting poptorch.Options gradientAccumulation to {1}",
    ):
        trainer.fit(model)
        assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
        assert trainer.accelerator.training_type_plugin.training_opts.Training.gradient_accumulation == 1
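
These examples appear to be excerpted from PyTorch Lightning's IPU test suite and omit their imports. A plausible preamble for running them is sketched below; the exact module paths vary with the Lightning version, and IPUModel is assumed to be a BoringModel variant defined elsewhere in the test module.

import os

import pytest
import torch

import poptorch
from pytorch_lightning import Callback, LightningModule, Trainer
from pytorch_lightning.accelerators import IPUAccelerator
from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin
from pytorch_lightning.trainer.states import RunningStage, TrainerFn
from pytorch_lightning.trainer.supporters import CombinedLoader
from pytorch_lightning.utilities import DeviceType

# Assumed test-suite helpers: BoringModel ships with the Lightning test helpers,
# and IPUModel is a small LightningModule defined alongside these tests.
from tests.helpers.boring_model import BoringModel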
Example #2
def test_replication_factor(tmpdir):
    """Ensure if the user passes manual poptorch Options with custom parameters set, we set them correctly in the
    dataloaders."""

    plugin = IPUPlugin()
    trainer = Trainer(ipus=2, default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin)
    assert trainer.ipus == 2
Example #3
def test_manual_poptorch_opts_inference_grad_accum(tmpdir):
    """
    Ensure if the user passes manual poptorch Options
    and grad accumulation is set greater than 1 for inference, we warn and set to 1.
    """

    model = IPUModel()
    inference_opts = poptorch.Options()
    inference_opts.Training.gradientAccumulation(4)

    training_opts = poptorch.Options()
    training_opts.Training.gradientAccumulation(1)

    trainer = Trainer(default_root_dir=tmpdir,
                      ipus=1,
                      fast_dev_run=True,
                      plugins=IPUPlugin(inference_opts=inference_opts,
                                        training_opts=training_opts))
    with pytest.warns(
            UserWarning,
            match=
            "Inference poptorch.Options should set gradientAccumulation to 1. "
            "Setting gradientAccumulation to 1 for inference options.",
    ):
        trainer.fit(model)
        assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
        assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
Example #4
def test_manual_poptorch_opts_ipu_count(tmpdir):
    """
    Ensure if the user passes manual poptorch Options
    and the number of ipus do not match, we warn and we set it for the user.
    """

    manual_ipus = 1
    expected_ipus = 2
    model = IPUModel()
    inference_opts = poptorch.Options()
    inference_opts.replicationFactor(manual_ipus)

    training_opts = poptorch.Options()
    training_opts.replicationFactor(manual_ipus)

    trainer = Trainer(default_root_dir=tmpdir,
                      ipus=expected_ipus,
                      fast_dev_run=True,
                      plugins=IPUPlugin(inference_opts=inference_opts,
                                        training_opts=training_opts))
    with pytest.warns(
            UserWarning,
            match=
            f"Manual poptorch.Options set replicationFactor to {manual_ipus} "
            f"which differs to the ipus={expected_ipus} flag passed to the Trainer. "
            f"Setting to {expected_ipus} in the poptorch.Options."):
        trainer.fit(model)
        assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
        assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2
        assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2
Example #5
def test_manual_poptorch_opts_custom(tmpdir):
    """
    Ensure if the user passes manual poptorch Options with custom parameters set,
    we respect them in our poptorch options and the dataloaders.
    """

    model = IPUModel()
    training_opts = poptorch.Options()
    training_opts.deviceIterations(8)
    training_opts.replicationFactor(2)
    training_opts.Training.gradientAccumulation(2)

    inference_opts = poptorch.Options()
    inference_opts.deviceIterations(16)
    inference_opts.replicationFactor(1)
    inference_opts.Training.gradientAccumulation(1)

    class TestCallback(Callback):
        def on_fit_end(self, trainer: Trainer,
                       pl_module: LightningModule) -> None:
            # ensure dataloaders were correctly set up during training.
            plugin = trainer.accelerator.training_type_plugin
            assert isinstance(plugin, IPUPlugin)
            assert plugin.training_opts.replication_factor == 2
            assert plugin.inference_opts.replication_factor == 1

            val_dataloader = trainer.val_dataloaders[0]
            train_dataloader = trainer.train_dataloader
            assert isinstance(train_dataloader, CombinedLoader)
            train_dataloader = train_dataloader.loaders
            assert isinstance(val_dataloader, poptorch.DataLoader)
            assert isinstance(train_dataloader, poptorch.DataLoader)
            assert train_dataloader.options.replication_factor == 2
            assert val_dataloader.options.replication_factor == 1

    plugin = IPUPlugin(inference_opts=inference_opts,
                       training_opts=training_opts)
    # ensure we default to the training options replication factor
    assert plugin.replication_factor == 2
    trainer = Trainer(default_root_dir=tmpdir,
                      fast_dev_run=True,
                      plugins=plugin,
                      callbacks=TestCallback())
    trainer.fit(model)

    plugin = trainer.accelerator.training_type_plugin
    assert isinstance(plugin, IPUPlugin)

    training_opts = plugin.training_opts
    assert training_opts.device_iterations == 8
    assert training_opts.replication_factor == 2
    assert training_opts.Training.gradient_accumulation == 2

    inference_opts = plugin.inference_opts
    assert inference_opts.device_iterations == 16
    assert inference_opts.replication_factor == 1
    assert inference_opts.Training.gradient_accumulation == 1
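
A side note on the option accessors used above (a sketch based on what these tests exercise, not Graphcore documentation): poptorch.Options exposes camelCase setters such as deviceIterations(), replicationFactor() and Training.gradientAccumulation(), while the values are read back through snake_case attributes, or via toDict() as in example #9.

opts = poptorch.Options()
opts.deviceIterations(8)                # camelCase setters configure the options
opts.replicationFactor(2)
opts.Training.gradientAccumulation(2)

# The tests read the same values back as snake_case attributes.
assert opts.device_iterations == 8
assert opts.replication_factor == 2
assert opts.Training.gradient_accumulation == 2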
Example #6
def test_replication_factor(tmpdir):
    """Ensure if the user passes manual poptorch Options with custom parameters set, we set them correctly in the
    dataloaders."""

    plugin = IPUPlugin()
    trainer = Trainer(ipus=2,
                      default_root_dir=tmpdir,
                      fast_dev_run=True,
                      strategy=plugin)
    assert trainer.ipus == 2
    assert trainer.training_type_plugin.replication_factor == 2

    model = BoringModel()
    training_opts = poptorch.Options()
    inference_opts = poptorch.Options()
    training_opts.replicationFactor(8)
    inference_opts.replicationFactor(7)
    plugin = IPUPlugin(inference_opts=inference_opts,
                       training_opts=training_opts)

    trainer = Trainer(default_root_dir=tmpdir, ipus=1, strategy=plugin)
    trainer.optimizers = model.configure_optimizers()[0]
    plugin.model = model
    model.trainer = trainer
    trainer.state.fn = TrainerFn.FITTING
    trainer.training_type_plugin.pre_dispatch()

    trainer.state.stage = RunningStage.TRAINING
    assert trainer.training_type_plugin.replication_factor == 8
    trainer.state.stage = RunningStage.VALIDATING
    assert trainer.training_type_plugin.replication_factor == 7

    for fn, stage in (
        (TrainerFn.VALIDATING, RunningStage.VALIDATING),
        (TrainerFn.TESTING, RunningStage.TESTING),
        (TrainerFn.PREDICTING, RunningStage.PREDICTING),
    ):
        trainer.state.fn = fn
        trainer.state.stage = stage
        trainer.training_type_plugin.pre_dispatch()
        assert trainer.training_type_plugin.replication_factor == 7
Example #7
def test_autoreport(tmpdir):
    """Ensure autoreport dumps to a file."""
    model = IPUModel()
    autoreport_path = os.path.join(tmpdir, 'report/')
    trainer = Trainer(default_root_dir=tmpdir,
                      ipus=1,
                      fast_dev_run=True,
                      plugins=IPUPlugin(autoreport=True,
                                        autoreport_dir=autoreport_path))
    trainer.fit(model)
    assert os.path.exists(autoreport_path)
    assert os.path.isfile(autoreport_path + 'profile.pop')
Example #8
def test_manual_poptorch_opts(tmpdir):
    """Ensure if the user passes manual poptorch Options, we run with the correct object."""
    model = IPUModel()
    inference_opts = poptorch.Options()
    training_opts = poptorch.Options()

    trainer = Trainer(default_root_dir=tmpdir,
                      ipus=1,
                      fast_dev_run=True,
                      plugins=IPUPlugin(inference_opts=inference_opts,
                                        training_opts=training_opts))
    trainer.fit(model)

    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
    assert trainer.accelerator.training_type_plugin.training_opts == training_opts
    assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts
Example #9
def test_device_iterations_ipu_plugin(tmpdir):
    class TestCallback(Callback):
        def on_train_start(self, trainer: Trainer,
                           pl_module: LightningModule) -> None:
            assert trainer.accelerator.training_type_plugin.device_iterations == 2
            # assert device iterations has been set correctly within the poptorch options
            poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[
                RunningStage.TRAINING]
            assert poptorch_model._options.toDict()['device_iterations'] == 2
            raise SystemExit

    model = IPUModel()
    trainer = Trainer(default_root_dir=tmpdir,
                      fast_dev_run=True,
                      ipus=1,
                      plugins=IPUPlugin(device_iterations=2),
                      callbacks=TestCallback())
    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
    with pytest.raises(SystemExit):
        trainer.fit(model)
Example #10
def test_poptorch_models_at_different_stages(tmpdir):
    plugin = IPUPlugin()
    trainer = Trainer(default_root_dir=tmpdir, strategy=plugin, ipus=8)
    model = BoringModel()
    model.trainer = trainer
    plugin.model = model

    trainer.optimizers = model.configure_optimizers()[0]
    trainer.state.fn = TrainerFn.FITTING
    trainer.training_type_plugin.pre_dispatch()
    assert list(trainer.training_type_plugin.poptorch_models) == [
        RunningStage.TRAINING, RunningStage.VALIDATING
    ]

    for fn, stage in (
        (TrainerFn.VALIDATING, RunningStage.VALIDATING),
        (TrainerFn.TESTING, RunningStage.TESTING),
        (TrainerFn.PREDICTING, RunningStage.PREDICTING),
    ):
        trainer.state.fn = fn
        trainer.state.stage = stage
        trainer.training_type_plugin.pre_dispatch()
        assert list(trainer.training_type_plugin.poptorch_models) == [stage]
Example #11
def test_pure_half_precision(tmpdir):
    class TestCallback(Callback):
        def on_train_start(self, trainer: Trainer,
                           pl_module: LightningModule) -> None:
            assert trainer.accelerator.model.precision == 16
            assert trainer.accelerator.training_type_plugin.convert_model_to_half
            for param in trainer.accelerator.model.parameters():
                assert param.dtype == torch.float16
            raise SystemExit

    model = IPUModel()
    trainer = Trainer(default_root_dir=tmpdir,
                      fast_dev_run=True,
                      ipus=1,
                      precision=16,
                      plugins=IPUPlugin(convert_model_to_half=True),
                      callbacks=TestCallback())

    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
    assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin)
    assert trainer.accelerator.precision_plugin.precision == 16

    with pytest.raises(SystemExit):
        trainer.fit(model)
Example #12
    def select_training_type_plugin(self) -> TrainingTypePlugin:
        if isinstance(
            self.distributed_backend, Accelerator
        ) and self.distributed_backend.training_type_plugin is not None:
            plugin = self.distributed_backend.training_type_plugin
        elif self.use_ddp2:
            plugin = DDP2Plugin(
                parallel_devices=self.parallel_devices,
                cluster_environment=self.cluster_environment,
            )
        elif self.use_ddp and self.use_deepspeed:
            plugin = DeepSpeedPlugin(
                cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
            )
        elif self.use_ddp:
            use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
            use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.is_using_torchelastic()
            use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.is_using_kubeflow()
            use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
            use_ddp_cpu_spawn = self.use_ddp and self.on_cpu
            use_tpu_spawn = self.on_tpu and self._distrib_type == DistributedType.TPU_SPAWN
            use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.is_using_torchelastic()
            use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.is_using_kubeflow()
            use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
            use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
            use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN
            use_ddp_fully_sharded = self._distrib_type == DistributedType.DDP_FULLY_SHARDED

            # TODO: decouple from TE
            # ddp script mode uses the same flags as TE
            if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
                use_torchelastic_ddp = False

            if use_tpu_spawn:
                ddp_plugin_cls = TPUSpawnPlugin
            elif use_ddp_sharded:
                ddp_plugin_cls = DDPShardedPlugin
            elif use_ddp_sharded_spawn:
                ddp_plugin_cls = DDPSpawnShardedPlugin
            elif (
                use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp
                or use_kubeflow_ddp or use_ddp_cpu_kubeflow
            ):
                ddp_plugin_cls = DDPPlugin
            elif use_ddp_spawn or use_ddp_cpu_spawn:
                ddp_plugin_cls = DDPSpawnPlugin
            elif use_ddp_fully_sharded:
                ddp_plugin_cls = DDPFullyShardedPlugin
            else:
                ddp_plugin_cls = DDPPlugin

            plugin = ddp_plugin_cls(
                parallel_devices=self.parallel_devices,
                cluster_environment=self.cluster_environment,
            )
        elif self.use_dp:
            plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
        elif self.use_horovod:
            plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
        elif self.on_tpu and isinstance(self.tpu_cores, list):
            plugin = SingleTPUPlugin(self.tpu_id)
        elif self.on_ipu:
            plugin = IPUPlugin(parallel_devices=self.parallel_devices)
        else:
            single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
            plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu"))
        return plugin
Example #13
def test_no_warning_plugin(tmpdir):
    with pytest.warns(None) as record:
        Trainer(default_root_dir=tmpdir,
                plugins=IPUPlugin(training_opts=poptorch.Options()))
    assert len(record) == 0
Example #14
def test_device_type_when_training_plugin_ipu_passed(tmpdir):

    trainer = Trainer(strategy=IPUPlugin(), ipus=8)
    assert isinstance(trainer.training_type_plugin, IPUPlugin)
    assert trainer._device_type == DeviceType.IPU
    assert isinstance(trainer.accelerator, IPUAccelerator)
Example #15
def test_strategy_choice_ipu_plugin(tmpdir):
    trainer = Trainer(strategy=IPUPlugin(), accelerator="ipu", devices=8)
    assert isinstance(trainer.training_type_plugin, IPUPlugin)
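
Note that examples #14 and #15 use the newer Trainer arguments (strategy=, accelerator=, devices=), whereas the earlier snippets use plugins= together with the ipus= flag; which spelling is accepted depends on the Lightning version. A rough equivalence, assuming an IPU-enabled environment:

# Older spelling used in most of the snippets above:
trainer = Trainer(ipus=8, plugins=IPUPlugin())

# Newer spelling used in examples #14 and #15:
trainer = Trainer(accelerator="ipu", devices=8, strategy=IPUPlugin())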