Example #1
    def reset_train_dataloader(self,
                               model: Optional["pl.LightningModule"] = None
                               ) -> None:
        """Resets the train dataloader and initialises required variables (number of batches, when to validate,
        etc.).

        Args:
            model: The `LightningModule` if calling this outside of the trainer scope.
        """
        self.train_dataloader = self.request_dataloader(RunningStage.TRAINING,
                                                        model=model)

        if self.overfit_batches > 0:
            if hasattr(self.train_dataloader, "sampler") and isinstance(
                    self.train_dataloader.sampler, RandomSampler):
                rank_zero_warn(
                    "You requested to overfit but enabled training dataloader shuffling."
                    " We are turning off the training dataloader shuffling for you."
                )
                self.train_dataloader = self.replace_sampler(
                    self.train_dataloader,
                    SequentialSampler(self.train_dataloader.dataset),
                    mode=RunningStage.TRAINING)

        # debugging
        self.dev_debugger.track_load_dataloader_call(
            "train_dataloader", dataloaders=[self.train_dataloader])

        # automatically add samplers
        self.train_dataloader = apply_to_collection(self.train_dataloader,
                                                    DataLoader,
                                                    self.auto_add_sampler,
                                                    shuffle=True,
                                                    mode=RunningStage.TRAINING)

        # check the workers recursively
        apply_to_collection(self.train_dataloader, DataLoader,
                            self._worker_check, "train_dataloader")

        # add worker_init_fn for correct seeding in worker processes
        apply_to_collection(self.train_dataloader, DataLoader,
                            self.auto_add_worker_init_fn)

        # add collate_fn to collect metadata for fault tolerant training
        if _fault_tolerant_training():
            apply_to_collection(self.train_dataloader, DataLoader,
                                self._add_sampler_metadata_collate)

        # wrap the sequence of train loaders to a CombinedLoader object for computing the num_training_batches
        self.train_dataloader = CombinedLoader(
            self.train_dataloader,
            self.data_connector.multiple_trainloader_mode)

        self.num_training_batches = len(self.train_dataloader) if has_len(
            self.train_dataloader) else float("inf")

        if isinstance(self.limit_train_batches,
                      int) or self.limit_train_batches == 0.0:
            self.num_training_batches = min(self.num_training_batches,
                                            int(self.limit_train_batches))
        elif self.num_training_batches != float("inf"):
            self.num_training_batches = int(self.num_training_batches *
                                            self.limit_train_batches)
        elif self.limit_train_batches != 1.0:
            raise MisconfigurationException(
                "When using an IterableDataset for `limit_train_batches`,"
                " `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies"
                " `num_training_batches` to use.")

        # determine when to check validation
        # if int passed in, val checks that often
        # otherwise, it checks in [0, 1.0] % range of a training epoch
        if isinstance(self.val_check_interval, int):
            self.val_check_batch = self.val_check_interval
            if self.val_check_batch > self.num_training_batches:
                raise ValueError(
                    f"`val_check_interval` ({self.val_check_interval}) must be less than or equal "
                    f"to the number of the training batches ({self.num_training_batches}). "
                    "If you want to disable validation set `limit_val_batches` to 0.0 instead."
                )
        else:
            if not has_len(self.train_dataloader):
                if self.val_check_interval == 1.0:
                    self.val_check_batch = float("inf")
                else:
                    raise MisconfigurationException(
                        "When using an IterableDataset for `train_dataloader`,"
                        " `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies"
                        " checking validation every k training batches.")
            else:
                self.val_check_batch = int(self.num_training_batches *
                                           self.val_check_interval)
                self.val_check_batch = max(1, self.val_check_batch)

        if self.logger and self.num_training_batches < self.log_every_n_steps:
            rank_zero_warn(
                f"The number of training batches ({self.num_training_batches}) is smaller than the logging interval"
                f" Trainer(log_every_n_steps={self.log_every_n_steps}). Set a lower value for log_every_n_steps if"
                " you want to see logs for the training epoch.")
Example #2
    def _reset_eval_dataloader(
        self,
        mode: RunningStage,
        model: Optional["pl.LightningModule"] = None
    ) -> Tuple[List[Union[int, float]], List[DataLoader]]:
        """Generic method to reset a dataloader for evaluation.

        Args:
            mode: The running stage of the ``Trainer``
            model: The ``LightningModule`` if calling this outside of the trainer scope.

        Returns:
            Tuple (num_batches, dataloaders)
        """
        assert mode.evaluating or mode == RunningStage.PREDICTING

        # always get the loaders first so we can count how many there are
        loader_name = f"{mode.dataloader_prefix}_dataloader"
        dataloaders = self.request_dataloader(mode, model=model)

        if not isinstance(dataloaders, list):
            dataloaders = [dataloaders]

        # when overfitting, use the training loader as val and test
        # duplicate it the numb of times needed to match the train loaders
        if self.overfit_batches > 0:
            train_dataloader = self.request_dataloader(RunningStage.TRAINING,
                                                       model=model)
            dataloaders = [
                deepcopy(train_dataloader) for _ in range(len(dataloaders))
            ]

        self.dev_debugger.track_load_dataloader_call(loader_name,
                                                     dataloaders=dataloaders)

        for loader_i in range(len(dataloaders)):
            loader = dataloaders[loader_i]

            if hasattr(loader, "sampler") and isinstance(
                    loader.sampler, RandomSampler):

                # when overfitting, the dataloader should not have sampler
                if self.overfit_batches > 0 and mode.evaluating:
                    rank_zero_warn(
                        "You requested to overfit but enabled val/test dataloader shuffling."
                        " We are turning it off for you.")
                    dataloaders[loader_i] = self.replace_sampler(
                        loader, SequentialSampler(loader.dataset), mode=mode)
                else:
                    rank_zero_warn(
                        f"Your `{mode.dataloader_prefix}_dataloader` has `shuffle=True`,"
                        " it is strongly recommended that you turn this off for val/test/predict dataloaders."
                    )

        if any(dl is None for dl in dataloaders):
            rank_zero_warn(
                "One of given dataloaders is None and it will be skipped.")

        # add samplers
        dataloaders = [
            self.auto_add_sampler(dl, False, mode=mode) for dl in dataloaders
            if dl is not None
        ]

        # add worker_init_fn for correct seeding in worker processes
        apply_to_collection(dataloaders,
                            dtype=DataLoader,
                            function=self.auto_add_worker_init_fn)

        loader_num_batches = []

        # determine number of batches
        # datasets could be none, 1 or 2+
        if len(dataloaders) != 0:
            for i, dataloader in enumerate(dataloaders):
                num_batches = len(dataloader) if has_len(
                    dataloader) else float("inf")
                self._worker_check(dataloader,
                                   f"{mode.dataloader_prefix}_dataloader {i}")

                # percent or num_steps
                limit_eval_batches = getattr(
                    self, f"limit_{mode.dataloader_prefix}_batches")

                # limit num batches either as a percent or num steps
                if isinstance(limit_eval_batches,
                              int) or limit_eval_batches == 0.0:
                    num_batches = min(num_batches, int(limit_eval_batches))
                elif num_batches != float("inf"):
                    num_batches = int(num_batches * limit_eval_batches)
                elif limit_eval_batches != 1.0:
                    raise MisconfigurationException(
                        f"When using an IterableDataset for `limit_{mode.dataloader_prefix}_batches`,"
                        f" `Trainer(limit_{mode.dataloader_prefix}_batches)` must be `0.0`, `1.0` or an int. An int k"
                        f" specifies `num_{mode.dataloader_prefix}_batches` to use."
                    )

                if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(
                        limit_eval_batches, float):
                    min_pct = 1.0 / len(dataloader)
                    raise MisconfigurationException(
                        f"you requested to check {limit_eval_batches} of the `{mode.dataloader_prefix}_dataloader` but"
                        f" {limit_eval_batches}*{num_batches} < 1. Please increase the"
                        f" `limit_{mode.dataloader_prefix}_batches` flag. Try at least"
                        f" `limit_{mode.dataloader_prefix}_batches={min_pct}`")

                loader_num_batches.append(num_batches)

        return loader_num_batches, dataloaders
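
The `limit_*_batches` capping applied per dataloader above reduces to one rule; a small illustrative sketch with a hypothetical helper name (not Lightning API):

# Hypothetical sketch of how a `limit_*_batches` value caps a dataloader's batch count.
def apply_batch_limit(num_batches, limit):
    if isinstance(limit, int) or limit == 0.0:
        # an int (or 0.0) is an absolute cap on the number of batches
        return min(num_batches, int(limit))
    if num_batches != float("inf"):
        # a float in (0, 1] is a fraction of the available batches
        return int(num_batches * limit)
    if limit != 1.0:
        raise ValueError("with an IterableDataset, the limit must be 0.0, 1.0 or an int")
    return num_batches

assert apply_batch_limit(100, 10) == 10        # absolute cap
assert apply_batch_limit(100, 0.5) == 50       # fractional cap
assert apply_batch_limit(float("inf"), 1.0) == float("inf")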
Example #3
    def scale_batch_size(self,
                         model: LightningModule,
                         mode: str = 'power',
                         steps_per_trial: int = 3,
                         init_val: int = 2,
                         max_trials: int = 25,
                         batch_arg_name: str = 'batch_size'):
        r"""
        Will iteratively try to find the largest batch size for a given model
        that does not give an out of memory (OOM) error.

        Args:
            model: Model to fit.

            mode: string setting the search mode. Either `power` or `binsearch`.
                If mode is `power` we keep multiplying the batch size by 2, until
                we get an OOM error. If mode is 'binsearch', we will initially
                also keep multiplying by 2 and after encountering an OOM error
                do a binary search between the last successful batch size and the
                batch size that failed.

            steps_per_trial: number of steps to run with a given batch size.
                Ideally 1 should be enough to test if an OOM error occurs,
                however in practice a few are needed.

            init_val: initial batch size to start the search with

            max_trials: max number of batch size increases performed before the
                algorithm is terminated.

            batch_arg_name: name of the attribute on the model that stores the batch size.

        """
        if not hasattr(model, batch_arg_name):
            raise MisconfigurationException(
                f'Field {batch_arg_name} not found in `model.hparams`')

        if hasattr(model.train_dataloader, 'patch_loader_code'):
            raise MisconfigurationException(
                'The batch scaling feature cannot be used with dataloaders'
                ' passed directly to `.fit()`. Please disable the feature or'
                ' incorporate the dataloader into the model.')

        # Arguments we adjust during the batch size finder, save for restoring
        self.__scale_batch_dump_params()

        # Set to values that are required by the algorithm
        self.__scale_batch_reset_params(model, steps_per_trial)

        # Save initial model, that is loaded after batch size is found
        save_path = os.path.join(self.default_root_dir, 'temp_model.ckpt')
        self.save_checkpoint(str(save_path))

        if self.progress_bar_callback:
            self.progress_bar_callback.disable()

        # Initially we just double in size until an OOM is encountered
        new_size = _adjust_batch_size(
            self, value=init_val)  # initially set to init_val
        if mode == 'power':
            new_size = _run_power_scaling(self, model, new_size,
                                          batch_arg_name, max_trials)
        elif mode == 'binsearch':
            new_size = _run_binsearch_scaling(self, model, new_size,
                                              batch_arg_name, max_trials)
        else:
            raise ValueError(
                'mode in method `scale_batch_size` can only be `power` or `binsearch`'
            )

        garbage_collection_cuda()
        log.info(
            f'Finished batch size finder, will continue with full run using batch size {new_size}'
        )

        # Restore initial state of model
        self.restore(str(save_path), on_gpu=self.on_gpu)
        os.remove(save_path)

        # Finish by resetting variables so trainer is ready to fit model
        self.__scale_batch_restore_params()
        if self.progress_bar_callback:
            self.progress_bar_callback.enable()

        return new_size
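
A rough sketch of the `power` mode described in the docstring, using a stand-in probe instead of real OOM handling (not the actual `_run_power_scaling` implementation):

# Illustrative only: doubles the batch size until a (simulated) OOM, then backs off.
def power_scale(try_fit, init_val=2, max_trials=25):
    """`try_fit(batch_size)` should return True on success, False on OOM."""
    size = init_val
    for _ in range(max_trials):
        if not try_fit(size):
            # the last doubling failed: fall back to the previous successful size
            return max(init_val, size // 2)
        size *= 2
    return size

# Pretend anything above 96 samples per batch runs out of memory.
print(power_scale(lambda bs: bs <= 96))  # -> 64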
Example #4
def _validate_scheduler_optimizer(optimizers, lr_schedulers):
    if any(sch['scheduler'].optimizer not in optimizers
           for sch in lr_schedulers):
        raise MisconfigurationException(
            "Some schedulers are attached to an optimizer that wasn't returned from `configure_optimizers`."
        )
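Example #5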
    def set_distributed_mode(self):
        self.trainer.use_dp = False
        self.trainer.use_ddp = False
        self.trainer.use_ddp2 = False
        self.trainer.use_horovod = False
        self.trainer.use_single_gpu = False

        if self.trainer.distributed_backend is None:
            if self.has_horovodrun():
                self._set_horovod_backend()
            elif self.trainer.num_gpus == 0:
                if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1:
                    self.trainer.use_ddp = True  # ddp_cpu
            elif self.trainer.num_gpus == 1:
                self.trainer.use_single_gpu = True
            elif self.trainer.num_gpus > 1:
                rank_zero_warn(
                    'You requested multiple GPUs but did not specify a backend, e.g.'
                    ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`.'
                    ' Setting `accelerator="ddp_spawn"` for you.')
                self.trainer.distributed_backend = "ddp_spawn"

        if self.trainer.distributed_backend == "dp":
            # do nothing if num_gpus == 0
            if self.trainer.num_gpus == 1:
                self.trainer.use_single_gpu = True
                self.trainer.use_dp = True
            elif self.trainer.num_gpus > 1:
                self.trainer.use_dp = True

        elif self.trainer.distributed_backend in ("ddp", "ddp_spawn"):
            if self.trainer.num_gpus == 0:
                if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1:
                    self.trainer.use_ddp = True  # ddp_cpu
            elif self.trainer.num_gpus == 1:
                self.trainer.use_single_gpu = True
                self.trainer.use_ddp = True
            elif self.trainer.num_gpus > 1:
                self.trainer.use_ddp = True
                self.trainer.num_processes = self.trainer.num_gpus

        elif self.trainer.distributed_backend == "ddp2":
            # do nothing if num_gpus == 0
            if self.trainer.num_gpus >= 1:
                self.trainer.use_ddp2 = True
        elif self.trainer.distributed_backend == "ddp_cpu":
            if self.trainer.num_gpus > 0:
                rank_zero_warn(
                    'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.'
                )
            self.trainer.use_ddp = True
            self.trainer.data_parallel_device_ids = None
            self.trainer.on_gpu = False
            self.trainer.on_cpu = True
        elif self.trainer.distributed_backend == "horovod":
            self._set_horovod_backend()

        # throw error to force user ddp or ddp2 choice
        if self.trainer.num_nodes > 1 and not (self.trainer.use_ddp2
                                               or self.trainer.use_ddp):
            raise MisconfigurationException(
                'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
                'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`'
            )

        rank_zero_info(
            f'GPU available: {torch.cuda.is_available()}, used: {self.trainer.on_gpu}'
        )
        num_cores = self.trainer.tpu_cores if self.trainer.tpu_cores is not None else 0
        rank_zero_info(
            f'TPU available: {TPU_AVAILABLE}, using: {num_cores} TPU cores')

        if torch.cuda.is_available() and not self.trainer.on_gpu:
            rank_zero_warn(
                'GPU available but not used. Set the --gpus flag when calling the script.'
            )
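
Condensed, the auto-selection branch above (no backend requested) behaves roughly like this hypothetical helper:

# Hypothetical condensation of the backend auto-selection when none is requested.
def pick_default_backend(num_gpus, num_nodes=1, num_processes=1, horovodrun=False):
    if horovodrun:
        return "horovod"
    if num_gpus == 0:
        # multi-node / multi-process CPU training falls back to DDP over CPU
        return "ddp_cpu" if (num_nodes > 1 or num_processes > 1) else None
    if num_gpus == 1:
        return "single_gpu"
    # several GPUs but no explicit choice: Lightning warns and uses ddp_spawn
    return "ddp_spawn"

assert pick_default_backend(num_gpus=0) is None
assert pick_default_backend(num_gpus=0, num_processes=4) == "ddp_cpu"
assert pick_default_backend(num_gpus=2) == "ddp_spawn"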
Example #6
    def setup_distributed(self) -> None:
        if not self.on_gpu:
            raise MisconfigurationException(
                "You selected accelerator to be `ddp_fully_sharded`, but GPU is not available."
            )
        super().setup_distributed()
Example #7
    def _reset_eval_dataloader(
        self,
        model: LightningModule,
        mode: str,
    ) -> Tuple[List[Union[int, float]], List[DataLoader]]:
        """Generic method to reset a dataloader for evaluation.

        Args:
            model: The current `LightningModule`
            mode: Either `'val'` or `'test'`

        Returns:
            Tuple (num_batches, dataloaders)
        """
        # always get the loaders first so we can count how many there are
        loader_name = f'{mode}_dataloader'
        dataloaders = self.request_dataloader(getattr(model, loader_name))

        if not isinstance(dataloaders, list):
            dataloaders = [dataloaders]

        # when overfitting use the training loader as val and test
        # duplicate it the numb of times needed to match the train loaders
        if self.overfit_batches > 0:
            num_loaders = len(dataloaders)
            train_dataloader = self.request_dataloader(getattr(model, 'train_dataloader'))
            dataloaders = [deepcopy(train_dataloader) for _ in range(num_loaders)]

        self.dev_debugger.track_load_dataloader_call(loader_name, dataloaders=dataloaders)

        for loader_i in range(len(dataloaders)):
            loader = dataloaders[loader_i]

            # shuffling in val and test set is bad practice
            modes = ('val', 'test', 'predict')
            if mode in modes and hasattr(loader, 'sampler') and isinstance(loader.sampler, RandomSampler):

                # when overfitting, the dataloader should not have sampler
                if self.overfit_batches > 0:
                    rank_zero_warn(
                        'You requested to overfit but enabled test/val dataloader shuffling.'
                        ' We are turning it off for you.'
                    )
                    dataloaders[loader_i] = self.replace_sampler(loader, SequentialSampler(loader.dataset))

                else:
                    rank_zero_warn(
                        f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'
                        ' this off for validation and test dataloaders.'
                    )

        if any(dl is None for dl in dataloaders):
            rank_zero_warn("One of given dataloaders is None and it will be skipped.")

        # add samplers
        dataloaders = [self.auto_add_sampler(dl, shuffle=False) for dl in dataloaders if dl is not None]

        loader_num_batches = []

        # determine number of batches
        # datasets could be none, 1 or 2+
        if len(dataloaders) != 0:
            for i, dataloader in enumerate(dataloaders):
                num_batches = len(dataloader) if has_len(dataloader) else float('inf')
                self._worker_check(dataloader, f'{mode} dataloader {i}')

                # percent or num_steps
                limit_eval_batches = getattr(self, f'limit_{mode}_batches')

                # limit num batches either as a percent or num steps
                if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0:
                    num_batches = min(num_batches, int(limit_eval_batches))
                elif num_batches != float('inf'):
                    num_batches = int(num_batches * limit_eval_batches)
                elif limit_eval_batches != 1.0:
                    raise MisconfigurationException(
                        f'When using an IterableDataset for `limit_{mode}_batches`,'
                        f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. An int k specifies'
                        f' `num_{mode}_batches` to use.'
                    )

                if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float):
                    min_pct = 1.0 / len(dataloader)
                    raise MisconfigurationException(
                        f'you requested to check {limit_eval_batches} of the {mode} dataloader but'
                        f' {limit_eval_batches}*{num_batches} < 1. Please increase the limit_{mode}_batches.'
                        f' Try at least limit_{mode}_batches={min_pct}'
                    )

                loader_num_batches.append(num_batches)

        return loader_num_batches, dataloaders
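Example #8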
    def setup_training(self, model: LightningModule):
        """Sanity check a few things before starting actual training.

        Args:
            model: The model to run sanity test on.
        """
        # --------------------------
        # Setup??
        # --------------------------
        ref_model = model
        if self.trainer.data_parallel:
            ref_model = model.module

        # set the ranks and devices
        self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank
        self.trainer.accelerator_backend.dist.device = ref_model.device

        # give model convenience properties
        ref_model.trainer = self.trainer

        # set local properties on the model
        self.trainer.model_connector.copy_trainer_model_properties(ref_model)

        # init amp. Must be done here instead of __init__ to allow ddp to work
        if self.trainer.amp_backend == AMPType.NATIVE and self.trainer.precision == 16 and not self.trainer.use_tpu:
            self.trainer.scaler = self.trainer.precision_connector.backend.scaler

        # log hyper-parameters
        if self.trainer.logger is not None:
            # save exp to get started (this is where the first experiment logs are written)
            self.trainer.logger.log_hyperparams(ref_model.hparams_initial)
            self.trainer.logger.log_graph(ref_model)
            self.trainer.logger.save()

        # wait for all to join if on distributed
        self.trainer.accelerator_backend.barrier("setup_training")

        # register auto-resubmit when on SLURM
        self.trainer.slurm_connector.register_slurm_signal_handlers()

        # --------------------------
        # Pre-train
        # --------------------------
        # on pretrain routine start
        self.trainer.on_pretrain_routine_start(ref_model)
        if self.trainer.is_function_implemented("on_pretrain_routine_start"):
            ref_model.on_pretrain_routine_start()

        # print model summary
        if self.trainer.is_global_zero and self.trainer.weights_summary is not None and not self.trainer.testing:
            if self.trainer.weights_summary in ModelSummary.MODES:
                ref_model.summarize(mode=self.trainer.weights_summary)
            else:
                raise MisconfigurationException(
                    "weights_summary can be None, " +
                    ", ".join(ModelSummary.MODES))

        # track model now.
        # if cluster resets state, the model will update with the saved weights
        self.trainer.model = model

        # restore training and model before hpc is called
        self.trainer.checkpoint_connector.restore_weights(model)

        # on pretrain routine end
        self.trainer.on_pretrain_routine_end(ref_model)
        if self.trainer.is_function_implemented("on_pretrain_routine_end"):
            ref_model.on_pretrain_routine_end()
Example #9
    def _evaluate(
        self,
        model: LightningModule,
        dataloaders: List[DataLoader],
        max_batches: Union[int, List[int]],
        test_mode: bool = False
    ):
        """Run evaluation code.

        Args:
            model: The model to evaluate.
            dataloaders: A list of PyTorch dataloaders.
            max_batches: An integer or list of integers with length of the number of dataloaders. Each
                entry is the number of batches to process in the corresponding dataloader.
            test_mode: If ``True``, run the test hooks and steps instead of the validation ones.
        """
        # enable eval mode
        model.zero_grad()
        model.eval()

        # copy properties for forward overrides
        self.copy_trainer_model_properties(model)

        # disable gradients to save memory
        torch.set_grad_enabled(False)

        # bookkeeping
        outputs = []

        # convert max_batches to list
        if isinstance(max_batches, int):
            max_batches = [max_batches] * len(dataloaders)

        # --------------------------
        # ON_EVAL_EPOCH_START hook
        # --------------------------
        self.__call_eval_loop_hook_start(test_mode)

        # run validation
        for dataloader_idx, dataloader in enumerate(dataloaders):
            dl_outputs = []

            # on TPU we have to wrap it under the ParallelLoader
            if self.use_tpu:
                device = xm.xla_device(self.tpu_id)
                dataloader = xla_pl.ParallelLoader(dataloader, [device])
                dataloader = dataloader.per_device_loader(device)

            # each dataloader has a max num batches
            dl_max_batches = max_batches[dataloader_idx]

            for batch_idx, batch in enumerate(dataloader):
                if batch is None:
                    continue

                # stop short when on fast_dev_run (sets max_batch=1)
                if batch_idx >= dl_max_batches:
                    break

                # callbacks
                if test_mode:
                    self.on_test_batch_start()
                else:
                    self.on_validation_batch_start()

                # -----------------
                # RUN EVALUATION STEP
                # -----------------
                if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu:
                    with torch.cuda.amp.autocast():
                        output = self.evaluation_forward(model, batch, batch_idx, dataloader_idx, test_mode)
                else:
                    output = self.evaluation_forward(model, batch, batch_idx, dataloader_idx, test_mode)

                # allow only EvalResult when using structured results (from val_step)
                if isinstance(output, Result) and not isinstance(output, EvalResult):
                    m = 'only EvalResults or dicts are allowed from validation_step'
                    raise MisconfigurationException(m)

                # on dp / ddp2 might still want to do something with the batch parts
                if test_mode:
                    if self.is_overridden('test_step_end'):
                        model_ref = self.get_model()
                        with self.profiler.profile('test_step_end'):
                            output = model_ref.test_step_end(output)
                    self.on_test_batch_end()
                else:
                    if self.is_overridden('validation_step_end'):
                        model_ref = self.get_model()
                        with self.profiler.profile('validation_step_end'):
                            output = model_ref.validation_step_end(output)
                    self.on_validation_batch_end()

                # track outputs for collation
                if output is not None:
                    dl_outputs.append(output)

                self.__eval_add_step_metrics(output)

            outputs.append(dl_outputs)

        # ---------------------
        # EVAL_EPOCH_END
        # ---------------------
        using_eval_result = len(outputs) > 0 and len(outputs[0]) > 0 and isinstance(outputs[0][0], EvalResult)
        eval_results = self.__run_eval_epoch_end(test_mode, outputs, dataloaders, using_eval_result)

        # log callback metrics
        self.__update_callback_metrics(eval_results, using_eval_result)

        # enable train mode again
        model.train()

        # re-enable gradients
        torch.set_grad_enabled(True)

        # --------------------------
        # ON_EVAL_EPOCH_END hook
        # --------------------------
        self.__call_eval_loop_hook_end(test_mode)

        return eval_results
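
Stripped of Lightning's hooks, the evaluation plumbing above follows the usual PyTorch pattern: switch to eval mode, disable gradients, cap each loader at its max_batches, then restore training state. A self-contained toy sketch (not the Lightning loop itself):

import torch
from torch.utils.data import DataLoader, TensorDataset

def evaluate(model, dataloaders, max_batches):
    # broadcast a single int limit to every dataloader, as in `_evaluate` above
    if isinstance(max_batches, int):
        max_batches = [max_batches] * len(dataloaders)

    model.eval()
    outputs = []
    with torch.no_grad():  # gradients are not needed while evaluating
        for dl, limit in zip(dataloaders, max_batches):
            dl_outputs = []
            for batch_idx, (x, y) in enumerate(dl):
                if batch_idx >= limit:  # each dataloader has its own cap
                    break
                dl_outputs.append(torch.nn.functional.mse_loss(model(x), y))
            outputs.append(dl_outputs)
    model.train()  # restore training mode afterwards
    return outputs

model = torch.nn.Linear(4, 1)
ds = TensorDataset(torch.randn(32, 4), torch.randn(32, 1))
print(evaluate(model, [DataLoader(ds, batch_size=8)], max_batches=2))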
Example #10
    def reset_train_dataloader(self, model: LightningModule) -> None:
        """Resets the train dataloader and initialises required variables
        (number of batches, when to validate, etc.).

        Args:
            model: The current `LightningModule`
        """
        self.train_dataloader = self.request_dataloader(model.train_dataloader)

        if self.overfit_batches > 0:
            if hasattr(self.train_dataloader, 'sampler') and isinstance(self.train_dataloader.sampler, RandomSampler):
                rank_zero_warn(
                    'You requested to overfit but enabled training dataloader shuffling.'
                    ' We are turning it off for you.'
                )
                self.train_dataloader = self.replace_sampler(
                    self.train_dataloader, SequentialSampler(self.train_dataloader.dataset)
                )

        # debugging
        self.dev_debugger.track_load_dataloader_call('train_dataloader', dataloaders=[self.train_dataloader])

        # automatically add samplers
        self.train_dataloader = apply_to_collection(
            self.train_dataloader, DataLoader, self.auto_add_sampler, shuffle=True
        )

        # check the workers recursively
        apply_to_collection(self.train_dataloader, DataLoader, self._worker_check, 'train dataloader')

        # wrap the sequence of train loaders to a CombinedLoader object for computing the num_training_batches
        self.train_dataloader = CombinedLoader(self.train_dataloader, self._multiple_trainloader_mode)

        self.num_training_batches = len(self.train_dataloader) if has_len(self.train_dataloader) else float('inf')

        if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0:
            self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches))
        elif self.num_training_batches != float('inf'):
            self.num_training_batches = int(self.num_training_batches * self.limit_train_batches)
        elif self.limit_train_batches != 1.0:
            raise MisconfigurationException(
                'When using an IterableDataset for `limit_train_batches`,'
                ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies'
                ' `num_training_batches` to use.'
            )

        # determine when to check validation
        # if int passed in, val checks that often
        # otherwise, it checks in [0, 1.0] % range of a training epoch
        if isinstance(self.val_check_interval, int):
            self.val_check_batch = self.val_check_interval
            if self.val_check_batch > self.num_training_batches:
                raise ValueError(
                    f'`val_check_interval` ({self.val_check_interval}) must be less than or equal '
                    f'to the number of the training batches ({self.num_training_batches}). '
                    'If you want to disable validation set `limit_val_batches` to 0.0 instead.'
                )
        else:
            if not has_len(self.train_dataloader):
                if self.val_check_interval == 1.0:
                    self.val_check_batch = float('inf')
                else:
                    raise MisconfigurationException(
                        'When using an IterableDataset for `train_dataloader`,'
                        ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies'
                        ' checking validation every k training batches.'
                    )
            else:
                self.val_check_batch = int(self.num_training_batches * self.val_check_interval)
                self.val_check_batch = max(1, self.val_check_batch)
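
A minimal standalone sketch of the `val_check_interval` handling above (the helper name and asserts are illustrative, not Lightning API):

# Hypothetical helper mirroring how `val_check_batch` is derived above.
def compute_val_check_batch(val_check_interval, num_training_batches):
    """Return after how many training batches validation should run."""
    if isinstance(val_check_interval, int):
        # an int means "validate every k training batches"
        if val_check_interval > num_training_batches:
            raise ValueError("`val_check_interval` exceeds the number of training batches")
        return val_check_interval
    if num_training_batches == float("inf"):
        # iterable dataset: only 1.0 ("never mid-epoch") is meaningful
        if val_check_interval == 1.0:
            return float("inf")
        raise ValueError("use an int or 1.0 with an IterableDataset")
    # a float in (0, 1] is interpreted as a fraction of the training epoch
    return max(1, int(num_training_batches * val_check_interval))

assert compute_val_check_batch(0.25, 100) == 25
assert compute_val_check_batch(50, 100) == 50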
Example #11
    def _call_children_scripts(self):
        # bookkeeping of spawned processes
        self._check_can_spawn_children()

        # DDP Environment variables
        os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()
        os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())

        # allow the user to pass the node rank
        os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())
        os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())

        # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c`
        # See https://docs.python.org/3/reference/import.html#main-spec
        if __main__.__spec__ is None:  # pragma: no-cover
            # Script called as `python a/b/c.py`
            # when user is using hydra find the absolute path
            path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path

            # pull out the commands used to run the script and resolve the abs file path
            command = sys.argv
            try:
                full_path = path_lib(command[0])
            except Exception:
                full_path = os.path.abspath(command[0])

            command[0] = full_path
            # use the same Python interpreter that is actually running
            command = [sys.executable] + command
        else:  # Script called as `python -m a.b.c`
            command = [sys.executable, "-m", __main__.__spec__.name
                       ] + sys.argv[1:]

        # the visible devices tell us how many GPUs we want to use.
        # when the trainer script was called the device has already been scoped by the time
        # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
        # but forward the GPUs selected via environment variables
        if self.parallel_devices is None:
            raise MisconfigurationException(
                "You selected (distributed_backend=ddp) but did not set Trainer(gpus=?)"
            )

        os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}"

        self.interactive_ddp_procs = []

        for local_rank in range(1, self.num_processes):
            env_copy = os.environ.copy()
            env_copy["LOCAL_RANK"] = f"{local_rank}"

            # remove env var if global seed not set
            if os.environ.get(
                    "PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
                del env_copy["PL_GLOBAL_SEED"]

            # start process
            # if hydra is available and initialized, make sure to set the cwd correctly
            cwd: Optional[str] = None
            if _HYDRA_AVAILABLE:
                if HydraConfig.initialized():
                    cwd = get_original_cwd()
                    os_cwd = f'"{os.getcwd()}"'
                    command += [
                        f"hydra.run.dir={os_cwd}",
                        f"hydra.job.name=train_ddp_process_{local_rank}"
                    ]
            proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
            self.interactive_ddp_procs.append(proc)

            # starting all processes at once can cause issues with dataloaders,
            # so stagger launches with a short random delay
            delay = np.random.uniform(1, 5, 1)[0]
            sleep(delay)
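
The per-rank environment handling above can be exercised in isolation; an illustrative, self-contained sketch that launches trivial child processes, each with its own `LOCAL_RANK`:

import os
import subprocess
import sys

# Illustrative only: one child per extra rank, mirroring the env-var copy
# performed in `_call_children_scripts` above.
num_processes = 3
procs = []
for local_rank in range(1, num_processes):
    env = os.environ.copy()
    env["LOCAL_RANK"] = str(local_rank)
    # a trivial child that just reports which rank it was given
    procs.append(subprocess.Popen(
        [sys.executable, "-c", "import os; print('rank', os.environ['LOCAL_RANK'])"],
        env=env,
    ))
for p in procs:
    p.wait()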
Example #12
    def lr_find(self,
                model: LightningModule,
                train_dataloader: Optional[DataLoader] = None,
                val_dataloaders: Optional[DataLoader] = None,
                min_lr: float = 1e-8,
                max_lr: float = 1,
                num_training: int = 100,
                mode: str = 'exponential',
                early_stop_threshold: float = 4.0,
                num_accumulation_steps=None):
        r"""
        lr_find enables the user to do a range test of good initial learning rates,
        to reduce the amount of guesswork in picking a good starting learning rate.

        Args:
            model: Model to do range testing for

            train_dataloader: A PyTorch
                DataLoader with training samples. If the model has
                a predefined train_dataloader method this will be skipped.

            min_lr: minimum learning rate to investigate

            max_lr: maximum learning rate to investigate

            num_training: number of learning rates to test

            mode: search strategy, either 'linear' or 'exponential'. If set to
                'linear' the learning rate will be searched by linearly increasing
                after each batch. If set to 'exponential', will increase learning
                rate exponentially.

            early_stop_threshold: threshold for stopping the search. If the
                loss at any point is larger than early_stop_threshold*best_loss
                then the search is stopped. To disable, set to None.

            num_accumulation_steps: deprecated, number of batches to calculate loss over.
                Set trainer argument ``accumulate_grad_batches`` instead.

        Example::

            # Setup model and trainer
            model = MyModelClass(hparams)
            trainer = pl.Trainer()

            # Run lr finder
            lr_finder = trainer.lr_find(model, ...)

            # Inspect results
            fig = lr_finder.plot(); fig.show()
            suggested_lr = lr_finder.suggestion()

            # Overwrite lr and create new model
            hparams.lr = suggested_lr
            model = MyModelClass(hparams)

            # Ready to train with new learning rate
            trainer.fit(model)

        """
        if num_accumulation_steps is not None:
            rank_zero_warn("Argument `num_accumulation_steps` has been deprecated"
                           " since v0.7.6 and will be removed in 0.9. Please"
                           " set trainer argument `accumulate_grad_batches` instead.",
                           DeprecationWarning)

        save_path = os.path.join(self.default_root_dir, 'lr_find_temp.ckpt')

        self.__lr_finder_dump_params(model)

        # Prevent going into infinite loop
        self.auto_lr_find = False

        # Initialize lr finder object (stores results)
        lr_finder = _LRFinder(mode, min_lr, max_lr, num_training)

        # Use special lr logger callback
        self.callbacks = [_LRCallback(num_training,
                                      early_stop_threshold,
                                      progress_bar_refresh_rate=1)]

        # No logging
        self.logger = DummyLogger()

        # Max step set to number of iterations
        self.max_steps = num_training

        # Disable standard progress bar for fit
        if self.progress_bar_callback:
            self.progress_bar_callback.disable()

        # Disable standard checkpoint & early stopping
        self.checkpoint_callback = False
        self.early_stop_callback = None
        self.enable_early_stop = False

        # Required for saving the model
        self.optimizers, self.schedulers = [], []
        self.model = model

        # Dump model checkpoint
        self.save_checkpoint(str(save_path))

        # Configure optimizer and scheduler
        optimizers, _, _ = self.init_optimizers(model)

        if len(optimizers) != 1:
            raise MisconfigurationException(
                f'`model.configure_optimizers()` returned {len(optimizers)}, but'
                ' learning rate finder only works with a single optimizer')
        model.configure_optimizers = lr_finder._get_new_optimizer(optimizers[0])

        # Fit, lr & loss logged in callback
        self.fit(model,
                 train_dataloader=train_dataloader,
                 val_dataloaders=val_dataloaders)

        # Prompt if we stopped early
        if self.global_step != num_training:
            log.info('LR finder stopped early due to diverging loss.')

        # Transfer results from callback to lr finder object
        lr_finder.results.update({'lr': self.callbacks[0].lrs,
                                  'loss': self.callbacks[0].losses})
        lr_finder._total_batch_idx = self.total_batch_idx  # for debug purpose

        # Reset model state
        self.restore(str(save_path), on_gpu=self.on_gpu)
        os.remove(save_path)

        # Finish by resetting variables so trainer is ready to fit model
        self.__lr_finder_restore_params(model)
        if self.progress_bar_callback:
            self.progress_bar_callback.enable()

        return lr_finder
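
For the `exponential` mode, candidate learning rates are log-spaced between `min_lr` and `max_lr`; a quick worked sketch (not the `_LRFinder` internals):

# Illustrative: the i-th candidate LR for an exponential range test.
def exponential_lr_schedule(min_lr, max_lr, num_training):
    return [min_lr * (max_lr / min_lr) ** (i / (num_training - 1))
            for i in range(num_training)]

lrs = exponential_lr_schedule(1e-8, 1.0, num_training=100)
assert abs(lrs[0] - 1e-8) < 1e-12 and abs(lrs[-1] - 1.0) < 1e-9
# a 'linear' mode would instead step evenly: min_lr + i * (max_lr - min_lr) / (num_training - 1)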
Example #13
    def init_optimizers(self,
                        model: LightningModule) -> Tuple[List, List, List]:
        optim_conf = model.configure_optimizers()
        if optim_conf is None:
            rank_zero_warn(
                '`LightningModule.configure_optimizers` returned `None`, this fit will run with no optimizer',
                UserWarning,
            )
            optim_conf = _MockOptimizer()

        optimizers, lr_schedulers, optimizer_frequencies = [], [], []
        monitor = None

        # single output, single optimizer
        if isinstance(optim_conf, Optimizer):
            optimizers = [optim_conf]
        # two lists, optimizer + lr schedulers
        elif isinstance(optim_conf,
                        (list, tuple)) and len(optim_conf) == 2 and isinstance(
                            optim_conf[0], list):
            opt, sch = optim_conf
            optimizers = opt
            lr_schedulers = sch if isinstance(sch, list) else [sch]
        # single dictionary
        elif isinstance(optim_conf, dict):
            optimizers = [optim_conf["optimizer"]]
            monitor = optim_conf.get('monitor', None)
            lr_schedulers = [optim_conf["lr_scheduler"]
                             ] if "lr_scheduler" in optim_conf else []
        # multiple dictionaries
        elif isinstance(optim_conf, (list, tuple)) and all(
                isinstance(d, dict) for d in optim_conf):
            optimizers = [opt_dict["optimizer"] for opt_dict in optim_conf]
            lr_schedulers = [
                opt_dict["lr_scheduler"] for opt_dict in optim_conf
                if "lr_scheduler" in opt_dict
            ]
            optimizer_frequencies = [
                opt_dict["frequency"] for opt_dict in optim_conf
                if opt_dict.get("frequency", None) is not None
            ]
            # assert that if frequencies are present, they are given for all optimizers
            if optimizer_frequencies and len(optimizer_frequencies) != len(
                    optimizers):
                raise ValueError(
                    "A frequency must be given to each optimizer.")
        # single list or tuple, multiple optimizer
        elif isinstance(optim_conf, (list, tuple)):
            optimizers = list(optim_conf)
        # unknown configuration
        else:
            raise MisconfigurationException(
                'Unknown configuration for model optimizers.'
                ' Output from `model.configure_optimizers()` should either be:\n'
                ' * `torch.optim.Optimizer`\n'
                ' * [`torch.optim.Optimizer`]\n'
                ' * ([`torch.optim.Optimizer`], [`torch.optim.lr_scheduler`])\n'
                ' * {"optimizer": `torch.optim.Optimizer`, (optional) "lr_scheduler": `torch.optim.lr_scheduler`}\n'
                ' * A list of the previously described dict format, with an optional "frequency" key (int)'
            )

        lr_schedulers = self.configure_schedulers(lr_schedulers,
                                                  monitor=monitor)
        _validate_scheduler_optimizer(optimizers, lr_schedulers)

        return optimizers, lr_schedulers, optimizer_frequencies
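
For reference, the return shapes accepted by `init_optimizers` can be produced like this (toy parameters, purely illustrative):

import torch

# Toy parameters so the optimizers below have something to work on.
params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.SGD(params, lr=0.1)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10)

# The shapes `init_optimizers` accepts from `configure_optimizers`:
single = opt                                             # a bare Optimizer
two_lists = ([opt], [sched])                             # ([optimizers], [schedulers])
as_dict = {"optimizer": opt, "lr_scheduler": sched}      # a single dict
many_dicts = [{"optimizer": opt, "frequency": 1},        # list of dicts, optional "frequency"
              {"optimizer": torch.optim.Adam(params), "frequency": 2}]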
Example #14
    def set_distributed_mode(self, distributed_backend):
        self.use_dp = False
        self.use_ddp = False
        self.use_ddp2 = False
        self.use_horovod = False
        self.single_gpu = False

        if distributed_backend is None:
            if self.has_horovodrun():
                self._set_horovod_backend()
            elif self.num_gpus == 0:
                if self.num_nodes > 1 or self.num_processes > 1:
                    self.use_ddp = True  # ddp_cpu
            elif self.num_gpus == 1:
                self.single_gpu = True
            elif self.num_gpus > 1:
                rank_zero_warn(
                    'You requested multiple GPUs but did not specify a backend, e.g.'
                    ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                    ' Setting distributed_backend=ddp_spawn for you.')
                self.distributed_backend = 'ddp_spawn'
                distributed_backend = 'ddp_spawn'

        if distributed_backend == "dp":
            # do nothing if num_gpus == 0
            if self.num_gpus == 1:
                self.single_gpu = True
                self.use_dp = True
            elif self.num_gpus > 1:
                self.use_dp = True

        elif distributed_backend in ['ddp', 'ddp_spawn']:
            if self.num_gpus == 0:
                if self.num_nodes > 1 or self.num_processes > 1:
                    self.use_ddp = True  # ddp_cpu
            elif self.num_gpus == 1:
                self.single_gpu = True
                self.use_ddp = True
            elif self.num_gpus > 1:
                self.use_ddp = True
                self.num_processes = self.num_gpus

        elif distributed_backend == "ddp2":
            # do nothing if num_gpus == 0
            if self.num_gpus >= 1:
                self.use_ddp2 = True
        elif distributed_backend == "ddp_cpu":
            if self.num_gpus > 0:
                rank_zero_warn(
                    'You requested one or more GPUs, but set the backend to `ddp_cpu`.'
                    ' Training will not use GPUs.')
            self.use_ddp = True
            self.data_parallel_device_ids = None
            self.on_gpu = False
        elif distributed_backend == 'horovod':
            self._set_horovod_backend()

        # throw error to force user ddp or ddp2 choice
        if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp):
            raise MisconfigurationException(
                'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
                'To silence this warning set distributed_backend=ddp or distributed_backend=ddp2'
            )

        rank_zero_info(
            f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}')
        num_cores = self.tpu_cores if self.tpu_cores is not None else 0
        rank_zero_info(
            f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores')
Example #15
    def __init__(
        self,
        accelerator: Optional[
            "pl.accelerators.accelerator.Accelerator"] = None,
        zero_optimization: bool = True,
        stage: int = 2,
        remote_device: str = "cpu",
        offload_optimizer: bool = False,
        offload_parameters: bool = False,
        offload_params_device: str = "cpu",
        nvme_path: str = "/local_nvme",
        params_buffer_count: int = 5,
        params_buffer_size: int = 1e8,
        max_in_cpu: int = 1e9,
        offload_optimizer_device: str = "cpu",
        optimizer_buffer_count: int = 4,
        block_size: int = 1048576,
        queue_depth: int = 8,
        single_submit: bool = False,
        overlap_events: bool = True,
        thread_count: int = 1,
        pin_memory: bool = False,
        sub_group_size: int = 1e12,
        contiguous_gradients: bool = True,
        overlap_comm: bool = True,
        allgather_partitions: bool = True,
        reduce_scatter: bool = True,
        allgather_bucket_size: int = 2e8,
        reduce_bucket_size: int = 2e8,
        zero_allow_untested_optimizer: bool = True,
        logging_batch_size_per_gpu: Union[str, int] = "auto",
        config: Optional[Union[Path, str, dict]] = None,
        logging_level: int = logging.WARN,
        parallel_devices: Optional[List[torch.device]] = None,
        cluster_environment: Optional[ClusterEnvironment] = None,
        loss_scale: float = 0,
        initial_scale_power: int = 16,
        loss_scale_window: int = 1000,
        hysteresis: int = 2,
        min_loss_scale: int = 1,
        partition_activations: bool = False,
        cpu_checkpointing: bool = False,
        contiguous_memory_optimization: bool = False,
        synchronize_checkpoint_boundary: bool = False,
        load_full_weights: bool = False,
        precision_plugin: Optional[PrecisionPlugin] = None,
    ) -> None:
        """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large
        billion parameter models. `For more information:
        https://pytorch-lightning.readthedocs.io/en/latest/advanced/advanced_gpu.html#deepspeed`.

        .. warning:: ``DeepSpeedStrategy`` is in beta and subject to change.

        Defaults have been set to enable ZeRO-Offload and some have been taken from the link below.
        These defaults have been set generally, but may require tuning for optimum performance based on your model size.
        `For more information: https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training`.

        Arguments:

            zero_optimization: Enable ZeRO optimization. This is only compatible with precision=16.

            stage: Different stages of the ZeRO Optimizer. 0 is disabled,
                1 is optimizer state partitioning, 2 is optimizer+gradient state partitioning,
                3 is optimizer+gradient_parameter partitioning using the infinity engine.

            remote_device: Device to instantiate the model on initially (``cpu`` or ``nvme``).

            offload_optimizer: Enable offloading optimizer memory and computation to CPU or NVMe
                based on ``offload_optimizer_device``.

            offload_parameters: When using ZeRO Stage 3, enable offloading parameter memory and computation
                to CPU or NVMe based on ``offload_params_device``.

            offload_params_device: When offloading parameters choose the device to offload to, ``cpu`` or ``nvme``.

            offload_optimizer_device: When offloading optimizer state choose the device to offload to,
                ``cpu`` or ``nvme``.

            params_buffer_count: Number of buffers in buffer pool for
                parameter offloading when ``offload_params_device`` is ``nvme``.

            params_buffer_size: Size of buffers in buffer pool for parameter offloading
                when ``offload_params_device`` is ``nvme``.

            max_in_cpu: Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled.

            nvme_path: Filesystem path for NVMe device for optimizer/parameter state offloading.

            optimizer_buffer_count: Number of buffers in buffer pool for optimizer state offloading
                when ``offload_optimizer_device`` is set to ``nvme``.
                This should be at least the number of states maintained per parameter by the optimizer.
                For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance).

            block_size: When using NVMe Offloading, the I/O block size in bytes.

            queue_depth: When using NVMe Offloading, the I/O queue depth.

            single_submit: When using NVMe Offloading,
                submit requests to storage device as multiple individual requests,
                as opposed to one block of requests.

            overlap_events: When using NVMe Offloading,
                submit requests to storage device in an overlapped fashion
                without waiting for completion of earlier requests.

            thread_count: When using NVMe Offloading,
                intra-request parallelism for each read/write submitted by a user thread.

            pin_memory: When using ZeRO stage 3, pin optimizer state memory on CPU.
                This could boost throughput at the cost of extra memory overhead.

            sub_group_size: When using ZeRO stage 3, defines the number of parameters
                within a sub group to offload at a time.
                Smaller numbers require more communication, but improve memory efficiency.

            contiguous_gradients: Copies gradients to a continuous buffer as they are produced.
                Avoids memory fragmentation during backwards. Useful when training large models.

            overlap_comm: Overlap the reduction (synchronization) of gradients with the backwards computation.
                This is a speed optimization when training across multiple GPUs/machines.

            allgather_partitions: All gather updated parameters at the end of training step,
                instead of using a series of broadcast collectives.

            reduce_scatter: Use reduce/scatter instead of allreduce to average gradients.

            allgather_bucket_size: Number of elements to allgather at once.
                Used to limit the memory required for larger model sizes, with a tradeoff with speed.

            reduce_bucket_size: Number of elements to reduce at once.
                Used to limit the memory required for larger model sizes, with a tradeoff with speed.

            zero_allow_untested_optimizer: Allow untested optimizers to be used with ZeRO. Currently only Adam is a
                DeepSpeed supported optimizer when using ZeRO.

            logging_batch_size_per_gpu: Config used in DeepSpeed to calculate verbose timing for logging
                on a per sample per second basis (only displayed if logging=logging.INFO).
                If set to "auto", the plugin tries to infer this from
                the train DataLoader's BatchSampler, else defaults to 1.
                To obtain accurate logs when using datasets that do not support batch samplers,
                set this to the actual per gpu batch size (trainer.batch_size).

            config: Pass in a deepspeed formatted config dict,
                or path to a deepspeed config: https://www.deepspeed.ai/docs/config-json.
                All defaults will be ignored if a config is passed in.

            logging_level: Set logging level for deepspeed.

            loss_scale: Loss scaling value for FP16 training.
                0.0 results in dynamic loss scaling, otherwise static.

            initial_scale_power: Power of the initial dynamic loss scale value. Loss scale is computed
                by ``2^initial_scale_power``.

            loss_scale_window: Window in which to raise/lower the dynamic FP16 loss scaling value.

            hysteresis: FP16 Delay shift in Dynamic Loss scaling.

            min_loss_scale: The minimum FP16 dynamic loss scaling value.

            partition_activations: Enables partition activation when used with ZeRO stage 3 and model parallelism.
                Still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint.
                See `deepspeed tutorial
                <https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional>`_.

            cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled.

            contiguous_memory_optimization: Copies partitioned activations so that they are contiguous in memory.
                Not supported by all models.

            synchronize_checkpoint_boundary: Insert :func:`torch.cuda.synchronize` at each checkpoint boundary.

            load_full_weights: True when loading a single checkpoint file containing the model state dict
                when using ZeRO Stage 3. This differs from the DeepSpeed checkpoint which contains shards
                per worker.
        """
        if not _DEEPSPEED_AVAILABLE:
            raise MisconfigurationException(
                "To use the DeepSpeed plugin, you must have DeepSpeed installed. pip install deepspeed"
            )

        super().__init__(
            accelerator=accelerator,
            parallel_devices=parallel_devices,
            cluster_environment=cluster_environment,
            precision_plugin=precision_plugin,
        )

        self.config = self._load_config(config)
        if self.config is None:
            # User has not overridden config, set defaults
            self.config = self._create_default_config(
                zero_optimization,
                zero_allow_untested_optimizer,
                logging_batch_size_per_gpu,
                offload_optimizer=offload_optimizer,
                offload_parameters=offload_parameters,
                nvme_path=nvme_path,
                offload_params_device=offload_params_device,
                params_buffer_count=params_buffer_count,
                params_buffer_size=params_buffer_size,
                max_in_cpu=max_in_cpu,
                pin_memory=pin_memory,
                offload_optimizer_device=offload_optimizer_device,
                optimizer_buffer_count=optimizer_buffer_count,
                block_size=block_size,
                queue_depth=queue_depth,
                single_submit=single_submit,
                overlap_events=overlap_events,
                thread_count=thread_count,
                partition_activations=partition_activations,
                cpu_checkpointing=cpu_checkpointing,
                contiguous_memory_optimization=contiguous_memory_optimization,
                synchronize_checkpoint_boundary=synchronize_checkpoint_boundary,
                stage=stage,
                contiguous_gradients=contiguous_gradients,
                overlap_comm=overlap_comm,
                allgather_partitions=allgather_partitions,
                reduce_scatter=reduce_scatter,
                allgather_bucket_size=allgather_bucket_size,
                reduce_bucket_size=reduce_bucket_size,
                sub_group_size=sub_group_size,
            )
        self._config_initialized = False
        deepspeed.utils.logging.logger.setLevel(logging_level)

        self.remote_device = remote_device
        self.load_full_weights = load_full_weights

        # default FP16 parameters.
        self.loss_scale = loss_scale
        self.initial_scale_power = initial_scale_power
        self.loss_scale_window = loss_scale_window
        self.hysteresis = hysteresis
        self.min_loss_scale = min_loss_scale
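
A usage sketch for the constructor above, assuming it belongs to the DeepSpeed plugin class and that the plugin is handed to the Trainer via `plugins=`; the keyword names match the documented constructor, the values are illustrative:

# Usage sketch, not from the original listing: ZeRO stage 3 with optimizer and
# parameter offloading. The `DeepSpeedPlugin` import path and the Trainer
# wiring are assumptions.
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

trainer = Trainer(
    gpus=2,
    precision=16,  # the FP16 options above (loss_scale, hysteresis, ...) apply here
    plugins=DeepSpeedPlugin(
        stage=3,                    # partition params, grads and optimizer states
        offload_optimizer=True,     # push optimizer state to CPU/NVMe
        offload_parameters=True,    # push parameters to CPU/NVMe
        pin_memory=True,            # faster host<->device copies, extra CPU RAM
        allgather_bucket_size=200_000_000,
        reduce_bucket_size=200_000_000,
    ),
)
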
def __verify_train_val_loop_configuration(trainer: "pl.Trainer",
                                          model: "pl.LightningModule") -> None:
    # -----------------------------------
    # verify model has a training step
    # -----------------------------------
    has_training_step = is_overridden("training_step", model)
    if not has_training_step:
        raise MisconfigurationException(
            "No `training_step()` method defined. Lightning `Trainer` expects as minimum a"
            " `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined."
        )

    # -----------------------------------
    # verify model has a train dataloader
    # -----------------------------------
    has_train_dataloader = trainer._data_connector._train_dataloader_source.is_defined(
    )
    if not has_train_dataloader:
        raise MisconfigurationException(
            "No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a"
            " `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined."
        )

    # -----------------------------------
    # verify model has optimizer
    # -----------------------------------
    has_optimizers = is_overridden("configure_optimizers", model)
    if not has_optimizers:
        raise MisconfigurationException(
            "No `configure_optimizers()` method defined. Lightning `Trainer` expects as minimum a"
            " `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined."
        )

    # ----------------------------------------------
    # verify model does not have on_train_dataloader
    # ----------------------------------------------
    has_on_train_dataloader = is_overridden("on_train_dataloader", model)
    if has_on_train_dataloader:
        rank_zero_deprecation(
            "Method `on_train_dataloader` is deprecated in v1.5.0 and will be removed in v1.7.0."
            " Please use `train_dataloader()` directly.")

    trainer.overriden_optimizer_step = is_overridden("optimizer_step", model)
    trainer.overriden_optimizer_zero_grad = is_overridden(
        "optimizer_zero_grad", model)
    automatic_optimization = model.automatic_optimization
    going_to_accumulate_grad_batches = trainer.accumulation_scheduler.going_to_accumulate_grad_batches(
    )

    has_overriden_optimization_functions = trainer.overriden_optimizer_step or trainer.overriden_optimizer_zero_grad
    if has_overriden_optimization_functions and going_to_accumulate_grad_batches and automatic_optimization:
        rank_zero_warn(
            "When using `Trainer(accumulate_grad_batches != 1)` and overriding"
            " `LightningModule.optimizer_{step,zero_grad}`, the hooks will not be called on every batch"
            " (rather, they are called on every optimization step).")

    # -----------------------------------
    # verify model for val loop
    # -----------------------------------

    has_val_loader = trainer._data_connector._val_dataloader_source.is_defined(
    )
    has_val_step = is_overridden("validation_step", model)

    if has_val_loader and not has_val_step:
        rank_zero_warn(
            "You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop."
        )
    if has_val_step and not has_val_loader:
        rank_zero_warn(
            "You defined a `validation_step` but have no `val_dataloader`. Skipping val loop."
        )

    # ----------------------------------------------
    # verify model does not have on_val_dataloader
    # ----------------------------------------------
    has_on_val_dataloader = is_overridden("on_val_dataloader", model)
    if has_on_val_dataloader:
        rank_zero_deprecation(
            "Method `on_val_dataloader` is deprecated in v1.5.0 and will be removed in v1.7.0."
            " Please use `val_dataloader()` directly.")
def scale_batch_size(trainer,
                     model: LightningModule,
                     mode: str = 'power',
                     steps_per_trial: int = 3,
                     init_val: int = 2,
                     max_trials: int = 25,
                     batch_arg_name: str = 'batch_size',
                     **fit_kwargs):
    r"""
    Will iteratively try to find the largest batch size for a given model
    that does not give an out of memory (OOM) error.

    Args:
        trainer: The Trainer
        model: Model to fit.

        mode: string setting the search mode. Either `power` or `binsearch`.
            If mode is `power` we keep multiplying the batch size by 2, until
            we get an OOM error. If mode is 'binsearch', we will initially
            also keep multiplying by 2 and after encountering an OOM error
            do a binary search between the last successful batch size and the
            batch size that failed.

        steps_per_trial: number of steps to run with a given batch size.
            Ideally 1 should be enough to test if an OOM error occurs;
            however, in practice a few are needed.

        init_val: initial batch size to start the search with

        max_trials: max number of increases in batch size done before
           the algorithm is terminated

        batch_arg_name: name of the attribute that stores the batch size.
            It is expected that the user has provided a model or datamodule that has a hyperparameter
            with that name. We will look for this attribute name in the following places

            - `model`
            - `model.hparams`
            - `model.datamodule`
            - `trainer.datamodule` (the datamodule passed to the tune method)

        **fit_kwargs: remaining arguments to be passed to .fit(), e.g., dataloader
            or datamodule.
    """
    if not lightning_hasattr(model, batch_arg_name):
        raise MisconfigurationException(
            f'Field {batch_arg_name} not found in either `model` or `model.hparams`'
        )
    if hasattr(model, batch_arg_name) and hasattr(
            model, "hparams") and batch_arg_name in model.hparams:
        rank_zero_warn(
            f'Field `model.{batch_arg_name}` and `model.hparams.{batch_arg_name}` are mutually exclusive!'
            f' `model.{batch_arg_name}` will be used as the initial batch size for scaling.'
            f' If this is not the intended behavior, please remove either one.'
        )

    if hasattr(model.train_dataloader, 'patch_loader_code'):
        raise MisconfigurationException(
            'The batch scaling feature cannot be used with dataloaders'
            ' passed directly to `.fit()`. Please disable the feature or'
            ' incorporate the dataloader into the model.')

    # Arguments we adjust during the batch size finder, save for restoring
    __scale_batch_dump_params(trainer)

    # Set to values that are required by the algorithm
    __scale_batch_reset_params(trainer, model, steps_per_trial)

    # Save initial model, that is loaded after batch size is found
    save_path = os.path.join(trainer.default_root_dir,
                             'scale_batch_size_temp_model.ckpt')
    trainer.save_checkpoint(str(save_path))

    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.disable()

    # Initially we just double in size until an OOM is encountered
    new_size = _adjust_batch_size(trainer,
                                  value=init_val)  # initially set to init_val
    if mode == 'power':
        new_size = _run_power_scaling(trainer, model, new_size, batch_arg_name,
                                      max_trials, **fit_kwargs)
    elif mode == 'binsearch':
        new_size = _run_binsearch_scaling(trainer, model, new_size,
                                          batch_arg_name, max_trials,
                                          **fit_kwargs)
    else:
        raise ValueError(
            'mode in method `scale_batch_size` can only be `power` or `binsearch`'
        )

    garbage_collection_cuda()
    log.info(
        f'Finished batch size finder, will continue with full run using batch size {new_size}'
    )

    # Restore initial state of model
    if trainer.is_global_zero:
        trainer.checkpoint_connector.restore(str(save_path),
                                             on_gpu=trainer.on_gpu)
        fs = get_filesystem(str(save_path))
        if fs.exists(save_path):
            fs.rm(save_path)

    # Finish by resetting variables so trainer is ready to fit model
    __scale_batch_restore_params(trainer)
    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.enable()

    return new_size
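
A usage sketch for the function above; `trainer` and `model` are assumed to be an existing Trainer and LightningModule exposing a `batch_size` attribute:

# Usage sketch, not from the original listing.
new_size = scale_batch_size(
    trainer,
    model,
    mode='binsearch',        # double until OOM, then binary-search the boundary
    steps_per_trial=3,       # a few steps per candidate size to surface OOMs
    init_val=2,              # start the search at batch_size=2
    max_trials=25,           # cap on how many sizes are tried
    batch_arg_name='batch_size',
)
print(f'Largest working batch size found: {new_size}')
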
    def select_accelerator(self):
        if self.trainer.accelerator_backend is not None:
            return self.trainer.accelerator_backend

        # ----------------------------------
        # Use the user provided accelerator
        # ----------------------------------
        # use the one the user passed in
        if self.accelerator is not None and isinstance(self.accelerator,
                                                       Accelerator):
            self.accelerator.trainer = self.trainer
            self.accelerator.ddp_plugin = self.trainer.plugin_connector.ddp_plugin
            acc = self.accelerator
            return acc

        # ----------------------------------
        # choose an accelerator for the user
        # ----------------------------------
        use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks

        # torchelastic or general non_slurm ddp
        te_flags_passed = 'WORLD_SIZE' in os.environ and (
            'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
        use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed

        use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_spawn"
        use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_cpu"

        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self._is_using_torchelastic(
        )
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.trainer.is_slurm_managing_tasks

        # ddp script mode uses the same flags as TE
        # TODO: decouple from TE
        if os.environ.get('PL_IN_DDP_SUBPROCESS', False):
            use_torchelastic_ddp = False

        cluster_env = self._select_environment()

        # choose the appropriate accelerator backend
        if self.trainer.use_ddp2:
            accelerator_backend = accelerators.DDP2Accelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_cpu_slurm:
            accelerator_backend = accelerators.DDPCPUHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_slurm_ddp:
            accelerator_backend = accelerators.DDPHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_cpu_torch_elastic:
            accelerator_backend = accelerators.DDPCPUHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_torchelastic_ddp:
            accelerator_backend = accelerators.DDPHPCAccelerator(
                self.trainer, cluster_env,
                self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_spawn:
            accelerator_backend = accelerators.DDPSpawnAccelerator(
                self.trainer,
                nprocs=self.trainer.num_processes,
                cluster_environment=cluster_env,
                ddp_plugin=self.trainer.plugin_connector.ddp_plugin)

        elif use_ddp_cpu_spawn:
            accelerator_backend = accelerators.DDPCPUSpawnAccelerator(
                self.trainer,
                nprocs=self.trainer.num_processes,
                cluster_environment=cluster_env,
                ddp_plugin=self.trainer.plugin_connector.ddp_plugin)

        elif self.trainer.distributed_backend == "ddp":
            accelerator_backend = accelerators.DDPAccelerator(
                self.trainer,
                cluster_env,
                ddp_plugin=self.trainer.plugin_connector.ddp_plugin)

        elif self.trainer.use_dp:
            accelerator_backend = accelerators.DataParallelAccelerator(
                self.trainer, cluster_env)

        elif self.trainer.use_horovod:
            accelerator_backend = accelerators.HorovodAccelerator(
                self.trainer, cluster_env)

        elif self.trainer.use_single_gpu:
            accelerator_backend = accelerators.GPUAccelerator(
                self.trainer, cluster_env)

        elif self.trainer.use_tpu:
            accelerator_backend = accelerators.TPUAccelerator(
                self.trainer, cluster_env)

        elif self.trainer.distributed_backend is None:
            accelerator_backend = accelerators.CPUAccelerator(
                self.trainer, cluster_env)
        else:
            raise MisconfigurationException(
                f'Trainer(accelerator={self.trainer.distributed_backend}) is not a supported backend'
            )

        return accelerator_backend
Example #19
0
    def step(self,
             *args,
             closure: Optional[Callable] = None,
             make_optimizer_step: Optional[bool] = None,
             **kwargs):
        """
        Call this directly from your training_step when doing optimizations manually.
        By using this, we can ensure that all the proper scaling (e.g. when using 16-bit precision) has been done for you.

        .. tip:: In manual mode we still automatically accumulate grad over batches if
           Trainer(accumulate_grad_batches=x) is set.

        Args:

            closure: One can provide their own optimizer closure. Set to None by default.

            make_optimizer_step: Whether to force an optimizer step. When nothing is provided,
                we will use `accumulate_grad_batches` for accumulation frequency by default.
                However, one could provide True or False based on their own scheduling.
                Refer to examples 2 and 3.

            args: Any parameters provided to wrapped optimizer.step()

            kwargs: Any parameters provided to wrapped optimizer.step()

        Example::

            def training_step(...):
                (opt_a, opt_b) = self.optimizers()
                loss_a = ...
                # automatically applies scaling, etc...
                self.manual_backward(loss_a, opt_a)
                opt_a.step()

        Example::

            def training_step(self, batch, batch_idx):
                # using Boring Model
                opt = self.optimizers() # only 1 optimizer

                def compute_loss():
                    x = batch[0]
                    x = F.dropout(x, 0.1)
                    predictions = self(x)
                    predictions = F.dropout(predictions, 0.1)
                    loss = self.loss(None, predictions)
                    return loss

                def closure():
                    # emulate MC dropout training
                    num_backward = 1
                    losses = []
                    for backward_idx in range(num_backward + 1):
                        loss = compute_loss()
                        losses.append(loss)
                        retain_graph = num_backward != backward_idx
                        self.manual_backward(loss, opt, retain_graph=retain_graph)
                    loss_mean = torch.stack(losses).mean()
                    loss_std = torch.stack(losses).std()
                    self.log("train_loss_mean", loss_mean, on_step=True, prog_bar=True, on_epoch=True)
                    self.log("train_loss_std", loss_std, on_step=True, prog_bar=True, on_epoch=True)

                opt.step(closure=closure)

        Example::

            # Scenario for a gan.

            def training_step(self, batch, batch_idx, optimizer_idx):

                # emulate gans training
                opt_gen, opt_dis = self.optimizers()

                # Note: Be careful not to log on the same key with self.log in both closures,
                # as they will be aggregated together on epoch_end

                def gen_closure():
                    ... forward and compute loss for generator
                    loss_gen = ...
                    self.log("loss_gen", loss_gen, on_step=True, on_epoch=True)
                    self.manual_backward(loss_gen, opt_gen)

                def dis_closure():
                    ... forward and compute loss for discriminator
                    loss_dis = ...
                    self.log("loss_dis", loss_dis, on_step=True, on_epoch=True)
                    self.manual_backward(loss_dis, opt_dis)

                # this will accumulate gradients for 2 batches and then call opt_gen.step()
                opt_gen.step(closure=gen_closure, make_optimizer_step=batch_idx % 2 == 0)

                # update discriminator every 4 batches
                # therefore, no gradient accumulation for discriminator
                if batch_idx % 4 == 0:
                    # Note: Set make_optimizer_step to True, or it will default to
                    # Trainer(accumulate_grad_batches=x)
                    opt_dis.step(closure=dis_closure, make_optimizer_step=True)
        """
        profiler_name = f"optimizer_step_and_closure_{self._optimizer_idx}"

        if closure is None:
            closure = do_nothing_closure
        else:
            if not isinstance(closure, types.FunctionType):
                raise MisconfigurationException(
                    "When closure is provided, it should be a function")

        make_optimizer_step = self._check_make_optimizer_step(
            make_optimizer_step)

        if make_optimizer_step:
            self.__optimizer_step(*args,
                                  closure=closure,
                                  profiler_name=profiler_name,
                                  **kwargs)
            self._total_optimizer_step_calls += 1
        else:
            # make sure to call optimizer_closure when accumulating
            with self._trainer.profiler.profile(
                    f"closure_{self._optimizer_idx}"):
                with self._trainer.train_loop.block_ddp_sync_behaviour(True):
                    closure()
Example #20
0
    def run_evaluation(self, test_mode: bool = False):
        # when testing make sure user defined a test step
        if test_mode and not self.is_overriden('test_step'):
            raise MisconfigurationException(
                "You called `.test()` without defining model's `.test_step()`."
                " Please define and try again")

        # Validation/Test begin callbacks
        if test_mode:
            self.on_test_start()
        else:
            self.on_validation_start()

        # hook
        model = self.get_model()
        model.on_pre_performance_check()

        # select dataloaders
        if test_mode:
            if self.test_dataloaders is None:
                self.reset_test_dataloader(model)

            dataloaders = self.test_dataloaders
            max_batches = self.num_test_batches
        else:
            # val
            if self.val_dataloaders is None:
                self.reset_val_dataloader(model)

            dataloaders = self.val_dataloaders
            max_batches = self.num_val_batches

        # cap max batches to 1 when using fast_dev_run
        if self.fast_dev_run:
            max_batches = 1

        # init validation or test progress bar
        # main progress bar will already be closed when testing so initial position is free
        position = 2 * self.process_position + (not test_mode)
        desc = 'Testing' if test_mode else 'Validating'
        total = max_batches if max_batches != float('inf') else None
        pbar = tqdm(desc=desc,
                    total=total,
                    leave=test_mode,
                    position=position,
                    disable=not self.progress_bar_refresh_rate,
                    dynamic_ncols=True,
                    file=sys.stdout)
        setattr(self, f'{"test" if test_mode else "val"}_progress_bar', pbar)

        # run evaluation
        eval_results = self._evaluate(self.model, dataloaders, max_batches,
                                      test_mode)
        _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.process_output(
            eval_results)

        # add metrics to prog bar
        self.add_tqdm_metrics(prog_bar_metrics)

        # log results of test
        if test_mode and self.proc_rank == 0:
            print('-' * 80)
            print('TEST RESULTS')
            pprint(callback_metrics)
            print('-' * 80)

        # log metrics
        self.log_metrics(log_metrics, {})

        # track metrics for callbacks
        self.callback_metrics.update(callback_metrics)

        # hook
        model.on_post_performance_check()

        # add model specific metrics
        if not test_mode:
            self.main_progress_bar.set_postfix(**self.training_tqdm_dict)

        # close progress bar
        if test_mode:
            self.test_progress_bar.close()
        else:
            self.val_progress_bar.close()

        # eventual dataset reloading
        if test_mode:
            if self.reload_dataloaders_every_epoch:
                self.reset_test_dataloader(model)
        else:
            # val
            if self.reload_dataloaders_every_epoch:
                self.reset_val_dataloader(model)

        # Validation/Test end callbacks
        if test_mode:
            self.on_test_end()
        else:
            self.on_validation_end()
Example #21
0
def from_numpy(value, device: torch.device = None):
    if device is None:
        raise MisconfigurationException(
            "device (torch.device) should be provided.")
    return torch.from_numpy(value).to(device)
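
A short usage sketch for the helper above (array contents and device are illustrative):

import numpy as np
import torch

arr = np.arange(6, dtype=np.float32).reshape(2, 3)
tensor = from_numpy(arr, device=torch.device('cpu'))  # omitting `device` raises MisconfigurationException
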
Example #22
0
    def restore_training_state(self, checkpoint):
        """
        Restore trainer state.
        Model will get its chance to update.
        :param checkpoint:
        :return:
        """
        if 'optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint:
            raise KeyError(
                'Trying to restore training state but checkpoint contains only the model.'
                ' This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`.'
            )

        if any([key in checkpoint for key in DEPRECATED_CHECKPOINT_KEYS]):
            raise ValueError(
                "The checkpoint you're attempting to load follows an"
                " outdated schema. You can upgrade to the current schema by running"
                " `python -m pytorch_lightning.utilities.upgrade_checkpoint --file model.ckpt`"
                " where `model.ckpt` is your checkpoint file.")

        # load callback states
        self.trainer.on_load_checkpoint(checkpoint)

        self.trainer.global_step = checkpoint['global_step']
        self.trainer.current_epoch = checkpoint['epoch']

        # crash if max_epochs is lower than the current epoch from the checkpoint
        if self.trainer.current_epoch > self.trainer.max_epochs:
            m = f"""
            You restored a checkpoint with current_epoch={self.trainer.current_epoch},
            but the Trainer was configured with max_epochs={self.trainer.max_epochs}.
            """
            raise MisconfigurationException(m)

        # Division deals with global step stepping once per accumulated batch
        # Inequality deals with different global step for odd vs even num_training_batches
        n_accum = 1 if self.trainer.accumulate_grad_batches is None else self.trainer.accumulate_grad_batches
        expected_steps = self.trainer.num_training_batches / n_accum
        if self.trainer.num_training_batches != 0 and self.trainer.global_step % expected_steps > 1:
            rank_zero_warn(
                "You're resuming from a checkpoint that ended mid-epoch. "
                "This can cause unreliable results if further training is done, "
                "consider using an end of epoch checkpoint. ")

        # restore the optimizers
        optimizer_states = checkpoint['optimizer_states']
        for optimizer, opt_state in zip(self.trainer.optimizers,
                                        optimizer_states):
            optimizer.load_state_dict(opt_state)

            # move optimizer to GPU 1 weight at a time
            # avoids OOM
            if self.trainer.root_gpu is not None:
                for state in optimizer.state.values():
                    for k, v in state.items():
                        if isinstance(v, torch.Tensor):
                            state[k] = v.cuda(self.trainer.root_gpu)

        # restore the lr schedulers
        lr_schedulers = checkpoint['lr_schedulers']
        for scheduler, lrs_state in zip(self.trainer.lr_schedulers,
                                        lr_schedulers):
            scheduler['scheduler'].load_state_dict(lrs_state)
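
A sketch of the minimal checkpoint structure the method above consumes; the tiny model, optimizer, and scheduler only exist to produce illustrative state dicts:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)

checkpoint = {
    'epoch': 3,                                    # restored into trainer.current_epoch
    'global_step': 1200,                           # restored into trainer.global_step
    'optimizer_states': [optimizer.state_dict()],  # one entry per trainer optimizer
    'lr_schedulers': [scheduler.state_dict()],     # one entry per configured scheduler
}
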
Example #23
0
    def _get_dataloader_init_kwargs(
            dataloader: DataLoader,
            sampler: Optional[Sampler],
            mode: Optional[RunningStage] = None) -> Dict[str, Any]:
        if not isinstance(dataloader, DataLoader):
            raise ValueError(
                f"The dataloader {dataloader} needs to subclass `torch.utils.data.DataLoader`"
            )

        # get the dataloader instance attributes
        attrs = {
            k: v
            for k, v in vars(dataloader).items() if not k.startswith("_")
        }
        # not part of `vars`
        attrs["multiprocessing_context"] = dataloader.multiprocessing_context

        # get the dataloader instance `__init__` parameters
        params = dict(inspect.signature(dataloader.__init__).parameters)

        # keep only the params whose default is different to the current attr value
        non_defaults = {
            name
            for name, p in params.items()
            if name in attrs and p.default != attrs[name]
        }
        # add `dataset` as it might have been replaced with `*args`
        non_defaults.add("dataset")

        # kwargs to re-construct the dataloader
        dl_kwargs = {k: v for k, v in attrs.items() if k in non_defaults}
        dl_kwargs.update(
            TrainerDataLoadingMixin._resolve_batch_sampler(dataloader,
                                                           sampler,
                                                           mode=mode))

        required_args = {
            p.name
            for p in params.values()
            if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
            and p.default is p.empty and p.name not in dl_kwargs
        }
        # the dataloader has required args which we could not extract from the existing attributes
        if required_args:
            required_args = sorted(required_args)
            dataloader_cls_name = dataloader.__class__.__name__
            raise MisconfigurationException(
                f"Trying to inject `DistributedSampler` into the `{dataloader_cls_name}` instance. "
                "This would fail as some of the `__init__` arguments are not available as instance attributes. "
                f"The missing attributes are {required_args}. "
                f"HINT: If you wrote the `{dataloader_cls_name}` class, define `self.missing_arg_name` or "
                "manually add the `DistributedSampler` as: "
                f"`{dataloader_cls_name}(dataset, sampler=DistributedSampler(dataset))`."
            )

        has_variadic_kwargs = any(p.kind is p.VAR_KEYWORD
                                  for p in params.values())
        if not has_variadic_kwargs:
            # the dataloader signature does not allow keyword arguments that need to be passed
            missing_kwargs = dl_kwargs.keys() - params.keys()
            if missing_kwargs:
                missing_kwargs = sorted(missing_kwargs)
                dataloader_cls_name = dataloader.__class__.__name__
                raise MisconfigurationException(
                    f"Trying to inject `DistributedSampler` into the `{dataloader_cls_name}` instance. "
                    "This would fail as it doesn't expose all its attributes in the `__init__` signature. "
                    f"The missing arguments are {missing_kwargs}. "
                    f"HINT: If you wrote the `{dataloader_cls_name}` class, add the `__init__` arguments or "
                    "manually add the `DistributedSampler` as: "
                    f"`{dataloader_cls_name}(dataset, sampler=DistributedSampler(dataset))`."
                )

        if isinstance(dl_kwargs["dataset"], IterableDataset):
            dl_kwargs["batch_sampler"] = None
            dl_kwargs["sampler"] = None

        if _fault_tolerant_training():
            if isinstance(dl_kwargs["dataset"], IterableDataset):
                # wrap the `IterableDataset` into a `CaptureIterableDataset` to record sampler states.
                dl_kwargs["dataset"] = CaptureIterableDataset(
                    dataset=dl_kwargs["dataset"])
            elif len(dl_kwargs["dataset"]):
                dl_kwargs["dataset"] = CaptureMapDataset(
                    dataset=dl_kwargs["dataset"])
            else:
                raise MisconfigurationException(
                    "This shouldn't happen, please open an issue on Lightning Github repository."
                )

        return dl_kwargs
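
Per the error hints above, a custom DataLoader is only re-constructible when its extra `__init__` arguments are mirrored as instance attributes; a minimal sketch (class and argument names are illustrative):

from torch.utils.data import DataLoader


class ShardAwareLoader(DataLoader):
    def __init__(self, dataset, shard_hint='auto', **kwargs):
        # expose the extra argument as an attribute so vars(dataloader) can recover it
        self.shard_hint = shard_hint
        super().__init__(dataset, **kwargs)
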
Example #24
0
 def _check_arguments(self, trainer):
     if trainer.amp_backend is not None:
         raise MisconfigurationException(
             'DDPSequentialPlugin is currently not supported in Automatic Mixed Precision'
         )
Example #25
0
 def configure_schedulers(self,
                          schedulers: list,
                          monitor: Optional[str] = None):
     # Convert each scheduler into dict structure with relevant information
     lr_schedulers = []
     default_config = {
         'scheduler': None,
         'name': None,  # no custom name
         'interval': 'epoch',  # after epoch is over
         'frequency': 1,  # every epoch/batch
         'reduce_on_plateau': False,  # most often not ReduceLROnPlateau scheduler
         'monitor': monitor,  # value to monitor for ReduceLROnPlateau
         'strict': True,  # enforce that the monitor exists for ReduceLROnPlateau
     }
     for scheduler in schedulers:
         if isinstance(scheduler, dict):
             # check provided keys
             extra_keys = [
                 k for k in scheduler.keys()
                 if k not in default_config.keys()
             ]
             if extra_keys:
                 rank_zero_warn(
                     f'Found unsupported keys in the lr scheduler dict: {extra_keys}',
                     RuntimeWarning)
             if 'scheduler' not in scheduler:
                 raise MisconfigurationException(
                     'The lr scheduler dict must have the key "scheduler" with its item being an lr scheduler'
                 )
             scheduler['reduce_on_plateau'] = isinstance(
                 scheduler['scheduler'],
                 optim.lr_scheduler.ReduceLROnPlateau)
             if scheduler['reduce_on_plateau'] and scheduler.get(
                     'monitor', None) is None:
                 raise MisconfigurationException(
                     'The lr scheduler dict must include a monitor when a `ReduceLROnPlateau` scheduler is used.'
                     ' For example: {"optimizer": optimizer, "lr_scheduler":'
                     ' {"scheduler": scheduler, "monitor": "your_loss"}}')
             lr_schedulers.append({**default_config, **scheduler})
         elif isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
             if monitor is None:
                 raise MisconfigurationException(
                     '`configure_optimizers` must include a monitor when a `ReduceLROnPlateau` scheduler is used.'
                     ' For example:'
                     ' {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "metric_to_track"}'
                 )
             lr_schedulers.append({
                 **default_config, 'scheduler': scheduler,
                 'reduce_on_plateau': True,
                 'monitor': monitor
             })
         elif isinstance(scheduler, optim.lr_scheduler._LRScheduler):
             lr_schedulers.append({
                 **default_config, 'scheduler': scheduler
             })
         else:
             raise ValueError(
                 f'The provided lr scheduler "{scheduler}" is invalid')
     return lr_schedulers
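
A sketch of a `configure_optimizers` return value that exercises the dict handling above; the `val_loss` metric name and the layer size are illustrative:

from torch import nn, optim
import pytorch_lightning as pl


class PlateauModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(8, 1)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,   # keys below match default_config above
                'interval': 'epoch',
                'frequency': 1,
                'monitor': 'val_loss',    # required for ReduceLROnPlateau
                'strict': True,
            },
        }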