def reset_train_dataloader(self, model: Optional["pl.LightningModule"] = None) -> None:
    """Reset the train dataloader and the bookkeeping values derived from it.

    Requests a fresh training dataloader, swaps a ``RandomSampler`` for a
    ``SequentialSampler`` when overfitting, wraps the loader(s) in a
    ``CombinedLoader`` and finally sets ``num_training_batches`` and
    ``val_check_batch`` on the trainer.

    Args:
        model: The `LightningModule` if calling this outside of the trainer scope.

    Raises:
        MisconfigurationException: If the combined loader has no length and
            ``limit_train_batches`` is a float other than ``0.0``/``1.0``, or
            ``val_check_interval`` is a float other than ``1.0``.
        ValueError: If an integer ``val_check_interval`` is larger than the
            number of training batches.
    """
    self.train_dataloader = self.request_dataloader(RunningStage.TRAINING, model=model)

    if self.overfit_batches > 0:
        if hasattr(self.train_dataloader, "sampler") and isinstance(self.train_dataloader.sampler, RandomSampler):
            rank_zero_warn(
                "You requested to overfit but enabled training dataloader shuffling."
                " We are turning off the training dataloader shuffling for you."
            )
            self.train_dataloader = self.replace_sampler(
                self.train_dataloader, SequentialSampler(self.train_dataloader.dataset), mode=RunningStage.TRAINING
            )

    # debugging
    self.dev_debugger.track_load_dataloader_call("train_dataloader", dataloaders=[self.train_dataloader])

    # automatically add samplers
    self.train_dataloader = apply_to_collection(
        self.train_dataloader, DataLoader, self.auto_add_sampler, shuffle=True, mode=RunningStage.TRAINING
    )

    # check the workers recursively
    apply_to_collection(self.train_dataloader, DataLoader, self._worker_check, "train_dataloader")

    # add worker_init_fn for correct seeding in worker processes
    apply_to_collection(self.train_dataloader, DataLoader, self.auto_add_worker_init_fn)

    # add collate_fn to collect metadata for fault tolerant training
    if _fault_tolerant_training():
        apply_to_collection(self.train_dataloader, DataLoader, self._add_sampler_metadata_collate)

    # wrap the sequence of train loaders to a CombinedLoader object for computing the num_training_batches
    self.train_dataloader = CombinedLoader(self.train_dataloader, self.data_connector.multiple_trainloader_mode)

    self.num_training_batches = len(self.train_dataloader) if has_len(self.train_dataloader) else float("inf")

    # an int limit (or 0.0) is an absolute cap; any other float is a fraction of the epoch
    if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0:
        self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches))
    elif self.num_training_batches != float("inf"):
        self.num_training_batches = int(self.num_training_batches * self.limit_train_batches)
    elif self.limit_train_batches != 1.0:
        # a fraction of an unknown length cannot be computed
        raise MisconfigurationException(
            "When using an IterableDataset for `limit_train_batches`,"
            " `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies"
            " `num_training_batches` to use."
        )

    # determine when to check validation
    # if int passed in, val checks that often
    # otherwise, it checks in [0, 1.0] % range of a training epoch
    if isinstance(self.val_check_interval, int):
        self.val_check_batch = self.val_check_interval
        if self.val_check_batch > self.num_training_batches:
            raise ValueError(
                f"`val_check_interval` ({self.val_check_interval}) must be less than or equal "
                f"to the number of the training batches ({self.num_training_batches}). "
                "If you want to disable validation set `limit_val_batches` to 0.0 instead."
            )
    else:
        if not has_len(self.train_dataloader):
            if self.val_check_interval == 1.0:
                self.val_check_batch = float("inf")
            else:
                raise MisconfigurationException(
                    "When using an IterableDataset for `train_dataloader`,"
                    " `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies"
                    " checking validation every k training batches."
                )
        else:
            self.val_check_batch = int(self.num_training_batches * self.val_check_interval)
            # never let a small fraction round down to "validate every 0 batches"
            self.val_check_batch = max(1, self.val_check_batch)

    if self.logger and self.num_training_batches < self.log_every_n_steps:
        # the compared quantity is a batch count, so the message says "batches"
        rank_zero_warn(
            f"The number of training batches ({self.num_training_batches}) is smaller than the logging interval"
            f" Trainer(log_every_n_steps={self.log_every_n_steps}). Set a lower value for log_every_n_steps if"
            " you want to see logs for the training epoch."
        )
def _reset_eval_dataloader(
    self, mode: RunningStage, model: Optional["pl.LightningModule"] = None
) -> Tuple[List[Union[int, float]], List[DataLoader]]:
    """Generic method to reset a dataloader for evaluation.

    Args:
        mode: The running stage of the ``Trainer``
        model: The ``LightningModule`` if calling this outside of the trainer scope.

    Returns:
        Tuple (num_batches, dataloaders). ``dataloaders`` excludes any ``None``
        entries and ``num_batches`` holds one entry per remaining loader.

    Raises:
        MisconfigurationException: If a loader has no length and the matching
            ``limit_*_batches`` flag is a float other than ``0.0``/``1.0``, or if a
            fractional limit leaves zero batches for a loader.
    """
    assert mode.evaluating or mode == RunningStage.PREDICTING

    prefix = mode.dataloader_prefix

    # always get the loaders first so we can count how many there are
    loader_name = f"{prefix}_dataloader"
    dataloaders = self.request_dataloader(mode, model=model)

    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    # when overfitting, use the training loader as val and test
    # duplicate it the numb of times needed to match the train loaders
    if self.overfit_batches > 0:
        train_dataloader = self.request_dataloader(RunningStage.TRAINING, model=model)
        dataloaders = [deepcopy(train_dataloader) for _ in range(len(dataloaders))]

    self.dev_debugger.track_load_dataloader_call(loader_name, dataloaders=dataloaders)

    for loader_i, loader in enumerate(dataloaders):
        if not (hasattr(loader, "sampler") and isinstance(loader.sampler, RandomSampler)):
            continue

        # when overfitting, the dataloader should not have sampler
        if self.overfit_batches > 0 and mode.evaluating:
            rank_zero_warn(
                "You requested to overfit but enabled val/test dataloader shuffling."
                " We are turning it off for you."
            )
            dataloaders[loader_i] = self.replace_sampler(loader, SequentialSampler(loader.dataset), mode=mode)
        else:
            rank_zero_warn(
                f"Your `{prefix}_dataloader` has `shuffle=True`,"
                " it is strongly recommended that you turn this off for val/test/predict dataloaders."
            )

    if any(dl is None for dl in dataloaders):
        rank_zero_warn("One of given dataloaders is None and it will be skipped.")

    # add samplers
    dataloaders = [self.auto_add_sampler(dl, False, mode=mode) for dl in dataloaders if dl is not None]

    # add worker_init_fn for correct seeding in worker processes
    apply_to_collection(dataloaders, dtype=DataLoader, function=self.auto_add_worker_init_fn)

    loader_num_batches = []

    # determine number of batches
    # datasets could be none, 1 or 2+
    if dataloaders:
        # percent or num_steps; the flag does not depend on the individual loader
        limit_eval_batches = getattr(self, f"limit_{prefix}_batches")

        for i, dataloader in enumerate(dataloaders):
            # keep the unlimited length around so error messages can report it
            orig_num_batches = num_batches = len(dataloader) if has_len(dataloader) else float("inf")
            self._worker_check(dataloader, f"{prefix}_dataloader {i}")

            # limit num batches either as a percent or num steps
            if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0:
                num_batches = min(num_batches, int(limit_eval_batches))
            elif num_batches != float("inf"):
                num_batches = int(num_batches * limit_eval_batches)
            elif limit_eval_batches != 1.0:
                raise MisconfigurationException(
                    f"When using an IterableDataset for `limit_{prefix}_batches`,"
                    f" `Trainer(limit_{prefix}_batches)` must be `0.0`, `1.0` or an int. An int k"
                    f" specifies `num_{prefix}_batches` to use."
                )

            if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float):
                min_pct = 1.0 / len(dataloader)
                raise MisconfigurationException(
                    f"you requested to check {limit_eval_batches} of the `{prefix}_dataloader` but"
                    f" {limit_eval_batches}*{orig_num_batches} < 1. Please increase the"
                    f" `limit_{prefix}_batches` flag. Try at least"
                    f" `limit_{prefix}_batches={min_pct}`"
                )

            loader_num_batches.append(num_batches)

    return loader_num_batches, dataloaders
def scale_batch_size(self,
                     model: LightningModule,
                     mode: str = 'power',
                     steps_per_trial: int = 3,
                     init_val: int = 2,
                     max_trials: int = 25,
                     batch_arg_name: str = 'batch_size'):
    r"""
    Will iteratively try to find the largest batch size for a given model
    that does not give an out of memory (OOM) error.

    Args:
        model: Model to fit.

        mode: string setting the search mode. Either `power` or `binsearch`.
            If mode is `power` we keep multiplying the batch size by 2, until
            we get an OOM error. If mode is 'binsearch', we will initially
            also keep multiplying by 2 and after encountering an OOM error
            do a binary search between the last successful batch size and the
            batch size that failed.

        steps_per_trial: number of steps to run with a given batch size.
            Ideally 1 should be enough to test if an OOM error occurs,
            however in practice a few are needed

        init_val: initial batch size to start the search with

        max_trials: max number of increase in batch size done before
           algorithm is terminated

        batch_arg_name: name of the attribute on ``model`` that must exist for
            the search to run; it is forwarded to the scaling routine.

    Returns:
        The batch size produced by the selected scaling routine.

    Raises:
        MisconfigurationException: If ``model`` has no ``batch_arg_name``
            attribute, or its ``train_dataloader`` carries ``patch_loader_code``.
        ValueError: If ``mode`` is neither ``'power'`` nor ``'binsearch'``.
    """
    if not hasattr(model, batch_arg_name):
        raise MisconfigurationException(f'Field {batch_arg_name} not found in `model.hparams`')

    if hasattr(model.train_dataloader, 'patch_loader_code'):
        raise MisconfigurationException('The batch scaling feature cannot be used with dataloaders'
                                        ' passed directly to `.fit()`. Please disable the feature or'
                                        ' incorporate the dataloader into the model.')

    # Reject an unknown mode before any trainer state is modified: failing later
    # would leave the dumped parameters unrestored, the progress bar disabled and
    # the temporary checkpoint on disk.
    if mode not in ('power', 'binsearch'):
        raise ValueError('mode in method `scale_batch_size` can only be `power` or `binsearch')

    # Arguments we adjust during the batch size finder, save for restoring
    self.__scale_batch_dump_params()

    # Set to values that are required by the algorithm
    self.__scale_batch_reset_params(model, steps_per_trial)

    # Save initial model, that is loaded after batch size is found
    save_path = os.path.join(self.default_root_dir, 'temp_model.ckpt')
    self.save_checkpoint(str(save_path))

    if self.progress_bar_callback:
        self.progress_bar_callback.disable()

    # Initially we just double in size until an OOM is encountered
    new_size = _adjust_batch_size(self, value=init_val)  # initially set to init_val
    if mode == 'power':
        new_size = _run_power_scaling(self, model, new_size, batch_arg_name, max_trials)
    else:  # 'binsearch', guaranteed by the validation above
        new_size = _run_binsearch_scaling(self, model, new_size, batch_arg_name, max_trials)

    garbage_collection_cuda()
    log.info(f'Finished batch size finder, will continue with full run using batch size {new_size}')

    # Restore initial state of model
    self.restore(str(save_path), on_gpu=self.on_gpu)
    os.remove(save_path)

    # Finish by resetting variables so trainer is ready to fit model
    self.__scale_batch_restore_params()
    if self.progress_bar_callback:
        self.progress_bar_callback.enable()

    return new_size
def _validate_scheduler_optimizer(optimizers, lr_schedulers):
    """Check that every scheduler is bound to one of the given optimizers.

    Args:
        optimizers: Collection of optimizers to test membership against.
        lr_schedulers: Iterable of dicts; each dict's ``'scheduler'`` entry
            must expose an ``optimizer`` attribute.

    Raises:
        MisconfigurationException: As soon as a scheduler is found whose
            optimizer is not contained in ``optimizers``.
    """
    for scheduler_conf in lr_schedulers:
        if scheduler_conf['scheduler'].optimizer in optimizers:
            continue
        raise MisconfigurationException(
            "Some schedulers are attatched with an optimizer that wasn't returned from `configure_optimizers`."
        )
def set_distributed_mode(self):
    """Translate ``trainer.distributed_backend`` and the device counts into ``use_*`` flags.

    All ``use_*`` flags on the trainer are first cleared. When no backend was
    requested one is inferred from horovodrun detection and the GPU / node /
    process counts; afterwards the flags for the (possibly inferred) backend are
    set and the availability of GPUs and TPUs is reported.

    Raises:
        MisconfigurationException: If ``num_nodes > 1`` but neither ``use_ddp``
            nor ``use_ddp2`` ended up enabled.
    """
    trainer = self.trainer

    trainer.use_dp = False
    trainer.use_ddp = False
    trainer.use_ddp2 = False
    trainer.use_horovod = False
    trainer.use_single_gpu = False

    # no backend requested: infer one from the available resources
    if trainer.distributed_backend is None:
        if self.has_horovodrun():
            self._set_horovod_backend()
        elif trainer.num_gpus == 0:
            if trainer.num_nodes > 1 or trainer.num_processes > 1:
                trainer.use_ddp = True  # ddp_cpu
        elif trainer.num_gpus == 1:
            trainer.use_single_gpu = True
        elif trainer.num_gpus > 1:
            rank_zero_warn(
                'You requested multiple GPUs but did not specify a backend, e.g.'
                ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`.'
                ' Setting `accelerator="ddp_spawn"` for you.'
            )
            # picked up by the dispatch below
            trainer.distributed_backend = "ddp_spawn"

    if trainer.distributed_backend == "dp":
        # do nothing if num_gpus == 0
        if trainer.num_gpus == 1:
            trainer.use_single_gpu = True
            trainer.use_dp = True
        elif trainer.num_gpus > 1:
            trainer.use_dp = True

    elif trainer.distributed_backend in ("ddp", "ddp_spawn"):
        if trainer.num_gpus == 0:
            if trainer.num_nodes > 1 or trainer.num_processes > 1:
                trainer.use_ddp = True  # ddp_cpu
        elif trainer.num_gpus == 1:
            trainer.use_single_gpu = True
            trainer.use_ddp = True
        elif trainer.num_gpus > 1:
            trainer.use_ddp = True
            trainer.num_processes = trainer.num_gpus

    elif trainer.distributed_backend == "ddp2":
        # do nothing if num_gpus == 0
        if trainer.num_gpus >= 1:
            trainer.use_ddp2 = True

    elif trainer.distributed_backend == "ddp_cpu":
        if trainer.num_gpus > 0:
            rank_zero_warn(
                'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.'
            )
        trainer.use_ddp = True
        trainer.data_parallel_device_ids = None
        trainer.on_gpu = False
        trainer.on_cpu = True

    elif trainer.distributed_backend == "horovod":
        self._set_horovod_backend()

    # throw error to force user ddp or ddp2 choice
    if trainer.num_nodes > 1 and not (trainer.use_ddp2 or trainer.use_ddp):
        # nothing is switched automatically here, so the message must not claim it is
        raise MisconfigurationException(
            'DataParallel does not support num_nodes > 1. '
            'Set `accelerator="ddp"` or `accelerator="ddp2"` to train on more than one node.'
        )

    rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {trainer.on_gpu}')
    num_cores = trainer.tpu_cores if trainer.tpu_cores is not None else 0
    rank_zero_info(f'TPU available: {TPU_AVAILABLE}, using: {num_cores} TPU cores')

    if torch.cuda.is_available() and not trainer.on_gpu:
        rank_zero_warn('GPU available but not used. Set the --gpus flag when calling the script.')
def setup_distributed(self) -> None:
    """Delegate to the parent ``setup_distributed`` after checking ``on_gpu``.

    Raises:
        MisconfigurationException: If ``self.on_gpu`` is falsy.
    """
    gpu_in_use = self.on_gpu
    if gpu_in_use:
        super().setup_distributed()
        return
    raise MisconfigurationException(
        "You selected accelerator to be `ddp_fully_sharded`, but GPU is not available."
    )
def _reset_eval_dataloader(
    self,
    model: LightningModule,
    mode: str,
) -> Tuple[List[Union[int, float]], List[DataLoader]]:
    """Generic method to reset a dataloader for evaluation.

    Args:
        model: The current `LightningModule`
        mode: Prefix used to look up ``{mode}_dataloader`` on the model and
            ``limit_{mode}_batches`` on the trainer, e.g. `'val'` or `'test'`

    Returns:
        Tuple (num_batches, dataloaders). ``dataloaders`` excludes any ``None``
        entries and ``num_batches`` holds one entry per remaining loader.

    Raises:
        MisconfigurationException: If a loader has no length and
            ``limit_{mode}_batches`` is a float other than ``0.0``/``1.0``, or if a
            fractional limit leaves zero batches for a loader.
    """
    # always get the loaders first so we can count how many there are
    loader_name = f'{mode}_dataloader'
    dataloaders = self.request_dataloader(getattr(model, loader_name))

    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    # when overfitting use the training loader as val and test
    # duplicate it the numb of times needed to match the train loaders
    if self.overfit_batches > 0:
        num_loaders = len(dataloaders)
        train_dataloader = self.request_dataloader(getattr(model, 'train_dataloader'))
        dataloaders = [deepcopy(train_dataloader) for _ in range(num_loaders)]

    self.dev_debugger.track_load_dataloader_call(loader_name, dataloaders=dataloaders)

    # shuffling in val and test set is bad practice
    is_eval_mode = mode in ('val', 'test', 'predict')

    for loader_i, loader in enumerate(dataloaders):
        if not (is_eval_mode and hasattr(loader, 'sampler') and isinstance(loader.sampler, RandomSampler)):
            continue

        # when overfitting, the dataloader should not have sampler
        if self.overfit_batches > 0:
            rank_zero_warn(
                'You requested to overfit but enabled test/val dataloader shuffling.'
                ' We are turning it off for you.'
            )
            dataloaders[loader_i] = self.replace_sampler(loader, SequentialSampler(loader.dataset))
        else:
            rank_zero_warn(
                f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'
                ' this off for validation and test dataloaders.'
            )

    if any(dl is None for dl in dataloaders):
        rank_zero_warn("One of given dataloaders is None and it will be skipped.")

    # add samplers
    dataloaders = [self.auto_add_sampler(dl, shuffle=False) for dl in dataloaders if dl is not None]

    loader_num_batches = []

    # determine number of batches
    # datasets could be none, 1 or 2+
    if dataloaders:
        # percent or num_steps; the flag does not depend on the individual loader
        limit_eval_batches = getattr(self, f'limit_{mode}_batches')

        for i, dataloader in enumerate(dataloaders):
            # keep the unlimited length around so error messages can report it
            orig_num_batches = num_batches = len(dataloader) if has_len(dataloader) else float('inf')
            self._worker_check(dataloader, f'{mode} dataloader {i}')

            # limit num batches either as a percent or num steps
            if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0:
                num_batches = min(num_batches, int(limit_eval_batches))
            elif num_batches != float('inf'):
                num_batches = int(num_batches * limit_eval_batches)
            elif limit_eval_batches != 1.0:
                # every fragment needs the f-prefix, otherwise `{mode}` is printed verbatim
                raise MisconfigurationException(
                    f'When using an IterableDataset for `limit_{mode}_batches`,'
                    f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. An int k specifies'
                    f' `num_{mode}_batches` to use.'
                )

            if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float):
                min_pct = 1.0 / len(dataloader)
                raise MisconfigurationException(
                    f'you requested to check {limit_eval_batches} of the {mode} dataloader but'
                    f' {limit_eval_batches}*{orig_num_batches} < 1. Please increase the limit_{mode}_batches.'
                    f' Try at least limit_{mode}_batches={min_pct}'
                )

            loader_num_batches.append(num_batches)

    return loader_num_batches, dataloaders
def setup_training(self, model: LightningModule):
    """Prepare trainer and model right before the training routine starts.

    Wires the trainer into the (unwrapped) model, sets up the AMP scaler and
    logger, synchronises processes, fires the pre-train routine hooks, prints
    the model summary and restores weights through the checkpoint connector.

    Args:
        model: The model about to be trained; when ``trainer.data_parallel`` is
            set, ``model.module`` is used as the reference model.

    Raises:
        MisconfigurationException: If a summary is requested with a
            ``weights_summary`` value that is not in ``ModelSummary.MODES``.
    """
    trainer = self.trainer

    # --------------------------
    # Setup
    # --------------------------
    # unwrap the model when it is held inside a data-parallel wrapper
    ref_model = model.module if trainer.data_parallel else model

    # set the ranks and devices
    dist = trainer.accelerator_backend.dist
    dist.rank = trainer.global_rank
    dist.device = ref_model.device

    # give model convenience properties
    ref_model.trainer = trainer

    # set local properties on the model
    trainer.model_connector.copy_trainer_model_properties(ref_model)

    # init amp. Must be done here instead of __init__ to allow ddp to work
    if trainer.amp_backend == AMPType.NATIVE and trainer.precision == 16 and not trainer.use_tpu:
        trainer.scaler = trainer.precision_connector.backend.scaler

    # log hyper-parameters
    if trainer.logger is not None:
        # save exp to get started (this is where the first experiment logs are written)
        trainer.logger.log_hyperparams(ref_model.hparams_initial)
        trainer.logger.log_graph(ref_model)
        trainer.logger.save()

    # wait for all to join if on distributed
    trainer.accelerator_backend.barrier("setup_training")

    # register auto-resubmit when on SLURM
    trainer.slurm_connector.register_slurm_signal_handlers()

    # --------------------------
    # Pre-train
    # --------------------------
    # on pretrain routine start
    trainer.on_pretrain_routine_start(ref_model)
    if trainer.is_function_implemented("on_pretrain_routine_start"):
        ref_model.on_pretrain_routine_start()

    # print model summary
    if trainer.is_global_zero and trainer.weights_summary is not None and not trainer.testing:
        if trainer.weights_summary not in ModelSummary.MODES:
            allowed_modes = ", ".join(ModelSummary.MODES)
            raise MisconfigurationException("weights_summary can be None, " + allowed_modes)
        ref_model.summarize(mode=trainer.weights_summary)

    # track model now.
    # if cluster resets state, the model will update with the saved weights
    trainer.model = model

    # restore training and model before hpc is called
    trainer.checkpoint_connector.restore_weights(model)

    # on pretrain routine end
    trainer.on_pretrain_routine_end(ref_model)
    if trainer.is_function_implemented("on_pretrain_routine_end"):
        ref_model.on_pretrain_routine_end()
def _evaluate(
    self,
    model: LightningModule,
    dataloaders: List[DataLoader],
    max_batches: Union[int, List[int]],
    test_mode: bool = False
):
    """Run the evaluation loop over every dataloader.

    Args:
        model: The model to evaluate.
        dataloaders: A list of PyTorch dataloaders.
        max_batches: An integer or list of integers with length of the number of dataloaders. Each
            entry is the number of batches to process in the corresponding dataloader.
        test_mode: Selects the test hooks (``on_test_batch_*``, ``test_step_end``) when true,
            the validation hooks otherwise.

    Returns:
        Whatever the epoch-end routine produces for the collected step outputs.

    Raises:
        MisconfigurationException: If a step returns a ``Result`` that is not an ``EvalResult``.
    """
    # put the model into eval mode with cleared gradients
    model.zero_grad()
    model.eval()

    # copy properties for forward overrides
    self.copy_trainer_model_properties(model)

    # disable gradients to save memory
    torch.set_grad_enabled(False)

    # one limit per dataloader
    if isinstance(max_batches, int):
        max_batches = [max_batches] * len(dataloaders)

    # --------------------------
    # ON_EVAL_EPOCH_START hook
    # --------------------------
    self.__call_eval_loop_hook_start(test_mode)

    step_end_name = 'test_step_end' if test_mode else 'validation_step_end'

    # run validation; `outputs` holds one list of step outputs per dataloader
    outputs = []
    for dataloader_idx, dataloader in enumerate(dataloaders):
        per_loader_outputs = []

        # on TPU we have to wrap it under the ParallelLoader
        if self.use_tpu:
            device = xm.xla_device(self.tpu_id)
            dataloader = xla_pl.ParallelLoader(dataloader, [device])
            dataloader = dataloader.per_device_loader(device)

        # each dataloader has a max num batches
        batch_limit = max_batches[dataloader_idx]

        for batch_idx, batch in enumerate(dataloader):
            if batch is None:
                continue

            # stop short when on fast_dev_run (sets max_batch=1)
            if batch_idx >= batch_limit:
                break

            # callbacks
            if test_mode:
                self.on_test_batch_start()
            else:
                self.on_validation_batch_start()

            # -----------------
            # RUN EVALUATION STEP
            # -----------------
            forward_args = (model, batch, batch_idx, dataloader_idx, test_mode)
            if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu:
                with torch.cuda.amp.autocast():
                    output = self.evaluation_forward(*forward_args)
            else:
                output = self.evaluation_forward(*forward_args)

            # allow only EvalResult when using structured results (from val_step)
            if isinstance(output, Result) and not isinstance(output, EvalResult):
                raise MisconfigurationException('only EvalResults or dicts are allowed from validation_step')

            # on dp / ddp2 might still want to do something with the batch parts
            if self.is_overridden(step_end_name):
                model_ref = self.get_model()
                with self.profiler.profile(step_end_name):
                    output = getattr(model_ref, step_end_name)(output)

            if test_mode:
                self.on_test_batch_end()
            else:
                self.on_validation_batch_end()

            # track outputs for collation
            if output is not None:
                per_loader_outputs.append(output)

            self.__eval_add_step_metrics(output)

        outputs.append(per_loader_outputs)

    # ---------------------
    # EVAL_EPOCH_END
    # ---------------------
    using_eval_result = len(outputs) > 0 and len(outputs[0]) > 0 and isinstance(outputs[0][0], EvalResult)
    eval_results = self.__run_eval_epoch_end(test_mode, outputs, dataloaders, using_eval_result)

    # log callback metrics
    self.__update_callback_metrics(eval_results, using_eval_result)

    # enable train mode again
    model.train()

    # turn gradient tracking back on for training
    torch.set_grad_enabled(True)

    # --------------------------
    # ON_EVAL_EPOCH_END hook
    # --------------------------
    self.__call_eval_loop_hook_end(test_mode)

    return eval_results
def reset_train_dataloader(self, model: LightningModule) -> None:
    """Rebuild the training dataloader and recompute the values that depend on it.

    Sets ``train_dataloader`` (wrapped in a ``CombinedLoader``),
    ``num_training_batches`` and ``val_check_batch`` on the trainer.

    Args:
        model: The current `LightningModule`

    Raises:
        MisconfigurationException: If the loader has no length and
            ``limit_train_batches`` is a float other than ``0.0``/``1.0``, or
            ``val_check_interval`` is a float other than ``1.0``.
        ValueError: If an integer ``val_check_interval`` exceeds the number of
            training batches.
    """
    self.train_dataloader = self.request_dataloader(model.train_dataloader)

    # overfitting must always see the same batches, so shuffling is removed
    overfitting = self.overfit_batches > 0
    if overfitting and hasattr(self.train_dataloader, 'sampler') \
            and isinstance(self.train_dataloader.sampler, RandomSampler):
        rank_zero_warn(
            'You requested to overfit but enabled training dataloader shuffling.'
            ' We are turning it off for you.'
        )
        sequential = SequentialSampler(self.train_dataloader.dataset)
        self.train_dataloader = self.replace_sampler(self.train_dataloader, sequential)

    # debugging
    self.dev_debugger.track_load_dataloader_call('train_dataloader', dataloaders=[self.train_dataloader])

    # automatically add samplers
    self.train_dataloader = apply_to_collection(
        self.train_dataloader, DataLoader, self.auto_add_sampler, shuffle=True
    )

    # check the workers recursively
    apply_to_collection(self.train_dataloader, DataLoader, self._worker_check, 'train dataloader')

    # wrap the sequence of train loaders to a CombinedLoader object for computing the num_training_batches
    self.train_dataloader = CombinedLoader(self.train_dataloader, self._multiple_trainloader_mode)

    if has_len(self.train_dataloader):
        self.num_training_batches = len(self.train_dataloader)
    else:
        self.num_training_batches = float('inf')

    # an int limit (or 0.0) caps the count, any other float scales it
    if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0:
        self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches))
    elif self.num_training_batches != float('inf'):
        self.num_training_batches = int(self.num_training_batches * self.limit_train_batches)
    elif self.limit_train_batches != 1.0:
        raise MisconfigurationException(
            'When using an IterableDataset for `limit_train_batches`,'
            ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies'
            ' `num_training_batches` to use.'
        )

    # determine when to check validation
    # an int interval means "every k batches", a float is a fraction of the epoch
    if isinstance(self.val_check_interval, int):
        self.val_check_batch = self.val_check_interval
        if self.val_check_batch > self.num_training_batches:
            raise ValueError(
                f'`val_check_interval` ({self.val_check_interval}) must be less than or equal '
                f'to the number of the training batches ({self.num_training_batches}). '
                'If you want to disable validation set `limit_val_batches` to 0.0 instead.'
            )
    elif has_len(self.train_dataloader):
        self.val_check_batch = int(self.num_training_batches * self.val_check_interval)
        self.val_check_batch = max(1, self.val_check_batch)
    elif self.val_check_interval == 1.0:
        self.val_check_batch = float('inf')
    else:
        raise MisconfigurationException(
            'When using an IterableDataset for `train_dataloader`,'
            ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies'
            ' checking validation every k training batches.'
        )
def _call_children_scripts(self):
    """Re-launch the current script once for each additional local process.

    Exports the ``MASTER_ADDR``/``MASTER_PORT``/``NODE_RANK``/``LOCAL_RANK``/
    ``WORLD_SIZE`` environment variables, rebuilds the command line that started
    this process and starts one ``subprocess.Popen`` per local rank in
    ``1 .. num_processes - 1``. The handles are stored in
    ``self.interactive_ddp_procs``.

    Raises:
        MisconfigurationException: If ``self.parallel_devices`` is ``None``.
    """
    # bookkeeping of spawned processes
    self._check_can_spawn_children()

    # DDP Environment variables
    os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()
    os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())

    # allow the user to pass the node rank
    os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())
    os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())

    # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c`
    # See https://docs.python.org/3/reference/import.html#main-spec
    if __main__.__spec__ is None:  # pragma: no-cover
        # Script called as `python a/b/c.py`
        # when user is using hydra find the absolute path
        path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path

        # work on a copy so the process-wide `sys.argv` is not rewritten in place
        script_args = list(sys.argv)
        try:
            full_path = path_lib(script_args[0])
        except Exception:
            # best effort: fall back to the plain absolute path
            full_path = os.path.abspath(script_args[0])

        script_args[0] = full_path
        # use the same python interpreter that is actually running
        command = [sys.executable] + script_args
    else:  # Script called as `python -m a.b.c`
        command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:]

    # the visible devices tell us how many GPUs we want to use.
    # when the trainer script was called the device has already been scoped by the time
    # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
    # but forward the GPUs selected via environment variables
    if self.parallel_devices is None:
        raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)")

    os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}"

    self.interactive_ddp_procs = []

    for local_rank in range(1, self.num_processes):
        env_copy = os.environ.copy()
        env_copy["LOCAL_RANK"] = f"{local_rank}"

        # remove env var if global seed not set
        if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
            del env_copy["PL_GLOBAL_SEED"]

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        # build the command per child so overrides of earlier ranks do not pile up
        child_command = command
        if _HYDRA_AVAILABLE and HydraConfig.initialized():
            cwd = get_original_cwd()
            os_cwd = f'"{os.getcwd()}"'
            child_command = command + [
                f"hydra.run.dir={os_cwd}",
                f"hydra.job.name=train_ddp_process_{local_rank}",
            ]
        proc = subprocess.Popen(child_command, env=env_copy, cwd=cwd)
        self.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues
        # with dataloaders; delay between 1-5 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)
def lr_find(self,
            model: LightningModule,
            train_dataloader: Optional[DataLoader] = None,
            val_dataloaders: Optional[DataLoader] = None,
            min_lr: float = 1e-8,
            max_lr: float = 1,
            num_training: int = 100,
            mode: str = 'exponential',
            early_stop_threshold: float = 4.0,
            num_accumulation_steps=None):
    r"""
    Run a learning rate range test to help pick a good initial learning rate.

    The trainer is temporarily reconfigured (callbacks, logger, ``max_steps``,
    checkpointing and early stopping), a short fit is run, and afterwards the
    model is restored from a temporary checkpoint and the trainer parameters
    are restored.

    Args:
        model: Model to do range testing for

        train_dataloader: A PyTorch DataLoader with training samples; forwarded
            to ``fit``.

        val_dataloaders: Validation dataloader(s); forwarded to ``fit``.

        min_lr: minimum learning rate to investigate

        max_lr: maximum learning rate to investigate

        num_training: number of learning rates to test

        mode: search strategy, either 'linear' or 'exponential'. If set to
            'linear' the learning rate will be searched by linearly increasing
            after each batch. If set to 'exponential', will increase learning
            rate exponentially.

        early_stop_threshold: threshold for stopping the search. If the
            loss at any point is larger than early_stop_threshold*best_loss
            then the search is stopped. To disable, set to None.

        num_accumulation_steps: deprecated, number of batches to calculate loss over.
            Set trainer argument ``accumulate_grad_batches`` instead.

    Returns:
        The ``_LRFinder`` object holding the recorded learning rates and losses.

    Raises:
        MisconfigurationException: If ``init_optimizers`` does not yield exactly
            one optimizer.

    Example::

        # Setup model and trainer
        model = MyModelClass(hparams)
        trainer = pl.Trainer()

        # Run lr finder
        lr_finder = trainer.lr_find(model, ...)

        # Inspect results
        fig = lr_finder.plot(); fig.show()
        suggested_lr = lr_finder.suggestion()

        # Overwrite lr and create new model
        hparams.lr = suggested_lr
        model = MyModelClass(hparams)

        # Ready to train with new learning rate
        trainer.fit(model)

    """
    if num_accumulation_steps is not None:
        rank_zero_warn("Argument `num_accumulation_steps` has been deprepecated"
                       " since v0.7.6 and will be removed in 0.9. Please"
                       " set trainer argument `accumulate_grad_batches` instead.",
                       DeprecationWarning)

    checkpoint_path = os.path.join(self.default_root_dir, 'lr_find_temp.ckpt')

    # remember the trainer settings that are overwritten below
    self.__lr_finder_dump_params(model)

    # Prevent going into infinite loop
    self.auto_lr_find = False

    # Initialize lr finder object (stores results)
    lr_finder = _LRFinder(mode, min_lr, max_lr, num_training)

    # Use special lr logger callback
    self.callbacks = [
        _LRCallback(num_training, early_stop_threshold, progress_bar_refresh_rate=1),
    ]

    # No logging
    self.logger = DummyLogger()

    # Max step set to number of iterations
    self.max_steps = num_training

    # Disable standard progress bar for fit
    if self.progress_bar_callback:
        self.progress_bar_callback.disable()

    # Disable standard checkpoint & early stopping
    self.checkpoint_callback = False
    self.early_stop_callback = None
    self.enable_early_stop = False

    # Required for saving the model
    self.optimizers = []
    self.schedulers = []
    self.model = model

    # Dump model checkpoint
    self.save_checkpoint(str(checkpoint_path))

    # Configure optimizer and scheduler
    found_optimizers, _schedulers, _frequencies = self.init_optimizers(model)

    if len(found_optimizers) != 1:
        raise MisconfigurationException(
            f'`model.configure_optimizers()` returned {len(found_optimizers)}, but'
            ' learning rate finder only works with single optimizer')

    # make `fit` pick up the optimizer prepared by the finder
    model.configure_optimizers = lr_finder._get_new_optimizer(found_optimizers[0])

    # Fit, lr & loss logged in callback
    self.fit(model,
             train_dataloader=train_dataloader,
             val_dataloaders=val_dataloaders)

    # Prompt if we stopped early
    if self.global_step != num_training:
        log.info('LR finder stopped early due to diverging loss.')

    # Transfer results from callback to lr finder object
    lr_callback = self.callbacks[0]
    lr_finder.results.update({'lr': lr_callback.lrs, 'loss': lr_callback.losses})
    lr_finder._total_batch_idx = self.total_batch_idx  # for debug purpose

    # Reset model state
    self.restore(str(checkpoint_path), on_gpu=self.on_gpu)
    os.remove(checkpoint_path)

    # Finish by resetting variables so trainer is ready to fit model
    self.__lr_finder_restore_params(model)
    if self.progress_bar_callback:
        self.progress_bar_callback.enable()

    return lr_finder
def init_optimizers(self, model: LightningModule) -> Tuple[List, List, List]:
    """Normalise the output of ``model.configure_optimizers()`` into three lists.

    The following shapes of the returned configuration are recognised, checked
    in this order:

    * a single ``Optimizer`` instance
    * a two-element list/tuple whose first element is a list:
      ``(optimizers, schedulers)``
    * a dict with an ``"optimizer"`` key and optional ``"lr_scheduler"`` and
      ``"monitor"`` keys
    * a list/tuple made only of such dicts, each optionally carrying a
      ``"frequency"`` key
    * any other list/tuple, taken as a plain sequence of optimizers

    If the model returns ``None`` a warning is emitted and ``_MockOptimizer()``
    is used in its place.

    Args:
        model: object whose ``configure_optimizers()`` method is called.

    Returns:
        ``(optimizers, lr_schedulers, optimizer_frequencies)``; the schedulers
        are the result of ``self.configure_schedulers``.

    Raises:
        ValueError: if frequencies are given for some but not all optimizers.
        MisconfigurationException: if the configuration has an unknown shape.
    """
    conf = model.configure_optimizers()
    if conf is None:
        rank_zero_warn(
            '`LightningModule.configure_optimizers` returned `None`, this fit will run with no optimizer',
            UserWarning,
        )
        conf = _MockOptimizer()

    optimizers, lr_schedulers, optimizer_frequencies = [], [], []
    monitor = None
    is_sequence = isinstance(conf, (list, tuple))

    if isinstance(conf, Optimizer):
        # a lone optimizer
        optimizers = [conf]
    elif is_sequence and len(conf) == 2 and isinstance(conf[0], list):
        # (optimizers, schedulers) pair; a single scheduler is wrapped in a list
        optimizers, schedulers = conf
        lr_schedulers = schedulers if isinstance(schedulers, list) else [schedulers]
    elif isinstance(conf, dict):
        # one dictionary describing one optimizer
        optimizers = [conf["optimizer"]]
        monitor = conf.get('monitor', None)
        if "lr_scheduler" in conf:
            lr_schedulers = [conf["lr_scheduler"]]
    elif is_sequence and all(isinstance(entry, dict) for entry in conf):
        # several dictionaries, one per optimizer
        for entry in conf:
            optimizers.append(entry["optimizer"])
            if "lr_scheduler" in entry:
                lr_schedulers.append(entry["lr_scheduler"])
            frequency = entry.get("frequency", None)
            if frequency is not None:
                optimizer_frequencies.append(frequency)

        # frequencies are all-or-nothing across the optimizers
        if optimizer_frequencies and len(optimizer_frequencies) != len(optimizers):
            raise ValueError("A frequency must be given to each optimizer.")
    elif is_sequence:
        # plain sequence of optimizers
        optimizers = list(conf)
    else:
        raise MisconfigurationException(
            'Unknown configuration for model optimizers.'
            ' Output from `model.configure_optimizers()` should either be:\n'
            ' * `torch.optim.Optimizer`\n'
            ' * [`torch.optim.Optimizer`]\n'
            ' * ([`torch.optim.Optimizer`], [`torch.optim.lr_scheduler`])\n'
            ' * {"optimizer": `torch.optim.Optimizer`, (optional) "lr_scheduler": `torch.optim.lr_scheduler`}\n'
            ' * A list of the previously described dict format, with an optional "frequency" key (int)'
        )

    lr_schedulers = self.configure_schedulers(lr_schedulers, monitor=monitor)
    _validate_scheduler_optimizer(optimizers, lr_schedulers)

    return optimizers, lr_schedulers, optimizer_frequencies
def set_distributed_mode(self, distributed_backend):
    """Derive the ``use_*`` / ``single_gpu`` flags from the requested backend.

    All mode flags are first reset to ``False``. When no backend is given, one
    is picked from ``has_horovodrun()``, ``num_gpus``, ``num_nodes`` and
    ``num_processes``; with more than one GPU a warning is issued and
    ``ddp_spawn`` is selected. The chosen backend then switches on the matching
    flags.

    Args:
        distributed_backend: ``None`` or one of ``"dp"``, ``"ddp"``,
            ``"ddp_spawn"``, ``"ddp2"``, ``"ddp_cpu"``, ``"horovod"``. Any other
            value leaves all flags off.

    Raises:
        MisconfigurationException: if ``num_nodes > 1`` and neither ``use_ddp``
            nor ``use_ddp2`` ended up enabled.
    """
    # start from a clean slate
    for flag in ('use_dp', 'use_ddp', 'use_ddp2', 'use_horovod', 'single_gpu'):
        setattr(self, flag, False)

    if distributed_backend is None:
        if self.has_horovodrun():
            self._set_horovod_backend()
        elif self.num_gpus == 0:
            # no GPUs: several nodes or processes means ddp on CPU
            if self.num_nodes > 1 or self.num_processes > 1:
                self.use_ddp = True
        elif self.num_gpus == 1:
            self.single_gpu = True
        elif self.num_gpus > 1:
            rank_zero_warn(
                'You requested multiple GPUs but did not specify a backend, e.g.'
                ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                ' Setting distributed_backend=ddp_spawn for you.')
            # fall through to the ddp handling below with the chosen default
            distributed_backend = 'ddp_spawn'
            self.distributed_backend = distributed_backend

    if distributed_backend == "dp":
        # nothing to enable when there is no GPU
        if self.num_gpus == 1:
            self.single_gpu = True
        if self.num_gpus >= 1:
            self.use_dp = True
    elif distributed_backend in ('ddp', 'ddp_spawn'):
        if self.num_gpus == 0:
            # ddp on CPU only makes sense with several nodes or processes
            if self.num_nodes > 1 or self.num_processes > 1:
                self.use_ddp = True
        elif self.num_gpus == 1:
            self.single_gpu = True
            self.use_ddp = True
        elif self.num_gpus > 1:
            self.use_ddp = True
            self.num_processes = self.num_gpus
    elif distributed_backend == "ddp2":
        # nothing to enable when there is no GPU
        if self.num_gpus >= 1:
            self.use_ddp2 = True
    elif distributed_backend == "ddp_cpu":
        if self.num_gpus > 0:
            rank_zero_warn(
                'You requested one or more GPUs, but set the backend to `ddp_cpu`.'
                ' Training will not use GPUs.')
        self.use_ddp = True
        self.data_parallel_device_ids = None
        self.on_gpu = False
    elif distributed_backend == 'horovod':
        self._set_horovod_backend()

    # multi-node runs need ddp or ddp2
    if self.num_nodes > 1 and not self.use_ddp2 and not self.use_ddp:
        raise MisconfigurationException(
            'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
            'To silence this warning set distributed_backend=ddp or distributed_backend=ddp2'
        )

    rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}')

    num_cores = 0
    if self.tpu_cores is not None:
        num_cores = self.tpu_cores
    rank_zero_info(f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores')
def __init__(
    self,
    accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None,
    zero_optimization: bool = True,
    stage: int = 2,
    remote_device: str = "cpu",
    offload_optimizer: bool = False,
    offload_parameters: bool = False,
    offload_params_device: str = "cpu",
    nvme_path: str = "/local_nvme",
    params_buffer_count: int = 5,
    params_buffer_size: int = 100_000_000,
    max_in_cpu: int = 1_000_000_000,
    offload_optimizer_device: str = "cpu",
    optimizer_buffer_count: int = 4,
    block_size: int = 1048576,
    queue_depth: int = 8,
    single_submit: bool = False,
    overlap_events: bool = True,
    thread_count: int = 1,
    pin_memory: bool = False,
    sub_group_size: int = 1_000_000_000_000,
    contiguous_gradients: bool = True,
    overlap_comm: bool = True,
    allgather_partitions: bool = True,
    reduce_scatter: bool = True,
    allgather_bucket_size: int = 200_000_000,
    reduce_bucket_size: int = 200_000_000,
    zero_allow_untested_optimizer: bool = True,
    logging_batch_size_per_gpu: Union[str, int] = "auto",
    config: Optional[Union[Path, str, dict]] = None,
    logging_level: int = logging.WARN,
    parallel_devices: Optional[List[torch.device]] = None,
    cluster_environment: Optional[ClusterEnvironment] = None,
    loss_scale: float = 0,
    initial_scale_power: int = 16,
    loss_scale_window: int = 1000,
    hysteresis: int = 2,
    min_loss_scale: int = 1,
    partition_activations: bool = False,
    cpu_checkpointing: bool = False,
    contiguous_memory_optimization: bool = False,
    synchronize_checkpoint_boundary: bool = False,
    load_full_weights: bool = False,
    precision_plugin: Optional[PrecisionPlugin] = None,
) -> None:
    """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large
    billion parameter models. `For more information: https://pytorch-
    lightning.readthedocs.io/en/latest/advanced/advanced_gpu.html#deepspeed`.

    .. warning:: ``DeepSpeedStrategy`` is in beta and subject to change.

    Defaults have been set to enable ZeRO-Offload and some have been taken from the link below.
    These defaults have been set generally, but may require tuning for optimum performance based on your model size.
    `For more information: https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training`.

    Arguments:

        accelerator: Forwarded unchanged to the parent class constructor.

        zero_optimization: Enable ZeRO optimization. This is only compatible with precision=16.

        stage: Different stages of the ZeRO Optimizer. 0 is disabled,
            1 is optimizer state partitioning, 2 is optimizer+gradient state partitioning,
            3 is optimizer+gradient_parameter partitioning using the infinity engine.

        remote_device: Device to instantiate the model on initially (``cpu`` or ``nvme``).

        offload_optimizer: Enable offloading optimizer memory and computation to CPU or NVMe
            based on ``offload_optimizer_device``.

        offload_parameters: When using ZeRO Stage 3, Enable offloading parameter memory and computation
            to CPU or NVMe based on ``offload_params_device``.

        offload_params_device: When offloading parameters choose the device to offload to, ``cpu`` or ``nvme``.

        offload_optimizer_device: When offloading optimizer state choose the device to offload to,
            ``cpu`` or ``nvme``.

        params_buffer_count: Number of buffers in buffer pool for
            parameter offloading when ``offload_params_device`` is ``nvme``.

        params_buffer_size: Size of buffers in buffer pool for parameter offloading
            when ``offload_params_device`` is ``nvme``.

        max_in_cpu: Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled.

        nvme_path: Filesystem path for NVMe device for optimizer/parameter state offloading.

        optimizer_buffer_count: Number of buffers in buffer pool for optimizer state offloading
            when ``offload_optimizer_device`` is set to ``nvme``.
            This should be at least the number of states maintained per parameter by the optimizer.
            For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance).

        block_size: When using NVMe Offloading, the I/O block size in bytes.

        queue_depth: When using NVMe Offloading, the I/O queue depth.

        single_submit: When using NVMe Offloading,
            submit requests to storage device as multiple individual requests,
            as opposed to one block of requests.

        overlap_events: When using NVMe Offloading,
            submit requests to storage device in an overlapped fashion
            without waiting for completion of earlier requests.

        thread_count: When using NVMe Offloading,
            Intra-request parallelism for each read/write submitted by a user thread.

        pin_memory: When using ZeRO stage 3, pin optimizer state memory on CPU.
            This could boost throughput at the cost of extra memory overhead.

        sub_group_size: When using ZeRO stage 3, defines the number of parameters
            within a sub group to offload at a time.
            Smaller numbers require more communication, but improve memory efficiency.

        contiguous_gradients: Copies gradients to a continuous buffer as they are produced.
            Avoids memory fragmentation during backwards. Useful when training large models.

        overlap_comm: Overlap the reduction (synchronization) of gradients with the backwards computation.
            This is a speed optimization when training across multiple GPUs/machines.

        allgather_partitions: All gather updated parameters at the end of training step,
            instead of using a series of broadcast collectives.

        reduce_scatter: Use reduce/scatter instead of allreduce to average gradients.

        allgather_bucket_size: Number of elements to allgather at once.
            Used to limit the memory required for larger model sizes, with a tradeoff with speed.

        reduce_bucket_size: Number of elements to reduce at once.
            Used to limit the memory required for larger model sizes, with a tradeoff with speed.

        zero_allow_untested_optimizer: Allow untested optimizers to be used with ZeRO. Currently only Adam is a
            DeepSpeed supported optimizer when using ZeRO.

        logging_batch_size_per_gpu: Config used in DeepSpeed to calculate verbose timing for logging
            on a per sample per second basis (only displayed if logging=logging.INFO).
            If set to "auto", the plugin tries to infer this from
            the train DataLoader's BatchSampler, else defaults to 1.
            To obtain accurate logs when using datasets that do not support batch samplers,
            set this to the actual per gpu batch size (trainer.batch_size).

        config: Pass in a deepspeed formatted config dict,
            or path to a deepspeed config: https://www.deepspeed.ai/docs/config-json.
            All defaults will be ignored if a config is passed in.

        logging_level: Set logging level for deepspeed.

        parallel_devices: Forwarded unchanged to the parent class constructor.

        cluster_environment: Forwarded unchanged to the parent class constructor.

        loss_scale: Loss scaling value for FP16 training.
            0.0 results in dynamic loss scaling, otherwise static.

        initial_scale_power: Power of the initial dynamic loss scale value. Loss scale is computed
            by ``2^initial_scale_power``.

        loss_scale_window: Window in which to raise/lower the dynamic FP16 loss scaling value.

        hysteresis: FP16 Delay shift in Dynamic Loss scaling.

        min_loss_scale: The minimum FP16 dynamic loss scaling value.

        partition_activations: Enables partition activation when used with ZeRO stage 3 and model parallelism.
            Still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint.
            See `deepspeed tutorial
            <https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional>`_.

        cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled.

        contiguous_memory_optimization: Copies partitioned activations so that they are contiguous in memory.
            Not supported by all models.

        synchronize_checkpoint_boundary: Insert :func:`torch.cuda.synchronize` at each checkpoint boundary.

        load_full_weights: True when loading a single checkpoint file containing the model state dict
            when using ZeRO Stage 3. This differs from the DeepSpeed checkpoint which contains shards
            per worker.

        precision_plugin: Forwarded unchanged to the parent class constructor.

    Raises:
        MisconfigurationException: If DeepSpeed is not installed.
    """
    if not _DEEPSPEED_AVAILABLE:
        raise MisconfigurationException(
            "To use the DeepSpeed plugin, you must have DeepSpeed installed. pip install deepspeed"
        )

    super().__init__(
        accelerator=accelerator,
        parallel_devices=parallel_devices,
        cluster_environment=cluster_environment,
        precision_plugin=precision_plugin,
    )

    self.config = self._load_config(config)
    if self.config is None:
        # User has not overridden config, set defaults
        self.config = self._create_default_config(
            zero_optimization,
            zero_allow_untested_optimizer,
            logging_batch_size_per_gpu,
            offload_optimizer=offload_optimizer,
            offload_parameters=offload_parameters,
            nvme_path=nvme_path,
            offload_params_device=offload_params_device,
            params_buffer_count=params_buffer_count,
            params_buffer_size=params_buffer_size,
            max_in_cpu=max_in_cpu,
            pin_memory=pin_memory,
            offload_optimizer_device=offload_optimizer_device,
            optimizer_buffer_count=optimizer_buffer_count,
            block_size=block_size,
            queue_depth=queue_depth,
            single_submit=single_submit,
            overlap_events=overlap_events,
            thread_count=thread_count,
            partition_activations=partition_activations,
            cpu_checkpointing=cpu_checkpointing,
            contiguous_memory_optimization=contiguous_memory_optimization,
            synchronize_checkpoint_boundary=synchronize_checkpoint_boundary,
            stage=stage,
            contiguous_gradients=contiguous_gradients,
            overlap_comm=overlap_comm,
            allgather_partitions=allgather_partitions,
            reduce_scatter=reduce_scatter,
            allgather_bucket_size=allgather_bucket_size,
            reduce_bucket_size=reduce_bucket_size,
            sub_group_size=sub_group_size,
        )
    # the config is only marked initialized later, outside this constructor
    self._config_initialized = False
    deepspeed.utils.logging.logger.setLevel(logging_level)

    self.remote_device = remote_device
    self.load_full_weights = load_full_weights

    # default FP16 parameters.
    self.loss_scale = loss_scale
    self.initial_scale_power = initial_scale_power
    self.loss_scale_window = loss_scale_window
    self.hysteresis = hysteresis
    self.min_loss_scale = min_loss_scale
def __verify_train_val_loop_configuration(trainer: "pl.Trainer", model: "pl.LightningModule") -> None:
    """Check that ``model`` and ``trainer`` provide what the fit loop needs.

    A missing ``training_step``, train dataloader or ``configure_optimizers`` is
    an error. Deprecated ``on_*_dataloader`` hooks, overridden optimizer hooks
    combined with gradient accumulation, and a validation step/dataloader
    mismatch only produce warnings.

    As a side effect ``trainer.overriden_optimizer_step`` and
    ``trainer.overriden_optimizer_zero_grad`` are set.

    Raises:
        MisconfigurationException: if one of the three mandatory pieces is missing.
    """
    # ---- mandatory pieces of the training loop ----
    if not is_overridden("training_step", model):
        raise MisconfigurationException(
            "No `training_step()` method defined. Lightning `Trainer` expects as minimum a"
            " `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined."
        )

    if not trainer._data_connector._train_dataloader_source.is_defined():
        raise MisconfigurationException(
            "No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a"
            " `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined."
        )

    if not is_overridden("configure_optimizers", model):
        raise MisconfigurationException(
            "No `configure_optimizers()` method defined. Lightning `Trainer` expects as minimum a"
            " `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined."
        )

    # ---- deprecated train dataloader hook ----
    if is_overridden("on_train_dataloader", model):
        rank_zero_deprecation(
            "Method `on_train_dataloader` is deprecated in v1.5.0 and will be removed in v1.7.0."
            " Please use `train_dataloader()` directly.")

    # ---- overridden optimizer hooks together with gradient accumulation ----
    trainer.overriden_optimizer_step = is_overridden("optimizer_step", model)
    trainer.overriden_optimizer_zero_grad = is_overridden("optimizer_zero_grad", model)
    uses_automatic_optimization = model.automatic_optimization
    will_accumulate = trainer.accumulation_scheduler.going_to_accumulate_grad_batches()
    overrides_optimizer_hooks = trainer.overriden_optimizer_step or trainer.overriden_optimizer_zero_grad

    if overrides_optimizer_hooks and will_accumulate and uses_automatic_optimization:
        rank_zero_warn(
            "When using `Trainer(accumulate_grad_batches != 1)` and overriding"
            " `LightningModule.optimizer_{step,zero_grad}`, the hooks will not be called on every batch"
            " (rather, they are called on every optimization step).")

    # ---- validation loop: loader and step must come as a pair ----
    val_loader_defined = trainer._data_connector._val_dataloader_source.is_defined()
    val_step_defined = is_overridden("validation_step", model)
    if val_loader_defined and not val_step_defined:
        rank_zero_warn(
            "You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop."
        )
    elif val_step_defined and not val_loader_defined:
        rank_zero_warn(
            "You defined a `validation_step` but have no `val_dataloader`. Skipping val loop."
        )

    # ---- deprecated val dataloader hook ----
    if is_overridden("on_val_dataloader", model):
        rank_zero_deprecation(
            "Method `on_val_dataloader` is deprecated in v1.5.0 and will be removed in v1.7.0."
            " Please use `val_dataloader()` directly.")
def scale_batch_size(trainer,
                     model: LightningModule,
                     mode: str = 'power',
                     steps_per_trial: int = 3,
                     init_val: int = 2,
                     max_trials: int = 25,
                     batch_arg_name: str = 'batch_size',
                     **fit_kwargs):
    r"""
    Will iteratively try to find the largest batch size for a given model
    that does not give an out of memory (OOM) error.

    Args:
        trainer: The Trainer
        model: Model to fit.

        mode: string setting the search mode. Either `power` or `binsearch`.
            If mode is `power` we keep multiplying the batch size by 2, until
            we get an OOM error. If mode is `binsearch`, we will initially
            also keep multiplying by 2 and after encountering an OOM error
            do a binary search between the last successful batch size and the
            batch size that failed.

        steps_per_trial: number of steps to run with a given batch size.
            Ideally 1 should be enough to test if an OOM error occurs,
            however in practice a few are needed

        init_val: initial batch size to start the search with

        max_trials: max number of increase in batch size done before
           algorithm is terminated

        batch_arg_name: name of the attribute that stores the batch size.
            It is expected that the user has provided a model or datamodule that has a hyperparameter
            with that name. We will look for this attribute name in the following places

            - `model`
            - `model.hparams`
            - `model.datamodule`
            - `trainer.datamodule` (the datamodule passed to the tune method)

        **fit_kwargs: remaining arguments to be passed to .fit(), e.g., dataloader
            or datamodule.

    Returns:
        The batch size produced by the selected scaling routine.

    Raises:
        MisconfigurationException: if ``batch_arg_name`` cannot be found, or if
            ``model.train_dataloader`` has a ``patch_loader_code`` attribute.
        ValueError: if ``mode`` is neither ``'power'`` nor ``'binsearch'``.
    """
    if not lightning_hasattr(model, batch_arg_name):
        raise MisconfigurationException(
            f'Field {batch_arg_name} not found in both `model` and `model.hparams`')
    if hasattr(model, batch_arg_name) and hasattr(model, "hparams") and batch_arg_name in model.hparams:
        rank_zero_warn(
            f'Field `model.{batch_arg_name}` and `model.hparams.{batch_arg_name}` are mutually exclusive!'
            f' `model.{batch_arg_name}` will be used as the initial batch size for scaling.'
            ' If this is not the intended behavior, please remove either one.'
        )

    if hasattr(model.train_dataloader, 'patch_loader_code'):
        raise MisconfigurationException('The batch scaling feature cannot be used with dataloaders'
                                        ' passed directly to `.fit()`. Please disable the feature or'
                                        ' incorporate the dataloader into the model.')

    # Reject an unknown mode *before* any trainer state is touched: otherwise the
    # trainer is left with search-time parameters, a disabled progress bar and a
    # stray temporary checkpoint on disk.
    if mode not in ('power', 'binsearch'):
        raise ValueError('mode in method `scale_batch_size` can only be `power` or `binsearch`')

    # Arguments we adjust during the batch size finder, save for restoring
    __scale_batch_dump_params(trainer)

    # Set to values that are required by the algorithm
    __scale_batch_reset_params(trainer, model, steps_per_trial)

    # Save initial model, that is loaded after batch size is found
    save_path = os.path.join(trainer.default_root_dir, 'scale_batch_size_temp_model.ckpt')
    trainer.save_checkpoint(str(save_path))

    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.disable()

    # Initially we just double in size until an OOM is encountered
    new_size = _adjust_batch_size(trainer, value=init_val)  # initially set to init_val
    if mode == 'power':
        new_size = _run_power_scaling(trainer, model, new_size, batch_arg_name, max_trials, **fit_kwargs)
    else:  # 'binsearch', guaranteed by the validation above
        new_size = _run_binsearch_scaling(trainer, model, new_size, batch_arg_name, max_trials, **fit_kwargs)

    garbage_collection_cuda()
    log.info(f'Finished batch size finder, will continue with full run using batch size {new_size}')

    # Restore initial state of model
    if trainer.is_global_zero:
        trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu)
        fs = get_filesystem(str(save_path))
        if fs.exists(save_path):
            fs.rm(save_path)

    # Finish by resetting variables so trainer is ready to fit model
    __scale_batch_restore_params(trainer)
    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.enable()

    return new_size
def select_accelerator(self):
    """Return the accelerator backend the trainer should run on.

    Resolution order:

    1. ``trainer.accelerator_backend`` if it is already set.
    2. ``self.accelerator`` if it is an ``Accelerator`` instance; it is wired to
       the trainer and its ddp plugin first.
    3. Otherwise an accelerator chosen from the trainer's ``use_*`` flags,
       ``distributed_backend`` and the environment variables ``WORLD_SIZE``,
       ``GROUP_RANK``, ``NODE_RANK`` and ``PL_IN_DDP_SUBPROCESS``.

    Raises:
        MisconfigurationException: if no accelerator matches the configuration.
    """
    trainer = self.trainer
    if trainer.accelerator_backend is not None:
        return trainer.accelerator_backend

    # ---- accelerator instance handed in by the user ----
    user_accelerator = self.accelerator
    if user_accelerator is not None and isinstance(user_accelerator, Accelerator):
        user_accelerator.trainer = trainer
        user_accelerator.ddp_plugin = trainer.plugin_connector.ddp_plugin
        return user_accelerator

    # ---- work out which launcher / backend combination is active ----
    slurm_ddp = trainer.use_ddp and trainer.is_slurm_managing_tasks

    # torchelastic or general non-slurm ddp
    has_te_flags = 'WORLD_SIZE' in os.environ and (
        'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
    torchelastic_ddp = trainer.use_ddp and has_te_flags

    ddp_spawn = trainer.use_ddp and trainer.distributed_backend == "ddp_spawn"
    ddp_cpu_spawn = trainer.use_ddp and trainer.distributed_backend == "ddp_cpu"
    ddp_cpu_torch_elastic = ddp_cpu_spawn and self._is_using_torchelastic()
    ddp_cpu_slurm = ddp_cpu_spawn and trainer.is_slurm_managing_tasks

    # ddp script mode uses the same flags as TE
    # TODO: decouple from TE
    if os.environ.get('PL_IN_DDP_SUBPROCESS', False):
        torchelastic_ddp = False

    cluster_env = self._select_environment()

    # ---- first matching rule wins; the order of the checks is significant ----
    if trainer.use_ddp2:
        return accelerators.DDP2Accelerator(
            trainer, cluster_env, trainer.plugin_connector.ddp_plugin)

    if ddp_cpu_slurm:
        return accelerators.DDPCPUHPCAccelerator(
            trainer, cluster_env, trainer.plugin_connector.ddp_plugin)

    if slurm_ddp:
        return accelerators.DDPHPCAccelerator(
            trainer, cluster_env, trainer.plugin_connector.ddp_plugin)

    if ddp_cpu_torch_elastic:
        return accelerators.DDPCPUHPCAccelerator(
            trainer, cluster_env, trainer.plugin_connector.ddp_plugin)

    if torchelastic_ddp:
        return accelerators.DDPHPCAccelerator(
            trainer, cluster_env, trainer.plugin_connector.ddp_plugin)

    if ddp_spawn:
        return accelerators.DDPSpawnAccelerator(
            trainer,
            nprocs=trainer.num_processes,
            cluster_environment=cluster_env,
            ddp_plugin=trainer.plugin_connector.ddp_plugin)

    if ddp_cpu_spawn:
        return accelerators.DDPCPUSpawnAccelerator(
            trainer,
            nprocs=trainer.num_processes,
            cluster_environment=cluster_env,
            ddp_plugin=trainer.plugin_connector.ddp_plugin)

    if trainer.distributed_backend == "ddp":
        return accelerators.DDPAccelerator(
            trainer, cluster_env, ddp_plugin=trainer.plugin_connector.ddp_plugin)

    if trainer.use_dp:
        return accelerators.DataParallelAccelerator(trainer, cluster_env)

    if trainer.use_horovod:
        return accelerators.HorovodAccelerator(trainer, cluster_env)

    if trainer.use_single_gpu:
        return accelerators.GPUAccelerator(trainer, cluster_env)

    if trainer.use_tpu:
        return accelerators.TPUAccelerator(trainer, cluster_env)

    if trainer.distributed_backend is None:
        return accelerators.CPUAccelerator(trainer, cluster_env)

    raise MisconfigurationException(
        f'Trainer(accelerator={trainer.distributed_backend} is not a supported backend'
    )
def step(self, *args, closure: Optional[Callable] = None, make_optimizer_step: Optional[bool] = None, **kwargs):
    """
    Call this directly from your training_step when doing optimizations manually.
    By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you

    .. tip:: In manual mode we still automatically accumulate grad over batches if
       Trainer(accumulate_grad_batches=x) is set.

    Args:

        closure: One could provide its own optimizer_closure. Set to None by default.
            Any callable is accepted (plain function, ``functools.partial``,
            bound method, callable object).

        make_optimizer_step: Whether to force an optimizer step. When nothing is provided,
            we will use `accumulate_grad_batches` for accumulation frequency by default.
            However, one could provide True and False based on its own scheduling.
            Refer to example 2 and 3

        args: Any parameters provided to wrapped optimizer.step()

        kwargs: Any parameters provided to wrapped optimizer.step()

    Raises:
        MisconfigurationException: if ``closure`` is given but is not callable.

    Example::

        def training_step(...):
            (opt_a, opt_b) = self.optimizers()
            loss_a = ...
            # automatically applies scaling, etc...
            self.manual_backward(loss_a, opt_a)
            opt_a.step()

    Example::

        def training_step(self, batch, batch_idx):
            # using Boring Model
            opt = self.optimizers() # only 1 optimizer

            def compute_loss():
                x = batch[0]
                x = F.dropout(x, 0.1)
                predictions = self(x)
                predictions = F.dropout(predictions, 0.1)
                loss = self.loss(None, predictions)
                return loss

            def closure():
                # emulate MC dropout training
                num_backward = 1
                losses = []
                for backward_idx in range(num_backward + 1):
                    loss = compute_loss()
                    losses.append(loss)
                    retain_graph = num_backward != backward_idx
                    self.manual_backward(loss, opt, retain_graph=retain_graph)
                loss_mean = torch.stack(losses).mean()
                loss_std = torch.stack(losses).std()
                self.log("train_loss_mean", loss_mean, on_step=True, prog_bar=True, on_epoch=True)
                self.log("train_loss_std", loss_std, on_step=True, prog_bar=True, on_epoch=True)

            opt.step(closure=closure)

    Example::

        # Scenario for a gan.

        def training_step(self, batch, batch_idx, optimizer_idx):

            # emulate gans training
            opt_gen, opt_dis = self.optimizers()

            # Note: Be careful, don't log on the same key in self.log in both closure
            # as they will be aggregated together on epoch_end

            def gen_closure():
                ... forward and compute loss for generator
                loss_gen = ...
                self.log("loss_gen", loss_gen, on_step=True, on_epoch=True)
                self.manual_backward(loss_gen, opt_gen)

            def dis_closure():
                ... forward and compute loss for discriminator
                loss_dis = ...
                self.log("loss_dis", loss_dis, on_step=True, on_epoch=True)
                self.manual_backward(loss_dis, opt_dis)

            # this will accumulate gradients for 2 batches and then call opt_gen.step()
            opt_gen.step(closure=gen_closure, make_optimizer_step=batch_idx % 2 == 0)

            # update discriminator every 4 batches
            # therefore, no gradient accumulation for discriminator
            if batch_idx % 4 == 0 :
                # Note: Set make_optimizer_step to True or it will use by default
                # Trainer(accumulate_grad_batches=x)
                opt_dis.step(closure=dis_closure, make_optimizer_step=True)
    """
    profiler_name = f"optimizer_step_and_closure_{self._optimizer_idx}"

    if closure is None:
        closure = do_nothing_closure
    elif not callable(closure):
        # The parameter is typed as ``Callable``: an exact ``FunctionType`` check
        # would wrongly reject partials, bound methods and callable objects.
        raise MisconfigurationException("When closure is provided, it should be a function")

    # resolves ``None`` to a concrete decision for this call
    make_optimizer_step = self._check_make_optimizer_step(make_optimizer_step)

    if make_optimizer_step:
        self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
        self._total_optimizer_step_calls += 1
    else:
        # make sure to call optimizer_closure when accumulating
        with self._trainer.profiler.profile(f"closure_{self._optimizer_idx}"):
            with self._trainer.train_loop.block_ddp_sync_behaviour(True):
                closure()
def run_evaluation(self, test_mode: bool = False):
    """Run one pass of the validation loop, or of the test loop.

    Fires the start callback, picks (and if necessary initialises) the
    dataloaders, shows a progress bar, evaluates via ``self._evaluate``, logs
    the resulting metrics, optionally reloads the dataloaders and finally fires
    the end callback.

    Args:
        test_mode: ``True`` to run the test loop, ``False`` for validation.

    Raises:
        MisconfigurationException: in test mode when ``test_step`` is not
            overridden.
    """
    # testing needs a user-defined test step
    if test_mode and not self.is_overriden('test_step'):
        raise MisconfigurationException(
            "You called `.test()` without defining model's `.test_step()`."
            " Please define and try again")

    # start-of-loop callback
    on_start = self.on_test_start if test_mode else self.on_validation_start
    on_start()

    # hook
    model = self.get_model()
    model.on_pre_performance_check()

    # pick the dataloaders, initialising them on first use
    if test_mode:
        if self.test_dataloaders is None:
            self.reset_test_dataloader(model)
        dataloaders, max_batches = self.test_dataloaders, self.num_test_batches
    else:
        if self.val_dataloaders is None:
            self.reset_val_dataloader(model)
        dataloaders, max_batches = self.val_dataloaders, self.num_val_batches

    # fast_dev_run evaluates a single batch only
    if self.fast_dev_run:
        max_batches = 1

    # the main progress bar is already closed when testing, so its slot is free;
    # during validation the bar is placed one row below it
    stage = 'test' if test_mode else 'val'
    bar = tqdm(
        desc='Testing' if test_mode else 'Validating',
        total=max_batches if max_batches != float('inf') else None,
        leave=test_mode,
        position=2 * self.process_position + (not test_mode),
        disable=not self.progress_bar_refresh_rate,
        dynamic_ncols=True,
        file=sys.stdout)
    setattr(self, f'{stage}_progress_bar', bar)

    # evaluate and split the output into its metric groups
    eval_results = self._evaluate(self.model, dataloaders, max_batches, test_mode)
    _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.process_output(eval_results)

    self.add_tqdm_metrics(prog_bar_metrics)

    # test results are printed by rank 0 only
    if test_mode and self.proc_rank == 0:
        separator = '-' * 80
        print(separator)
        print('TEST RESULTS')
        pprint(callback_metrics)
        print(separator)

    self.log_metrics(log_metrics, {})

    # make the metrics available to callbacks
    self.callback_metrics.update(callback_metrics)

    # hook
    model.on_post_performance_check()

    if test_mode:
        self.test_progress_bar.close()
        if self.reload_dataloaders_every_epoch:
            self.reset_test_dataloader(model)
        self.on_test_end()
    else:
        # refresh the training bar with model specific metrics before closing
        self.main_progress_bar.set_postfix(**self.training_tqdm_dict)
        self.val_progress_bar.close()
        if self.reload_dataloaders_every_epoch:
            self.reset_val_dataloader(model)
        self.on_validation_end()
def from_numpy(value, device: torch.device = None):
    """Build a tensor from ``value`` with ``torch.from_numpy`` and move it to ``device``.

    Args:
        value: object handed to ``torch.from_numpy``.
        device: target device; mandatory despite the ``None`` default.

    Returns:
        The result of ``torch.from_numpy(value).to(device)``.

    Raises:
        MisconfigurationException: if ``device`` is ``None``.
    """
    if device is not None:
        return torch.from_numpy(value).to(device)
    raise MisconfigurationException("device (torch.device) should be provided.")
def restore_training_state(self, checkpoint):
    """Restore the trainer progress, optimizers and lr schedulers from a checkpoint.

    Args:
        checkpoint: Mapping that must contain ``'optimizer_states'``,
            ``'lr_schedulers'``, ``'global_step'`` and ``'epoch'``.

    Raises:
        KeyError: If ``'optimizer_states'`` or ``'lr_schedulers'`` is missing.
        ValueError: If the checkpoint contains any key listed in
            ``DEPRECATED_CHECKPOINT_KEYS``.
        MisconfigurationException: If the restored epoch is greater than
            ``trainer.max_epochs``.
    """
    has_training_state = 'optimizer_states' in checkpoint and 'lr_schedulers' in checkpoint
    if not has_training_state:
        raise KeyError(
            'Trying to restore training state but checkpoint contains only the model.'
            ' This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`.'
        )

    outdated_keys = [key for key in DEPRECATED_CHECKPOINT_KEYS if key in checkpoint]
    if outdated_keys:
        raise ValueError(
            "The checkpoint you're attempting to load follows an"
            " outdated schema. You can upgrade to the current schema by running"
            " `python -m pytorch_lightning.utilities.upgrade_checkpoint --file model.ckpt`"
            " where `model.ckpt` is your checkpoint file.")

    trainer = self.trainer

    # let the callbacks reload their own state first
    trainer.on_load_checkpoint(checkpoint)

    trainer.global_step = checkpoint['global_step']
    trainer.current_epoch = checkpoint['epoch']

    # a checkpoint taken beyond the configured max_epochs cannot be resumed
    if trainer.current_epoch > trainer.max_epochs:
        m = f""" you restored a checkpoint with current_epoch={self.trainer.current_epoch} but the Trainer(max_epochs={self.trainer.max_epochs}) """
        raise MisconfigurationException(m)

    # The division accounts for the global step advancing once per accumulated
    # batch; the `> 1` tolerance accounts for the differing global step for
    # odd vs even num_training_batches.
    accumulation = trainer.accumulate_grad_batches
    if accumulation is None:
        accumulation = 1
    steps_per_epoch = trainer.num_training_batches / accumulation
    ended_mid_epoch = (
        trainer.num_training_batches != 0
        and trainer.global_step % steps_per_epoch > 1
    )
    if ended_mid_epoch:
        rank_zero_warn(
            "You're resuming from a checkpoint that ended mid-epoch. "
            "This can cause unreliable results if further training is done, "
            "consider using an end of epoch checkpoint. "
        )

    # restore the optimizers
    saved_optimizer_states = checkpoint['optimizer_states']
    for optimizer, saved_state in zip(trainer.optimizers, saved_optimizer_states):
        optimizer.load_state_dict(saved_state)

        if trainer.root_gpu is None:
            continue

        # move the optimizer state to the GPU one tensor at a time to avoid OOM
        for param_state in optimizer.state.values():
            for name, value in param_state.items():
                if isinstance(value, torch.Tensor):
                    param_state[name] = value.cuda(trainer.root_gpu)

    # restore the lr schedulers
    saved_scheduler_states = checkpoint['lr_schedulers']
    for scheduler_config, saved_state in zip(trainer.lr_schedulers, saved_scheduler_states):
        scheduler_config['scheduler'].load_state_dict(saved_state)
def _get_dataloader_init_kwargs(
        dataloader: DataLoader,
        sampler: Optional[Sampler],
        mode: Optional[RunningStage] = None) -> Dict[str, Any]:
    """Build the keyword arguments needed to re-instantiate ``dataloader``.

    The kwargs are made of the public instance attributes whose value differs
    from the matching ``__init__`` default, the ``dataset`` attribute, and
    whatever ``TrainerDataLoadingMixin._resolve_batch_sampler`` returns for
    ``sampler`` and ``mode``.

    Args:
        dataloader: The ``DataLoader`` instance to inspect.
        sampler: Sampler forwarded to ``_resolve_batch_sampler``.
        mode: Running stage forwarded to ``_resolve_batch_sampler``.

    Returns:
        The keyword arguments for the dataloader constructor.

    Raises:
        ValueError: If ``dataloader`` is not a ``DataLoader`` instance.
        MisconfigurationException: If a required ``__init__`` argument cannot
            be recovered from the instance attributes, if a collected kwarg is
            not accepted by ``__init__``, or if fault tolerant training is on
            and the map-style dataset has a falsy length.
    """
    if not isinstance(dataloader, DataLoader):
        raise ValueError(
            f"The dataloader {dataloader} needs to subclass `torch.utils.data.DataLoader`"
        )

    # public instance attributes of the dataloader
    attrs = {}
    for attr_name, attr_value in vars(dataloader).items():
        if not attr_name.startswith("_"):
            attrs[attr_name] = attr_value
    # this one is not part of `vars`
    attrs["multiprocessing_context"] = dataloader.multiprocessing_context

    # parameters accepted by the instance `__init__`
    params = dict(inspect.signature(dataloader.__init__).parameters)

    # names of the parameters whose current value differs from the default
    changed = set()
    for param_name, param in params.items():
        if param_name in attrs and param.default != attrs[param_name]:
            changed.add(param_name)
    # `dataset` is always kept as it might have been replaced with `*args`
    changed.add("dataset")

    # kwargs to re-construct the dataloader
    dl_kwargs = {}
    for attr_name, attr_value in attrs.items():
        if attr_name in changed:
            dl_kwargs[attr_name] = attr_value
    resolved = TrainerDataLoadingMixin._resolve_batch_sampler(dataloader, sampler, mode=mode)
    dl_kwargs.update(resolved)

    # positional parameters without default that could not be recovered
    positional_kinds = (
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
    )
    required_args = sorted(
        param.name
        for param in params.values()
        if param.kind in positional_kinds
        and param.default is param.empty
        and param.name not in dl_kwargs
    )
    if required_args:
        dataloader_cls_name = dataloader.__class__.__name__
        raise MisconfigurationException(
            f"Trying to inject `DistributedSampler` into the `{dataloader_cls_name}` instance. "
            "This would fail as some of the `__init__` arguments are not available as instance attributes. "
            f"The missing attributes are {required_args}. "
            f"HINT: If you wrote the `{dataloader_cls_name}` class, define `self.missing_arg_name` or "
            "manually add the `DistributedSampler` as: "
            f"`{dataloader_cls_name}(dataset, sampler=DistributedSampler(dataset))`."
        )

    # without `**kwargs` in the signature every collected kwarg must be declared
    if all(param.kind is not param.VAR_KEYWORD for param in params.values()):
        missing_kwargs = sorted(dl_kwargs.keys() - params.keys())
        if missing_kwargs:
            dataloader_cls_name = dataloader.__class__.__name__
            raise MisconfigurationException(
                f"Trying to inject `DistributedSampler` into the `{dataloader_cls_name}` instance. "
                "This would fail as it doesn't expose all its attributes in the `__init__` signature. "
                f"The missing arguments are {missing_kwargs}. "
                f"HINT: If you wrote the `{dataloader_cls_name}` class, add the `__init__` arguments or "
                "manually add the `DistributedSampler` as: "
                f"`{dataloader_cls_name}(dataset, sampler=DistributedSampler(dataset))`."
            )

    dataset = dl_kwargs["dataset"]
    dataset_is_iterable = isinstance(dataset, IterableDataset)
    if dataset_is_iterable:
        dl_kwargs["batch_sampler"] = None
        dl_kwargs["sampler"] = None

    if _fault_tolerant_training():
        if dataset_is_iterable:
            # wrap the `IterableDataset` into a `CaptureIterableDataset` to record sampler states.
            dl_kwargs["dataset"] = CaptureIterableDataset(dataset=dataset)
        elif len(dataset):
            dl_kwargs["dataset"] = CaptureMapDataset(dataset=dataset)
        else:
            raise MisconfigurationException(
                "This shouldn't happen, please open an issue on Lightning Github repository."
            )

    return dl_kwargs
def _check_arguments(self, trainer): if trainer.amp_backend is not None: raise MisconfigurationException( 'DDPSequentialPlugin is currently not supported in Automatic Mixed Precision' )
def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None):
    """Convert user-provided lr schedulers into a uniform list of config dicts.

    Args:
        schedulers: Scheduler objects and/or dicts. A dict must hold the
            scheduler under the ``'scheduler'`` key and may override any other
            key of the default config. The dicts are not modified.
        monitor: Metric name stored under ``'monitor'`` for schedulers passed
            as bare objects. Required for a bare ``ReduceLROnPlateau``.

    Returns:
        One dict per entry of ``schedulers``, holding the default config keys
        (``scheduler``, ``name``, ``interval``, ``frequency``,
        ``reduce_on_plateau``, ``monitor``, ``strict``) updated with the
        user-provided values.

    Raises:
        MisconfigurationException: If a dict lacks the ``'scheduler'`` key, or
            if a ``ReduceLROnPlateau`` scheduler comes without a monitor.
        ValueError: If an entry is neither a dict nor an lr scheduler.
    """
    lr_scheduler_module = optim.lr_scheduler
    plateau_cls = lr_scheduler_module.ReduceLROnPlateau
    # torch >= 2.0 exposes the public `LRScheduler` base class and keeps
    # `_LRScheduler` only as a backward-compatible subclass of it, so the
    # built-in schedulers are no longer instances of `_LRScheduler`. Prefer the
    # public base when it exists and fall back to the private one otherwise.
    scheduler_base = getattr(lr_scheduler_module, 'LRScheduler', None)
    if scheduler_base is None:
        scheduler_base = lr_scheduler_module._LRScheduler

    default_config = {
        'scheduler': None,
        'name': None,  # no custom name
        'interval': 'epoch',  # after epoch is over
        'frequency': 1,  # every epoch/batch
        'reduce_on_plateau': False,  # most often not ReduceLROnPlateau scheduler
        'monitor': monitor,  # value to monitor for ReduceLROnPlateau
        'strict': True,  # enforce that the monitor exists for ReduceLROnPlateau
    }

    lr_schedulers = []
    for scheduler in schedulers:
        if isinstance(scheduler, dict):
            # check provided keys
            extra_keys = [k for k in scheduler if k not in default_config]
            if extra_keys:
                rank_zero_warn(
                    f'Found unsupported keys in the lr scheduler dict: {extra_keys}',
                    RuntimeWarning)
            if 'scheduler' not in scheduler:
                raise MisconfigurationException(
                    'The lr scheduler dict must have the key "scheduler" with its item being an lr scheduler'
                )
            # computed locally instead of being written into the caller's dict
            reduce_on_plateau = isinstance(scheduler['scheduler'], plateau_cls)
            if reduce_on_plateau and scheduler.get('monitor', None) is None:
                raise MisconfigurationException(
                    'The lr scheduler dict must include a monitor when a `ReduceLROnPlateau` scheduler is used.'
                    ' For example: {"optimizer": optimizer, "lr_scheduler":'
                    ' {"scheduler": scheduler, "monitor": "your_loss"}}')
            lr_schedulers.append({
                **default_config,
                **scheduler,
                'reduce_on_plateau': reduce_on_plateau,
            })
        elif isinstance(scheduler, plateau_cls):
            # checked before the generic base class on purpose: the plateau
            # scheduler needs its own config and a mandatory monitor
            if monitor is None:
                raise MisconfigurationException(
                    '`configure_optimizers` must include a monitor when a `ReduceLROnPlateau` scheduler is used.'
                    ' For example:'
                    ' {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "metric_to_track"}'
                )
            lr_schedulers.append({
                **default_config,
                'scheduler': scheduler,
                'reduce_on_plateau': True,
                'monitor': monitor
            })
        elif isinstance(scheduler, scheduler_base):
            lr_schedulers.append({
                **default_config,
                'scheduler': scheduler
            })
        else:
            raise ValueError(
                f'The provided lr scheduler "{scheduler}" is invalid')
    return lr_schedulers