def configure_optimizers(self) -> Tuple[list, list]:
    param_optimizer = list(self.named_parameters())
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer if p.requires_grad and not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in param_optimizer if p.requires_grad and any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    self.optimizer = AdamW(optimizer_parameters, lr=self.hparams.learning_rate)

    num_training_steps = int(
        len(self._train_examples)
        // (self.hparams.train_batch_size * self.trainer.num_gpus)
        // self.hparams.accumulate_grad_batches
        * float(self.hparams.max_epochs)
    )
    rank_zero_info("The total number of training steps: %d", num_training_steps)
    warmup_steps = int(self.hparams.warmup_proportion * num_training_steps)
    self.scheduler = get_linear_schedule_with_warmup(
        self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
    )
    return [self.optimizer], [dict(scheduler=self.scheduler, interval="step")]
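A minimal standalone sketch of the same weight-decay grouping and warmup arithmetic, using a toy module and hypothetical hyperparameter values; here `AdamW` is taken from `torch.optim` and `get_linear_schedule_with_warmup` from `transformers`, which may differ from the imports the original class uses.

import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

class TinyEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)
        self.LayerNorm = torch.nn.LayerNorm(8)  # attribute named like HF modules so the filter below matches

model = TinyEncoder()
no_decay = ["bias", "LayerNorm.weight"]
grouped_params = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,  # hypothetical weight decay
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,  # biases and LayerNorm weights are excluded from decay
    },
]
optimizer = AdamW(grouped_params, lr=3e-5)  # hypothetical learning rate

# same arithmetic as above: optimizer steps after dividing by the effective batch size,
# then a fixed proportion of those steps used for linear warmup
num_examples, batch_size, num_gpus, accumulate, max_epochs = 10_000, 8, 1, 2, 3
num_training_steps = int(num_examples // (batch_size * num_gpus) // accumulate * float(max_epochs))
warmup_steps = int(0.1 * num_training_steps)  # hypothetical warmup_proportion of 0.1
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
)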
def _initialize_deepspeed_inference(self, model):
    # todo: Currently DeepSpeed requires optimizers at inference to partition weights correctly
    optimizer, scheduler = None, None
    if "optimizer" not in self.config:
        rank_zero_info(
            "You have not specified an optimizer or scheduler within the DeepSpeed config."
            " Using `configure_optimizers` to define optimizer and scheduler."
        )
        optimizer, lr_scheduler, _ = self._init_optimizers()
        scheduler = lr_scheduler["scheduler"]
    inference_config = {
        # todo: this is required for DeepSpeed throughput timers, or throughput timers will be incorrect
        "train_micro_batch_size_per_gpu": 1
    }
    if "fp16" in self.config:
        inference_config.update({"fp16": self.config["fp16"]})
    if self.zero_stage_3:
        inference_config.update({
            "zero_allow_untested_optimizer": self.config["zero_allow_untested_optimizer"],
            "zero_optimization": self.config["zero_optimization"],
        })
    # Remove all module hooks before initializing new model
    remove_module_hooks(model)
    model, _, _, _ = deepspeed.initialize(
        config=inference_config,
        model=model,
        optimizer=optimizer,
        lr_scheduler=scheduler,
        model_parameters=[],
        dist_init_required=False,
    )
    self.model = model
def _initialize_deepspeed_train(self, model):
    if "optimizer" in self.config:
        optimizer, lr_scheduler = None, _get_default_scheduler_config()
    else:
        rank_zero_info(
            "You have not specified an optimizer or scheduler within the DeepSpeed config."
            " Using `configure_optimizers` to define optimizer and scheduler."
        )
        optimizer, lr_scheduler, _ = self._init_optimizers()
    scheduler = lr_scheduler["scheduler"]
    model_parameters = filter(lambda p: p.requires_grad, self.model.parameters())
    model, deepspeed_optimizer, _, deepspeed_scheduler = deepspeed.initialize(
        config=self.config,
        model=model,
        model_parameters=model_parameters,
        optimizer=optimizer,
        lr_scheduler=scheduler,
        dist_init_required=False,
    )
    self._set_deepspeed_activation_checkpointing()

    # although we set these here, deepspeed manages the specific optimizer logic
    self.lightning_module.trainer.optimizers = [deepspeed_optimizer]

    if deepspeed_scheduler is not None:
        lr_scheduler["scheduler"] = deepspeed_scheduler
        self.lightning_module.trainer.lr_schedulers = [lr_scheduler]
    self.model = model
def configure_slurm_ddp(self):
    # extract SLURM flag vars
    # whenever we have the correct number of tasks, we let slurm manage processes
    # otherwise we launch the required number of processes
    if self.use_ddp or self.use_ddp2:
        num_requested_gpus = self.num_gpus * self.num_nodes
        num_slurm_tasks = 0
        try:
            num_slurm_tasks = int(os.environ["SLURM_NTASKS"])
            self.is_slurm_managing_tasks = num_slurm_tasks == num_requested_gpus

            # enable slurm cpu
            if num_requested_gpus == 0:
                self.is_slurm_managing_tasks = num_slurm_tasks == self.num_processes

            # in interactive mode we don't manage tasks
            job_name = os.environ["SLURM_JOB_NAME"]
            if job_name == "bash":
                self.is_slurm_managing_tasks = False

        except Exception:
            # likely not on slurm, so set the slurm managed flag to false
            self.is_slurm_managing_tasks = False

    # used for tests only, set this flag to simulate slurm managing a task
    try:
        should_fake = int(os.environ["FAKE_SLURM_MANAGING_TASKS"])
        if should_fake:
            self.is_slurm_managing_tasks = True
    except Exception:
        pass

    # notify the user that slurm is managing tasks
    if self.is_slurm_managing_tasks:
        rank_zero_info("Multi-processing is handled by Slurm.")
def test_v1_8_0_rank_zero_imports():
    import warnings

    from pytorch_lightning.utilities.distributed import rank_zero_debug, rank_zero_info
    from pytorch_lightning.utilities.warnings import LightningDeprecationWarning, rank_zero_deprecation, rank_zero_warn

    with pytest.deprecated_call(
        match="pytorch_lightning.utilities.distributed.rank_zero_debug has been deprecated in v1.6"
        " and will be removed in v1.8."
    ):
        rank_zero_debug("foo")
    with pytest.deprecated_call(
        match="pytorch_lightning.utilities.distributed.rank_zero_info has been deprecated in v1.6"
        " and will be removed in v1.8."
    ):
        rank_zero_info("foo")
    with pytest.deprecated_call(
        match="pytorch_lightning.utilities.warnings.rank_zero_warn has been deprecated in v1.6"
        " and will be removed in v1.8."
    ):
        rank_zero_warn("foo")
    with pytest.deprecated_call(
        match="pytorch_lightning.utilities.warnings.rank_zero_deprecation has been deprecated in v1.6"
        " and will be removed in v1.8."
    ):
        rank_zero_deprecation("foo")
    with pytest.deprecated_call(
        match="pytorch_lightning.utilities.warnings.LightningDeprecationWarning has been deprecated in v1.6"
        " and will be removed in v1.8."
    ):
        warnings.warn("foo", LightningDeprecationWarning, stacklevel=5)
def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[int]) -> None:
    # TODO: this code is duplicated in DDP and DDPSpawn, make this a function
    global_rank = global_rank if global_rank is not None else self.cluster_environment.global_rank()
    world_size = world_size if world_size is not None else self.cluster_environment.world_size()
    os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()
    os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())
    if not torch.distributed.is_initialized():
        log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
        torch.distributed.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)

        # on rank=0 let everyone know training is starting
        rank_zero_info(
            f"{'-' * 100}\n"
            f"distributed_backend={self.torch_distributed_backend}\n"
            f"All DDP processes registered. Starting ddp with {self.world_size} processes\n"
            f"{'-' * 100}\n"
        )
def _initialize_deepspeed_train(self, model):
    optimizer, lightning_scheduler, optimizer_frequencies = None, None, None
    if "optimizer" not in self.config:
        rank_zero_info(
            "You have not specified an optimizer or scheduler within the DeepSpeed config."
            " Using `configure_optimizers` to define optimizer and scheduler."
        )
        optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()

    model_parameters = filter(lambda p: p.requires_grad, self.model.parameters())
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
        args=SimpleNamespace(local_rank=self.local_rank),
        model=model,
        model_parameters=model_parameters,
        optimizer=optimizer,
        lr_scheduler=lightning_scheduler,
        config_params=self.config,
    )
    self._set_deepspeed_activation_checkpointing()

    # set optimizer for save/load, but deepspeed manages the specific optimizer logic
    self.lightning_module.trainer.optimizers = [optimizer]
    self.lightning_module.trainer.schedulers = [lr_scheduler]
    self.model = model
def configure_slurm_ddp(self, num_gpu_nodes):
    self.is_slurm_managing_tasks = False

    # extract SLURM flag vars
    # whenever we have the correct number of tasks, we let slurm manage processes
    # otherwise we launch the required number of processes
    if self.use_ddp:
        self.num_requested_gpus = self.num_gpus * num_gpu_nodes
        self.num_slurm_tasks = 0
        try:
            self.num_slurm_tasks = int(os.environ['SLURM_NTASKS'])
            # self.is_slurm_managing_tasks = self.num_slurm_tasks == self.num_requested_gpus
            self.is_slurm_managing_tasks = True
            print(self.num_slurm_tasks, self.num_requested_gpus)

            # in interactive mode we don't manage tasks
            job_name = os.environ['SLURM_JOB_NAME']
            if job_name == 'bash':
                self.is_slurm_managing_tasks = False

        except Exception:
            # likely not on slurm, so set the slurm managed flag to false
            self.is_slurm_managing_tasks = False

    # used for tests only, set this flag to simulate slurm managing a task
    try:
        should_fake = int(os.environ['FAKE_SLURM_MANAGING_TASKS'])
        if should_fake:
            self.is_slurm_managing_tasks = True
    except Exception:
        pass

    # notify the user that slurm is managing tasks
    if self.is_slurm_managing_tasks:
        rank_zero_info('Multi-processing is handled by Slurm.')
def slurm_sigusr1_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None:
    rank_zero_info("handling SIGUSR1")

    # save logger to make sure we get all the metrics
    if self.trainer.logger:
        self.trainer.logger.finalize("finished")
    hpc_save_path = self.trainer._checkpoint_connector.hpc_save_path(self.trainer.weights_save_path)
    self.trainer.save_checkpoint(hpc_save_path)

    if self.trainer.is_global_zero:
        # find job id
        job_id = os.environ["SLURM_JOB_ID"]
        cmd = ["scontrol", "requeue", job_id]

        # requeue job
        log.info(f"requeueing job {job_id}...")

        try:
            result = call(cmd)
        except FileNotFoundError:
            # This can occur if a subprocess call to `scontrol` is run outside a shell context
            # Re-attempt call (now with shell context). If any error is raised, propagate to user.
            # When running a shell command, it should be passed as a single string.
            joint_cmd = [str(x) for x in cmd]
            result = call(" ".join(joint_cmd), shell=True)

        # print result text
        if result == 0:
            log.info(f"requeued exp {job_id}")
        else:
            log.warning("requeue failed...")
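A minimal standalone sketch of the `scontrol requeue` fallback used above, with a hypothetical job id: `subprocess.call` is retried through the shell when the bare call cannot locate the executable.

from subprocess import call

job_id = "12345"  # hypothetical; the handler above reads os.environ["SLURM_JOB_ID"]
cmd = ["scontrol", "requeue", job_id]
try:
    result = call(cmd)
except FileNotFoundError:
    # outside a shell context the executable may not be found; retry through the shell,
    # where the command must be passed as a single string
    result = call(" ".join(cmd), shell=True)
print("requeued" if result == 0 else "requeue failed")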
def _check_time_remaining(self, trainer: "pl.Trainer") -> None: should_stop = self.time_elapsed() >= self._duration should_stop = trainer.training_type_plugin.broadcast(should_stop) trainer.should_stop = trainer.should_stop or should_stop if should_stop and self._verbose: elapsed = timedelta(seconds=int(self.time_elapsed(RunningStage.TRAINING))) rank_zero_info(f"Time limit reached. Elapsed time is {elapsed}. Signaling Trainer to stop.")
def _initialize_deepspeed_inference(self, model):
    # todo: Currently DeepSpeed requires optimizers at inference to partition weights correctly
    optimizer, lightning_scheduler, optimizer_frequencies = None, None, None
    if "optimizer" not in self.config:
        rank_zero_info(
            "You have not specified an optimizer or scheduler within the DeepSpeed config."
            " Using `configure_optimizers` to define optimizer and scheduler."
        )
        optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()
    inference_config = {
        # todo: this is required for DeepSpeed throughput timers, or throughput timers will be incorrect
        'train_micro_batch_size_per_gpu': 1,
    }
    if 'fp16' in self.config:
        inference_config.update({"fp16": self.config["fp16"]})
    if self.zero_stage_3:
        inference_config.update({
            "zero_allow_untested_optimizer": self.config['zero_allow_untested_optimizer'],
            "zero_optimization": self.config['zero_optimization'],
        })
    # Remove all module hooks before initializing new model
    remove_module_hooks(model)
    model, _, _, _ = deepspeed.initialize(
        args=SimpleNamespace(local_rank=self.local_rank),
        model=model,
        optimizer=optimizer,
        lr_scheduler=lightning_scheduler,
        config_params=inference_config,
        model_parameters=[],
    )
    self.model = model
def _initialize_deepspeed_train(self, model):
    if "optimizer" in self.config:
        optimizer, lr_scheduler = None, _get_default_scheduler_config()
    else:
        rank_zero_info(
            "You have not specified an optimizer or scheduler within the DeepSpeed config."
            " Using `configure_optimizers` to define optimizer and scheduler."
        )
        optimizer, lr_scheduler, _ = self._init_optimizers()
    scheduler = lr_scheduler["scheduler"]

    model, deepspeed_optimizer = self._setup_model_and_optimizer(model, optimizer, scheduler)
    self._set_deepspeed_activation_checkpointing()

    # although we set these here, deepspeed manages the specific optimizer logic
    self.lightning_module.trainer.optimizers = [deepspeed_optimizer]

    deepspeed_scheduler = model.lr_scheduler
    if deepspeed_scheduler is not None:
        # disable deepspeed lr scheduling as lightning manages scheduling
        model.lr_scheduler = None
        lr_scheduler["scheduler"] = deepspeed_scheduler
        self.lightning_module.trainer.lr_schedulers = [lr_scheduler]
    self.model = model
def _format_precision_config(self):
    amp_type = self.lightning_module.trainer.accelerator_connector.amp_type
    amp_level = self.lightning_module.trainer.accelerator_connector.amp_level
    precision = self.lightning_module.trainer.accelerator_connector.precision
    if precision == 16:
        if "fp16" not in self.config and amp_type == AMPType.NATIVE:
            # FP16 is a DeepSpeed standalone AMP implementation
            rank_zero_info("Enabling DeepSpeed FP16.")
            self.config["fp16"] = {
                "enabled": True,
                "loss_scale": self.loss_scale,
                "initial_scale_power": self.initial_scale_power,
                "loss_scale_window": self.loss_scale_window,
                "hysteresis": self.hysteresis,
                "min_loss_scale": self.min_loss_scale,
            }
        elif "amp" not in self.config and amp_type == AMPType.APEX:
            rank_zero_info("Enabling DeepSpeed APEX Implementation.")
            self.config["amp"] = {
                "enabled": True,
                "opt_level": amp_level,
            }
    if "zero_optimization" in self.config and not ("amp" in self.config or "fp16" in self.config):
        raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.")
def _initialize_deepspeed_train(self, model):
    optimizer, scheduler = None, None
    if "optimizer" in self.config:
        rank_zero_info(
            "You have specified an optimizer and/or scheduler within the DeepSpeed config."
            " It is recommended to define it in `LightningModule.configure_optimizers`."
        )
        lr_scheduler = None
    else:
        optimizer, lr_scheduler, _ = self._init_optimizers()
        if lr_scheduler is not None:
            scheduler = lr_scheduler.scheduler

    model, deepspeed_optimizer = self._setup_model_and_optimizer(model, optimizer, scheduler)
    self._set_deepspeed_activation_checkpointing()

    # although we set these here, deepspeed manages the specific optimizer logic
    self.optimizers = [deepspeed_optimizer]

    deepspeed_scheduler = model.lr_scheduler
    if deepspeed_scheduler is not None:
        # disable deepspeed lr scheduling as lightning manages scheduling
        model.lr_scheduler = None
        if lr_scheduler is None:
            lr_scheduler = LRSchedulerConfig(deepspeed_scheduler)
        else:
            lr_scheduler.scheduler = deepspeed_scheduler
        self.lr_scheduler_configs = [lr_scheduler]
    self.model = model
def check_checkpoint_callback(self, should_save, is_last=False):
    # TODO bake this logic into the checkpoint callback
    if should_save:
        checkpoint_callbacks = [c for c in self.trainer.callbacks if isinstance(c, ModelCheckpoint)]
        if is_last and any(c.save_last for c in checkpoint_callbacks):
            rank_zero_info("Saving latest checkpoint...")
        model = self.trainer.get_model()
        for c in checkpoint_callbacks:
            c.on_validation_end(self.trainer, model)
def _check_time_remaining(self, trainer: 'pl.Trainer') -> None:
    should_stop = self.time_elapsed() >= self._duration
    should_stop = trainer.accelerator.broadcast(should_stop)
    trainer.should_stop = trainer.should_stop or should_stop
    if should_stop and self._verbose:
        rank_zero_info(f"Time limit reached. Elapsed time is {self.time_elapsed()}. Signaling Trainer to stop.")
def _load_config(self, config):
    if config is None and self.DEEPSPEED_ENV_VAR in os.environ:
        rank_zero_info(f"Loading DeepSpeed config from set {self.DEEPSPEED_ENV_VAR} environment variable")
        config = os.environ[self.DEEPSPEED_ENV_VAR]
    if isinstance(config, (str, Path)):
        if not os.path.isfile(config):
            raise MisconfigurationException(
                f"You passed in a path to a DeepSpeed config but the path does not exist: {config}"
            )
        with open(config) as f:
            config = json.load(f)
    return config
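A minimal standalone sketch of the same path-or-dict config resolution outside the plugin; the environment variable name mirrors `DEEPSPEED_ENV_VAR` but is an assumption here, and the exception type is a plain `FileNotFoundError` rather than `MisconfigurationException`.

import json
import os
from pathlib import Path
from typing import Optional, Union

DEEPSPEED_ENV_VAR = "PL_DEEPSPEED_CONFIG_PATH"  # assumed name of the env var checked above

def load_deepspeed_config(config: Optional[Union[str, Path, dict]]) -> Optional[dict]:
    # fall back to a path provided via the environment, then load JSON if given a path;
    # a dict passes through unchanged
    if config is None and DEEPSPEED_ENV_VAR in os.environ:
        config = os.environ[DEEPSPEED_ENV_VAR]
    if isinstance(config, (str, Path)):
        if not os.path.isfile(config):
            raise FileNotFoundError(f"DeepSpeed config path does not exist: {config}")
        with open(config) as f:
            config = json.load(f)
    return config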
def check_checkpoint_callback(self, should_update, is_last=False):
    # TODO bake this logic into the ModelCheckpoint callback
    if should_update and self.trainer.checkpoint_connector.has_trained:
        callbacks = self.trainer.checkpoint_callbacks

        if is_last and any(cb.save_last and cb.verbose for cb in callbacks):
            rank_zero_info("Saving latest checkpoint...")

        model = self.trainer.lightning_module

        for cb in callbacks:
            cb.on_validation_end(self.trainer, model)
def main(cfg: DictConfig) -> None:
    rank_zero_info(OmegaConf.to_yaml(cfg))
    instantiator = HydraInstantiator()
    logger = instantiator.logger(cfg)
    run(
        instantiator,
        ignore_warnings=cfg.get("ignore_warnings"),
        run_test_after_fit=cfg.get("training").get("run_test_after_fit"),
        dataset=cfg.get("dataset"),
        tokenizer=cfg.get("tokenizer"),
        task=cfg.get("task"),
        trainer=cfg.get("trainer"),
        logger=logger,
    )
def main(cfg: DictConfig) -> Any:
    rank_zero_info(OmegaConf.to_yaml(cfg))
    instantiator = HydraInstantiator()
    y = run(
        cfg.x,
        instantiator,
        checkpoint_path=cfg.get("checkpoint_path"),
        task=cfg.task,
        model_data_kwargs=cfg.get("model_data_kwargs"),
        tokenizer=cfg.get("tokenizer"),
        pipeline_kwargs=cfg.get("pipeline_kwargs", {}),
        predict_kwargs=cfg.get("predict_kwargs", {}),
    )
    rank_zero_info(y)
    return y
def _check_time_remaining(self, trainer) -> None:
    # The default timer only checks whether training time exceeds max_time; this includes time for all stages.
    train_duration = self.time_elapsed(RunningStage.TRAINING)
    validation_duration = self.time_elapsed(RunningStage.VALIDATING)
    test_duration = self.time_elapsed(RunningStage.TESTING)
    total_duration = train_duration + validation_duration + test_duration
    should_stop = total_duration >= self._duration
    # should_stop = trainer.training_type_plugin.broadcast(should_stop)
    should_stop = trainer.training_type_plugin.reduce_boolean_decision(should_stop)
    trainer.should_stop = trainer.should_stop or should_stop
    if should_stop and self._verbose:
        rank_zero_info("Time limit reached. Signaling Trainer to stop.")
        rank_zero_info(
            f"Spent {timedelta(seconds=train_duration)} on training,"
            f" {timedelta(seconds=validation_duration)} on validation"
            f" and {timedelta(seconds=test_duration)} on testing."
        )
def _format_precision_config(self) -> None:
    if self.precision_plugin.precision in (PrecisionType.HALF, PrecisionType.MIXED):
        if "fp16" not in self.config and self.precision_plugin.amp_type == AMPType.NATIVE:
            # FP16 is a DeepSpeed standalone AMP implementation
            rank_zero_info("Enabling DeepSpeed FP16.")
            self.config["fp16"] = {
                "enabled": True,
                "loss_scale": self.loss_scale,
                "initial_scale_power": self.initial_scale_power,
                "loss_scale_window": self.loss_scale_window,
                "hysteresis": self.hysteresis,
                "min_loss_scale": self.min_loss_scale,
            }
        elif "amp" not in self.config and self.precision_plugin.amp_type == AMPType.APEX:
            rank_zero_info("Enabling DeepSpeed APEX Implementation.")
            self.config["amp"] = {"enabled": True, "opt_level": self.precision_plugin.amp_level}
def determine_ddp_node_rank(self):
    if self.trainer.is_slurm_managing_tasks:
        return int(os.environ['SLURM_NODEID'])

    # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK.
    # otherwise use given node rank or default to node rank 0
    env_vars = ['NODE_RANK', 'GROUP_RANK']
    node_ids = [(k, os.environ.get(k, None)) for k in env_vars]
    node_ids = [(k, v) for k, v in node_ids if v is not None]
    if len(node_ids) == 0:
        return 0
    if len(node_ids) > 1:
        log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. Using the first one.")
    k, rank = node_ids.pop()
    rank_zero_info(f"Using environment variable {k} for node rank ({rank}).")
    return int(rank)
def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
    if data_parallel_device_ids is None:
        return

    # set the correct cuda visible devices (using pci order)
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

    # when slurm is managing the task it sets the visible devices
    if not is_slurm_managing_tasks and 'CUDA_VISIBLE_DEVICES' not in os.environ:
        if isinstance(data_parallel_device_ids, int):
            id_str = ','.join(str(x) for x in range(data_parallel_device_ids))
            os.environ["CUDA_VISIBLE_DEVICES"] = id_str
        else:
            gpu_str = ','.join(str(x) for x in data_parallel_device_ids)
            os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str

    # don't make this debug... this is good UX
    rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
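A minimal standalone sketch of how the device-id handling above turns either a GPU count or an explicit id list into a CUDA_VISIBLE_DEVICES string; the example ids are hypothetical.

import os
from typing import List, Union

def cuda_visible_devices(device_ids: Union[int, List[int]]) -> str:
    # an int means "the first N GPUs"; a list means "exactly these ids" (PCI bus order)
    if isinstance(device_ids, int):
        return ",".join(str(x) for x in range(device_ids))
    return ",".join(str(x) for x in device_ids)

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ.setdefault("CUDA_VISIBLE_DEVICES", cuda_visible_devices([0, 2]))  # -> "0,2"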
def _format_precision_config(self):
    amp_type = self.lightning_module.trainer.accelerator_connector.amp_type
    amp_level = self.lightning_module.trainer.accelerator_connector.amp_level
    precision = self.lightning_module.trainer.accelerator_connector.precision
    if precision in (16, "mixed"):
        if "fp16" not in self.config and amp_type == AMPType.NATIVE:
            # FP16 is a DeepSpeed standalone AMP implementation
            rank_zero_info("Enabling DeepSpeed FP16.")
            self.config["fp16"] = {
                "enabled": True,
                "loss_scale": self.loss_scale,
                "initial_scale_power": self.initial_scale_power,
                "loss_scale_window": self.loss_scale_window,
                "hysteresis": self.hysteresis,
                "min_loss_scale": self.min_loss_scale,
            }
        elif "amp" not in self.config and amp_type == AMPType.APEX:
            rank_zero_info("Enabling DeepSpeed APEX Implementation.")
            self.config["amp"] = {"enabled": True, "opt_level": amp_level}
def setup(self, step: str) -> None:
    if step == "test":
        self._train_dataset = []
        self._test_dataset = ReaderDataset.load_dataset(
            self.hparams.test_file,
            self.hparams.base_pretrained_model,
            "test",
            getattr(self.hparams, "nq_gold_test_file", ""),
        )
        rank_zero_info("The number of test examples: %d", len(self._test_dataset))
    else:
        self._train_dataset = ReaderDataset.load_dataset(
            self.hparams.train_file,
            self.hparams.base_pretrained_model,
            "train",
            self.hparams.nq_gold_train_file,
        )
        rank_zero_info("The number of training examples: %d", len(self._train_dataset))
        self._val_dataset = ReaderDataset.load_dataset(
            self.hparams.validation_file,
            self.hparams.base_pretrained_model,
            "val",
            getattr(self.hparams, "nq_gold_validation_file", ""),
        )
        rank_zero_info("The number of validation examples: %d", len(self._val_dataset))
def configure_slurm_ddp(self, num_gpu_nodes):
    self.trainer.is_slurm_managing_tasks = False

    # extract SLURM flag vars
    # whenever we have the correct number of tasks, we let slurm manage processes
    # otherwise we launch the required number of processes
    if self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2):
        self.trainer.num_requested_gpus = self.trainer.num_gpus * num_gpu_nodes
        self.trainer.num_slurm_tasks = 0
        try:
            self.trainer.num_slurm_tasks = int(os.environ['SLURM_NTASKS'])
            self.trainer.is_slurm_managing_tasks = self.trainer.num_slurm_tasks == self.trainer.num_requested_gpus

            # enable slurm cpu
            if self.trainer.num_requested_gpus == 0:
                self.trainer.is_slurm_managing_tasks = self.trainer.num_slurm_tasks == self.trainer.num_processes

            # in interactive mode we don't manage tasks
            job_name = os.environ['SLURM_JOB_NAME']
            if job_name == 'bash':
                self.trainer.is_slurm_managing_tasks = False

        # todo: specify the possible exception
        except Exception:
            # likely not on slurm, so set the slurm managed flag to false
            self.trainer.is_slurm_managing_tasks = False

    # used for tests only, set this flag to simulate slurm managing a task
    should_fake = os.environ.get('FAKE_SLURM_MANAGING_TASKS')
    if should_fake and int(should_fake):
        self.trainer.is_slurm_managing_tasks = True

    # notify the user that slurm is managing tasks
    if self.trainer.is_slurm_managing_tasks:
        rank_zero_info('Multi-processing is handled by Slurm.')
def set_distributed_mode(self, distributed_backend: Optional[str] = None):
    if distributed_backend is None and self.is_training_type_in_plugins:
        return

    if distributed_backend is not None and distributed_backend in TrainingTypePluginsRegistry:
        self.distributed_backend = TrainingTypePluginsRegistry[distributed_backend]["distributed_backend"]
    elif distributed_backend is not None:
        self.distributed_backend = distributed_backend

    if isinstance(self.distributed_backend, Accelerator):
        return

    if self.distributed_backend is None:
        if self.has_horovodrun():
            self._set_horovod_backend()
        elif self.num_gpus == 0 and (self.num_nodes > 1 or self.num_processes > 1):
            self._distrib_type = DistributedType.DDP
        elif self.num_gpus > 1:
            rank_zero_warn(
                'You requested multiple GPUs but did not specify a backend, e.g.'
                ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`. Setting `accelerator="ddp_spawn"` for you.'
            )
            self.distributed_backend = "ddp_spawn"

    # special case with DDP on CPUs
    if self.distributed_backend == "ddp_cpu":
        self._distrib_type = DistributedType.DDP_SPAWN
        if self.num_gpus > 0:
            rank_zero_warn(
                'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.'
            )
            self.parallel_device_ids = None
        if self.num_processes is None:
            # define the max CPU available
            self.num_processes = os.cpu_count()
    # special case with TPUs
    elif self.distributed_backend == 'tpu' or self.tpu_cores is not None:
        self._device_type = DeviceType.TPU
        if isinstance(self.tpu_cores, int):
            self._distrib_type = DistributedType.TPU_SPAWN
    elif self.distributed_backend == 'ipu':
        self._device_type = DeviceType.IPU
    elif self.distributed_backend and self._distrib_type is None:
        self._distrib_type = DistributedType(self.distributed_backend)

    # unless the user explicitly requests CPU, use any GPUs that are available
    _on_cpu = self.distributed_backend and 'cpu' in self.distributed_backend
    if self.num_gpus > 0 and not _on_cpu:
        self._device_type = DeviceType.GPU

    _gpu_distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
    # DP and DDP2 cannot run without GPU
    if self.num_gpus == 0 and self._distrib_type in _gpu_distrib_types and not _on_cpu:
        rank_zero_warn(
            'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.'
        )
        # todo: in some cases this yields a comparison between None and int
        if (self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1):
            self._distrib_type = DistributedType.DDP
        else:
            rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.')
            self._distrib_type = None

    # finished configuring self._distrib_type, check ipython environment
    self.check_interactive_compatibility()

    # for DDP overwrite nb processes by requested GPUs
    if self._device_type == DeviceType.GPU and self._distrib_type in (
        DistributedType.DDP, DistributedType.DDP_SPAWN
    ):
        self.num_processes = self.num_gpus

    if self._device_type == DeviceType.GPU and self._distrib_type == DistributedType.DDP2:
        self.num_processes = self.num_nodes

    # Horovod is an extra case...
    if self.distributed_backend == "horovod":
        self._set_horovod_backend()

    using_valid_distributed = self.use_ddp or self.use_ddp2
    if self.num_nodes > 1 and not using_valid_distributed:
        # throw error to force user to choose a supported distributed type such as ddp or ddp2
        raise MisconfigurationException(
            'Your chosen distributed type does not support num_nodes > 1. '
            'Please set accelerator=ddp or accelerator=ddp2.'
        )

    rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
    num_tpu_cores = self.tpu_cores if self.tpu_cores is not None else 0
    rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_tpu_cores} TPU cores')
    num_ipus = self.ipus if self.ipus is not None else 0
    rank_zero_info(f'IPU available: {_IPU_AVAILABLE}, using: {num_ipus} IPUs')

    if torch.cuda.is_available() and self._device_type != DeviceType.GPU:
        rank_zero_warn(
            "GPU available but not used. Set the gpus flag in your trainer"
            " `Trainer(gpus=1)` or script `--gpus=1`."
        )
def set_distributed_mode(self, distributed_backend):
    self.use_dp = False
    self.use_ddp = False
    self.use_ddp2 = False
    self.use_horovod = False
    self.single_gpu = False

    if distributed_backend is None:
        if self.has_horovodrun():
            self._set_horovod_backend()
        elif self.num_gpus == 0:
            if self.num_nodes > 1 or self.num_processes > 1:
                self.use_ddp = True  # ddp_cpu
        elif self.num_gpus == 1:
            self.single_gpu = True
        elif self.num_gpus > 1:
            rank_zero_warn(
                'You requested multiple GPUs but did not specify a backend, e.g.'
                ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                ' Setting distributed_backend=ddp_spawn for you.'
            )
            self.distributed_backend = 'ddp_spawn'
            distributed_backend = 'ddp_spawn'

    if distributed_backend == "dp":
        # do nothing if num_gpus == 0
        if self.num_gpus == 1:
            self.single_gpu = True
            self.use_dp = True
        elif self.num_gpus > 1:
            self.use_dp = True
    elif distributed_backend in ['ddp', 'ddp_spawn']:
        if self.num_gpus == 0:
            if self.num_nodes > 1 or self.num_processes > 1:
                self.use_ddp = True  # ddp_cpu
        elif self.num_gpus == 1:
            self.single_gpu = True
            self.use_ddp = True
        elif self.num_gpus > 1:
            self.use_ddp = True
            self.num_processes = self.num_gpus
    elif distributed_backend == "ddp2":
        # do nothing if num_gpus == 0
        if self.num_gpus >= 1:
            self.use_ddp2 = True
    elif distributed_backend == "ddp_cpu":
        if self.num_gpus > 0:
            rank_zero_warn(
                'You requested one or more GPUs, but set the backend to `ddp_cpu`.'
                ' Training will not use GPUs.'
            )
        self.use_ddp = True
        self.data_parallel_device_ids = None
        self.on_gpu = False
    elif distributed_backend == 'horovod':
        self._set_horovod_backend()

    # throw error to force user ddp or ddp2 choice
    if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp):
        raise MisconfigurationException(
            'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
            'To silence this warning set distributed_backend=ddp or distributed_backend=ddp2'
        )

    rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}')
def set_distributed_mode(self):
    self.trainer.use_dp = False
    self.trainer.use_ddp = False
    self.trainer.use_ddp2 = False
    self.trainer.use_horovod = False
    self.trainer.use_single_gpu = False

    if self.trainer.distributed_backend is None:
        if self.has_horovodrun():
            self._set_horovod_backend()
        elif self.trainer.num_gpus == 0:
            if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1:
                self.trainer.use_ddp = True  # ddp_cpu
        elif self.trainer.num_gpus == 1:
            self.trainer.use_single_gpu = True
        elif self.trainer.num_gpus > 1:
            rank_zero_warn(
                'You requested multiple GPUs but did not specify a backend, e.g.'
                ' Trainer(distributed_backend="dp"|"ddp"|"ddp2").'
                ' Setting distributed_backend="ddp_spawn" for you.'
            )
            self.trainer.distributed_backend = "ddp_spawn"

    if self.trainer.distributed_backend == "dp":
        # do nothing if num_gpus == 0
        if self.trainer.num_gpus == 1:
            self.trainer.use_single_gpu = True
            self.trainer.use_dp = True
        elif self.trainer.num_gpus > 1:
            self.trainer.use_dp = True
    elif self.trainer.distributed_backend in ("ddp", "ddp_spawn"):
        if self.trainer.num_gpus == 0:
            if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1:
                self.trainer.use_ddp = True  # ddp_cpu
        elif self.trainer.num_gpus == 1:
            self.trainer.use_single_gpu = True
            self.trainer.use_ddp = True
        elif self.trainer.num_gpus > 1:
            self.trainer.use_ddp = True
            self.trainer.num_processes = self.trainer.num_gpus
    elif self.trainer.distributed_backend == "ddp2":
        # do nothing if num_gpus == 0
        if self.trainer.num_gpus >= 1:
            self.trainer.use_ddp2 = True
    elif self.trainer.distributed_backend == "ddp_cpu":
        if self.trainer.num_gpus > 0:
            rank_zero_warn(
                'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.'
            )
        self.trainer.use_ddp = True
        self.trainer.data_parallel_device_ids = None
        self.trainer.on_gpu = False
    elif self.trainer.distributed_backend == "horovod":
        self._set_horovod_backend()

    # throw error to force user ddp or ddp2 choice
    if self.trainer.num_nodes > 1 and not (self.trainer.use_ddp2 or self.trainer.use_ddp):
        raise MisconfigurationException(
            'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
            'To silence this warning set distributed_backend=ddp or distributed_backend=ddp2'
        )

    rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self.trainer.on_gpu}')
    num_cores = self.trainer.tpu_cores if self.trainer.tpu_cores is not None else 0
    rank_zero_info(f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores')

    if torch.cuda.is_available() and not self.trainer.on_gpu:
        rank_zero_warn('GPU available but not used. Set the --gpus flag when calling the script.')