Example #1
    def configure_optimizers(self) -> Tuple[list, list]:
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if p.requires_grad and not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in param_optimizer if p.requires_grad and any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        self.optimizer = AdamW(optimizer_parameters, lr=self.hparams.learning_rate)
        num_training_steps = int(
            len(self._train_examples)
            // (self.hparams.train_batch_size * self.trainer.num_gpus)
            // self.hparams.accumulate_grad_batches
            * float(self.hparams.max_epochs)
        )
        rank_zero_info("The total number of training steps: %d", num_training_steps)

        warmup_steps = int(self.hparams.warmup_proportion * num_training_steps)

        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
        )
        return [self.optimizer], [dict(scheduler=self.scheduler, interval="step")]
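The snippet above pairs a no-weight-decay parameter split with AdamW and a linear warmup schedule. Below is a minimal standalone sketch of the same pattern in plain PyTorch; the tiny model, hyperparameter values, and step counts are illustrative assumptions, and the lr_lambda only approximates what get_linear_schedule_with_warmup produces.

import torch
from torch import nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR


class TinyModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.proj = nn.Linear(16, 32)
        self.LayerNorm = nn.LayerNorm(32)  # named so the "LayerNorm.weight" filter matches
        self.head = nn.Linear(32, 4)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.head(self.LayerNorm(self.proj(x)))


model = TinyModel()
no_decay = ("bias", "LayerNorm.weight")
param_groups = [
    {   # regular weights get weight decay
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {   # biases and LayerNorm weights do not
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(param_groups, lr=3e-5)

# Linear warmup followed by linear decay, roughly what get_linear_schedule_with_warmup yields.
num_training_steps, num_warmup_steps = 1000, 100


def lr_lambda(step: int) -> float:
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    return max(0.0, (num_training_steps - step) / max(1, num_training_steps - num_warmup_steps))


scheduler = LambdaLR(optimizer, lr_lambda)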
Example #2
 def _initialize_deepspeed_inference(self, model):
     # todo: Currently DeepSpeed requires optimizers at inference to partition weights correctly
     optimizer, scheduler = None, None
     if "optimizer" not in self.config:
         rank_zero_info(
             "You have not specified an optimizer or scheduler within the DeepSpeed config."
             "Using `configure_optimizers` to define optimizer and scheduler."
         )
         optimizer, lr_scheduler, _ = self._init_optimizers()
         scheduler = lr_scheduler["scheduler"]
     inference_config = {
         # todo: this is required for DeepSpeed throughput timers, or throughput timers will be incorrect
         "train_micro_batch_size_per_gpu": 1
     }
     if "fp16" in self.config:
         inference_config.update({"fp16": self.config["fp16"]})
     if self.zero_stage_3:
         inference_config.update({
             "zero_allow_untested_optimizer":
             self.config["zero_allow_untested_optimizer"],
             "zero_optimization":
             self.config["zero_optimization"],
         })
     # Remove all module hooks before initializing new model
     remove_module_hooks(model)
     model, _, _, _ = deepspeed.initialize(
         config=inference_config,
         model=model,
         optimizer=optimizer,
         lr_scheduler=scheduler,
         model_parameters=[],
         dist_init_required=False,
     )
     self.model = model
Example #3
    def _initialize_deepspeed_train(self, model):
        if "optimizer" in self.config:
            optimizer, lr_scheduler = None, _get_default_scheduler_config()
        else:
            rank_zero_info(
                "You have not specified an optimizer or scheduler within the DeepSpeed config."
                "Using `configure_optimizers` to define optimizer and scheduler."
            )
            optimizer, lr_scheduler, _ = self._init_optimizers()

        scheduler = lr_scheduler["scheduler"]

        model_parameters = filter(lambda p: p.requires_grad,
                                  self.model.parameters())
        model, deepspeed_optimizer, _, deepspeed_scheduler = deepspeed.initialize(
            config=self.config,
            model=model,
            model_parameters=model_parameters,
            optimizer=optimizer,
            lr_scheduler=scheduler,
            dist_init_required=False,
        )

        self._set_deepspeed_activation_checkpointing()

        # although we set these here, deepspeed manages the specific optimizer logic
        self.lightning_module.trainer.optimizers = [deepspeed_optimizer]
        if deepspeed_scheduler is not None:
            lr_scheduler["scheduler"] = deepspeed_scheduler
            self.lightning_module.trainer.lr_schedulers = [lr_scheduler]
        self.model = model
Example #4
    def configure_slurm_ddp(self):
        # extract SLURM flag vars
        # whenever we have the correct number of tasks, we let slurm manage processes
        # otherwise we launch the required number of processes
        if self.use_ddp or self.use_ddp2:
            num_requested_gpus = self.num_gpus * self.num_nodes
            num_slurm_tasks = 0
            try:
                num_slurm_tasks = int(os.environ["SLURM_NTASKS"])
                self.is_slurm_managing_tasks = num_slurm_tasks == num_requested_gpus

                # enable slurm cpu
                if num_requested_gpus == 0:
                    self.is_slurm_managing_tasks = num_slurm_tasks == self.num_processes

                # in interactive mode we don't manage tasks
                job_name = os.environ["SLURM_JOB_NAME"]
                if job_name == "bash":
                    self.is_slurm_managing_tasks = False

            except Exception:
                # likely not on slurm, so set the slurm managed flag to false
                self.is_slurm_managing_tasks = False

        # used for tests only, set this flag to simulate slurm managing a task
        try:
            should_fake = int(os.environ["FAKE_SLURM_MANAGING_TASKS"])
            if should_fake:
                self.is_slurm_managing_tasks = True
        except Exception:
            pass

        # notify the user that slurm is managing tasks
        if self.is_slurm_managing_tasks:
            rank_zero_info("Multi-processing is handled by Slurm.")
Example #5
def test_v1_8_0_rank_zero_imports():

    import warnings

    from pytorch_lightning.utilities.distributed import rank_zero_debug, rank_zero_info
    from pytorch_lightning.utilities.warnings import LightningDeprecationWarning, rank_zero_deprecation, rank_zero_warn

    with pytest.deprecated_call(
        match="pytorch_lightning.utilities.distributed.rank_zero_debug has been deprecated in v1.6"
        " and will be removed in v1.8."
    ):
        rank_zero_debug("foo")
    with pytest.deprecated_call(
        match="pytorch_lightning.utilities.distributed.rank_zero_info has been deprecated in v1.6"
        " and will be removed in v1.8."
    ):
        rank_zero_info("foo")
    with pytest.deprecated_call(
        match="pytorch_lightning.utilities.warnings.rank_zero_warn has been deprecated in v1.6"
        " and will be removed in v1.8."
    ):
        rank_zero_warn("foo")
    with pytest.deprecated_call(
        match="pytorch_lightning.utilities.warnings.rank_zero_deprecation has been deprecated in v1.6"
        " and will be removed in v1.8."
    ):
        rank_zero_deprecation("foo")
    with pytest.deprecated_call(
        match="pytorch_lightning.utilities.warnings.LightningDeprecationWarning has been deprecated in v1.6"
        " and will be removed in v1.8."
    ):
        warnings.warn("foo", LightningDeprecationWarning, stacklevel=5)
Example #6
    def init_ddp_connection(self, global_rank: Optional[int],
                            world_size: Optional[int]) -> None:
        # TODO: this code is duplicated in DDP and DDPSpawn, make this a function
        global_rank = global_rank if global_rank is not None else self.cluster_environment.global_rank()
        world_size = world_size if world_size is not None else self.cluster_environment.world_size()
        os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()
        os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())

        if not torch.distributed.is_initialized():
            log.info(
                f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}"
            )
            torch.distributed.init_process_group(
                self.torch_distributed_backend,
                rank=global_rank,
                world_size=world_size)

            # on rank=0 let everyone know training is starting
            rank_zero_info(
                f"{'-' * 100}\n"
                f"distributed_backend={self.torch_distributed_backend}\n"
                f"All DDP processes registered. Starting ddp with {self.world_size} processes\n"
                f"{'-' * 100}\n")
Example #7
    def _initialize_deepspeed_train(self, model):
        optimizer, lightning_scheduler, optimizer_frequencies = None, None, None
        if "optimizer" not in self.config:
            rank_zero_info(
                "You have not specified an optimizer or scheduler within the DeepSpeed config."
                "Using `configure_optimizers` to define optimizer and scheduler."
            )
            optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()
        model_parameters = filter(lambda p: p.requires_grad,
                                  self.model.parameters())
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            args=SimpleNamespace(local_rank=self.local_rank),
            model=model,
            model_parameters=model_parameters,
            optimizer=optimizer,
            lr_scheduler=lightning_scheduler,
            config_params=self.config,
        )
        self._set_deepspeed_activation_checkpointing()

        # set optimizer for save/load, but deepspeed manages the specific optimizer logic
        self.lightning_module.trainer.optimizers = [optimizer]
        self.lightning_module.trainer.schedulers = [lr_scheduler]
        self.model = model
Example #8
    def configure_slurm_ddp(self, num_gpu_nodes):
        self.is_slurm_managing_tasks = False

        # extract SLURM flag vars
        # whenever we have the correct number of tasks, we let slurm manage processes
        # otherwise we launch the required number of processes
        if self.use_ddp:
            self.num_requested_gpus = self.num_gpus * num_gpu_nodes
            self.num_slurm_tasks = 0
            try:
                self.num_slurm_tasks = int(os.environ['SLURM_NTASKS'])
                # self.is_slurm_managing_tasks = self.num_slurm_tasks == self.num_requested_gpus
                self.is_slurm_managing_tasks = True
                print(self.num_slurm_tasks, self.num_requested_gpus)
                # in interactive mode we don't manage tasks
                job_name = os.environ['SLURM_JOB_NAME']
                if job_name == 'bash':
                    self.is_slurm_managing_tasks = False

            except Exception:
                # likely not on slurm, so set the slurm managed flag to false
                self.is_slurm_managing_tasks = False

        # used for tests only, set this flag to simulate slurm managing a task
        try:
            should_fake = int(os.environ['FAKE_SLURM_MANAGING_TASKS'])
            if should_fake:
                self.is_slurm_managing_tasks = True
        except Exception:
            pass

        # notify the user that slurm is managing tasks
        if self.is_slurm_managing_tasks:
            rank_zero_info('Multi-processing is handled by Slurm.')
Example #9
    def slurm_sigusr1_handler_fn(self, signum: _SIGNUM,
                                 frame: FrameType) -> None:
        rank_zero_info("handling SIGUSR1")

        # save logger to make sure we get all the metrics
        if self.trainer.logger:
            self.trainer.logger.finalize("finished")
        hpc_save_path = self.trainer._checkpoint_connector.hpc_save_path(
            self.trainer.weights_save_path)
        self.trainer.save_checkpoint(hpc_save_path)

        if self.trainer.is_global_zero:
            # find job id
            job_id = os.environ["SLURM_JOB_ID"]
            cmd = ["scontrol", "requeue", job_id]

            # requeue job
            log.info(f"requeing job {job_id}...")
            try:
                result = call(cmd)
            except FileNotFoundError:
                # This can occur if a subprocess call to `scontrol` is run outside a shell context
                # Re-attempt call (now with shell context). If any error is raised, propagate to user.
                # When running a shell command, it should be passed as a single string.
                joint_cmd = [str(x) for x in cmd]
                result = call(" ".join(joint_cmd), shell=True)

            # print result text
            if result == 0:
                log.info(f"requeued exp {job_id}")
            else:
                log.warning("requeue failed...")
Example #10
 def _check_time_remaining(self, trainer: "pl.Trainer") -> None:
     should_stop = self.time_elapsed() >= self._duration
     should_stop = trainer.training_type_plugin.broadcast(should_stop)
     trainer.should_stop = trainer.should_stop or should_stop
     if should_stop and self._verbose:
         elapsed = timedelta(seconds=int(self.time_elapsed(RunningStage.TRAINING)))
         rank_zero_info(f"Time limit reached. Elapsed time is {elapsed}. Signaling Trainer to stop.")
Example #11
 def _initialize_deepspeed_inference(self, model):
     # todo: Currently DeepSpeed requires optimizers at inference to partition weights correctly
     optimizer, lightning_scheduler, optimizer_frequencies = None, None, None
     if "optimizer" not in self.config:
         rank_zero_info(
             "You have not specified an optimizer or scheduler within the DeepSpeed config."
             "Using `configure_optimizers` to define optimizer and scheduler."
         )
         optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()
     inference_config = {
         # todo: this is required for DeepSpeed throughput timers, or throughput timers will be incorrect
         'train_micro_batch_size_per_gpu': 1,
     }
     if 'fp16' in self.config:
         inference_config.update({"fp16": self.config["fp16"]})
     if self.zero_stage_3:
         inference_config.update({
             "zero_allow_untested_optimizer":
             self.config['zero_allow_untested_optimizer'],
             "zero_optimization":
             self.config['zero_optimization'],
         })
     # Remove all module hooks before initializing new model
     remove_module_hooks(model)
     model, _, _, _ = deepspeed.initialize(
         args=SimpleNamespace(local_rank=self.local_rank),
         model=model,
         optimizer=optimizer,
         lr_scheduler=lightning_scheduler,
         config_params=inference_config,
         model_parameters=[],
     )
     self.model = model
Example #12
    def _initialize_deepspeed_train(self, model):
        if "optimizer" in self.config:
            optimizer, lr_scheduler = None, _get_default_scheduler_config()
        else:
            rank_zero_info(
                "You have not specified an optimizer or scheduler within the DeepSpeed config."
                " Using `configure_optimizers` to define optimizer and scheduler."
            )
            optimizer, lr_scheduler, _ = self._init_optimizers()

        scheduler = lr_scheduler["scheduler"]
        model, deepspeed_optimizer = self._setup_model_and_optimizer(
            model, optimizer, scheduler)
        self._set_deepspeed_activation_checkpointing()

        # although we set these here, deepspeed manages the specific optimizer logic
        self.lightning_module.trainer.optimizers = [deepspeed_optimizer]

        deepspeed_scheduler = model.lr_scheduler
        if deepspeed_scheduler is not None:
            # disable deepspeed lr scheduling as lightning manages scheduling
            model.lr_scheduler = None
            lr_scheduler["scheduler"] = deepspeed_scheduler
            self.lightning_module.trainer.lr_schedulers = [lr_scheduler]
        self.model = model
Example #13
 def _format_precision_config(self):
     amp_type = self.lightning_module.trainer.accelerator_connector.amp_type
     amp_level = self.lightning_module.trainer.accelerator_connector.amp_level
     precision = self.lightning_module.trainer.accelerator_connector.precision
     if precision == 16:
         if "fp16" not in self.config and amp_type == AMPType.NATIVE:
             # FP16 is a DeepSpeed standalone AMP implementation
             rank_zero_info("Enabling DeepSpeed FP16.")
             self.config["fp16"] = {
                 "enabled": True,
                 "loss_scale": self.loss_scale,
                 "initial_scale_power": self.initial_scale_power,
                 "loss_scale_window": self.loss_scale_window,
                 "hysteresis": self.hysteresis,
                 "min_loss_scale": self.min_loss_scale
             }
         elif "amp" not in self.config and amp_type == AMPType.APEX:
             rank_zero_only("Enabling DeepSpeed APEX Implementation.")
             self.config["amp"] = {
                 "enabled": True,
                 "opt_level": amp_level,
             }
     if "zero_optimization" in self.config and not ("amp" in self.config or
                                                    "fp16" in self.config):
         raise MisconfigurationException(
             "To use DeepSpeed ZeRO Optimization, you must set precision=16."
         )
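For reference, the fp16 branch above inserts a section of roughly this shape into self.config; the numeric values below are placeholders standing in for the plugin's attributes, not authoritative DeepSpeed defaults.

fp16_section = {
    "fp16": {
        "enabled": True,
        "loss_scale": 0,             # 0 typically means dynamic loss scaling
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1,
    }
}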
Example #14
    def _initialize_deepspeed_train(self, model):
        optimizer, scheduler = None, None
        if "optimizer" in self.config:
            rank_zero_info(
                "You have specified an optimizer and/or scheduler within the DeepSpeed config."
                " It is recommended to define it in `LightningModule.configure_optimizers`."
            )
            lr_scheduler = None
        else:
            optimizer, lr_scheduler, _ = self._init_optimizers()
            if lr_scheduler is not None:
                scheduler = lr_scheduler.scheduler

        model, deepspeed_optimizer = self._setup_model_and_optimizer(
            model, optimizer, scheduler)
        self._set_deepspeed_activation_checkpointing()

        # although we set these here, deepspeed manages the specific optimizer logic
        self.optimizers = [deepspeed_optimizer]

        deepspeed_scheduler = model.lr_scheduler
        if deepspeed_scheduler is not None:
            # disable deepspeed lr scheduling as lightning manages scheduling
            model.lr_scheduler = None
            if lr_scheduler is None:
                lr_scheduler = LRSchedulerConfig(deepspeed_scheduler)
            else:
                lr_scheduler.scheduler = deepspeed_scheduler
            self.lr_scheduler_configs = [lr_scheduler]
        self.model = model
Example #15
 def check_checkpoint_callback(self, should_save, is_last=False):
     # TODO bake this logic into the checkpoint callback
     if should_save:
         checkpoint_callbacks = [c for c in self.trainer.callbacks if isinstance(c, ModelCheckpoint)]
         if is_last and any(c.save_last for c in checkpoint_callbacks):
             rank_zero_info("Saving latest checkpoint...")
         model = self.trainer.get_model()
         [c.on_validation_end(self.trainer, model) for c in checkpoint_callbacks]
Example #16
 def _check_time_remaining(self, trainer: 'pl.Trainer') -> None:
     should_stop = self.time_elapsed() >= self._duration
     should_stop = trainer.accelerator.broadcast(should_stop)
     trainer.should_stop = trainer.should_stop or should_stop
     if should_stop and self._verbose:
         rank_zero_info(
             f"Time limit reached. Elapsed time is {self.time_elapsed}. Signaling Trainer to stop."
         )
Example #17
 def _load_config(self, config):
     if config is None and self.DEEPSPEED_ENV_VAR in os.environ:
         rank_zero_info(f"Loading DeepSpeed config from set {self.DEEPSPEED_ENV_VAR} environment variable")
         config = os.environ[self.DEEPSPEED_ENV_VAR]
     if isinstance(config, str) or isinstance(config, Path):
         if not os.path.isfile(config):
             raise MisconfigurationException(
                 f"You passed in a path to a DeepSpeed config but the path does not exist: {config}"
             )
         with open(config) as f:
             config = json.load(f)
     return config
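A minimal sketch of supplying the config as a JSON file path, matching the loader above; the file location and contents are illustrative, and the exact environment-variable name is whatever DEEPSPEED_ENV_VAR resolves to on the class.

import json
import os
import tempfile

cfg = {"train_micro_batch_size_per_gpu": 1, "zero_optimization": {"stage": 2}}
path = os.path.join(tempfile.mkdtemp(), "ds_config.json")
with open(path, "w") as f:
    json.dump(cfg, f)

# Pass `path` directly as `config`, or export it under the DEEPSPEED_ENV_VAR name
# before the plugin is constructed so _load_config picks it up.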
Example #18
    def check_checkpoint_callback(self, should_update, is_last=False):
        # TODO bake this logic into the ModelCheckpoint callback
        if should_update and self.trainer.checkpoint_connector.has_trained:
            callbacks = self.trainer.checkpoint_callbacks

            if is_last and any(cb.save_last and cb.verbose for cb in callbacks):
                rank_zero_info("Saving latest checkpoint...")

            model = self.trainer.lightning_module

            for cb in callbacks:
                cb.on_validation_end(self.trainer, model)
Example #19
def main(cfg: DictConfig) -> None:
    rank_zero_info(OmegaConf.to_yaml(cfg))
    instantiator = HydraInstantiator()
    logger = instantiator.logger(cfg)
    run(
        instantiator,
        ignore_warnings=cfg.get("ignore_warnings"),
        run_test_after_fit=cfg.get("training").get("run_test_after_fit"),
        dataset=cfg.get("dataset"),
        tokenizer=cfg.get("tokenizer"),
        task=cfg.get("task"),
        trainer=cfg.get("trainer"),
        logger=logger,
    )
Example #20
def main(cfg: DictConfig) -> Any:
    rank_zero_info(OmegaConf.to_yaml(cfg))
    instantiator = HydraInstantiator()
    y = run(
        cfg.x,
        instantiator,
        checkpoint_path=cfg.get("checkpoint_path"),
        task=cfg.task,
        model_data_kwargs=cfg.get("model_data_kwargs"),
        tokenizer=cfg.get("tokenizer"),
        pipeline_kwargs=cfg.get("pipeline_kwargs", {}),
        predict_kwargs=cfg.get("predict_kwargs", {}),
    )
    rank_zero_info(y)
    return y
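A minimal sketch of the OmegaConf pretty-printing and optional-key access these entry points rely on; the config contents are illustrative.

from omegaconf import OmegaConf

cfg = OmegaConf.create({"task": "text-classification", "trainer": {"max_epochs": 3}})
print(OmegaConf.to_yaml(cfg))   # what rank_zero_info(OmegaConf.to_yaml(cfg)) logs on rank 0
print(cfg.get("tokenizer"))     # None when the key is absent, like dict.get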
Example #21
 def _check_time_remaining(self, trainer) -> None:
     # The default Timer only checks training time against max_time; this version also counts validation and test time.
     train_duration = self.time_elapsed(RunningStage.TRAINING)
     validation_duration = self.time_elapsed(RunningStage.VALIDATING)
     test_duration = self.time_elapsed(RunningStage.TESTING)
     total_duration = train_duration + validation_duration + test_duration
     should_stop = total_duration >= self._duration
     # should_stop = trainer.training_type_plugin.broadcast(should_stop)
     should_stop = trainer.training_type_plugin.reduce_boolean_decision(should_stop)
     trainer.should_stop = trainer.should_stop or should_stop
     if should_stop and self._verbose:
         rank_zero_info(f"Time limit reached. Signaling Trainer to stop.")
         rank_zero_info(
             f"Spent {timedelta(seconds=train_duration)} seconds on training, {timedelta(seconds=validation_duration)} seconds on validation and {timedelta(seconds=test_duration)} seconds on testing"
         )
Example #22
 def _format_precision_config(self) -> None:
     if self.precision_plugin.precision in (PrecisionType.HALF, PrecisionType.MIXED):
         if "fp16" not in self.config and self.precision_plugin.amp_type == AMPType.NATIVE:
             # FP16 is a DeepSpeed standalone AMP implementation
             rank_zero_info("Enabling DeepSpeed FP16.")
             self.config["fp16"] = {
                 "enabled": True,
                 "loss_scale": self.loss_scale,
                 "initial_scale_power": self.initial_scale_power,
                 "loss_scale_window": self.loss_scale_window,
                 "hysteresis": self.hysteresis,
                 "min_loss_scale": self.min_loss_scale,
             }
         elif "amp" not in self.config and self.precision_plugin.amp_type == AMPType.APEX:
             rank_zero_info("Enabling DeepSpeed APEX Implementation.")
             self.config["amp"] = {"enabled": True, "opt_level": self.precision_plugin.amp_level}
    def determine_ddp_node_rank(self):
        if self.trainer.is_slurm_managing_tasks:
            return int(os.environ['SLURM_NODEID'])

        # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK.
        # otherwise use given node rank or default to node rank 0
        env_vars = ['NODE_RANK', 'GROUP_RANK']
        node_ids = [(k, os.environ.get(k, None)) for k in env_vars]
        node_ids = [(k, v) for k, v in node_ids if v is not None]
        if len(node_ids) == 0:
            return 0
        if len(node_ids) > 1:
            log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. Using the first one.")
        k, rank = node_ids.pop()
        rank_zero_info(f"Using environment variable {k} for node rank ({rank}).")
        return int(rank)
Example #24
    def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
        if data_parallel_device_ids is None:
            return

        # set the correct cuda visible devices (using pci order)
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

        # when slurm is managing the task it sets the visible devices
        if not is_slurm_managing_tasks and 'CUDA_VISIBLE_DEVICES' not in os.environ:
            if isinstance(data_parallel_device_ids, int):
                id_str = ','.join(str(x) for x in list(range(data_parallel_device_ids)))
                os.environ["CUDA_VISIBLE_DEVICES"] = id_str
            else:
                gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
                os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str

        # don't make this debug... this is good UX
        rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
Example #25
 def _format_precision_config(self):
     amp_type = self.lightning_module.trainer.accelerator_connector.amp_type
     amp_level = self.lightning_module.trainer.accelerator_connector.amp_level
     precision = self.lightning_module.trainer.accelerator_connector.precision
     if precision in (16, "mixed"):
         if "fp16" not in self.config and amp_type == AMPType.NATIVE:
             # FP16 is a DeepSpeed standalone AMP implementation
             rank_zero_info("Enabling DeepSpeed FP16.")
             self.config["fp16"] = {
                 "enabled": True,
                 "loss_scale": self.loss_scale,
                 "initial_scale_power": self.initial_scale_power,
                 "loss_scale_window": self.loss_scale_window,
                 "hysteresis": self.hysteresis,
                 "min_loss_scale": self.min_loss_scale,
             }
         elif "amp" not in self.config and amp_type == AMPType.APEX:
             rank_zero_only("Enabling DeepSpeed APEX Implementation.")
             self.config["amp"] = {"enabled": True, "opt_level": amp_level}
Example #26
    def setup(self, step: str) -> None:
        if step == "test":
            self._train_dataset = []
            self._test_dataset = ReaderDataset.load_dataset(
                self.hparams.test_file,
                self.hparams.base_pretrained_model,
                "test",
                getattr(self.hparams, "nq_gold_test_file", ""),
            )
            rank_zero_info("The number of test examples: %d",
                           len(self._test_dataset))

        else:
            self._train_dataset = ReaderDataset.load_dataset(
                self.hparams.train_file,
                self.hparams.base_pretrained_model,
                "train",
                self.hparams.nq_gold_train_file,
            )
            rank_zero_info("The number of training examples: %d",
                           len(self._train_dataset))

            self._val_dataset = ReaderDataset.load_dataset(
                self.hparams.validation_file,
                self.hparams.base_pretrained_model,
                "val",
                getattr(self.hparams, "nq_gold_validation_file", ""),
            )
            rank_zero_info("The number of validation examples: %d",
                           len(self._val_dataset))
Example #27
    def configure_slurm_ddp(self, num_gpu_nodes):
        self.trainer.is_slurm_managing_tasks = False

        # extract SLURM flag vars
        # whenever we have the correct number of tasks, we let slurm manage processes
        # otherwise we launch the required number of processes
        if self.trainer._distrib_type in (DistributedType.DDP,
                                          DistributedType.DDP_SPAWN,
                                          DistributedType.DDP2):
            self.trainer.num_requested_gpus = self.trainer.num_gpus * num_gpu_nodes
            self.trainer.num_slurm_tasks = 0
            try:
                self.trainer.num_slurm_tasks = int(os.environ['SLURM_NTASKS'])
                self.trainer.is_slurm_managing_tasks = self.trainer.num_slurm_tasks == self.trainer.num_requested_gpus

                # enable slurm cpu
                if self.trainer.num_requested_gpus == 0:
                    self.trainer.is_slurm_managing_tasks = self.trainer.num_slurm_tasks == self.trainer.num_processes

                # in interactive mode we don't manage tasks
                job_name = os.environ['SLURM_JOB_NAME']
                if job_name == 'bash':
                    self.trainer.is_slurm_managing_tasks = False
            # todo: specify the possible exception
            except Exception:
                # likely not on slurm, so set the slurm managed flag to false
                self.trainer.is_slurm_managing_tasks = False

        # used for tests only, set this flag to simulate slurm managing a task
        should_fake = os.environ.get('FAKE_SLURM_MANAGING_TASKS')
        if should_fake and int(should_fake):
            self.trainer.is_slurm_managing_tasks = True

        # notify the user that slurm is managing tasks
        if self.trainer.is_slurm_managing_tasks:
            rank_zero_info('Multi-processing is handled by Slurm.')
Example #28
    def set_distributed_mode(self, distributed_backend: Optional[str] = None):

        if distributed_backend is None and self.is_training_type_in_plugins:
            return

        if distributed_backend is not None and distributed_backend in TrainingTypePluginsRegistry:
            self.distributed_backend = TrainingTypePluginsRegistry[distributed_backend]["distributed_backend"]
        elif distributed_backend is not None:
            self.distributed_backend = distributed_backend

        if isinstance(self.distributed_backend, Accelerator):
            return

        if self.distributed_backend is None:
            if self.has_horovodrun():
                self._set_horovod_backend()
            elif self.num_gpus == 0 and (self.num_nodes > 1 or self.num_processes > 1):
                self._distrib_type = DistributedType.DDP
            elif self.num_gpus > 1:
                rank_zero_warn(
                    'You requested multiple GPUs but did not specify a backend, e.g.'
                    ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`. Setting `accelerator="ddp_spawn"` for you.'
                )
                self.distributed_backend = "ddp_spawn"

        # special case with DDP on CPUs
        if self.distributed_backend == "ddp_cpu":
            self._distrib_type = DistributedType.DDP_SPAWN
            if self.num_gpus > 0:
                rank_zero_warn(
                    'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.'
                )
                self.parallel_device_ids = None
            if self.num_processes is None:
                # define the max CPU available
                self.num_processes = os.cpu_count()
        # special case with TPUs
        elif self.distributed_backend == 'tpu' or self.tpu_cores is not None:
            self._device_type = DeviceType.TPU
            if isinstance(self.tpu_cores, int):
                self._distrib_type = DistributedType.TPU_SPAWN
        elif self.distributed_backend == 'ipu':
            self._device_type = DeviceType.IPU
        elif self.distributed_backend and self._distrib_type is None:
            self._distrib_type = DistributedType(self.distributed_backend)

        # unless CPU is explicitly requested, use any available GPUs
        _on_cpu = self.distributed_backend and 'cpu' in self.distributed_backend
        if self.num_gpus > 0 and not _on_cpu:
            self._device_type = DeviceType.GPU

        _gpu_distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
        # DP and DDP2 cannot run without GPU
        if self.num_gpus == 0 and self._distrib_type in _gpu_distrib_types and not _on_cpu:
            rank_zero_warn(
                'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.'
            )
            # todo: in some cases this comparison can be between None and int
            if (self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1):
                self._distrib_type = DistributedType.DDP
            else:
                rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.')
                self._distrib_type = None

        # finished configuring self._distrib_type, check ipython environment
        self.check_interactive_compatibility()

        # for DDP overwrite nb processes by requested GPUs
        if (
            self._device_type == DeviceType.GPU
            and self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)
        ):
            self.num_processes = self.num_gpus

        if (self._device_type == DeviceType.GPU and self._distrib_type == DistributedType.DDP2):
            self.num_processes = self.num_nodes

        # Horovod is an extra case...
        if self.distributed_backend == "horovod":
            self._set_horovod_backend()

        using_valid_distributed = self.use_ddp or self.use_ddp2
        if self.num_nodes > 1 and not using_valid_distributed:
            # throw error to force user to choose a supported distributed type such as ddp or ddp2
            raise MisconfigurationException(
                'Your chosen distributed type does not support num_nodes > 1. '
                'Please set accelerator=ddp or accelerator=ddp2.'
            )

        rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
        num_tpu_cores = self.tpu_cores if self.tpu_cores is not None else 0
        rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_tpu_cores} TPU cores')

        num_ipus = self.ipus if self.ipus is not None else 0
        rank_zero_info(f'IPU available: {_IPU_AVAILABLE}, using: {num_ipus} IPUs')

        if torch.cuda.is_available() and self._device_type != DeviceType.GPU:
            rank_zero_warn(
                "GPU available but not used. Set the gpus flag in your trainer"
                " `Trainer(gpus=1)` or script `--gpus=1`."
            )
Example #29
    def set_distributed_mode(self, distributed_backend):
        self.use_dp = False
        self.use_ddp = False
        self.use_ddp2 = False
        self.use_horovod = False
        self.single_gpu = False

        if distributed_backend is None:
            if self.has_horovodrun():
                self._set_horovod_backend()
            elif self.num_gpus == 0:
                if self.num_nodes > 1 or self.num_processes > 1:
                    self.use_ddp = True  # ddp_cpu
            elif self.num_gpus == 1:
                self.single_gpu = True
            elif self.num_gpus > 1:
                rank_zero_warn(
                    'You requested multiple GPUs but did not specify a backend, e.g.'
                    ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                    ' Setting distributed_backend=ddp_spawn for you.')
                self.distributed_backend = 'ddp_spawn'
                distributed_backend = 'ddp_spawn'

        if distributed_backend == "dp":
            # do nothing if num_gpus == 0
            if self.num_gpus == 1:
                self.single_gpu = True
                self.use_dp = True
            elif self.num_gpus > 1:
                self.use_dp = True

        elif distributed_backend in ['ddp', 'ddp_spawn']:
            if self.num_gpus == 0:
                if self.num_nodes > 1 or self.num_processes > 1:
                    self.use_ddp = True  # ddp_cpu
            elif self.num_gpus == 1:
                self.single_gpu = True
                self.use_ddp = True
            elif self.num_gpus > 1:
                self.use_ddp = True
                self.num_processes = self.num_gpus

        elif distributed_backend == "ddp2":
            # do nothing if num_gpus == 0
            if self.num_gpus >= 1:
                self.use_ddp2 = True
        elif distributed_backend == "ddp_cpu":
            if self.num_gpus > 0:
                rank_zero_warn(
                    'You requested one or more GPUs, but set the backend to `ddp_cpu`.'
                    ' Training will not use GPUs.')
            self.use_ddp = True
            self.data_parallel_device_ids = None
            self.on_gpu = False
        elif distributed_backend == 'horovod':
            self._set_horovod_backend()

        # throw error to force user ddp or ddp2 choice
        if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp):
            raise MisconfigurationException(
                'DataParallel does not support num_nodes > 1. '
                'Please set distributed_backend=ddp or distributed_backend=ddp2 to use DistributedDataParallel.'
            )

        rank_zero_info(
            f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}')
Example #30
    def set_distributed_mode(self):
        self.trainer.use_dp = False
        self.trainer.use_ddp = False
        self.trainer.use_ddp2 = False
        self.trainer.use_horovod = False
        self.trainer.use_single_gpu = False

        if self.trainer.distributed_backend is None:
            if self.has_horovodrun():
                self._set_horovod_backend()
            elif self.trainer.num_gpus == 0:
                if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1:
                    self.trainer.use_ddp = True  # ddp_cpu
            elif self.trainer.num_gpus == 1:
                self.trainer.use_single_gpu = True
            elif self.trainer.num_gpus > 1:
                rank_zero_warn(
                    'You requested multiple GPUs but did not specify a backend, e.g.'
                    ' Trainer(distributed_backend="dp"|"ddp"|"ddp2").'
                    ' Setting distributed_backend="ddp_spawn" for you.')
                self.trainer.distributed_backend = "ddp_spawn"

        if self.trainer.distributed_backend == "dp":
            # do nothing if num_gpus == 0
            if self.trainer.num_gpus == 1:
                self.trainer.use_single_gpu = True
                self.trainer.use_dp = True
            elif self.trainer.num_gpus > 1:
                self.trainer.use_dp = True

        elif self.trainer.distributed_backend in ("ddp", "ddp_spawn"):
            if self.trainer.num_gpus == 0:
                if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1:
                    self.trainer.use_ddp = True  # ddp_cpu
            elif self.trainer.num_gpus == 1:
                self.trainer.use_single_gpu = True
                self.trainer.use_ddp = True
            elif self.trainer.num_gpus > 1:
                self.trainer.use_ddp = True
                self.trainer.num_processes = self.trainer.num_gpus

        elif self.trainer.distributed_backend == "ddp2":
            # do nothing if num_gpus == 0
            if self.trainer.num_gpus >= 1:
                self.trainer.use_ddp2 = True
        elif self.trainer.distributed_backend == "ddp_cpu":
            if self.trainer.num_gpus > 0:
                rank_zero_warn(
                    'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.'
                )
            self.trainer.use_ddp = True
            self.trainer.data_parallel_device_ids = None
            self.trainer.on_gpu = False
        elif self.trainer.distributed_backend == "horovod":
            self._set_horovod_backend()

        # throw error to force user ddp or ddp2 choice
        if self.trainer.num_nodes > 1 and not (self.trainer.use_ddp2
                                               or self.trainer.use_ddp):
            raise MisconfigurationException(
                'DataParallel does not support num_nodes > 1. '
                'Please set distributed_backend=ddp or distributed_backend=ddp2 to use DistributedDataParallel.'
            )

        rank_zero_info(
            f'GPU available: {torch.cuda.is_available()}, used: {self.trainer.on_gpu}'
        )
        num_cores = self.trainer.tpu_cores if self.trainer.tpu_cores is not None else 0
        rank_zero_info(
            f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores')

        if torch.cuda.is_available() and not self.trainer.on_gpu:
            rank_zero_warn(
                'GPU available but not used. Set the --gpus flag when calling the script.'
            )
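All of the snippets above log through rank_zero_info / rank_zero_warn so that a message is emitted once, on the main process only. Below is a minimal standalone sketch of that pattern; the real pytorch_lightning utilities resolve the rank from several sources, so reading LOCAL_RANK here is a simplifying assumption.

import logging
import os
from functools import wraps

log = logging.getLogger(__name__)


def rank_zero_only(fn):
    """Run `fn` only on the process whose rank is 0; return None elsewhere."""
    @wraps(fn)
    def wrapped(*args, **kwargs):
        if int(os.environ.get("LOCAL_RANK", 0)) == 0:
            return fn(*args, **kwargs)
    return wrapped


@rank_zero_only
def rank_zero_info(*args, **kwargs):
    log.info(*args, **kwargs)


@rank_zero_only
def rank_zero_warn(message, *args, **kwargs):
    log.warning(message, *args, **kwargs)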