Example #1
 def on_train_start(self, trainer, pl_module, *args, **kwargs):
     try:
         # log model to the wandb experiment
         wandb.watch(models=pl_module.model, criterion=pl_module.loss_func)
     except Exception:
         log.info("Skipping wandb.watch --->")
Example #2
    def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0):
        """
        Entry point for ddp

        Args:
            process_idx:
            mp_queue: multiprocessing queue
            model:
            is_master:
            proc_offset:

        Returns:

        """
        # offset the process id if requested
        process_idx = process_idx + proc_offset

        # show progressbar only on progress_rank 0
        if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None:
            self.trainer.progress_bar_callback.disable()

        self.trainer.local_rank = self.trainer.node_rank
        self.trainer.global_rank = self.trainer.node_rank
        self.trainer.world_size = self.trainer.num_nodes

        # set warning rank
        rank_zero_only.rank = self.trainer.global_rank

        # set up server using proc 0's ip address
        # try to init for 20 times at max in case ports are taken
        # where to store ip_table
        model.trainer = self.trainer
        model.init_ddp_connection(
            self.trainer.global_rank,
            self.trainer.world_size,
            self.trainer.is_slurm_managing_tasks
        )

        # call setup after the ddp process has connected
        self.trainer.call_setup_hook(model)

        # on world_size=0 let everyone know training is starting
        if self.trainer.is_global_zero:
            log.info('-' * 100)
            log.info(f'distributed_backend={self.trainer.distributed_backend}')
            log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes')
            log.info('-' * 100)

        # MODEL
        # copy model to each gpu
        if self.trainer.on_gpu:
            gpu_idx = process_idx

            # when using ddp, the master process (proc 0) continues running as the main one
            # this means that the local rank will always be 0
            # (even if cuda visible devices has other visible gpus)
            # this means that the master process needs to pull the 0th visible index as the device number
            if is_master:
                available_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
                gpu_idx = int(available_gpus[self.trainer.local_rank])

            self.trainer.root_gpu = gpu_idx
            torch.cuda.set_device(self.trainer.root_gpu)
            model.cuda(self.trainer.root_gpu)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)
        self.trainer.optimizers = optimizers
        self.trainer.lr_schedulers = lr_schedulers
        self.trainer.optimizer_frequencies = optimizer_frequencies

        # set model properties before going into wrapper
        self.trainer.copy_trainer_model_properties(model)

        # AMP - run through amp wrapper before going to distributed DP
        if self.trainer.amp_backend == AMPType.APEX:
            model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level)
            self.trainer.optimizers = optimizers
            self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers)

        # DDP2 uses all GPUs on the machine
        device_ids = self.trainer.data_parallel_device_ids

        # allow user to configure ddp
        model = model.configure_ddp(model, device_ids)

        # continue training routine
        results = self.trainer.run_pretrain_routine(model)

        # get original model
        model = self.trainer.get_model()

        # persist info in ddp_spawn
        self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results)

        # clean up memory
        torch.cuda.empty_cache()
    def set_distributed_mode(self, distributed_backend):
        self.use_dp = False
        self.use_ddp = False
        self.use_ddp2 = False
        self.use_horovod = False
        self.single_gpu = False

        if distributed_backend is None:
            if self.has_horovodrun():
                self.check_horovod()
                self.use_horovod = True
            elif self.num_gpus == 0:
                if self.num_nodes > 1 or self.num_processes > 1:
                    self.use_ddp = True  # ddp_cpu
            elif self.num_gpus == 1:
                self.single_gpu = True
            elif self.num_gpus > 1:
                rank_zero_warn(
                    'You requested multiple GPUs but did not specify a backend, e.g.'
                    ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                    ' Setting distributed_backend=dp for you.')
                self.use_dp = True
        elif distributed_backend == "dp":
            # do nothing if num_gpus == 0
            if self.num_gpus == 1:
                self.single_gpu = True
                self.use_dp = True
            elif self.num_gpus > 1:
                self.use_dp = True
        elif distributed_backend == "ddp":
            if self.num_gpus == 0:
                if self.num_nodes > 1 or self.num_processes > 1:
                    self.use_ddp = True  # ddp_cpu
            elif self.num_gpus == 1:
                self.single_gpu = True
                self.use_ddp = True
            elif self.num_gpus > 1:
                self.use_ddp = True
                self.num_processes = self.num_gpus
        elif distributed_backend == "ddp2":
            # do nothing if num_gpus == 0
            if self.num_gpus >= 1:
                self.use_ddp2 = True
        elif distributed_backend == "ddp_cpu":
            if self.num_gpus > 0:
                rank_zero_warn(
                    'You requested one or more GPUs, but set the backend to `ddp_cpu`.'
                    ' Training will not use GPUs.')
            self.use_ddp = True
            self.data_parallel_device_ids = None
            self.on_gpu = False
        elif distributed_backend == 'horovod':
            self.check_horovod()
            self.use_horovod = True

        # throw error to force user ddp or ddp2 choice
        if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp):
            raise MisconfigurationException(
                'DataParallel does not support num_nodes > 1.'
                ' Set distributed_backend=ddp or distributed_backend=ddp2 for multi-node training.'
            )

        log.info(
            f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}')
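For readability, the branching in set_distributed_mode above can be summarized as a standalone helper. This is a simplified restatement for illustration only, not part of the library's API; it ignores the horovodrun auto-detection branch and only mirrors the flag selection:

def select_distributed_mode(distributed_backend, num_gpus, num_nodes=1, num_processes=1):
    """Illustrative only: the mode the branches above end up selecting."""
    if distributed_backend is None:
        if num_gpus == 0:
            return 'ddp_cpu' if (num_nodes > 1 or num_processes > 1) else 'cpu'
        return 'single_gpu' if num_gpus == 1 else 'dp'  # >1 GPU defaults to dp with a warning
    if distributed_backend == 'dp':
        if num_gpus == 0:
            return 'cpu'  # the dp branch does nothing without GPUs
        return 'single_gpu+dp' if num_gpus == 1 else 'dp'
    if distributed_backend == 'ddp':
        if num_gpus == 0:
            return 'ddp_cpu' if (num_nodes > 1 or num_processes > 1) else 'cpu'
        return 'single_gpu+ddp' if num_gpus == 1 else 'ddp'  # ddp sets num_processes = num_gpus
    if distributed_backend == 'ddp2':
        return 'ddp2' if num_gpus >= 1 else 'cpu'
    if distributed_backend == 'ddp_cpu':
        return 'ddp_cpu'  # warns and ignores any requested GPUs
    if distributed_backend == 'horovod':
        return 'horovod'
    return 'cpu'
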
Example #4
    def on_validation_end(self, trainer, pl_module):
        # only run on main process
        if trainer.global_rank != 0:
            return

        metrics = trainer.callback_metrics
        epoch = trainer.current_epoch
        step = trainer.global_step

        if self.save_top_k == 0:
            # no models are saved
            return
        if self.epoch_last_check is not None and (
                epoch - self.epoch_last_check) < self.period:
            # skipping in this term
            return

        self.epoch_last_check = epoch

        filepath = self.format_checkpoint_name(epoch, metrics, step)
        version_cnt = 0
        while os.path.isfile(filepath):
            filepath = self.format_checkpoint_name(epoch,
                                                   metrics,
                                                   step,
                                                   ver=version_cnt)
            # this epoch called before
            version_cnt += 1

        if self.save_top_k != -1:
            current = metrics.get(self.monitor)

            if not isinstance(current, torch.Tensor):
                rank_zero_warn(
                    f'The metric you returned {current} must be a `torch.Tensor` instance, checkpoint not saved'
                    f' HINT: what is the value of {self.monitor} in validation_epoch_end()?',
                    RuntimeWarning)
                if current is not None:
                    current = torch.tensor(current)

            if current is None:
                rank_zero_warn(
                    f'Can save best model only with {self.monitor} available, skipping.',
                    RuntimeWarning)
            elif self.check_monitor_top_k(current):
                self._do_check_save(filepath, current, epoch)
            elif self.verbose > 0:
                log.info(
                    f'\nEpoch {epoch:02d}: {self.monitor}  was not in top {self.save_top_k}'
                )

        else:
            if self.verbose > 0:
                log.info(f'\nEpoch {epoch:02d}: saving model to {filepath}')

            assert trainer.global_rank == 0, 'tried to make a checkpoint from non global_rank=0'
            self._save_model(filepath)

        if self.save_last:
            filepath = os.path.join(self.dirpath, self.prefix + 'last.ckpt')
            self._save_model(filepath)
    def ddp_train(self,
                  process_idx,
                  mp_queue,
                  model,
                  is_master=False,
                  proc_offset=0):
        """
        Entry point for ddp

        Args:
            process_idx:
            mp_queue: multiprocessing queue
            model:
        """
        seed = os.environ.get("PL_GLOBAL_SEED")
        if seed is not None:
            seed_everything(int(seed))

        # offset the process id if requested
        process_idx = process_idx + proc_offset

        # show progressbar only on progress_rank 0
        if (self.trainer.node_rank != 0 or process_idx != 0
            ) and self.trainer.progress_bar_callback is not None:
            self.trainer.progress_bar_callback.disable()

        # determine which process we are and world size
        self.set_world_ranks(process_idx)

        # set warning rank
        rank_zero_only.rank = self.trainer.global_rank

        # set up server using proc 0's ip address
        # try to init for 20 times at max in case ports are taken
        # where to store ip_table
        model.trainer = self.trainer
        self.init_ddp_connection(self.trainer.global_rank,
                                 self.trainer.world_size,
                                 self.trainer.is_slurm_managing_tasks)

        # call setup after the ddp process has connected
        self.trainer.call_setup_hook(model)

        # on world_size=0 let everyone know training is starting
        if self.trainer.is_global_zero and not torch.distributed.is_initialized(
        ):
            log.info('-' * 100)
            log.info(f'distributed_backend={self.trainer.distributed_backend}')
            log.info(
                f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes'
            )
            log.info('-' * 100)

        # call sync_bn before .cuda(), configure_apex and configure_ddp
        if self.trainer.sync_batchnorm:
            model = self.configure_sync_batchnorm(model)

        # move the model to the correct device
        self.model_to_device(model, process_idx, is_master)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.setup_optimizers(model)

        # set model properties before going into wrapper
        self.trainer.model_connector.copy_trainer_model_properties(model)

        # 16-bit
        model = self.trainer.precision_connector.connect(model)

        # device ids change depending on the DDP setup
        device_ids = self.get_device_ids()

        # allow user to configure ddp
        model = self.configure_ddp(model, device_ids)

        # set up training routine
        self.trainer.train_loop.setup_training(model)

        # train or test
        results = self.train_or_test()

        # get original model
        model = self.trainer.get_model()

        # persist info in ddp_spawn
        self.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results)

        # clean up memory
        torch.cuda.empty_cache()
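For context, a spawn-based launcher typically invokes an entry point like ddp_train via torch.multiprocessing, passing the queue that transfer_distrib_spawn_state_on_fit_end uses to hand state back to the parent process. A minimal sketch under the assumption of an accelerator object exposing ddp_train, plus placeholder model and num_processes values:

import torch.multiprocessing as mp

def launch_ddp_spawn(accelerator, model, num_processes):
    """Illustrative launcher: spawn one ddp_train process per local device."""
    smp = mp.get_context('spawn')
    mp_queue = smp.SimpleQueue()  # children push state back via transfer_distrib_spawn_state_on_fit_end
    # mp.spawn calls ddp_train(process_idx, mp_queue, model) for process_idx in 0..num_processes-1
    mp.spawn(accelerator.ddp_train, nprocs=num_processes, args=(mp_queue, model))
    return mp_queue  # exactly what the queue holds depends on the trainer version
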
Example #6
    def __init__(
            self,
            logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase], bool] = True,
            checkpoint_callback: Union[ModelCheckpoint, bool] = True,
            early_stop_callback: Optional[Union[EarlyStopping, bool]] = False,
            callbacks: Optional[List[Callback]] = None,
            default_root_dir: Optional[str] = None,
            gradient_clip_val: float = 0,
            process_position: int = 0,
            num_nodes: int = 1,
            num_processes: int = 1,
            gpus: Optional[Union[List[int], str, int]] = None,
            auto_select_gpus: bool = False,
            num_tpu_cores: Optional[int] = None,
            log_gpu_memory: Optional[str] = None,
            progress_bar_refresh_rate: int = 1,
            overfit_pct: float = 0.0,
            track_grad_norm: int = -1,
            check_val_every_n_epoch: int = 1,
            fast_dev_run: bool = False,
            accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1,
            max_epochs: int = 1000,
            min_epochs: int = 1,
            max_steps: Optional[int] = None,
            min_steps: Optional[int] = None,
            train_percent_check: float = 1.0,
            val_percent_check: float = 1.0,
            test_percent_check: float = 1.0,
            val_check_interval: float = 1.0,
            log_save_interval: int = 100,
            row_log_interval: int = 10,
            add_row_log_interval=None,  # backward compatible, todo: remove in v0.8.0
            distributed_backend: Optional[str] = None,
            precision: int = 32,
            print_nan_grads: bool = False,  # backward compatible, todo: remove in v0.9.0
            weights_summary: Optional[str] = 'full',
            weights_save_path: Optional[str] = None,
            amp_level: str = 'O1',
            num_sanity_val_steps: int = 5,
            truncated_bptt_steps: Optional[int] = None,
            resume_from_checkpoint: Optional[str] = None,
            profiler: Optional[BaseProfiler] = None,
            benchmark: bool = False,
            reload_dataloaders_every_epoch: bool = False,
            auto_lr_find: Union[bool, str] = False,
            replace_sampler_ddp: bool = True,
            default_save_path=None,  # backward compatible, todo: remove in v0.8.0
            gradient_clip=None,  # backward compatible, todo: remove in v0.8.0
            nb_gpu_nodes=None,  # backward compatible, todo: remove in v0.8.0
            max_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
            min_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
            use_amp=None,  # backward compatible, todo: remove in v0.9.0
            show_progress_bar=None,  # backward compatible, todo: remove in v0.9.0
            nb_sanity_val_steps=None,  # backward compatible, todo: remove in v0.8.0
            terminate_on_nan: bool = False,
            **kwargs
    ):
        r"""

        Customize every aspect of training via flags

        Args:
            logger: Logger (or iterable collection of loggers) for experiment tracking.

            checkpoint_callback: Callback for checkpointing.

            early_stop_callback (:class:`pytorch_lightning.callbacks.EarlyStopping`):

            callbacks: Add a list of callbacks.

            default_root_dir: Default path for logs and weights when no logger/ckpt_callback passed

            default_save_path:
                .. warning:: .. deprecated:: 0.7.3

                    Use `default_root_dir` instead. Will remove 0.9.0.

            gradient_clip_val: 0 means don't clip.

            gradient_clip:
                .. warning:: .. deprecated:: 0.7.0

                    Use `gradient_clip_val` instead. Will remove 0.9.0.

            process_position: orders the tqdm bar when running multiple models on same machine.

            num_nodes: number of GPU nodes for distributed training.

            nb_gpu_nodes:
                .. warning:: .. deprecated:: 0.7.0

                    Use `num_nodes` instead. Will remove 0.9.0.

            gpus: Which GPUs to train on.

            auto_select_gpus:

                If enabled and `gpus` is an integer, pick available
                gpus automatically. This is especially useful when
                GPUs are configured to be in "exclusive mode", such
                that only one process at a time can access them.

            num_tpu_cores: How many TPU cores to train on (1 or 8).

            log_gpu_memory: None, 'min_max', 'all'. Might slow performance

            show_progress_bar:
                .. warning:: .. deprecated:: 0.7.2

                    Set `progress_bar_refresh_rate` to a positive integer to enable. Will remove 0.9.0.

            progress_bar_refresh_rate: How often to refresh progress bar (in steps). Value ``0`` disables progress bar.

            overfit_pct: How much of training-, validation-, and test dataset to check.

            track_grad_norm: -1 no tracking. Otherwise tracks that norm

            check_val_every_n_epoch: Check val every n train epochs.

            fast_dev_run: runs 1 batch of train, test and val to find any bugs (i.e. a sort of unit test).

            accumulate_grad_batches: Accumulates grads every k batches or as set up in the dict.

            max_epochs: Stop training once this number of epochs is reached.

            max_nb_epochs:
                .. warning:: .. deprecated:: 0.7.0

                    Use `max_epochs` instead. Will remove 0.9.0.

            min_epochs: Force training for at least this many epochs

            min_nb_epochs:
                .. warning:: .. deprecated:: 0.7.0

                    Use `min_epochs` instead. Will remove 0.9.0.

            max_steps: Stop training after this number of steps. Disabled by default (None).

            min_steps: Force training for at least this number of steps. Disabled by default (None).

            train_percent_check: How much of training dataset to check.

            val_percent_check: How much of validation dataset to check.

            test_percent_check: How much of test dataset to check.

            val_check_interval: How often within one training epoch to check the validation set

            log_save_interval: Writes logs to disk this often

            row_log_interval: How often to add logging rows (does not write to disk)

            add_row_log_interval:
                .. warning:: .. deprecated:: 0.7.0

                    Use `row_log_interval` instead. Will remove 0.9.0.

            distributed_backend: The distributed backend to use.

            use_amp:
                .. warning:: .. deprecated:: 0.7.0

                    Use `precision` instead. Will remove 0.9.0.

            precision: Full precision (32), half precision (16).

            print_nan_grads:
                .. warning:: .. deprecated:: 0.7.2

                    Has no effect. When detected, NaN grads will be printed automatically.
                    Will remove 0.9.0.

            weights_summary: Prints a summary of the weights when training begins.

            weights_save_path: Where to save weights if specified. Will override default_root_dir
                    for checkpoints only. Use this if for whatever reason you need the checkpoints
                    stored in a different place than the logs written in `default_root_dir`.

            amp_level: The optimization level to use (O1, O2, etc...).

            num_sanity_val_steps: Sanity check runs n batches of val before starting the training routine.

            nb_sanity_val_steps:
                .. warning:: .. deprecated:: 0.7.0

                    Use `num_sanity_val_steps` instead. Will remove 0.8.0.

            truncated_bptt_steps: Truncated backpropagation through time: performs backprop every k steps of a much longer sequence.

            resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here.

            profiler: To profile individual steps during training and assist in identifying bottlenecks.

            reload_dataloaders_every_epoch: Set to True to reload dataloaders every epoch

            auto_lr_find: If set to True, will `initially` run a learning rate finder,
                trying to optimize initial learning for faster convergence. Sets learning
                rate in self.hparams.lr | self.hparams.learning_rate in the lightning module.
                To use a different key, set a string instead of True with the key name.

            replace_sampler_ddp: Explicitly enables or disables sampler replacement.
                If not specified this will be toggled automatically when DDP is used.

            benchmark: If true enables cudnn.benchmark.

            terminate_on_nan: If set to True, will terminate training (by raising a `ValueError`) at the
                end of each training batch, if any of the parameters or the loss are NaN or +/-inf.
        """

        # Init callbacks
        self.callbacks = callbacks or []
        self.on_init_start()

        # benchmarking
        self.benchmark = benchmark
        torch.backends.cudnn.benchmark = self.benchmark

        # Transfer params
        self.num_nodes = num_nodes
        # Backward compatibility, TODO: remove in v0.8.0
        if nb_gpu_nodes is not None:
            rank_zero_warn("Argument `nb_gpu_nodes` has renamed to `num_nodes` since v0.5.0"
                           " and this method will be removed in v0.8.0", DeprecationWarning)
            self.num_gpu_nodes = nb_gpu_nodes
        self.log_gpu_memory = log_gpu_memory

        self.gradient_clip_val = gradient_clip_val
        # Backward compatibility, TODO: remove in v0.8.0
        if gradient_clip is not None:
            rank_zero_warn("Argument `gradient_clip` has renamed to `gradient_clip_val` since v0.5.0"
                           " and this method will be removed in v0.8.0", DeprecationWarning)
            self.gradient_clip = gradient_clip

        self.progress_bar_refresh_rate = progress_bar_refresh_rate
        self.check_val_every_n_epoch = check_val_every_n_epoch
        self.track_grad_norm = track_grad_norm
        self.on_gpu = True if (gpus and torch.cuda.is_available()) else False

        # tpu config
        self.on_tpu = num_tpu_cores is not None
        self.num_tpu_cores = num_tpu_cores
        assert num_tpu_cores in [1, 8, None], 'num_tpu_cores can only be 1 or 8'

        if num_processes != 1 and distributed_backend != "ddp_cpu":
            rank_zero_warn("num_processes is only used for distributed_backend=\"ddp_cpu\". Ignoring it.")
        self.num_processes = num_processes

        self.process_position = process_position
        self.weights_summary = weights_summary

        self.max_epochs = max_epochs
        # Backward compatibility, TODO: remove in v0.8.0
        if max_nb_epochs is not None:
            rank_zero_warn("Argument `max_nb_epochs` has renamed to `max_epochs` since v0.5.0"
                           " and this method will be removed in v0.8.0", DeprecationWarning)
            self.max_nb_epochs = max_nb_epochs

        self.min_epochs = min_epochs
        # Backward compatibility, TODO: remove in v0.8.0
        if min_nb_epochs is not None:
            rank_zero_warn("Argument `min_nb_epochs` has renamed to `min_epochs` since v0.5.0"
                           " and this method will be removed in v0.8.0", DeprecationWarning)
            self.min_nb_epochs = min_nb_epochs

        self.max_steps = max_steps
        self.min_steps = min_steps

        self.num_sanity_val_steps = num_sanity_val_steps
        # Backward compatibility, TODO: remove in v0.8.0
        if nb_sanity_val_steps is not None:
            rank_zero_warn("Argument `nb_sanity_val_steps` has renamed to "
                           "`num_sanity_val_steps` since v0.5.0"
                           " and this method will be removed in v0.8.0", DeprecationWarning)
            self.nb_sanity_val_steps = nb_sanity_val_steps

        # Backward compatibility, TODO: remove in v0.9.0
        if print_nan_grads:
            rank_zero_warn("Argument `print_nan_grads` has no effect and will be removed in v0.9.0."
                           " NaN grads will be printed automatically when detected.", DeprecationWarning)

        self.reload_dataloaders_every_epoch = reload_dataloaders_every_epoch

        self.auto_lr_find = auto_lr_find
        self.replace_sampler_ddp = replace_sampler_ddp

        self.truncated_bptt_steps = truncated_bptt_steps
        self.resume_from_checkpoint = resume_from_checkpoint
        self.terminate_on_nan = terminate_on_nan
        self.shown_warnings = set()

        self.fast_dev_run = fast_dev_run
        if self.fast_dev_run:
            self.num_sanity_val_steps = 0
            self.max_epochs = 1
            log.info('Running in fast_dev_run mode: will run a full train,'
                     ' val and test loop using a single batch')

        # set default save path if user didn't provide one
        self.default_root_dir = default_root_dir

        # Backward compatibility, TODO: remove in v0.8.0
        if default_save_path is not None:
            self.default_root_dir = default_save_path

        if self.default_root_dir is None:
            self.default_root_dir = os.getcwd()

        # training bookkeeping
        self.total_batch_idx = 0
        self.running_loss = TensorRunningAccum(window_length=20)
        self.batch_idx = 0
        self.tqdm_metrics = {}
        self.callback_metrics = {}
        self.num_val_batches = 0
        self.num_training_batches = 0
        self.num_test_batches = 0
        self.train_dataloader = None
        self.test_dataloaders = None
        self.val_dataloaders = None

        # training state
        self.model = None
        self.testing = False
        self.disable_validation = False
        self.lr_schedulers = []
        self.optimizers = None
        self.optimizer_frequencies = []
        self.global_step = 0
        self.current_epoch = 0
        self.total_batches = 0
        self.interrupted = False

        # configure logger
        self.configure_logger(logger)

        # configure profiler
        if profiler is True:
            profiler = SimpleProfiler()
        self.profiler = profiler or PassThroughProfiler()

        # configure early stop callback
        # creates a default one if none passed in
        self.configure_early_stopping(early_stop_callback)

        # configure checkpoint callback
        self.checkpoint_callback = checkpoint_callback
        self.weights_save_path = weights_save_path

        # accumulated grads
        self.accumulate_grad_batches = accumulate_grad_batches
        self.configure_accumulated_gradients(accumulate_grad_batches)

        # for gpus allow int, string and gpu list
        if auto_select_gpus and isinstance(gpus, int):
            self.gpus = pick_multiple_gpus(gpus)
        else:
            self.gpus = gpus

        self.data_parallel_device_ids = parse_gpu_ids(self.gpus)
        self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids)
        self.root_device = torch.device("cpu")

        # tpu state flags
        self.use_tpu = False
        self.tpu_local_core_rank = None
        self.tpu_global_core_rank = None

        # distributed backend choice
        self.distributed_backend = distributed_backend
        self.set_distributed_mode(distributed_backend)

        # override dist backend when using tpus
        if self.on_tpu:
            self.init_tpu()
            self.current_tpu_idx = None

        # init flags for SLURM+ddp to work
        self.proc_rank = 0
        self.world_size = 1
        self.node_rank = 0
        self.configure_slurm_ddp(self.num_nodes)

        # nvidia setup
        self.set_nvidia_flags(self.is_slurm_managing_tasks, self.data_parallel_device_ids)

        # can't init progress bar here because starting a new process
        # means the progress_bar won't survive pickling
        # backward compatibility
        if show_progress_bar is not None:
            self.show_progress_bar = show_progress_bar

        # logging
        self.log_save_interval = log_save_interval
        self.val_check_interval = val_check_interval

        # backward compatibility
        if add_row_log_interval is not None:
            rank_zero_warn("`add_row_log_interval` has renamed to `row_log_interval` since v0.5.0"
                           " and this method will be removed in v0.8.0", DeprecationWarning)
            if not row_log_interval:  # in case you did not set the proper value
                row_log_interval = add_row_log_interval
        self.row_log_interval = row_log_interval

        # how much of the data to use
        self.overfit_pct = overfit_pct
        self.determine_data_use_amount(train_percent_check, val_percent_check,
                                       test_percent_check, overfit_pct)

        # 16 bit mixed precision training using apex
        self.amp_level = amp_level
        self.precision = precision

        # Backward compatibility, TODO: remove in v0.9.0
        if use_amp is not None:
            rank_zero_warn("`use_amp` has been replaced by `precision` since v0.7.0"
                           " and this argument will be removed in v0.9.0", DeprecationWarning)
            self.precision = 16 if use_amp else 32

        assert self.precision in (16, 32), 'only 32 or 16 bit precision supported'

        if self.precision == 16 and self.num_tpu_cores is None:
            use_amp = True
        self.init_amp(use_amp)

        # Callback system
        self.on_init_end()
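A short usage sketch of this constructor with a few of the documented flags; MyModel is a placeholder LightningModule subclass, and the argument names reflect the older API shown above:

from pytorch_lightning import Trainer

model = MyModel()  # placeholder LightningModule

trainer = Trainer(
    gpus=2,                      # train on two GPUs
    distributed_backend='ddp',   # use DistributedDataParallel
    max_epochs=10,
    precision=16,                # 16-bit training (apex, amp_level='O1' by default)
    gradient_clip_val=0.5,
    num_sanity_val_steps=2,
)
trainer.fit(model)
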
Example #7
    def train(self):
        warnings.warn(
            'Displayed epoch numbers in the progress bar start from "1" until v0.6.x,'
            ' but will start from "0" in v0.8.0.', RuntimeWarning)

        # get model
        model = self.get_model()

        # load data
        self.reset_train_dataloader(model)
        self.reset_val_dataloader(model)

        # Train start events
        with self.profiler.profile('on_train_start'):
            # callbacks
            self.on_train_start()
            # initialize early stop callback
            if self.early_stop_callback is not None:
                self.early_stop_callback.on_train_start(self, self.get_model())
            # model hooks
            model.on_train_start()

        try:
            # run all epochs
            for epoch in range(self.current_epoch, self.max_epochs):
                # set seed for distributed sampler (enables shuffling for each epoch)
                if self.use_ddp \
                        and hasattr(self.train_dataloader.sampler, 'set_epoch'):
                    self.train_dataloader.sampler.set_epoch(epoch)

                # update training progress in trainer and model
                model.current_epoch = epoch
                self.current_epoch = epoch

                total_val_batches = 0
                is_val_epoch = False
                if not self.disable_validation and self.num_training_batches != float(
                        'inf'):
                    # val can be checked multiple times in epoch
                    is_val_epoch = (self.current_epoch +
                                    1) % self.check_val_every_n_epoch == 0
                    val_checks_per_epoch = self.num_training_batches // self.val_check_batch
                    val_checks_per_epoch = val_checks_per_epoch if is_val_epoch else 0
                    total_val_batches = self.num_val_batches * val_checks_per_epoch

                # total batches includes multiple val checks
                self.total_batches = self.num_training_batches + total_val_batches

                # changing gradient according accumulation_scheduler
                self.accumulation_scheduler.on_epoch_start(
                    self, self.get_model())

                # stores accumulated grad fractions per batch
                self.batch_loss_value = TensorRunningMean(
                    window_length=self.accumulate_grad_batches)

                if self.fast_dev_run:
                    # limit the number of batches to 2 (1 train and 1 val) in fast_dev_run
                    num_iterations = 2
                elif self.total_batches == float('inf'):
                    # for infinite train or val loader, the progress bar never ends
                    num_iterations = None
                else:
                    num_iterations = self.total_batches

                # reset progress bar
                # .reset() doesn't work on disabled progress bar so we should check
                if not self.main_progress_bar.disable:
                    self.main_progress_bar.reset(num_iterations)
                desc = f'Epoch {epoch + 1}'
                self.main_progress_bar.set_description(desc)

                # -----------------
                # RUN TNG EPOCH
                # -----------------
                self.run_training_epoch()

                # update LR schedulers
                self.update_learning_rates(interval='epoch')

                if self.max_steps and self.max_steps == self.global_step:
                    self.run_training_teardown()
                    return

                # early stopping
                met_min_epochs = epoch >= self.min_epochs - 1
                met_min_steps = self.global_step >= self.min_steps if self.min_steps else True

                # TODO wrap this logic into the callback
                if self.enable_early_stop:
                    if (met_min_epochs and met_min_steps) or self.fast_dev_run:
                        should_stop = self.early_stop_callback.on_epoch_end(
                            self, self.get_model())
                        # stop training
                        stop = should_stop and met_min_epochs
                        if stop:
                            self.run_training_teardown()
                            return

            self.run_training_teardown()

        except KeyboardInterrupt:
            log.info(
                'Detected KeyboardInterrupt, attempting graceful shutdown...')
            self.run_training_teardown()
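The call to train_dataloader.sampler.set_epoch(epoch) above is what re-seeds shuffling under DDP each epoch. A small, self-contained illustration of the same pattern with a plain DistributedSampler; dataset and num_epochs are placeholders:

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

sampler = DistributedSampler(dataset)            # dataset is a placeholder
loader = DataLoader(dataset, batch_size=32, sampler=sampler)

for epoch in range(num_epochs):                  # num_epochs is a placeholder
    sampler.set_epoch(epoch)                     # without this, every epoch repeats the same shuffle
    for batch in loader:
        ...
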
Example #8
    def on_validation_end(self, trainer, pl_module):
        # only run on main process
        if trainer.global_rank != 0:
            return

        if trainer.running_sanity_check:
            return

        # TODO: remove when dict results are deprecated
        self.__warn_deprecated_monitor_key()

        metrics = trainer.logger_connector.callback_metrics
        epoch = trainer.current_epoch

        # support structured results
        if metrics.get('checkpoint_on') is not None:
            self.monitor = 'checkpoint_on'

        # conditioned val metrics override conditioned train loop metrics
        if metrics.get('val_checkpoint_on') is not None:
            self.monitor = 'val_checkpoint_on'

        if self.save_top_k == 0:
            # no models are saved
            return
        if self.epoch_last_check is not None and (
                epoch - self.epoch_last_check) < self.period:
            # skipping in this term
            return

        self.epoch_last_check = epoch

        ckpt_name_metrics = trainer.logger_connector.logged_metrics
        filepath = self.format_checkpoint_name(epoch, ckpt_name_metrics)
        version_cnt = 0
        while self._fs.exists(filepath):
            filepath = self.format_checkpoint_name(epoch,
                                                   ckpt_name_metrics,
                                                   ver=version_cnt)
            # this epoch called before
            version_cnt += 1

        if self.save_top_k != -1:
            current = metrics.get(self.monitor)

            if not isinstance(current, torch.Tensor):
                rank_zero_warn(
                    f'The metric you returned {current} must be a `torch.Tensor` instance, checkpoint not saved'
                    f' HINT: what is the value of {self.monitor} in validation_epoch_end()?',
                    RuntimeWarning)
                if current is not None:
                    current = torch.tensor(current)

            if current is None:
                rank_zero_warn(
                    f'Can save best model only with {self.monitor} available, skipping.',
                    RuntimeWarning)
            elif self.check_monitor_top_k(current):
                self._do_check_save(filepath, current, epoch, trainer,
                                    pl_module)
            elif self.verbose > 0:
                log.info(
                    f'Epoch {epoch:d}: {self.monitor} was not in top {self.save_top_k}'
                )

        else:
            if self.verbose > 0:
                log.info(f'Epoch {epoch:d}: saving model to {filepath}')

            assert trainer.global_rank == 0, 'tried to make a checkpoint from non global_rank=0'
            self._save_model(filepath, trainer, pl_module)

        if self.save_last:
            filename = self._format_checkpoint_name(self.CHECKPOINT_NAME_LAST,
                                                    epoch,
                                                    ckpt_name_metrics,
                                                    prefix=self.prefix)
            filepath = os.path.join(self.dirpath, f'{filename}.ckpt')
            self._save_model(filepath, trainer, pl_module)
            if self.last_model_path and self.last_model_path != filepath:
                self._del_model(self.last_model_path)
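Both on_validation_end implementations above belong to the ModelCheckpoint callback; what gets saved is driven entirely by the constructor arguments referenced in the code (monitor, save_top_k, period, save_last, verbose). A hedged configuration sketch; exact keyword names may differ between versions:

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    filepath='checkpoints/{epoch:02d}-{val_loss:.3f}',  # older-style filepath template
    monitor='val_loss',   # metric looked up in callback_metrics above
    save_top_k=3,         # keep the 3 best checkpoints; 0 disables saving, -1 keeps all
    period=1,             # minimum number of epochs between checks
    save_last=True,       # also write a rolling last.ckpt
    verbose=True,
)
trainer = Trainer(checkpoint_callback=checkpoint_callback)
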
Example #9
 def train_dataloader(self):
     log.info('Training data loader called.')
     return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=4)
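This hook, like the dataloader hooks in the following examples, assumes the module has already created `self.mnist_train` (and `self.mnist_test`). A hedged sketch of a setup step that could produce them using torchvision's MNIST; `self.data_dir` and the split sizes are assumptions:

from torch.utils.data import random_split
from torchvision import transforms
from torchvision.datasets import MNIST

def setup(self, stage=None):
    transform = transforms.ToTensor()
    mnist_full = MNIST(self.data_dir, train=True, download=True, transform=transform)
    # 55k/5k train/val split; the sizes are an assumption
    self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
    self.mnist_test = MNIST(self.data_dir, train=False, download=True, transform=transform)
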
Example #10
 def train_dataloader(self):
     log.info('Training data loaded.')
     return self.__dataloader(train=True)
Example #11
 def val_dataloader(self):
     log.info('Validation data loaded.')
     return self.__dataloader(train=False)
Example #12
    def ddp_train(self, process_idx, model, is_master=False, proc_offset=0):
        """
        Entry point into a DP thread
        :param gpu_idx:
        :param model:
        :param cluster_obj:
        :return:
        """
        # offset the process id if requested
        process_idx = process_idx + proc_offset

        # show progressbar only on progress_rank 0
        if (self.node_rank != 0 or process_idx != 0) and self.progress_bar_callback is not None:
            self.progress_bar_callback.disable()

        # determine which process we are and world size
        if self.use_ddp:
            self.local_rank = process_idx
            self.global_rank = self.node_rank * self.num_processes + process_idx
            self.world_size = self.num_nodes * self.num_processes

        elif self.use_ddp2:
            self.local_rank = self.node_rank
            self.global_rank = self.node_rank
            self.world_size = self.num_nodes

        # set warning rank
        rank_zero_only.rank = self.global_rank

        # set up server using proc 0's ip address
        # try to init for 20 times at max in case ports are taken
        # where to store ip_table
        model.trainer = self
        model.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks)

        # on world_size=0 let everyone know training is starting
        if self.is_global_zero:
            log.info('-' * 100)
            log.info(f'distributed_backend={self.distributed_backend}')
            log.info(f'All DDP processes registered. Starting ddp with {self.world_size} processes')
            log.info('-' * 100)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

        # MODEL
        # copy model to each gpu
        if self.on_gpu:
            gpu_idx = process_idx
            if is_master:
                # source of truth is cuda for gpu idx
                gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
                gpu_idx = int(gpus[self.local_rank])

            self.root_gpu = gpu_idx
            torch.cuda.set_device(self.root_gpu)
            model.cuda(self.root_gpu)

        # set model properties before going into wrapper
        self.copy_trainer_model_properties(model)

        # AMP
        # run through amp wrapper before going to distributed DP
        # TODO: remove in v0.8.0
        if self.use_amp and not self.use_native_amp:
            model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
            self.optimizers = optimizers
            self.reinit_scheduler_properties(self.optimizers, self.lr_schedulers)

        # DDP2 uses all GPUs on the machine
        if self.distributed_backend == 'ddp' or self.distributed_backend == 'ddp_spawn':
            device_ids = [self.root_gpu]
        elif self.use_ddp2:
            device_ids = self.data_parallel_device_ids
        else:  # includes ddp_cpu
            device_ids = None

        # allow user to configure ddp
        model = model.configure_ddp(model, device_ids)

        # continue training routine
        self.run_pretrain_routine(model)
Example #13
    def ddp_train(self, process_idx, mp_queue, model):
        """
        Entry point for ddp

        Args:
            process_idx: current process rank
            mp_queue: multiprocessing queue
            model: pointer to current :class:`LightningModule`

        Returns:
            Dict with evaluation results

        """
        # show progressbar only on progress_rank 0
        if (self.trainer.node_rank != 0 or process_idx != 0
            ) and self.trainer.progress_bar_callback is not None:
            self.trainer.progress_bar_callback.disable()

        # determine which process we are and world size
        self.set_world_ranks(process_idx)

        # set warning rank
        rank_zero_only.rank = self.trainer.global_rank

        # Initialize cuda device
        self.init_device(process_idx)

        # set up server using proc 0's ip address
        # try to init for 20 times at max in case ports are taken
        # where to store ip_table
        model.trainer = self.trainer
        self.init_ddp_connection(self.trainer.global_rank,
                                 self.trainer.world_size,
                                 self.trainer.is_slurm_managing_tasks)

        if isinstance(self.ddp_plugin, RPCPlugin):
            if not self.ddp_plugin.is_main_rpc_process:
                self.ddp_plugin.on_accelerator_exit_rpc_process(self.trainer)
                self.ddp_plugin.exit_rpc_process()
                if self.ddp_plugin.return_after_exit_rpc_process:
                    return
            else:
                self.ddp_plugin.on_main_rpc_connection(self.trainer)

        # call setup after the ddp process has connected
        self.trainer.call_setup_hook(model)

        # on world_size=0 let everyone know training is starting
        if self.trainer.is_global_zero and not torch.distributed.is_initialized(
        ):
            log.info('-' * 100)
            log.info(f'distributed_backend={self.trainer.distributed_backend}')
            log.info(
                f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes'
            )
            log.info('-' * 100)

        # call sync_bn before .cuda(), configure_apex and configure_ddp
        if self.trainer.sync_batchnorm:
            model = self.configure_sync_batchnorm(model)

        # move the model to the correct device
        self.model_to_device(model)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.setup_optimizers(model)

        self.ddp_plugin.on_after_setup_optimizers(self.trainer)

        # set model properties before going into wrapper
        self.trainer.model_connector.copy_trainer_model_properties(model)

        # 16-bit
        model = self.trainer.precision_connector.connect(model)

        self.trainer.convert_to_lightning_optimizers()

        # device ids change depending on the DDP setup
        device_ids = self.get_device_ids()

        # allow user to configure ddp
        model = self.configure_ddp(model, device_ids)

        # set up training routine
        self.trainer.train_loop.setup_training(model)

        # train or test
        results = self.train_or_test()

        # clean up memory
        torch.cuda.empty_cache()
        return results
Example #14
 def on_train_end(self, trainer, pl_module):
     if self.stopped_epoch > 0:
         log.info(
             f'Epoch {self.stopped_epoch:05d}: early stopping triggered.')
Example #15
 def on_train_end(self, trainer, pl_module):
     if self.stopped_epoch > 0 and self.verbose > 0:
         rank_zero_warn(
             'Displayed epoch numbers by `EarlyStopping` start from "1" until v0.6.x,'
             ' but will start from "0" in v0.8.0.', DeprecationWarning)
         log.info(f'Epoch {self.stopped_epoch + 1:05d}: early stopping')
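The on_train_end hooks above come from the EarlyStopping callback and only log once training has already been stopped. A usage sketch for enabling the callback itself with the older Trainer argument shown earlier; the monitored metric name is an assumption:

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor='val_loss',  # assumed metric logged by the LightningModule
    patience=3,          # stop after 3 epochs without improvement
    mode='min',
    verbose=True,
)
trainer = Trainer(early_stop_callback=early_stop)
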
Example #16
 def val_dataloader(self):
     log.info('Validation data loader called.')
     return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)
Example #17
    def fit(
            self,
            model: LightningModule,
            train_dataloader: Optional[DataLoader] = None,
            val_dataloaders: Optional[DataLoader] = None
    ):
        r"""
        Runs the full optimization routine.

        Args:
            model: Model to fit.

            train_dataloader: A PyTorch
                DataLoader with training samples. If the model has
                a predefined train_dataloader method this will be skipped.

            val_dataloaders: Either a single
                PyTorch DataLoader or a list of them, specifying validation samples.
                If the model has a predefined val_dataloaders method this will be skipped.

        Example::

            # Option 1
            # Define the train_dataloader() and val_dataloader() methods
            # in the LightningModule
            # RECOMMENDED FOR MOST RESEARCH AND APPLICATIONS TO MAINTAIN READABILITY
            trainer = Trainer()
            model = LightningModule()
            trainer.fit(model)

            # Option 2
            # in production cases we might want to pass different datasets to the same model
            # Recommended for PRODUCTION SYSTEMS
            train, val = DataLoader(...), DataLoader(...)
            trainer = Trainer()
            model = LightningModule()
            trainer.fit(model, train_dataloader=train, val_dataloaders=val)

            # Option 1 & 2 can be mixed, for example the training set can be
            # defined as part of the model, and validation can then be feed to .fit()

        """
        # bind logger and other properties
        model.logger = self.logger
        self.copy_trainer_model_properties(model)

        # set up the passed in dataloaders (if needed)
        self.__attach_dataloaders(model, train_dataloader, val_dataloaders)

        # check that model is configured correctly
        self.check_model_configuration(model)

        # download the data and do whatever transforms we need
        # do before any spawn calls so that the model can assign properties
        # only on proc 0 because no spawn has happened yet
        model.prepare_data()

        # Run learning rate finder:
        if self.auto_lr_find:
            self._run_lr_finder_internally(model)

        # route to appropriate start method
        # when using multi-node or DDP within a node start each module in a separate process
        if self.use_ddp2:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)

        elif self.use_ddp:
            if self.is_slurm_managing_tasks:
                task = int(os.environ['SLURM_LOCALID'])
                self.ddp_train(task, model)
            else:
                self.__set_random_port()
                # track for predict
                self.model = model
                # train
                mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model,))
                # load weights if not interrupted
                self.load_spawn_weights(model)
                self.model = model

        # 1 gpu or dp option triggers training using DP module
        # easier to avoid NCCL issues
        elif self.use_dp:
            self.dp_train(model)

        elif self.single_gpu:
            self.single_gpu_train(model)

        elif self.use_tpu:  # pragma: no-cover
            log.info(f'training on {self.num_tpu_cores} TPU cores')

            #  COLAB_GPU is an env var available by default in Colab environments.
            start_method = 'fork' if os.getenv('COLAB_GPU') else 'spawn'

            # track for predict
            self.model = model

            # train
            xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method)

            # load weights if not interrupted
            self.load_spawn_weights(model)
            self.model = model

        # ON CPU
        else:
            # run through amp wrapper
            if self.use_amp:
                raise MisconfigurationException('amp + cpu is not supported.  Please use a GPU option')

            # CHOOSE OPTIMIZER
            # allow for lr schedulers as well
            self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

            self.run_pretrain_routine(model)

        # return 1 when finished
        # used for testing or when we need to know that training succeeded
        return 1
Example #18
 def test_dataloader(self):
     log.info('Test data loader called.')
     return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)
Example #19
 def term_handler(self, signum, frame):
     # save
     log.info("bypassing sigterm")
Example #20
    def ddp_train(self, process_idx, mp_queue, model):
        """
        Entry point for ddp

        Args:
            process_idx: current process rank
            mp_queue: multiprocessing queue
            model: pointer to current :class:`LightningModule`
        """
        # show progressbar only on progress_rank 0
        if (self.trainer.node_rank != 0 or process_idx != 0
            ) and self.trainer.progress_bar_callback is not None:
            self.trainer.progress_bar_callback.disable()

        # determine which process we are and world size
        if self.trainer.use_ddp:
            self.trainer.local_rank = process_idx
            self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx
            self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes

        elif self.trainer.use_ddp2:
            self.trainer.local_rank = self.trainer.node_rank
            self.trainer.global_rank = self.trainer.node_rank
            self.trainer.world_size = self.trainer.num_nodes

        # set warning rank
        rank_zero_only.rank = self.trainer.global_rank

        # set up server using proc 0's ip address
        # try to init for 20 times at max in case ports are taken
        # where to store ip_table
        model.trainer = self.trainer
        model.init_ddp_connection(self.trainer.global_rank,
                                  self.trainer.world_size,
                                  self.trainer.is_slurm_managing_tasks)

        # call setup after the ddp process has connected
        self.trainer.call_setup_hook(model)

        # on world_size=0 let everyone know training is starting
        if self.trainer.is_global_zero:
            log.info('-' * 100)
            log.info(f'distributed_backend={self.trainer.distributed_backend}')
            log.info(
                f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes'
            )
            log.info('-' * 100)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(
            model)
        self.trainer.optimizers = optimizers
        self.trainer.lr_schedulers = lr_schedulers
        self.trainer.optimizer_frequencies = optimizer_frequencies

        # call sync_bn before .cuda(), configure_apex and configure_ddp
        if self.trainer.sync_batchnorm:
            model = model.configure_sync_batchnorm(model)

        # MODEL
        # copy model to each gpu
        if self.trainer.on_gpu:
            gpu_idx = process_idx
            self.trainer.root_gpu = gpu_idx
            torch.cuda.set_device(self.trainer.root_gpu)
            model.cuda(self.trainer.root_gpu)

        # set model properties before going into wrapper
        self.trainer.copy_trainer_model_properties(model)

        # AMP -
        # run through amp wrapper before going to distributed DP
        if self.trainer.amp_type == AMPType.APEX:
            model, optimizers = model.configure_apex(amp, model,
                                                     self.trainer.optimizers,
                                                     self.trainer.amp_level)
            self.trainer.optimizers = optimizers
            self.trainer.reinit_scheduler_properties(
                self.trainer.optimizers, self.trainer.lr_schedulers)

        # DDP2 uses all GPUs on the machine
        if self.trainer.distributed_backend == 'ddp' or self.trainer.distributed_backend == 'ddp_spawn':
            device_ids = [self.trainer.root_gpu]
        elif self.trainer.use_ddp2:
            device_ids = self.trainer.data_parallel_device_ids
        else:  # includes ddp_cpu
            device_ids = None

        # allow user to configure ddp
        model = model.configure_ddp(model, device_ids)

        # continue training routine
        results = self.trainer.run_pretrain_routine(model)

        # get original model
        model = self.trainer.get_model()

        # persist info in ddp_spawn
        self.trainer.transfer_distrib_spawn_state_on_fit_end(
            model, mp_queue, results)

        # clean up memory
        torch.cuda.empty_cache()
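The sync_batchnorm branch above delegates to model.configure_sync_batchnorm. As a point of reference, such a hook usually just wraps PyTorch's built-in conversion; the sketch below is an assumption about what that hook does, not the library's exact implementation:

import torch

def configure_sync_batchnorm(self, model):
    # replace every BatchNorm*d layer with SyncBatchNorm so statistics are
    # synchronized across DDP processes before the model is wrapped
    return torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
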
def scale_batch_size(trainer,
                     model: LightningModule,
                     mode: str = 'power',
                     steps_per_trial: int = 3,
                     init_val: int = 2,
                     max_trials: int = 25,
                     batch_arg_name: str = 'batch_size',
                     **fit_kwargs):
    r"""
    Will iteratively try to find the largest batch size for a given model
    that does not give an out of memory (OOM) error.

    Args:
        trainer: The Trainer
        model: Model to fit.

        mode: string setting the search mode. Either `power` or `binsearch`.
            If mode is `power` we keep multiplying the batch size by 2, until
            we get an OOM error. If mode is 'binsearch', we will initially
            also keep multiplying by 2 and after encountering an OOM error
            do a binary search between the last successful batch size and the
            batch size that failed.

        steps_per_trial: number of steps to run with a given batch size.
            Ideally 1 should be enough to test if an OOM error occurs;
            however, in practice a few are needed.

        init_val: initial batch size to start the search with

        max_trials: max number of batch size increases performed before the
           algorithm is terminated

        batch_arg_name: name of the attribute that stores the batch size.
            It is expected that the user has provided a model or datamodule that has a hyperparameter
            with that name. We will look for this attribute name in the following places

            - ``model``
            - ``model.hparams``
            - ``model.datamodule``
            - ``trainer.datamodule`` (the datamodule passed to the tune method)

        **fit_kwargs: remaining arguments to be passed to .fit(), e.g., dataloader
            or datamodule.

    Raises:
        MisconfigurationException:
            If field ``batch_arg_name`` is not found in ``model`` and ``model.hparams``, or
            if batch scaling feature is used with dataloaders passed directly to ``.fit()``.
        ValueError:
            If mode in method ``scale_batch_size`` is neither ``power`` nor ``binsearch``.
    """
    if trainer.fast_dev_run:
        rank_zero_warn(
            'Skipping batch size scaler since fast_dev_run is enabled.',
            UserWarning)
        return

    if not lightning_hasattr(model, batch_arg_name):
        raise MisconfigurationException(
            f'Field {batch_arg_name} not found in either `model` or `model.hparams`'
        )
    if hasattr(model, batch_arg_name) and hasattr(
            model, "hparams") and batch_arg_name in model.hparams:
        rank_zero_warn(
            f'Field `model.{batch_arg_name}` and `model.hparams.{batch_arg_name}` are mutually exclusive!'
            f' `model.{batch_arg_name}` will be used as the initial batch size for scaling.'
            f' If this is not the intended behavior, please remove either one.'
        )

    if hasattr(model.train_dataloader, 'patch_loader_code'):
        raise MisconfigurationException(
            'The batch scaling feature cannot be used with dataloaders passed directly to `.fit()`.'
            ' Please disable the feature or incorporate the dataloader into the model.'
        )

    # Arguments we adjust during the batch size finder, save for restoring
    __scale_batch_dump_params(trainer)

    # Set to values that are required by the algorithm
    __scale_batch_reset_params(trainer, model, steps_per_trial)

    # Save initial model, that is loaded after batch size is found
    save_path = os.path.join(trainer.default_root_dir,
                             'scale_batch_size_temp_model.ckpt')
    trainer.save_checkpoint(str(save_path))

    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.disable()

    # Initially we just double in size until an OOM is encountered
    new_size = _adjust_batch_size(trainer, batch_arg_name,
                                  value=init_val)  # initially set to init_val
    if mode == 'power':
        new_size = _run_power_scaling(trainer, model, new_size, batch_arg_name,
                                      max_trials, **fit_kwargs)
    elif mode == 'binsearch':
        new_size = _run_binsearch_scaling(trainer, model, new_size,
                                          batch_arg_name, max_trials,
                                          **fit_kwargs)
    else:
        raise ValueError(
            'mode in method `scale_batch_size` can only be `power` or `binsearch`'
        )

    garbage_collection_cuda()
    log.info(
        f'Finished batch size finder, will continue with full run using batch size {new_size}'
    )

    # Restore initial state of model
    if trainer.is_global_zero:
        trainer.checkpoint_connector.restore(
            str(save_path), on_gpu=trainer._device_type == DeviceType.GPU)
        fs = get_filesystem(str(save_path))
        if fs.exists(save_path):
            fs.rm(save_path)

    # Finish by resetting variables so trainer is ready to fit model
    __scale_batch_restore_params(trainer)
    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.enable()

    return new_size
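
For orientation, here is a minimal, hedged usage sketch of the batch size finder above. The exact entry point varies across Lightning versions (older releases expose `trainer.scale_batch_size(model)` directly, as in a later example; newer ones route it through `Trainer(auto_scale_batch_size=...)` plus `trainer.tune(model)`). `TinyModel` is a made-up module whose only purpose is to expose a `batch_size` attribute for the finder to adjust.

import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl


class TinyModel(pl.LightningModule):
    def __init__(self, batch_size=2):
        super().__init__()
        self.batch_size = batch_size  # attribute the finder reads and rewrites
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        ds = TensorDataset(torch.randn(512, 32), torch.randint(0, 2, (512,)))
        return DataLoader(ds, batch_size=self.batch_size)


model = TinyModel()
trainer = pl.Trainer(auto_scale_batch_size='binsearch', max_steps=5)
trainer.tune(model)  # runs a finder like the one above and writes the result back to model.batch_size
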
Exemplo n.º 22
0
 def term_handler(self, signum, frame):
     # Todo: required argument `signum` is not used
     # Todo: required argument `frame` is not used
     log.info("bypassing sigterm")
Exemplo n.º 23
0
    def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0):
        """
        Entry point for ddp

        Args:
            process_idx:
            q:
            model:
            is_master:
            proc_offset:

        Returns:

        """
        # offset the process id if requested
        process_idx = process_idx + proc_offset

        # show progressbar only on progress_rank 0
        if (self.node_rank != 0 or process_idx != 0) and self.progress_bar_callback is not None:
            self.progress_bar_callback.disable()

        # determine which process we are and world size
        if self.use_ddp:
            self.local_rank = process_idx
            self.global_rank = self.node_rank * self.num_processes + process_idx
            self.world_size = self.num_nodes * self.num_processes

        elif self.use_ddp2:
            self.local_rank = self.node_rank
            self.global_rank = self.node_rank
            self.world_size = self.num_nodes

        # set warning rank
        rank_zero_only.rank = self.global_rank

        # set up server using proc 0's ip address
        # try to init for 20 times at max in case ports are taken
        # where to store ip_table
        model.trainer = self
        model.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks)

        # call setup after the ddp process has connected
        self.setup('fit')
        if self.is_function_implemented('setup', model):
            model.setup('fit')

        # on global rank 0, let everyone know training is starting
        if self.is_global_zero:
            log.info('-' * 100)
            log.info(f'distributed_backend={self.distributed_backend}')
            log.info(f'All DDP processes registered. Starting ddp with {self.world_size} processes')
            log.info('-' * 100)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

        # MODEL
        # copy model to each gpu
        if self.on_gpu:
            gpu_idx = process_idx
            if is_master:
                # source of truth is cuda for gpu idx
                gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
                gpu_idx = int(gpus[self.local_rank])

            self.root_gpu = gpu_idx
            torch.cuda.set_device(self.root_gpu)
            model.cuda(self.root_gpu)

        # set model properties before going into wrapper
        self.copy_trainer_model_properties(model)

        # AMP
        # run through amp wrapper before going to distributed DP
        if self.use_amp and not NATIVE_AMP_AVALAIBLE:
            model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
            self.optimizers = optimizers
            self.reinit_scheduler_properties(self.optimizers, self.lr_schedulers)

        # pick device ids: the single root GPU for ddp/ddp_spawn, all GPUs for ddp2, None for ddp_cpu
        if self.distributed_backend == 'ddp' or self.distributed_backend == 'ddp_spawn':
            device_ids = [self.root_gpu]
        elif self.use_ddp2:
            device_ids = self.data_parallel_device_ids
        else:  # includes ddp_cpu
            device_ids = None

        # allow user to configure ddp
        model = model.configure_ddp(model, device_ids)

        # continue training routine
        # run_pretrain_routine (defined in trainer.py, around line 1080) sanity-checks a few things before starting the actual training
        results = self.run_pretrain_routine(model)

        # get original model
        model = self.get_model()

        # persist info in ddp_spawn
        self.transfer_ddp_spawn_state_on_fit_end(model, q, results)

        # clean up memory
        torch.cuda.empty_cache()

        if self.global_rank == 0 and self.distributed_backend not in ['ddp_spawn', 'ddp_cpu']:
            return results
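
The `is_master` branch above works because CUDA_VISIBLE_DEVICES renumbers devices inside each worker: torch sees indices starting at 0, while the master process (which kept the parent environment) has to translate its local rank back into a visible-device entry. A tiny stand-alone sketch of that lookup, with purely illustrative values:

import os

# pretend this job was launched with physical GPUs 2 and 3 exposed
os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'

local_rank = 0
gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
gpu_idx = int(gpus[local_rank])
print(gpu_idx)  # -> 2: the entry at position `local_rank` in the visible-device list
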
Exemplo n.º 24
0
    def scale_batch_size(self,
                         model: LightningModule,
                         mode: str = 'power',
                         steps_per_trial: int = 3,
                         init_val: int = 2,
                         max_trials: int = 25,
                         batch_arg_name: str = 'batch_size'):
        r"""
        Will iteratively try to find the largest batch size for a given model
        that does not give an out of memory (OOM) error.

        Args:
            model: Model to fit.

            mode: string setting the search mode. Either `power` or `binsearch`.
                If mode is `power`, the batch size is repeatedly doubled until an
                OOM error occurs. If mode is `binsearch`, it is also doubled at
                first, but after the first OOM a binary search is performed between
                the last successful batch size and the batch size that failed.

            steps_per_trial: number of steps to run with a given batch size.
                Ideally 1 should be enough to test whether an OOM error occurs;
                in practice a few are needed.

            init_val: initial batch size to start the search with

            max_trials: maximum number of batch size increases attempted before
                the algorithm terminates.

        """
        if not hasattr(model, batch_arg_name):
            if not hasattr(model.hparams, batch_arg_name):
                raise MisconfigurationException(
                    'Neither `model.batch_size` nor `model.hparams.batch_size` was found.'
                )

        if hasattr(model.train_dataloader, 'patch_loader_code'):
            raise MisconfigurationException(
                'The batch scaling feature cannot be used with dataloaders'
                ' passed directly to `.fit()`. Please disable the feature or'
                ' incorporate the dataloader into the model.')

        # Arguments we adjust during the batch size finder, save for restoring
        self.__scale_batch_dump_params()

        # Set to values that are required by the algorithm
        self.__scale_batch_reset_params(model, steps_per_trial)

        # Save initial model, that is loaded after batch size is found
        save_path = os.path.join(self.default_root_dir, 'temp_model.ckpt')
        self.save_checkpoint(str(save_path))

        if self.progress_bar_callback:
            self.progress_bar_callback.disable()

        # Initially we just double in size until an OOM is encountered
        new_size = _adjust_batch_size(
            self, value=init_val)  # initially set to init_val
        if mode == 'power':
            new_size = _run_power_scaling(self, model, new_size,
                                          batch_arg_name, max_trials)
        elif mode == 'binsearch':
            new_size = _run_binsearch_scaling(self, model, new_size,
                                              batch_arg_name, max_trials)
        else:
            raise ValueError(
                'mode in method `scale_batch_size` can only be `power` or `binsearch`'
            )

        garbage_collection_cuda()
        log.info(
            f'Finished batch size finder, will continue with full run using batch size {new_size}'
        )

        # Restore initial state of model
        self.restore(str(save_path), on_gpu=self.on_gpu)
        os.remove(save_path)

        # Finish by resetting variables so trainer is ready to fit model
        self.__scale_batch_restore_params()
        if self.progress_bar_callback:
            self.progress_bar_callback.enable()

        return new_size
Exemplo n.º 25
0
    def ddp_train(self, process_idx, model):
        """
        Entry point for ddp

        Args:
            process_idx:
            model:

        Returns:
            Dict with evaluation results

        """
        # determine which process we are and world size
        self.set_world_ranks(process_idx)

        # toggle prog bar
        if self.trainer.global_rank == 0 and self.trainer.progress_bar_callback is not None:
            self.trainer.progress_bar_callback.disable()

        # set warning rank
        rank_zero_only.rank = self.trainer.global_rank

        # set up server using proc 0's ip address
        # try to init for 20 times at max in case ports are taken
        # where to store ip_table
        model.trainer = self.trainer
        self.init_ddp_connection(self.trainer.global_rank,
                                 self.trainer.world_size,
                                 self.trainer.is_slurm_managing_tasks)

        # call setup after the ddp process has connected
        self.trainer.call_setup_hook(model)

        # on global rank 0, let everyone know training is starting
        if self.trainer.is_global_zero and not torch.distributed.is_initialized():
            log.info('-' * 100)
            log.info(
                f'distributed_backend={self.trainer.distributed_backend} (TORCH_ELASTIC)'
            )
            log.info(
                f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes'
            )
            log.info('-' * 100)

        # call sync_bn before .cuda(), configure_apex and configure_ddp
        if self.trainer.sync_batchnorm:
            model = self.configure_sync_batchnorm(model)

        # move the model to the correct device
        self.model_to_device(model, process_idx)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.setup_optimizers(model)

        # set model properties before going into wrapper
        self.trainer.model_connector.copy_trainer_model_properties(model)

        # 16-bit
        model = self.trainer.precision_connector.connect(model)

        # device ids change depending on the DDP setup
        device_ids = self.get_device_ids()

        # allow user to configure ddp
        model = self.configure_ddp(model, device_ids)

        # set up training routine
        self.trainer.train_loop.setup_training(model)

        # train or test
        results = self.train_or_test()

        # clean up memory
        torch.cuda.empty_cache()

        return results
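
On the sync-batchnorm step above: `configure_sync_batchnorm` usually amounts to converting every BatchNorm layer so its statistics are reduced across ranks. A hedged sketch of that conversion with the standard torch helper (the real accelerator method may pass a specific process group instead of None):

import torch


def configure_sync_batchnorm(model: torch.nn.Module) -> torch.nn.Module:
    # replace every BatchNorm*d with SyncBatchNorm; must run before wrapping in DDP
    return torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None)
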
Exemplo n.º 26
0
 def print_nan_gradients(self) -> None:
     model = self.get_model()
     # log every parameter whose gradient contains a NaN
     for name, param in model.named_parameters():
         if (param.grad is not None) and torch.isnan(param.grad.float()).any():
             log.info(f'{name}: {param.grad}')
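
A stand-alone sketch of the same NaN-gradient check outside the Trainer, with gradients forced to NaN purely for demonstration:

import torch

model = torch.nn.Linear(4, 1)
# manufacture NaN gradients so the check below has something to report
for param in model.parameters():
    param.grad = torch.full_like(param, float('nan'))

for name, param in model.named_parameters():
    if param.grad is not None and torch.isnan(param.grad.float()).any():
        print(f'NaN gradient detected in {name}')
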
Exemplo n.º 27
0
def lr_find(
        trainer,
        model: LightningModule,
        train_dataloader: Optional[DataLoader] = None,
        val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None,
        min_lr: float = 1e-8,
        max_lr: float = 1,
        num_training: int = 100,
        mode: str = 'exponential',
        early_stop_threshold: float = 4.0,
        datamodule: Optional[LightningDataModule] = None,
):
    r"""
    lr_find enables the user to do a range test of good initial learning rates,
    to reduce the amount of guesswork in picking a good starting learning rate.

    Args:
        model: Model to do range testing for

        train_dataloader: A PyTorch
            DataLoader with training samples. If the model has
            a predefined train_dataloader method, this will be skipped.

        min_lr: minimum learning rate to investigate

        max_lr: maximum learning rate to investigate

        num_training: number of learning rates to test

        mode: search strategy, either 'linear' or 'exponential'. If set to
            'linear' the learning rate will be searched by linearly increasing
            after each batch. If set to 'exponential', will increase learning
            rate exponentially.

        early_stop_threshold: threshold for stopping the search. If the
            loss at any point is larger than early_stop_threshold*best_loss
            then the search is stopped. To disable, set to None.

        datamodule: An optional `LightningDataModule` which holds the training
            and validation dataloader(s). Note that the `train_dataloader` and
            `val_dataloaders` parameters cannot be used at the same time as
            this parameter, or a `MisconfigurationException` will be raised.


    Example::

        # Setup model and trainer
        model = MyModelClass(hparams)
        trainer = pl.Trainer()

        # Run lr finder
        lr_finder = trainer.lr_find(model, ...)

        # Inspect results
        fig = lr_finder.plot(); fig.show()
        suggested_lr = lr_finder.suggestion()

        # Overwrite lr and create new model
        hparams.lr = suggested_lr
        model = MyModelClass(hparams)

        # Ready to train with new learning rate
        trainer.fit(model)

    """
    save_path = os.path.join(trainer.default_root_dir, 'lr_find_temp.ckpt')

    __lr_finder_dump_params(trainer, model)

    # Prevent going into infinite loop
    trainer.auto_lr_find = False

    # Initialize lr finder object (stores results)
    lr_finder = _LRFinder(mode, min_lr, max_lr, num_training)

    # Use special lr logger callback
    trainer.callbacks = [_LRCallback(num_training,
                                     early_stop_threshold,
                                     progress_bar_refresh_rate=1)]

    # No logging
    trainer.logger = DummyLogger()

    # Max step set to number of iterations
    trainer.max_steps = num_training

    # Disable standard progress bar for fit
    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.disable()

    # Disable standard checkpoint & early stopping
    trainer.checkpoint_callback = False
    trainer.early_stop_callback = None

    # Required for saving the model
    trainer.optimizers, trainer.schedulers = [], []
    trainer.model = model

    # Dump model checkpoint
    trainer.save_checkpoint(str(save_path))

    # Configure optimizer and scheduler
    model.configure_optimizers = lr_finder._exchange_scheduler(model.configure_optimizers)

    # Fit, lr & loss logged in callback
    trainer.fit(model,
                train_dataloader=train_dataloader,
                val_dataloaders=val_dataloaders,
                datamodule=datamodule)

    # Prompt if we stopped early
    if trainer.global_step != num_training:
        log.info('LR finder stopped early due to diverging loss.')

    # Transfer results from callback to lr finder object
    lr_finder.results.update({'lr': trainer.callbacks[0].lrs,
                              'loss': trainer.callbacks[0].losses})
    lr_finder._total_batch_idx = trainer.total_batch_idx  # for debugging purposes

    # Reset model state
    trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu)
    os.remove(save_path)

    # Finish by resetting variables so trainer is ready to fit model
    __lr_finder_restore_params(trainer, model)
    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.enable()

    return lr_finder
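
For intuition, the 'exponential' mode of the range test sweeps learning rates on a log scale from `min_lr` to `max_lr` over `num_training` steps; the exact per-step schedule installed by `_exchange_scheduler` is internal, so the formula below is an illustration rather than the implementation:

# lr_i = min_lr * (max_lr / min_lr) ** (i / (num_training - 1)), for i = 0 .. num_training - 1
min_lr, max_lr, num_training = 1e-8, 1.0, 100
lrs = [min_lr * (max_lr / min_lr) ** (i / (num_training - 1)) for i in range(num_training)]
assert abs(lrs[0] - min_lr) < 1e-12 and abs(lrs[-1] - max_lr) < 1e-6
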
Exemplo n.º 28
0
    def train(self):
        self.run_sanity_check(self.get_model())

        # set stage for logging
        self.logger_connector.set_stage("train")

        self.checkpoint_connector.has_trained = False

        # enable train mode
        model = self.get_model()
        model.train()
        torch.set_grad_enabled(True)

        # reload data when needed
        self.train_loop.reset_train_val_dataloaders(model)

        # hook
        self.train_loop.on_train_start()

        try:
            if self.train_loop.should_skip_training():
                return
            # run all epochs
            for epoch in range(self.current_epoch, self.max_epochs):

                # hook
                self.train_loop.on_train_epoch_start(epoch)

                with self.profiler.profile("run_training_epoch"):
                    # run train epoch
                    self.train_loop.run_training_epoch()

                if self.max_steps and self.max_steps <= self.global_step:
                    return

                # update LR schedulers
                self.optimizer_connector.update_learning_rates(interval='epoch')

                # early stopping
                met_min_epochs = epoch >= self.min_epochs - 1
                met_min_steps = self.global_step >= self.min_steps if self.min_steps else True

                if self.should_stop:
                    if met_min_epochs and met_min_steps:
                        return
                    log.info(
                        'Trainer was signaled to stop but required minimum epochs'
                        f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has'
                        ' not been met. Training will continue...'
                    )

        except KeyboardInterrupt:
            rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...')

            # user could press ctrl+c many times... only shutdown once
            if not self.interrupted:
                self.interrupted = True
                self._state = TrainerState.INTERRUPTED
                self.on_keyboard_interrupt()
        finally:
            # hook
            self.train_loop.on_train_end()
Exemplo n.º 29
0
    def train(self):
        # add signal handlers for process kills
        # def _signal_kill_handler(*args):
        #     return TrainerTrainLoopMixin.run_training_teardown(self)
        #
        # orig_signal_handlers = {}
        # for sig_name in SIGNAL_TERMINATE:
        #     orig_signal_handlers[sig_name] = signal.signal(getattr(signal, sig_name),
        #                                                    _signal_kill_handler)

        # get model
        model = self.get_model()

        # enable train mode
        model.train()

        # enable gradients
        torch.set_grad_enabled(True)

        # load data
        # if reload_dataloaders_every_epoch, this is moved to the epoch loop
        if not self.reload_dataloaders_every_epoch:
            self.reset_train_dataloader(model)
        self.reset_val_dataloader(model)

        # Train start events
        with self.profiler.profile('on_train_start'):
            # callbacks
            self.on_train_start()
            # model hooks
            model.on_train_start()

        try:
            # run all epochs
            for epoch in range(self.current_epoch, self.max_epochs):
                # reset train dataloader
                if self.reload_dataloaders_every_epoch:
                    self.reset_train_dataloader(model)
                # set seed for distributed sampler (enables shuffling for each epoch)
                if (self.use_ddp or self.use_horovod) \
                        and hasattr(self.train_dataloader, 'sampler') \
                        and hasattr(self.train_dataloader.sampler, 'set_epoch'):
                    self.train_dataloader.sampler.set_epoch(epoch)

                # update training progress in trainer and model
                model.current_epoch = epoch
                self.current_epoch = epoch

                # changing gradient according accumulation_scheduler
                self.accumulation_scheduler.on_epoch_start(
                    self, self.get_model())

                # stores accumulated grad fractions per batch
                self.batch_loss_value = TensorRunningAccum(
                    window_length=self.accumulate_grad_batches)

                # -----------------
                # RUN TRAINING EPOCH
                # -----------------
                self.run_training_epoch()

                if self.max_steps and self.max_steps <= self.global_step:
                    self.run_training_teardown()
                    return

                # update LR schedulers
                self.update_learning_rates(interval='epoch')

                # early stopping
                met_min_epochs = epoch >= self.min_epochs - 1
                met_min_steps = self.global_step >= self.min_steps if self.min_steps else True

                if self.should_stop:
                    if (met_min_epochs and met_min_steps) or self.fast_dev_run:
                        self.run_training_teardown()
                        return
                    else:
                        log.info(
                            'Trainer was signaled to stop but required minimum epochs'
                            f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has'
                            ' not been met. Training will continue...')

            self.run_training_teardown()

        except KeyboardInterrupt:
            rank_zero_warn(
                'Detected KeyboardInterrupt, attempting graceful shutdown...')

            # user could press ctrl+c many times... only shutdown once
            if not self.interrupted:
                self.interrupted = True
                self.on_keyboard_interrupt()

                self.run_training_teardown()
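
A note on the `sampler.set_epoch(epoch)` call above: DistributedSampler derives its shuffle from a base seed plus the epoch number, so without it every epoch replays the same permutation. A minimal single-process sketch (explicit num_replicas/rank so no process group is needed):

import torch
from torch.utils.data import TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(8))
sampler = DistributedSampler(dataset, num_replicas=1, rank=0, shuffle=True)

sampler.set_epoch(0)
order_epoch0 = list(sampler)
sampler.set_epoch(1)
order_epoch1 = list(sampler)
print(order_epoch0, order_epoch1)  # almost certainly two different permutations of 0..7
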
Exemplo n.º 30
0
    def configure_optimizers(self):
        """
        method required by pytorch lightning's module

        Here we use the fact that Every optimizer of pytorch can take as argument a list of dict.
        Each dict defining a separate parameter group, and should contain a `params` key, containing a list of
        parameters belonging to it. Other keys should match the keyword arguments accepted by the optimizers,
        and will be used as optimization options for this group.


        Returns
        -------
            One or multiple optimizers and learning_rate schedulers in any of these options:

                - Single optimizer.
                - List or Tuple - List of optimizers.
                - Two lists - The first list has multiple optimizers, the second a list of LR schedulers.
                - Dictionary, with an ‘optimizer’ key and (optionally) a ‘lr_scheduler’ key.
                - Tuple of dictionaries as described, with an optional ‘frequency’ key.
                - None - Fit will run without any optimizer.

        more details on:
        https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.html
        at configure_optimizers()
        """

        # REQUIRED
        if self.hparams.optimizer.name == 'Ranger':
            from ranger import Ranger
            optimizer_class = Ranger
        elif self.hparams.optimizer.name == 'RAdam':
            from radam import RAdam
            optimizer_class = RAdam
        else:
            optimizer_class = getattr(torch.optim, self.hparams.optimizer.name)

        params = []
        if self.depth_net is not None:
            params.append({
                'name': 'Depth',
                'params': self.depth_net.parameters(),
                **self.hparams.optimizer.depth_net_options
            })
            terminal_logger.info("DepthNet's optimizer configured.")

        if self.pose_net is not None:
            params.append({
                'name': 'Pose',
                'params': self.pose_net.parameters(),
                **self.hparams.optimizer.pose_net_options
            })
            terminal_logger.info("PoseNet's optimizer configured.")

        # Create optimizer with parameters
        optimizer = optimizer_class(params)

        # Load and initialize schedulers
        if self.hparams.scheduler.name == 'FlatCosAnnealScheduler':
            from schedulers.flat_cos_anneal_scheduler import FlatCosAnnealScheduler
            step_factor = self.hparams.dataloaders.train.batch_size * self.hparams.trainer.accumulate_grad_batches
            steps_per_epoch = len(self.train_dataset) / step_factor

            scheduler = {
                'scheduler': FlatCosAnnealScheduler(optimizer, steps_per_epoch,
                                                    self.hparams.trainer.max_epochs,
                                                    **self.hparams.scheduler.options),
                'name': 'FlatCosAnnealScheduler',
                'interval': 'step',  # step the scheduler at batch level instead of once per epoch
                'frequency': 1,
            }
        else:
            scheduler_class = getattr(torch.optim.lr_scheduler, self.hparams.scheduler.name)
            # assumes the schedulers used from torch.optim are epoch-based
            scheduler = {
                'scheduler': scheduler_class(optimizer, **self.hparams.scheduler.options),
                'name': self.hparams.scheduler.name,
                'interval': 'epoch',
                'frequency': 1,
            }

        terminal_logger.info("Optimizers and Schedulers configured.")

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}
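
To round this off, a small stand-alone sketch of the per-parameter-group pattern the docstring describes: each dict passed to the optimizer is its own group with its own options, and extra keys such as 'name' are kept on the group. The two tiny Linear modules stand in for depth_net/pose_net and are assumptions for the example.

import torch

depth_net = torch.nn.Linear(8, 8)
pose_net = torch.nn.Linear(8, 6)

optimizer = torch.optim.Adam([
    {'name': 'Depth', 'params': depth_net.parameters(), 'lr': 1e-4},
    {'name': 'Pose', 'params': pose_net.parameters(), 'lr': 1e-5, 'weight_decay': 1e-6},
])

for group in optimizer.param_groups:
    # missing options (e.g. weight_decay for 'Depth') are filled from the optimizer defaults
    print(group['name'], group['lr'], group['weight_decay'])
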