Example #1
 def __configure_accumulated_gradients(self, accumulate_grad_batches):
     if isinstance(accumulate_grad_batches, dict):
         self.accumulation_scheduler = GradientAccumulationScheduler(accumulate_grad_batches)
     elif isinstance(accumulate_grad_batches, int):
         schedule = {1: accumulate_grad_batches}
         self.accumulation_scheduler = GradientAccumulationScheduler(schedule)
     else:
         raise TypeError("Gradient accumulation supports only int and dict types")
 def configure_accumulated_gradients(
         self, accumulate_grad_batches: Union[int, Dict[int, int]]) -> None:
     if isinstance(accumulate_grad_batches, dict):
         self.trainer.accumulation_scheduler = GradientAccumulationScheduler(
             accumulate_grad_batches)
     elif isinstance(accumulate_grad_batches, int):
         schedule = {0: accumulate_grad_batches}
         self.trainer.accumulation_scheduler = GradientAccumulationScheduler(
             schedule)
     else:
         raise TypeError(
             "Gradient accumulation supports only int and dict types")
Example #3
    def _handle_gradient_accumulation_steps(self):
        """This functions overrides the trainer.accumulation_scheduler to generate ``accumulate_grad_batches=1``.

        Therefore, ``optimizer_step`` will be called on every batch, and the IPU will handle grad accumulation.
        """
        if self.accumulate_grad_batches > 1:
            self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler(
                {0: 1})
def test_attach_model_callbacks():
    """ Test that the callbacks defined in the model and through Trainer get merged correctly. """
    def assert_composition(trainer_callbacks, model_callbacks, expected):
        model = Mock()
        model.configure_callbacks.return_value = model_callbacks
        trainer = Trainer(checkpoint_callback=False,
                          progress_bar_refresh_rate=0,
                          callbacks=trainer_callbacks)
        cb_connector = CallbackConnector(trainer)
        cb_connector._attach_model_callbacks(model, trainer)
        assert trainer.callbacks == expected

    early_stopping = EarlyStopping()
    progress_bar = ProgressBar()
    lr_monitor = LearningRateMonitor()
    grad_accumulation = GradientAccumulationScheduler({1: 1})

    # no callbacks
    assert_composition(trainer_callbacks=[], model_callbacks=[], expected=[])

    # callbacks of different types
    assert_composition(trainer_callbacks=[early_stopping],
                       model_callbacks=[progress_bar],
                       expected=[early_stopping, progress_bar])

    # same callback type twice, different instance
    assert_composition(trainer_callbacks=[progress_bar,
                                          EarlyStopping()],
                       model_callbacks=[early_stopping],
                       expected=[progress_bar, early_stopping])

    # multiple callbacks of the same type in trainer
    assert_composition(trainer_callbacks=[
        LearningRateMonitor(),
        EarlyStopping(),
        LearningRateMonitor(),
        EarlyStopping()
    ],
                       model_callbacks=[early_stopping, lr_monitor],
                       expected=[early_stopping, lr_monitor])

    # multiple callbacks of the same type, in both trainer and model
    assert_composition(trainer_callbacks=[
        LearningRateMonitor(), progress_bar,
        EarlyStopping(),
        LearningRateMonitor(),
        EarlyStopping()
    ],
                       model_callbacks=[
                           early_stopping, lr_monitor, grad_accumulation,
                           early_stopping
                       ],
                       expected=[
                           progress_bar, early_stopping, lr_monitor,
                           grad_accumulation, early_stopping
                       ])
Example #5
 def _handle_gradient_accumulation_steps(self):
     """
     This function overrides the trainer.accumulation_scheduler to generate
     ``accumulate_grad_batches=1``.
     Therefore, ``optimizer_step`` will be called on every batch,
     and the DeepSpeed engine handles the gradient accumulation logic internally.
     """
     if self.config.get("gradient_accumulation_steps") > 1:
         self._original_accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches
         # todo (tchaton) Add support for accumulate_grad_batches being a dictionary.
         self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler({0: 1})
     else:
         self._original_accumulate_grad_batches = None
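Here the accumulation factor comes from the DeepSpeed config rather than from the Lightning scheduler, which is forced down to a factor of 1. A sketch of the config fragment the plugin consults (the key names are the standard DeepSpeed ones; the values are illustrative):

# Sketch: the key read above via self.config.get("gradient_accumulation_steps").
deepspeed_config = {
    "train_micro_batch_size_per_gpu": 8,   # illustrative value
    "gradient_accumulation_steps": 4,      # accumulation handled inside the DeepSpeed engine
}
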
Example #6
 def _handle_gradient_accumulation_steps(self):
     """
     This function overrides the trainer.accumulation_scheduler to generate
     ``accumulate_grad_batches=1``.
     Therefore, ``optimizer_step`` will be called on every batch, and the IPU will handle grad accumulation.
     """
     self._original_accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches
     if not isinstance(self._original_accumulate_grad_batches, int):
         raise MisconfigurationException(
             f"IPUs currently only support accumulate_grad_batches being an integer value. "
             f"Received {self._original_accumulate_grad_batches}")
     if self._original_accumulate_grad_batches > 1:
         self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler(
             {0: 1})
    def _configure_accumulated_gradients(
        self,
        accumulate_grad_batches: Optional[Union[int, Dict[int, int]]] = None
    ) -> None:
        grad_accum_callback = [
            cb for cb in self.trainer.callbacks
            if isinstance(cb, GradientAccumulationScheduler)
        ]

        if grad_accum_callback:
            if accumulate_grad_batches is not None:
                raise MisconfigurationException(
                    "You have set both `accumulate_grad_batches` and passed an instance of "
                    "`GradientAccumulationScheduler` inside callbacks. Either remove `accumulate_grad_batches` "
                    "from trainer or remove `GradientAccumulationScheduler` from callbacks list."
                )
            grad_accum_callback = grad_accum_callback[0]
        else:
            if accumulate_grad_batches is None:
                accumulate_grad_batches = 1

            if isinstance(accumulate_grad_batches, dict):
                grad_accum_callback = GradientAccumulationScheduler(
                    accumulate_grad_batches)
            elif isinstance(accumulate_grad_batches, int):
                grad_accum_callback = GradientAccumulationScheduler(
                    {0: accumulate_grad_batches})
            else:
                raise MisconfigurationException(
                    f"`accumulate_grad_batches` should be an int or a dict. Got {accumulate_grad_batches}."
                )

            self.trainer.callbacks.append(grad_accum_callback)

        self.trainer.accumulate_grad_batches = grad_accum_callback.get_accumulate_grad_batches(
            0)
        self.trainer.accumulation_scheduler = grad_accum_callback
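Note the exclusivity rule enforced above: setting accumulate_grad_batches on the Trainer and also passing a GradientAccumulationScheduler callback raises a MisconfigurationException. A sketch of the rejected combination, assuming a Lightning version that ships this connector:

# Sketch: this combination is rejected by the connector above.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import GradientAccumulationScheduler

trainer = Trainer(
    accumulate_grad_batches=2,                          # Trainer-level setting ...
    callbacks=[GradientAccumulationScheduler({0: 2})],  # ... plus the callback: raises
)
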
def test_trainer_accumulate_grad_batches_with_callback(tmpdir):
    with patch("torch.optim.SGD.zero_grad") as sgd_zero_grad:
        model = BoringModel()
        trainer = Trainer(
            default_root_dir=tmpdir,
            limit_train_batches=10,
            limit_val_batches=1,
            max_epochs=4,
            enable_model_summary=False,
            callbacks=[GradientAccumulationScheduler({
                1: 2,
                3: 4
            })],
        )
        assert trainer.accumulate_grad_batches == 1
        trainer.fit(model)

        assert sum(
            isinstance(cb, GradientAccumulationScheduler)
            for cb in trainer.callbacks) == 1
        assert sgd_zero_grad.call_count == 10 + 5 + 5 + 3
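The expected zero_grad count follows from the schedule {1: 2, 3: 4} with 10 training batches per epoch over 4 epochs: factor 1 in epoch 0, factor 2 in epochs 1 and 2, factor 4 in epoch 3, with leftover batches at the end of an epoch still triggering a final optimizer step. A small sketch of that arithmetic:

# Sketch of the arithmetic behind the assertion above.
import math

batches_per_epoch = 10
factors = [1, 2, 2, 4]  # per-epoch accumulation factor resolved from {1: 2, 3: 4}
steps = [math.ceil(batches_per_epoch / f) for f in factors]
assert steps == [10, 5, 5, 3] and sum(steps) == 23
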
Example #9
class Trainer(TrainerIO):
    def __init__(self,
                 experiment=None,
                 early_stop_callback=None,
                 checkpoint_callback=None,
                 gradient_clip=0,
                 process_position=0,
                 nb_gpu_nodes=1,
                 gpus=None,
                 log_gpu_memory=False,
                 show_progress_bar=True,
                 overfit_pct=0.0,
                 track_grad_norm=-1,
                 check_val_every_n_epoch=1,
                 fast_dev_run=False,
                 accumulate_grad_batches=1,
                 max_nb_epochs=1000,
                 min_nb_epochs=1,
                 train_percent_check=1.0,
                 val_percent_check=1.0,
                 test_percent_check=1.0,
                 val_check_interval=1.0,
                 log_save_interval=100,
                 add_log_row_interval=10,
                 distributed_backend=None,
                 use_amp=False,
                 print_nan_grads=False,
                 print_weights_summary=True,
                 weights_save_path=None,
                 amp_level='O2',
                 nb_sanity_val_steps=5):
        """

        :param experiment: Test-tube experiment
        :param early_stop_callback: Callback for early stopping
        :param checkpoint_callback: Callback for checkpointing
        :param gradient_clip: int. 0 means don't clip.
        :param process_position: shown in the tqdm bar
        :param nb_gpu_nodes: number of GPU nodes
        :param gpus: int. (ie: 2 gpus) OR list to specify which GPUs [0, 1] or '0,1'
        :param log_gpu_memory: Bool. If true, adds memory logs
        :param show_progress_bar: Bool. If true shows tqdm bar
        :param overfit_pct: float. uses this much of all datasets
        :param track_grad_norm: int. -1 no tracking. Otherwise tracks that norm
        :param check_val_every_n_epoch: int. check val every n train epochs
        :param fast_dev_run: Bool. runs full iteration over everything to find bugs
        :param accumulate_grad_batches: int. Accumulates grads every k batches
        :param max_nb_epochs: int.
        :param min_nb_epochs: int.
        :param train_percent_check: float. How much of the train set to check
        :param val_percent_check: float. How much of the val set to check
        :param test_percent_check: float. How much of the test set to check
        :param val_check_interval: float. Check val this frequently within a train epoch
        :param log_save_interval: int. Writes logs to disk this often
        :param add_log_row_interval: int. How often to add logging rows
        :param distributed_backend: str. dp, or ddp.
        :param use_amp: Bool. If true uses apex for 16bit precision
        :param print_nan_grads: Bool. Prints nan gradients
        :param print_weights_summary: Bool. Prints summary of weights
        :param weights_save_path: str. Where to save weights if on cluster
        :param amp_level: str. Check nvidia docs for level
        :param nb_sanity_val_steps: int. How many val steps before a full train loop.
        """
        # Transfer params
        self.nb_gpu_nodes = nb_gpu_nodes
        self.log_gpu_memory = log_gpu_memory
        self.gradient_clip = gradient_clip
        self.check_val_every_n_epoch = check_val_every_n_epoch
        self.enable_early_stop = early_stop_callback is not None
        self.track_grad_norm = track_grad_norm
        self.fast_dev_run = fast_dev_run
        self.on_gpu = gpus is not None and torch.cuda.is_available()
        self.process_position = process_position
        self.print_weights_summary = print_weights_summary
        self.max_nb_epochs = max_nb_epochs
        self.min_nb_epochs = min_nb_epochs
        self.nb_sanity_val_steps = nb_sanity_val_steps
        self.print_nan_grads = print_nan_grads

        # training bookkeeping
        self.total_batch_nb = 0
        self.running_loss = []
        self.avg_loss = 0
        self.batch_nb = 0
        self.tqdm_metrics = {}
        self.nb_val_batches = 0
        self.nb_tng_batches = 0
        self.nb_test_batches = 0
        self.tng_dataloader = None
        self.test_dataloader = None
        self.val_dataloader = None

        # training state
        self.model = None
        self.testing = False
        self.lr_schedulers = []
        self.optimizers = None
        self.global_step = 0
        self.current_epoch = 0
        self.total_batches = 0

        # configure early stop callback
        self.early_stop_callback = early_stop_callback

        # configure weights save path
        self.__configure_weights_path(checkpoint_callback, weights_save_path)

        # configure experiment
        self.experiment = experiment
        self.exp_save_path = None
        if self.experiment is not None:
            self.exp_save_path = experiment.get_data_path(
                experiment.name, experiment.version)

        # accumulated grads
        self.__configure_accumulated_gradients(accumulate_grad_batches)

        # allow int, string and gpu list
        self.data_parallel_device_ids = self.__parse_gpu_ids(gpus)

        # distributed backend choice
        self.use_ddp = False
        self.use_dp = False
        self.single_gpu = False
        self.__set_distributed_mode(distributed_backend, nb_gpu_nodes)

        # init flags for SLURM+ddp to work
        self.proc_rank = 0
        self.world_size = 1
        self.node_rank = 0
        self.__configure_slurm_ddp(self.data_parallel_device_ids, nb_gpu_nodes)

        # nvidia setup
        self.__set_nvidia_flags(self.is_slurm_managing_tasks,
                                self.data_parallel_device_ids)

        # can't init progress bar here because starting a new process
        # means the prog_bar won't survive pickling
        self.show_progress_bar = show_progress_bar

        # logging
        self.log_save_interval = log_save_interval
        self.val_check_interval = val_check_interval
        self.add_log_row_interval = add_log_row_interval

        # how much of the data to use
        self.__determine_data_use_amount(train_percent_check,
                                         val_percent_check, test_percent_check,
                                         overfit_pct)

        # 16 bit mixed precision training using apex
        self.amp_level = amp_level
        self.__init_amp(use_amp)

    def __configure_weights_path(self, checkpoint_callback, weights_save_path):
        """
        Weights path is set with this priority:
        1. checkpoint_callback's path (if passed in)
        2. user-provided weights_save_path
        3. otherwise os.getcwd()
        """
        self.weights_save_path = weights_save_path

        # configure checkpoint callback
        self.checkpoint_callback = checkpoint_callback
        if self.checkpoint_callback is not None:
            self.checkpoint_callback.save_function = self.save_checkpoint

            # if checkpoint callback used, then override the weights path
            self.weights_save_path = self.checkpoint_callback.filepath

        # if weights_save_path is still None here, set it to the current working dir
        if self.weights_save_path is None:
            self.weights_save_path = os.getcwd()

    def __init_amp(self, use_amp):
        self.use_amp = use_amp and APEX_AVAILABLE
        if self.use_amp:
            print('using 16bit precision')

        if use_amp and not APEX_AVAILABLE:  # pragma: no cover
            msg = """
            You set use_amp=True but do not have apex installed.
            Install apex first using this guide and rerun with use_amp=True:
            https://github.com/NVIDIA/apex#linux

            this run will NOT use 16 bit precision
            """
            raise ModuleNotFoundError(msg)

    def __configure_accumulated_gradients(self, accumulate_grad_batches):
        if isinstance(accumulate_grad_batches, dict):
            self.accumulation_scheduler = GradientAccumulationScheduler(
                accumulate_grad_batches)
        elif isinstance(accumulate_grad_batches, int):
            schedule = {1: accumulate_grad_batches}
            self.accumulation_scheduler = GradientAccumulationScheduler(
                schedule)
        else:
            raise TypeError(
                "Gradient accumulation supports only int and dict types")

    def __parse_gpu_ids(self, gpus):
        """
        :param gpus: Int, string or list of ids
        :return:
        """
        # if gpus = -1 then use all available devices
        # otherwise, split the string using commas
        if gpus is not None:
            if type(gpus) is list:
                gpus = gpus
            elif type(gpus) is str:
                if gpus == '-1':
                    gpus = list(range(0, torch.cuda.device_count()))
                else:
                    gpus = [int(x.strip()) for x in gpus.split(',')]
            elif type(gpus) is int:
                gpus = gpus
            else:
                raise Exception('gpus has to be a string, int or list of ints')

        return gpus

    @property
    def num_gpus(self):
        gpus = self.data_parallel_device_ids
        if gpus is None:
            return 0
        if type(gpus) is list:
            return len(gpus)
        if type(gpus) is int:
            return gpus

        m = 'gpus must be int, none or list of ints'
        raise MisconfigurationException(m)

    def __set_distributed_mode(self, distributed_backend, nb_gpu_nodes):
        # make DP and DDP mutually exclusive
        # single GPU will also use DP with devices=[0]
        requested_gpus = self.data_parallel_device_ids is not None

        num_gpus = self.num_gpus
        if num_gpus > 0:
            # single GPU case
            if num_gpus == 1:
                self.single_gpu = True

            elif num_gpus > 1 and distributed_backend is not None:
                # DP, DDP case
                self.use_dp = distributed_backend == 'dp'
                self.use_ddp = distributed_backend == 'ddp'

                # use ddp automatically if nb_gpu_nodes > 1
                if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
                    self.use_ddp = True
                    self.use_dp = False
                    w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
                        'Switching to DistributedDataParallel for you. ' \
                        'To silence this warning set distributed_backend=ddp'
                    warnings.warn(w)

            elif distributed_backend is None:
                m = 'When using multiple GPUs set ' \
                    'Trainer(distributed_backend=dp) (or ddp)'
                raise MisconfigurationException(m)

        print('gpu available: {}, used: {}'.format(torch.cuda.is_available(),
                                                   self.on_gpu))

    def __configure_slurm_ddp(self, gpu_ids, nb_gpu_nodes):
        self.is_slurm_managing_tasks = False

        nb_gpus = len(gpu_ids) if type(gpu_ids) is list else gpu_ids

        # extract SLURM flag vars
        # whenever we have the correct number of tasks, we let slurm manage processes
        # otherwise we launch the required number of processes
        if self.use_ddp:
            self.nb_requested_gpus = nb_gpus * nb_gpu_nodes
            self.nb_slurm_tasks = 0
            try:
                self.nb_slurm_tasks = int(os.environ['SLURM_NTASKS'])
                self.is_slurm_managing_tasks = self.nb_slurm_tasks == self.nb_requested_gpus
            except Exception:
                # likely not on slurm, so set the slurm managed flag to false
                self.is_slurm_managing_tasks = False

    def __set_nvidia_flags(self, is_slurm_managing_tasks,
                           data_parallel_device_ids):
        if data_parallel_device_ids is None:
            return

        # set the correct cuda visible devices (using pci order)
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

        # when slurm is managing the task it sets the visible devices
        if not is_slurm_managing_tasks:
            if type(data_parallel_device_ids) is int:
                id_str = ','.join(
                    str(x) for x in list(range(data_parallel_device_ids)))
                os.environ["CUDA_VISIBLE_DEVICES"] = id_str
            else:
                gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
                os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str

        print(f'VISIBLE GPUS: {os.environ["CUDA_VISIBLE_DEVICES"]}')

    @property
    def data_parallel(self):
        return self.use_dp or self.use_ddp

    def __determine_data_use_amount(self, train_percent_check,
                                    val_percent_check, test_percent_check,
                                    overfit_pct):
        """
        Use less data for debugging purposes
        """
        self.train_percent_check = train_percent_check
        self.val_percent_check = val_percent_check
        self.test_percent_check = test_percent_check
        if overfit_pct > 0:
            self.train_percent_check = overfit_pct
            self.val_percent_check = overfit_pct
            self.test_percent_check = overfit_pct

    def __get_model(self):
        return self.model.module if self.data_parallel else self.model

    def __is_function_implemented(self, f_name):
        model = self.__get_model()
        f_op = getattr(model, f_name, None)
        return callable(f_op)

    def __is_overriden(self, f_name):
        model = self.__get_model()
        super_object = LightningModule

        # when the code pointers differ, the method was overridden
        is_overriden = getattr(model, f_name).__code__ is not getattr(
            super_object, f_name).__code__
        return is_overriden

    @property
    def __tng_tqdm_dic(self):
        tqdm_dic = {
            'loss': '{0:.3f}'.format(self.avg_loss),
            'epoch': '{}'.format(self.current_epoch),
            'batch_nb': '{}'.format(self.batch_nb),
        }

        if self.experiment is not None:
            tqdm_dic['v_nb'] = self.experiment.version

        tqdm_dic.update(self.tqdm_metrics)

        if self.on_gpu:
            tqdm_dic['gpu'] = '{}'.format(torch.cuda.current_device())

        return tqdm_dic

    @property
    def tng_tqdm_dic(self):
        """
        Read-only for tqdm metrics
        :return:
        """
        return self.__tng_tqdm_dic

    def __layout_bookeeping(self):

        # determine number of training batches
        self.nb_tng_batches = len(self.tng_dataloader)
        self.nb_tng_batches = int(self.nb_tng_batches *
                                  self.train_percent_check)

        # determine number of validation batches
        # val datasets could be none, 1 or 2+
        if self.val_dataloader is not None:
            self.nb_val_batches = sum(
                len(dataloader) for dataloader in self.val_dataloader)
            self.nb_val_batches = int(self.nb_val_batches *
                                      self.val_percent_check)
            self.nb_val_batches = max(1, self.nb_val_batches)

        # determine number of test batches
        if self.test_dataloader is not None:
            self.nb_test_batches = sum(
                len(dataloader) for dataloader in self.test_dataloader)
            self.nb_test_batches = int(self.nb_test_batches *
                                       self.test_percent_check)
            self.nb_test_batches = max(1, self.nb_test_batches)

        # determine when to check validation
        self.val_check_batch = int(self.nb_tng_batches *
                                   self.val_check_interval)
        self.val_check_batch = max(1, self.val_check_batch)

    def __add_tqdm_metrics(self, metrics):
        for k, v in metrics.items():
            if type(v) is torch.Tensor:
                v = v.item()

            self.tqdm_metrics[k] = v

    def __evaluation_forward(self,
                             model,
                             data_batch,
                             batch_i,
                             dataloader_i,
                             test=False):
        # make dataloader_i arg in validation_step optional
        args = [data_batch, batch_i]

        if test and len(self.test_dataloader) > 1:
            args.append(dataloader_i)

        elif not test and len(self.val_dataloader) > 1:
            args.append(dataloader_i)

        # handle DP, DDP forward
        if self.use_ddp or self.use_dp:
            output = model(*args)
            return output

        # CPU, single GPU
        if self.single_gpu:
            # for single GPU put inputs on gpu manually
            root_gpu = 0
            if type(self.data_parallel_device_ids) is list:
                root_gpu = self.data_parallel_device_ids[0]
            data_batch = self.transfer_batch_to_gpu(data_batch, root_gpu)
            args[0] = data_batch

        if test:
            output = model.test_step(*args)
        else:
            output = model.validation_step(*args)

        return output

    def evaluate(self, model, dataloaders, max_batches, test=False):
        """
        Run evaluation code
        :param model: PT model
        :param dataloaders: list of PT dataloaders
        :param max_batches: Scalar
        :param dataloader_i:
        :param test: boolean
        :return:
        """
        # enable eval mode
        model.zero_grad()
        model.eval()

        # disable gradients to save memory
        torch.set_grad_enabled(False)

        # bookkeeping
        outputs = []

        # run training
        for dataloader_i, dl in enumerate(dataloaders):
            dl_outputs = []
            for batch_i, data_batch in enumerate(dl):

                if data_batch is None:  # pragma: no cover
                    continue

                # stop short when on fast_dev_run (sets max_batch=1)
                if batch_i >= max_batches:
                    break

                # -----------------
                # RUN EVALUATION STEP
                # -----------------
                output = self.__evaluation_forward(model, data_batch, batch_i,
                                                   dataloader_i, test)

                # track outputs for collation
                dl_outputs.append(output)

                # batch done
                if self.show_progress_bar:
                    self.progress_bar.update(1)
            outputs.append(dl_outputs)

        eval_results = {}

        # give the model a chance to do something with the outputs (if the method is defined)
        model = self.__get_model()
        if len(dataloaders) == 1:
            outputs = outputs[0]
        if test and self.__is_overriden('test_end'):
            eval_results = model.test_end(outputs)
        elif self.__is_overriden('validation_end'):
            eval_results = model.validation_end(outputs)

        # enable train mode again
        model.train()

        # re-enable gradients
        torch.set_grad_enabled(True)

        return eval_results

    def get_dataloaders(self, model):
        """
        Dataloaders are provided by the model
        :param model:
        :return:
        """

        self.tng_dataloader = model.tng_dataloader
        self.test_dataloader = model.test_dataloader
        self.val_dataloader = model.val_dataloader

        # handle returning an actual dataloader instead of a list of loaders
        have_test_loaders = self.test_dataloader is not None
        if have_test_loaders and not isinstance(self.test_dataloader, list):
            self.test_dataloader = [self.test_dataloader]

        have_val_loaders = self.val_dataloader is not None
        if have_val_loaders and not isinstance(self.val_dataloader, list):
            self.val_dataloader = [self.val_dataloader]

        if self.use_ddp and not isinstance(self.tng_dataloader.sampler,
                                           DistributedSampler):
            msg = """
            You're using multiple gpus and multiple nodes without using a DistributedSampler
            to assign a subset of your data to each process. To silence this warning, pass a
            DistributedSampler to your DataLoader.

            ie: this:
            dataset = myDataset()
            dataloader = Dataloader(dataset)

            becomes:
            dataset = myDataset()
            dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
            dataloader = Dataloader(dataset, sampler=dist_sampler)

            If you want each process to load the full dataset, ignore this warning.
            """
            warnings.warn(msg)

        if self.use_ddp and self.val_dataloader is not None:
            for dataloader in self.val_dataloader:
                if not isinstance(dataloader.sampler, DistributedSampler):
                    msg = """
                    Your val_dataloader(s) are not all DistributedSamplers.
                    You're using multiple gpus and multiple nodes without using a DistributedSampler
                    to assign a subset of your data to each process. To silence this warning, pass a
                    DistributedSampler to your DataLoader.

                    ie: this:
                    dataset = myDataset()
                    dataloader = Dataloader(dataset)

                    becomes:
                    dataset = myDataset()
                    dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
                    dataloader = Dataloader(dataset, sampler=dist_sampler)

                    If you want each process to load the full dataset, ignore this warning.
                    """
                    warnings.warn(msg)
                    break

        if self.use_ddp and self.test_dataloader is not None:
            for dataloader in self.test_dataloader:
                if not isinstance(dataloader.sampler, DistributedSampler):
                    msg = """
                    Your test_dataloader(s) are not all DistributedSamplers.
                    You're using multiple gpus and multiple nodes without using a DistributedSampler
                    to assign a subset of your data to each process. To silence this warning, pass a
                    DistributedSampler to your DataLoader.

                    ie: this:
                    dataset = myDataset()
                    dataloader = Dataloader(dataset)

                    becomes:
                    dataset = myDataset()
                    dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
                    dataloader = Dataloader(dataset, sampler=dist_sampler)

                    If you want each process to load the full dataset, ignore this warning.
                    """
                    warnings.warn(msg)
                    break

    # -----------------------------
    # MODEL TRAINING
    # -----------------------------
    def fit(self, model):
        # when using multi-node or DDP within a node start each module in a separate process
        if self.use_ddp:
            # must copy only the meta of the exp so it survives pickle/unpickle
            #  when going to new process
            if self.experiment is not None:
                self.experiment = self.experiment.get_meta_copy()

            if self.is_slurm_managing_tasks:
                task = int(os.environ['SLURM_LOCALID'])
                self.ddp_train(task, model)
            else:
                nb_gpus = self.nb_requested_gpus
                nb_tasks = self.nb_slurm_tasks
                msg = f"""
                You requested {nb_gpus} GPUs but launched {nb_tasks} slurm tasks.
                We will launch {nb_gpus} processes for you.
                We recommend you let slurm manage the processes by setting:
                --ntasks-per-node={nb_gpus}
                If you're not using SLURM, ignore this message!
                """
                warnings.warn(msg)
                mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model, ))

        # 1 gpu or dp option triggers training using DP module
        # easier to avoid NCCL issues
        elif self.use_dp:
            self.__dp_train(model)

        elif self.single_gpu:
            self.__single_gpu_train(model)

        # ON CPU
        else:
            # run through amp wrapper
            if self.use_amp:
                raise MisconfigurationException('amp + cpu is not supported.'
                                                ' Please use a GPU option')

            # CHOOSE OPTIMIZER
            # allow for lr schedulers as well
            self.optimizers, self.lr_schedulers = self.init_optimizers(
                model.configure_optimizers())

            self.__run_pretrain_routine(model)

        # return 1 when finished
        # used for testing or when we need to know that training succeeded
        return 1

    def init_optimizers(self, optimizers):

        # single optimizer
        if isinstance(optimizers, Optimizer):
            return [optimizers], []

        # two lists
        elif len(optimizers) == 2 and isinstance(optimizers[0], list):
            optimizers, lr_schedulers = optimizers
            return optimizers, lr_schedulers

        # single list or tuple
        elif isinstance(optimizers, list) or isinstance(optimizers, tuple):
            return optimizers, []

    def __single_gpu_train(self, model):
        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(
            model.configure_optimizers())

        root_gpu = 0
        if type(self.data_parallel_device_ids) is list:
            root_gpu = self.data_parallel_device_ids[0]
        model.cuda(root_gpu)

        if self.use_amp:
            # An example
            model, optimizers = amp.initialize(
                model,
                self.optimizers,
                opt_level=self.amp_level,
            )
            self.optimizers = optimizers

        self.__run_pretrain_routine(model)

    def __dp_train(self, model):

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(
            model.configure_optimizers())

        root_gpu = 0
        if type(self.data_parallel_device_ids) is list:
            root_gpu = self.data_parallel_device_ids[0]
        model.cuda(root_gpu)

        # check for this bug (amp + dp + !01 doesn't work)
        # https://github.com/NVIDIA/apex/issues/227
        if self.use_dp and self.use_amp:
            m = f"""
            Amp level {self.amp_level} with DataParallel is not supported.
            See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
            We recommend you switch to ddp if you want to use amp
            """
            raise MisconfigurationException(m)

        model = LightningDataParallel(model,
                                      device_ids=self.data_parallel_device_ids)

        self.__run_pretrain_routine(model)

    def ddp_train(self, gpu_nb, model):
        """
        Entry point into a DDP process
        :param gpu_nb:
        :param model:
        :return:
        """
        # node rank using relative slurm id
        # otherwise default to node rank 0
        try:
            node_id = os.environ['SLURM_NODEID']
            self.node_rank = int(node_id)
        except Exception:
            self.node_rank = 0

        # recover original exp before went into process
        # init in write mode only on proc 0
        if self.experiment is not None:
            self.experiment.debug = self.proc_rank > 0
            self.experiment = self.experiment.get_non_ddp_exp()

        # show progbar only on prog_rank 0
        self.show_progress_bar = self.show_progress_bar and self.node_rank == 0 and gpu_nb == 0

        # determine which process we are and world size
        self.proc_rank = self.node_rank * self.num_gpus + gpu_nb
        self.world_size = self.nb_gpu_nodes * self.num_gpus

        # let the exp know the rank to avoid overwriting logs
        if self.experiment is not None:
            self.experiment.rank = self.proc_rank

        # set up server using proc 0's ip address
        # try to init for 20 times at max in case ports are taken
        # where to store ip_table
        self.__init_tcp_connection()

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(
            model.configure_optimizers())

        # MODEL
        # copy model to each gpu
        torch.cuda.set_device(gpu_nb)
        model.cuda(gpu_nb)

        # AMP
        # run through amp wrapper before going to distributed DP
        if self.use_amp:
            # An example
            model, optimizers = amp.initialize(
                model,
                self.optimizers,
                opt_level=self.amp_level,
            )
            self.optimizers = optimizers

        model = LightningDistributedDataParallel(model,
                                                 device_ids=[gpu_nb],
                                                 find_unused_parameters=True)

        # continue training routine
        self.__run_pretrain_routine(model)

    def __init_tcp_connection(self):
        """
        Connect all procs in the world using the env:// init
        Use the first node as the root address
        :param port:
        :param tries:
        :return:
        """
        # sets the appropriate port
        try:
            port = os.environ['MASTER_PORT']
        except Exception:
            port = 12910
            os.environ['MASTER_PORT'] = str(port)

        # figure out the root node addr
        try:
            root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
        except Exception:
            root_node = '127.0.0.2'

        root_node = self.resolve_root_node_address(root_node)
        os.environ['MASTER_ADDR'] = root_node

        dist.init_process_group("nccl",
                                rank=self.proc_rank,
                                world_size=self.world_size)

    def resolve_root_node_address(self, root_node):
        if '[' in root_node:
            name = root_node.split('[')[0]
            number = root_node.split(',')[0]
            if '-' in number:
                number = number.split('-')[0]

            number = re.sub('[^0-9]', '', number)
            root_node = name + number

        return root_node

    def __run_pretrain_routine(self, model):
        """
        Sanity check a few things before starting actual training
        :param model:
        :return:
        """
        ref_model = model
        if self.data_parallel:
            ref_model = model.module

        # give model convenience properties
        ref_model.trainer = self

        # set local properties on the model
        ref_model.on_gpu = self.on_gpu
        ref_model.use_dp = self.use_dp
        ref_model.use_ddp = self.use_ddp
        ref_model.use_amp = self.use_amp
        ref_model.testing = self.testing

        # register auto-resubmit when on SLURM
        self.register_slurm_signal_handlers()

        # transfer data loaders from model
        self.get_dataloaders(ref_model)

        # init training constants
        self.__layout_bookeeping()

        # print model summary
        if self.proc_rank == 0 and self.print_weights_summary:
            ref_model.summarize()

        # link up experiment object
        if self.experiment is not None:
            ref_model.experiment = self.experiment

            # save exp to get started
            if self.proc_rank == 0:
                self.experiment.save()

        # track model now.
        # if cluster resets state, the model will update with the saved weights
        self.model = model

        # restore training and model before hpc call
        self.restore_weights(model)

        # progress bar init
        if self.show_progress_bar:
            self.progress_bar = tqdm.tqdm(0, position=self.process_position)

        # when testing requested only run test and return
        if self.testing:
            self.__run_evaluation(test=True)
            return

        # run tiny validation (if validation defined)
        # to make sure program won't crash during val
        ref_model.on_sanity_check_start()
        if self.val_dataloader is not None and self.nb_sanity_val_steps > 0:
            # reset progress_bar limit for sanity check
            if self.show_progress_bar:
                self.progress_bar.reset(self.nb_sanity_val_steps)

            self.evaluate(model, self.val_dataloader, self.nb_sanity_val_steps,
                          self.testing)

        # ---------------------------
        # CORE TRAINING LOOP
        # ---------------------------
        self.__train()

    def __train(self):
        # run all epochs
        for epoch_nb in range(self.current_epoch, self.max_nb_epochs):
            # get model
            model = self.__get_model()

            # update training progress in trainer and model
            model.current_epoch = epoch_nb
            self.current_epoch = epoch_nb
            self.total_batches = self.nb_tng_batches + self.nb_val_batches
            self.batch_loss_value = 0  # accumulated grads

            # init progress_bar when requested
            if self.show_progress_bar:
                self.progress_bar.reset(self.total_batches)

            # change the gradient accumulation factor according to the accumulation_scheduler
            self.accumulation_scheduler.on_epoch_begin(epoch_nb, self)

            # -----------------
            # RUN TNG EPOCH
            # -----------------
            self.run_tng_epoch()

            # update LR schedulers
            if self.lr_schedulers is not None:
                for lr_scheduler in self.lr_schedulers:
                    lr_scheduler.step()

            # early stopping
            met_min_epochs = epoch_nb > self.min_nb_epochs
            if self.enable_early_stop and met_min_epochs:
                should_stop = self.early_stop_callback.on_epoch_end(
                    epoch=epoch_nb, logs=self.__tng_tqdm_dic)
                # stop training
                stop = should_stop and met_min_epochs
                if stop:
                    return

    def run_tng_epoch(self):
        # before epoch hook
        if self.__is_function_implemented('on_epoch_start'):
            model = self.__get_model()
            model.on_epoch_start()

        # run epoch
        for batch_nb, data_batch in enumerate(self.tng_dataloader):
            self.batch_nb = batch_nb
            self.global_step += 1

            model = self.__get_model()
            model.global_step = self.global_step

            # stop when the flag is changed or we've gone past the amount
            #  requested in the batches
            self.total_batch_nb += 1
            met_batch_limit = batch_nb > self.nb_tng_batches
            if met_batch_limit:
                break

            # ---------------
            # RUN TRAIN STEP
            # ---------------
            batch_result = self.__run_tng_batch(data_batch, batch_nb)
            early_stop_epoch = batch_result == -1

            # ---------------
            # RUN VAL STEP
            # ---------------
            is_val_check_batch = (batch_nb + 1) % self.val_check_batch == 0
            can_check_epoch = (self.current_epoch +
                               1) % self.check_val_every_n_epoch == 0
            if self.fast_dev_run or is_val_check_batch or early_stop_epoch:
                if can_check_epoch:
                    self.__run_evaluation(test=self.testing)

            # when batch should be saved
            if (batch_nb +
                    1) % self.log_save_interval == 0 or early_stop_epoch:
                if self.proc_rank == 0 and self.experiment is not None:
                    self.experiment.save()

            # when metrics should be logged
            if batch_nb % self.add_log_row_interval == 0 or early_stop_epoch:
                # count items in memory
                # nb_params, nb_tensors = count_mem_items()

                model = self.__get_model()
                metrics = self.__tng_tqdm_dic

                # add gpu memory
                if self.on_gpu and self.log_gpu_memory:
                    mem_map = get_gpu_memory_map()
                    metrics.update(mem_map)

                # add norms
                if self.track_grad_norm > 0:
                    model = self.__get_model()
                    grad_norm_dic = model.grad_norm(self.track_grad_norm)
                    metrics.update(grad_norm_dic)

                if self.__is_function_implemented('on_tng_metrics'):
                    model.on_tng_metrics(metrics)

                # log metrics
                scalar_metrics = self.__metrics_to_scalars(
                    metrics, blacklist=self.__log_vals_blacklist())
                if self.proc_rank == 0 and self.experiment is not None:
                    self.experiment.log(scalar_metrics,
                                        global_step=self.global_step)
                    self.experiment.save()

            # end epoch early
            if early_stop_epoch:
                break

        # epoch end hook
        if self.__is_function_implemented('on_epoch_end'):
            model = self.__get_model()
            model.on_epoch_end()

    def test(self, model=None):
        if model is not None:
            self.testing = True
            self.fit(model)
        else:
            self.__run_evaluation(test=True)

    def __metrics_to_scalars(self, metrics, blacklist=set()):
        new_metrics = {}
        for k, v in metrics.items():
            if type(v) is torch.Tensor:
                v = v.item()

            if type(v) is dict:
                v = self.__metrics_to_scalars(v)

            if k not in blacklist:
                new_metrics[k] = float(v)

        return new_metrics

    def __log_vals_blacklist(self):
        """avoid logging some vals lightning uses to maintain state"""
        blacklist = {'batch_nb', 'v_nb', 'gpu'}
        return blacklist

    def transfer_batch_to_gpu(self, batch, gpu_id):
        # base case: object can be directly moved using `cuda` or `to`
        if callable(getattr(batch, 'cuda', None)):
            return batch.cuda(gpu_id)

        elif callable(getattr(batch, 'to', None)):
            return batch.to(torch.device('cuda', gpu_id))

        # when list
        elif isinstance(batch, list):
            for i, x in enumerate(batch):
                batch[i] = self.transfer_batch_to_gpu(x, gpu_id)
            return batch

        # when tuple
        elif isinstance(batch, tuple):
            batch = list(batch)
            for i, x in enumerate(batch):
                batch[i] = self.transfer_batch_to_gpu(x, gpu_id)
            return tuple(batch)

        # when dict
        elif isinstance(batch, dict):
            for k, v in batch.items():
                batch[k] = self.transfer_batch_to_gpu(v, gpu_id)

            return batch

        # nothing matches, return the value as is without transform
        return batch

    def __tng_forward(self, data_batch, batch_nb, opt_idx):
        """
        Handle forward for each training case (distributed, single gpu, etc...)
        :param data_batch:
        :param batch_nb:
        :return:
        """
        # ---------------
        # FORWARD
        # ---------------
        # enable not needing to add opt_idx to training_step
        args = [data_batch, batch_nb]
        if len(self.optimizers) > 1:
            args.append(opt_idx)

        if self.use_ddp:
            output = self.model(*args)
        elif self.use_dp:
            output = self.model(*args)
        elif self.single_gpu:
            gpu_id = 0
            if type(self.data_parallel_device_ids) is list:
                gpu_id = self.data_parallel_device_ids[0]
            data_batch = self.transfer_batch_to_gpu(data_batch, gpu_id)
            args[0] = data_batch
            output = self.model.training_step(*args)

        else:
            output = self.model.training_step(*args)

        # ---------------
        # TQDM metrics
        # ---------------
        try:
            prog_output = output['prog']

            # reduce prog metrics for tqdm when using dp
            if self.use_dp:
                nb_gpus = self.num_gpus
                prog_output = reduce_distributed_output(prog_output, nb_gpus)

            model_specific_tqdm_metrics_dic = prog_output
        except Exception:
            model_specific_tqdm_metrics_dic = {}

        # ---------------
        # EXTRACT LOSS
        # ---------------
        # if output dict doesn't have the keyword loss
        # then assume the output=loss if scalar
        try:
            loss = output['loss']
        except Exception:
            if type(output) is torch.Tensor:
                loss = output

        # when using dp need to reduce the loss
        if self.use_dp:
            loss = reduce_distributed_output(loss, self.num_gpus)

        return loss, model_specific_tqdm_metrics_dic

    def __clip_gradients(self):
        if self.gradient_clip > 0:
            model = self.__get_model()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           self.gradient_clip)

    def __print_nan_grads(self):
        model = self.__get_model()
        for param in model.parameters():
            if torch.isnan(param.grad.float()).any():
                print(param, param.grad)

    def __run_tng_batch(self, data_batch, batch_nb):
        if data_batch is None:
            return 0

        # hook
        if self.__is_function_implemented('on_batch_start'):
            model_ref = self.__get_model()
            response = model_ref.on_batch_start(data_batch)

            if response == -1:
                return -1

        if self.show_progress_bar:
            self.progress_bar.update(1)

        # call training_step once per optimizer
        for opt_idx, optimizer in enumerate(self.optimizers):

            # forward pass
            loss, model_specific_tqdm_metrics = self.__tng_forward(
                data_batch, batch_nb, opt_idx)

            # track metrics
            self.__add_tqdm_metrics(model_specific_tqdm_metrics)

            # accumulate loss
            # (if accumulate_grad_batches = 1 no effect)
            loss = loss / self.accumulate_grad_batches

            # backward pass
            if self.use_amp:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # hook: on_after_backward (runs after the backward pass)
            if self.__is_function_implemented('on_after_backward'):
                model_ref = self.__get_model()
                model_ref.on_after_backward()

            # nan grads
            if self.print_nan_grads:
                self.__print_nan_grads()

            # track total loss for logging (avoid mem leaks)
            self.batch_loss_value += loss.item()

            # gradient update with accumulated gradients
            if (self.batch_nb + 1) % self.accumulate_grad_batches == 0:
                # clip gradients
                self.__clip_gradients()

                # calls .step(), .zero_grad()
                # override function to modify this behavior
                model = self.__get_model()
                model.optimizer_step(self.current_epoch, batch_nb, optimizer,
                                     opt_idx)

                # calculate running loss for display
                self.running_loss.append(self.batch_loss_value)
                self.batch_loss_value = 0
                self.avg_loss = np.mean(self.running_loss[-100:])

                # update progbar
                if self.show_progress_bar:
                    # add model specific metrics
                    tqdm_metrics = self.__tng_tqdm_dic
                    self.progress_bar.set_postfix(**tqdm_metrics)

        # activate batch end hook
        if self.__is_function_implemented('on_batch_end'):
            model = self.__get_model()
            model.on_batch_end()

        return 0

    def __run_evaluation(self, test=False):
        # when testing make sure user defined a test step
        can_run_test_step = False
        if test:
            can_run_test_step = self.__is_overriden(
                'test_step') and self.__is_overriden('test_end')
            if not can_run_test_step:
                m = '''You called .test() without defining a test step or test_end.
                Please define and try again'''
                raise MisconfigurationException(m)

        # validate only if model has validation_step defined
        # test only if test_step or validation_step are defined
        run_val_step = self.__is_overriden('validation_step')

        if run_val_step or can_run_test_step:

            # hook
            model = self.__get_model()
            model.on_pre_performance_check()

            # select dataloaders
            dataloaders = self.val_dataloader
            max_batches = self.nb_val_batches

            # calculate max batches to use
            if test:
                dataloaders = self.test_dataloader
                max_batches = self.nb_test_batches

            # cap max batches to 1 when using fast_dev_run
            if self.fast_dev_run:
                max_batches = 1

            eval_out_metrics = self.evaluate(self.model, dataloaders,
                                             max_batches, test)

            self.__add_tqdm_metrics(eval_out_metrics)

            # hook
            model.on_post_performance_check()

            if self.show_progress_bar:
                # add model specific metrics
                tqdm_metrics = self.__tng_tqdm_dic
                self.progress_bar.set_postfix(**tqdm_metrics)

        # model checkpointing
        if self.proc_rank == 0 and self.checkpoint_callback is not None and not test:
            print('save callback...')
            self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch,
                                                  logs=self.__tng_tqdm_dic)
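The accumulation logic in __run_tng_batch above boils down to scaling the loss by 1 / accumulate_grad_batches and stepping the optimizer only when (batch_nb + 1) % accumulate_grad_batches == 0. The same pattern in plain PyTorch, as a self-contained sketch with a toy model and data:

# Plain-PyTorch sketch of the accumulation pattern used in __run_tng_batch above.
import torch
from torch import nn

model = nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.MSELoss()
loader = [(torch.randn(4, 8), torch.randn(4, 1)) for _ in range(12)]  # toy "dataloader"
accumulate_grad_batches = 4

for batch_nb, (x, y) in enumerate(loader):
    loss = criterion(model(x), y)
    (loss / accumulate_grad_batches).backward()  # scale so the group's gradients average out

    if (batch_nb + 1) % accumulate_grad_batches == 0:
        optimizer.step()       # apply the accumulated gradients
        optimizer.zero_grad()  # reset for the next group
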
Example #10
def test_attach_model_callbacks():
    """Test that the callbacks defined in the model and through Trainer get merged correctly."""

    def _attach_callbacks(trainer_callbacks, model_callbacks):
        model = LightningModule()
        model.configure_callbacks = lambda: model_callbacks
        has_progress_bar = any(isinstance(cb, ProgressBarBase) for cb in trainer_callbacks + model_callbacks)
        trainer = Trainer(
            enable_checkpointing=False,
            enable_progress_bar=has_progress_bar,
            enable_model_summary=False,
            callbacks=trainer_callbacks,
        )
        trainer.model = model
        cb_connector = CallbackConnector(trainer)
        cb_connector._attach_model_callbacks()
        return trainer

    early_stopping = EarlyStopping(monitor="foo")
    progress_bar = TQDMProgressBar()
    lr_monitor = LearningRateMonitor()
    grad_accumulation = GradientAccumulationScheduler({1: 1})

    # no callbacks
    trainer = _attach_callbacks(trainer_callbacks=[], model_callbacks=[])
    assert trainer.callbacks == [trainer.accumulation_scheduler]

    # callbacks of different types
    trainer = _attach_callbacks(trainer_callbacks=[early_stopping], model_callbacks=[progress_bar])
    assert trainer.callbacks == [early_stopping, trainer.accumulation_scheduler, progress_bar]

    # same callback type twice, different instance
    trainer = _attach_callbacks(
        trainer_callbacks=[progress_bar, EarlyStopping(monitor="foo")],
        model_callbacks=[early_stopping],
    )
    assert trainer.callbacks == [progress_bar, trainer.accumulation_scheduler, early_stopping]

    # multiple callbacks of the same type in trainer
    trainer = _attach_callbacks(
        trainer_callbacks=[
            LearningRateMonitor(),
            EarlyStopping(monitor="foo"),
            LearningRateMonitor(),
            EarlyStopping(monitor="foo"),
        ],
        model_callbacks=[early_stopping, lr_monitor],
    )
    assert trainer.callbacks == [trainer.accumulation_scheduler, early_stopping, lr_monitor]

    # multiple callbacks of the same type, in both trainer and model
    trainer = _attach_callbacks(
        trainer_callbacks=[
            LearningRateMonitor(),
            progress_bar,
            EarlyStopping(monitor="foo"),
            LearningRateMonitor(),
            EarlyStopping(monitor="foo"),
        ],
        model_callbacks=[early_stopping, lr_monitor, grad_accumulation, early_stopping],
    )
    assert trainer.callbacks == [progress_bar, early_stopping, lr_monitor, grad_accumulation, early_stopping]
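The assertions above pin down the merge rule: trainer callbacks whose type also appears among the model's callbacks are dropped, the model's callbacks are appended in their own order after the remaining trainer callbacks, and the Trainer's implicitly created ``GradientAccumulationScheduler`` is kept only when the model does not return one of its own. A minimal sketch of the model-side hook this test exercises; the class name and monitored metric are illustrative:

from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

class LitClassifier(LightningModule):
    def configure_callbacks(self):
        # callbacks returned here take priority over same-type callbacks
        # passed to Trainer(callbacks=...), matching the assertions above
        return [EarlyStopping(monitor="val_loss"), LearningRateMonitor()]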
Exemplo n.º 11
0
# logger = TensorBoardLogger("logs", name=classifier_model_name, log_graph=True)
#%%
if __name__ == "__main__":
    if load_pretrained:
        checkpoint_path = os.path.join(classifier_model_dir, checkpoints[-1])
        model = classifier.load_from_checkpoint(checkpoint_path)
    else:
        model = classifier(**model_args)
    pl.seed_everything(42)
    wandb.login(key=os.environ.get('WANDB_API_KEY'))  # do not hard-code API keys in source
    logger = WandbLogger(name=classifier_model_name,
                         save_dir='logs',
                         offline=True)
    logger.watch(model, log='gradients', log_freq=100)
    #logger = TensorBoardLogger("logs", name=classifier_model_name, log_graph=True)
    grad_accumulator = GradientAccumulationScheduler(scheduling={0: 2, 1: 3})
    lr_monitor = LearningRateMonitor(logging_interval='step')
    model_chkpt = ModelCheckpoint(dirpath=classifier_model_dir,
                                  monitor='val_acc_epoch',
                                  filename='{epoch}-{val_acc_epoch:.2f}',
                                  verbose=True)
    early_stopper = EarlyStopping(monitor='val_acc_epoch',
                                  patience=6,
                                  verbose=True)
    trainer = pl.Trainer(logger=logger,
                         # register the accumulation schedule alongside the other callbacks
                         callbacks=[grad_accumulator, lr_monitor, model_chkpt, early_stopper],
                         **trainer_args)
    #%%
    # lr_finder = trainer.tuner.lr_find(model, datamodule)

    # # Results can be found in
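In the script above, ``scheduling={0: 2, 1: 3}`` means: accumulate gradients over 2 batches starting at epoch 0, then over 3 batches from epoch 1 onwards; the scheduler only has an effect once it is registered in the Trainer's ``callbacks`` list. An equivalent way to express the schedule, assuming a Lightning version that accepts a dict for ``accumulate_grad_batches``, is the Trainer argument itself:

import pytorch_lightning as pl

# same schedule as the explicit callback: 2 batches from epoch 0, 3 from epoch 1 on
trainer = pl.Trainer(accumulate_grad_batches={0: 2, 1: 3})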
Exemplo n.º 12
0
    def __init__(self,
                 experiment=None,
                 early_stop_callback=None,
                 checkpoint_callback=None,
                 gradient_clip=0,
                 cluster=None,
                 process_position=0,
                 current_gpu_name=0,
                 nb_gpu_nodes=1,
                 gpus=None,
                 log_gpu_memory=False,
                 show_progress_bar=True,
                 overfit_pct=0.0,
                 track_grad_norm=-1,
                 check_val_every_n_epoch=1,
                 fast_dev_run=False,
                 accumulate_grad_batches=1,
                 max_nb_epochs=1000,
                 min_nb_epochs=1,
                 train_percent_check=1.0,
                 val_percent_check=1.0,
                 test_percent_check=1.0,
                 val_check_interval=1.0,
                 log_save_interval=100,
                 add_log_row_interval=10,
                 distributed_backend='dp',
                 use_amp=False,
                 print_nan_grads=False,
                 print_weights_summary=True,
                 amp_level='O2',
                 nb_sanity_val_steps=5):
        """

        :param experiment: Test-tube experiment
        :param early_stop_callback: an EarlyStopping callback instance
        :param checkpoint_callback: a ModelCheckpoint callback instance
        :param gradient_clip:
        :param cluster:
        :param process_position:
        :param current_gpu_name:
        :param nb_gpu_nodes:
        :param gpus:
        :param log_gpu_memory: Log GPU memory utilization as a metric
            during training. This can lead to lower performance on some
            servers, in particular when `nvidia-smi` is slow.
        :param show_progress_bar:
        :param overfit_pct:
        :param track_grad_norm:
        :param check_val_every_n_epoch:
        :param fast_dev_run:
        :param accumulate_grad_batches:
        :param max_nb_epochs:
        :param min_nb_epochs:
        :param train_percent_check:
        :param val_percent_check:
        :param test_percent_check:
        :param val_check_interval:
        :param log_save_interval:
        :param add_log_row_interval:
        :param distributed_backend:
            'dp' to use DataParallel, 'ddp' to use DistributedDataParallel
        :param use_amp:
        :param print_nan_grads:
        :param print_weights_summary:
        :param amp_level:
        :param nb_sanity_val_steps:
        """
        # Transfer params
        self.nb_gpu_nodes = nb_gpu_nodes
        self.log_gpu_memory = log_gpu_memory
        self.gradient_clip = gradient_clip
        self.check_val_every_n_epoch = check_val_every_n_epoch
        self.enable_early_stop = early_stop_callback is not None
        self.track_grad_norm = track_grad_norm
        self.fast_dev_run = fast_dev_run
        self.on_gpu = gpus is not None and torch.cuda.is_available()
        self.experiment = experiment
        self.exp_save_path = None
        if self.experiment is not None:
            self.exp_save_path = experiment.get_data_path(
                experiment.name, experiment.version)
        self.cluster = cluster
        self.process_position = process_position
        self.current_gpu_name = current_gpu_name
        self.print_weights_summary = print_weights_summary
        self.checkpoint_callback = checkpoint_callback

        if self.checkpoint_callback is not None:
            self.checkpoint_callback.save_function = self.save_checkpoint

        self.early_stop = early_stop_callback
        self.model = None
        self.max_nb_epochs = max_nb_epochs
        if isinstance(accumulate_grad_batches, dict):
            self.accumulation_scheduler = GradientAccumulationScheduler(
                accumulate_grad_batches)
        elif isinstance(accumulate_grad_batches, int):
            schedule = {1: accumulate_grad_batches}
            self.accumulation_scheduler = GradientAccumulationScheduler(
                schedule)
        else:
            raise TypeError(
                "Gradient accumulation supports only int and dict types")
        self.early_stop_callback = early_stop_callback
        self.min_nb_epochs = min_nb_epochs
        self.nb_sanity_val_steps = nb_sanity_val_steps
        self.lr_schedulers = []
        self.amp_level = amp_level
        self.print_nan_grads = print_nan_grads
        self.data_parallel_device_ids = None
        self.world_size = 1
        self.node_rank = 0
        self.use_ddp = False
        self.use_dp = False
        self.single_gpu = False
        self.testing = False

        # training bookkeeping
        self.total_batch_nb = 0
        self.running_loss = []
        self.avg_loss = 0
        self.batch_nb = 0
        self.tqdm_metrics = {}
        self.nb_val_batches = 0
        self.nb_tng_batches = 0
        self.nb_test_batches = 0

        # gpus can come in as a list of ids or as a string.
        # if gpus == '-1' then use all available devices
        # otherwise, split the string on commas
        if gpus is not None:
            if type(gpus) is list:
                self.data_parallel_device_ids = gpus
            elif type(gpus) is str:
                if gpus == '-1':
                    self.data_parallel_device_ids = list(
                        range(0, torch.cuda.device_count()))
                else:
                    self.data_parallel_device_ids = [
                        int(x.strip()) for x in gpus.split(',')
                    ]
            else:
                raise Exception('gpus has to be a string or list of ids')

            # set the correct cuda visible devices (using pci order)
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
                [str(x) for x in self.data_parallel_device_ids])
            print('VISIBLE GPUS: %r' % os.environ["CUDA_VISIBLE_DEVICES"])

        # make DP and DDP mutually exclusive
        # single GPU will also use DP with devices=[0]
        requested_gpus = self.data_parallel_device_ids is not None
        if requested_gpus and len(self.data_parallel_device_ids) > 0:
            self.use_dp = distributed_backend == 'dp'
            self.use_ddp = distributed_backend == 'ddp'

            # use ddp automatically if nb_gpu_nodes > 1
            if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
                self.use_ddp = True
                self.use_dp = False
                w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
                    'Switching to DistributedDataParallel for you. ' \
                    'To silence this warning set distributed_backend=ddp'
                warnings.warn(w)

        # remove dp and ddp when requesting single gpu
        if self.data_parallel_device_ids is not None and len(
                self.data_parallel_device_ids) == 1:
            self.use_ddp = False
            self.use_dp = False
            self.single_gpu = True

        # extract SLURM flag vars
        # whenever we have the correct number of tasks, we let slurm manage processes
        # otherwise we launch the required number of processes
        if self.use_ddp:
            self.nb_requested_gpus = len(
                self.data_parallel_device_ids) * self.nb_gpu_nodes
            self.nb_slurm_tasks = 0
            try:
                self.nb_slurm_tasks = int(os.environ['SLURM_NTASKS'])
                self.is_slurm_managing_tasks = self.nb_slurm_tasks == self.nb_requested_gpus
            except Exception:
                # likely not on slurm, so set the slurm managed flag to false
                self.is_slurm_managing_tasks = False

        # process info
        self.proc_rank = 0

        # training state
        self.optimizers = None
        self.global_step = 0
        self.current_epoch = 0
        self.total_batches = 0

        # can't init progress bar here because starting a new process
        # means the prog_bar won't survive pickling
        self.show_progress_bar = show_progress_bar

        # logging
        self.log_save_interval = log_save_interval
        self.val_check_interval = val_check_interval
        self.add_log_row_interval = add_log_row_interval

        # dataloaders
        self.tng_dataloader = None
        self.test_dataloader = None
        self.val_dataloader = None

        # how much of the data to use
        self.__determine_data_use_amount(train_percent_check,
                                         val_percent_check, test_percent_check,
                                         overfit_pct)
        print('gpu available: {}, used: {}'.format(torch.cuda.is_available(),
                                                   self.on_gpu))

        # 16 bit mixed precision training using apex
        self.use_amp = use_amp and APEX_AVAILABLE
        if self.use_amp:
            print('using 16bit precision')

        if use_amp and not APEX_AVAILABLE:  # pragma: no cover
            msg = """
            You set use_amp=True but do not have apex installed.
            Install apex first using this guide and rerun with use_amp=True:
            https://github.com/NVIDIA/apex#linux
            """
            raise ModuleNotFoundError(msg)
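For reference, the ``__init__`` above normalizes ``accumulate_grad_batches``: an int becomes the single-entry schedule ``{1: n}`` (epochs are 1-indexed in this old API), while a dict is taken as the schedule directly. A minimal sketch of both call styles, assuming the enclosing class is the Trainer shown here and leaving all other arguments at their defaults:

# constant accumulation: step the optimizer every 4 batches for the whole run
trainer = Trainer(accumulate_grad_batches=4)

# scheduled accumulation: 2 batches from epoch 1, then 8 batches from epoch 5 onwards
trainer = Trainer(accumulate_grad_batches={1: 2, 5: 8})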

# assumed parametrization: illustrative schedules whose accumulation factor is not a positive int
@pytest.mark.parametrize("scheduling", [{1: 0.5}, {1: "2"}, {1: None}])
def test_invalid_values_for_grad_accum_scheduler(scheduling):
    with pytest.raises(MisconfigurationException,
                       match="Accumulation factor should be an int"):
        _ = GradientAccumulationScheduler(scheduling=scheduling)
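The ``match`` above targets the scheduler's constructor validation: epoch keys must be non-negative ints and accumulation factors positive ints. A minimal sketch of a schedule that passes this validation next to ones that should not, assuming a 1.x-style ``GradientAccumulationScheduler``:

from pytorch_lightning.callbacks import GradientAccumulationScheduler

# valid: non-negative int epochs mapped to positive int accumulation factors
GradientAccumulationScheduler(scheduling={0: 1, 4: 2, 8: 4})

# invalid: a non-int or non-positive factor should raise MisconfigurationException
# GradientAccumulationScheduler(scheduling={1: 0.5})
# GradientAccumulationScheduler(scheduling={1: 0})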