Example #1
    def ddp_train(self, gpu_nb, model):
        """
        Entry point into a DP thread
        :param gpu_nb:
        :param model:
        :param cluster_obj:
        :return:
        """
        # node rank using relative slurm id
        # otherwise default to node rank 0
        try:
            node_id = os.environ['SLURM_NODEID']
            self.node_rank = int(node_id)
        except Exception:
            self.node_rank = 0

        # determine which process we are and the world size
        self.proc_rank = self.node_rank * len(self.data_parallel_device_ids) + gpu_nb
        self.world_size = self.nb_gpu_nodes * len(self.data_parallel_device_ids)

        # recover the original experiment before it went into the subprocess;
        # init it in write mode only on proc 0 (proc_rank must be set first)
        self.experiment.debug = self.proc_rank > 0
        self.experiment = self.experiment.get_non_ddp_exp()

        # show the progress bar only on node 0, GPU 0
        self.prog_bar = self.prog_bar and self.node_rank == 0 and gpu_nb == 0

        # let the exp know the rank to avoid overwriting logs
        self.experiment.rank = self.proc_rank

        # set up the TCP server using proc 0's IP address;
        # retry up to 20 times in case ports are taken
        self.__init_tcp_connection()

        # CHOOSE OPTIMIZER
        # ask the model for the optimizers it wants to use
        self.optimizers = model.configure_optimizers()

        # MODEL
        # copy model to each gpu
        torch.cuda.set_device(gpu_nb)
        model.cuda(gpu_nb)

        # AMP
        # run the model through the amp wrapper before wrapping in DDP
        if self.use_amp:
            model, optimizers = amp.initialize(
                model, self.optimizers, opt_level=self.amp_level,
            )
            self.optimizers = optimizers

        model = LightningDistributedDataParallel(
            model,
            device_ids=[gpu_nb],
            find_unused_parameters=True
        )

        # continue training routine
        self.__run_pretrain_routine(model)
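
The method above is only the per-process entry point; the snippet does not show how it gets launched. Below is a minimal sketch of how such an entry point is typically spawned, one process per GPU, with torch.multiprocessing. The launch_ddp helper and the nb_gpus argument are illustrative assumptions, not part of the example above.

    # Minimal launch sketch (assumption: `trainer` is an object exposing
    # ddp_train(gpu_nb, model) as shown above; `launch_ddp` and `nb_gpus`
    # are hypothetical names used only for this illustration).
    import torch.multiprocessing as mp

    def launch_ddp(trainer, model, nb_gpus):
        # mp.spawn starts nprocs processes and calls the target as
        # fn(process_index, *args), so each process receives its GPU
        # index as gpu_nb, followed by the model.
        mp.spawn(trainer.ddp_train, nprocs=nb_gpus, args=(model,))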
Example #2
    def configure_ddp(self, model, device_ids):
        """
        Override to init DDP in a different way or use your own wrapper.
        Must return model.
        :param model:
        :param device_ids:
        :return: DDP wrapped model
        """
        model = LightningDistributedDataParallel(
            model,
            device_ids=device_ids,
            find_unused_parameters=True
        )
        return model
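
Because configure_ddp is an override hook, a subclass can swap in its own wrapping logic, for example to disable the unused-parameter search when every parameter is guaranteed to receive a gradient. The sketch below is a hypothetical subclass, not library code; the import paths match older pytorch-lightning layouts and may differ between versions.

    # Hypothetical override of the hook above; `MyTrainer` is an
    # illustrative name, and the import paths are assumptions.
    from pytorch_lightning import Trainer
    from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel

    class MyTrainer(Trainer):
        def configure_ddp(self, model, device_ids):
            # keep the Lightning wrapper, but skip the unused-parameter
            # search, which adds per-step overhead when it is not needed
            model = LightningDistributedDataParallel(
                model,
                device_ids=device_ids,
                find_unused_parameters=False,
            )
            return model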
Example #3
    def ddp_train(self, gpu_nb, model):
        """
        Entry point into a DP thread
        :param gpu_nb:
        :param model:
        :param cluster_obj:
        :return:
        """
        # node rank using relative slurm id
        # otherwise default to node rank 0
        try:
            node_id = os.environ['SLURM_NODEID']
            self.node_rank = int(node_id)
        except Exception:
            self.node_rank = 0

        # show the progress bar only on node 0, GPU 0
        self.show_progress_bar = self.show_progress_bar and self.node_rank == 0 and gpu_nb == 0

        # determine which process we are and world size
        if self.use_ddp:
            self.proc_rank = self.node_rank * self.num_gpus + gpu_nb
            self.world_size = self.nb_gpu_nodes * self.num_gpus

        elif self.use_ddp2:
            self.proc_rank = self.node_rank
            self.world_size = self.nb_gpu_nodes

        # let the logger know the rank to avoid overwriting logs
        if self.logger is not None:
            self.logger.rank = self.proc_rank

        # set up the TCP server using proc 0's IP address;
        # retry up to 20 times in case ports are taken
        self.__init_tcp_connection()

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

        # MODEL
        # copy model to each gpu
        if self.distributed_backend == 'ddp':
            torch.cuda.set_device(gpu_nb)
        model.cuda(gpu_nb)

        # set model properties before going into wrapper
        self.copy_trainer_model_properties(model)

        # override root GPU
        self.root_gpu = gpu_nb

        # AMP
        # run the model through the amp wrapper before wrapping in DDP
        if self.use_amp:
            model, optimizers = amp.initialize(
                model, self.optimizers, opt_level=self.amp_level,
            )
            self.optimizers = optimizers

        # ddp uses a single GPU per process; ddp2 uses all GPUs on the node
        if self.distributed_backend == 'ddp':
            device_ids = [gpu_nb]
        elif self.use_ddp2:
            device_ids = None

        model = LightningDistributedDataParallel(
            model,
            device_ids=device_ids,
            find_unused_parameters=True
        )

        # continue training routine
        self.run_pretrain_routine(model)
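
The rank arithmetic is the main difference between the two backends above: ddp runs one process per GPU, while ddp2 runs one process per node that drives all of that node's GPUs. Below is a small standalone sketch of the same computation with illustrative numbers (2 nodes with 4 GPUs each); the ranks helper is a hypothetical name used only here.

    # Standalone sketch of the rank / world-size arithmetic used above.
    def ranks(node_rank, gpu_nb, num_gpus, nb_gpu_nodes, use_ddp2=False):
        if use_ddp2:
            # ddp2: one process per node, so the node rank is the process rank
            return node_rank, nb_gpu_nodes
        # ddp: one process per GPU
        proc_rank = node_rank * num_gpus + gpu_nb
        world_size = nb_gpu_nodes * num_gpus
        return proc_rank, world_size

    # 2 nodes x 4 GPUs, looking at node 1, GPU 2
    assert ranks(node_rank=1, gpu_nb=2, num_gpus=4, nb_gpu_nodes=2) == (6, 8)
    assert ranks(node_rank=1, gpu_nb=2, num_gpus=4, nb_gpu_nodes=2, use_ddp2=True) == (1, 2)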