Example #1
    def tpu_train(self, tpu_core_idx, model):
        # put model on tpu
        model.to(xm.xla_device())

        # get the appropriate tpu ranks
        self.tpu_local_core_rank = xm.get_local_ordinal()
        self.tpu_global_core_rank = xm.get_ordinal()

        # avoid duplicating progress bar
        self.progress_bar_refresh_rate = self.progress_bar_refresh_rate if self.tpu_global_core_rank == 0 else 0

        # track current tpu
        self.current_tpu_idx = tpu_core_idx
        self.proc_rank = self.tpu_local_core_rank
        set_proc_rank(self.proc_rank)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

        # init 16 bit for TPU
        if self.precision == 16:
            os.environ['XLA_USE_BF16'] = str(1)

        log.info(f'INIT TPU local core: {self.tpu_local_core_rank},'
                 f' global rank: {self.tpu_global_core_rank}')

        # continue training routine
        self.run_pretrain_routine(model)

        self.save_spawn_weights(model)
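
A per-core entry point like tpu_train above is normally launched once per TPU core via torch_xla's multiprocessing helper, which passes the core index as the first positional argument. A minimal standalone sketch of that launch pattern, assuming torch_xla is installed; the _train_fn body and the nprocs value are illustrative and not taken from the snippet above:

import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp


def _train_fn(core_idx, model):
    # stand-in for a call like trainer.tpu_train(core_idx, model)
    device = xm.xla_device()
    print(f'local ordinal {xm.get_local_ordinal()}, '
          f'global ordinal {xm.get_ordinal()}, device {device}')


if __name__ == '__main__':
    # spawn one process per TPU core; each receives its core index first
    xmp.spawn(_train_fn, args=(None,), nprocs=8)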

Example #2
    def ddp_train(self, process_idx, model):
        """
        Entry point into a DP thread
        :param gpu_idx:
        :param model:
        :param cluster_obj:
        :return:
        """
        # derive the node rank from the SLURM node id when available,
        # otherwise default to node rank 0
        try:
            node_id = os.environ['SLURM_NODEID']
            self.node_rank = int(node_id)
        except Exception:
            self.node_rank = 0

        # show the progress bar only on global rank 0
        self.progress_bar_refresh_rate = (self.progress_bar_refresh_rate
                                          if self.node_rank == 0
                                          and process_idx == 0 else 0)

        # determine which process we are and world size
        if self.use_ddp:
            self.proc_rank = self.node_rank * self.num_processes + process_idx
            self.world_size = self.num_nodes * self.num_processes

        elif self.use_ddp2:
            self.proc_rank = self.node_rank
            self.world_size = self.num_nodes
        # set warning rank
        set_proc_rank(self.proc_rank)

        # let the experiment logger know the rank to avoid overwriting logs
        if self.logger is not None:
            self.logger.rank = self.proc_rank

        # set up the server using proc 0's ip address;
        # init is attempted at most 20 times in case ports are taken
        model.trainer = self
        model.init_ddp_connection(self.proc_rank, self.world_size)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(
            model)

        # MODEL
        # copy model to each gpu
        if self.on_gpu:
            self.root_gpu = self.data_parallel_device_ids[process_idx]
            torch.cuda.set_device(self.root_gpu)
            model.cuda(self.root_gpu)

        # set model properties before going into wrapper
        self.copy_trainer_model_properties(model)

        # AMP
        # run the model through the apex amp wrapper before wrapping it in DDP
        if self.use_amp:
            # wrap the model and optimizers for mixed precision at the configured amp level
            model, optimizers = model.configure_apex(amp, model,
                                                     self.optimizers,
                                                     self.amp_level)
            self.optimizers = optimizers

        # ddp pins each process to a single GPU; ddp2 uses all GPUs on the node
        if self.distributed_backend == 'ddp':
            device_ids = [self.root_gpu]
        elif self.use_ddp2:
            device_ids = self.data_parallel_device_ids
        else:  # includes ddp_cpu
            device_ids = None

        # allow user to configure ddp
        model = model.configure_ddp(model, device_ids)

        # continue training routine
        self.run_pretrain_routine(model)

        # when ddp ends, we save the model
        self.save_spawn_weights(model)
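
The model.init_ddp_connection(...) call above is where the actual process group is created. It boils down to the standard torch.distributed setup; a simplified sketch under that assumption (the address and port defaults here are placeholders, not the library's actual values):

import os

import torch.distributed as dist


def init_ddp_connection_sketch(proc_rank, world_size):
    # rendezvous address/port exposed by process 0 (placeholder defaults)
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '12910')
    # every process joins the group with its own global rank
    dist.init_process_group('nccl', rank=proc_rank, world_size=world_size)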

Example #3
    def ddp_train(self, process_idx, model):
        """
        Entry point into a DP thread
        :param gpu_idx:
        :param model:
        :param cluster_obj:
        :return:
        """
        # derive the node rank from SLURM_NODEID when running under SLURM,
        # otherwise from NODE_RANK; default to node rank 0 if neither is set
        try:
            node_id = (os.environ['SLURM_NODEID']
                       if self.is_slurm_managing_tasks
                       else os.environ['NODE_RANK'])
            self.node_rank = int(node_id)
        except KeyError:
            log.warning(
                "SLURM_NODEID or NODE_RANK environment variable is not defined. Set as 0."
            )
            self.node_rank = 0

        # show the progress bar only on global rank 0
        if (self.node_rank != 0 or
                process_idx != 0) and self.progress_bar_callback is not None:
            self.progress_bar_callback.disable()

        # determine which process we are and world size
        if self.use_ddp:
            self.proc_rank = self.node_rank * self.num_processes + process_idx
            self.world_size = self.num_nodes * self.num_processes

        elif self.use_ddp2:
            self.proc_rank = self.node_rank
            self.world_size = self.num_nodes
        # set warning rank
        set_proc_rank(self.proc_rank)

        # let the experiment logger know the rank to avoid overwriting logs
        if self.logger is not None:
            self.logger.rank = self.proc_rank

        # set up the server using proc 0's ip address;
        # init is attempted at most 20 times in case ports are taken
        model.trainer = self
        model.init_ddp_connection(self.proc_rank, self.world_size,
                                  self.is_slurm_managing_tasks)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(
            model)

        # MODEL
        # copy model to each gpu
        if self.on_gpu:
            self.root_gpu = process_idx
            torch.cuda.set_device(self.root_gpu)
            model.cuda(self.root_gpu)

        # set model properties before going into wrapper
        self.copy_trainer_model_properties(model)

        # AMP
        # run the model through the apex amp wrapper before wrapping it in DDP
        # TODO: remove in v0.8.0
        if self.use_amp and not self.use_native_amp:
            model, optimizers = model.configure_apex(amp, model,
                                                     self.optimizers,
                                                     self.amp_level)
            self.optimizers = optimizers

        # ddp pins each process to a single GPU; ddp2 uses all GPUs on the node
        if self.distributed_backend == 'ddp':
            device_ids = [self.root_gpu]
        elif self.use_ddp2:
            device_ids = self.data_parallel_device_ids
        else:  # includes ddp_cpu
            device_ids = None

        # allow user to configure ddp
        model = model.configure_ddp(model, device_ids)

        # continue training routine
        self.run_pretrain_routine(model)

        # when ddp ends, we save the model
        self.save_spawn_weights(model)
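
To make the rank bookkeeping in these ddp examples concrete, here is a small worked example of the arithmetic; the node and process counts are made up for illustration:

# 2 nodes, 4 processes per node under plain ddp
num_nodes, num_processes = 2, 4
world_size = num_nodes * num_processes  # 8 processes overall
for node_rank in range(num_nodes):
    for process_idx in range(num_processes):
        proc_rank = node_rank * num_processes + process_idx
        print(f'node {node_rank}, process {process_idx} '
              f'-> global rank {proc_rank} of {world_size}')
# ddp2 runs a single process per node instead, so proc_rank == node_rank
# and world_size == num_nodes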

Example #4
    def horovod_train(self, model):
        # Horovod: initialize library
        hvd.init()

        if torch.cuda.is_available() and self.on_gpu:
            # Horovod: pin GPU to local rank
            torch.cuda.set_device(hvd.local_rank())
            model.cuda(hvd.local_rank())

        # Only show progress bar from the first worker
        self.progress_bar_refresh_rate = (
            self.progress_bar_refresh_rate if hvd.rank() == 0 else 0)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(
            model)

        # Horovod: scale the learning rate by the number of workers to account for
        # increased total batch size
        for optimizer in self.optimizers:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= hvd.size()

        if self.use_amp:
            # wrap the model and optimizers for mixed precision at the configured amp level
            model, optimizers = model.configure_apex(amp, model,
                                                     self.optimizers,
                                                     self.amp_level)
            self.optimizers = optimizers

        # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        for optimizer in self.optimizers:
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        def filter_named_parameters(model, optimizer):
            opt_params = set([
                p for group in optimizer.param_groups
                for p in group.get('params', [])
            ])
            return [(name, p) for name, p in model.named_parameters()
                    if p in opt_params]

        # Horovod: wrap optimizers to perform gradient aggregation via allreduce
        self.optimizers = [
            hvd.DistributedOptimizer(optimizer,
                                     named_parameters=filter_named_parameters(
                                         model, optimizer))
            for optimizer in self.optimizers
        ]

        # Update logger rank info from Horovod to avoid race conditions from different ranks
        # creating directories / writing files in the same locations.
        self.proc_rank = hvd.rank()
        set_proc_rank(self.proc_rank)
        if self.logger:
            self.logger.rank = self.proc_rank
        if model.logger:
            model.logger.rank = self.proc_rank

        with ExitStack() as stack:
            for optimizer in self.optimizers:
                # Synchronization will be performed explicitly following backward()
                stack.enter_context(optimizer.skip_synchronize())

            self.run_pretrain_routine(model)
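
The Horovod calls used above follow the standard Horovod-with-PyTorch pattern. A minimal standalone sketch of that pattern with a toy model, launched with something like horovodrun -np 4 python train.py; the model and learning rate are placeholders:

import torch
import horovod.torch as hvd

hvd.init()
if torch.cuda.is_available():
    # pin each worker to its local GPU
    torch.cuda.set_device(hvd.local_rank())

model = torch.nn.Linear(10, 1)
# scale the learning rate by the number of workers, as done above
optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * hvd.size())

# start all workers from identical weights and optimizer state
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# average gradients across workers with allreduce on each step
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters())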