def to_master_grads(model_param_groups,
                    master_param_groups,
                    flat_master: bool = False) -> None:
    for (model_param_group, master_param_group) in zip(model_param_groups,
                                                       master_param_groups):
        fp16.model_grads_to_master_grads(model_param_group,
                                         master_param_group,
                                         flat_master=flat_master)
Example #2
def to_master_grads(model_pgs, master_pgs, flat_master: bool = False) -> None:
    '''
    Copies all the gradients to the fp32 master parameters of the model, so that
    the optimizer step can be performed in fp32.
    '''
    for (model_params, master_params) in zip(model_pgs, master_pgs):
        fp16.model_grads_to_master_grads(model_params['params'],
                                         master_params['params'],
                                         flat_master=flat_master)
Example #3
def to_master_grads(model_param_groups: List[List[Tensor]],
                    master_param_groups: List[List[Tensor]]):
    """
    Copy gradients from the float16 model parameters to the float32 master parameters.
    :param model_param_groups: parameter groups of the fp16 model
    :param master_param_groups: matching fp32 master parameter groups
    """
    for model_group, master_group in zip(model_param_groups,
                                         master_param_groups):
        model_grads_to_master_grads(model_params=model_group,
                                    master_params=master_group)
Example #4
    def step(self, closure=None, s=None, find_median=False):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                                          and returns the loss.
        """
        # print(f'[rank: {torch.distributed.get_rank()}] optimizer.py line 339')
        # Update the gradient every `update_interval` steps.
        if self.batch_counter % self.update_interval != self.update_interval - 1:
            self.batch_counter += 1
            return None

        log_timing = self.verbose_freq > 0 and self.batch_counter % self.verbose_freq == 0
        if log_timing:
            start_time = time.time()
        if self.model_parameters is not None:
            import apex.fp16_utils as fp16_utils
            fp16_utils.model_grads_to_master_grads(self.model_parameters,
                                                   self.master_parameters)
            # TODO: This division might not be in the right place, given that
            # scaling happens right after. Look into this if problems arise.
            if self.loss_scale != 1.0:
                for parameter in self.master_parameters:
                    parameter.grad.data = parameter.grad.data / self.loss_scale

        for p in self.param_groups[0]['params']:
            if p.grad is not None:
                p.grad.div_(self.update_interval)

        loss = self.base_optimizer.step()

        if self.model_parameters is not None:
            import apex.fp16_utils as fp16_utils
            fp16_utils.master_params_to_model_params(self.model_parameters,
                                                     self.master_parameters)
        self.latest_version = self.latest_version.incr()
        # if self.num_versions > 1:
        self.buffered_state_dicts = self.queue[0][0]
        self.queue.append(self.get_params(clone=False))

        if log_timing:
            print("Optimizer step took: %.3f" % (time.time() - start_time))
        self.batch_counter += 1
        return loss
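Both step() variants above fold gradient accumulation into the optimizer: gradients are accumulated for update_interval micro-batches and averaged before the actual update. Stripped of the fp16 and weight-versioning machinery, the pattern looks roughly like this (an illustrative sketch, not code from the repository):

def accumulation_step(optimizer, update_interval, batch_counter):
    """Apply an optimizer update only on every update_interval-th call."""
    if batch_counter % update_interval != update_interval - 1:
        return False                           # keep accumulating gradients
    for group in optimizer.param_groups:
        for p in group['params']:
            if p.grad is not None:
                p.grad.div_(update_interval)   # average the accumulated gradients
    optimizer.step()
    optimizer.zero_grad()
    return True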
Example #5
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                                          and returns the loss.
        """
        # Update the gradient every `update_interval` steps.
        if self.batch_counter % self.update_interval != self.update_interval - 1:
            self.batch_counter += 1
            return None

        log_timing = self.verbose_freq > 0 and self.batch_counter % self.verbose_freq == 0
        if log_timing:
            start_time = time.time()
        if self.model_parameters is not None:
            import apex.fp16_utils as fp16_utils
            fp16_utils.model_grads_to_master_grads(self.model_parameters,
                                                   self.master_parameters)
            # TODO: This division might not be in the right place, given that
            # scaling happens right after. Look into this if problems arise.
            if self.loss_scale != 1.0:
                for parameter in self.master_parameters:
                    parameter.grad.data = parameter.grad.data / self.loss_scale

        # for p in self.param_groups[0]['params']:
        #     if p.grad is not None:
        #         p.grad.div_(self.update_interval)

        # loss = self.base_optimizer.step()

        self.base_optimizer.average_grad(self.update_interval)

        self.base_optimizer.step()

        if self.model_parameters is not None:
            import apex.fp16_utils as fp16_utils
            fp16_utils.master_params_to_model_params(self.model_parameters,
                                                     self.master_parameters)
        self.latest_version = self.latest_version.incr()
        if self.num_versions > 1:
            self.buffered_state_dicts = self.queue[0][0]
            self.queue.append(self.get_params(clone=False))

        if log_timing:
            print("Optimizer step took: %.3f" % (time.time() - start_time))
        self.batch_counter += 1
Example #6
def to_master_grads(model_pgs, master_pgs, flat_master: bool = False) -> None:
    for (model_params, master_params) in zip(model_pgs, master_pgs):
        fp16.model_grads_to_master_grads(model_params,
                                         master_params,
                                         flat_master=flat_master)
Example #7
def benchmark_training(model, opts):
    """Benchmarks training phase.

    :param obj model: A model to benchmark
    :param dict opts: A dictionary of parameters.
    :rtype: tuple
    :return: A tuple of (model_name, list of batch times)
    """
    def _reduce_tensor(tensor):
        reduced = tensor.clone()
        dist.all_reduce(reduced, op=dist.reduce_op.SUM)
        reduced /= opts['world_size']
        return reduced

    if opts['phase'] != 'training':
        raise "Phase in benchmark_training func is '%s'" % opts['phase']

    opts['distributed'] = opts['world_size'] > 1
    opts['with_cuda'] = opts['device'] == 'gpu'
    opts['fp16'] = opts['dtype'] == 'float16'
    opts['loss_scale'] = 1

    if opts['fp16'] and not opts['with_cuda']:
        raise ValueError(
            "Configuration error: FP16 can only be used with GPUs")

    if opts['with_cuda']:
        torch.cuda.set_device(opts['local_rank'])
        cudnn.benchmark = opts['cudnn_benchmark']
        cudnn.fastest = opts['cudnn_fastest']

    if opts['distributed']:
        dist.init_process_group(backend=opts['dist_backend'],
                                init_method='env://')

    if opts['with_cuda']:
        model = model.cuda()
        if opts['dtype'] == 'float16':
            model = network_to_half(model)

    if opts['distributed']:
        model = DDP(model, shared_param=True)

    if opts['fp16']:
        model_params, master_params = prep_param_lists(model)
    else:
        master_params = list(model.parameters())

    criterion = nn.CrossEntropyLoss()
    if opts['with_cuda']:
        criterion = criterion.cuda()
    optimizer = optim.SGD(master_params,
                          lr=0.01,
                          momentum=0.9,
                          weight_decay=1e-4)

    data_loader = DatasetFactory.get_data_loader(opts, opts['__input_shape'],
                                                 opts['__num_classes'])

    is_warmup = opts['num_warmup_batches'] > 0
    done = opts['num_warmup_batches'] == 0
    num_iterations_done = 0
    model.train()
    batch_times = np.zeros(opts['num_batches'])
    end_time = timeit.default_timer()
    while not done:
        prefetcher = DataPrefetcher(data_loader, opts)
        batch_data, batch_labels = prefetcher.next()
        while batch_data is not None:
            data_var = torch.autograd.Variable(batch_data)
            labels_var = torch.autograd.Variable(batch_labels)

            output = model(data_var)

            loss = criterion(output, labels_var)
            loss = loss * opts['loss_scale']
            # I'll need this for reporting
            #reduced_loss = _reduce_tensor(loss.data) if opts['distributed'] else loss.data

            if opts['fp16']:
                model.zero_grad()
                loss.backward()
                model_grads_to_master_grads(model_params, master_params)
                if opts['loss_scale'] != 1:
                    for param in master_params:
                        param.grad.data = param.grad.data / opts['loss_scale']
                optimizer.step()
                master_params_to_model_params(model_params, master_params)
            else:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if opts['with_cuda']:
                torch.cuda.synchronize()

            # Track progress
            num_iterations_done += 1
            cur_time = timeit.default_timer()

            batch_data, batch_labels = prefetcher.next()

            if is_warmup:
                if num_iterations_done >= opts['num_warmup_batches']:
                    is_warmup = False
                    num_iterations_done = 0
            else:
                if opts['num_batches'] != 0:
                    batch_times[num_iterations_done - 1] = cur_time - end_time
                if num_iterations_done >= opts['num_batches']:
                    done = True
                    break
            end_time = cur_time

    return (opts['__name'], batch_times)
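The benchmark is driven entirely by the opts dictionary; the keys below are read off the function body above, so treat this single-GPU invocation as an illustration rather than the harness's documented interface:

import torchvision.models as models

model = models.resnet50()          # any nn.Module; resnet50 is just an example
opts = {
    'phase': 'training', 'world_size': 1, 'local_rank': 0,
    'device': 'gpu', 'dtype': 'float16',
    'cudnn_benchmark': True, 'cudnn_fastest': False, 'dist_backend': 'nccl',
    'num_warmup_batches': 10, 'num_batches': 100,
    '__name': 'resnet50', '__input_shape': (3, 224, 224), '__num_classes': 1000,
}
model_name, batch_times = benchmark_training(model, opts)
print(model_name, batch_times.mean())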
Example #8
z = model(x)
loss = F.cross_entropy(z, torch.randint(0, 2, (20, )).cuda())
loss.backward()

to_master_grads(model_p, master_p)


def check_grads(m1, m2):
    for p1, p2 in zip(m1, m2):
        if p1.grad is None: assert p2.grad is None
        else: assert torch.allclose(p1.grad.data, p2.grad.data)


check_grads(model_p, master_p)

fp16.model_grads_to_master_grads(model_p, master_p)

check_grads(model_p, master_p)

# ### Copy the master params to the model params

# After the step, we need to copy back the master parameters to the model parameters for the next update.

from torch._utils import _unflatten_dense_tensors


def to_model_params(model_params,
                    master_params,
                    flat_master: bool = False) -> None:
    if flat_master:
        for model, master in zip(model_params,
                                 _unflatten_dense_tensors(master_params[0].data, model_params)):
            model.data.copy_(master)
    else:
        for model, master in zip(model_params, master_params):
            model.data.copy_(master.data)
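# With to_master_grads and to_model_params in place, a full fp16 update chains
# them around the optimizer step. Minimal sketch; the SGD optimizer over
# master_p is an assumption here, not part of the notebook above.

opt = torch.optim.SGD(master_p, lr=1e-2)

model.zero_grad()
loss = F.cross_entropy(model(x), torch.randint(0, 2, (20, )).cuda())
loss.backward()                      # gradients land on the fp16 model params
to_master_grads(model_p, master_p)   # copy them onto the fp32 master params
opt.step()                           # weight update happens in fp32
to_model_params(model_p, master_p)   # copy updated weights back to the fp16 model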
Example #9
    def train(self, epoch):
        if self.args.model in ['DCRSR', 'EDSR']:
            self.scheduler.step()
            self.loss.step()
        #epoch = self.optimizer.get_last_epoch() + 1
        #lr = self.optimizer.get_lr()

        self.ckp.write_log('[Epoch {}]\tLearning rate: {:.2e}'.format(
            epoch, Decimal(self.args.lr)))
        self.loss.start_log()
        self.model.train()
        self.ckp.add_log(torch.zeros(1, len(self.scale)), True)
        timer_data, timer_model = utility.timer(), utility.timer()
        idx_scale = 0
        for batch in range(self.loader_train.train_batches):
            #print(batch)
            lr, hr = self.loader_train._getitem_and_make_batch(
                batch, self.args.batch_size)
            lr, hr = self.prepare(lr, hr)
            timer_data.hold()
            timer_model.tic()
            sr = self.model(lr)
            #            print(sr.shape)
            #            print(hr.shape)
            loss = self.loss(sr.float(), hr.float())

            self.model.zero_grad()
            loss.backward()
            if self.args.gclip > 0:
                utils.clip_grad_value_(self.model.model_params,
                                       self.args.gclip)
            if self.args.precision == 'half':
                # Now we move the calculated gradients to the master params
                # so that we can apply the gradient update in FP32.
                fp16.model_grads_to_master_grads(self.model.model_params,
                                                 self.model.master_params)
                if self.loss.loss[0]['weight'] > 1:
                    # If we scaled our losses, now is a good time to scale them
                    # back, since our gradients are in FP32.
                    for params in self.model.master_params:
                        if params.grad is not None:
                            params.grad.data.mul_(1. /
                                                  self.loss.loss[0]['weight'])
                # Apply weight update in FP32.
                self.optimizer.step()
                # Copy the updated weights back to the FP16 model weights.
                fp16.master_params_to_model_params(self.model.model_params,
                                                   self.model.master_params)
            else:
                self.optimizer.step()

            timer_model.hold()

            self.ckp.log2[-1, idx_scale] += utility.calc_psnr(
                sr.float(), hr.float())

            if (batch + 1) % self.args.print_every == 0:
                self.ckp.write_log('[{}/{}]\t{}\t{:.1f}+{:.1f}s'.format(
                    (batch + 1) * self.args.batch_size,
                    self.loader_train.train_batches * self.args.batch_size,
                    self.loss.display_loss(batch), timer_model.release(),
                    timer_data.release()))

            timer_data.tic()
        for idx_scale, scale in enumerate(self.scale):
            self.ckp.log2[-1, idx_scale] /= (batch + 1)
            best = self.ckp.log2.max(0)
            self.ckp.write_log(
                '[x{}]\tPSNR: {:.3f} (Best: {:.3f} @epoch {})'.format(
                    scale, self.ckp.log2[-1, idx_scale], best[0][idx_scale],
                    best[1][idx_scale] + 1))
        self.loss.end_log(self.loader_train.train_batches, 'train')
        self.error_last = self.loss.log[-1, -1]