def _async_train_step(self, rank, device_id, criterion):
        self.model.train()

        # zero grads even if net_input is None, since we will all-reduce them
        self.optimizer.zero_grad()

        # calculate loss and grads
        loss = 0
        if self._sample is not None:
            net_output = self.model(**self._sample['net_input'])
            loss_ = criterion(net_output, self._sample)
            loss_.backward()
            loss = loss_.data[0]

        # flatten grads into a contiguous block of memory
        if self.flat_grads is None:
            self.flat_grads = self._flatten_grads_(self.model)

        # all-reduce grads
        nccl.all_reduce(self.flat_grads)

        # clip grads
        grad_norm = self._clip_grads_(self.flat_grads, self.args.clip_norm)

        # take an optimization step
        self.optimizer.step()

        return loss, grad_norm
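
    # A minimal sketch (an assumption; the real helpers are not shown in this
    # snippet) of the two methods used above. _flatten_grads_ copies every
    # parameter gradient into one contiguous buffer and re-points each .grad at
    # a view of that buffer, so a single nccl.all_reduce covers the whole model;
    # _clip_grads_ rescales the flat buffer in place when its L2 norm exceeds
    # clip_norm. The pattern assumes zero_grad() zeroes gradients in place
    # rather than setting them to None, so the views stay attached.
    def _flatten_grads_(self, model):
        num_params = sum(p.data.numel() for p in model.parameters())
        flat_grads = next(model.parameters()).data.new(num_params).zero_()
        offset = 0
        for p in model.parameters():
            numel = p.data.numel()
            if p.grad is not None:
                flat_grads[offset:offset + numel].copy_(p.grad.data.view(-1))
            # make the parameter's grad a view into the shared flat buffer
            p.grad = flat_grads[offset:offset + numel].view_as(p.data)
            offset += numel
        return flat_grads

    def _clip_grads_(self, flat_grads, clip_norm):
        grad_norm = flat_grads.norm()
        if clip_norm > 0 and grad_norm > clip_norm:
            flat_grads.mul_(clip_norm / (grad_norm + 1e-6))
        return grad_norm
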
    def _async_backward_and_opt(self, rank, device_id, grad_denom):
        if self.loss is not None:
            # backward pass
            self.loss.backward()

        # flatten grads into a contiguous block of memory
        if self.flat_grads is None:
            self.flat_grads = self._flatten_grads_(self.model)

        # all-reduce grads
        nccl.all_reduce(self.flat_grads)

        # normalize grads
        if grad_denom != 0:
            self.flat_grads.div_(grad_denom)

        # clip grads
        grad_norm = self._clip_grads_(self.flat_grads, self.args.clip_norm)

        # take an optimization step
        self.optimizer.step()

        # reset loss
        self.loss = None

        return grad_norm
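
    # A toy illustration (not from the original source; the name is
    # hypothetical) of why the all-reduced gradient is divided by grad_denom:
    # nccl.all_reduce sums per-worker gradients, so dividing by a denominator
    # aggregated the same way (e.g. the total number of target tokens across all
    # workers) recovers a per-token average, as if one GPU had processed the
    # combined batch.
    @staticmethod
    def _toy_grad_denom_example():
        per_worker_grads = [4.0, 6.0, 2.0]    # gradient contribution per worker
        per_worker_ntokens = [10, 15, 5]      # target tokens per worker
        summed = sum(per_worker_grads)        # what all_reduce produces on every worker
        grad_denom = sum(per_worker_ntokens)  # identical denominator on every worker
        return summed / grad_denom            # per-token averaged gradient (0.4)
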
    def _all_reduce_and_rescale_grads(self, grad_denom, buffer_size=10485760):
        """All-reduce and rescale gradients in chunks of the specified size."""
        grads = [
            p.grad.data for p in self.model.parameters()
            if p.requires_grad and p.grad is not None
        ]
        buffer_t = grads[0].new(
            math.ceil(buffer_size / grads[0].element_size())).zero_()
        buffer = []

        def all_reduce_buffer():
            # copy grads into buffer_t
            offset = 0
            for g in buffer:
                numel = g.numel()
                buffer_t[offset:offset + numel].copy_(g.view(-1))
                offset += numel
            # all-reduce and rescale
            nccl.all_reduce(buffer_t[:offset])
            buffer_t.div_(grad_denom)
            # copy all-reduced buffer back into grads
            offset = 0
            for g in buffer:
                numel = g.numel()
                g.view(-1).copy_(buffer_t[offset:offset + numel])
                offset += numel

        filled = 0
        for g in grads:
            sz = g.numel() * g.element_size()
            if sz > buffer_size:
                # grad is bigger than buffer, all-reduce and rescale directly
                nccl.all_reduce(g)
                g.div_(grad_denom)
            elif filled + sz > buffer_size:
                # buffer is full, all-reduce and replace buffer with grad
                all_reduce_buffer()
                buffer = [g]
                filled = sz
            else:
                # add grad to buffer
                buffer.append(g)
                filled += sz
        if len(buffer) > 0:
            all_reduce_buffer()
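
# For comparison only (an assumption, not part of the original code): newer
# PyTorch typically gets the same effect from torch.distributed instead of the
# legacy nccl wrapper, flattening all gradients at once rather than staging them
# through a fixed-size buffer, at the cost of one full-size temporary buffer.
# A rough, hedged equivalent (assumes init_process_group has been called):
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def all_reduce_and_rescale_grads(model, grad_denom):
    grads = [p.grad.data for p in model.parameters()
             if p.requires_grad and p.grad is not None]
    flat = _flatten_dense_tensors(grads)   # copy all grads into one contiguous buffer
    dist.all_reduce(flat)                  # sum the buffer across workers
    flat.div_(grad_denom)                  # rescale once
    for g, synced in zip(grads, _unflatten_dense_tensors(flat, grads)):
        g.copy_(synced)                    # copy reduced values back into each grad
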
Example #5
    def _async_train_step(self, rank, device_id, criterion):

        # If enable_rl, generate two outputs in inference model
        # 1) greedy
        # 2) sampled
        if self.enable_rl:
            args = self.args
            # deepcopy does not support our case, so we skip fast generation:
            # cur_model = copy.deepcopy(self.model)  # deep copy, since a model with fast generation enabled can no longer be trained
            # cur_model.make_generation_fast_(1, not args.no_beamable_mm)  # enable fast generation
            self.model.eval()
            self.generator.models = [self.model]
            input = self._sample['net_input']

            ref_hypo_res = self.generate(input)
            refs = [item[0] for item in ref_hypo_res]
            greedy_sums = [item[1] for item in ref_hypo_res]
            sampled_sums = [item[2] for item in ref_hypo_res]
            sum_log_probs = [item[3] for item in ref_hypo_res]

            rouge_greedy = [
                utils.evaluate([greedy_sums[i]], [refs[i]])
                for i in range(len(refs))
            ]
            rouge_sampled = [
                utils.evaluate([sampled_sums[i]], [refs[i]])
                for i in range(len(refs))
            ]

            rl_loss = 0

            for r_g, r_s, sum_log_prob in zip(rouge_greedy, rouge_sampled,
                                              sum_log_probs):
                rl_loss += (r_g - r_s) * sum_log_prob
            rl_loss /= len(rouge_greedy)  # normalized by # sentences
        else:
            rl_loss = mean_rouge_greedy = mean_rouge_sampled = mean_sum_log_prob = None

        self.model.train()

        # zero grads even if net_input is None, since we will all-reduce them
        self.optimizer.zero_grad()

        # calculate loss and grads
        loss = ml_loss = 0  # keep ml_loss defined even when there is no sample
        if self._sample is not None:
            net_output = self.model(**self._sample['net_input'])
            ml_loss = criterion(net_output, self._sample)
            if self.enable_rl:
                loss_ = (args.loss_scale * rl_loss +
                         (1 - args.loss_scale) * ml_loss)
                mean_rouge_greedy = sum(rouge_greedy) / len(rouge_greedy)
                mean_rouge_sampled = sum(rouge_sampled) / len(rouge_sampled)
                mean_sum_log_prob = sum(sum_log_probs) / len(sum_log_probs)
            else:
                loss_ = ml_loss
            loss_.backward()
            loss = loss_.data[0]
            ml_loss = ml_loss.data[0]

        # flatten grads into a contiguous block of memory
        if self.flat_grads is None:
            self.flat_grads = self._flatten_grads_(self.model)

        # all-reduce grads
        nccl.all_reduce(self.flat_grads)

        # clip grads
        grad_norm = self._clip_grads_(self.flat_grads, self.args.clip_norm)

        # take an optimization step
        self.optimizer.step()

        res = Results(loss=loss,
                      grad_norm=grad_norm,
                      ml_loss=ml_loss,
                      rl_loss=rl_loss,
                      mean_rouge_greedy=mean_rouge_greedy,
                      mean_rouge_sampled=mean_rouge_sampled,
                      mean_sum_log_prob=mean_sum_log_prob)
        return res
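
# A small self-contained sketch (not from the original source; names are
# hypothetical) of the self-critical loss computed above: each sampled summary's
# log-probability is weighted by how much worse its ROUGE is than the greedy
# baseline, so samples that beat the greedy decode get their probability pushed
# up and samples that fall short get pushed down.
import torch

def self_critical_loss(rouge_greedy, rouge_sampled, sum_log_probs):
    # rouge_greedy, rouge_sampled: lists of floats; sum_log_probs: 0-dim tensors
    losses = [(r_g - r_s) * lp
              for r_g, r_s, lp in zip(rouge_greedy, rouge_sampled, sum_log_probs)]
    return torch.stack(losses).mean()   # normalised by the number of sentences

# e.g. self_critical_loss([0.30, 0.25], [0.35, 0.20],
#                         [torch.tensor(-12.0), torch.tensor(-9.5)]) -> 0.0625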