def _async_train_step(self, rank, device_id, criterion): self.model.train() # zero grads even if net_input is None, since we will all-reduce them self.optimizer.zero_grad() # calculate loss and grads loss = 0 if self._sample is not None: net_output = self.model(**self._sample['net_input']) loss_ = criterion(net_output, self._sample) loss_.backward() loss = loss_.data[0] # flatten grads into a contiguous block of memory if self.flat_grads is None: self.flat_grads = self._flatten_grads_(self.model) # all-reduce grads nccl.all_reduce(self.flat_grads) # clip grads grad_norm = self._clip_grads_(self.flat_grads, self.args.clip_norm) # take an optimization step self.optimizer.step() return loss, grad_norm
def _async_backward_and_opt(self, rank, device_id, grad_denom):
    """Backward pass + all-reduce + optimizer step for a precomputed loss.

    Expects ``self.loss`` to have been set by a prior forward pass; the
    collective steps (all-reduce, clip, step) run even when ``self.loss``
    is None so all ranks stay in lockstep.

    Args:
        grad_denom: divisor applied to the reduced gradients (e.g. total
            token count across workers); skipped when 0 to avoid div-by-zero.

    Returns:
        grad_norm: pre-clip gradient norm from ``_clip_grads_``.
    """
    if self.loss is not None:
        # backward pass
        self.loss.backward()

    # flatten grads into a contiguous block of memory (cached after first call)
    if self.flat_grads is None:
        self.flat_grads = self._flatten_grads_(self.model)

    # all-reduce grads
    nccl.all_reduce(self.flat_grads)

    # normalize grads
    if grad_denom != 0:
        self.flat_grads.div_(grad_denom)

    # clip grads
    grad_norm = self._clip_grads_(self.flat_grads, self.args.clip_norm)

    # take an optimization step
    self.optimizer.step()

    # reset loss so a stale graph is never backprop'd twice
    self.loss = None

    return grad_norm
def all_reduce_buffer():
    """Pack buffered grads, all-reduce them in one call, and scatter back.

    NOTE(review): this appears to be a detached copy of the closure defined
    inside ``_all_reduce_and_rescale_grads`` — it reads ``buffer``,
    ``buffer_t`` and ``grad_denom`` as free variables, which are unresolved
    at this scope. Confirm whether this standalone copy is dead code.
    """
    # copy grads into buffer_t (flattened, back to back)
    offset = 0
    for g in buffer:
        numel = g.numel()
        buffer_t[offset:offset + numel].copy_(g.view(-1))
        offset += numel

    # all-reduce the packed region and rescale
    nccl.all_reduce(buffer_t[:offset])
    buffer_t.div_(grad_denom)

    # copy all-reduced buffer back into grads
    offset = 0
    for g in buffer:
        numel = g.numel()
        g.view(-1).copy_(buffer_t[offset:offset + numel])
        offset += numel
def _all_reduce_and_rescale_grads(self, grad_denom, buffer_size=10485760):
    """All-reduce and rescale gradients in chunks of the specified size.

    Gradients are packed back-to-back into a flat buffer so many small
    tensors are reduced with one NCCL call; grads larger than the buffer
    are reduced individually. Every rank must call this with the same
    parameter order for the collectives to match up.

    Args:
        grad_denom: divisor applied to each gradient after the all-reduce
            (e.g. total token count across workers).
        buffer_size: packing buffer size in bytes (default 10 MiB).
    """
    grads = [
        p.grad.data for p in self.model.parameters()
        if p.requires_grad and p.grad is not None
    ]
    # FIX: previously grads[0] raised IndexError when no parameter had a
    # gradient (e.g. called before any backward pass); nothing to do then.
    if not grads:
        return

    # flat working buffer in the grads' dtype, sized to buffer_size bytes
    buffer_t = grads[0].new(
        math.ceil(buffer_size / grads[0].element_size())).zero_()
    buffer = []

    def all_reduce_buffer():
        # pack buffered grads contiguously into buffer_t
        offset = 0
        for g in buffer:
            numel = g.numel()
            buffer_t[offset:offset + numel].copy_(g.view(-1))
            offset += numel
        # all-reduce and rescale only the filled region
        nccl.all_reduce(buffer_t[:offset])
        buffer_t[:offset].div_(grad_denom)
        # scatter the reduced values back into the original grad tensors
        offset = 0
        for g in buffer:
            numel = g.numel()
            g.view(-1).copy_(buffer_t[offset:offset + numel])
            offset += numel

    filled = 0
    for g in grads:
        sz = g.numel() * g.element_size()
        if sz > buffer_size:
            # grad is bigger than buffer, all-reduce and rescale directly
            nccl.all_reduce(g)
            g.div_(grad_denom)
        elif filled + sz > buffer_size:
            # buffer is full, all-reduce and replace buffer with grad
            all_reduce_buffer()
            buffer = [g]
            filled = sz
        else:
            # add grad to buffer
            buffer.append(g)
            filled += sz

    # flush any remaining buffered grads
    if len(buffer) > 0:
        all_reduce_buffer()
def _async_train_step(self, rank, device_id, criterion):
    """Run one training step, optionally mixing in a self-critical RL loss.

    When ``self.enable_rl`` is set, the model first generates (in eval
    mode) a greedy and a sampled summary per example; their ROUGE scores
    (via ``utils.evaluate``) form a self-critical reward, and the final
    loss is ``loss_scale * rl_loss + (1 - loss_scale) * ml_loss``.
    Gradients are then all-reduced, clipped, and applied.

    Returns:
        Results with loss, grad_norm, ml_loss, rl_loss and the RL metrics
        (the latter are None when RL is disabled).
    """
    # FIX: initialize all result fields up front. Previously ml_loss (and
    # the mean_* metrics) were unbound when self._sample was None, so the
    # Results(...) construction below raised NameError.
    rl_loss = mean_rouge_greedy = mean_rouge_sampled = mean_sum_log_prob = None

    # If enable_rl, generate two outputs in inference mode:
    # 1) greedy  2) sampled
    if self.enable_rl:
        args = self.args
        # deepcopy does not support our model, and a make_generation_fast_
        # copy can no longer be trained — so generate with the live model
        # switched to eval mode instead.
        self.model.eval()
        self.generator.models = [self.model]
        input = self._sample['net_input']
        ref_hypo_res = self.generate(input)
        refs = [item[0] for item in ref_hypo_res]
        greedy_sums = [item[1] for item in ref_hypo_res]
        sampled_sums = [item[2] for item in ref_hypo_res]
        sum_log_probs = [item[3] for item in ref_hypo_res]
        rouge_greedy = [
            utils.evaluate([greedy_sums[i]], [refs[i]])
            for i in range(len(refs))
        ]
        rouge_sampled = [
            utils.evaluate([sampled_sums[i]], [refs[i]])
            for i in range(len(refs))
        ]
        # self-critical reward: (greedy rouge - sampled rouge) * log-prob
        rl_loss = 0
        for r_g, r_s, sum_log_prob in zip(rouge_greedy, rouge_sampled,
                                          sum_log_probs):
            rl_loss += (r_g - r_s) * sum_log_prob
        rl_loss /= len(rouge_greedy)  # normalized by # sentences

    self.model.train()

    # zero grads even if net_input is None, since we will all-reduce them
    self.optimizer.zero_grad()

    # calculate loss and grads
    loss = 0
    ml_loss = 0
    if self._sample is not None:
        net_output = self.model(**self._sample['net_input'])
        ml_loss = criterion(net_output, self._sample)
        if self.enable_rl:
            loss_ = args.loss_scale * rl_loss + \
                (1 - args.loss_scale) * ml_loss
            mean_rouge_greedy = sum(rouge_greedy) / len(rouge_greedy)
            mean_rouge_sampled = sum(rouge_sampled) / len(rouge_sampled)
            mean_sum_log_prob = sum(sum_log_probs) / len(sum_log_probs)
        else:
            loss_ = ml_loss
        loss_.backward()
        # legacy (pre-0.4) PyTorch scalar extraction; .item() on modern torch
        loss = loss_.data[0]
        ml_loss = ml_loss.data[0]

    # flatten grads into a contiguous block of memory (cached after first call)
    if self.flat_grads is None:
        self.flat_grads = self._flatten_grads_(self.model)

    # all-reduce grads
    nccl.all_reduce(self.flat_grads)

    # clip grads
    grad_norm = self._clip_grads_(self.flat_grads, self.args.clip_norm)

    # take an optimization step
    self.optimizer.step()

    res = Results(loss=loss,
                  grad_norm=grad_norm,
                  ml_loss=ml_loss,
                  rl_loss=rl_loss,
                  mean_rouge_greedy=mean_rouge_greedy,
                  mean_rouge_sampled=mean_rouge_sampled,
                  mean_sum_log_prob=mean_sum_log_prob)
    return res