def _check_grad_norms(self, grad_norm):
    """Check that grad norms are consistent across workers."""
    if self._grad_norm_buf is not None:
        # record this worker's grad norm, then all-reduce so every rank
        # sees the norms computed on every other rank
        self._grad_norm_buf.zero_()
        self._grad_norm_buf[self.args['distributed_training']['distributed_rank']] = grad_norm
        distributed_utils.all_reduce(
            self._grad_norm_buf, group=self.data_parallel_process_group
        )

        def is_consistent(tensor):
            max_abs_diff = torch.max(torch.abs(tensor - tensor[0]))
            return (
                # every rank overflowed (no finite norms) still counts as consistent
                not torch.isfinite(tensor).any()
                # otherwise the norms must agree within a relative tolerance
                or (max_abs_diff / (tensor[0] + 1e-6) < 1e-6).all()
            )

        if not is_consistent(self._grad_norm_buf):
            pretty_detail = "\n".join(
                "rank {:3d} = {:.8f}".format(r, n)
                for r, n in enumerate(self._grad_norm_buf.tolist())
            )
            error_detail = "grad_norm across the workers:\n{}\n".format(pretty_detail)
            raise RuntimeError(
                "Fatal error: gradients are inconsistent between workers. "
                "Try --ddp-backend=no_c10d. "
                "Or are you mixing up different generations of GPUs in training?"
                + "\n"
                + "-" * 80
                + "\n{}\n".format(error_detail)
                + "-" * 80
            )
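# Illustrative sketch (not part of the code above): the same relative-tolerance
# consistency test applied to a standalone tensor of per-rank grad norms.
# The function name `_norms_consistent` and the sample values are hypothetical.
import torch

def _norms_consistent(norms: torch.Tensor) -> bool:
    max_abs_diff = torch.max(torch.abs(norms - norms[0]))
    return bool(
        not torch.isfinite(norms).any()
        or (max_abs_diff / (norms[0] + 1e-6) < 1e-6).all()
    )

assert _norms_consistent(torch.tensor([2.5, 2.5, 2.5, 2.5]))      # all ranks agree
assert not _norms_consistent(torch.tensor([2.5, 2.5, 7.1, 2.5]))  # rank 2 diverged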
def all_reduce(params):
    buffer = self.buffer
    nonzero_buffer = False
    if len(params) > 1:
        # coalesce: copy every grad into a contiguous slice of the flat buffer
        offset = 0
        for p in params:
            sz = p.numel()
            if p.grad is not None:
                buffer[offset:offset+sz].copy_(p.grad.data.view(-1))
                nonzero_buffer = True
            else:
                buffer[offset:offset+sz].zero_()
            offset += sz
    else:
        # we only have a single grad to all-reduce
        p = params[0]
        if p.grad is not None:
            buffer = p.grad.data
            nonzero_buffer = True
        elif p.numel() <= self.buffer.numel():
            buffer = buffer[:p.numel()]
            buffer.zero_()
        else:
            buffer = torch.zeros_like(p)

    if nonzero_buffer:
        # pre-divide by the world size so that summing across workers
        # yields the average gradient
        buffer.div_(self.world_size)

    distributed_utils.all_reduce(buffer, self.process_group)

    # copy all-reduced grads back into their original place
    offset = 0
    for p in params:
        sz = p.numel()
        if p.grad is not None:
            p.grad.data.copy_(buffer[offset:offset+sz].view_as(p))
        else:
            p.grad = buffer[offset:offset+sz].view_as(p).clone()
        offset += sz
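# Illustrative sketch (not part of the code above): the flatten -> reduce ->
# unflatten pattern shown on dummy tensors. The collective call is omitted
# because no process group is initialized here; in the real code
# distributed_utils.all_reduce would sum the pre-divided buffers across workers.
# The variable names below are hypothetical.
import torch

world_size = 4
grads = [torch.randn(3, 2), torch.randn(5)]          # stand-ins for p.grad.data
buffer = torch.zeros(sum(g.numel() for g in grads))  # one flat coalescing buffer

# pack: copy each grad into a contiguous slice of the buffer
offset = 0
for g in grads:
    buffer[offset:offset + g.numel()].copy_(g.view(-1))
    offset += g.numel()

buffer.div_(world_size)  # pre-divide; an all-reduce SUM would then yield the mean

# unpack: copy the (reduced) slices back into the original grad shapes
offset = 0
for g in grads:
    g.copy_(buffer[offset:offset + g.numel()].view_as(g))
    offset += g.numel()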