def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = []  # shape (num_tensors, num_gpus)
    output = []
    for tensor_at_gpus in zip(*inputs):
        if tensor_at_gpus[0].is_sparse:
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
        else:
            dense_tensors.append(tensor_at_gpus)
    itrs = [_take_tensors(tensors, buffer_size) for tensors in zip(*dense_tensors)]
    for chunks in zip(*itrs):
        tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        result = reduce_add(tensors, destination)
        output.extend(_unflatten_dense_tensors(result, chunks[0]))
    return tuple(_reorder_tensors_as(output, inputs[0]))

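# A minimal, self-contained sketch (not taken from the snippets in this file) of the
# coalescing round-trip that reduce_add_coalesced and the functions below all build on:
# flatten a bucket of same-dtype tensors into one contiguous buffer, run a single
# operation on that buffer (here an in-place multiply stands in for a real collective),
# then scatter the result back into the original tensors. Only PyTorch's private
# torch._utils helpers, which these snippets already rely on, are assumed.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

bucket = [torch.randn(3), torch.randn(5), torch.randn(2, 2)]
flat = _flatten_dense_tensors(bucket)      # one contiguous 1-D buffer
flat.mul_(2)                               # stand-in for reduce_add / all_reduce on the buffer
for t, synced in zip(bucket, _unflatten_dense_tensors(flat, bucket)):
    t.copy_(synced)                        # write the coalesced result back into the originals
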
def reduction_fn_nccl():
    # This function only needs to be called once
    if not self.need_reduction:
        return

    self.need_reduction = False
    all_grads = [[] for _ in range(len(self._module_copies))]
    all_grads_buckets_iters = []

    # Bucketing all the gradients
    for dev_idx, module in enumerate(self._module_copies):
        for param in module.parameters():
            if not param.requires_grad or param.grad is None:
                continue
            if param.grad.requires_grad:
                raise RuntimeError("DistributedDataParallel only works "
                                   "with gradients that don't require "
                                   "grad")
            # Adding the gradients for reduction
            all_grads[dev_idx].append(param.grad.data)

        # Now bucketing the parameters
        dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                          self.nccl_reduce_bucket_size)
        all_grads_buckets_iters.append(dev_grads_buckets)

    # Now reduce each bucket one after another
    for grads_batch in zip(*all_grads_buckets_iters):
        grads_batch_coalesced = []

        # Coalesce each bucket
        for dev_idx, dev_grads_batch in enumerate(grads_batch):
            dev_id = self.device_ids[dev_idx]
            with torch.cuda.device(dev_id):
                dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
                grads_batch_coalesced.append(dev_grads_batch_coalesced)

        # We will only use device 0's results, but this single op should be
        # faster than doing the following two operations sequentially:
        # (1) intra-node reduce to lead GPU, followed by
        # (2) inter-node allreduce for all the first lead GPUs in all nodes
        dist.all_reduce_multigpu(grads_batch_coalesced,
                                 group=self.nccl_reduction_group_id)

        # Now only work on the first device of self.device_ids, uncoalesce
        # the gradients for each bucket
        grads_batch_coalesced[0] /= dist.get_world_size()
        grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0],
                                                       grads_batch[0])
        for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
            grad.copy_(reduced)

    # clear the gradients and save memory for replicas
    for module in self._module_copies[1:]:
        for param in module.parameters():
            if param.requires_grad:
                param.grad = None
                param.data.set_()

def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence of tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
            Note that it should be like (src, dst1, dst2, ...), the first element
            of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of the ``tensor``, placed on devices
        corresponding to indices from ``devices``.
    """
    for tensor in tensors:
        if tensor.get_device() != devices[0]:
            raise RuntimeError('all tensors must be on devices[0]')
    outputs = [[] for _ in devices]
    # use the original tensors for the first device
    outputs[0].extend(tensors)
    for chunk in _take_tensors(tensors, buffer_size):
        results = broadcast(_flatten_tensors(chunk), devices)
        # use the broadcasted tensors for the remaining devices
        for dst, res in zip(outputs[1:], results[1:]):
            dst.extend(_unflatten_tensors(res, chunk))
    return tuple(outputs)

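# Hypothetical usage of the GPU broadcast_coalesced above (an assumed call site, not
# from the original source): replicate tensors living on device 0 onto device 1 with
# one coalesced transfer per bucket. Assumes at least two CUDA devices and that
# broadcast, _take_tensors, _flatten_tensors and _unflatten_tensors are in scope as in
# the snippet.
src_tensors = [torch.randn(16, device='cuda:0'), torch.randn(8, device='cuda:0')]
per_device_copies = broadcast_coalesced(src_tensors, devices=[0, 1])
# per_device_copies[0] aliases the inputs; per_device_copies[1] holds the copies on GPU 1.
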
def _dist_broadcast_coalesced(self, tensors, buffer_size):
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, 0)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)

def nccl_allreduce_by_buckets(nc, kn, all_grads):
    # Now bucketing the parameters
    dev_grads_buckets = _take_tensors(all_grads, nccl_reduce_bucket_size)
    for grads_batch in dev_grads_buckets:
        grads_batch_coalesced = _flatten_dense_tensors(grads_batch)

        # NOTE: torch.cuda.synchronize()
        # NOTE: #nbutils.cuda_current_context().synchronize()
        # or, nc.stream_sync()
        sz = np.prod(grads_batch_coalesced.size())
        nc.do_all_reduce(grads_batch_coalesced.data_ptr(),
                         grads_batch_coalesced.data_ptr(), sz)
        nc.stream_sync()

        grads_batch_coalesced[:] = grads_batch_coalesced / float(kn)
        grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced,
                                                       grads_batch)
        for grad, reduced in zip(grads_batch, grads_batch_reduced):
            grad.copy_(reduced)

def all_reduce_coalesced(tensors, divisor=1, op=ReduceOp.SUM, buffer_size=256 * MB):
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.all_reduce(flat_tensors, op)
        if divisor != 1:
            flat_tensors.div_(divisor)
        for old_t, new_t in zip(tensors,
                                _unflatten_dense_tensors(flat_tensors, tensors)):
            old_t.data = new_t

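# Hypothetical call site (a sketch, not from the original source) for the
# all_reduce_coalesced helper above: average a model's gradients across ranks after
# backward(), assuming torch.distributed is already initialized and that dist, MB and
# ReduceOp are in scope as in the snippet.
def average_gradients(model):
    grads = [p.grad.data for p in model.parameters() if p.grad is not None]
    all_reduce_coalesced(grads, divisor=dist.get_world_size())
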
def sync_params_bucket(self):
    params = [p.data for p in list(self.model.parameters())]
    for tensors in _take_tensors(params, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, src=0)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)

def sync_grads_bucket(self):
    grads = [p.grad.data for p in list(self.model.parameters()) if p.requires_grad]
    for tensors in _take_tensors(grads, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        new_all_reduce(flat_tensors, cuda=self.cuda)
        flat_tensors.div_(self.world_size)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)

def sync_grads_bucket(self):
    grads = [p.grad.data for p in list(self.model.parameters()) if p.requires_grad]
    for tensors in _take_tensors(grads, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)

def sync_buffers_bucket(self):
    buffers = [p.data for p in list(self.model._all_buffers())]
    for tensors in _take_tensors(buffers, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        flat_tensors.zero_()
        dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
        flat_tensors.div_(self.num_workers)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)

def sync_grads_bucket(self):
    grads = [p.grad.data for p in list(self.model.parameters()) if p.requires_grad]
    for tensors in _take_tensors(grads, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
        flat_tensors.div_(self.num_workers)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)

def all_gather_coalesced(tensors, buffer_size=256 * MB):
    assert dist.get_backend() == dist.dist_backend.NCCL  # gloo gives some weird device error
    world_size = dist.get_world_size()
    rcv_lsts = [[] for _ in range(world_size)]
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        tmp_rcv_lst = [torch.empty_like(flat_tensors) for _ in range(world_size)]
        dist.all_gather(tmp_rcv_lst, flat_tensors)
        for i, rcv_flat_tensors in enumerate(tmp_rcv_lst):
            for rcv_t in _unflatten_dense_tensors(rcv_flat_tensors, tensors):
                rcv_lsts[i].append(rcv_t)
    return rcv_lsts

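# Hypothetical usage of all_gather_coalesced above (an assumed call site, not from the
# original source): collect every rank's local gradients for inspection, assuming an
# initialized NCCL process group, as the assert in the function requires.
def gather_gradients_from_all_ranks(model):
    local_grads = [p.grad.data for p in model.parameters() if p.grad is not None]
    per_rank_grads = all_gather_coalesced(local_grads)  # list of tensor lists, indexed by rank
    return per_rank_grads
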
def _get_coalesced_bucket(tensors, buffer_size_mb=-1):
    if buffer_size_mb > 0:
        buffer_size = buffer_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, buffer_size)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()
    return buckets

def _dist_broadcast_coalesced(self, tensors, buffer_size):
    """
    Broadcast a sequence of tensors to the default group from rank 0.
    Small tensors are first coalesced into a buffer to reduce the number
    of broadcasts.

    tensors (sequence): tensors to broadcast. Each tensor needs to be on the
                        same GPU.
    buffer_size (int): maximum size of the buffer for coalescing
    """
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, 0)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)

def broadcast_coalesced(tensors, src=0, buffer_size=10 * MB):
    r"""
    Broadcast a sequence of tensors to the default group from rank 0.
    Small tensors are first coalesced into a buffer to reduce the number
    of broadcasts.

    tensors (sequence): tensors to broadcast. Each tensor needs to be on the
                        same GPU.
    src (int): src rank. Default: 0.
    buffer_size (int): maximum size of the buffer for coalescing. Default: 10MB.
    """
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, src)
        for old_t, new_t in zip(tensors,
                                _unflatten_dense_tensors(flat_tensors, tensors)):
            old_t.data = new_t

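# Hypothetical usage of the torch.distributed broadcast_coalesced above (a sketch, not
# from the original source): push rank 0's parameters and buffers to every other rank
# at startup, assuming the default process group is initialized and MB is defined as in
# the snippet.
def sync_initial_state(model):
    broadcast_coalesced(list(model.state_dict().values()), src=0, buffer_size=10 * MB)
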
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    # TODO: When `len(inputs) == 1` and all inputs are on `destination`, just
    #       return `inputs`.
    dense_tensors: List[List] = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse ones first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)  # this will be sparse too
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]  # (num_gpus,)
        flat_result = reduce_add(flat_tensors, destination)
        for t in _unflatten_dense_tensors(flat_result, chunks[0]):
            # The unflattened tensors do not share storage, and we don't expose
            # base flat tensor anyways, so give them different version counters.
            # See NOTE [ Version Counter in comm.*_coalesced ]
            output.append(t.data)
    return tuple(_reorder_tensors_as(output, ref_order))

def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(bucket,
                                  _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)

def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        # all_reduce performs SUM by default, so we divide by the GPU count here.
        flat_tensors.div_(world_size)
        for tensor, synced in zip(bucket,
                                  _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)

def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    """Allreduce parameters as a whole."""
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(bucket,
                                  _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)

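# A small self-contained demo (not from the snippets above) of the bucketing primitive
# these helpers share: torch._utils._take_tensors groups same-dtype tensors into buckets
# whose flattened byte size stays under the cap, and yields oversized tensors in buckets
# of their own.
import torch
from torch._utils import _take_tensors

demo_tensors = [torch.zeros(256), torch.zeros(256), torch.zeros(1024, dtype=torch.float64)]
for demo_bucket in _take_tensors(demo_tensors, 1024):   # 1 KiB cap per bucket
    print([t.numel() for t in demo_bucket], demo_bucket[0].dtype)
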
def reduction_fn():
    # This function only needs to be called once
    if not self.need_reduction:
        return

    self.need_reduction = False
    all_grads = []

    # Bucketing all the gradients
    for param in self.module.parameters():
        if not param.requires_grad:
            continue
        if param.grad is not None and param.grad.requires_grad:
            raise RuntimeError("DistributedDataParallel only works "
                               "with gradients that don't require "
                               "grad")
        if param.grad is not None:
            # Adding the gradients for reduction
            all_grads.append(param.grad.data)
        else:
            all_grads.append(torch.zeros_like(param))

    # Now bucketing the parameters
    dev_grads_buckets = _take_tensors(all_grads, self.reduce_bucket_size)

    # Now reduce each bucket one after another
    for grads_batch in dev_grads_buckets:
        grads_batch_coalesced = _flatten_dense_tensors(grads_batch)

        grads_batch_coalesced /= self.world_size
        distributed_utils.all_reduce(grads_batch_coalesced, self.process_group)

        grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced,
                                                       grads_batch)
        for grad, reduced in zip(grads_batch, grads_batch_reduced):
            grad.copy_(reduced)

def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse ones first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        flat_result = reduce_add(flat_tensors, destination)
        output.extend(_unflatten_dense_tensors(flat_result, chunks[0]))
    return tuple(_reorder_tensors_as(output, ref_order))

def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    output = []
    itrs = [_take_tensors(tensors, buffer_size) for tensors in inputs]
    for chunks in zip(*itrs):
        flattened = [_flatten_tensors(chunk) for chunk in chunks]
        result = reduce_add(flattened, destination)
        output.extend(_unflatten_tensors(result, chunks[0]))
    return tuple(output)

def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        # collect gradients first
        all_grads = []
        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            all_grads.append(d_p)

        dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
        for dev_grads in dev_grads_buckets:
            d_p_new = _flatten_dense_tensors(dev_grads)

            if self.all_reduce:
                self.all_reduce_time.set()
                dist.all_reduce(d_p_new, group=0)
                self.all_reduce_time.record()

            dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
            for grad, reduced in zip(dev_grads, dev_grads_new):
                grad.copy_(reduced)

        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError('Adam_distribute does not support sparse gradients, '
                                   'please consider SparseAdam_distribute instead')
            amsgrad = group['amsgrad']

            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p.data)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_sq'] = torch.zeros_like(p.data)

            exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
            if amsgrad:
                max_exp_avg_sq = state['max_exp_avg_sq']
            beta1, beta2 = group['betas']

            state['step'] += 1

            if group['weight_decay'] != 0:
                grad = grad.add(group['weight_decay'], p.data)

            # Decay the first and second moment running average coefficient
            exp_avg.mul_(beta1).add_(1 - beta1, grad)
            exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                # Use the max. for normalizing running avg. of gradient
                denom = max_exp_avg_sq.sqrt().add_(group['eps'])
            else:
                denom = exp_avg_sq.sqrt().add_(group['eps'])

            bias_correction1 = 1 - beta1 ** state['step']
            bias_correction2 = 1 - beta2 ** state['step']
            step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

            p.data.addcdiv_(-step_size, exp_avg, denom)

    return loss

def __init__(self, module, device_ids=None, output_device=None, dim=0,
             broadcast_buffers=True):
    super(DistributedDataParallel, self).__init__()

    if device_ids is None:
        device_ids = list(range(torch.cuda.device_count()))
    if output_device is None:
        output_device = device_ids[0]
    self.dim = dim
    self.module = module
    self.device_ids = device_ids
    self.output_device = output_device
    self.broadcast_buffers = broadcast_buffers

    # Flag used by the NCCL backend to make sure we only reduce gradients
    # one time in the execution engine
    self.need_reduction = False

    MB = 1024 * 1024

    # used for intra-node param sync and inter-node sync as well
    self.broadcast_bucket_size = 10 * MB
    self.nccl_reduce_bucket_size = 256 * MB

    # Sync params and buffers
    module_states = list(self.module.state_dict().values())
    if len(module_states) > 0:
        self._dist_broadcast_coalesced(module_states, self.broadcast_bucket_size)

    if len(device_ids) > 1:
        # TODO: we don't need to replicate params in here. they're always going to
        # be broadcasted using larger blocks in broadcast_coalesced, so it might be
        # better to not pollute the caches with these small blocks
        self._module_copies = replicate(self.module, self.device_ids, detach=True)
        self._module_copies[0] = self.module

        for module_copy in self._module_copies[1:]:
            for param, copy_param in zip(self.module.parameters(),
                                         module_copy.parameters()):
                copy_param.requires_grad = param.requires_grad
    else:
        self._module_copies = [self.module]

    # For the NCCL backend, since every single NCCL call is asynchronous, we
    # directly enqueue all the NCCL reduction calls to the default CUDA stream
    # without spawning up other reduction threads.
    # This achieves the best performance.
    if dist._backend == dist.dist_backend.NCCL:
        self._register_nccl_grad_hook()
        return

    bucket_bytes_cap = 1 * MB

    # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
    param_buckets = []
    # Split the parameters into buckets and by types as well
    for dev_idx, module in enumerate(self._module_copies):
        param_buckets.append(list(_take_tensors(module.parameters(), bucket_bytes_cap)))

    self.bucket_sizes = []
    self.bucket_map = {}

    # We transpose param_buckets, so the loop is over buckets.
    # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
    for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
        self.bucket_sizes.append(0)
        # Now, we transpose again, so we iterate over bucket_elems, but getting tuples
        # of params from each device.
        for idx, param_tuple in enumerate(zip(*param_buckets_tuple)):
            if idx == 0:
                # Bucket parameter type tracking
                bucket_param_type = param_tuple[0].type()
                # Only gloo and nccl support half-precision
                if bucket_param_type == torch.cuda.HalfTensor and \
                        dist._backend != dist.dist_backend.GLOO:
                    raise RuntimeError("DistributedDataParallel currently only "
                                       "supports half precision parameters "
                                       "with Nccl and Gloo backend")
            if not param_tuple[0].requires_grad:
                continue
            for p in param_tuple:
                self.bucket_map[p] = bucket_idx
            self.bucket_sizes[bucket_idx] += 1

    self.buckets = [[[] for _ in range(len(self.device_ids))]
                    for _ in range(len(self.bucket_sizes))]
    self.bucket_events = [[None] * len(self.device_ids)
                          for _ in range(len(self.bucket_sizes))]
    self.reduced = [False] * len(self.bucket_sizes)

    self._register_grad_hooks()

    self.dispatch_lock = threading.Lock()
    self._start_reduction_threads()

def __init__(self, module, device_ids=None, output_device=None, dim=0,
             broadcast_buffers=True):
    super(DistributedDataParallel, self).__init__()

    if dist._backend not in (dist.dist_backend.NCCL, dist.dist_backend.GLOO):
        raise ValueError('Invalid backend, only NCCL and GLOO backends are '
                         'supported by DistributedDataParallel')

    if device_ids is None:
        device_ids = list(range(torch.cuda.device_count()))
    if output_device is None:
        output_device = device_ids[0]
    self.dim = dim
    self.module = module
    self.device_ids = device_ids
    self.output_device = output_device
    self.broadcast_buffers = broadcast_buffers

    # Flag used by the NCCL backend to make sure we only reduce gradients
    # one time in the execution engine
    self.need_reduction = False

    MB = 1024 * 1024

    # used for intra-node param sync and inter-node sync as well
    self.broadcast_bucket_size = 10 * MB
    self.nccl_reduce_bucket_size = 256 * MB

    # Sync params and buffers
    module_states = list(self.module.state_dict().values())
    if len(module_states) > 0:
        self._dist_broadcast_coalesced(module_states, self.broadcast_bucket_size)

    if len(device_ids) > 1:
        # TODO: we don't need to replicate params in here. they're always going to
        # be broadcasted using larger blocks in broadcast_coalesced, so it might be
        # better to not pollute the caches with these small blocks
        self._module_copies = replicate(self.module, self.device_ids, detach=True)
        self._module_copies[0] = self.module

        for module_copy in self._module_copies[1:]:
            for param, copy_param in zip(self.module.parameters(),
                                         module_copy.parameters()):
                copy_param.requires_grad = param.requires_grad
    else:
        self._module_copies = [self.module]

    # For the NCCL backend, since every single NCCL call is asynchronous, we
    # directly enqueue all the NCCL reduction calls to the default CUDA stream
    # without spawning up other reduction threads.
    # This achieves the best performance.
    if dist._backend == dist.dist_backend.NCCL:
        self._register_nccl_grad_hook()
        return

    bucket_bytes_cap = 1 * MB

    # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
    param_buckets = []
    # Split the parameters into buckets and by types as well
    for dev_idx, module in enumerate(self._module_copies):
        param_buckets.append(list(_take_tensors(module.parameters(), bucket_bytes_cap)))

    self.bucket_sizes = []
    self.bucket_map = {}

    # We transpose param_buckets, so the loop is over buckets.
    # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
    for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
        self.bucket_sizes.append(0)
        # Now, we transpose again, so we iterate over bucket_elems, but getting tuples
        # of params from each device.
        for idx, param_tuple in enumerate(zip(*param_buckets_tuple)):
            if idx == 0:
                # Bucket parameter type tracking
                bucket_param_type = param_tuple[0].type()
                # Only gloo and nccl support half-precision
                if bucket_param_type == torch.cuda.HalfTensor and \
                        dist._backend != dist.dist_backend.GLOO:
                    raise RuntimeError("DistributedDataParallel currently only "
                                       "supports half precision parameters "
                                       "with Nccl and Gloo backend")
            if not param_tuple[0].requires_grad:
                continue
            for p in param_tuple:
                self.bucket_map[p] = bucket_idx
            self.bucket_sizes[bucket_idx] += 1

    self.buckets = [[[] for _ in range(len(self.device_ids))]
                    for _ in range(len(self.bucket_sizes))]
    self.bucket_events = [[None] * len(self.device_ids)
                          for _ in range(len(self.bucket_sizes))]
    self.reduced = [False] * len(self.bucket_sizes)

    self._register_grad_hooks()

    self.dispatch_lock = threading.Lock()
    self._start_reduction_threads()

def step(self, closure=None):
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']

        all_grads = []

        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            if self.compression_buffer == False:
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_((1 - momentum), d_p)
                    d_p.copy_(buf)
            all_grads.append(d_p)

        dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
        for dev_grads in dev_grads_buckets:
            d_p_new = _flatten_dense_tensors(dev_grads)

            if self.all_reduce:
                dist.all_reduce(d_p_new, group=0)  # self.all_gpu
            else:
                if self.nodes > 1:
                    if self.compression_buffer:
                        coded, data_time = QSGD_gpu.encode(d_p_new)
                        # specific coded dic just on CPU
                        tensor_signs = coded['signs'].float().to(self.device)
                        tensor_selected = coded['selected'].float().to(self.device)
                        tensor_norm = coded['norm']
                        # size
                        tensor_signs_size = self.pack_len_tensor_into_tensor(tensor_signs)
                        tensor_selected_size = self.pack_len_tensor_into_tensor(tensor_selected)
                        # tensor_norm_size = self.pack_len_tensor_into_tensor(tensor_norm)  # norm doesn't need size
                        # custom
                        '''
                        print(tensor_signs.type())
                        print(tensor_selected.type())
                        print(tensor_norm.type())
                        '''
                    else:
                        d_p_new = torch.sign(d_p_new)

                    if self.local_rank == 0:
                        if self.all_gather_commu:
                            # This version only for instances each with one GPU
                            for node_index in self.inter_node_list:
                                if node_index != self.nodes_rank:
                                    d.set()
                                    f.set()
                                    coded_temp = coded.copy()
                                    f.record()
                                    b.set()
                                    tensor_signs_size_temp = tensor_signs_size.clone()
                                    dist.broadcast(tensor_signs_size_temp, node_index,
                                                   group=self.all_inter_node_group)
                                    b.record()
                                    c.set()
                                    tensor_signs_temp = torch.zeros(
                                        [int(tensor_signs_size_temp[0])],
                                        device=self.device, dtype=torch.float)
                                    c.record()
                                    a.set()
                                    dist.broadcast(tensor_signs_temp, node_index,
                                                   group=self.all_inter_node_group)
                                    a.record()
                                    d.record()
                                    e.set()
                                    tensor_selected_size_temp = tensor_selected_size.clone()
                                    dist.broadcast(tensor_selected_size_temp, node_index,
                                                   group=self.all_inter_node_group)
                                    tensor_selected_temp = torch.zeros(
                                        [int(tensor_selected_size_temp[0])],
                                        device=self.device, dtype=torch.float)
                                    dist.broadcast(tensor_selected_temp, node_index,
                                                   group=self.all_inter_node_group)
                                    e.record()
                                    tensor_norm_temp = tensor_norm.clone()
                                    dist.broadcast(tensor_norm_temp, node_index,
                                                   group=self.all_inter_node_group)

                                    coded_temp['signs'] = tensor_signs_temp.int()
                                    coded_temp['selected'] = tensor_selected_temp.long()
                                    coded_temp['norm'] = tensor_norm_temp

                                    tensor_decoded = QSGD_gpu.decode(coded_temp, cuda=True)
                                    d_p_new = d_p_new + tensor_decoded
                                    '''
                                    print('a', a.get_time())
                                    print('b', b.get_time())
                                    print('c', c.get_time())
                                    print('d', d.get_time())
                                    print('e', e.get_time())
                                    print('f', f.get_time())
                                    '''
                                else:
                                    dist.broadcast(tensor_signs_size, node_index,
                                                   group=self.all_inter_node_group)
                                    dist.broadcast(tensor_signs, node_index,
                                                   group=self.all_inter_node_group)
                                    dist.broadcast(tensor_selected_size, node_index,
                                                   group=self.all_inter_node_group)
                                    dist.broadcast(tensor_selected, node_index,
                                                   group=self.all_inter_node_group)
                                    dist.broadcast(tensor_norm, node_index,
                                                   group=self.all_inter_node_group)
                            d_p_new = d_p_new / dist.get_world_size()
                        else:
                            if dist.get_rank() == 0:
                                for index, inter_node_group in enumerate(self.inter_node_group_list):
                                    coded_temp = coded.copy()

                                    tensor_signs_size_temp = tensor_signs_size.clone()
                                    dist.broadcast(tensor_signs_size_temp,
                                                   self.inter_node_list[index + 1],
                                                   group=inter_node_group)
                                    tensor_signs_temp = torch.zeros(
                                        [int(tensor_signs_size_temp[0])],
                                        device=self.device, dtype=torch.float)
                                    dist.broadcast(tensor_signs_temp,
                                                   self.inter_node_list[index + 1],
                                                   group=inter_node_group)

                                    tensor_selected_size_temp = tensor_selected_size.clone()
                                    dist.broadcast(tensor_selected_size_temp,
                                                   self.inter_node_list[index + 1],
                                                   group=inter_node_group)
                                    tensor_selected_temp = torch.zeros(
                                        [int(tensor_selected_size_temp[0])],
                                        device=self.device, dtype=torch.float)
                                    dist.broadcast(tensor_selected_temp,
                                                   self.inter_node_list[index + 1],
                                                   group=inter_node_group)

                                    tensor_norm_temp = tensor_norm.clone()
                                    dist.broadcast(tensor_norm_temp,
                                                   self.inter_node_list[index + 1],
                                                   group=inter_node_group)

                                    coded_temp['signs'] = tensor_signs_temp.int()
                                    coded_temp['selected'] = tensor_selected_temp.long()
                                    coded_temp['norm'] = tensor_norm_temp

                                    tensor_decoded = QSGD_gpu.decode(coded_temp, cuda=True)
                                    d_p_new = d_p_new + tensor_decoded
                                    '''
                                    #temp
                                    print(tensor_decoded)
                                    tensor_decoded_temp = tensor_decoded.clone()
                                    dist.broadcast(tensor_decoded_temp, self.inter_node_list[index + 1], group = inter_node_group)
                                    if tensor_decoded == tensor_decoded_temp:
                                        print('success')
                                    print(tensor_signs_size_temp)
                                    print(tensor_selected_size_temp)
                                    '''
                                d_p_new = d_p_new / dist.get_world_size()
                            else:
                                dist.broadcast(tensor_signs_size, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                dist.broadcast(tensor_signs, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                dist.broadcast(tensor_selected_size, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                dist.broadcast(tensor_selected, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                dist.broadcast(tensor_norm, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                '''
                                #temp
                                tensor_decoded = QSGD_gpu.decode(coded, cuda = True)
                                print(tensor_decoded)
                                dist.broadcast(tensor_decoded, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1])
                                print(tensor_signs_size)
                                print(tensor_selected_size)
                                '''

                            dist.barrier(group=self.all_inter_node_group)
                            # os._exit()

                            if self.bidirection_compress:
                                if dist.get_rank() == 0:
                                    coded, data_time = QSGD_gpu.encode(d_p_new)
                                    tensor_signs = coded['signs']
                                    tensor_selected = coded['selected']
                                    tensor_norm = coded['norm']
                                    tensor_signs_size = self.pack_len_tensor_into_tensor(tensor_signs)
                                    tensor_selected_size = self.pack_len_tensor_into_tensor(tensor_selected)
                                    dist.barrier(group=self.all_inter_node_group)

                                dist.broadcast(tensor_signs_size, 0, group=self.all_inter_node_group)
                                dist.broadcast(tensor_selected_size, 0, group=self.all_inter_node_group)

                                if dist.get_rank() != 0:
                                    tensor_signs = torch.randn(
                                        [int(tensor_signs_size[0])]).type_as(tensor_signs)
                                    tensor_selected = torch.randn(
                                        [int(tensor_selected_size[0])]).type_as(tensor_selected)
                                    dist.barrier(group=self.all_inter_node_group)

                                dist.broadcast(tensor_signs, 0, group=self.all_inter_node_group)
                                dist.broadcast(tensor_selected, 0, group=self.all_inter_node_group)
                                dist.broadcast(tensor_norm, 0, group=self.all_inter_node_group)

                                coded['signs'] = tensor_signs
                                coded['selected'] = tensor_selected
                                coded['norm'] = tensor_norm

                                tensor_decoded = QSGD_gpu.decode(coded, cuda=True)
                                d_p_new = tensor_decoded
                            else:
                                if dist.get_rank() == 0:
                                    dist.barrier(group=self.all_inter_node_group)
                                dist.broadcast(d_p_new, 0, group=self.all_inter_node_group)
                else:
                    # test for one
                    coded, data_time = QSGD_gpu.encode(d_p_new)
                    tensor_decoded = QSGD_gpu.decode(coded, cuda=True)
                    d_p_new = tensor_decoded

            # unflatten
            dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
            for grad, reduced in zip(dev_grads, dev_grads_new):
                grad.copy_(reduced)

        for p in group['params']:
            if self.compression_buffer:
                if weight_decay != 0:
                    p.grad.data.add_(weight_decay, p.data)
            p.data.add_(-group['lr'], p.grad.data)

    return loss

def __init__(self, module, device_ids=None, output_device=None, dim=0,
             broadcast_buffers=True, process_group=None, bucket_cap_mb=25):
    super(_DistributedDataParallelC10d, self).__init__()

    # Use all devices by default
    if device_ids is None:
        device_ids = list(range(torch.cuda.device_count()))
    if output_device is None:
        output_device = device_ids[0]

    if process_group is None:
        self.process_group = c10d.get_default_group()
    else:
        self.process_group = process_group

    self.dim = dim
    self.module = module
    self.device_ids = device_ids
    self.output_device = output_device
    self.broadcast_buffers = broadcast_buffers

    self.allreduce_opts = c10d.AllreduceOptions()

    MB = 1024 * 1024

    # used for intra-node param sync and inter-node sync as well
    self.broadcast_bucket_size = 25 * MB

    # Sync params and buffers
    module_states = list(self.module.state_dict().values())
    if len(module_states) > 0:
        self._dist_broadcast_coalesced(module_states, self.broadcast_bucket_size)

    if len(device_ids) > 1:
        # TODO: we don't need to replicate params in here. they're always going to
        # be broadcasted using larger blocks in broadcast_coalesced, so it might be
        # better to not pollute the caches with these small blocks
        self._module_copies = replicate(self.module, self.device_ids, detach=True)
        self._module_copies[0] = self.module

        for module_copy in self._module_copies[1:]:
            for param, copy_param in zip(self.module.parameters(),
                                         module_copy.parameters()):
                copy_param.requires_grad = param.requires_grad
    else:
        self._module_copies = [self.module]

    # .data() of each parameter for each model replica
    self.modules_params_data = [[] for _ in range(len(self.device_ids))]
    # .data() of each buffer for each model replica
    self.modules_buffers_data = [[] for _ in range(len(self.device_ids))]

    for dev_idx, module in enumerate(self._module_copies):
        self.modules_params_data[dev_idx] = [p.data for p in module.parameters()]
        self.modules_buffers_data[dev_idx] = [b.data for b in module.buffers()]

    bucket_bytes_cap = bucket_cap_mb * MB

    # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
    # Split the parameters into buckets and by types as well
    param_buckets = [list(_take_tensors(m.parameters(), bucket_bytes_cap))
                     for m in self._module_copies]

    self.bucket_sizes = []
    self.bucket_map = {}

    # We transpose param_buckets, so the loop is over buckets.
    # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
    for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
        self.bucket_sizes.append(0)
        # Now, we transpose again, so we iterate over bucket_elems, but getting tuples
        # of params from each device.
        for idx, param_tuple in enumerate(zip(*param_buckets_tuple)):
            if not param_tuple[0].requires_grad:
                continue
            for p in param_tuple:
                self.bucket_map[p] = (bucket_idx, idx)
            self.bucket_sizes[bucket_idx] += 1

    self.buckets = [[[None for _ in range(self.bucket_sizes[i])]
                     for _ in range(len(self.device_ids))]
                    for i in range(len(self.bucket_sizes))]
    # The number of params ready in each bucket
    self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))]
                               for i in range(len(self.bucket_sizes))]

    # coalesced bucket for only device 0
    self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]

    # We will always reduce the buckets following the reverse order,
    # that is, always reduce following the order of: n - 1, n - 2, ..., 0
    self.next_bucket = len(self.bucket_sizes) - 1
    self.ready_buckets_not_reduced = set()
    self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
    self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]

    # default stream tracking to launch nccl reduce kernels
    self.default_streams = []
    for dev_id in self.device_ids:
        with torch.cuda.device(dev_id):
            self.default_streams.append(torch.cuda.current_stream())

    self._register_grad_hooks()

def step(self, closure=None):
    args = self.args

    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']

        all_grads = []

        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            if self.compression_buffer == False:
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_((1 - momentum), d_p)
                    d_p.copy_(buf)
            all_grads.append(d_p)

        dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
        for dev_grads in dev_grads_buckets:
            d_p_new = _flatten_dense_tensors(dev_grads)

            if self.all_reduce:
                dist.all_reduce(d_p_new)  # self.all_gpu, group = 0
                if self.signum:
                    d_p_new = torch.sign(d_p_new)
            elif self.signum:
                if self.nodes > 1:
                    if self.compression_buffer:
                        d_p_new, tensor_size = self.compressor.compress(d_p_new)
                    else:
                        d_p_new = torch.sign(d_p_new)

                    if self.local_rank == 0:
                        if dist.get_rank() == 0:
                            d_p_new_list = []
                            for index, inter_node_group in enumerate(self.inter_node_group_list):
                                d_p_temp = d_p_new.clone()
                                dist.broadcast(d_p_temp, self.inter_node_list[index + 1],
                                               group=inter_node_group)
                                d_p_new_list.append(d_p_temp)
                        else:
                            dist.broadcast(d_p_new, dist.get_rank(),
                                           group=self.inter_node_group_list[self.nodes_rank - 1])

                        dist.barrier(group=self.all_inter_node_group)

                        if dist.get_rank() == 0:
                            if self.compression_buffer:
                                d_p_new_list.append(d_p_new)  # count itself
                                d_p_new = self.compressor.majority_vote(d_p_new_list)
                            else:
                                for d_p_temp in d_p_new_list:
                                    d_p_new.add_(d_p_temp)
                                d_p_new = d_p_new / self.nodes

                        dist.barrier(group=self.all_inter_node_group)
                        dist.broadcast(d_p_new, 0, group=self.all_inter_node_group)

                    if self.compression_buffer:
                        d_p_new = self.compressor.uncompress(d_p_new, tensor_size)
            else:
                print('You can not run without signum or all_reduce')

            # unflatten
            dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
            for grad, reduced in zip(dev_grads, dev_grads_new):
                grad.copy_(reduced)

        # LARC saving
        self.layer_adaptive_lr = []
        layer_index = 0
        # conv1.weight (no bias), bn1.weight, layer1.1.conv1.weight,
        # layer2.1.conv1.weight, layer3.1.conv1.weight, layer4.1.conv1.weight
        laryer_saving = [1, 2, 3, 23, 49, 87]
        ###
        for p in group['params']:
            layer_index += 1
            ###
            '''
            LARC
            This part of code was originally forked from
            (https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py)
            '''
            if args.larc_enable:
                trust_coefficient = args.larc_trust_coefficient
                clip = args.larc_clip
                eps = args.larc_eps
                param_norm = torch.norm(p.data)
                grad_norm = torch.norm(p.grad.data)

                if param_norm != 0 and grad_norm != 0:
                    # calculate adaptive lr + weight decay
                    adaptive_lr = trust_coefficient * (param_norm) / (
                        grad_norm + param_norm * weight_decay + eps)

                    # add adaptive lr saving
                    if layer_index in laryer_saving:
                        self.layer_adaptive_lr.append(adaptive_lr)

                    # clip learning rate for LARC
                    if clip:
                        # calculation of adaptive_lr so that when multiplied by lr
                        # it equals `min(adaptive_lr, lr)`
                        adaptive_lr = min(adaptive_lr / group['lr'], 1)
                    else:
                        adaptive_lr = adaptive_lr / group['lr']

                    p.grad.data *= adaptive_lr
            ###
            if self.compression_buffer:
                # This part of code is temporary
                if weight_decay != 0:
                    p.grad.data.add_(weight_decay, p.data)
            p.data.add_(-group['lr'], p.grad.data)

    return loss

def step(self, closure=None):
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']

        all_grads = []

        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            '''
            if weight_decay != 0:
                d_p.add_(weight_decay, p.data)
            '''
            if momentum != 0:
                # signum
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                else:
                    buf = param_state['momentum_buffer']
                buf.mul_(momentum).add_((1 - momentum), d_p)
                d_p.copy_(buf)
            all_grads.append(d_p)

        # torch.cuda.init()
        # torch.cuda.empty_cache()
        if not self.single_worker:
            self.all_time.set()
            self.bucketing_time.set()
            # start bucketing
            dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
            self.bucketing_time.record()

            for dev_grads in dev_grads_buckets:
                self.bucketing_time.set()
                d_p_new = _flatten_dense_tensors(dev_grads)
                # print('the size of each bucket', d_p_new.size())
                # os._exit(0)
                self.bucketing_time.record()

                # d_p_new = d_p.clone()
                if self.all_reduce:
                    self.all_reduce_time.set()
                    # torch.cuda.synchronize()
                    # d_p_new = torch.sign(d_p_new)
                    dist.all_reduce(d_p_new, group=0)  # self.all_gpu
                    # use boradcast_gather
                    # take the sign to test
                    # d_p_new = torch.sign(d_p_new)
                    # torch.cuda.synchronize()
                    self.all_reduce_time.record()
                else:
                    # print('once')
                    self.compress_all_time.set()
                    self.all_reduce_time.set()
                    # torch.cuda.synchronize()
                    if self.gpus_per_machine > 1:
                        dist.all_reduce(d_p_new,
                                        group=self.intra_node_group_list[self.nodes_rank])
                        dist.barrier(group=self.all_gpu)
                    self.all_reduce_time.record()

                    # leave compression
                    if self.nodes > 1:
                        self.compression_time.set()
                        # torch.cuda.synchronize()
                        if self.compression_buffer:
                            d_p_new, tensor_size = self.compressor.compress(d_p_new)
                        else:
                            d_p_new = torch.sign(d_p_new)
                        # torch.cuda.synchronize()
                        self.compression_time.record()
                        self.compress_all_time.record()

                        self.gather_all_time.set()
                        if self.local_rank == 0:
                            if dist.get_rank() == 0:
                                d_p_new_list = []
                                for index, inter_node_group in enumerate(self.inter_node_group_list):
                                    # print('gather', inter_node_list[index + 1])
                                    d_p_temp = d_p_new.clone()
                                    self.broadcast_time.set()
                                    dist.broadcast(d_p_temp, self.inter_node_list[index + 1],
                                                   group=inter_node_group)
                                    self.broadcast_time.record()
                                    d_p_new_list.append(d_p_temp)
                            else:
                                self.broadcast_time.set()
                                dist.broadcast(d_p_new, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                self.broadcast_time.record()
                                # print(dist.get_rank(), 'finish broadcast')

                            dist.barrier(group=self.all_inter_node_group)
                            self.gather_all_time.record()

                            self.calculate_all_time.set()
                            if dist.get_rank() == 0:
                                self.majority_vote_time.set()
                                if self.compression_buffer:
                                    d_p_new_list.append(d_p_new)  # count itself
                                    d_p_new = self.compressor.majority_vote(d_p_new_list)
                                else:
                                    for d_p_temp in d_p_new_list:
                                        d_p_new.add_(d_p_temp)
                                    d_p_new = d_p_new / self.nodes
                                # torch.cuda.synchronize()
                                self.majority_vote_time.record()
                                dist.barrier(group=self.all_inter_node_group)
                            self.calculate_all_time.record()

                            self.broadcast_all_time.set()
                            self.broadcast_time.set()
                            dist.broadcast(d_p_new, 0, group=self.all_inter_node_group)
                            self.broadcast_time.record()
                            # dist.barrier(group = self.all_inter_node_group)

                        # broadcast to all
                        # print('start broadcast')
                        # self.broadcast_time.set()
                        dist.broadcast(d_p_new, self.local_dst_in_global,
                                       group=self.intra_node_group_list[self.nodes_rank])
                        # self.broadcast_time.record()

                        self.uncompression_time.set()
                        # torch.cuda.synchronize()
                        if self.compression_buffer:
                            d_p_new = self.compressor.uncompress(d_p_new, tensor_size)
                        # torch.cuda.synchronize()
                        self.uncompression_time.record()
                        self.broadcast_all_time.record()
                # os._exit(0)

                self.debucketing_time.set()
                # unflatten
                dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
                for grad, reduced in zip(dev_grads, dev_grads_new):
                    grad.copy_(reduced)
                self.debucketing_time.record()

            self.all_time.record()

        self.update_para_time.set()
        # torch.cuda.synchronize()
        for p in group['params']:
            if weight_decay != 0:
                p.grad.data.add_(weight_decay, p.data)
            if self.single_worker and self.compression_buffer:
                p.data.add_(-group['lr'], torch.sign(p.grad.data))
            else:
                p.data.add_(-group['lr'], p.grad.data)
        # torch.cuda.synchronize()
        self.update_para_time.record()

    return loss

def __init__(self, module, device_ids=None, output_device=None, dim=0,
             broadcast_buffers=True):
    super(DistributedDataParallel, self).__init__()

    if device_ids is None:
        device_ids = list(range(torch.cuda.device_count()))
    if output_device is None:
        output_device = device_ids[0]
    self.dim = dim
    self.module = module
    self.device_ids = device_ids
    self.output_device = output_device
    self.broadcast_buffers = broadcast_buffers

    MB = 1024 * 1024

    # used for intra-node param sync and inter-node sync as well
    self.broadcast_bucket_size = 10 * MB

    # Sync params and buffers
    module_states = list(self.module.state_dict().values())
    if len(module_states) > 0:
        self._dist_broadcast_coalesced(module_states, self.broadcast_bucket_size)

    if len(device_ids) > 1:
        # TODO: we don't need to replicate params in here. they're always going to
        # be broadcasted using larger blocks in broadcast_coalesced, so it might be
        # better to not pollute the caches with these small blocks
        self._module_copies = replicate(self.module, self.device_ids)
        self._module_copies[0] = self.module

        for module_copy in self._module_copies[1:]:
            for param, copy_param in zip(self.module.parameters(),
                                         module_copy.parameters()):
                copy_param.detach_()
                copy_param.requires_grad = param.requires_grad
    else:
        self._module_copies = [self.module]

    # Currently NCCL backend only supports single reduction thread/bucket
    if dist._backend == dist.dist_backend.NCCL:
        bucket_bytes_cap = float('inf')
    else:
        bucket_bytes_cap = 1 * MB

    # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
    param_buckets = []
    # Split the parameters into buckets and by types as well
    for dev_idx, module in enumerate(self._module_copies):
        param_buckets.append(list(_take_tensors(module.parameters(), bucket_bytes_cap)))

    self.bucket_sizes = []
    self.bucket_map = {}

    param_types = set()

    # We transpose param_buckets, so the loop is over buckets.
    # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
    for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
        self.bucket_sizes.append(0)
        # Now, we transpose again, so we iterate over bucket_elems, but getting tuples
        # of params from each device.
        for idx, param_tuple in enumerate(zip(*param_buckets_tuple)):
            if idx == 0:
                # Bucket parameter type tracking
                bucket_param_type = param_tuple[0].type()
                param_types.add(bucket_param_type)
                # Only gloo and nccl support half-precision
                if bucket_param_type == torch.cuda.HalfTensor and \
                        dist._backend != dist.dist_backend.NCCL and \
                        dist._backend != dist.dist_backend.GLOO:
                    raise RuntimeError("DistributedDataParallel currently only "
                                       "supports half precision parameters "
                                       "with Nccl and Gloo backend")
            if not param_tuple[0].requires_grad:
                continue
            for p in param_tuple:
                self.bucket_map[p] = bucket_idx
            self.bucket_sizes[bucket_idx] += 1

    # TODO: add mixed precision support in the NCCL reduction code path.
    # This is because the NCCL backend doesn't support multiple reduction buckets.
    if len(param_types) > 1 and dist._backend == dist.dist_backend.NCCL:
        raise RuntimeError("DistributedDataParallel currently doesn't "
                           "support mixed precision type for NCCL backend")

    self.buckets = [[[] for _ in range(len(self.device_ids))]
                    for _ in range(len(self.bucket_sizes))]
    self.bucket_events = [[None] * len(self.device_ids)
                          for _ in range(len(self.bucket_sizes))]
    self.reduced = [False] * len(self.bucket_sizes)

    self._register_grad_hooks()

    self.dispatch_lock = threading.Lock()
    self._start_reduction_threads()

def step(self, closure=None):
    args = self.args

    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        cur_lr = group['lr']

        all_grads = []

        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            if self.compression_buffer == False:
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_(d_p)
                    d_p.add_(momentum, buf)
            all_grads.append(d_p)

        length = 0
        for _ in _take_tensors(all_grads, self.bucket_size):
            length += 1

        dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
        for i, dev_grads in enumerate(dev_grads_buckets):
            d_p_new = _flatten_dense_tensors(dev_grads)

            if len(self.err_buf) < length:
                self.err_buf.append(torch.zeros_like(d_p_new))
                self.server_err_buf.append(torch.zeros_like(d_p_new))

            err_buf = self.err_buf[i]
            server_err_buf = self.server_err_buf[i]

            d_p_new.add_(self.prev_lr / cur_lr, err_buf)
            p_buf = d_p_new

            if self.all_reduce:
                dist.all_reduce(d_p_new)  # self.all_gpu, group = 0
                if self.signum:
                    d_p_new = torch.sign(d_p_new)
            elif self.signum:
                if self.nodes > 1:
                    if self.compression_buffer:
                        d_p_new_scale = torch.ones(1)
                        d_p_new_scale[0] = d_p_new.abs().sum().cpu().item() / d_p_new.numel()
                        d_p_new, tensor_size = self.compressor.compress(d_p_new)

                        tmp = self.compressor.uncompress(d_p_new.clone(), tensor_size)
                        tmp.mul_(d_p_new_scale.item())
                        err_buf.copy_(p_buf).sub_(tmp)
                    else:
                        d_p_new = torch.sign(d_p_new)

                    if dist.get_rank() == 0:
                        d_p_new_list = []
                        d_p_new_scale_list = []
                        for index, inter_node_group in enumerate(self.inter_node_group_list):
                            d_p_temp = d_p_new.clone()
                            d_p_scale_temp = d_p_new_scale.clone()
                            dist.broadcast(d_p_scale_temp, self.inter_node_list[index + 1],
                                           group=inter_node_group)
                            dist.broadcast(d_p_temp, self.inter_node_list[index + 1],
                                           group=inter_node_group)
                            d_p_new_list.append(d_p_temp)
                            d_p_new_scale_list.append(d_p_scale_temp)
                    else:
                        dist.broadcast(d_p_new_scale, dist.get_rank(),
                                       group=self.inter_node_group_list[self.nodes_rank - 1])
                        dist.broadcast(d_p_new, dist.get_rank(),
                                       group=self.inter_node_group_list[self.nodes_rank - 1])

                    dist.barrier(group=self.all_inter_node_group)

                    if dist.get_rank() == 0:
                        if self.compression_buffer:
                            d_p_new_list.append(d_p_new)  # count itself
                            d_p_new_scale_list.append(d_p_new_scale)  # count itself
                            # d_p_new = self.compressor.majority_vote(d_p_new_list)
                            d_p_new = torch.zeros(tensor_size).cuda()
                            for d_p, d_p_scale in zip(d_p_new_list, d_p_new_scale_list):
                                tmp = self.compressor.uncompress(d_p, tensor_size)
                                d_p_new.add_(d_p_scale.item(), tmp)
                            d_p_new /= self.nodes

                            d_p_new.add_(self.prev_lr / cur_lr, server_err_buf)
                            un_compr = d_p_new

                            d_p_new_scale = torch.ones(1)
                            d_p_new_scale[0] = d_p_new.abs().sum().cpu().item() / d_p_new.numel()

                            d_p_new, _ = self.compressor.compress(d_p_new)

                            tmp = self.compressor.uncompress(d_p_new.clone(), tensor_size)
                            tmp.mul_(d_p_new_scale.item())
                            server_err_buf.copy_(un_compr).sub_(tmp)
                        else:
                            for d_p_temp in d_p_new_list:
                                d_p_new.add_(d_p_temp)
                            d_p_new = d_p_new / self.nodes

                    dist.barrier(group=self.all_inter_node_group)
                    dist.broadcast(d_p_new, 0, group=self.all_inter_node_group)
                    if self.compression_buffer:
                        dist.broadcast(d_p_new_scale, 0, group=self.all_inter_node_group)

                    if self.compression_buffer:
                        d_p_new = self.compressor.uncompress(d_p_new, tensor_size)
                        d_p_new.mul_(d_p_new_scale.item())
            else:
                print('You can not run without signum or all_reduce')

            # unflatten
            dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
            for grad, reduced in zip(dev_grads, dev_grads_new):
                grad.copy_(reduced)

        for p in group['params']:
            if self.compression_buffer:
                # This part of code is temporary
                if weight_decay != 0:
                    if momentum != 0:
                        param_state = self.state[p]
                        if 'wd_mom' not in param_state:
                            buf = param_state['wd_mom'] = torch.zeros_like(p.data)
                        else:
                            buf = param_state['wd_mom']
                        buf.mul_(momentum).add_(weight_decay, p.data)
                        p.grad.data.add_(momentum, buf)
                    p.grad.data.add_(weight_decay, p.data)
            p.data.add_(-group['lr'], p.grad.data)

        self.prev_lr = group['lr']

    return loss

def sync_buffers_bucket(self):
    buffers = [p.data for p in list(self.model._all_buffers())]
    for tensors in _take_tensors(buffers, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)