def allreduce_params():
    if self.needs_reduction:
        self.needs_reduction = False
        buckets = {}
        for name, param in self.module.named_parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(param)
        if self.warn_on_half:
            if torch.cuda.HalfTensor in buckets:
                print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                      " It is recommended to use the NCCL backend in this case.")
                self.warn_on_half = False
        for tp in buckets:
            bucket = buckets[tp]
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            torch.cuda.synchronize()
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
def _sync_params(self):
    if len(self.device_ids) > 1:
        # intra-node parameter sync
        params = [p.data for p in self.module.parameters()]
        result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size)
        for tensors, module in zip(result[1:], self._module_copies[1:]):
            for tensor, param in zip(tensors, module.parameters()):
                param.data.set_(tensor)

    buffers = list(self.module._all_buffers())
    if len(buffers) > 0:
        # cross-node buffer sync
        flat_buffers = _flatten_dense_tensors(buffers)
        dist.broadcast(flat_buffers, 0)
        for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
            buf.copy_(synced)

        if len(self.device_ids) > 1:
            # intra-node buffer sync
            result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size)
            for tensors, module in zip(result[1:], self._module_copies[1:]):
                for tensor, buf in zip(tensors, module._all_buffers()):
                    buf.set_(tensor)
def flat_dist_call(tensors, call, extra_args=None):
    flat_dist_call.warn_on_half = True
    buckets = {}
    for tensor in tensors:
        tp = tensor.type()
        if tp not in buckets:
            buckets[tp] = []
        buckets[tp].append(tensor)

    if flat_dist_call.warn_on_half:
        # tensor.type() returns a string, so the bucket keys are type names
        if "torch.cuda.HalfTensor" in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case.")
            flat_dist_call.warn_on_half = False

    for tp in buckets:
        bucket = buckets[tp]
        coalesced = _flatten_dense_tensors(bucket)
        if extra_args is not None:
            call(coalesced, *extra_args)
        else:
            call(coalesced)
        coalesced /= dist.get_world_size()
        for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
            buf.copy_(synced)
def _sync_params(self):
    groups = dict()
    for p in self.module.parameters():
        if not p.requires_grad or p.grad is None:
            continue
        if hasattr(p, "dp_comm"):
            dp_comm = p.dp_comm
        else:
            dp_comm = "dp"
        group_key = (dp_comm, p.dtype)
        if group_key not in groups:
            groups[group_key] = [p]
        else:
            groups[group_key].append(p)

    for (dp_comm, _), group in groups.items():
        if dp_comm not in self.comms:
            continue
        comm = self.comms[dp_comm]
        datas = [p.data for p in group]
        coalesced = _flatten_dense_tensors(datas)
        torch.distributed.broadcast(coalesced, 0, group=comm)
        torch.cuda.synchronize()
        synced = _unflatten_dense_tensors(coalesced, datas)
        for d, s in zip(datas, synced):
            d.copy_(s)
def nccl_allreduce_by_buckets(nc, kn, all_grads):
    # Now bucketing the parameters
    dev_grads_buckets = _take_tensors(all_grads, nccl_reduce_bucket_size)
    for grads_batch in dev_grads_buckets:
        grads_batch_coalesced = _flatten_dense_tensors(grads_batch)

        # NOTE: torch.cuda.synchronize()
        # NOTE: #nbutils.cuda_current_context().synchronize()
        # or, nc.stream_sync()
        sz = np.prod(grads_batch_coalesced.size())
        nc.do_all_reduce(grads_batch_coalesced.data_ptr(),
                         grads_batch_coalesced.data_ptr(), sz)
        nc.stream_sync()

        grads_batch_coalesced[:] = grads_batch_coalesced / float(kn)
        grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced, grads_batch)
        for grad, reduced in zip(grads_batch, grads_batch_reduced):
            grad.copy_(reduced)
def allreduce_params(no_scale=False, reduce_after=False, fp32_allreduce=False):
    groups = dict()
    for p in self.module.parameters():
        if not p.requires_grad or p.grad is None:
            continue
        if hasattr(p, "dp_comm"):
            dp_comm = p.dp_comm
        else:
            dp_comm = "dp"
        group_key = (dp_comm, p.dtype)
        if group_key not in groups:
            groups[group_key] = [p]
        else:
            groups[group_key].append(p)

    for (dp_comm, dtype), group in groups.items():
        if dp_comm not in self.comms:
            continue
        comm = self.comms[dp_comm]
        grads = [p.grad.data for p in group]
        coalesced = _flatten_dense_tensors(grads)
        if fp32_allreduce and dtype != torch.float32:
            coalesced = coalesced.float()
        if not no_scale and not reduce_after:
            coalesced /= comm.size()
        torch.distributed.all_reduce(coalesced, group=comm)
        torch.cuda.synchronize()
        if not no_scale and reduce_after:
            coalesced /= comm.size()
        synced = _unflatten_dense_tensors(coalesced, grads)
        for g, s in zip(grads, synced):
            g.copy_(s)
def _process_batch():
    dev_grad_batch, dev_events, job_event = queue.get()
    dev_coalesced = []
    # Coalesce the tensors on all devices and start a local reduction
    for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch,
                                                 dev_events, reduction_streams):
        with torch.cuda.device(dev_id), torch.cuda.stream(stream):
            stream.wait_event(event)
            coalesced = _flatten_dense_tensors(grad_batch)
            dev_coalesced.append(coalesced)
    # Wait for all copies to complete before starting the NCCL kernel
    for stream in reduction_streams:
        stream.synchronize()
    nccl.reduce(dev_coalesced, root=0, streams=nccl_streams)

    # From now on we're only going to work on the first device (from device_ids)
    grad_batch = dev_grad_batch[0]
    coalesced = dev_coalesced[0]
    reduce_stream = reduction_streams[0]
    with torch.cuda.stream(reduce_stream):
        reduce_stream.wait_stream(nccl_streams[0])
        coalesced /= dist.get_world_size()
        dist.all_reduce(coalesced, group=group_id)
        for grad, reduced in zip(grad_batch, _unflatten_dense_tensors(coalesced, grad_batch)):
            grad.copy_(reduced)
    job_event.set()
def _sync_reduction_works(self):
    # Now only work on the first GPU of self.device_ids, uncoalesce
    # the gradients for each bucket
    for bucket_idx, grads_batch in enumerate(self.buckets):
        # wait will let current stream wait on the c10d reduction stream
        self.reduction_works[bucket_idx].wait()

        self.buckets_coalesced[bucket_idx] /= self.process_group.size()
        grads_batch_reduced = _unflatten_dense_tensors(
            self.buckets_coalesced[bucket_idx], grads_batch[0])

        for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
            grad.copy_(reduced)

    # Reset the module states
    self.next_bucket = len(self.bucket_sizes) - 1
    self.ready_buckets_not_reduced = set()
    self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
    self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]
    self.buckets = [[[None for _ in range(self.bucket_sizes[i])]
                     for _ in range(len(self.device_ids))]
                    for i in range(len(self.bucket_sizes))]
    self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]
    self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))]
                               for i in range(len(self.bucket_sizes))]
def _sync_reduction_works(self):
    # Now only work on the first GPU of self.device_ids
    # _sync_reduction will use a separate CUDA stream to uncoalesce
    # the coalesced tensors to achieve more parallelism
    temp = [None for _ in range(self.parameter_length)]
    for p in self.module.parameters():
        if p.requires_grad:
            bucket_idx = self.bucket_map[p]
            temp[bucket_idx] = p.grad.data
    flatten_tensor = _flatten_dense_tensors(temp)
    self.buckets = flatten_tensor[self.mask[0]] / self.process_group.size()
    dist.all_reduce(self.buckets, async_op=False)
    temp_zero = torch.zeros(self.flat_parameter.shape, device=self.device_id)
    temp_zero[self.mask[0]] = self.buckets
    dense_tensor = _unflatten_dense_tensors(temp_zero, temp)
    for p in self.module.parameters():
        if p.requires_grad:
            bucket_idx = self.bucket_map[p]
            p.grad.data.copy_(dense_tensor[bucket_idx])

    # Reset the module states
    self.next_bucket = len(self.bucket_sizes) - 1
    self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
    self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]
    self.buckets = [None]
    self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]
    self.buckets_ready_size = [0 for i in range(len(self.bucket_sizes))]
def master2model(model_params:Sequence[Tensor], master_params:Sequence[Tensor], flat_master:bool=False):
    "Copy master parameters to model parameters"
    if flat_master:
        for model, master in zip(model_params,
                                 _unflatten_dense_tensors(master_params[0].data, model_params)):
            model.data.copy_(master)
    else:
        for model, master in zip(model_params, master_params):
            model.data.copy_(master.data)
def allreduce_gradients(self):
    """Reduce gradients across data parallel ranks."""
    # If we have buffers, simply reduce the data in the buffer.
    if self._grad_buffers is not None:
        for _, buffer_ in self._grad_buffers.items():
            buffer_.data /= mpu.get_data_parallel_world_size()
            torch.distributed.all_reduce(
                buffer_.data, group=mpu.get_data_parallel_group())
    else:
        # Otherwise, bucketize and all-reduce
        buckets = {}
        # Pack the buckets.
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = param.data.type()
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(param)
                param.main_grad = param.grad

        # For each bucket, all-reduce and copy all-reduced grads.
        for tp in buckets:
            bucket = buckets[tp]
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            coalesced /= mpu.get_data_parallel_world_size()
            torch.distributed.all_reduce(
                coalesced, group=mpu.get_data_parallel_group())
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
def _dist_broadcast_coalesced(self, tensors, buffer_size):
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, 0)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)
def allreduce_params(self, reduce_after=True, no_scale=False, fp32_allreduce=False):
    # adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/distributed.py
    buckets = {}
    for param in self.all_parameters:
        if param.requires_grad and param.grad is not None:
            tp = param.data.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(param)

    for tp in buckets:
        bucket = buckets[tp]
        grads = [param.grad.data for param in bucket]
        coalesced = _flatten_dense_tensors(grads)
        if fp32_allreduce:
            coalesced = coalesced.float()
        if not no_scale and not reduce_after:
            coalesced /= dist.get_world_size(group=self.data_parallel_group)
        dist.all_reduce(coalesced, group=self.data_parallel_group)
        torch.cuda.synchronize()
        if not no_scale and reduce_after:
            coalesced /= dist.get_world_size(group=self.data_parallel_group)
        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
            buf.copy_(synced)
def _copy_params_fp32_to_fp16(self):
    for fp16_group, fp32_group in zip(self.fp16_param_groups, self.fp32_param_groups):
        for fp16_param, fp32_param in zip(
                fp16_group, _unflatten_dense_tensors(fp32_group, fp16_group)):
            fp16_param.data.copy_(fp32_param.data)
def step(self, closure=None):
    """
    Not supporting closure.
    """
    # First compute norm for all groups so we know if there is overflow
    grads_groups_flat = []
    norm_groups = []
    skip = False
    for i, group in enumerate(self.fp16_groups):
        grads_groups_flat.append(_flatten_dense_tensors([p.grad for p in group]))
        norm_groups.append(self._compute_grad_norm(grads_groups_flat[i]))
        if norm_groups[i] == -1:  # TODO: early break
            skip = True

    if skip:
        self._update_scale(skip)
        return

    # norm is in fact norm*cur_scale
    self.optimizer.step(grads=[[g] for g in grads_groups_flat],
                        output_params=[[p] for p in self.fp16_groups_flat],
                        scale=self.cur_scale,
                        grad_norms=norm_groups)

    # TODO: we probably don't need this? just to be safe
    for i in range(len(norm_groups)):
        updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data = q.data

    self._update_scale(False)
    return
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = []  # shape (num_tensors, num_gpus)
    output = []
    for tensor_at_gpus in zip(*inputs):
        if tensor_at_gpus[0].is_sparse:
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
        else:
            dense_tensors.append(tensor_at_gpus)
    itrs = [_take_tensors(tensors, buffer_size) for tensors in zip(*dense_tensors)]
    for chunks in zip(*itrs):
        tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        result = reduce_add(tensors, destination)
        output.extend(_unflatten_dense_tensors(result, chunks[0]))
    return tuple(_reorder_tensors_as(output, inputs[0]))
def reduction_fn_nccl():
    # This function only needs to be called once
    if not self.need_reduction:
        return

    self.need_reduction = False
    all_grads = [[] for _ in range(len(self._module_copies))]
    all_grads_buckets_iters = []

    # Bucketing all the gradients
    for dev_idx, module in enumerate(self._module_copies):
        for param in module.parameters():
            if not param.requires_grad or param.grad is None:
                continue
            if param.grad.requires_grad:
                raise RuntimeError("DistributedDataParallel only works "
                                   "with gradients that don't require grad")
            # Adding the gradients for reduction
            all_grads[dev_idx].append(param.grad.data)

        # Now bucketing the parameters
        dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                          self.nccl_reduce_bucket_size)
        all_grads_buckets_iters.append(dev_grads_buckets)

    # Now reduce each bucket one after another
    for grads_batch in zip(*all_grads_buckets_iters):
        grads_batch_coalesced = []
        # Coalesce each bucket
        for dev_idx, dev_grads_batch in enumerate(grads_batch):
            dev_id = self.device_ids[dev_idx]
            with torch.cuda.device(dev_id):
                dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
                grads_batch_coalesced.append(dev_grads_batch_coalesced)

        # We will only use device 0's results, but this single op should be
        # faster than doing the following two operations sequentially:
        # (1) intra-node reduce to lead GPU, followed by
        # (2) inter-node allreduce for all the first lead GPUs in all nodes
        dist.all_reduce_multigpu(grads_batch_coalesced,
                                 group=self.nccl_reduction_group_id)

        # Now only work on the first device of self.device_ids, uncoalesce
        # the gradients for each bucket
        grads_batch_coalesced[0] /= dist.get_world_size()
        grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0],
                                                       grads_batch[0])
        for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
            grad.copy_(reduced)

    # clear the gradients and save memory for replicas
    for module in self._module_copies[1:]:
        for param in module.parameters():
            if param.requires_grad:
                param.grad = None
                param.data.set_()
def sync_params_bucket(self):
    params = [p.data for p in list(self.model.parameters())]
    for tensors in _take_tensors(params, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, src=0)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)
def all_reduce_coalesced(tensors, divisor=1, op=ReduceOp.SUM, buffer_size=256 * MB):
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.all_reduce(flat_tensors, op)
        if divisor != 1:
            flat_tensors.div_(divisor)
        for old_t, new_t in zip(tensors,
                                _unflatten_dense_tensors(flat_tensors, tensors)):
            old_t.data = new_t
def _broadcast_coalesced(tensors, bucket_size_mb=-1):
    buckets = _get_coalesced_bucket(tensors, bucket_size_mb)
    for tensors in buckets:
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, 0)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)
def _copy_params_fp32_to_fp16(self):
    for fp16_group, fp32_group in zip(self.fp16_param_groups,
                                      self.fp32_flattened_groups):
        if len(fp16_group) > 0:
            for fp16_param, fp32_data in zip(
                    fp16_group, _unflatten_dense_tensors(fp32_group.data, fp16_group)):
                fp16_param.data.copy_(fp32_data)
def sync_grads_bucket(self):
    grads = [p.grad.data for p in list(self.model.parameters()) if p.requires_grad]
    for tensors in _take_tensors(grads, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        new_all_reduce(flat_tensors, cuda=self.cuda)
        flat_tensors.div_(self.world_size)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    buckets = _get_coalesced_bucket(tensors, bucket_size_mb)
    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(bucket,
                                  _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
def sync_buffers_bucket(self):
    buffers = [p.data for p in list(self.model._all_buffers())]
    for tensors in _take_tensors(buffers, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        flat_tensors.zero_()
        dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
        flat_tensors.div_(self.num_workers)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)
def master2model(model_params:Sequence[Tensor], master_params:Sequence[Tensor],
                 flat_master:bool=False)->None:
    "Copy `master_params` to `model_params`."
    if flat_master:
        for model_group, master_group in zip(model_params, master_params):
            if len(model_group) != 0:
                for model, master in zip(model_group,
                                         _unflatten_dense_tensors(master_group[0].data, model_group)):
                    model.data.copy_(master)
    else:
        for model_group, master_group in zip(model_params, master_params):
            for model, master in zip(model_group, master_group):
                model.data.copy_(master.data)
def __init__(self, init_optimizer, static_loss_scale=1.0, dynamic_loss_scale=False,
             dynamic_loss_args=None, verbose=True):
    # The fused optimizer does all the work. We need this layer for two reasons:
    #   1. maintain same user API from apex.fp16_utils
    #   2. keep common stuff here in case we need to add new fused optimizer later
    # differences from apex.fp16_utils:
    #   - assume all model params in fp16
    #   - assume all params require grad
    #   - flat by groups, not keeping state. TODO: remove state explicitly?
    #   - master grad and unflat master weight never exist. TODO: a way to save out unflat master?
    if not torch.cuda.is_available():
        raise SystemError("Cannot use fp16 without CUDA.")
    self.optimizer = init_optimizer

    # param flattened by groups
    self.fp16_groups = []
    self.fp16_groups_flat = []
    self.fp32_groups_flat = []

    # loop to deal with groups
    for i, param_group in enumerate(self.optimizer.param_groups):
        # push this group to list before modifying it
        self.fp16_groups.append(param_group['params'])
        # init fp16 weight buffer, flattened
        self.fp16_groups_flat.append(
            _flatten_dense_tensors([p.clone().detach() for p in self.fp16_groups[i]]))
        # set model fp16 weight to slices of flattened buffer
        updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data = q.data
        # init master weight, flattened
        self.fp32_groups_flat.append(self.fp16_groups_flat[i].clone().float().detach())
        # modify optimizer to have flat master weight
        self.fp32_groups_flat[i].requires_grad = True  # keep this in case internal optimizer uses it
        param_group['params'] = [self.fp32_groups_flat[i]]

    # we may have a way of fusing dynamic scale. Do not support for now
    if dynamic_loss_scale:
        if dynamic_loss_args is not None:
            raise SystemError("Do not support dynamic loss scale args for now.")
        self.dynamic_loss_scale = True
        self.cur_scale = 2**32
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = 2
        self.scale_window = 1000
    else:
        self.dynamic_loss_scale = False
        self.cur_iter = 0
        self.cur_scale = static_loss_scale
def to_model_params(model_params, master_params, flat_master: bool = False) -> None:
    if flat_master:
        for model, master in zip(
                model_params,
                _unflatten_dense_tensors(master_params[0].data, model_params)):
            model.data.copy_(master)
    else:
        for model, master in zip(model_params, master_params):
            model.data.copy_(master.data)
def all_gather_multigpu(output_tensor_lists, input_tensor_list, group=group.WORLD):
    """Gathers tensors from the whole group in a list.
    Each tensor in tensor_list should reside on a separate GPU

    Only nccl backend is currently supported
    tensors should only be GPU tensors

    Arguments:
        output_tensor_lists (List[List[Tensor]]): Output lists. It should
            contain correctly-sized tensors on each GPU to be used for output of
            the collective.
            e.g. output_tensor_lists[i] contains the all_gather
            result that resides on the GPU of input_tensor_list[i].
            Note that each element of output_tensor_lists[i] has the size of
            world_size * len(input_tensor_list), since the function all
            gathers the result from every single GPU in the group. To interpret
            each element of output_tensor_list[i], note that input_tensor_list[j]
            of rank k will appear in output_tensor_list[i][rank * world_size + j]
            Also note that len(output_tensor_lists), and the size of each
            element in output_tensor_lists (each element is a list,
            therefore len(output_tensor_lists[i])), need to be the same
            for all the distributed processes calling this function.
        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
            be broadcast from current process.
            Note that len(input_tensor_list) needs to be the same for
            all the distributed processes calling this function.
        group (optional): Group of the collective.
    """
    assert torch.distributed._initialized == _INITIALIZED_PG, \
        "collective only supported in process-group mode"

    flatten_tensor_list = []
    for output_tensor_list in output_tensor_lists:
        flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list))

    ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list,
                                             input_tensor_list,
                                             group)

    for output_tensor_list, flatten_tensor in zip(output_tensor_lists,
                                                  flatten_tensor_list):
        for tensor, value in zip(output_tensor_list,
                                 _unflatten_dense_tensors(flatten_tensor, output_tensor_list)):
            tensor.copy_(value)

    return ret
def sync_grads_bucket(self):
    grads = [p.grad.data for p in list(self.model.parameters()) if p.requires_grad]
    for tensors in _take_tensors(grads, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
        flat_tensors.div_(self.num_workers)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)
def all_gather_coalesced(tensors, buffer_size=256 * MB):
    assert dist.get_backend() == dist.dist_backend.NCCL  # gloo gives some weird device error
    world_size = dist.get_world_size()
    rcv_lsts = [[] for _ in range(world_size)]
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        tmp_rcv_lst = [torch.empty_like(flat_tensors) for _ in range(world_size)]
        dist.all_gather(tmp_rcv_lst, flat_tensors)
        for i, rcv_flat_tensors in enumerate(tmp_rcv_lst):
            for rcv_t in _unflatten_dense_tensors(rcv_flat_tensors, tensors):
                rcv_lsts[i].append(rcv_t)
    return rcv_lsts
def _dist_broadcast_coalesced(self, tensors, buffer_size):
    """
    Broadcast a sequence of tensors to the default group from rank 0.
    Small tensors are first coalesced into a buffer to reduce the number
    of broadcasts.

    tensors (sequence): tensors to broadcast. Each tensor needs to be on the
                        same GPU.
    buffer_size (int): maximum size of the buffer for coalescing
    """
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, 0)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)
def allreduce_params():
    if self.needs_reduction:
        self.needs_reduction = False
        buckets = defaultdict(list)
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                buckets[tp].append(param)

        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
def all_gather_multigpu(output_tensor_lists, input_tensor_list, group=group.WORLD):
    """Gathers tensors from the whole group in a list.
    Each tensor in tensor_list should reside on a separate GPU

    Only nccl backend is currently supported
    tensors should only be GPU tensors

    Arguments:
        output_tensor_lists (List[List[Tensor]]): Output lists. It should
            contain correctly-sized tensors on each GPU to be used for output of
            the collective.
        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
            be broadcast from current process.
        group (optional): Group of the collective.
    """
    assert torch.distributed._initialized == _INITIALIZED_PG, \
        "collective only supported in process-group mode"

    warnings.warn("""
    ================================================================================
                                        WARNING
    ================================================================================
    all_gather_multigpu is still experimental. The API will change without
    notice and we can't guarantee full correctness and expected performance yet.
    We'll announce it once it's ready.
    """)

    flatten_tensor_list = []
    for output_tensor_list in output_tensor_lists:
        flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list))

    ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list,
                                             input_tensor_list,
                                             group)

    for output_tensor_list, flatten_tensor in zip(output_tensor_lists,
                                                  flatten_tensor_list):
        for tensor, value in zip(output_tensor_list,
                                 _unflatten_dense_tensors(flatten_tensor, output_tensor_list)):
            tensor.copy_(value)

    return ret
def _sync_params(self):
    params = [p.data for p in self.module.parameters()]
    result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size)
    for tensors, module in zip(result[1:], self._module_copies[1:]):
        for tensor, param in zip(tensors, module.parameters()):
            param.data.set_(tensor)

    buffers = list(self.module._all_buffers())
    if len(buffers) > 0:
        # cross-node buffer sync
        flat_buffers = _flatten_dense_tensors(buffers)
        dist.broadcast(flat_buffers, 0)
        for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
            buf.copy_(synced)

        # intra-node buffer sync
        result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size)
        for tensors, module in zip(result[1:], self._module_copies[1:]):
            for tensor, buf in zip(tensors, module._all_buffers()):
                buf.set_(tensor)
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence of tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
            Note that it should be like (src, dst1, dst2, ...), the first element
            of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of the ``tensor``, placed on devices
        corresponding to indices from ``devices``.
    """
    for tensor in tensors:
        if tensor.get_device() != devices[0]:
            raise RuntimeError('all tensors must be on devices[0]')

    outputs = [[] for _ in devices]
    # use the original tensors for the first device
    outputs[0].extend(tensors)

    for chunk in _take_tensors(tensors, buffer_size):
        if chunk[0].is_sparse:
            flat_indices, flat_values = _flatten_sparse_tensors(chunk)
            result_indices = broadcast(flat_indices, devices)
            result_values = broadcast(flat_values, devices)
            unflat_results = tuple(_unflatten_sparse_tensors(iv, chunk)
                                   for iv in zip(result_indices, result_values))
        else:
            flat = _flatten_dense_tensors(chunk)
            results = broadcast(flat, devices)
            unflat_results = tuple(_unflatten_dense_tensors(tensor, chunk)
                                   for tensor in results)
        # use the broadcasted tensors for the remaining devices
        for dst, unflat_res in zip(outputs[1:], unflat_results[1:]):
            dst.extend(unflat_res)

    for i, output in enumerate(outputs):
        outputs[i] = _reorder_tensors_as(output, tensors)
    return tuple(outputs)
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse ones first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        flat_result = reduce_add(flat_tensors, destination)
        output.extend(_unflatten_dense_tensors(flat_result, chunks[0]))
    return tuple(_reorder_tensors_as(output, ref_order))
def allreduce_params():
    if self.needs_reduction:
        self.needs_reduction = False
        buckets = {}
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(param)

        if self.warn_on_half:
            if torch.cuda.HalfTensor in buckets:
                print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                      " It is recommended to use the NCCL backend in this case.")
                self.warn_on_half = False

        for tp in buckets:
            bucket = buckets[tp]
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
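# All of the snippets above share one coalesced-collective pattern: group tensors
# into buckets, flatten each bucket into a single contiguous buffer, run one
# collective per bucket, then unflatten and copy the results back in place.
# Below is a minimal, self-contained sketch of that pattern. The helper name
# `allreduce_coalesced_` and the 1 MB bucket size are illustrative assumptions,
# not taken from any of the snippets above.
import torch
import torch.distributed as dist
from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors,
                          _take_tensors)

def allreduce_coalesced_(tensors, bucket_size=1024 * 1024):
    """Average `tensors` in place across all ranks, one bucket at a time."""
    world_size = dist.get_world_size()
    for bucket in _take_tensors(tensors, bucket_size):
        flat = _flatten_dense_tensors(bucket)   # one contiguous buffer per bucket
        dist.all_reduce(flat)                   # single collective for the whole bucket
        flat.div_(world_size)                   # turn the sum into an average
        for t, synced in zip(bucket, _unflatten_dense_tensors(flat, bucket)):
            t.copy_(synced)                     # write the reduced values back in place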