def _process_batch():
    """Reduce one bucket of per-device gradients across devices and processes.

    Pulls a single work item from ``queue``, coalesces each device's gradient
    batch on its own reduction stream, NCCL-reduces the coalesced buffers to
    device 0, then all-reduces device 0's buffer across processes and copies
    the averaged values back into the original gradient tensors.

    Operates entirely on free variables from the enclosing scope:
    ``queue``, ``device_ids``, ``reduction_streams``, ``nccl_streams`` and
    ``group_id`` — NOTE(review): their exact types are assumed from usage;
    confirm against the enclosing function.
    """
    # One work item: per-device gradient batches, the CUDA events marking
    # when those gradients are ready, and a threading event to signal "done".
    dev_grad_batch, dev_events, job_event = queue.get()
    dev_coalesced = []
    # Coalesce the tensors on all devices and start a local reduction
    for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams):
        with torch.cuda.device(dev_id), torch.cuda.stream(stream):
            # Don't flatten until the producer's event says the grads are ready.
            stream.wait_event(event)
            coalesced = _flatten_dense_tensors(grad_batch)
            dev_coalesced.append(coalesced)
    # Wait for all copies to complete before starting the NCCL kernel
    for stream in reduction_streams:
        stream.synchronize()
    # Intra-node step: sum every device's coalesced buffer into device 0.
    nccl.reduce(dev_coalesced, root=0, streams=nccl_streams)

    # From now on we're only going to work on the first device (from device_ids)
    grad_batch = dev_grad_batch[0]
    coalesced = dev_coalesced[0]
    reduce_stream = reduction_streams[0]
    with torch.cuda.stream(reduce_stream):
        # Order the reduction stream after the NCCL kernel's stream.
        reduce_stream.wait_stream(nccl_streams[0])
        # Pre-divide so the subsequent all_reduce (a sum) yields the mean.
        coalesced /= dist.get_world_size()
        # Inter-process step: all-reduce the already locally-reduced buffer.
        dist.all_reduce(coalesced, group=group_id)
        # Scatter the flat reduced buffer back into the original grad tensors.
        for grad, reduced in zip(grad_batch, _unflatten_dense_tensors(coalesced, grad_batch)):
            grad.copy_(reduced)
    # Signal the producer that this bucket has been fully reduced.
    job_event.set()
def reduction_fn_nccl():
    """Bucket, coalesce and all-reduce gradients across all replicas.

    Collects the gradients of every module copy, splits them into
    size-bounded buckets, and performs one multi-GPU all-reduce per bucket.
    Only device 0's results are kept; replica gradients are freed afterwards.
    Closure over ``self`` from the enclosing method.
    """
    # This function only needs to be called once
    if not self.need_reduction:
        return
    self.need_reduction = False
    all_grads = [[] for _ in range(len(self._module_copies))]
    all_grads_buckets_iters = []
    # Bucketing all the gradients
    for dev_idx, module in enumerate(self._module_copies):
        for param in module.parameters():
            if not param.requires_grad or param.grad is None:
                continue
            if param.grad.requires_grad:
                raise RuntimeError("DistributedDataParallel only works "
                                   "with gradients that don't require "
                                   "grad")
            # Adding the gradients for reduction
            all_grads[dev_idx].append(param.grad.data)
        # Now bucketing the parameters
        dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                          self.nccl_reduce_bucket_size)
        all_grads_buckets_iters.append(dev_grads_buckets)
    # Now reduce each bucket one after another.  NOTE(review): this relies on
    # _take_tensors producing buckets in the same order on every device —
    # zip() pairs the i-th bucket of each device; confirm that invariant.
    for grads_batch in zip(*all_grads_buckets_iters):
        grads_batch_coalesced = []
        # Coalesce each bucket
        for dev_idx, dev_grads_batch in enumerate(grads_batch):
            dev_id = self.device_ids[dev_idx]
            with torch.cuda.device(dev_id):
                dev_grads_batch_coalesced = _flatten_dense_tensors(
                    dev_grads_batch)
                grads_batch_coalesced.append(dev_grads_batch_coalesced)
        # We will only use device 0's results, but this single op should be
        # faster than doing the following two operation sequentially:
        # (1) intra-node reduce to lead GPU, followed by
        # (2) inter-node allreduce for all the first lead GPUs in all nodes
        dist.all_reduce_multigpu(grads_batch_coalesced,
                                 group=self.nccl_reduction_group_id)
        # Now only work on the first device of self.device_ids, uncoalesce
        # the gradients for each bucket.  Divide by world size so the summed
        # result becomes an average.
        grads_batch_coalesced[0] /= dist.get_world_size()
        grads_batch_reduced = _unflatten_dense_tensors(
            grads_batch_coalesced[0], grads_batch[0])
        for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
            grad.copy_(reduced)
    # clear the gradients and save memory for replicas
    for module in self._module_copies[1:]:
        for param in module.parameters():
            if param.requires_grad:
                param.grad = None
                # set_() with no args detaches the storage, freeing memory.
                param.data.set_()
def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None):
    """
    Samples batches assuming they are in order of size to batch similarly
    sized samples together.
    """
    super(DistributedBucketingSampler, self).__init__(data_source)
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.data_source = data_source
    self.batch_size = batch_size
    self.num_replicas = num_replicas
    self.rank = rank
    # Contiguous sample ids, chunked into fixed-size bins of batch_size.
    self.ids = list(range(len(data_source)))
    self.bins = []
    for start in range(0, len(self.ids), batch_size):
        self.bins.append(self.ids[start:start + batch_size])
    # Each replica gets ceil(#bins / #replicas) bins; pad to a multiple.
    self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
def test_send_recv(self):
    """Every rank sends its tensor to every peer, then receives and
    verifies one tensor from each peer."""
    my_rank = dist.get_rank()
    world = dist.get_world_size()
    outgoing = _build_tensor(my_rank + 1)
    # Fan out: one send per peer (skip self).
    for dst in range(world):
        if dst != my_rank:
            dist.send(outgoing, dst)
    # Fan in: receive from each peer into a poisoned buffer and compare.
    for src in range(world):
        if src == my_rank:
            continue
        received = _build_tensor(src + 1, value=-1)
        expected = _build_tensor(src + 1)
        dist.recv(received, src)
        self.assertEqual(received, expected)
    self._barrier()
def get_dist_info():
    """Return ``(rank, world_size)``, falling back to a single-process
    default of ``(0, 1)`` when the distributed package is not initialized."""
    if not dist._initialized:
        return 0, 1
    return dist.get_rank(), dist.get_world_size()
def test_send_recv_any_source(self):
    """Send to every peer, then recv with no source filter and check that
    each message's content matches the rank recv() reports."""
    my_rank = dist.get_rank()
    world = dist.get_world_size()
    outgoing = _build_tensor(10, my_rank)
    for dst in range(world):
        if dst == my_rank:
            continue
        dist.send(outgoing, dst)
    senders = set()
    # Expect exactly one message from each of the world-1 peers.
    for _ in range(world - 1):
        incoming = _build_tensor(10, value=-1)
        sender = dist.recv(incoming)
        # Payload was filled with the sender's rank, so every element
        # must equal the rank recv() says it came from.
        self.assertTrue(incoming.eq(sender).all())
        senders.add(sender)
    self.assertEqual(len(senders), world - 1)
    self._barrier()
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    """Distributed sampler constructor.

    Args:
        dataset: dataset to sample from (must support ``len``).
        num_replicas: number of processes participating; defaults to the
            distributed world size.
        rank: rank of the current process; defaults to the distributed rank.
        shuffle: whether the sampler shuffles indices each epoch.

    Raises:
        RuntimeError: if ``num_replicas``/``rank`` must be inferred but the
            distributed package is unavailable.
    """
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    # Each replica draws ceil(len(dataset) / num_replicas) samples, so the
    # padded total is a multiple of num_replicas.
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    # BUG FIX: was hard-coded `self.shuffle = True`, silently ignoring the
    # `shuffle` argument — shuffle=False had no effect.
    self.shuffle = shuffle
def allreduce_params():
    """Average the module's gradients across all workers, at most once per
    backward pass (guarded by ``self.needs_reduction``)."""
    if not self.needs_reduction:
        return
    self.needs_reduction = False
    # Group parameters by the concrete tensor type of their data so each
    # coalesced buffer is homogeneous.
    buckets = defaultdict(list)
    for p in self.module.parameters():
        if p.requires_grad and p.grad is not None:
            buckets[type(p.data)].append(p)
    for params in buckets.values():
        grads = [p.grad.data for p in params]
        # Flatten -> sum across workers -> average -> scatter back in place.
        flat = _flatten_dense_tensors(grads)
        dist.all_reduce(flat)
        flat /= dist.get_world_size()
        for grad, synced in zip(grads, _unflatten_dense_tensors(flat, grads)):
            grad.copy_(synced)
def test_isend(self):
    """Rank 0 isends a distinct tensor to every other rank; receivers
    verify the payload."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    if rank != 0:
        # Receive into a poisoned buffer and check it was overwritten.
        buf = _build_tensor(rank, -1)
        dist.recv(buf, 0)
        self.assertEqual(buf, _build_tensor(rank, 10))
    else:
        # Launch all non-blocking sends first, then drain them in order.
        pending = []
        for dst in range(1, world_size):
            pending.append(dist.isend(_build_tensor(dst, 10), dst))
        for req in pending:
            req.wait()
            self.assertTrue(req.is_completed())
    self._barrier()
def test_irecv(self):
    """Rank 0 posts a non-blocking receive per peer; every other rank
    sends once, then rank 0 waits and verifies each payload."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    if rank != 0:
        dist.send(_build_tensor(rank, 10), 0)
    else:
        # Post all irecvs up front into poisoned buffers, one per sender.
        buffers = {src: _build_tensor(src, -1) for src in range(1, world_size)}
        requests = {src: dist.irecv(buffers[src], src) for src in range(1, world_size)}
        # Drain in source order and verify each buffer was filled.
        for src in range(1, world_size):
            requests[src].wait()
            self.assertTrue(requests[src].is_completed())
            self.assertEqual(buffers[src], _build_tensor(src, 10))
    self._barrier()
def _init_multigpu_helper(self):
    """Multigpu tests are designed to simulate the multi nodes with multi
    GPUs on each node. Nccl backend requires equal #GPUs in each process.
    On a single node, all visible GPUs are evenly
    divided to subsets, each process only uses a subset.
    """
    n_gpus = torch.cuda.device_count()
    world_size = dist.get_world_size()
    visible_devices = range(n_gpus)

    if BACKEND == "nccl":
        apply_hack_for_nccl()

    # Evenly partition the visible devices into contiguous per-rank slices.
    per_process = n_gpus // world_size
    rank_to_GPU = {}
    for proc_rank in range(world_size):
        lo = proc_rank * per_process
        rank_to_GPU[proc_rank] = list(visible_devices[lo:lo + per_process])
    return rank_to_GPU
def test_get_rank(self):
    """Each process writes its rank to a file named after its pid; then all
    processes check that exactly world_size distinct ranks were recorded."""
    test_dir = os.path.join(TEMP_DIR, "test_dir")
    num_processes = dist.get_world_size()
    # Publish this process's rank under its pid.
    with open(os.path.join(test_dir, str(os.getpid())), "w") as f:
        f.write(str(dist.get_rank()))
    self._barrier()

    # Collect every published rank and verify they are all distinct.
    collected = set()
    for name in os.listdir(test_dir):
        with open(os.path.join(test_dir, name), "r") as f:
            collected.add(int(f.read()))
    self.assertEqual(len(collected), num_processes)
    self._barrier()

    # Rank 0 cleans up the scratch files once everyone has read them.
    if dist.get_rank() == 0:
        for name in os.listdir(test_dir):
            os.unlink(os.path.join(test_dir, name))
    self._barrier()
def run(): modell = model.CNN() # modell = model.AlexNet() size = dist.get_world_size() rank = dist.get_rank() group_list = [] for i in range(size): group_list.append(i) group = dist.new_group(group_list) while (1): for param in modell.parameters(): # for dst in range(1, size): # dist.send(param.data, dst=dst) dist.broadcast(param.data, src=0, group=group) for param in modell.parameters(): tensor_temp = torch.zeros_like(param.data) dist.reduce(tensor_temp, dst=0, op=dist.reduce_op.SUM, group=group) param.data = tensor_temp / (size - 1)
def sync(cls, timeout=5):
    """File-based barrier: each process writes the current barrier id to a
    per-pid file, then polls until every process has caught up or `timeout`
    seconds elapse (raising RuntimeError)."""
    cls.barrier_id += 1
    barrier_dir = os.path.join(TEMP_DIR, "barrier")
    my_file = os.path.join(barrier_dir, str(os.getpid()))
    # Announce arrival at this barrier generation.
    with _lock():
        with open(my_file, "w") as f:
            f.write(str(cls.barrier_id))

    deadline = time.time() + timeout
    while True:
        # Count processes whose recorded barrier id has reached ours.
        arrived = 0
        with _lock():
            for name in os.listdir(barrier_dir):
                with open(os.path.join(barrier_dir, name), "r") as f:
                    if int(f.read()) >= cls.barrier_id:
                        arrived += 1
        if arrived == dist.get_world_size():
            break
        if time.time() > deadline:
            raise RuntimeError("barrier timeout")
        time.sleep(0.1)
def get_world_size():
    """Return the world size from whichever distributed backend is active
    (c10d when the `_use_c10d` flag is set, legacy otherwise)."""
    backend = dist_c10d if _use_c10d[0] else dist_no_c10d
    return backend.get_world_size()
def _init_global_test(self):
    """Return (group, group_id, rank) covering every rank in WORLD."""
    world_ranks = list(range(dist.get_world_size()))
    return (world_ranks, dist.group.WORLD, dist.get_rank())