def update(self, src_rank):
    """Receive gradients from a worker and apply an update."""
    keys = list(self.params.keys())
    grads = dict()
    # Allocate one receive buffer per parameter so the recvs can be grouped.
    recv_list = []
    for key in keys:
        to_recv = self.params[key]
        recv_list.append(torch.zeros(to_recv.size()).cuda())
    groupStart()
    for i in range(len(keys)):
        collective.recv(recv_list[i], src_rank, "default")
    groupEnd()
    for i in range(len(keys)):
        grads[keys[i]] = recv_list[i]
    self._inc_gradients(grads)
    # Only step once gradients from every worker have been accumulated.
    if self.grad_counts == len(self.workers):
        self.optimizer.step()
        self.optimizer.zero_grad()
    return True
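For context, a minimal sketch of the matching worker-side send (hypothetical: the method name, `dst_rank`, and `grads` are illustrative, though it reuses the `collective` wrapper and "default" group from the snippet above, and assumes the worker mirrors the server's parameter keys):

def send_gradients(self, dst_rank, grads):
    # The sends must follow the same key order as the server's recv loop,
    # and grouping them lets NCCL batch the transfers.
    groupStart()
    for key in self.params.keys():
        collective.send(grads[key], dst_rank, "default")
    groupEnd()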
def test_send_recv(self):
    devs = [0, 1]
    comms = nccl.NcclCommunicator.initAll(devs)
    nccl.groupStart()
    for comm in comms:
        dev_id = comm.device_id()
        rank = comm.rank_id()
        assert rank == dev_id
        if rank == 0:
            with cuda.Device(dev_id):
                sendbuf = cupy.arange(10, dtype=cupy.int64)
                comm.send(sendbuf.data.ptr, 10, nccl.NCCL_INT64,
                          1, cuda.Stream.null.ptr)
        elif rank == 1:
            with cuda.Device(dev_id):
                recvbuf = cupy.zeros(10, dtype=cupy.int64)
                comm.recv(recvbuf.data.ptr, 10, nccl.NCCL_INT64,
                          0, cuda.Stream.null.ptr)
    nccl.groupEnd()
    # check result
    with cuda.Device(1):
        expected = cupy.arange(10, dtype=cupy.int64)
        assert (recvbuf == expected).all()
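The same API also shows why the group calls matter. In the following sketch (assuming two visible GPUs) each communicator posts both a send and a recv; because one host thread drives both devices, an ungrouped blocking send could never be matched by the recv issued later from the same thread, so the group is what makes the exchange deadlock-free:

comms = nccl.NcclCommunicator.initAll([0, 1])
bufs = {}
nccl.groupStart()
for comm in comms:
    rank = comm.rank_id()
    peer = 1 - rank
    with cuda.Device(comm.device_id()):
        sendbuf = cupy.full(10, rank, dtype=cupy.int64)
        recvbuf = cupy.zeros(10, dtype=cupy.int64)
        bufs[rank] = (sendbuf, recvbuf)  # keep buffers alive until groupEnd
        comm.send(sendbuf.data.ptr, 10, nccl.NCCL_INT64, peer,
                  cuda.Stream.null.ptr)
        comm.recv(recvbuf.data.ptr, 10, nccl.NCCL_INT64, peer,
                  cuda.Stream.null.ptr)
nccl.groupEnd()  # all four operations are launched here as one fused group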
def send_recv(cls, comm, in_array, out_array, peer, stream=None):
    comm._check_contiguous(in_array)
    comm._check_contiguous(out_array)
    stream = comm._get_stream(stream)
    idtype, icount = comm._get_nccl_dtype_and_count(in_array)
    odtype, ocount = comm._get_nccl_dtype_and_count(out_array)
    nccl.groupStart()
    cls._send(comm, in_array, peer, idtype, icount, stream)
    cls._recv(comm, out_array, peer, odtype, ocount, stream)
    nccl.groupEnd()
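A hypothetical usage sketch; `_DenseNCCLCommunicator` is the class name inferred from the broadcast snippet further down, and `comm` is assumed to be a two-rank communicator:

import cupy

a = cupy.arange(100, dtype=cupy.float32)
b = cupy.empty_like(a)
# Each rank swaps its buffer with the other rank.
_DenseNCCLCommunicator.send_recv(comm, a, b, peer=1 - comm.rank)
# After the call, b holds the peer's a.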
def send(cls, comm, array, peer, stream=None):
    arrays = cls._get_internal_arrays(array)
    shape_and_sizes = cls._get_shape_and_sizes(arrays, array.shape)
    cls._exchange_shape_and_sizes(comm, peer, shape_and_sizes, 'send', stream)
    # Naive approach, we send each of the subarrays one by one
    nccl.groupStart()
    for a in arrays:
        cls._send(comm, a, peer, a.dtype, a.size, stream)
    nccl.groupEnd()
def test_single_proc_single_dev(self):
    comms = nccl.NcclCommunicator.initAll(1)
    nccl.groupStart()
    for comm in comms:
        cuda.Device(comm.device_id()).use()
        sendbuf = cupy.arange(10)
        recvbuf = cupy.zeros_like(sendbuf)
        comm.allReduce(sendbuf.data.ptr, recvbuf.data.ptr, 10,
                       nccl.NCCL_INT64, nccl.NCCL_SUM, cuda.Stream.null.ptr)
    nccl.groupEnd()
    assert cupy.allclose(sendbuf, recvbuf)
def scatter(cls, comm, in_array, out_array, root=0, stream=None):
    # in_array is a list of sparse matrices
    if comm.rank == root:
        nccl.groupStart()
        for peer, s_a in enumerate(in_array):
            if peer != root:
                cls.send(comm, s_a, peer, stream)
        nccl.groupEnd()
        cls._assign_arrays(out_array,
                           cls._get_internal_arrays(in_array[root]),
                           in_array[root].shape)
    else:
        cls.recv(comm, out_array, root, stream)
def compute(self):
    """Compute the loss, and send gradients to the parameter servers."""
    # First receive the current parameters from the servers.
    weights = self.get_weights(cpu=False)
    params = dict()
    # Create per-server receive buffers so the recvs can be grouped.
    recv_list = []
    for i in range(self.num_ps):
        recv_list.append([])
        for key in self.name_list[i]:
            to_recv = weights[key]
            recv_list[-1].append(torch.zeros(to_recv.size()).cuda())
    groupStart()
    for i in range(self.num_ps):
        for j in range(len(self.name_list[i])):
            collective.recv(recv_list[i][j], self.num_workers + i, "default")
    groupEnd()
    for i in range(self.num_ps):
        param_shard_keys = self.name_list[i]
        for j in range(len(param_shard_keys)):
            params[param_shard_keys[j]] = recv_list[i][j]
    grad, loss = self.compute_gradients(params)
    split_grad = self.split_gradients(grad, self.assignments)
    # Send each gradient shard back to its parameter server.
    groupStart()
    for i in range(self.num_ps):
        this_shard = self.index_shard(split_grad, i)
        for _, v in this_shard.items():
            collective.send(v, self.num_workers + i, "default")
    groupEnd()
    return loss
def scatter(cls, comm, in_array, out_array, root=0, stream=None):
    if in_array.shape[0] != comm._n_devices:
        raise RuntimeError(
            f'scatter requires in_array to have {comm._n_devices} '
            f'elements in its first dimension, found {in_array.shape}')
    comm._check_contiguous(in_array)
    comm._check_contiguous(out_array)
    stream = comm._get_stream(stream)
    nccl.groupStart()
    if root == comm.rank:
        for i in range(comm._n_devices):
            array = in_array[i]
            idtype, icount = comm._get_nccl_dtype_and_count(array)
            cls._send(comm, array, i, idtype, icount, stream)
    dtype, count = comm._get_nccl_dtype_and_count(out_array)
    cls._recv(comm, out_array, root, dtype, count, stream)
    nccl.groupEnd()
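A usage sketch under the same assumptions (a 4-rank communicator `comm`, class name inferred from the broadcast snippet below). Note that the root posts a send to itself as well as its own recv, all fused inside the one group:

import cupy

in_array = cupy.arange(4 * 10, dtype=cupy.float32).reshape(4, 10)  # root only
out_array = cupy.empty(10, dtype=cupy.float32)
_DenseNCCLCommunicator.scatter(comm, in_array, out_array, root=0)
# Rank r now holds row in_array[r] from the root.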
def recv(cls, comm, out_array, peer, stream=None):
    shape_and_sizes = cls._exchange_shape_and_sizes(
        comm, peer, (), 'recv', stream)
    # Change the array sizes in out_array to match the sent ones
    # Receive the three arrays
    # TODO(ecastill) dtype is not correct, it must match the internal
    # sparse matrix arrays dtype
    arrays = cls._get_internal_arrays(out_array)
    shape = tuple(shape_and_sizes[0:2])
    sizes = shape_and_sizes[2:]
    # TODO(use the out_array datatypes)
    arrs = [cupy.empty(s, dtype=a.dtype) for s, a in zip(sizes, arrays)]
    nccl.groupStart()
    for a in arrs:
        cls._recv(comm, a, peer, a.dtype, a.size, stream)
    nccl.groupEnd()
    # Create a sparse matrix from the received arrays
    cls._assign_arrays(out_array, arrs, shape)
def gather(cls, comm, in_array, out_array, root=0, stream=None):
    # TODO(ecastill) out_array needs to have comm size in shape[0]
    if out_array.shape[0] != comm._n_devices:
        raise RuntimeError(
            f'gather requires out_array to have {comm._n_devices} '
            f'elements in its first dimension, found {out_array.shape}')
    comm._check_contiguous(in_array)
    comm._check_contiguous(out_array)
    stream = comm._get_stream(stream)
    nccl.groupStart()
    if root == comm.rank:
        for i in range(comm._n_devices):
            array = out_array[i]
            odtype, ocount = comm._get_nccl_dtype_and_count(array)
            cls._recv(comm, array, i, odtype, ocount, stream)
    dtype, count = comm._get_nccl_dtype_and_count(in_array)
    cls._send(comm, in_array, root, dtype, count, stream)
    nccl.groupEnd()
def reduce(cls, comm, in_array, out_array, root=0, op='sum', stream=None):
    arrays = cls._get_internal_arrays(in_array)
    # All the matrices must share the same size
    shape_and_sizes = cls._get_shape_and_sizes(arrays, in_array.shape)
    shape_and_sizes = cls._exchange_shape_and_sizes(
        comm, root, shape_and_sizes, 'gather', stream)
    if comm.rank == root:
        if _get_sparse_type(in_array) != _get_sparse_type(out_array):
            raise ValueError(
                'in_array and out_array must be the same format')
        result = in_array
        partial = _make_sparse_empty(in_array.dtype,
                                     _get_sparse_type(in_array))
        # Each device will send an array with a different size
        for peer, ss in enumerate(shape_and_sizes):
            shape = tuple(ss[0:2])
            sizes = ss[2:]
            arrays = [
                cupy.empty(s, dtype=a.dtype)
                for s, a in zip(sizes, arrays)
            ]
            if peer != root:
                nccl.groupStart()
                for a in arrays:
                    cls._recv(comm, a, peer, a.dtype, a.size, stream)
                nccl.groupEnd()
                cls._assign_arrays(partial, arrays, shape)
                if op == 'sum':
                    result = result + partial
                elif op == 'prod':
                    result = result * partial
                else:
                    raise ValueError(
                        'Sparse matrix only supports sum/prod reduction')
        # TODO, check output types
        # If out_array is coo we need to convert result to coo before
        # reassigning
        cls._assign_arrays(out_array, cls._get_internal_arrays(result),
                           result.shape)
    else:
        nccl.groupStart()
        for a in arrays:
            cls._send(comm, a, root, a.dtype, a.size, stream)
        nccl.groupEnd()
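A host-side sketch of the reduction semantics, using SciPy purely as illustration (the real code operates on device arrays):

import scipy.sparse as sp

mats = [sp.random(8, 8, density=0.2, format='csr') for _ in range(3)]
result = mats[0]
for partial in mats[1:]:
    result = result + partial  # op='sum'
# Caveat: for SciPy/CuPy sparse matrix objects `*` denotes matrix
# multiplication, so an elementwise 'prod' would instead be
# result.multiply(partial); the reduce above inherits whatever
# semantics `*` has for its operands.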
def send_params(self, dst_rank):
    """Send this param shard to the destination worker."""
    # Group all sends so NCCL can issue them as a single batch.
    groupStart()
    for _, v in self.params.items():
        collective.send(v, dst_rank, "default")
    groupEnd()
def all_to_all(cls, comm, in_array, out_array, stream=None):
    # TODO(ecastill) out_array needs to have comm size in shape[0]
    if in_array.shape[0] != comm._n_devices:
        raise RuntimeError(
            f'all_to_all requires in_array to have {comm._n_devices} '
            f'elements in its first dimension, found {in_array.shape}')
    if out_array.shape[0] != comm._n_devices:
        raise RuntimeError(
            f'all_to_all requires out_array to have {comm._n_devices} '
            f'elements in its first dimension, found {out_array.shape}')
    comm._check_contiguous(in_array)
    comm._check_contiguous(out_array)
    stream = comm._get_stream(stream)
    idtype, icount = comm._get_nccl_dtype_and_count(in_array[0])
    odtype, ocount = comm._get_nccl_dtype_and_count(out_array[0])
    # TODO check out dtypes are the same as in dtypes
    nccl.groupStart()
    for i in range(comm._n_devices):
        cls._send(comm, in_array[i], i, idtype, icount, stream)
        cls._recv(comm, out_array[i], i, odtype, ocount, stream)
    nccl.groupEnd()
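The data movement, as a pure-Python reference (a sketch of the semantics only, not NCCL code):

def all_to_all_reference(in_arrays):
    # in_arrays[i][j] is the chunk rank i addresses to rank j;
    # afterwards, rank j's slot i holds rank i's chunk for j.
    n = len(in_arrays)
    return [[in_arrays[i][j] for i in range(n)] for j in range(n)]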
def compute(self):
    """Compute the loss, and send gradients to the parameter servers."""
    # First receive the current parameters from the servers.
    weights = self.get_weights(cpu=False)
    params = dict()
    # Create per-server receive buffers so the recvs can be batched.
    recv_list = []
    for i in range(self.num_ps):
        recv_list.append([])
        for key in self.name_list[i]:
            to_recv = weights[key]
            recv_list[-1].append(torch.zeros(to_recv.size()).cuda())
    # Batch all irecvs; peer ranks mirror the sends below
    # (server i lives at rank self.num_workers + i).
    recv_ops = [
        dist.P2POp(dist.irecv, v, self.num_workers + i)
        for i in range(self.num_ps) for v in recv_list[i]
    ]
    for req in dist.batch_isend_irecv(recv_ops):
        req.wait()
    for i in range(self.num_ps):
        param_shard_keys = self.name_list[i]
        for j in range(len(param_shard_keys)):
            params[param_shard_keys[j]] = recv_list[i][j]
    grad, loss = self.compute_gradients(params)
    split_grad = self.split_gradients(grad, self.assignments)
    # Send each gradient shard back to its parameter server.
    groupStart()
    for i in range(self.num_ps):
        this_shard = self.index_shard(split_grad, i)
        for _, v in this_shard.items():
            collective.send(v, self.num_workers + i, "default")
    groupEnd()
    return loss
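This variant batches its receives with `torch.distributed.batch_isend_irecv` instead of an NCCL group. A minimal standalone sketch of that API (assumes `init_process_group('nccl')` has been called on two ranks, each owning one GPU):

import torch
import torch.distributed as dist

rank = dist.get_rank()
peer = 1 - rank
send_t = torch.full((10,), float(rank), device='cuda')
recv_t = torch.zeros(10, device='cuda')
ops = [dist.P2POp(dist.isend, send_t, peer),
       dist.P2POp(dist.irecv, recv_t, peer)]
# Like groupStart/groupEnd, the batch fuses the P2P ops so the paired
# send/recv cannot deadlock.
for req in dist.batch_isend_irecv(ops):
    req.wait()
# recv_t now holds the peer's rank value.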
def broadcast(cls, comm, in_out_array, root=0, stream=None):
    arrays = cls._get_internal_arrays(in_out_array)
    if comm.rank == root:
        shape_and_sizes = cls._get_shape_and_sizes(arrays,
                                                   in_out_array.shape)
    else:
        shape_and_sizes = ()
    shape_and_sizes = cls._exchange_shape_and_sizes(
        comm, root, shape_and_sizes, 'bcast', stream)
    shape = tuple(shape_and_sizes[0:2])
    sizes = shape_and_sizes[2:]
    # Naive approach, we send each of the subarrays one by one
    if comm.rank != root:
        arrays = [
            cupy.empty(s, dtype=a.dtype) for s, a in zip(sizes, arrays)
        ]
    # TODO(ecastill): measure if it's faster to just concatenate
    # the arrays into a single one and send it
    nccl.groupStart()
    for a in arrays:
        _DenseNCCLCommunicator.broadcast(comm, a, root, stream)
    nccl.groupEnd()
    cls._assign_arrays(in_out_array, arrays, shape)
def send_recv(cls, comm, in_array, out_array, peer, stream=None):
    nccl.groupStart()
    cls.send(comm, in_array, peer, stream)
    cls.recv(comm, out_array, peer, stream)
    nccl.groupEnd()
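A hypothetical usage sketch; `_SparseNCCLCommunicator` is an assumed name (by symmetry with `_DenseNCCLCommunicator` above), and the output matrix is only a placeholder since `recv` rebuilds it from the incoming shape:

import cupy
import cupyx.scipy.sparse as css

mat = css.random(16, 16, density=0.1, format='csr', dtype=cupy.float32)
out = css.csr_matrix((1, 1), dtype=cupy.float32)  # reshaped on receive
_SparseNCCLCommunicator.send_recv(comm, mat, out, peer=1 - comm.rank)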