def test_nbytes():
    multi_dim = np.ones(shape=(10, 10))
    scalar = np.array(1)

    assert nbytes(scalar) == scalar.nbytes
    assert nbytes(multi_dim) == multi_dim.nbytes
    assert nbytes(memoryview(scalar)) == scalar.nbytes
    assert nbytes(memoryview(multi_dim)) == multi_dim.nbytes
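# A plausible sketch of the helper this test exercises (the real nbytes lives
# in distributed.utils; this version only illustrates the contract the asserts
# above check: bytes-like objects report their length, buffer-protocol objects
# such as ndarrays and memoryviews report their .nbytes).
def nbytes_sketch(frame) -> int:
    if isinstance(frame, (bytes, bytearray)):
        return len(frame)
    # numpy arrays and memoryviews both expose .nbytes,
    # the size in bytes of the underlying buffer
    return frame.nbytes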
async def write(
    self,
    msg: dict,
    serializers=("cuda", "dask", "pickle", "error"),
    on_error: str = "message",
):
    with log_errors():
        if self.closed():
            raise CommClosedError("Endpoint is closed -- unable to send message")
        try:
            if serializers is None:
                serializers = ("cuda", "dask", "pickle", "error")
            # msg can also be a list of dicts when sending batched messages
            frames = await to_frames(
                msg,
                serializers=serializers,
                on_error=on_error,
                allow_offload=self.allow_offload,
            )
            nframes = len(frames)
            cuda_frames = tuple(
                hasattr(f, "__cuda_array_interface__") for f in frames
            )
            sizes = tuple(nbytes(f) for f in frames)
            cuda_send_frames, send_frames = zip(
                *(
                    (is_cuda, each_frame)
                    for is_cuda, each_frame in zip(cuda_frames, frames)
                    if nbytes(each_frame) > 0
                )
            )

            # Send meta data
            # Send close flag and number of frames (bool, uint64)
            await self.ep.send(struct.pack("?Q", False, nframes))
            # Send which frames are CUDA (bool) and
            # how large each frame is (uint64)
            await self.ep.send(
                struct.pack(nframes * "?" + nframes * "Q", *cuda_frames, *sizes)
            )

            # Send frames
            # It is necessary to synchronize the default stream before we
            # start sending. We synchronize the default stream because UCX is
            # not stream-ordered, and syncing the default stream will wait for
            # other non-blocking CUDA streams. Note this is only sufficient if
            # the memory being sent is not currently in use on non-blocking
            # CUDA streams.
            if any(cuda_send_frames):
                synchronize_stream(0)

            for each_frame in send_frames:
                await self.ep.send(each_frame)
            return sum(sizes)
        except ucp.exceptions.UCXBaseException:
            self.abort()
            raise CommClosedError("While writing, the connection was closed")
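# A minimal, self-contained sketch of the metadata layout used by write()
# above and read() further below, using only the standard-library struct
# module. The frame contents here are dummies, not real serialized frames.
def _metadata_layout_sketch():
    import struct

    frames = [b"header-bytes", b"payload-bytes"]
    nframes = len(frames)
    cuda_frames = tuple(hasattr(f, "__cuda_array_interface__") for f in frames)
    sizes = tuple(len(f) for f in frames)

    # First message: close flag (bool) + number of frames (uint64)
    meta = struct.pack("?Q", False, nframes)
    # Second message: one bool per frame (is it CUDA?) + one uint64 per frame (size)
    header = struct.pack(nframes * "?" + nframes * "Q", *cuda_frames, *sizes)

    # The peer unpacks the same layout in the same order
    shutdown, n = struct.unpack("?Q", meta)
    fields = struct.unpack(n * "?" + n * "Q", header)
    is_cuda, frame_sizes = fields[:n], fields[n:]
    assert not shutdown and frame_sizes == (12, 13)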
async def send(ep, frames):
    await ep.send(np.array([len(frames)], dtype=np.uint64))
    await ep.send(
        np.array(
            [hasattr(f, "__cuda_array_interface__") for f in frames],
            dtype=bool,
        )
    )
    await ep.send(np.array([nbytes(f) for f in frames], dtype=np.uint64))
    # Send frames
    for frame in frames:
        if nbytes(frame) > 0:
            await ep.send(frame)
async def send(ep, frames):
    pytest.importorskip("distributed")
    from distributed.utils import nbytes

    await ep.send(np.array([len(frames)], dtype=np.uint64))
    await ep.send(
        np.array(
            [hasattr(f, "__cuda_array_interface__") for f in frames],
            dtype=bool,
        )
    )
    await ep.send(np.array([nbytes(f) for f in frames], dtype=np.uint64))
    # Send frames
    for frame in frames:
        if nbytes(frame) > 0:
            await ep.send(frame)
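# A hedged sketch of the receiving counterpart to the send() helpers above,
# mirroring the metadata order (frame count, CUDA flags, sizes). cuda_array()
# is a stand-in for whatever device allocator the surrounding test module
# provides; empty frames are never sent, so they are allocated locally.
async def recv_frames_sketch(ep):
    import numpy as np

    nframes = np.empty(1, dtype=np.uint64)
    await ep.recv(nframes)
    is_cudas = np.empty(nframes[0], dtype=bool)
    await ep.recv(is_cudas)
    sizes = np.empty(nframes[0], dtype=np.uint64)
    await ep.recv(sizes)

    frames = []
    for is_cuda, size in zip(is_cudas.tolist(), sizes.tolist()):
        if size > 0:
            # cuda_array() is a hypothetical device-buffer allocator here
            frame = cuda_array(size) if is_cuda else np.empty(size, dtype=np.uint8)
            await ep.recv(frame)
            frames.append(frame)
        else:
            frames.append(cuda_array(size) if is_cuda else b"")
    return frames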
def client(env, port, func):
    # wait for server to come up
    # receive cudf object
    # deserialize
    # assert deserialized msg is cdf
    # send receipt
    os.environ.update(env)
    before_rx, before_tx = total_nvlink_transfer()

    async def read():
        await asyncio.sleep(1)
        ep = await get_ep("client", port)

        import cupy as cp

        cp.cuda.set_allocator(None)
        for i in range(ITERATIONS):
            bytes_used = pynvml.nvmlDeviceGetMemoryInfo(
                pynvml.nvmlDeviceGetHandleByIndex(0)
            ).used
            print("Bytes Used:", bytes_used, i)

            frames, msg = await recv(ep)

            # Send meta data
            await send(ep, frames)

        close_msg = b"shutdown listener"
        close_msg_size = np.array([len(close_msg)], dtype=np.uint64)
        await ep.send(close_msg_size)
        await ep.send(close_msg)
        print("Shutting Down Client...")
        return msg["data"]

    rx_cuda_obj = asyncio.get_event_loop().run_until_complete(read())
    num_bytes = nbytes(rx_cuda_obj)
    print(f"TOTAL DATA RECEIVED: {num_bytes}")
    # nvlink only measures in KBs
    if num_bytes > 90000:
        rx, tx = total_nvlink_transfer()
        msg = (
            f"RX BEFORE SEND: {before_rx} -- RX AFTER SEND: {rx} "
            f"-- TOTAL DATA: {num_bytes}"
        )
        print(msg)
        assert rx > before_rx

    cuda_obj_generator = cloudpickle.loads(func)
    pure_cuda_obj = cuda_obj_generator()

    from cudf.tests.utils import assert_eq
    import cupy as cp

    if isinstance(rx_cuda_obj, cp.ndarray):
        cp.testing.assert_allclose(rx_cuda_obj, pure_cuda_obj)
    else:
        assert_eq(rx_cuda_obj, pure_cuda_obj)
async def write(self, cdf):
    header, _frames = cdf.serialize()
    frames = [pickle.dumps(header)] + _frames

    # Send meta data
    await self.ep.send(np.array([len(frames)], dtype=np.uint64))
    await self.ep.send(
        np.array(
            [hasattr(f, "__cuda_array_interface__") for f in frames],
            dtype=bool,
        )
    )
    await self.ep.send(np.array([nbytes(f) for f in frames], dtype=np.uint64))
    # Send frames
    for frame in frames:
        if nbytes(frame) > 0:
            await self.ep.send(frame)
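# A hedged sketch of the matching read side for write() above, assuming the
# peer received `frames` via the helper protocol: frame 0 carries the pickled
# serialization header, and cudf's Serializable types expose a
# deserialize(header, frames) classmethod. `cdf_type` is a stand-in for the
# concrete class (e.g. cudf.DataFrame); frame reception itself is elided.
def deserialize_cdf_sketch(frames, cdf_type):
    import pickle

    header = pickle.loads(frames[0])
    return cdf_type.deserialize(header, frames[1:])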
def client(port, func, comm_api):
    # wait for server to come up
    # receive cudf object
    # deserialize
    # assert deserialized msg is cdf
    # send receipt
    from distributed.utils import nbytes

    ucp.init()

    if comm_api == "am":
        register_am_allocators()

    # must create context before importing
    # cudf/cupy/etc

    async def read():
        await asyncio.sleep(1)
        ep = await get_ep("client", port)
        msg = None
        import cupy

        cupy.cuda.set_allocator(None)
        for i in range(ITERATIONS):
            if comm_api == "tag":
                frames, msg = await recv(ep)
            else:
                frames, msg = await am_recv(ep)

        close_msg = b"shutdown listener"
        if comm_api == "tag":
            close_msg_size = np.array([len(close_msg)], dtype=np.uint64)
            await ep.send(close_msg_size)
            await ep.send(close_msg)
        else:
            await ep.am_send(close_msg)
        print("Shutting Down Client...")
        return msg["data"]

    rx_cuda_obj = asyncio.get_event_loop().run_until_complete(read())
    rx_cuda_obj + rx_cuda_obj  # exercise the received object
    num_bytes = nbytes(rx_cuda_obj)
    print(f"TOTAL DATA RECEIVED: {num_bytes}")

    cuda_obj_generator = cloudpickle.loads(func)
    pure_cuda_obj = cuda_obj_generator()

    import cupy

    if isinstance(rx_cuda_obj, cupy.ndarray):
        cupy.testing.assert_allclose(rx_cuda_obj, pure_cuda_obj)
    else:
        from cudf.testing._utils import assert_eq

        assert_eq(rx_cuda_obj, pure_cuda_obj)
async def read(self, deserializers=("cuda", "dask", "pickle", "error")):
    with log_errors():
        if deserializers is None:
            deserializers = ("cuda", "dask", "pickle", "error")

        try:
            # Recv meta data
            # Recv close flag and number of frames (bool, uint64)
            msg = host_array(struct.calcsize("?Q"))
            await self.ep.recv(msg)
            shutdown, nframes = struct.unpack("?Q", msg)

            if shutdown:  # The writer is closing the connection
                raise CommClosedError("Connection closed by writer")

            # Recv which frames are CUDA (bool) and
            # how large each frame is (uint64)
            header_fmt = nframes * "?" + nframes * "Q"
            header = host_array(struct.calcsize(header_fmt))
            await self.ep.recv(header)
            header = struct.unpack(header_fmt, header)
            cuda_frames, sizes = header[:nframes], header[nframes:]
        except (
            ucp.exceptions.UCXCloseError,
            ucp.exceptions.UCXCanceled,
        ) + (getattr(ucp.exceptions, "UCXConnectionReset", ()),):
            self.abort()
            raise CommClosedError("Connection closed by writer")
        else:
            # Recv frames
            frames = [
                device_array(each_size) if is_cuda else host_array(each_size)
                for is_cuda, each_size in zip(cuda_frames, sizes)
            ]
            cuda_recv_frames, recv_frames = zip(
                *(
                    (is_cuda, each_frame)
                    for is_cuda, each_frame in zip(cuda_frames, frames)
                    if nbytes(each_frame) > 0
                )
            )

            # It is necessary to first populate `frames` with CUDA arrays and
            # synchronize the default stream before starting to receive, to
            # ensure the buffers have been allocated
            if any(cuda_recv_frames):
                synchronize_stream(0)

            for each_frame in recv_frames:
                await self.ep.recv(each_frame)
            msg = await from_frames(
                frames,
                deserialize=self.deserialize,
                deserializers=deserializers,
                allow_offload=self.allow_offload,
            )
            return msg
async def write(ep):
    import cupy

    cupy.cuda.set_allocator(None)

    print("CREATING CUDA OBJECT IN SERVER...")
    cuda_obj_generator = cloudpickle.loads(func)
    cuda_obj = cuda_obj_generator()
    msg = {"data": to_serialize(cuda_obj)}
    frames = await to_frames(msg, serializers=("cuda", "dask", "pickle"))
    for i in range(ITERATIONS):
        # Send meta data
        await ep.send(np.array([len(frames)], dtype=np.uint64))
        await ep.send(
            np.array(
                [hasattr(f, "__cuda_array_interface__") for f in frames],
                dtype=bool,
            )
        )
        await ep.send(np.array([nbytes(f) for f in frames], dtype=np.uint64))

        # Send frames
        for frame in frames:
            if nbytes(frame) > 0:
                await ep.send(frame)

    print("CONFIRM RECEIPT")
    close_msg = b"shutdown listener"
    msg_size = np.empty(1, dtype=np.uint64)
    await ep.recv(msg_size)

    msg = np.empty(msg_size[0], dtype=np.uint8)
    await ep.recv(msg)
    recv_msg = msg.tobytes()
    assert recv_msg == close_msg
    print("Shutting Down Server...")
    await ep.close()
    lf.close()  # `lf` is the listener from the enclosing scope
def check(obj, expected):
    assert nbytes(obj) == expected
    assert nbytes(memoryview(obj)) == expected
def client(env, port, func):
    # wait for server to come up
    # receive cudf object
    # deserialize
    # assert deserialized msg is cdf
    # send receipt
    ucp.reset()
    os.environ.update(env)
    ucp.init()  # must create context before importing cudf/cupy/etc

    before_rx, before_tx = total_nvlink_transfer()

    async def read():
        await asyncio.sleep(1)
        ep = await get_ep("client", port)
        msg = None
        import cupy

        cupy.cuda.set_allocator(None)
        for i in range(ITERATIONS):
            # storing cu objects in msg
            # we delete to minimize GPU memory usage
            # del msg
            try:
                # Recv meta data
                nframes = np.empty(1, dtype=np.uint64)
                await ep.recv(nframes)
                is_cudas = np.empty(nframes[0], dtype=bool)
                await ep.recv(is_cudas)
                sizes = np.empty(nframes[0], dtype=np.uint64)
                await ep.recv(sizes)
            except (
                ucp.exceptions.UCXCanceled,
                ucp.exceptions.UCXCloseError,
            ) as e:
                msg = "SOMETHING TERRIBLE HAS HAPPENED IN THE TEST"
                raise type(e)(msg) from e
            else:
                # Recv frames
                frames = []
                for is_cuda, size in zip(is_cudas.tolist(), sizes.tolist()):
                    if size > 0:
                        if is_cuda:
                            frame = cuda_array(size)
                        else:
                            frame = np.empty(size, dtype=np.uint8)
                        await ep.recv(frame)
                        frames.append(frame)
                    else:
                        if is_cuda:
                            frames.append(cuda_array(size))
                        else:
                            frames.append(b"")
                msg = await from_frames(frames)

        close_msg = b"shutdown listener"
        close_msg_size = np.array([len(close_msg)], dtype=np.uint64)
        await ep.send(close_msg_size)
        await ep.send(close_msg)
        print("Shutting Down Client...")
        return msg["data"]

    rx_cuda_obj = asyncio.get_event_loop().run_until_complete(read())
    rx_cuda_obj + rx_cuda_obj  # exercise the received object
    num_bytes = nbytes(rx_cuda_obj)
    print(f"TOTAL DATA RECEIVED: {num_bytes}")
    # nvlink only measures in KBs
    if num_bytes > 90000:
        rx, tx = total_nvlink_transfer()
        msg = (
            f"RX BEFORE SEND: {before_rx} -- RX AFTER SEND: {rx} "
            f"-- TOTAL DATA: {num_bytes}"
        )
        print(msg)
        assert rx > before_rx

    cuda_obj_generator = cloudpickle.loads(func)
    pure_cuda_obj = cuda_obj_generator()

    import cupy

    if isinstance(rx_cuda_obj, cupy.ndarray):
        cupy.testing.assert_allclose(rx_cuda_obj, pure_cuda_obj)
    else:
        cudf.tests.utils.assert_eq(rx_cuda_obj, pure_cuda_obj)
async def write(self, msg, serializers=None, on_error="message"):
    stream = self.stream
    if stream is None:
        raise CommClosedError()

    frames = await to_frames(
        msg,
        allow_offload=self.allow_offload,
        serializers=serializers,
        on_error=on_error,
        context={
            "sender": self.local_info,
            "recipient": self.remote_info,
            **self.handshake_options,
        },
        frame_split_size=self.max_shard_size,
    )
    frames_nbytes = [nbytes(f) for f in frames]
    frames_nbytes_total = sum(frames_nbytes)

    header = pack_frames_prelude(frames)
    header = struct.pack("Q", nbytes(header) + frames_nbytes_total) + header

    frames = [header, *frames]
    frames_nbytes = [nbytes(header), *frames_nbytes]
    frames_nbytes_total += frames_nbytes[0]

    if frames_nbytes_total < 2**17:  # 128 KiB
        # small enough, send in one go
        frames = [b"".join(frames)]
        frames_nbytes = [frames_nbytes_total]

    try:
        # trick to enqueue all frames for writing beforehand
        for each_frame_nbytes, each_frame in zip(frames_nbytes, frames):
            if each_frame_nbytes:
                if stream._write_buffer is None:
                    raise StreamClosedError()

                if isinstance(each_frame, memoryview):
                    # Make sure that `len(data) == data.nbytes`
                    # See <https://github.com/tornadoweb/tornado/pull/2996>
                    each_frame = memoryview(each_frame).cast("B")

                stream._write_buffer.append(each_frame)
                stream._total_write_index += each_frame_nbytes

        # start writing frames
        stream.write(b"")
    except StreamClosedError as e:
        self.stream = None
        self._closed = True
        if not sys.is_finalizing():
            convert_stream_closed_error(self, e)
    except Exception:
        # Some OSError or another "low-level" exception. We do not really
        # know what was already written to the underlying socket, so it is
        # not even safe to retry here using the same stream. The only safe
        # thing to do is to abort. (See also GitHub #4133).
        self.abort()
        raise

    return frames_nbytes_total
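# A minimal sketch of the header layout assembled by write() above, assuming
# pack_frames_prelude(frames) emits the frame count followed by each frame
# length, all as uint64 values (this mirrors distributed's framing but is
# written out with plain struct here for illustration; the frames are dummies).
def _tcp_frame_header_sketch():
    import struct

    frames = [b"dummy-header", b"dummy-payload"]
    lengths = [len(f) for f in frames]

    # prelude: frame count + per-frame lengths
    prelude = struct.pack("Q", len(frames)) + struct.pack(len(frames) * "Q", *lengths)
    # write() prepends one more uint64: total bytes of prelude + frames
    total = struct.pack("Q", len(prelude) + sum(lengths)) + prelude

    # A reader peels the layers back off in the same order
    (msg_nbytes,) = struct.unpack("Q", total[:8])
    (nframes,) = struct.unpack("Q", total[8:16])
    frame_lengths = struct.unpack(nframes * "Q", total[16 : 16 + 8 * nframes])
    assert frame_lengths == (12, 13)
    assert msg_nbytes == len(prelude) + sum(lengths)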