def client(env, port, func, enable_rmm):
    """Connect to the server's listener and repeatedly receive an object.

    Creates EP_ITERATIONS endpoints; on each endpoint, receives a
    serialized message TRANSFER_ITERATIONS times, then closes it.

    NOTE(review): `env` and `func` are accepted but never used in this
    body — presumably applied/consumed by the spawning code or kept for
    signature symmetry with the server; confirm against the caller.
    """
    # connect to server's listener
    # receive object for TRANSFER_ITERATIONS
    # repeat for EP_ITERATIONS
    import numba.cuda

    # Ensure a CUDA context exists in this (child) process before transfers.
    numba.cuda.current_context()

    async def read():
        # Give the server a moment to bring its listener up.
        await asyncio.sleep(1)
        ep = await get_ep("client", port)

        for i in range(TRANSFER_ITERATIONS):
            # `frames` is unused here; only the deserialized message is read.
            frames, msg = await recv(ep)
            print("size of the message: ", len(msg["data"]))

        print("Shutting Down Client...")
        await ep.close()

    if enable_rmm:
        set_rmm()

    for i in range(EP_ITERATIONS):
        print("ITER: ", i)
        get_event_loop().run_until_complete(read())

    print("FINISHED")
def _test_from_worker_address_client_fixedsize(queue):
    """Client half of the fixed-size worker-address exchange test.

    Packs the local worker address plus two randomly generated tags into
    a fixed-size buffer, ships it to the server on tag 0, then performs a
    recv/send round-trip on the agreed-upon tags.
    """

    async def run():
        # Read local worker address
        address = ucp.get_worker_address()
        # Random 64-bit tags so concurrent clients don't collide with
        # each other's transfers.
        recv_tag = ucp.utils.hash64bits(os.urandom(16))
        send_tag = ucp.utils.hash64bits(os.urandom(16))
        packed_address = _pack_address_and_tag(address, recv_tag, send_tag)

        # Receive worker address from server via multiprocessing.Queue, create
        # endpoint to server
        remote_address = queue.get()
        ep = await ucp.create_endpoint_from_worker_address(remote_address)

        # Send local address to server on tag 0
        await ep.send(packed_address, tag=0, force_tag=True)

        # Receive message from server
        recv_msg = np.empty(10, dtype=np.int64)
        await ep.recv(recv_msg, tag=recv_tag, force_tag=True)
        np.testing.assert_array_equal(recv_msg, np.arange(10, dtype=np.int64))

        # Send message to server
        send_msg = np.arange(20, dtype=np.int64)
        await ep.send(send_msg, tag=send_tag, force_tag=True)

    get_event_loop().run_until_complete(run())
def _test_shutdown_unexpected_closed_peer_client(
    client_queue, server_queue, endpoint_error_handling
):
    """Client half of the unexpected-peer-shutdown test.

    Connects to the server (whose port arrives on ``client_queue``) and
    receives a single 100-element int64 message, then returns.
    ``server_queue`` is unused here; it is part of the shared signature
    with the server half.
    """

    async def _connect_and_recv():
        port = client_queue.get()
        endpoint = await ucp.create_endpoint(
            ucp.get_address(),
            port,
            endpoint_error_handling=endpoint_error_handling,
        )
        buf = np.empty(100, dtype=np.int64)
        await endpoint.recv(buf)

    get_event_loop().run_until_complete(_connect_and_recv())
def server(port, func, comm_api):
    """Listen for one client, repeatedly send it a serialized CUDA object,
    then wait for the client's shutdown receipt and close the listener.

    Parameters
    ----------
    port: listener port.
    func: cloudpickled zero-arg callable that builds the CUDA object.
    comm_api: "tag" for tag-matched send/recv, otherwise active messages.
    """
    # create listener receiver
    # write cudf object
    # confirm message is sent correctly
    from distributed.comm.utils import to_frames
    from distributed.protocol import to_serialize

    ucp.init()

    if comm_api == "am":
        register_am_allocators()

    async def f(listener_port):
        # coroutine shows up when the client asks
        # to connect
        async def write(ep):
            import cupy

            # Disable cupy's pool allocator for this process.
            cupy.cuda.set_allocator(None)

            print("CREATING CUDA OBJECT IN SERVER...")
            cuda_obj_generator = cloudpickle.loads(func)
            cuda_obj = cuda_obj_generator()
            msg = {"data": to_serialize(cuda_obj)}
            frames = await to_frames(msg, serializers=("cuda", "dask", "pickle"))
            for i in range(ITERATIONS):
                # Send meta data
                if comm_api == "tag":
                    await send(ep, frames)
                else:
                    await am_send(ep, frames)

            print("CONFIRM RECEIPT")
            close_msg = b"shutdown listener"

            if comm_api == "tag":
                # Tag API: receive the size prefix first, then the payload.
                msg_size = np.empty(1, dtype=np.uint64)
                await ep.recv(msg_size)

                msg = np.empty(msg_size[0], dtype=np.uint8)
                await ep.recv(msg)
            else:
                msg = await ep.am_recv()

            recv_msg = msg.tobytes()
            assert recv_msg == close_msg
            print("Shutting Down Server...")
            await ep.close()
            # Closing the listener (closed over from `f`) ends the wait loop.
            lf.close()

        lf = ucp.create_listener(write, port=listener_port)
        try:
            while not lf.closed():
                await asyncio.sleep(0.1)
        except ucp.UCXCloseError:
            pass

    loop = get_event_loop()
    loop.run_until_complete(f(port))
def client(port, func, comm_api):
    """Receive a serialized CUDA object from the server ITERATIONS times,
    send a shutdown receipt, then verify the received object matches a
    locally regenerated copy.

    NOTE(review): `cupy` is referenced at the `isinstance` check below but
    only imported inside `read()` — presumably there is also a module-level
    `import cupy`; confirm against the file header.
    """
    # wait for server to come up
    # receive cudf object
    # deserialize
    # assert deserialized msg is cdf
    # send receipt
    from distributed.utils import nbytes

    ucp.init()

    if comm_api == "am":
        register_am_allocators()

    # must create context before importing
    # cudf/cupy/etc
    async def read():
        await asyncio.sleep(1)
        ep = await get_ep("client", port)
        msg = None
        import cupy

        # Disable cupy's pool allocator for this process.
        cupy.cuda.set_allocator(None)

        for i in range(ITERATIONS):
            if comm_api == "tag":
                frames, msg = await recv(ep)
            else:
                frames, msg = await am_recv(ep)

        close_msg = b"shutdown listener"

        if comm_api == "tag":
            # Tag API: size prefix first, then the payload.
            close_msg_size = np.array([len(close_msg)], dtype=np.uint64)
            await ep.send(close_msg_size)
            await ep.send(close_msg)
        else:
            await ep.am_send(close_msg)

        print("Shutting Down Client...")
        return msg["data"]

    rx_cuda_obj = get_event_loop().run_until_complete(read())
    # Exercise the received object (forces device work / validates it's usable).
    rx_cuda_obj + rx_cuda_obj
    num_bytes = nbytes(rx_cuda_obj)
    print(f"TOTAL DATA RECEIVED: {num_bytes}")

    # Regenerate the object locally and compare with what was received.
    cuda_obj_generator = cloudpickle.loads(func)
    pure_cuda_obj = cuda_obj_generator()

    if isinstance(rx_cuda_obj, cupy.ndarray):
        cupy.testing.assert_allclose(rx_cuda_obj, pure_cuda_obj)
    else:
        from cudf.testing._utils import assert_eq

        assert_eq(rx_cuda_obj, pure_cuda_obj)
def _test_from_worker_address_server_fixedsize(num_nodes, queue):
    """Server half of the fixed-size worker-address exchange test.

    Publishes its worker address to ``num_nodes`` clients, receives each
    client's packed address+tags buffer on tag 0, then serves all clients
    concurrently: send arange(10), receive and validate arange(20).
    """

    async def run():
        async def _handle_client(packed_remote_address):
            # Unpack the fixed-size address+tag buffer
            unpacked = _unpack_address_and_tag(packed_remote_address)
            remote_address = ucp.get_ucx_address_from_buffer(unpacked["address"])

            # Create endpoint to remote worker using the received address
            ep = await ucp.create_endpoint_from_worker_address(remote_address)

            # Send data to client's endpoint
            send_msg = np.arange(10, dtype=np.int64)
            await ep.send(send_msg, tag=unpacked["send_tag"], force_tag=True)

            # Receive data from client's endpoint
            recv_msg = np.empty(20, dtype=np.int64)
            await ep.recv(recv_msg, tag=unpacked["recv_tag"], force_tag=True)
            np.testing.assert_array_equal(recv_msg, np.arange(20, dtype=np.int64))

        # Send worker address to client processes via multiprocessing.Queue,
        # one entry for each client.
        address = ucp.get_worker_address()
        for i in range(num_nodes):
            queue.put(address)

        # Frame size of the packed address+tags buffer is fixed and known.
        address_info = _get_address_info()

        server_tasks = []
        for i in range(num_nodes):
            # Receive fixed-size address+tag buffer on tag 0
            packed_remote_address = bytearray(address_info["frame_size"])
            await ucp.recv(packed_remote_address, tag=0)

            # Create an async task for client
            server_tasks.append(_handle_client(packed_remote_address))

        # Await handling each client request
        await asyncio.gather(*server_tasks)

    get_event_loop().run_until_complete(run())
def _test_from_worker_address_error_server(q1, q2, error_type):
    """Server half of the worker-address error tests.

    For ``error_type == "unreachable"`` the worker is torn down *before*
    its address is published, so the client's connection attempt must
    fail.  For every other error type the live address is published
    first, the server waits for the client to confirm readiness, then
    tears the worker down and notifies the client.
    """

    async def run():
        worker_address = bytearray(ucp.get_worker_address())

        if error_type == "unreachable":
            # Tear down first, publish second: the address is stale by
            # the time the client sees it.
            ucp.reset()
            q1.put(worker_address)
            return

        # Publish the live address and wait for the client to connect...
        q1.put(worker_address)
        assert q2.get() == "ready"

        # ...then shut the worker down and tell the client about it.
        ucp.reset()
        q1.put("disconnected")

    get_event_loop().run_until_complete(run())
def _test_shutdown_unexpected_closed_peer_server(
    client_queue, server_queue, endpoint_error_handling
):
    """Server half of the unexpected-peer-shutdown test.

    Sends a message, waits for a signal that the client process has died,
    records whether the endpoint is still considered alive, and asserts
    the expected outcome for the given ``endpoint_error_handling`` mode.
    """
    # Shared with the inner callback via `global` because `server_node`
    # runs asynchronously inside the listener.
    global ep_is_alive
    ep_is_alive = None

    async def run():
        async def server_node(ep):
            try:
                global ep_is_alive

                await ep.send(np.arange(100, dtype=np.int64))

                # Waiting for signal to close the endpoint
                await mp_queue_get_nowait(server_queue)

                # At this point, the client should have died and the endpoint
                # is not alive anymore. `True` only when endpoint error
                # handling is enabled.
                ep_is_alive = ep._ep.is_alive()

                await ep.close()
            finally:
                # Always close the listener so the outer wait loop ends,
                # even if the endpoint operations raised.
                listener.close()

        listener = ucp.create_listener(
            server_node, endpoint_error_handling=endpoint_error_handling
        )
        client_queue.put(listener.port)
        while not listener.closed():
            await asyncio.sleep(0.1)

    # Capture UCX debug logging so the shutdown error can be asserted below.
    log_stream = StringIO()
    logging.basicConfig(stream=log_stream, level=logging.DEBUG)

    get_event_loop().run_until_complete(run())
    log = log_stream.getvalue()

    if endpoint_error_handling is True:
        # Error handling enabled: the dead peer must be detected.
        assert ep_is_alive is False
    else:
        # Without error handling the endpoint still looks alive, but the
        # failed shutdown message shows up in the UCX debug log.
        assert ep_is_alive
        assert log.find("""UCXError('<[Send shutdown]""") != -1
def _test_from_worker_address_client(queue):
    """Client that sends its own worker address (length-prefixed) to the
    server on tag 0, then validates a 10-element reply on tag 1.
    """

    async def run():
        local_address = ucp.get_worker_address()

        # The server published its address on the queue; connect to it.
        server_address = queue.get()
        ep = await ucp.create_endpoint_from_worker_address(server_address)

        # Length prefix first, then the raw address bytes, both on tag 0.
        await ep.send(np.array(local_address.length, np.int64), tag=0, force_tag=True)
        await ep.send(local_address, tag=0, force_tag=True)

        # The server replies with arange(10) on tag 1.
        reply = np.empty(10, dtype=np.int64)
        await ep.recv(reply, tag=1, force_tag=True)
        np.testing.assert_array_equal(reply, np.arange(10, dtype=np.int64))

    get_event_loop().run_until_complete(run())
def _test_from_worker_address_server(queue):
    """Server half of the worker-address exchange test.

    Publishes its worker address, receives the client's length-prefixed
    address on tag 0, connects back to the client, and sends arange(10)
    on tag 1.
    """

    async def run():
        # Send worker address to client process via multiprocessing.Queue
        address = ucp.get_worker_address()
        queue.put(address)

        # Receive address size
        address_size = np.empty(1, dtype=np.int64)
        await ucp.recv(address_size, tag=0)

        # Receive address buffer on tag 0 and create UCXAddress from it
        remote_address = bytearray(address_size[0])
        await ucp.recv(remote_address, tag=0)
        remote_address = ucp.get_ucx_address_from_buffer(remote_address)

        # Create endpoint to remote worker using the received address
        ep = await ucp.create_endpoint_from_worker_address(remote_address)

        # Send data to client's endpoint
        send_msg = np.arange(10, dtype=np.int64)
        await ep.send(send_msg, tag=1, force_tag=True)

    get_event_loop().run_until_complete(run())
def client(env, port, func, verbose):
    """Receive a serialized CUDA object from the server ITERATIONS times
    per connection, echoing the frames back, and time each connection.

    Parameters
    ----------
    env: mapping of environment variables applied to this process.
    port: server listener port.
    func: unused here — kept for signature symmetry with the server.
    verbose: when true, print per-iteration timing.
    """
    # wait for server to come up
    # receive cudf object
    # deserialize
    # assert deserialized msg is cdf
    # send receipt
    os.environ.update(env)
    # Baseline NVLink counters (side effect initializes pynvml); the
    # values themselves are not consumed in this body.
    before_rx, before_tx = total_nvlink_transfer()

    async def read():
        # Give the server a moment to bring its listener up.
        await asyncio.sleep(1)
        ep = await get_ep("client", port)

        for i in range(ITERATIONS):
            # GPU-memory probe kept for ad-hoc debugging (see commented
            # print below); the value is intentionally unused.
            bytes_used = pynvml.nvmlDeviceGetMemoryInfo(
                pynvml.nvmlDeviceGetHandleByIndex(0)
            ).used
            # print("Bytes Used:", bytes_used, i)

            frames, msg = await recv(ep)

            # Send meta data
            await send(ep, frames)

        print("Shutting Down Client...")
        await ep.close()

    set_rmm()
    for i in range(ITERATIONS):
        print("ITER: ", i)
        t = time.time()
        get_event_loop().run_until_complete(read())
        if verbose:
            # Fixed typo in user-facing message ("Time take for interation").
            print("Time taken for iteration %d: %ss" % (i, time.time() - t))

    print("FINISHED")
def server(env, port, func, verbose):
    """Listen forever: for each connection, send a serialized CUDA object
    ITERATIONS times (receiving an echo each time), then close the
    endpoint and re-arm the listener.

    NOTE(review): `verbose` is accepted but never used in this body.
    """
    # create listener receiver
    # write cudf object
    # confirm message is sent correctly
    os.environ.update(env)

    async def f(listener_port):
        # coroutine shows up when the client asks
        # to connect
        set_rmm()

        async def write(ep):
            print("CREATING CUDA OBJECT IN SERVER...")
            cuda_obj_generator = cloudpickle.loads(func)
            cuda_obj = cuda_obj_generator()
            msg = {"data": to_serialize(cuda_obj)}
            frames = await to_frames(msg, serializers=("cuda", "dask", "pickle"))
            # `while True ... break` runs the body exactly once; the shape
            # is kept (presumably a remnant of a retry loop — confirm).
            while True:
                for i in range(ITERATIONS):
                    print("ITER: ", i)
                    # Send meta data
                    await send(ep, frames)
                    # Echo from the client replaces the local frames/msg.
                    frames, msg = await recv(ep)
                print("CONFIRM RECEIPT")
                await ep.close()
                break
                # lf.close()
            # Drop references so the GPU buffers can be freed promptly.
            del msg
            del frames

        lf = ucp.create_listener(write, port=listener_port)
        try:
            while not lf.closed():
                await asyncio.sleep(0.1)
        except ucp.UCXCloseError:
            pass

    loop = get_event_loop()
    # Re-create the listener indefinitely; the process is killed externally.
    while True:
        loop.run_until_complete(f(port))
def server(env, port, func, enable_rmm, num_workers, proc_conn):
    """Serve a pre-serialized CUDA object to multiple client processes.

    Builds the frames once (to avoid filling the GPU), notifies the
    parent process once the listener is up, sends the frames
    TRANSFER_ITERATIONS times to every connection, and closes the
    listener after num_workers * EP_ITERATIONS clients have disconnected.

    Parameters
    ----------
    env: mapping of environment variables applied to this process.
    port: listener port.
    func: cloudpickled zero-arg callable that builds the CUDA object.
    enable_rmm: when true, install the RMM allocator via set_rmm().
    num_workers: number of client processes expected.
    proc_conn: multiprocessing connection used to signal readiness.
    """
    # create frames to send
    # create listener
    # notify parent process of listener status
    # write object to each new connection for TRANSFER_ITERATIONS
    # close listener after num_workers*EP_ITERATIONS have disconnected
    os.environ.update(env)

    import numba.cuda

    # Ensure a CUDA context exists in this process before serializing.
    numba.cuda.current_context()

    loop = get_event_loop()

    # Creates frames only once to prevent filling the entire GPU
    print("CREATING CUDA OBJECT IN SERVER...")
    cuda_obj_generator = cloudpickle.loads(func)
    cuda_obj = cuda_obj_generator()
    msg = {"data": to_serialize(cuda_obj)}
    frames = loop.run_until_complete(
        to_frames(msg, serializers=("cuda", "dask", "pickle"))
    )

    async def f(listener_port, frames):
        # coroutine shows up when the client asks
        # to connect
        if enable_rmm:
            set_rmm()

        # Use a global so the `write` callback function can read frames
        global _frames
        global _connected
        global _disconnected
        global _lock
        _connected = 0
        _disconnected = 0
        _lock = threading.Lock()
        _frames = frames

        async def write(ep):
            global _connected
            global _disconnected
            # `with` instead of bare acquire()/release(): the lock is
            # released even if the counter update raises.
            with _lock:
                _connected += 1

            for i in range(TRANSFER_ITERATIONS):
                print("ITER: ", i)
                # Send meta data
                await send(ep, _frames)

            print("CONFIRM RECEIPT")
            await ep.close()

            with _lock:
                _disconnected += 1

        lf = ucp.create_listener(write, port=listener_port)
        # Tell the parent process the listener is ready to accept clients.
        proc_conn.send("initialized")
        proc_conn.close()

        try:
            while _disconnected < num_workers * EP_ITERATIONS:
                await asyncio.sleep(0.1)
            print("Closing listener")
            lf.close()
        except ucp.UCXCloseError:
            pass

    loop.run_until_complete(f(port, frames))
def server(queue, args):
    """Benchmark server: echo every message back to the client.

    Configures CPU affinity, the array backend (numpy / cupy / cupy+RMM),
    and the transfer API, then serves one client for
    ``n_iter + n_warmup_iter`` echo round-trips before closing.
    The listener port is published on ``queue``.
    """
    if args.server_cpu_affinity >= 0:
        os.sched_setaffinity(0, [args.server_cpu_affinity])

    if args.object_type == "numpy":
        import numpy as xp
    elif args.object_type == "cupy":
        import cupy as xp

        xp.cuda.runtime.setDevice(args.server_dev)
    else:
        # cupy backed by an RMM pool allocator.
        import cupy as xp

        import rmm

        rmm.reinitialize(
            pool_allocator=True,
            managed_memory=False,
            initial_pool_size=args.rmm_init_pool_size,
            devices=[args.server_dev],
        )
        xp.cuda.runtime.setDevice(args.server_dev)
        xp.cuda.set_allocator(rmm.rmm_cupy_allocator)

    ucp.init()

    register_am_allocators(args)

    async def run():
        async def server_handler(ep):
            if not args.enable_am:
                # Pre-allocate receive buffers outside the timed loop; with
                # --reuse-alloc every slot aliases the same buffer.
                msg_recv_list = []
                if not args.reuse_alloc:
                    for _ in range(args.n_iter + args.n_warmup_iter):
                        msg_recv_list.append(xp.zeros(args.n_bytes, dtype="u1"))
                else:
                    t = xp.zeros(args.n_bytes, dtype="u1")
                    for _ in range(args.n_iter + args.n_warmup_iter):
                        msg_recv_list.append(t)

                assert msg_recv_list[0].nbytes == args.n_bytes

            for i in range(args.n_iter + args.n_warmup_iter):
                if args.enable_am is True:
                    recv = await ep.am_recv()
                    await ep.am_send(recv)
                else:
                    await ep.recv(msg_recv_list[i])
                    await ep.send(msg_recv_list[i])
            await ep.close()
            lf.close()

        lf = ucp.create_listener(server_handler, port=args.port)
        # Publish the actual port (args.port may have been 0 / auto-assigned).
        queue.put(lf.port)

        while not lf.closed():
            await asyncio.sleep(0.5)

    loop = get_event_loop()
    loop.run_until_complete(run())
def client(queue, port, server_address, args):
    """Benchmark client: run timed send/recv round-trips and print a report.

    Configures CPU affinity and the array backend (numpy / cupy /
    cupy+RMM), performs ``n_warmup_iter`` untimed plus ``n_iter`` timed
    round-trips against the echo server, publishes the timings on
    ``queue``, then prints bandwidth/latency statistics.
    """
    if args.client_cpu_affinity >= 0:
        os.sched_setaffinity(0, [args.client_cpu_affinity])

    import numpy as np

    if args.object_type == "numpy":
        import numpy as xp
    elif args.object_type == "cupy":
        import cupy as xp

        xp.cuda.runtime.setDevice(args.client_dev)
    else:
        # cupy backed by an RMM pool allocator.
        import cupy as xp

        import rmm

        rmm.reinitialize(
            pool_allocator=True,
            managed_memory=False,
            initial_pool_size=args.rmm_init_pool_size,
            devices=[args.client_dev],
        )
        xp.cuda.runtime.setDevice(args.client_dev)
        xp.cuda.set_allocator(rmm.rmm_cupy_allocator)

    ucp.init()

    register_am_allocators(args)

    async def run():
        ep = await ucp.create_endpoint(server_address, port)

        if args.enable_am:
            msg = xp.arange(args.n_bytes, dtype="u1")
        else:
            # Pre-allocate all buffers outside the timed loop; with
            # --reuse-alloc every slot aliases the same pair of buffers.
            msg_send_list = []
            msg_recv_list = []
            if not args.reuse_alloc:
                for i in range(args.n_iter + args.n_warmup_iter):
                    msg_send_list.append(xp.arange(args.n_bytes, dtype="u1"))
                    msg_recv_list.append(xp.zeros(args.n_bytes, dtype="u1"))
            else:
                t1 = xp.arange(args.n_bytes, dtype="u1")
                t2 = xp.zeros(args.n_bytes, dtype="u1")
                for i in range(args.n_iter + args.n_warmup_iter):
                    msg_send_list.append(t1)
                    msg_recv_list.append(t2)

            assert msg_send_list[0].nbytes == args.n_bytes
            assert msg_recv_list[0].nbytes == args.n_bytes

        if args.cuda_profile:
            xp.cuda.profiler.start()
        times = []
        for i in range(args.n_iter + args.n_warmup_iter):
            start = clock()
            if args.enable_am:
                await ep.am_send(msg)
                await ep.am_recv()
            else:
                await ep.send(msg_send_list[i])
                await ep.recv(msg_recv_list[i])
            stop = clock()
            # Warmup iterations are executed but not recorded.
            if i >= args.n_warmup_iter:
                times.append(stop - start)
        if args.cuda_profile:
            xp.cuda.profiler.stop()
        queue.put(times)

    loop = get_event_loop()
    loop.run_until_complete(run())

    times = queue.get()
    assert len(times) == args.n_iter
    # Each round-trip moves n_bytes in both directions, hence the 2x;
    # latency is reported one-way (divide round-trip by 2).
    bw_avg = format_bytes(2 * args.n_iter * args.n_bytes / sum(times))
    bw_med = format_bytes(2 * args.n_bytes / np.median(times))
    lat_avg = int(sum(times) * 1e9 / (2 * args.n_iter))
    lat_med = int(np.median(times) * 1e9 / 2)

    print("Roundtrip benchmark")
    print_separator(separator="=")
    print_key_value(key="Iterations", value=f"{args.n_iter}")
    print_key_value(key="Bytes", value=f"{format_bytes(args.n_bytes)}")
    print_key_value(key="Object type", value=f"{args.object_type}")
    print_key_value(key="Reuse allocation", value=f"{args.reuse_alloc}")
    print_key_value(key="Transfer API", value=f"{'AM' if args.enable_am else 'TAG'}")
    print_key_value(key="UCX_TLS", value=f"{ucp.get_config()['TLS']}")
    print_key_value(key="UCX_NET_DEVICES", value=f"{ucp.get_config()['NET_DEVICES']}")
    print_separator(separator="=")
    if args.object_type == "numpy":
        print_key_value(key="Device(s)", value="CPU-only")
        s_aff = (
            args.server_cpu_affinity
            if args.server_cpu_affinity >= 0
            else "affinity not set"
        )
        c_aff = (
            args.client_cpu_affinity
            if args.client_cpu_affinity >= 0
            else "affinity not set"
        )
        print_key_value(key="Server CPU", value=f"{s_aff}")
        print_key_value(key="Client CPU", value=f"{c_aff}")
    else:
        print_key_value(key="Device(s)", value=f"{args.server_dev}, {args.client_dev}")
    print_separator(separator="=")
    print_key_value("Bandwidth (average)", value=f"{bw_avg}/s")
    print_key_value("Bandwidth (median)", value=f"{bw_med}/s")
    print_key_value("Latency (average)", value=f"{lat_avg} ns")
    print_key_value("Latency (median)", value=f"{lat_med} ns")
    if not args.no_detailed_report:
        # Per-iteration breakdown of the timed (non-warmup) iterations.
        print_separator(separator="=")
        print_key_value(key="Iterations", value="Bandwidth, Latency")
        print_separator(separator="-")
        for i, t in enumerate(times):
            ts = format_bytes(2 * args.n_bytes / t)
            lat = int(t * 1e9 / 2)
            print_key_value(key=i, value=f"{ts}/s, {lat}ns")
def _test_from_worker_address_error_client(q1, q2, error_type):
    """Client half of the worker-address error tests.

    Depending on ``error_type``, either connects to an already-dead
    worker ("unreachable") and expects the connection/progress to fail,
    or connects to a live worker, coordinates its shutdown through the
    queues, and expects the in-flight send/recv to time out or be
    canceled.
    """

    async def run():
        # Receive worker address from server via multiprocessing.Queue
        remote_address = ucp.get_ucx_address_from_buffer(q1.get())

        if error_type == "unreachable":
            with pytest.raises(
                ucp.exceptions.UCXError,
                match="Destination is unreachable|Endpoint timeout",
            ):
                # Here, two cases may happen:
                # 1. With TCP creating endpoint will immediately raise
                #    "Destination is unreachable"
                # 2. With rc/ud creating endpoint will succeed, but raise
                #    "Endpoint timeout" after UCX_UD_TIMEOUT seconds have
                #    passed. We need to keep progressing UCP until timeout
                #    is raised.
                ep = await ucp.create_endpoint_from_worker_address(remote_address)

                start = time.monotonic()
                while not ep._ep.raise_on_error():
                    ucp.progress()

                    # Prevent hanging
                    if time.monotonic() - start >= 1.0:
                        return
        else:
            # Create endpoint to remote worker, and:
            #
            # 1. For timeout_am_send/timeout_send:
            #    - inform remote worker that local endpoint is ready for
            #      remote shutdown;
            #    - wait for remote worker to shutdown and confirm;
            #    - attempt to send message.
            #
            # 2. For timeout_am_recv/timeout_recv:
            #    - schedule ep.recv;
            #    - inform remote worker that local endpoint is ready for
            #      remote shutdown;
            #    - wait for it to shutdown and confirm
            #    - wait for recv message.
            ep = await ucp.create_endpoint_from_worker_address(remote_address)

            if re.match("timeout.*send", error_type):
                q2.put("ready")

                remote_disconnected = q1.get()
                assert remote_disconnected == "disconnected"

                # Peer is gone: the send must raise an endpoint timeout.
                with pytest.raises(
                    ucp.exceptions.UCXError, match="Endpoint timeout"
                ):
                    if error_type == "timeout_am_send":
                        await asyncio.wait_for(ep.am_send(np.zeros(10)), timeout=1.0)
                    else:
                        await asyncio.wait_for(
                            ep.send(np.zeros(10), tag=0, force_tag=True),
                            timeout=1.0,
                        )
            else:
                # Recv is scheduled *before* the peer shuts down, so the
                # pending request gets canceled rather than timing out.
                with pytest.raises(ucp.exceptions.UCXCanceled):
                    if error_type == "timeout_am_recv":
                        task = asyncio.wait_for(ep.am_recv(), timeout=3.0)
                    else:
                        msg = np.empty(10)
                        task = asyncio.wait_for(
                            ep.recv(msg, tag=0, force_tag=True), timeout=3.0
                        )

                    q2.put("ready")

                    remote_disconnected = q1.get()
                    assert remote_disconnected == "disconnected"

                    await task

    get_event_loop().run_until_complete(run())