async def test_send_recv_am(size, blocking_progress_mode, recv_wait, data):
    """Send a message with active messages (AM) and validate the received copy.

    :param size: int message size in elements/bytes (passed to the generator)
    :param blocking_progress_mode: bool UCX progress mode to initialize with
    :param recv_wait: bool if True, delay the send so the server must wait in
        ``am_recv`` instead of finding the data already available
    :param data: dict with "allocator", "memory_type", "generator" and
        "validator" entries describing the memory kind under test
    """
    # Threshold below which messages are sent eagerly (and thus land on host).
    rndv_thresh = 8192

    ucp.init(
        options={"RNDV_THRESH": str(rndv_thresh)},
        blocking_progress_mode=blocking_progress_mode,
    )

    ucp.register_am_allocator(data["allocator"], data["memory_type"])
    msg = data["generator"](size)

    recv = []
    listener = ucp.create_listener(simple_server(size, recv))
    num_clients = 1
    clients = [
        await ucp.create_endpoint(ucp.get_address(), listener.port)
        for i in range(num_clients)
    ]
    for c in clients:
        if recv_wait:
            # By sleeping here we ensure that the listener's
            # ep.am_recv call will have to wait, rather than return
            # immediately as receive data is already available.
            await asyncio.sleep(1)
        await c.am_send(msg)
    for c in clients:
        await c.close()
    listener.close()

    if data["memory_type"] == "cuda" and msg.nbytes < rndv_thresh:
        # Eager messages are always received on the host, if no host
        # allocator is registered UCX-Py defaults to `bytearray`.
        assert recv[0] == bytearray(msg.get())
    else:
        data["validator"](recv[0], msg)
async def _func_ucp_create_listener(sessionId, r):
    """
    Creates a UCP listener for incoming endpoint connections.
    This function runs in a loop asynchronously in the background
    on the worker

    :param sessionId: uuid Unique id for current instance
    :param r: float a random number to stop the function from being cached
    """
    if "ucp_listener" in worker_state(sessionId):
        # A listener is already registered for this session; don't start another.
        print("Listener already started for sessionId=" + str(sessionId))
    else:
        ucp.init()
        listener = ucp.start_listener(_connection_func, 0, is_coroutine=True)
        worker_state(sessionId)["ucp_listener"] = listener
        # Keep driving the listener coroutine until it reports completion.
        while not listener.done():
            await listener.coroutine
            await asyncio.sleep(1)
        # Tear down: unregister from worker state and finalize UCX.
        del worker_state(sessionId)["ucp_listener"]
        del listener
        ucp.fin()
def init_once():
    """Idempotently initialize UCX-Py and select a CUDA allocation function.

    Sets the module-level globals ``ucp`` (the initialized ucp module) and
    ``cuda_array`` (a callable ``n -> device buffer of n bytes``). Subsequent
    calls are no-ops once ``ucp`` is set.

    Allocator preference order: RMM (``DeviceBuffer`` on >= 0.11.0, else
    ``device_array``), then Numba, else a stub that raises ``RuntimeError``.
    """
    global ucp, cuda_array
    if ucp is not None:
        # Already initialized by an earlier call.
        return

    import ucp as _ucp

    ucp = _ucp
    # Options come from Dask's "ucx" config; explicitly-set UCX_* environment
    # variables still take precedence over them.
    ucp.init(options=dask.config.get("ucx"), env_takes_precedence=True)

    # Find the function, `cuda_array()`, to use when allocating new CUDA arrays.
    # NOTE: plain `def`s instead of lambda assignments (PEP 8 / E731).
    try:
        import rmm

        if hasattr(rmm, "DeviceBuffer"):

            def cuda_array(n):
                return rmm.DeviceBuffer(size=n)

        else:  # pre-0.11.0

            def cuda_array(n):
                return rmm.device_array(n, dtype=np.uint8)

    except ImportError:
        try:
            import numba.cuda

            def cuda_array(n):
                return numba.cuda.device_array((n,), dtype=np.uint8)

        except ImportError:

            def cuda_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )
def test_init_options():
    """Options passed to ``ucp.init()`` must end up in the active config."""
    ucp.reset()
    seg_size = "3M"
    # environment specification should be ignored
    ucp.init({"SEG_SIZE": seg_size})
    assert ucp.get_config()["SEG_SIZE"] == seg_size
def client(port, func, endpoint_error_handling):
    """Connect to the server, then kill this process to trigger an
    "Endpoint timeout" on the server side.

    :param port: int server listener port
    :param func: unused here (kept for signature symmetry with the server)
    :param endpoint_error_handling: bool forwarded to ``get_ep``
    """
    # wait for server to come up
    # receive object
    # process suicides
    ucp.init()  # must create context before importing
    # cudf/cupy/etc

    async def read():
        await asyncio.sleep(1)
        ep = await get_ep("client", port, endpoint_error_handling)
        msg = None
        import cupy

        cupy.cuda.set_allocator(None)

        # NOTE(review): `recv(ep)` is not awaited, so the coroutine never
        # actually runs; sibling code uses `await recv(ep)`. Possibly
        # deliberate since the process is killed immediately below — confirm.
        frames, msg = recv(ep)

        # Client process suicides to force an "Endpoint timeout"
        # on the server
        os.kill(os.getpid(), signal.SIGKILL)

    asyncio.get_event_loop().run_until_complete(read())
def server(port, func, comm_api):
    """Serve one client: build a CUDA object, send it ITERATIONS times, then
    wait for the client's shutdown message and close the listener.

    :param port: int port for the UCP listener
    :param func: bytes cloudpickled zero-arg factory producing the CUDA object
    :param comm_api: str "tag" (tag-matched send/recv) or "am" (active messages)
    """
    # create listener receiver
    # write cudf object
    # confirm message is sent correctly
    from distributed.comm.utils import to_frames
    from distributed.protocol import to_serialize

    ucp.init()

    if comm_api == "am":
        register_am_allocators()

    async def f(listener_port):
        # coroutine shows up when the client asks
        # to connect
        async def write(ep):
            import cupy

            cupy.cuda.set_allocator(None)

            print("CREATING CUDA OBJECT IN SERVER...")
            cuda_obj_generator = cloudpickle.loads(func)
            cuda_obj = cuda_obj_generator()
            msg = {"data": to_serialize(cuda_obj)}
            frames = await to_frames(msg, serializers=("cuda", "dask", "pickle"))
            for i in range(ITERATIONS):
                # Send meta data
                if comm_api == "tag":
                    await send(ep, frames)
                else:
                    await am_send(ep, frames)

            print("CONFIRM RECEIPT")
            close_msg = b"shutdown listener"

            if comm_api == "tag":
                # Tag API: receive the size header first, then the payload.
                msg_size = np.empty(1, dtype=np.uint64)
                await ep.recv(msg_size)

                msg = np.empty(msg_size[0], dtype=np.uint8)
                await ep.recv(msg)
            else:
                msg = await ep.am_recv()

            recv_msg = msg.tobytes()
            assert recv_msg == close_msg
            print("Shutting Down Server...")
            await ep.close()
            lf.close()

        lf = ucp.create_listener(write, port=listener_port)
        try:
            # Spin until the write() handler closes the listener.
            while not lf.closed():
                await asyncio.sleep(0.1)
        except ucp.UCXCloseError:
            pass

    loop = get_event_loop()
    loop.run_until_complete(f(port))
async def test_fence(blocking_progress_mode):
    """Fencing a freshly initialized context must always succeed.

    Declared async so progress tasks are cleared and no warnings leak.
    """
    ucp.init(blocking_progress_mode=blocking_progress_mode)
    ucp.fence()
def test_init_options():
    """By default the options dict overrides a UCX_* environment variable."""
    ucp.reset()
    os.environ["UCX_SEG_SIZE"] = "2M"  # Should be ignored
    opts = {"SEG_SIZE": "3M"}
    ucp.init(opts)
    cfg = ucp.get_config()
    assert cfg["SEG_SIZE"] == opts["SEG_SIZE"]
def test_init_options_and_env():
    """With ``env_takes_precedence=True`` the environment variable must win.

    BUG FIX: the assertion previously compared against the options dict,
    contradicting the in-line comment ("Should be ignored") and the sibling
    test that checks ``os.environ["UCX_SEG_SIZE"]``.
    """
    ucp.reset()
    os.environ["UCX_SEG_SIZE"] = "4M"
    options = {"SEG_SIZE": "3M"}  # Should be ignored
    ucp.init(options, env_takes_precedence=True)
    config = ucp.get_config()
    # The environment value, not the options value, must be in effect.
    assert config["SEG_SIZE"] == os.environ["UCX_SEG_SIZE"]
def start(self):
    """Initialize UCX, start a listener on ``self._input_port`` and schedule
    its serving coroutine as a task on the current event loop."""

    async def serve_forever(client_ep, listener_instance):
        # Wrap each accepted endpoint in a UCX comm and hand it to the handler.
        ucx = UCX(
            client_ep,
            local_addr=self.address,
            peer_addr=self.address,  # TODO: https://github.com/Akshay-Venkatesh/ucx-py/issues/111
            deserialize=self.deserialize,
        )
        self.listener_instance = listener_instance
        if self.comm_handler:
            await self.comm_handler(ucx)

    ucp.init()
    self.ucp_server = ucp.start_listener(
        serve_forever, listener_port=self._input_port, is_coroutine=True
    )

    try:
        # Prefer the running loop (3.7+); fall back for older call contexts
        # where get_running_loop is missing or no loop is running.
        loop = asyncio.get_running_loop()
    except (RuntimeError, AttributeError):
        loop = asyncio.get_event_loop()

    t = loop.create_task(self.ucp_server.coroutine)
    self._task = t
def _worker_process(queue, rank, server_address, n_workers, ucx_options_list, func, args):
    """Worker-process entry point: build an all-to-all mesh of endpoints
    between ``n_workers`` ranks, then run ``func(rank, eps, args)``.

    :param queue: multiprocessing queue used to publish this worker's
        listener port and later to return ``func``'s result
    :param rank: int this worker's rank
    :param server_address: address the other workers listen on
    :param ucx_options_list: per-rank UCX options, or None to skip init here
    """
    import ucp

    if ucx_options_list is not None:
        ucp.init(ucx_options_list[rank])

    async def run():
        eps = {}

        async def server_handler(ep):
            # Incoming peer announces its rank first; index the endpoint by it.
            peer_rank = np.empty((1, ), dtype=np.uint64)
            await ep.recv(peer_rank)
            assert peer_rank[0] not in eps
            eps[peer_rank[0]] = ep

        lf = ucp.create_listener(server_handler)
        queue.put(lf.port)
        port_list = queue.get()
        # Actively connect only to higher ranks; lower ranks connect to us,
        # so each pair ends up with exactly one connection.
        for i in range(rank + 1, n_workers):
            assert i not in eps
            eps[i] = await ucp.create_endpoint(server_address, port_list[i])
            await eps[i].send(np.array([rank], dtype=np.uint64))
        # Wait until every other rank is connected (inbound or outbound).
        while len(eps) != n_workers - 1:
            await asyncio.sleep(0.1)
        if asyncio.iscoroutinefunction(func):
            return await func(rank, eps, args)
        else:
            return func(rank, eps, args)

    loop = asyncio.get_event_loop()
    ret = loop.run_until_complete(run())
    queue.put(ret)
def ucp_init():
    """Fixture-style generator: set up the environment and initialize UCX,
    yield control to the test body, and always finalize UCX on exit."""
    try:
        set_env()
        ucp.init()
        yield
    finally:
        # Guarantee UCX teardown even if the test body raised.
        ucp.fin()
def test_init_options_and_env():
    """Environment variables win when ``env_takes_precedence=True``."""
    ucp.reset()
    opts = {"SEG_SIZE": "3M"}  # Should be ignored
    ucp.init(opts, env_takes_precedence=True)
    cfg = ucp.get_config()
    assert cfg["SEG_SIZE"] == os.environ["UCX_SEG_SIZE"]
    # Provided options dict was not modified.
    assert opts == {"SEG_SIZE": "3M"}
def server(port, func, endpoint_error_handling):
    """Serve one client, capture logs, and report via exit status whether an
    "Endpoint timeout" was observed.

    :param port: int port for the UCP listener
    :param func: bytes cloudpickled zero-arg factory producing the CUDA object
    :param endpoint_error_handling: bool forwarded to the listener
    """
    # create listener receiver
    # add queue logger
    # write cudf object
    # terminates ep/listener
    # checks that "Endpoint timeout" was logged
    ucp.init()

    log_queue, log_listener = get_log_queue_handler()
    log_listener.start()

    async def f(listener_port):
        # coroutine shows up when the client asks
        # to connect
        async def write(ep):
            import cupy

            cupy.cuda.set_allocator(None)

            print("CREATING CUDA OBJECT IN SERVER...")
            cuda_obj_generator = cloudpickle.loads(func)
            cuda_obj = cuda_obj_generator()
            msg = {"data": to_serialize(cuda_obj)}
            frames = await to_frames(msg, serializers=("cuda", "dask", "pickle"))

            # Send meta data
            try:
                await send(ep, frames)
            except Exception:
                # Avoids process hanging on "Endpoint timeout"
                pass

            print("Shutting Down Server...")
            await ep.close()
            lf.close()

        lf = ucp.create_listener(
            write, port=listener_port, endpoint_error_handling=endpoint_error_handling
        )
        try:
            while not lf.closed():
                await asyncio.sleep(0.1)
        except ucp.UCXCloseError:
            pass

        log_listener.stop()

    asyncio.get_event_loop().run_until_complete(f(port))

    # Check log for the expected "Endpoint timeout" and exits with
    # status -80 if encountered, 0 otherwise. The process will exit
    # with status -6 when endpoint_error_callback=False.
    while not log_queue.empty():
        log = log_queue.get()
        if "Endpoint timeout" in log.getMessage():
            sys.exit(-80)
    sys.exit(0)
def client(queue, port, args):
    """Benchmark client: ping-pong ``args.n_bytes`` messages ``args.n_iter``
    times against the server and put the per-iteration timings on ``queue``.

    :param queue: multiprocessing queue receiving the list of round-trip times
    :param port: int server port
    :param args: parsed CLI namespace (object_type, devices, n_iter, ...)
    """
    import ucp

    ucp.init()

    # `np` is deliberately aliased to numpy or cupy depending on object_type,
    # so the benchmark body below is allocator-agnostic.
    if args.object_type == "numpy":
        import numpy as np
    elif args.object_type == "cupy":
        import cupy as np

        np.cuda.runtime.setDevice(args.client_dev)
    else:
        import cupy as np
        import rmm

        rmm.reinitialize(
            pool_allocator=True,
            managed_memory=False,
            initial_pool_size=args.rmm_init_pool_size,
            devices=[args.client_dev],
        )
        np.cuda.runtime.setDevice(args.client_dev)
        np.cuda.set_allocator(rmm.rmm_cupy_allocator)

    async def run():
        ep = await ucp.create_endpoint(args.server_address, port)

        msg_send_list = []
        msg_recv_list = []
        if not args.reuse_alloc:
            # Fresh buffers per iteration to include allocation effects.
            for i in range(args.n_iter):
                msg_send_list.append(np.arange(args.n_bytes, dtype="u1"))
                msg_recv_list.append(np.zeros(args.n_bytes, dtype="u1"))
        else:
            # Single shared pair of buffers reused across all iterations.
            t1 = np.arange(args.n_bytes, dtype="u1")
            t2 = np.zeros(args.n_bytes, dtype="u1")
            for i in range(args.n_iter):
                msg_send_list.append(t1)
                msg_recv_list.append(t2)
        assert msg_send_list[0].nbytes == args.n_bytes
        assert msg_recv_list[0].nbytes == args.n_bytes

        if args.cuda_profile:
            np.cuda.profiler.start()
        times = []
        for i in range(args.n_iter):
            start = clock()
            await ep.send(msg_send_list[i], args.n_bytes)
            await ep.recv(msg_recv_list[i], args.n_bytes)
            stop = clock()
            times.append(stop - start)
        if args.cuda_profile:
            np.cuda.profiler.stop()
        queue.put(times)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(run())
    loop.close()
def client(port, func, comm_api):
    """Receive a CUDA object from the server ITERATIONS times, tell the
    server to shut down, and validate the payload against a locally built
    reference object.

    :param port: int server listener port
    :param func: bytes cloudpickled zero-arg factory producing the reference
    :param comm_api: str "tag" or "am"
    """
    # wait for server to come up
    # receive cudf object
    # deserialize
    # assert deserialized msg is cdf
    # send receipt
    from distributed.utils import nbytes

    ucp.init()

    if comm_api == "am":
        register_am_allocators()

    # must create context before importing
    # cudf/cupy/etc

    async def read():
        await asyncio.sleep(1)
        ep = await get_ep("client", port)
        msg = None
        import cupy

        cupy.cuda.set_allocator(None)

        for i in range(ITERATIONS):
            if comm_api == "tag":
                frames, msg = await recv(ep)
            else:
                frames, msg = await am_recv(ep)

        close_msg = b"shutdown listener"

        if comm_api == "tag":
            # Tag API: size header first, then the payload.
            close_msg_size = np.array([len(close_msg)], dtype=np.uint64)
            await ep.send(close_msg_size)
            await ep.send(close_msg)
        else:
            await ep.am_send(close_msg)

        print("Shutting Down Client...")
        return msg["data"]

    rx_cuda_obj = asyncio.get_event_loop().run_until_complete(read())
    # Exercise the received object (forces materialization / basic sanity).
    rx_cuda_obj + rx_cuda_obj
    num_bytes = nbytes(rx_cuda_obj)
    print(f"TOTAL DATA RECEIVED: {num_bytes}")

    cuda_obj_generator = cloudpickle.loads(func)
    pure_cuda_obj = cuda_obj_generator()

    if isinstance(rx_cuda_obj, cupy.ndarray):
        cupy.testing.assert_allclose(rx_cuda_obj, pure_cuda_obj)
    else:
        from cudf.testing._utils import assert_eq

        assert_eq(rx_cuda_obj, pure_cuda_obj)
async def connect(self, address: str, deserialize=True, **connection_args) -> UCX:
    """Open a UCX endpoint to ``address`` and wrap it in a comm object."""
    logger.debug("UCXConnector.connect: %s", address)
    ucp.init()
    host, port = parse_host_port(address)
    endpoint = await ucp.get_endpoint(host.encode(), port)
    return self.comm_class(
        endpoint,
        local_addr=None,
        peer_addr=self.prefix + address,
        deserialize=deserialize,
    )
async def test_zero_port():
    """Requesting port 0 must bind an OS-assigned, valid TCP port."""
    ucp.init()
    listener = ucp.start_listener(talk_to_client, listener_port=0, is_coroutine=True)
    assert 0 < listener.port < 2**16

    host = ucp.get_address()
    await asyncio.gather(
        listener.coroutine,
        talk_to_server(host.encode(), listener.port),
    )
    ucp.fin()
def server(queue, args):
    """Benchmark server: echo ``args.n_bytes`` messages ``args.n_iter`` times
    for a single client, then close.

    :param queue: multiprocessing queue used to publish the listener port
    :param args: parsed CLI namespace (object_type, devices, n_iter, ...)
    """
    if args.server_cpu_affinity >= 0:
        os.sched_setaffinity(0, [args.server_cpu_affinity])

    ucp.init()

    # `np` is deliberately aliased to numpy or cupy depending on object_type,
    # so the echo loop below is allocator-agnostic.
    if args.object_type == "numpy":
        import numpy as np
    elif args.object_type == "cupy":
        import cupy as np

        np.cuda.runtime.setDevice(args.server_dev)
    else:
        import cupy as np
        import rmm

        rmm.reinitialize(
            pool_allocator=True,
            managed_memory=False,
            initial_pool_size=args.rmm_init_pool_size,
            devices=[args.server_dev],
        )
        np.cuda.runtime.setDevice(args.server_dev)
        np.cuda.set_allocator(rmm.rmm_cupy_allocator)

    async def run():
        async def server_handler(ep):
            msg_recv_list = []
            if not args.reuse_alloc:
                # Fresh buffer per iteration to include allocation effects.
                for _ in range(args.n_iter):
                    msg_recv_list.append(np.zeros(args.n_bytes, dtype="u1"))
            else:
                # Single shared buffer reused across all iterations.
                t = np.zeros(args.n_bytes, dtype="u1")
                for _ in range(args.n_iter):
                    msg_recv_list.append(t)
            assert msg_recv_list[0].nbytes == args.n_bytes

            for i in range(args.n_iter):
                await ep.recv(msg_recv_list[i], args.n_bytes)
                await ep.send(msg_recv_list[i], args.n_bytes)
            await ep.close()
            lf.close()

        lf = ucp.create_listener(server_handler)
        queue.put(lf.port)

        while not lf.closed():
            await asyncio.sleep(0.5)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(run())
    loop.close()
async def test_send_recv_bytes(size, blocking_progress_mode):
    """Round-trip a bytearray through an echo server."""
    ucp.init(blocking_progress_mode=blocking_progress_mode)

    payload = bytearray(b"m" * size)
    payload_size = np.array([len(payload)], dtype=np.uint64)

    listener = ucp.create_listener(make_echo_server(lambda n: bytearray(n)))
    ep = await ucp.create_endpoint(ucp.get_address(), listener.port)
    await ep.send(payload_size)
    await ep.send(payload)

    echoed = bytearray(size)
    await ep.recv(echoed)
    assert echoed == payload
def test_check_transport(transports):
    """Initialize UCX restricted to ``transports`` and verify the active
    transport set: every requested transport appears, and none of the
    known-but-unrequested ones ("posix"/"tcp") do.

    BUG FIX: the second loop used ``any(not at.startswith(it) ...)``, which is
    trivially true whenever at least one active transport merely has a
    different prefix; the intent is that NO active transport uses an inactive
    layer, i.e. ``all(not ...)``.
    """
    transports_list = transports.split(",")
    inactive_transports = list(set(["posix", "tcp"]) - set(transports_list))

    ucp.reset()
    options = {"TLS": transports, "NET_DEVICES": "all"}
    ucp.init(options)

    active_transports = ucp.get_active_transports()
    for t in transports_list:
        # Each requested transport must appear among the active ones.
        assert any(at.startswith(t) for at in active_transports)
    for it in inactive_transports:
        # No active transport may belong to an unrequested layer.
        assert all(not at.startswith(it) for at in active_transports)
async def test_send_recv_numpy(size, dtype, blocking_progress_mode):
    """Round-trip a NumPy array through an echo server."""
    ucp.init(blocking_progress_mode=blocking_progress_mode)

    payload = np.arange(size, dtype=dtype)
    payload_size = np.array([payload.nbytes], dtype=np.uint64)

    listener = ucp.create_listener(
        make_echo_server(lambda n: np.empty(n, dtype=np.uint8))
    )
    ep = await ucp.create_endpoint(ucp.get_address(), listener.port)
    await ep.send(payload_size)
    await ep.send(payload)

    echoed = np.empty_like(payload)
    await ep.recv(echoed)
    np.testing.assert_array_equal(echoed, payload)
async def test_send_recv_error(blocking_progress_mode):
    """Receiving into a too-large buffer raises a length-mismatch UCXError."""
    ucp.init(blocking_progress_mode=blocking_progress_mode)

    async def say_hey_server(ep):
        await ep.send(bytearray(b"Hey"))

    listener = ucp.create_listener(say_hey_server)
    ep = await ucp.create_endpoint(ucp.get_address(), listener.port)

    buf = bytearray(100)
    with pytest.raises(
        ucp.exceptions.UCXError,
        match=r"length mismatch: 3 \(got\) != 100 \(expected\)",
    ):
        await ep.recv(buf)
async def echo_pair(cuda_info=None):
    """Fixture-style generator: start an echo listener plus a client endpoint,
    yield the pair, and tear everything down afterwards.

    :param cuda_info: optional CUDA configuration forwarded to the echo server
    """
    ucp.init()
    loop = asyncio.get_event_loop()
    listener = ucp.start_listener(ucp.make_server(cuda_info), is_coroutine=True)
    t = loop.create_task(listener.coroutine)
    address = ucp.get_address()
    client = await ucp.get_endpoint(address.encode(), listener.port)
    try:
        yield listener, client
    finally:
        # Destroy the endpoint first so the listener task can finish,
        # then finalize UCX.
        ucp.destroy_ep(client)
        await t
        ucp.fin()
async def test_send_recv_obj(blocking_progress_mode):
    """``send_obj``/``recv_obj`` round-trip through an echoing listener."""
    ucp.init(blocking_progress_mode=blocking_progress_mode)

    async def echo_obj_server(ep):
        received = await ep.recv_obj()
        await ep.send_obj(received)

    listener = ucp.create_listener(echo_obj_server)
    ep = await ucp.create_endpoint(ucp.get_address(), listener.port)

    payload = bytearray(b"hello")
    await ep.send_obj(payload)
    echoed = await ep.recv_obj()
    assert payload == echoed
async def test_send_recv_numba(size, dtype, blocking_progress_mode):
    """Round-trip a Numba CUDA device array through an echo server.

    Skipped when numba.cuda is unavailable.
    """
    ucp.init(blocking_progress_mode=blocking_progress_mode)
    cuda = pytest.importorskip("numba.cuda")

    ary = np.arange(size, dtype=dtype)
    msg = cuda.to_device(ary)
    msg_size = np.array([msg.nbytes], dtype=np.uint64)
    listener = ucp.create_listener(
        make_echo_server(lambda n: cuda.device_array((n, ), dtype=np.uint8))
    )
    client = await ucp.create_endpoint(ucp.get_address(), listener.port)
    await client.send(msg_size)
    await client.send(msg)
    resp = cuda.device_array_like(msg)
    await client.recv(resp)
    # Compare on the host: np.array() copies device arrays back.
    np.testing.assert_array_equal(np.array(resp), np.array(msg))
def initialize(
    create_cuda_context=True,
    enable_tcp_over_ucx=False,
    enable_infiniband=False,
    enable_nvlink=False,
    net_devices="",
):
    """Optionally create a CUDA context and, when any UCX transport flag is
    set, (re)initialize UCX with the matching TLS options and mirror the
    resulting configuration into ``UCX_*`` environment variables.

    :param create_cuda_context: bool create a CUDA context via Numba
    :param enable_tcp_over_ucx: bool enable TCP over UCX
    :param enable_infiniband: bool add the InfiniBand ("rc") transport
    :param enable_nvlink: bool add the NVLink ("cuda_ipc") transport
    :param net_devices: str value for UCX_NET_DEVICES ("" means leave unset)
    """
    if create_cuda_context:
        try:
            numba.cuda.current_context()
        except Exception:
            logger.error("Unable to start CUDA Context", exc_info=True)

    if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
        try:
            import ucp
        except ImportError:
            logger.error(
                "UCX protocol requested but ucp module is not available",
                exc_info=True,
            )
        else:
            options = {}
            if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
                tls = "tcp,sockcm,cuda_copy"
                tls_priority = "sockcm"
                if enable_infiniband:
                    tls = "rc," + tls
                if enable_nvlink:
                    tls = tls + ",cuda_ipc"
                options = {"TLS": tls, "SOCKADDR_TLS_PRIORITY": tls_priority}
                if net_devices is not None and net_devices != "":
                    options["NET_DEVICES"] = net_devices
            ucp.reset()
            ucp.init(options=options)

            ucx_env = {}
            for k, v in ucp.get_config().items():
                # Skip values that aren't actual environment variables
                # (i.e., not strings)
                if isinstance(v, str):
                    ucx_env["UCX_" + k] = v
            # Set also UCX environment variables: required by Dask client.
            # It may be best to have the client asking the scheduler for
            # the proper variables.
            os.environ.update(ucx_env)
async def test_send_recv_obj_numpy(blocking_progress_mode):
    """``send_obj``/``recv_obj`` round-trip using a NumPy-backed allocator."""
    ucp.init(blocking_progress_mode=blocking_progress_mode)
    allocator = functools.partial(np.empty, dtype=np.uint8)

    async def echo_obj_server(ep):
        received = await ep.recv_obj(allocator=allocator)
        await ep.send_obj(received)

    listener = ucp.create_listener(echo_obj_server)
    ep = await ucp.create_endpoint(ucp.get_address(), listener.port)

    payload = bytearray(b"hello")
    await ep.send_obj(payload)
    echoed = await ep.recv_obj(allocator=allocator)
    assert payload == echoed
async def test_send_recv_cupy(size, dtype, blocking_progress_mode):
    """Round-trip a CuPy array through an echo server.

    Skipped when cupy is unavailable.
    """
    # Install a custom handler so loop exceptions are surfaced by the test.
    asyncio.get_event_loop().set_exception_handler(handle_exception)
    ucp.reset()
    ucp.init(blocking_progress_mode=blocking_progress_mode)
    cupy = pytest.importorskip("cupy")

    msg = cupy.arange(size, dtype=dtype)
    msg_size = np.array([msg.nbytes], dtype=np.uint64)
    listener = ucp.create_listener(
        make_echo_server(lambda n: cupy.empty((n, ), dtype=np.uint8))
    )
    client = await ucp.create_endpoint(ucp.get_address(), listener.port)
    await client.send(msg_size)
    await client.send(msg)
    resp = cupy.empty_like(msg)
    await client.recv(resp)
    # Compare on the host after copying both device arrays back.
    np.testing.assert_array_equal(cupy.asnumpy(resp), cupy.asnumpy(msg))
async def test_send_recv_error(blocking_progress_mode):
    """Receiving into a too-large buffer raises a length-mismatch UCXError,
    after which endpoint and listener still close cleanly."""
    # Install a custom handler so loop exceptions are surfaced by the test.
    asyncio.get_event_loop().set_exception_handler(handle_exception)
    ucp.reset()
    ucp.init(blocking_progress_mode=blocking_progress_mode)

    async def say_hey_server(ep):
        await ep.send(bytearray(b"Hey"))

    listener = ucp.create_listener(say_hey_server)
    client = await ucp.create_endpoint(ucp.get_address(), listener.port)

    msg = bytearray(100)
    with pytest.raises(
        ucp.exceptions.UCXError,
        match=r"length mismatch: 3 \(got\) != 100 \(expected\)",
    ):
        await client.recv(msg)

    # Verify teardown still works after the error.
    await client.close()
    listener.close()

    del client

    assert listener.closed() is True

    del listener