def client(env, port, func, enable_rmm):
    """Connect to the server's listener repeatedly and drain its messages.

    For each of ``EP_ITERATIONS`` rounds a new endpoint is obtained through
    ``get_ep("client", port)``, ``TRANSFER_ITERATIONS`` messages are received
    on it and the endpoint is closed again.

    Parameters
    ----------
    env, func
        Accepted but not used inside this function.
    port
        Port handed to ``get_ep`` when connecting.
    enable_rmm
        When truthy, ``set_rmm()`` is called before the first connection.
    """
    import numba.cuda

    # Touch the current CUDA context before any transfer takes place.
    numba.cuda.current_context()

    async def receive_all():
        await asyncio.sleep(1)
        endpoint = await get_ep("client", port)

        for _ in range(TRANSFER_ITERATIONS):
            _frames, message = await recv(endpoint)
            print("size of the message: ", len(message["data"]))

        print("Shutting Down Client...")
        await endpoint.close()

    if enable_rmm:
        set_rmm()

    # One fresh endpoint (and one full run of the event loop) per round.
    for round_no in range(EP_ITERATIONS):
        print("ITER: ", round_no)
        get_event_loop().run_until_complete(receive_all())

    print("FINISHED")
예제 #2
0
def _test_from_worker_address_client_fixedsize(queue):
    """Client half of the fixed-size worker-address exchange.

    The client packs its own worker address together with two random tags,
    connects to the server whose address arrives on ``queue``, sends the
    packed buffer on tag 0, then receives ten ``int64`` values on the receive
    tag and sends twenty ``int64`` values on the send tag.
    """
    async def _client():
        # Local worker address plus one random tag per direction.
        local_address = ucp.get_worker_address()
        tag_in = ucp.utils.hash64bits(os.urandom(16))
        tag_out = ucp.utils.hash64bits(os.urandom(16))
        handshake = _pack_address_and_tag(local_address, tag_in, tag_out)

        # The server publishes its worker address through the queue.
        server_address = queue.get()
        server_ep = await ucp.create_endpoint_from_worker_address(
            server_address)

        # Announce ourselves on tag 0.
        await server_ep.send(handshake, tag=0, force_tag=True)

        # The server answers with arange(10) on our receive tag.
        incoming = np.empty(10, dtype=np.int64)
        await server_ep.recv(incoming, tag=tag_in, force_tag=True)
        np.testing.assert_array_equal(incoming, np.arange(10, dtype=np.int64))

        # Reply with arange(20) on our send tag.
        outgoing = np.arange(20, dtype=np.int64)
        await server_ep.send(outgoing, tag=tag_out, force_tag=True)

    get_event_loop().run_until_complete(_client())
예제 #3
0
def _test_shutdown_unexpected_closed_peer_client(client_queue, server_queue,
                                                 endpoint_error_handling):
    """Connect to the server and receive a single 100-element message.

    The server port is read from ``client_queue``; ``server_queue`` is not
    used by this function. ``endpoint_error_handling`` is forwarded to
    ``ucp.create_endpoint`` unchanged.
    """
    async def _connect_and_receive():
        port = client_queue.get()
        endpoint = await ucp.create_endpoint(
            ucp.get_address(),
            port,
            endpoint_error_handling=endpoint_error_handling,
        )
        buffer = np.empty(100, dtype=np.int64)
        await endpoint.recv(buffer)

    get_event_loop().run_until_complete(_connect_and_receive())
예제 #4
0
def server(port, func, comm_api):
    """Serve one CUDA object to a connecting client and wait for its receipt.

    A listener is created on ``port``. For every connection the object
    produced by the cloudpickled ``func`` is serialized into frames and sent
    ``ITERATIONS`` times, using the tag API when ``comm_api == "tag"`` and the
    active-message API otherwise. The server then expects the client to send
    back ``b"shutdown listener"`` and closes both endpoint and listener.
    """
    from distributed.comm.utils import to_frames
    from distributed.protocol import to_serialize

    ucp.init()

    if comm_api == "am":
        register_am_allocators()

    use_tag_api = comm_api == "tag"

    async def serve(listener_port):
        # Invoked by the listener whenever a client connects.
        async def on_connect(ep):
            import cupy

            cupy.cuda.set_allocator(None)

            print("CREATING CUDA OBJECT IN SERVER...")
            build_object = cloudpickle.loads(func)
            cuda_obj = build_object()
            payload = {"data": to_serialize(cuda_obj)}
            frames = await to_frames(
                payload, serializers=("cuda", "dask", "pickle"))

            for _ in range(ITERATIONS):
                if use_tag_api:
                    await send(ep, frames)
                else:
                    await am_send(ep, frames)

            print("CONFIRM RECEIPT")
            expected_close = b"shutdown listener"

            if use_tag_api:
                # Tag API: the size arrives first, then the message itself.
                size_buf = np.empty(1, dtype=np.uint64)
                await ep.recv(size_buf)

                reply = np.empty(size_buf[0], dtype=np.uint8)
                await ep.recv(reply)
            else:
                reply = await ep.am_recv()

            reply_bytes = reply.tobytes()
            assert reply_bytes == expected_close
            print("Shutting Down Server...")
            await ep.close()
            listener.close()

        listener = ucp.create_listener(on_connect, port=listener_port)
        try:
            # Poll until the connection handler has closed the listener.
            while not listener.closed():
                await asyncio.sleep(0.1)
        except ucp.UCXCloseError:
            pass

    get_event_loop().run_until_complete(serve(port))
예제 #5
0
def client(port, func, comm_api):
    """Receive the server's CUDA object, acknowledge it and validate it.

    The client waits for the server to come up, connects via
    ``get_ep("client", port)`` and receives ``ITERATIONS`` messages, using the
    tag API when ``comm_api == "tag"`` and the active-message API otherwise.
    It then sends ``b"shutdown listener"`` as a receipt. The ``"data"`` entry
    of the last received message is compared against a fresh object built
    from the cloudpickled ``func``.

    Parameters
    ----------
    port
        Port passed to ``get_ep``.
    func
        Cloudpickled zero-argument callable producing the reference object.
    comm_api
        ``"tag"`` or ``"am"``; ``"am"`` also registers the AM allocators.
    """
    from distributed.utils import nbytes

    ucp.init()

    if comm_api == "am":
        register_am_allocators()

    # must create context before importing
    # cudf/cupy/etc

    async def read():
        # Give the server a moment to come up before connecting.
        await asyncio.sleep(1)
        ep = await get_ep("client", port)
        msg = None
        import cupy

        cupy.cuda.set_allocator(None)
        for _ in range(ITERATIONS):
            if comm_api == "tag":
                frames, msg = await recv(ep)
            else:
                frames, msg = await am_recv(ep)

        close_msg = b"shutdown listener"

        if comm_api == "tag":
            # Tag API: announce the receipt's size, then send the receipt.
            close_msg_size = np.array([len(close_msg)], dtype=np.uint64)

            await ep.send(close_msg_size)
            await ep.send(close_msg)
        else:
            await ep.am_send(close_msg)

        print("Shutting Down Client...")
        return msg["data"]

    rx_cuda_obj = get_event_loop().run_until_complete(read())
    # Run one operation on the received object; the result is discarded.
    rx_cuda_obj + rx_cuda_obj
    num_bytes = nbytes(rx_cuda_obj)
    print(f"TOTAL DATA RECEIVED: {num_bytes}")

    cuda_obj_generator = cloudpickle.loads(func)
    pure_cuda_obj = cuda_obj_generator()

    # `read()` imports cupy only into its own scope, so bind it here as well
    # instead of depending on a module-level `cupy` name. The module is
    # already loaded at this point, so this is just a name binding.
    import cupy

    if isinstance(rx_cuda_obj, cupy.ndarray):
        cupy.testing.assert_allclose(rx_cuda_obj, pure_cuda_obj)
    else:
        from cudf.testing._utils import assert_eq

        assert_eq(rx_cuda_obj, pure_cuda_obj)
예제 #6
0
def _test_from_worker_address_server_fixedsize(num_nodes, queue):
    """Server half of the fixed-size worker-address exchange.

    The server publishes its worker address on ``queue`` once per client,
    receives ``num_nodes`` packed address+tag buffers on tag 0, and then
    serves all clients concurrently: each gets ``arange(10)`` and must answer
    with ``arange(20)``.
    """
    async def _serve():
        async def _serve_one(packed):
            # Split the fixed-size buffer into address and tags.
            info = _unpack_address_and_tag(packed)
            client_address = ucp.get_ucx_address_from_buffer(info["address"])

            # Connect back to the client through its worker address.
            ep = await ucp.create_endpoint_from_worker_address(client_address)

            outgoing = np.arange(10, dtype=np.int64)
            await ep.send(outgoing, tag=info["send_tag"], force_tag=True)

            incoming = np.empty(20, dtype=np.int64)
            await ep.recv(incoming, tag=info["recv_tag"], force_tag=True)

            np.testing.assert_array_equal(
                incoming, np.arange(20, dtype=np.int64))

        # One queue entry per client process.
        local_address = ucp.get_worker_address()
        for _ in range(num_nodes):
            queue.put(local_address)

        address_info = _get_address_info()

        pending = []
        for _ in range(num_nodes):
            # Every client announces itself with a fixed-size buffer on tag 0.
            packed = bytearray(address_info["frame_size"])
            await ucp.recv(packed, tag=0)

            # Only create the coroutine here; all are awaited together below.
            pending.append(_serve_one(packed))

        await asyncio.gather(*pending)

    get_event_loop().run_until_complete(_serve())
def _test_from_worker_address_error_server(q1, q2, error_type):
    """Publish the worker address and reset UCX to provoke client errors.

    For ``error_type == "unreachable"`` UCX is reset before the address is put
    on ``q1``. For any other value the address is published first, the
    function waits for ``"ready"`` on ``q2``, resets UCX and finally puts
    ``"disconnected"`` on ``q1``.
    """
    async def _publish_then_reset():
        local_address = bytearray(ucp.get_worker_address())

        if error_type == "unreachable":
            # Shut the worker down first, then hand out its (now dead)
            # address via multiprocessing.Queue.
            ucp.reset()
            q1.put(local_address)
            return

        # Send worker address to client process via multiprocessing.Queue,
        # wait for client to connect, then shutdown worker.
        q1.put(local_address)

        client_state = q2.get()
        assert client_state == "ready"

        ucp.reset()

        q1.put("disconnected")

    get_event_loop().run_until_complete(_publish_then_reset())
예제 #8
0
def _test_shutdown_unexpected_closed_peer_server(client_queue, server_queue,
                                                 endpoint_error_handling):
    """Server side of the "peer closed unexpectedly" shutdown test.

    A listener is created and its port is published on ``client_queue``. The
    connection handler sends 100 ``int64`` values, waits for an item on
    ``server_queue``, records whether the endpoint is still alive in the
    module-level ``ep_is_alive`` and closes the endpoint. Afterwards the
    recorded liveness (and, without error handling, the captured log output)
    is asserted on.
    """
    # Module-level flag written by the connection handler below.
    global ep_is_alive
    ep_is_alive = None

    async def run():
        async def server_node(ep):
            try:
                global ep_is_alive

                await ep.send(np.arange(100, dtype=np.int64))
                # Waiting for signal to close the endpoint
                await mp_queue_get_nowait(server_queue)

                # At this point, the client should have died. The assertions
                # at the end of this test expect `False` here when endpoint
                # error handling is enabled and a truthy value otherwise.
                ep_is_alive = ep._ep.is_alive()

                await ep.close()
            finally:
                # Always close the listener so the polling loop in `run`
                # terminates even if the handler raised.
                listener.close()

        listener = ucp.create_listener(
            server_node, endpoint_error_handling=endpoint_error_handling)
        client_queue.put(listener.port)
        # Poll until the handler has closed the listener.
        while not listener.closed():
            await asyncio.sleep(0.1)

    # Route logging output into an in-memory stream so it can be inspected.
    log_stream = StringIO()
    logging.basicConfig(stream=log_stream, level=logging.DEBUG)
    get_event_loop().run_until_complete(run())
    log = log_stream.getvalue()

    if endpoint_error_handling is True:
        assert ep_is_alive is False
    else:
        assert ep_is_alive
        # Without error handling the failure is expected to show up in the log.
        assert log.find("""UCXError('<[Send shutdown]""") != -1
예제 #9
0
def _test_from_worker_address_client(queue):
    """Client half of the variable-size worker-address exchange.

    The client connects to the server whose worker address arrives on
    ``queue``, sends the length of its own address followed by the address
    itself (both on tag 0), and then expects ``arange(10)`` on tag 1.
    """
    async def _client():
        local_address = ucp.get_worker_address()

        # The server publishes its worker address through the queue.
        server_address = queue.get()
        server_ep = await ucp.create_endpoint_from_worker_address(
            server_address)

        # Length first, then the address buffer, both on tag 0.
        length_msg = np.array(local_address.length, np.int64)
        await server_ep.send(length_msg, tag=0, force_tag=True)
        await server_ep.send(local_address, tag=0, force_tag=True)

        # The server's answer arrives on tag 1.
        reply = np.empty(10, dtype=np.int64)
        await server_ep.recv(reply, tag=1, force_tag=True)

        np.testing.assert_array_equal(reply, np.arange(10, dtype=np.int64))

    get_event_loop().run_until_complete(_client())
예제 #10
0
def _test_from_worker_address_server(queue):
    """Server half of the variable-size worker-address exchange.

    The server publishes its worker address on ``queue``, receives the
    client's address size and address on tag 0, connects back to the client
    and sends ``arange(10)`` on tag 1.
    """
    async def _server():
        # Hand our worker address to the client process.
        local_address = ucp.get_worker_address()
        queue.put(local_address)

        # The size arrives first so the address buffer can be allocated.
        size_buf = np.empty(1, dtype=np.int64)
        await ucp.recv(size_buf, tag=0)

        raw_address = bytearray(size_buf[0])
        await ucp.recv(raw_address, tag=0)
        client_address = ucp.get_ucx_address_from_buffer(raw_address)

        # Connect back to the client via its worker address.
        client_ep = await ucp.create_endpoint_from_worker_address(
            client_address)

        payload = np.arange(10, dtype=np.int64)
        await client_ep.send(payload, tag=1, force_tag=True)

    get_event_loop().run_until_complete(_server())
예제 #11
0
def client(env, port, func, verbose):
    """Receive frames from the server and send them straight back.

    ``env`` is merged into ``os.environ`` first. Then, ``ITERATIONS`` times, a
    new endpoint is obtained through ``get_ep("client", port)``; on each
    endpoint ``ITERATIONS`` messages are received and their frames are sent
    back before the endpoint is closed. With ``verbose`` set, the wall-clock
    time of every round is printed. ``func`` is not used here.
    """
    os.environ.update(env)
    # Queried once up front; the values are not used afterwards.
    _rx_before, _tx_before = total_nvlink_transfer()

    async def echo_frames():
        await asyncio.sleep(1)
        endpoint = await get_ep("client", port)

        for _ in range(ITERATIONS):
            # Device 0 memory usage is queried but only kept for debugging.
            device_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            _used_bytes = pynvml.nvmlDeviceGetMemoryInfo(device_handle).used

            frames, _message = await recv(endpoint)

            # Return the received frames to the server.
            await send(endpoint, frames)

        print("Shutting Down Client...")
        await endpoint.close()

    set_rmm()
    for round_no in range(ITERATIONS):
        print("ITER: ", round_no)
        started = time.time()
        get_event_loop().run_until_complete(echo_frames())
        if verbose:
            elapsed = time.time() - started
            print("Time take for interation %d: %ss" % (round_no, elapsed))

    print("FINISHED")
예제 #12
0
def server(env, port, func, verbose):
    """Serve a CUDA object to clients in a send/receive ping-pong, forever.

    ``env`` is merged into ``os.environ``. A listener is created on ``port``;
    for every connection the object produced by the cloudpickled ``func`` is
    serialized and then, ``ITERATIONS`` times, frames are sent and the
    client's reply is received (the reply's frames are what gets sent next).
    The connection handler does not close the listener, and the outer loop
    restarts the serving coroutine whenever it returns. ``verbose`` is unused.
    """
    os.environ.update(env)

    async def serve(listener_port):
        set_rmm()

        # Invoked by the listener whenever a client connects.
        async def on_connect(ep):
            print("CREATING CUDA OBJECT IN SERVER...")
            build_object = cloudpickle.loads(func)
            cuda_obj = build_object()
            msg = {"data": to_serialize(cuda_obj)}
            frames = await to_frames(
                msg, serializers=("cuda", "dask", "pickle"))

            for step in range(ITERATIONS):
                print("ITER: ", step)
                await send(ep, frames)

                # Whatever comes back is sent out again on the next step.
                frames, msg = await recv(ep)

            print("CONFIRM RECEIPT")
            await ep.close()

            # Drop the last message and its frames explicitly.
            del msg
            del frames

        listener = ucp.create_listener(on_connect, port=listener_port)
        try:
            while not listener.closed():
                await asyncio.sleep(0.1)
        except ucp.UCXCloseError:
            pass

    event_loop = get_event_loop()
    while True:
        event_loop.run_until_complete(serve(port))
예제 #13
0
def server(env, port, func, enable_rmm, num_workers, proc_conn):
    """Serve pre-built frames to many short-lived client connections.

    Steps:

    1. merge ``env`` into ``os.environ`` and touch the current CUDA context;
    2. build the CUDA object from the cloudpickled ``func`` and serialize it
       into frames exactly once;
    3. create a listener on ``port`` and report ``"initialized"`` through
       ``proc_conn`` (which is then closed);
    4. send the frames ``TRANSFER_ITERATIONS`` times on every new connection;
    5. close the listener once ``num_workers * EP_ITERATIONS`` connections
       have finished.

    ``enable_rmm`` controls whether ``set_rmm()`` is called before listening.
    """
    os.environ.update(env)

    import numba.cuda

    numba.cuda.current_context()

    loop = get_event_loop()

    # Creates frames only once to prevent filling the entire GPU
    print("CREATING CUDA OBJECT IN SERVER...")
    cuda_obj_generator = cloudpickle.loads(func)
    cuda_obj = cuda_obj_generator()
    msg = {"data": to_serialize(cuda_obj)}
    frames = loop.run_until_complete(
        to_frames(msg, serializers=("cuda", "dask", "pickle")))

    async def f(listener_port, frames):
        if enable_rmm:
            set_rmm()

        # Module-level state shared with the `write` connection callback.
        global _frames
        global _connected
        global _disconnected
        global _lock
        _connected = 0
        _disconnected = 0
        _lock = threading.Lock()
        _frames = frames

        # Invoked by the listener whenever a client connects.
        async def write(ep):
            global _connected
            global _disconnected

            # `with` releases the lock even if the guarded statement raises.
            with _lock:
                _connected += 1

            for i in range(TRANSFER_ITERATIONS):
                print("ITER: ", i)
                # Send meta data
                await send(ep, _frames)

            print("CONFIRM RECEIPT")
            await ep.close()

            with _lock:
                _disconnected += 1

        lf = ucp.create_listener(write, port=listener_port)
        # Tell the parent process that the listener is ready.
        proc_conn.send("initialized")
        proc_conn.close()

        try:
            # Wait until every expected connection has been served.
            while _disconnected < num_workers * EP_ITERATIONS:
                await asyncio.sleep(0.1)
            print("Closing listener")
            lf.close()
        except ucp.UCXCloseError:
            pass

    loop.run_until_complete(f(port, frames))
예제 #14
0
def server(queue, args):
    """Run the echo server of the roundtrip benchmark.

    Depending on ``args.object_type`` the array module is NumPy, CuPy, or
    CuPy backed by an RMM pool on ``args.server_dev``. A listener is created
    on ``args.port`` and its actual port is put on ``queue``. For the single
    connection handled, every one of ``args.n_iter + args.n_warmup_iter``
    messages is received and sent straight back, using active messages when
    ``args.enable_am`` is truthy and tag send/recv otherwise. The listener is
    closed after that connection has been served.
    """
    if args.server_cpu_affinity >= 0:
        os.sched_setaffinity(0, [args.server_cpu_affinity])

    if args.object_type == "numpy":
        import numpy as xp
    elif args.object_type == "cupy":
        import cupy as xp

        xp.cuda.runtime.setDevice(args.server_dev)
    else:
        import cupy as xp

        import rmm

        rmm.reinitialize(
            pool_allocator=True,
            managed_memory=False,
            initial_pool_size=args.rmm_init_pool_size,
            devices=[args.server_dev],
        )
        xp.cuda.runtime.setDevice(args.server_dev)
        xp.cuda.set_allocator(rmm.rmm_cupy_allocator)

    ucp.init()

    register_am_allocators(args)

    async def run():
        async def server_handler(ep):
            total_iters = args.n_iter + args.n_warmup_iter

            # Receive buffers are only needed for the tag API. The same
            # truthiness test is used in the loop below so that both places
            # always agree on which API is active.
            if not args.enable_am:
                if args.reuse_alloc:
                    # One buffer shared by every iteration.
                    shared = xp.zeros(args.n_bytes, dtype="u1")
                    msg_recv_list = [shared] * total_iters
                else:
                    msg_recv_list = [
                        xp.zeros(args.n_bytes, dtype="u1")
                        for _ in range(total_iters)
                    ]

                assert msg_recv_list[0].nbytes == args.n_bytes

            for i in range(total_iters):
                if args.enable_am:
                    recv = await ep.am_recv()
                    await ep.am_send(recv)
                else:
                    await ep.recv(msg_recv_list[i])
                    await ep.send(msg_recv_list[i])
            await ep.close()
            lf.close()

        lf = ucp.create_listener(server_handler, port=args.port)
        # Publish the port the listener actually bound to.
        queue.put(lf.port)

        while not lf.closed():
            await asyncio.sleep(0.5)

    loop = get_event_loop()
    loop.run_until_complete(run())
예제 #15
0
def client(queue, port, server_address, args):
    """Run the client side of the roundtrip benchmark and print a report.

    The client connects to ``server_address:port`` and performs
    ``args.n_iter + args.n_warmup_iter`` send-then-receive roundtrips of
    ``args.n_bytes`` bytes each, using active messages when ``args.enable_am``
    is truthy and tag send/recv otherwise. Roundtrips at index
    ``args.n_warmup_iter`` and later are timed; the timings are passed
    through ``queue`` and then summarised as bandwidth and latency figures.
    """
    if args.client_cpu_affinity >= 0:
        os.sched_setaffinity(0, [args.client_cpu_affinity])

    import numpy as np

    # Select the array module (and, for the RMM case, the allocator) used to
    # create the message buffers.
    if args.object_type == "numpy":
        import numpy as xp
    elif args.object_type == "cupy":
        import cupy as xp

        xp.cuda.runtime.setDevice(args.client_dev)
    else:
        import cupy as xp

        import rmm

        rmm.reinitialize(
            pool_allocator=True,
            managed_memory=False,
            initial_pool_size=args.rmm_init_pool_size,
            devices=[args.client_dev],
        )
        xp.cuda.runtime.setDevice(args.client_dev)
        xp.cuda.set_allocator(rmm.rmm_cupy_allocator)

    ucp.init()

    register_am_allocators(args)

    async def run():
        ep = await ucp.create_endpoint(server_address, port)

        if args.enable_am:
            # Active messages: one send buffer, no receive buffers needed.
            msg = xp.arange(args.n_bytes, dtype="u1")
        else:
            msg_send_list = []
            msg_recv_list = []
            if not args.reuse_alloc:
                # A fresh pair of buffers for every iteration.
                for i in range(args.n_iter + args.n_warmup_iter):
                    msg_send_list.append(xp.arange(args.n_bytes, dtype="u1"))
                    msg_recv_list.append(xp.zeros(args.n_bytes, dtype="u1"))
            else:
                # The same pair of buffers is reused by every iteration.
                t1 = xp.arange(args.n_bytes, dtype="u1")
                t2 = xp.zeros(args.n_bytes, dtype="u1")
                for i in range(args.n_iter + args.n_warmup_iter):
                    msg_send_list.append(t1)
                    msg_recv_list.append(t2)
            assert msg_send_list[0].nbytes == args.n_bytes
            assert msg_recv_list[0].nbytes == args.n_bytes

        if args.cuda_profile:
            xp.cuda.profiler.start()
        times = []
        for i in range(args.n_iter + args.n_warmup_iter):
            start = clock()
            if args.enable_am:
                await ep.am_send(msg)
                await ep.am_recv()
            else:
                await ep.send(msg_send_list[i])
                await ep.recv(msg_recv_list[i])
            stop = clock()
            # Warmup iterations are executed but not recorded.
            if i >= args.n_warmup_iter:
                times.append(stop - start)
        if args.cuda_profile:
            xp.cuda.profiler.stop()
        queue.put(times)

    loop = get_event_loop()
    loop.run_until_complete(run())

    times = queue.get()
    assert len(times) == args.n_iter
    # Each roundtrip moves n_bytes twice (send + receive), hence the factor
    # of 2 in the bandwidth figures and the division by 2 in the latencies.
    bw_avg = format_bytes(2 * args.n_iter * args.n_bytes / sum(times))
    bw_med = format_bytes(2 * args.n_bytes / np.median(times))
    lat_avg = int(sum(times) * 1e9 / (2 * args.n_iter))
    lat_med = int(np.median(times) * 1e9 / 2)

    # Report header: benchmark configuration.
    print("Roundtrip benchmark")
    print_separator(separator="=")
    print_key_value(key="Iterations", value=f"{args.n_iter}")
    print_key_value(key="Bytes", value=f"{format_bytes(args.n_bytes)}")
    print_key_value(key="Object type", value=f"{args.object_type}")
    print_key_value(key="Reuse allocation", value=f"{args.reuse_alloc}")
    print_key_value(key="Transfer API", value=f"{'AM' if args.enable_am else 'TAG'}")
    print_key_value(key="UCX_TLS", value=f"{ucp.get_config()['TLS']}")
    print_key_value(key="UCX_NET_DEVICES", value=f"{ucp.get_config()['NET_DEVICES']}")
    print_separator(separator="=")
    if args.object_type == "numpy":
        print_key_value(key="Device(s)", value="CPU-only")
        s_aff = (
            args.server_cpu_affinity
            if args.server_cpu_affinity >= 0
            else "affinity not set"
        )
        c_aff = (
            args.client_cpu_affinity
            if args.client_cpu_affinity >= 0
            else "affinity not set"
        )
        print_key_value(key="Server CPU", value=f"{s_aff}")
        print_key_value(key="Client CPU", value=f"{c_aff}")
    else:
        print_key_value(key="Device(s)", value=f"{args.server_dev}, {args.client_dev}")
    # Summary figures.
    print_separator(separator="=")
    print_key_value("Bandwidth (average)", value=f"{bw_avg}/s")
    print_key_value("Bandwidth (median)", value=f"{bw_med}/s")
    print_key_value("Latency (average)", value=f"{lat_avg} ns")
    print_key_value("Latency (median)", value=f"{lat_med} ns")
    if not args.no_detailed_report:
        # Per-iteration bandwidth and latency.
        print_separator(separator="=")
        print_key_value(key="Iterations", value="Bandwidth, Latency")
        print_separator(separator="-")
        for i, t in enumerate(times):
            ts = format_bytes(2 * args.n_bytes / t)
            lat = int(t * 1e9 / 2)
            print_key_value(key=i, value=f"{ts}/s, {lat}ns")
def _test_from_worker_address_error_client(q1, q2, error_type):
    """Client side of the worker-address error tests.

    The remote worker address is read from ``q1``. Depending on
    ``error_type`` the test expects one of the following:

    * ``"unreachable"``: a ``UCXError`` matching "Destination is unreachable"
      or "Endpoint timeout" while creating or progressing the endpoint;
    * an ``error_type`` matching ``timeout.*send``: a ``UCXError`` matching
      "Endpoint timeout" when sending after the remote side has reported
      ``"disconnected"``;
    * anything else: ``UCXCanceled`` from the pending receive after the
      remote side has reported ``"disconnected"``.

    ``q2`` is used to tell the remote side that the endpoint is ``"ready"``.
    """
    async def run():
        # Receive worker address from server via multiprocessing.Queue
        remote_address = ucp.get_ucx_address_from_buffer(q1.get())

        if error_type == "unreachable":
            with pytest.raises(
                    ucp.exceptions.UCXError,
                    match="Destination is unreachable|Endpoint timeout",
            ):
                # Here, two cases may happen:
                # 1. With TCP creating endpoint will immediately raise
                #    "Destination is unreachable"
                # 2. With rc/ud creating endpoint will succeed, but raise
                #    "Endpoint timeout" after UCX_UD_TIMEOUT seconds have passed.
                #    We need to keep progressing UCP until timeout is raised.
                ep = await ucp.create_endpoint_from_worker_address(
                    remote_address)

                start = time.monotonic()
                while not ep._ep.raise_on_error():
                    ucp.progress()

                    # Prevent hanging: give up after one second. Returning
                    # here leaves the `pytest.raises` block without an
                    # exception having been raised.
                    if time.monotonic() - start >= 1.0:
                        return
        else:
            # Create endpoint to remote worker, and:
            #
            # 1. For timeout_am_send/timeout_send:
            #    - inform remote worker that local endpoint is ready for remote
            #      shutdown;
            #    - wait for remote worker to shutdown and confirm;
            #    - attempt to send message.
            #
            # 2. For timeout_am_recv/timeout_recv:
            #    - schedule ep.recv;
            #    - inform remote worker that local endpoint is ready for remote
            #      shutdown;
            #    - wait for it to shutdown and confirm
            #    - wait for recv message.
            ep = await ucp.create_endpoint_from_worker_address(remote_address)

            if re.match("timeout.*send", error_type):
                q2.put("ready")

                # Blocks until the remote side reports that it has shut down.
                remote_disconnected = q1.get()
                assert remote_disconnected == "disconnected"

                with pytest.raises(ucp.exceptions.UCXError,
                                   match="Endpoint timeout"):
                    if error_type == "timeout_am_send":
                        await asyncio.wait_for(ep.am_send(np.zeros(10)),
                                               timeout=1.0)
                    else:
                        await asyncio.wait_for(ep.send(np.zeros(10),
                                                       tag=0,
                                                       force_tag=True),
                                               timeout=1.0)
            else:
                with pytest.raises(ucp.exceptions.UCXCanceled):
                    # NOTE(review): `asyncio.wait_for(...)` is only awaited at
                    # `await task` below; whether the receive is already
                    # posted before that point depends on how ep.recv /
                    # ep.am_recv are implemented -- confirm.
                    if error_type == "timeout_am_recv":
                        task = asyncio.wait_for(ep.am_recv(), timeout=3.0)
                    else:
                        msg = np.empty(10)
                        task = asyncio.wait_for(ep.recv(msg,
                                                        tag=0,
                                                        force_tag=True),
                                                timeout=3.0)

                    q2.put("ready")

                    # Blocks until the remote side reports that it has shut
                    # down.
                    remote_disconnected = q1.get()
                    assert remote_disconnected == "disconnected"

                    await task

    get_event_loop().run_until_complete(run())