def _test_initialize_ucx_infiniband():
    kwargs = {"enable_infiniband": True, "net_devices": "ib0"}
    initialize(**kwargs)
    with LocalCluster(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        config={"ucx": get_ucx_config(**kwargs)},
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(numpy.arange(10000), chunks=(1000,))
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert "rc" in conf["TLS"]
                assert "tcp" in conf["TLS"]
                assert "sockcm" in conf["TLS"]
                assert "cuda_copy" in conf["TLS"]
                assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
                assert conf["NET_DEVICES"] == "ib0"
                return True

            assert client.run_on_scheduler(check_ucx_options) is True
            assert all(client.run(check_ucx_options).values())
def _test_initialize_ucx_all():
    initialize()
    with LocalCluster(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        config={"distributed.comm.ucx": get_ucx_config()},
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(numpy.arange(10000), chunks=(1000,))
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert conf["TLS"] == "all"
                assert all(
                    [
                        p in conf["SOCKADDR_TLS_PRIORITY"]
                        for p in ["rdmacm", "tcp", "sockcm"]
                    ]
                )
                return True

            assert client.run_on_scheduler(check_ucx_options) is True
            assert all(client.run(check_ucx_options).values())
def _test_initialize_ucx_nvlink():
    initialize(enable_nvlink=True)
    with LocalCluster(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(numpy.arange(10000), chunks=(1000,))
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert "cuda_ipc" in conf["TLS"]
                assert "tcp" in conf["TLS"]
                assert "sockcm" in conf["TLS"]
                assert "cuda_copy" in conf["TLS"]
                assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
                return True

            assert all(client.run(check_ucx_options).values())
async def test_dgx_ucx_infiniband_nvlink(params):
    ucp = pytest.importorskip("ucp")

    enable_tcp = params["enable_tcp"]
    enable_infiniband = params["enable_infiniband"]
    enable_nvlink = params["enable_nvlink"]

    initialize(
        create_cuda_context=True,
        enable_tcp_over_ucx=enable_tcp,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
    )

    async with DGX(
        interface="enp1s0f0",
        protocol="ucx",
        enable_tcp_over_ucx=enable_tcp,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            a = rs.normal(
                10, 1, (int(1e4), int(1e4)), chunks=(int(2.5e3), int(2.5e3))
            )
            x = a + a.T
            res = await client.compute(x)
async def run():
    initialize(
        create_cuda_context=True,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
    )

    async with LocalCUDACluster(
        interface="enp1s0f0",
        protocol="ucx",
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            a = rs.normal(10, 1, (int(4e3), int(4e3)), chunks=(int(1e3), int(1e3)))
            x = a + a.T

            for i in range(100):
                print("Running iteration:", i)
                start = time.time()
                await client.compute(x)
                print("Time for iteration", i, ":", time.time() - start)
def main(
    address,
    enable_nvlink,
    enable_infiniband,
):
    enable_rdmacm = False
    ucx_net_devices = None
    if enable_infiniband:
        # enable_rdmacm = True  # RDMACM not working right now
        ucx_net_devices = "mlx5_0:1"

    # set up environment
    initialize(
        enable_tcp_over_ucx=True,
        enable_nvlink=enable_nvlink,
        enable_infiniband=enable_infiniband,
        enable_rdmacm=enable_rdmacm,
        net_devices=ucx_net_devices,
    )

    # initialize client
    client = Client(address)

    # user code here
    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    x = rs.random((10000, 10000), chunks=1000)
    x.sum().compute()

    # shutdown cluster
    client.shutdown()
def setup(dask_scheduler_file=None, rmm_pool_size=None):
    if dask_scheduler_file:
        cluster = None
        # Env var UCX_MAX_RNDV_RAILS=1 must be set too.
        initialize(
            enable_tcp_over_ucx=True,
            enable_nvlink=True,
            enable_infiniband=False,
            enable_rdmacm=False,
            # net_devices="mlx5_0:1",
        )
        client = Client(scheduler_file=dask_scheduler_file)
    else:
        tempdir_object = tempfile.TemporaryDirectory()
        cluster = LocalCUDACluster(
            local_directory=tempdir_object.name, rmm_pool_size=rmm_pool_size
        )
        client = Client(cluster)
        # add the obj to the client so it doesn't get deleted until
        # the 'client' obj gets cleaned up
        client.tempdir_object = tempdir_object
        client.wait_for_workers(len(get_visible_devices()))

    Comms.initialize(p2p=True)
    return (client, cluster)
async def run():
    initialize(
        create_cuda_context=True,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
    )

    async with LocalCUDACluster(
        interface="enp1s0f0",
        protocol="ucx",
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            d = dask_cudf.from_cudf(
                cudf.DataFrame({"a": range(2 ** 16)}), npartitions=2
            )
            r = d.sum()

            for i in range(100):
                print("Running iteration:", i)
                start = time.time()
                await client.compute(r)
                print("Time for iteration", i, ":", time.time() - start)
def _test_initialize_ucx_nvlink():
    kwargs = {"enable_nvlink": True}
    initialize(**kwargs)
    with LocalCluster(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(numpy.arange(10000), chunks=(1000,))
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert "cuda_ipc" in conf["TLS"]
                assert "tcp" in conf["TLS"]
                assert "cuda_copy" in conf["TLS"]
                if _ucx_110:
                    assert "tcp" in conf["SOCKADDR_TLS_PRIORITY"]
                else:
                    assert "sockcm" in conf["TLS"]
                    assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
                return True

            assert client.run_on_scheduler(check_ucx_options) is True
            assert all(client.run(check_ucx_options).values())
def _test_dataframe_shuffle(backend, protocol, n_workers):
    if backend == "cudf":
        cudf = pytest.importorskip("cudf")
        from cudf.testing._utils import assert_eq

        initialize(enable_tcp_over_ucx=True)
    else:
        from dask.dataframe.utils import assert_eq

        dask.config.update(
            dask.config.global_config,
            {
                "ucx": {
                    "tcp": True,
                    "cuda_copy": True,
                },
            },
            priority="new",
        )

    with LocalCluster(
        protocol=protocol,
        dashboard_address=None,
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    ) as cluster:
        with Client(cluster) as client:
            all_workers = list(client.get_worker_logs().keys())
            comms.default_comms()
            np.random.seed(42)
            df = pd.DataFrame({"key": np.random.random(100)})
            if backend == "cudf":
                df = cudf.DataFrame.from_pandas(df)

            for input_nparts in range(1, 5):
                for output_nparts in range(1, 5):
                    ddf = dd.from_pandas(df.copy(), npartitions=input_nparts).persist(
                        workers=all_workers
                    )
                    ddf = explicit_comms_shuffle(
                        ddf, ["key"], npartitions=output_nparts
                    ).persist()

                    assert ddf.npartitions == output_nparts

                    # Check that each partition of `ddf` hashes to the same value
                    result = ddf.map_partitions(
                        check_partitions, output_nparts
                    ).compute()
                    assert all(result.to_list())

                    # Check the values of `ddf` (ignoring the row order)
                    expected = df.sort_values("key")
                    got = ddf.compute().sort_values("key")
                    if backend == "cudf":
                        assert_eq(got, expected)
                    else:
                        pd.testing.assert_frame_equal(got, expected)
def _test_dataframe_shuffle_merge(backend, protocol, n_workers):
    if backend == "cudf":
        cudf = pytest.importorskip("cudf")
        from cudf.testing._utils import assert_eq

        initialize(enable_tcp_over_ucx=True)
    else:
        from dask.dataframe.utils import assert_eq

        dask.config.update(
            dask.config.global_config,
            {
                "ucx": {
                    "tcp": True,
                    "cuda_copy": True,
                },
            },
            priority="new",
        )

    with LocalCluster(
        protocol=protocol,
        dashboard_address=None,
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    ) as cluster:
        with Client(cluster):
            nrows = n_workers * 10

            # Let's make some dataframes that we can join on the "key" column
            df1 = pd.DataFrame({"key": np.arange(nrows), "payload1": np.arange(nrows)})
            key = np.arange(nrows)
            np.random.shuffle(key)
            df2 = pd.DataFrame(
                {"key": key[nrows // 3:], "payload2": np.arange(nrows)[nrows // 3:]}
            )
            expected = df1.merge(df2, on="key").set_index("key")

            if backend == "cudf":
                df1 = cudf.DataFrame.from_pandas(df1)
                df2 = cudf.DataFrame.from_pandas(df2)

            ddf1 = dd.from_pandas(df1, npartitions=n_workers + 1)
            ddf2 = dd.from_pandas(
                df2, npartitions=n_workers - 1 if n_workers > 1 else 1
            )
            with dask.config.set(explicit_comms=True):
                got = ddf1.merge(ddf2, on="key").set_index("key").compute()

            if backend == "cudf":
                assert_eq(got, expected)
            else:
                pd.testing.assert_frame_equal(got, expected)
async def test_ucx_protocol_type_error():
    pytest.importorskip("ucp")

    initialize(enable_tcp_over_ucx=True)

    with pytest.raises(TypeError):
        async with LocalCUDACluster(
            protocol="tcp", enable_tcp_over_ucx=True, asynchronous=True, data=dict
        ):
            pass
async def test_ucx_protocol(protocol):
    pytest.importorskip("ucp")

    initialize(enable_tcp_over_ucx=True)

    async with LocalCUDACluster(
        protocol=protocol, enable_tcp_over_ucx=True, asynchronous=True, data=dict
    ) as cluster:
        assert all(
            ws.address.startswith("ucx://")
            for ws in cluster.scheduler.workers.values()
        )
def ucx_cluster():
    initialize.initialize(
        create_cuda_context=True,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_nvlink=enable_nvlink,
        enable_infiniband=enable_infiniband,
    )
    cluster = LocalCUDACluster(
        protocol="ucx",
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_nvlink=enable_nvlink,
        enable_infiniband=enable_infiniband,
    )
    yield cluster
    cluster.close()
def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm):
    cupy = pytest.importorskip("cupy")

    net_devices = _get_dgx_net_devices()
    openfabrics_devices = [d.split(",")[0] for d in net_devices]

    ucx_net_devices = "auto" if enable_infiniband else None
    cm_protocol = "rdmacm" if enable_rdmacm else "sockcm"

    initialize(
        enable_tcp_over_ucx=True,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
    )

    with LocalCUDACluster(
        interface="ib0",
        enable_tcp_over_ucx=True,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
        ucx_net_devices=ucx_net_devices,
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(cupy.arange(10000), chunks=(1000,), asarray=False)
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert "tcp" in conf["TLS"]
                assert "cuda_copy" in conf["TLS"]
                assert cm_protocol in conf["TLS"]
                assert cm_protocol in conf["SOCKADDR_TLS_PRIORITY"]
                if enable_nvlink:
                    assert "cuda_ipc" in conf["TLS"]
                if enable_infiniband:
                    assert "rc" in conf["TLS"]
                return True

            if enable_infiniband:
                assert all(
                    [
                        cluster.worker_spec[k]["options"]["env"]["UCX_NET_DEVICES"]
                        == openfabrics_devices[k].split(",")[0]
                        for k in cluster.worker_spec.keys()
                    ]
                )

            assert all(client.run(check_ucx_options).values())
def test_initialize_ucx_tcp():
    ucp = pytest.importorskip("ucp")

    initialize(enable_tcp_over_ucx=True)

    conf = ucp.get_config()
    env = os.environ

    assert "TLS" in conf
    assert "UCX_TLS" in env
    assert "tcp" in conf["TLS"] and "tcp" in env["UCX_TLS"]
    assert "sockcm" in conf["TLS"] and "sockcm" in env["UCX_TLS"]
    assert "cuda_copy" in conf["TLS"] and "cuda_copy" in env["UCX_TLS"]
    assert (
        "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
        and "sockcm" in env["UCX_SOCKADDR_TLS_PRIORITY"]
    )
def test_initialize_ucx_infiniband():
    ucp = pytest.importorskip("ucp")

    initialize(enable_infiniband=True, net_devices="ib0")

    conf = ucp.get_config()
    env = os.environ

    assert "TLS" in conf
    assert "UCX_TLS" in env
    assert "rc" in conf["TLS"] and "rc" in env["UCX_TLS"]
    assert "tcp" in conf["TLS"] and "tcp" in env["UCX_TLS"]
    assert "sockcm" in conf["TLS"] and "sockcm" in env["UCX_TLS"]
    assert "cuda_copy" in conf["TLS"] and "cuda_copy" in env["UCX_TLS"]
    assert (
        "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
        and "sockcm" in env["UCX_SOCKADDR_TLS_PRIORITY"]
    )
    assert conf["NET_DEVICES"] == "ib0" and env["UCX_NET_DEVICES"] == "ib0"
def dask_client():
    dask_scheduler_file = os.environ.get("SCHEDULER_FILE")
    cluster = None
    client = None
    tempdir_object = None

    if dask_scheduler_file:
        # Env var UCX_MAX_RNDV_RAILS=1 must be set too.
        initialize(
            enable_tcp_over_ucx=True,
            enable_nvlink=True,
            enable_infiniband=True,
            enable_rdmacm=True,
            # net_devices="mlx5_0:1",
        )
        client = Client(scheduler_file=dask_scheduler_file)
        print("\ndask_client fixture: client created using "
              f"{dask_scheduler_file}")
    else:
        # The tempdir created by tempdir_object should be cleaned up once
        # tempdir_object goes out-of-scope and is deleted.
        tempdir_object = tempfile.TemporaryDirectory()
        cluster = LocalCUDACluster(local_directory=tempdir_object.name)
        client = Client(cluster)
        client.wait_for_workers(len(get_visible_devices()))
        print("\ndask_client fixture: client created using LocalCUDACluster")

    Comms.initialize(p2p=True)

    yield client

    Comms.destroy()
    # Shut down the connected scheduler and workers
    # therefore we will no longer rely on killing the dask cluster ID
    # for MNMG runs
    client.shutdown()
    if cluster:
        cluster.close()
    print("\ndask_client fixture: client.close() called")
def initialize_cluster(use_gpu=True, n_cpu=None, n_gpu=-1):
    enable_tcp_over_ucx = True
    enable_nvlink = True
    enable_infiniband = True

    logger.info('Starting dash cluster...')
    if use_gpu:
        initialize.initialize(create_cuda_context=True,
                              enable_tcp_over_ucx=enable_tcp_over_ucx,
                              enable_nvlink=enable_nvlink,
                              enable_infiniband=enable_infiniband)
        if n_gpu == -1:
            n_gpu = get_n_gpus()

        device_list = cuda_visible_devices(1, range(n_gpu)).split(',')
        CUDA_VISIBLE_DEVICES = []
        for device in device_list:
            try:
                CUDA_VISIBLE_DEVICES.append(int(device))
            except ValueError as vex:
                logger.warn(vex)

        logger.info('Using GPUs {} ...'.format(CUDA_VISIBLE_DEVICES))

        cluster = LocalCUDACluster(protocol="ucx",
                                   dashboard_address=':8787',
                                   CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
                                   enable_tcp_over_ucx=enable_tcp_over_ucx,
                                   enable_nvlink=enable_nvlink,
                                   enable_infiniband=enable_infiniband)
    else:
        logger.info('Using {} CPUs ...'.format(n_cpu))
        cluster = LocalCluster(dashboard_address=':8787',
                               n_workers=n_cpu,
                               threads_per_worker=4)

    client = Client(cluster)
    client.run(cupy.cuda.set_allocator)
    return client
def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm):
    cupy = pytest.importorskip("cupy")
    ucp = pytest.importorskip("ucp")

    net_devices = _get_dgx_net_devices()
    openfabrics_devices = [d.split(",")[0] for d in net_devices]

    ucx_net_devices = None
    if enable_infiniband is None and enable_nvlink is None and enable_rdmacm is None:
        if _ucx_110 is False:
            pytest.skip(
                "Specifying transports is required on UCX < 1.10",
                allow_module_level=True,
            )
        enable_tcp_over_ucx = None
        cm_tls = ["all"]
        cm_tls_priority = ["rdmacm", "tcp", "sockcm"]
    else:
        if enable_infiniband and not _ucx_110:
            ucx_net_devices = "auto"
        enable_tcp_over_ucx = True
        if _ucx_110 is True:
            cm_tls = ["tcp"]
            if enable_rdmacm is True:
                cm_tls_priority = ["rdmacm"]
            else:
                cm_tls_priority = ["tcp"]
        else:
            cm_tls = ["tcp"]
            if enable_rdmacm is True:
                cm_tls.append("rdmacm")
                cm_tls_priority = ["rdmacm"]
            else:
                cm_tls.append("sockcm")
                cm_tls_priority = ["sockcm"]

    initialize(
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
    )

    with LocalCUDACluster(
        interface="ib0",
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
        ucx_net_devices=ucx_net_devices,
        rmm_pool_size="1 GiB",
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(cupy.arange(10000), chunks=(1000,), asarray=False)
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert all(t in conf["TLS"] for t in cm_tls)
                assert all(p in conf["SOCKADDR_TLS_PRIORITY"] for p in cm_tls_priority)
                if cm_tls != ["all"]:
                    assert "tcp" in conf["TLS"]
                    assert "cuda_copy" in conf["TLS"]
                    if enable_nvlink:
                        assert "cuda_ipc" in conf["TLS"]
                    if enable_infiniband:
                        assert "rc" in conf["TLS"]
                return True

            if ucx_net_devices == "auto":
                assert all(
                    [
                        cluster.worker_spec[k]["options"]["env"]["UCX_NET_DEVICES"]
                        == openfabrics_devices[k].split(",")[0]
                        for k in cluster.worker_spec.keys()
                    ]
                )

            assert all(client.run(check_ucx_options).values())
def test_initialize_cuda_context():
    initialize(create_cuda_context=True)
def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm):
    loop = IOLoop.current()
    ucp = pytest.importorskip("ucp")

    cm_protocol = "rdmacm" if enable_rdmacm else "sockcm"
    net_devices = _get_dgx_net_devices()
    openfabrics_devices = [d.split(",")[0] for d in net_devices]

    sched_addr = "127.0.0.1"

    # Enable proper variables for scheduler
    sched_env = os.environ.copy()
    sched_env["DASK_UCX__INFINIBAND"] = "True"
    sched_env["DASK_UCX__TCP"] = "True"
    sched_env["DASK_UCX__CUDA_COPY"] = "True"
    sched_env["DASK_UCX__NET_DEVICES"] = openfabrics_devices[0]

    if enable_rdmacm:
        sched_env["DASK_UCX__RDMACM"] = "True"
        sched_addr = get_ip_interface("ib0")

    sched_url = "ucx://" + sched_addr + ":9379"

    # Enable proper variables for workers
    worker_ucx_opts = [
        "--enable-infiniband",
        "--net-devices",
        "auto",
    ]
    if enable_rdmacm:
        worker_ucx_opts.append("--enable-rdmacm")

    # Enable proper variables for client
    initialize(
        enable_tcp_over_ucx=True,
        enable_infiniband=True,
        enable_rdmacm=enable_rdmacm,
        net_devices=openfabrics_devices[0],
    )

    with subprocess.Popen(
        [
            "dask-scheduler",
            "--protocol",
            "ucx",
            "--host",
            sched_addr,
            "--port",
            "9379",
            "--no-dashboard",
        ],
        env=sched_env,
    ) as sched_proc:
        # Scheduler with UCX will take a few seconds to fully start
        sleep(5)

        with subprocess.Popen(
            [
                "dask-cuda-worker",
                sched_url,
                "--no-dashboard",
            ]
            + worker_ucx_opts
        ) as worker_proc:
            with Client(sched_url, loop=loop) as client:

                def _timeout_callback():
                    # We must ensure processes are terminated to avoid hangs
                    # if a timeout occurs
                    worker_proc.kill()
                    sched_proc.kill()

                assert wait_workers(client, timeout_callback=_timeout_callback)

                workers_tls = client.run(lambda: ucp.get_config()["TLS"])
                workers_tls_priority = client.run(
                    lambda: ucp.get_config()["SOCKADDR_TLS_PRIORITY"]
                )
                for tls, tls_priority in zip(
                    workers_tls.values(), workers_tls_priority.values()
                ):
                    assert cm_protocol in tls
                    assert cm_protocol in tls_priority

                worker_net_devices = client.run(
                    lambda: ucp.get_config()["NET_DEVICES"]
                )
                cuda_visible_devices = client.run(
                    lambda: os.environ["CUDA_VISIBLE_DEVICES"]
                )

                for i, v in enumerate(
                    zip(worker_net_devices.values(), cuda_visible_devices.values())
                ):
                    net_dev = v[0]
                    dev_idx = int(v[1].split(",")[0])
                    assert net_dev == openfabrics_devices[dev_idx]

                # A dask-worker with UCX protocol will not close until some work
                # is dispatched, therefore we kill the worker and scheduler to
                # ensure timely closing.
                worker_proc.kill()
                sched_proc.kill()
def main(args):
    # Set up workers on the local machine
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
        )
    else:
        enable_infiniband = args.enable_infiniband
        enable_nvlink = args.enable_nvlink
        enable_tcp_over_ucx = args.enable_tcp_over_ucx
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            ucx_net_devices="auto",
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
        )
        initialize(
            create_cuda_context=True,
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
        )

    client = Client(cluster)

    def _worker_setup(initial_pool_size=None):
        import rmm

        rmm.reinitialize(
            pool_allocator=not args.no_rmm_pool,
            devices=0,
            initial_pool_size=initial_pool_size,
        )
        cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)

    client.run(_worker_setup)
    # Create an RMM pool on the scheduler due to occasional deserialization
    # of CUDA objects. May cause issues with InfiniBand otherwise.
    client.run_on_scheduler(_worker_setup, 1e9)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, write_profile=None))
    took_list.append(
        run(client, args, write_profile=args.profile)
    )  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {
        (cluster.scheduler.workers[w1].name, cluster.scheduler.workers[w2].name): [
            "%s/s" % format_bytes(x) for x in numpy.quantile(v, [0.25, 0.50, 0.75])
        ]
        for (w1, w2), v in bandwidths.items()
    }
    total_nbytes = {
        (
            cluster.scheduler.workers[w1].name,
            cluster.scheduler.workers[w2].name,
        ): format_bytes(sum(nb))
        for (w1, w2), nb in total_nbytes.items()
    }

    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend | {args.backend}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devs}")
    print(f"rmm-pool | {(not args.no_rmm_pool)}")
    print(f"frac-match | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp | {args.enable_tcp_over_ucx}")
        print(f"ib | {args.enable_infiniband}")
        print(f"nvlink | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock | Throughput")
    print("-------------------------------")
    for data_processed, took in took_list:
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.backend == "dask":
        if args.markdown:
            print("<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```")
        print("(w1,w2) | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            print(
                "(%02d,%02d) | %s %s %s (%s)"
                % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)])
            )
        if args.markdown:
            print("```\n</details>\n")