Example #1
# Module-level imports assumed by this snippet (in the test file these examples
# were extracted from, they live at the top of the module):
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
import dask_cuda
from distributed import Client

# NOTE: this import path is an assumption; in recent dask_cuda releases the
# explicit-comms shuffle lives under dask_cuda.explicit_comms.dataframe.shuffle.
from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle


def _test_jit_unspill(protocol):
    """Shuffle a small cudf DataFrame with explicit comms while JIT-unspill is enabled."""
    import cudf
    from cudf.tests.utils import assert_eq  # in newer cudf releases: cudf.testing._utils

    # Pick the UCX transports (TLS) to use: TCP, sockcm and cuda_copy.
    dask.config.update(
        dask.config.global_config,
        {
            "ucx": {
                "TLS": "tcp,sockcm,cuda_copy",
            },
        },
        priority="new",
    )

    with dask_cuda.LocalCUDACluster(
            protocol=protocol,
            dashboard_address=None,
            n_workers=1,
            threads_per_worker=1,
            processes=True,
            jit_unspill=True,
            # A 1-byte device-memory limit forces every partition to spill,
            # which is what exercises the JIT-unspill machinery.
            device_memory_limit="1B",
    ) as cluster:
        with Client(cluster):
            np.random.seed(42)
            df = cudf.DataFrame.from_pandas(
                pd.DataFrame({"key": np.random.random(100)}))
            ddf = dd.from_pandas(df.copy(), npartitions=4)
            ddf = explicit_comms_shuffle(ddf, ["key"])

            # Check the values of `ddf` (ignoring the row order)
            expected = df.sort_values("key")
            got = ddf.compute().sort_values("key")
            assert_eq(got, expected)
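
The dask.config.update(...) call above writes the UCX settings straight into the
global configuration. An equivalent, arguably more idiomatic way to apply the same
setting (shown only as a sketch, reusing the key from the example) is dask.config.set
with a dotted key:

import dask

# Same UCX transport selection as in Example 1, applied via the public helper.
dask.config.set({"ucx.TLS": "tcp,sockcm,cuda_copy"})
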
Example #2
# Module-level imports assumed by this snippet:
import numpy as np
import pandas as pd
import pytest
import dask
import dask.dataframe as dd
from distributed import Client, LocalCluster

# NOTE: the dask_cuda import paths below are assumptions based on recent releases.
from dask_cuda.initialize import initialize
from dask_cuda.explicit_comms import comms
from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle
# `check_partitions` is a small helper defined elsewhere in the test module;
# a hypothetical sketch of it is shown after this example.


def _test_dataframe_shuffle(backend, protocol, n_workers):
    """Shuffle a DataFrame (pandas or cudf backend) across varying partition counts."""
    if backend == "cudf":
        cudf = pytest.importorskip("cudf")
        from cudf.testing._utils import assert_eq

        # UCX must be initialized with the TCP transport enabled for the cudf backend.
        initialize(enable_tcp_over_ucx=True)
    else:
        from dask.dataframe.utils import assert_eq

        # Apply the UCX settings (TCP and cuda_copy transports) through the dask config.
        dask.config.update(
            dask.config.global_config,
            {
                "ucx": {
                    "tcp": True,
                    "cuda_copy": True,
                },
            },
            priority="new",
        )

    with LocalCluster(
            protocol=protocol,
            dashboard_address=None,
            n_workers=n_workers,
            threads_per_worker=1,
            processes=True,
    ) as cluster:
        with Client(cluster) as client:
            # Worker addresses come from the keys of the worker-log mapping.
            all_workers = list(client.get_worker_logs().keys())
            # Make sure the explicit-comms subsystem is initialized before shuffling.
            comms.default_comms()
            np.random.seed(42)
            df = pd.DataFrame({"key": np.random.random(100)})
            if backend == "cudf":
                df = cudf.DataFrame.from_pandas(df)

            for input_nparts in range(1, 5):
                for output_nparts in range(1, 5):
                    ddf = dd.from_pandas(
                        df.copy(),
                        npartitions=input_nparts).persist(workers=all_workers)
                    ddf = explicit_comms_shuffle(
                        ddf, ["key"], npartitions=output_nparts).persist()

                    assert ddf.npartitions == output_nparts

                    # Check that each partition of `ddf` hashes to the same value
                    result = ddf.map_partitions(check_partitions,
                                                output_nparts).compute()
                    assert all(result.to_list())

                    # Check the values of `ddf` (ignoring the row order)
                    expected = df.sort_values("key")
                    got = ddf.compute().sort_values("key")
                    if backend == "cudf":
                        assert_eq(got, expected)
                    else:
                        pd.testing.assert_frame_equal(got, expected)
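
Example #2 calls a check_partitions helper that is defined elsewhere in the test
module and is not part of the extract. The sketch below is hypothetical: the name,
the pandas-only hashing and the use of the "key" column are assumptions, and the
real helper has to use the same partitioning function as the shuffle itself.

import pandas as pd


def check_partitions(df, npartitions):
    # Map every row of this partition onto an output partition by hashing the
    # shuffle key; after a correct shuffle all rows of a given partition should
    # land on one and the same output partition.
    target = pd.util.hash_pandas_object(df["key"], index=False) % npartitions
    return len(set(target)) <= 1
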
Example #3
# The same module-level imports as in Example 1 are assumed here (np, pd, dd,
# dask_cuda, Client and explicit_comms_shuffle).


def _test_jit_unspill(protocol):
    """Variant of Example 1 that enables TCP-over-UCX directly on the cluster."""
    import cudf
    # `assert_eq` is not imported anywhere in the original extract; the path below
    # is the one used in Example 2.
    from cudf.testing._utils import assert_eq

    with dask_cuda.LocalCUDACluster(
            protocol=protocol,
            dashboard_address=None,
            n_workers=1,
            threads_per_worker=1,
            jit_unspill=True,
            device_memory_limit="1B",
            enable_tcp_over_ucx=(protocol == "ucx"),
    ) as cluster:
        with Client(cluster):
            np.random.seed(42)
            df = cudf.DataFrame.from_pandas(
                pd.DataFrame({"key": np.random.random(100)}))
            ddf = dd.from_pandas(df.copy(), npartitions=4)
            ddf = explicit_comms_shuffle(ddf, ["key"])

            # Check the values of `ddf` (ignoring the row order)
            expected = df.sort_values("key")
            got = ddf.compute().sort_values("key")
            assert_eq(got, expected)
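
These _test_* helpers are plain functions rather than pytest test cases. A typical
way to drive them (a sketch under the assumption that the helper is importable and
that each run should be isolated in its own process, e.g. so UCX/CUDA state cannot
leak between cases) looks like this:

import multiprocessing as mp

import pytest


@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
def test_jit_unspill(protocol):
    # Run the helper in a fresh process; a non-zero exit code means it raised.
    p = mp.Process(target=_test_jit_unspill, args=(protocol,))
    p.start()
    p.join()
    assert p.exitcode == 0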