Example #1
def test_get_dummies_large():
    gdf = cudf.datasets.randomdata(
        nrows=200000,
        dtypes={
            "C": int,
            "first": "category",
            "b": float,
            "second": "category",
        },
    )
    df = gdf.to_pandas()
    ddf = dd.from_pandas(df, npartitions=25)
    dd.assert_eq(dd.get_dummies(ddf).compute(), pd.get_dummies(df))
    gddf = dask_cudf.from_cudf(gdf, npartitions=25)
    dd.assert_eq(
        dd.get_dummies(ddf).compute(),
        dd.get_dummies(gddf).compute(),
        check_dtype=False,
    )
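The snippets on this page omit their import headers. A minimal sketch of the imports they appear to rely on (aliases inferred from usage across the examples; not copied from any one test file):

import glob
import math
import os
import warnings

import numpy as np
import pandas as pd
import pytest

import cudf
import dask
import dask.dataframe as dd
import dask_cudf
import dask_cudf as dgd  # a few examples use this older alias for dask_cudf

# The NVTabular-based examples additionally rely on nvtabular (typically imported
# as nvt), its ops, Dataset, and Workflow names, plus the suite's own fixtures.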
Example #2
def test_from_cudf():
    np.random.seed(0)

    df = pd.DataFrame(
        {
            "x": np.random.randint(0, 5, size=10000),
            "y": np.random.normal(size=10000),
        }
    )

    gdf = cudf.DataFrame.from_pandas(df)

    # Test a simple round trip to/from dask
    ingested = dd.from_pandas(gdf, npartitions=2)
    dd.assert_eq(ingested, df)

    # Test conversion to dask.dataframe
    ddf = ingested.to_dask_dataframe()
    dd.assert_eq(ddf, df)
Example #3
def test_groupby_multiindex_reset_index(npartitions):
    df = cudf.DataFrame({
        "a": [1, 1, 2, 3, 4],
        "b": [5, 2, 1, 2, 5],
        "c": [1, 2, 2, 3, 5]
    })
    ddf = dask_cudf.from_cudf(df, npartitions=npartitions)
    pddf = dd.from_pandas(df.to_pandas(), npartitions=npartitions)
    gr = ddf.groupby(["a", "c"]).agg({"b": ["count"]}).reset_index()
    pr = pddf.groupby(["a", "c"]).agg({"b": ["count"]}).reset_index()

    # cuDF uses "int32" for count; pandas uses "int64"
    gr_out = gr.compute().sort_values(by=["a", "c"]).reset_index(drop=True)
    gr_out[("b", "count")] = gr_out[("b", "count")].astype("int64")

    dd.assert_eq(
        gr_out,
        pr.compute().sort_values(by=["a", "c"]).reset_index(drop=True),
    )
Example #4
def test_indexed_join(how):
    p_left = pd.DataFrame({"x": np.arange(10)}, index=np.arange(10) * 2)
    p_right = pd.DataFrame({"y": 1}, index=np.arange(15))

    g_left = cudf.from_pandas(p_left)
    g_right = cudf.from_pandas(p_right)

    dg_left = dd.from_pandas(g_left, npartitions=4)
    dg_right = dd.from_pandas(g_right, npartitions=5)

    d = g_left.merge(g_right, left_index=True, right_index=True, how=how)
    dg = dg_left.merge(dg_right, left_index=True, right_index=True, how=how)

    # Occasionally the order is not correct (possibly due to hashing in the merge)
    d = d.sort_values("x")  # index is preserved
    dg = dg.sort_values("x")  # index is reset -- sort_values will slow the test down

    dd.assert_eq(d, dg, check_index=False)
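test_indexed_join takes how as an argument supplied by pytest; the decorator is not shown on this page, but a plausible parametrization (an assumption, not the original) looks like:

import pytest

# Assumed parametrization over the supported join types.
@pytest.mark.parametrize("how", ["inner", "left"])
def test_indexed_join(how):
    ...

The same pattern applies to the other examples whose test functions accept arguments such as on, split_out, or npartitions.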
Example #5
def test_dataset_shuffle_on_keys(tmpdir, cpu, partition_on, keys, npartitions):

    # Initial timeseries dataset
    size = 60
    df1 = pd.DataFrame({
        "name": np.random.choice(["Dave", "Zelda"], size=size),
        "id": np.random.choice([0, 1], size=size),
        "x": np.random.uniform(low=0.0, high=10.0, size=size),
        "y": np.random.uniform(low=0.0, high=10.0, size=size),
    })
    ddf1 = dd.from_pandas(df1, npartitions=3)

    # Write the dataset to disk
    path = str(tmpdir)
    ddf1.to_parquet(path, partition_on=partition_on)

    # Construct NVT Dataset
    ds = nvt.Dataset(path, engine="parquet")

    # Shuffle the dataset by `keys`
    ds2 = ds.shuffle_by_keys(keys, npartitions=npartitions)

    # Inspect the result
    ddf2 = ds2.to_ddf()
    if npartitions:
        assert ddf2.npartitions == npartitions

    # A successful shuffle will return the same unique-value
    # count for both the full dask algorithm and a partition-wise sum
    n1 = sum([len(p[keys].drop_duplicates()) for p in ddf2.partitions])
    n2 = len(ddf2[keys].drop_duplicates())
    assert n1 == n2

    # Check that none of the rows was changed
    df1 = df1.sort_values(["id", "x", "y"]).reset_index(drop=True)
    df2 = ddf2.compute().sort_values(["id", "x", "y"]).reset_index(drop=True)
    if partition_on:
        # Dask will convert partitioned columns to Categorical
        df2["name"] = df2["name"].astype("object")
        df2["id"] = df2["id"].astype("int64")
    for col in df1:
        # Order of columns can change after round-trip partitioning
        assert_eq(df1[col], df2[col], check_index=False)
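The unique-count check above relies on a simple invariant: after shuffling on keys, every key combination lives in exactly one partition, so the partition-wise sum of unique-key counts equals the global unique-key count. A small, self-contained pandas illustration of that invariant (hypothetical data, not part of the test):

import pandas as pd

shuffled = [pd.DataFrame({"id": [0, 0]}), pd.DataFrame({"id": [1, 1]})]
unshuffled = [pd.DataFrame({"id": [0, 1]}), pd.DataFrame({"id": [0, 1]})]

def counts_match(parts):
    # Compare the partition-wise sum of unique rows with the global unique count.
    full = pd.concat(parts, ignore_index=True)
    return sum(len(p.drop_duplicates()) for p in parts) == len(full.drop_duplicates())

assert counts_match(shuffled)        # each key in one partition: 1 + 1 == 2
assert not counts_match(unshuffled)  # keys split across partitions: 2 + 2 != 2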
Example #6
def test_on(how, on):
    left = cudf.DataFrame(
        {"id_1": [1, 2, 3, 4, 5], "id_2": [1.0, 2.0, 3.0, 4.0, 0.0]}
    )
    right = cudf.DataFrame(
        {"id_1": [2, 3, None, 2], "id_2": [2.0, 3.0, 4.0, 20]}
    )

    dleft = dd.from_pandas(left, npartitions=2)
    dright = dd.from_pandas(right, npartitions=3)

    expected = left.merge(right, how=how, on=on)
    result = dleft.merge(dright, how=how, on=on)

    dd.assert_eq(
        result.compute().to_pandas().sort_values(on),
        expected.to_pandas().sort_values(on),
        check_index=False,
    )
Example #7
def test_groupby_agg(func):
    pdf = pd.DataFrame({
        "x": np.random.randint(0, 5, size=10000),
        "y": np.random.normal(size=10000),
    })

    gdf = cudf.DataFrame.from_pandas(pdf)

    ddf = dask_cudf.from_cudf(gdf, npartitions=5)

    a = func(gdf).to_pandas()
    b = func(ddf).compute().to_pandas()

    a.index.name = None
    a.name = None
    b.index.name = None
    b.name = None

    dd.assert_eq(a, b)
Example #8
def test_groupby_reset_index_names():
    df = cudf.datasets.randomdata(
        nrows=10, dtypes={"a": str, "b": int, "c": int}
    )
    pdf = df.to_pandas()

    gddf = dask_cudf.from_cudf(df, 2)
    pddf = dd.from_pandas(pdf, 2)

    g_res = gddf.groupby("a", sort=True).sum()
    p_res = pddf.groupby("a", sort=True).sum()

    got = g_res.reset_index().compute().sort_values(["a", "b", "c"])
    expect = p_res.reset_index().compute().sort_values(["a", "b", "c"])

    dd.assert_eq(got, expect)
Example #9
def test_groupby_split_out(split_out, column):
    df = pd.DataFrame({
        "a": np.arange(8),
        "b": [1, 0, 0, 2, 1, 1, 2, 0],
        "c": [0, 1] * 4,
        "d": ["dog", "cat", "cat", "dog", "dog", "dog", "cat", "bird"],
    })
    df["e"] = df["d"].astype("category")
    gdf = cudf.from_pandas(df)

    ddf = dd.from_pandas(df, npartitions=3)
    gddf = dask_cudf.from_cudf(gdf, npartitions=3)

    ddf_result = (
        ddf.groupby(column).a.mean(split_out=split_out).compute().sort_values().dropna()
    )
    gddf_result = (
        gddf.groupby(column).a.mean(split_out=split_out).compute().sort_values()
    )

    dd.assert_eq(gddf_result, ddf_result, check_index=False)
Example #10
def test_groupby_reset_index_string_name():
    df = cudf.DataFrame({"value": range(5), "key": ["a", "a", "b", "a", "c"]})
    pdf = df.to_pandas()

    gddf = dask_cudf.from_cudf(df, npartitions=1)
    pddf = dd.from_pandas(pdf, npartitions=1)

    g_res = gddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False)
    p_res = pddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False)

    got = g_res.compute().sort_values(["key", "value"]).reset_index(drop=True)
    expect = p_res.compute().sort_values(["key", "value"]).reset_index(drop=True)

    dd.assert_eq(got, expect)
    assert len(g_res) == len(p_res)
Example #11
def test_conditional_join_with_limit(c):
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    ddf = dd.from_pandas(df, 5)

    c.create_table("many_partitions", ddf)

    df = df.assign(common=1)
    expected_df = df.merge(df, on="common",
                           suffixes=("", "0")).drop(columns="common")
    expected_df = expected_df[expected_df["a"] >= 2][:4]

    actual_df = c.sql("""
    SELECT * FROM
        many_partitions as df1, many_partitions as df2
    WHERE
        df1."a" >= 2
    LIMIT 4
    """)

    dd.assert_eq(actual_df, expected_df, check_index=False)
Example #12
def test_groupby_nested_dict(func):
    pdf = pd.DataFrame(
        {
            "x": np.random.randint(0, 5, size=10000),
            "y": np.random.normal(size=10000),
        }
    )

    ddf = dd.from_pandas(pdf, npartitions=5)
    c_ddf = ddf.map_partitions(cudf.from_pandas)

    a = func(ddf).compute()
    b = func(c_ddf).compute().to_pandas()

    a.index.name = None
    a.name = None
    b.index.name = None
    b.name = None

    dd.assert_eq(a, b)
Example #13
def test_single_dataframe_merge(daskify):
    right = cudf.DataFrame({"x": [1, 2, 1, 2], "y": [1, 2, 3, 4]})
    left = cudf.DataFrame({"x": np.arange(100) % 10, "z": np.arange(100)})

    dleft = dd.from_pandas(left, npartitions=10)

    if daskify:
        dright = dd.from_pandas(right, npartitions=1)
    else:
        dright = right

    expected = left.merge(right, how="inner")
    result = dd.merge(dleft, dright, how="inner")
    assert len(result.dask) < 25

    dd.assert_eq(
        result.compute().to_pandas().sort_values(["z", "y"]),
        expected.to_pandas().sort_values(["z", "y"]),
        check_index=False,
    )
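The len(result.dask) < 25 assertion is a graph-size check: when the right side is a single partition (or a plain cudf frame), the merge should broadcast it to the ten left partitions instead of shuffling both sides, keeping the task graph roughly linear in the number of left partitions. A quick way to inspect this yourself (sketch; exact counts vary by version):

import numpy as np
import cudf
import dask.dataframe as dd

small = dd.from_pandas(cudf.DataFrame({"x": [1, 2], "y": [1, 2]}), npartitions=1)
big = dd.from_pandas(
    cudf.DataFrame({"x": np.arange(100) % 10, "z": np.arange(100)}), npartitions=10
)
joined = dd.merge(big, small, how="inner")
print(len(joined.dask))  # small graph, roughly proportional to big.npartitions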
Example #14
def test_merge_left(
    left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"
):
    chunksize = 3

    np.random.seed(0)

    # cuDF
    left = cudf.DataFrame(
        {
            "x": np.random.randint(0, left_nkeys, size=left_nrows),
            "y": np.random.randint(0, left_nkeys, size=left_nrows),
            "a": np.arange(left_nrows, dtype=np.float64),
        }
    )
    right = cudf.DataFrame(
        {
            "x": np.random.randint(0, right_nkeys, size=right_nrows),
            "y": np.random.randint(0, right_nkeys, size=right_nrows),
            "a": 1000 * np.arange(right_nrows, dtype=np.float64),
        }
    )

    expect = left.merge(right, on=("x", "y"), how=how)

    def normalize(df):
        return (
            df.to_pandas()
            .sort_values(["x", "y", "a_x", "a_y"])
            .reset_index(drop=True)
        )

    # dask_cudf
    left = dgd.from_cudf(left, chunksize=chunksize)
    right = dgd.from_cudf(right, chunksize=chunksize)

    result = left.merge(right, on=("x", "y"), how=how).compute(
        scheduler="single-threaded"
    )

    dd.assert_eq(normalize(expect), normalize(result))
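dgd.from_cudf is called here with chunksize rather than npartitions; assuming it mirrors dd.from_pandas (which accepts either keyword), the two are just alternative ways of controlling partitioning:

import numpy as np
import cudf
import dask_cudf as dgd

gdf = cudf.DataFrame({"x": np.arange(12)})
by_chunksize = dgd.from_cudf(gdf, chunksize=3)      # roughly 3 rows per partition
by_npartitions = dgd.from_cudf(gdf, npartitions=4)  # exactly 4 partitions requested
print(by_chunksize.npartitions, by_npartitions.npartitions)  # typically 4 and 4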
Example #15
def test_dask_dataset(datasets, engine, num_files, cpu):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    paths = paths[:num_files]
    if engine == "parquet":
        ddf0 = dask_cudf.read_parquet(paths)[mycols_pq]
        dataset = nvtabular.io.Dataset(paths, cpu=cpu)
        result = dataset.to_ddf(columns=mycols_pq)
    else:
        ddf0 = dask_cudf.read_csv(paths, header=None, names=allcols_csv)[mycols_csv]
        dataset = nvtabular.io.Dataset(paths, cpu=cpu, header=None, names=allcols_csv)
        result = dataset.to_ddf(columns=mycols_csv)

    # We do not preserve the index in NVTabular
    if engine == "parquet":
        assert_eq(ddf0, result, check_index=False)
    else:
        assert_eq(ddf0, result)

    # Check that the cpu kwarg is working correctly
    if cpu:
        assert isinstance(result.compute(), pd.DataFrame)

        # Should still work if we move to the GPU
        # (test behavior after repetitive conversion)
        dataset.to_gpu()
        dataset.to_cpu()
        dataset.to_cpu()
        dataset.to_gpu()
        result = dataset.to_ddf()
        assert isinstance(result.compute(), cudf.DataFrame)
    else:
        assert isinstance(result.compute(), cudf.DataFrame)

        # Should still work if we move to the CPU
        # (test behavior after repetitive conversion)
        dataset.to_cpu()
        dataset.to_gpu()
        dataset.to_gpu()
        dataset.to_cpu()
        result = dataset.to_ddf()
        assert isinstance(result.compute(), pd.DataFrame)
Example #16
def test_roundtrip_from_dask_partitioned(tmpdir, parts, daskcudf, metadata):
    tmpdir = str(tmpdir)

    df = pd.DataFrame()
    df["year"] = [2018, 2019, 2019, 2019, 2020, 2021]
    df["month"] = [1, 2, 3, 3, 3, 2]
    df["day"] = [1, 1, 1, 2, 2, 1]
    df["data"] = [0, 0, 0, 0, 0, 0]
    df.index.name = "index"
    if daskcudf:
        ddf2 = dask_cudf.from_cudf(cudf.from_pandas(df), npartitions=2)
        ddf2.to_parquet(tmpdir,
                        write_metadata_file=metadata,
                        partition_on=parts)
    else:
        ddf2 = dd.from_pandas(df, npartitions=2)
        ddf2.to_parquet(
            tmpdir,
            engine="pyarrow",
            write_metadata_file=metadata,
            partition_on=parts,
        )
    df_read = dd.read_parquet(tmpdir, engine="pyarrow")
    gdf_read = dask_cudf.read_parquet(tmpdir)

    # TODO: Avoid column selection after `CudfEngine`
    # can be aligned with dask/dask#6534
    columns = list(df_read.columns)
    assert set(df_read.columns) == set(gdf_read.columns)
    dd.assert_eq(
        df_read.compute(scheduler=dask.get)[columns],
        gdf_read.compute(scheduler=dask.get)[columns],
    )

    assert gdf_read.index.name == "index"

    # Check that we don't have uuid4 file names
    for _, _, files in os.walk(tmpdir):
        for fn in files:
            if not fn.startswith("_"):
                assert "part" in fn
Example #17
def test_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=n),
        "y": np.random.normal(size=n)
    })

    gdf = cudf.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    # Combine with .append
    head = frags[0]
    tail = frags[1:]

    appended = dd.from_pandas(head, npartitions=1)
    for each in tail:
        appended = appended.append(each)

    dd.assert_eq(df, appended)
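The _fragmented_gdf helper used above is not shown on this page; a plausible sketch of such a helper (an assumption, not the original implementation) simply splits the cudf DataFrame row-wise into nsplit contiguous pieces:

import numpy as np

def _fragmented_gdf(df, nsplit):
    # Hypothetical helper: cut `df` into `nsplit` contiguous row-wise fragments.
    bounds = np.linspace(0, len(df), nsplit + 1, dtype=int)
    return [df.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]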
Example #18
def test_make_meta_backends(index):

    dtypes = ["int8", "int32", "int64", "float64"]
    df = cudf.DataFrame(
        {dt: np.arange(start=0, stop=3, dtype=dt) for dt in dtypes}
    )
    df["strings"] = ["cat", "dog", "fish"]
    df["cats"] = df["strings"].astype("category")
    df["time_s"] = np.array(
        ["2018-10-07", "2018-10-08", "2018-10-09"], dtype="datetime64[s]"
    )
    df["time_ms"] = df["time_s"].astype("datetime64[ms]")
    df["time_ns"] = df["time_s"].astype("datetime64[ns]")
    df = df.set_index(index)
    ddf = dgd.from_cudf(df, npartitions=1)

    # Check "empty" metadata types
    dd.assert_eq(ddf._meta.dtypes, df.dtypes)

    # Check "non-empty" metadata types
    dd.assert_eq(ddf._meta.dtypes, ddf._meta_nonempty.dtypes)
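The two assertions exercise Dask's metadata machinery: _meta is a zero-row frame that records the schema, while _meta_nonempty is a small dummy frame with the same dtypes, used where empty metadata is not informative enough. A quick illustration (sketch only; these are internal attributes):

import cudf
import dask_cudf as dgd

ddf = dgd.from_cudf(cudf.DataFrame({"a": [1, 2, 3]}), npartitions=1)
print(len(ddf._meta))           # 0 rows -- schema only
print(len(ddf._meta_nonempty))  # a couple of dummy rows with matching dtypes
print(ddf._meta.dtypes.equals(ddf._meta_nonempty.dtypes))  # True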
Example #19
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_cols=cont_names,
        stats=["count", "sum", "std", "min"],
        out_path=str(tmpdir))

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name,
                        client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check results.  Need to sort for direct comparison
    expect = (
        df0.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    )
    got = (
        result.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    )
    gb_e = expect.groupby("name-cat").aggregate({
        "name-cat": "count",
        "x": ["sum", "min", "std"]
    })
    gb_e.columns = ["count", "sum", "min", "std"]
    df_check = got.merge(gb_e,
                         left_on="name-cat",
                         right_index=True,
                         how="left")
    assert_eq(df_check["name-cat_count"],
              df_check["count"].astype("int64"),
              check_names=False)
    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
    assert_eq(df_check["name-cat_x_std"], df_check["std"], check_names=False)
Example #20
def test_hash_object_dispatch(index):
    obj = cudf.DataFrame(
        {"x": ["a", "b", "c"], "y": [1, 2, 3], "z": [1, 1, 0]},
        index=[2, 4, 6],
    )

    # DataFrame
    result = dd.utils.hash_object_dispatch(obj, index=index)
    expected = dgd.backends.hash_object_cudf(obj, index=index)
    assert isinstance(result, cudf.Series)
    dd.assert_eq(result, expected)

    # Series
    result = dd.utils.hash_object_dispatch(obj["x"], index=index)
    expected = dgd.backends.hash_object_cudf(obj["x"], index=index)
    assert isinstance(result, cudf.Series)
    dd.assert_eq(result, expected)

    # DataFrame with MultiIndex
    obj_multi = obj.set_index(["x", "z"], drop=True)
    result = dd.utils.hash_object_dispatch(obj_multi, index=index)
    expected = dgd.backends.hash_object_cudf(obj_multi, index=index)
    assert isinstance(result, cudf.Series)
    dd.assert_eq(result, expected)
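hash_object_dispatch is one of Dask's backend-dispatch entry points; dask_cudf registers a cudf implementation (hash_object_cudf) for it, so calling the generic function on cudf objects routes to the cudf backend. That routing is exactly the equivalence the test asserts, e.g.:

import cudf
import dask.dataframe as dd
import dask_cudf as dgd

gdf = cudf.DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]})
generic = dd.utils.hash_object_dispatch(gdf, index=True)  # dispatched to the cudf backend
direct = dgd.backends.hash_object_cudf(gdf, index=True)
print(bool((generic == direct).all()))  # True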
Example #21
def test_dask_preproc_cpu(client, tmpdir, datasets, engine, shuffle, cpu):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_size="1MB", cpu=cpu)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_size="1MB", cpu=cpu)

    # Simple transform (normalize)
    cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    conts = cont_names >> ops.FillMissing() >> ops.Normalize()
    workflow = Workflow(conts + cat_names + label_name, client=client)
    transformed = workflow.fit_transform(dataset)

    # Write out dataset
    output_path = os.path.join(tmpdir, "processed")
    transformed.to_parquet(output_path=output_path,
                           shuffle=shuffle,
                           out_files_per_proc=4)

    # Check the final result
    df_disk = dd_read_parquet(output_path, engine="pyarrow").compute()
    assert_eq(
        df0.sort_values(["id", "x"])[["name-string", "label"]],
        df_disk.sort_values(["id", "x"])[["name-string", "label"]],
        check_index=False,
    )
Example #22
def test_groupby_agg_empty_partition(tmpdir, split_out):

    # Write random and empty cudf DataFrames
    # to two distinct files.
    df = cudf.datasets.randomdata()
    df.to_parquet(str(tmpdir.join("f0.parquet")))
    cudf.DataFrame(
        columns=["id", "x", "y"],
        dtype={"id": "int64", "x": "float64", "y": "float64"},
    ).to_parquet(str(tmpdir.join("f1.parquet")))

    # Read back our two partitions as a single
    # dask_cudf DataFrame (one partition is now empty)
    ddf = dask_cudf.read_parquet(str(tmpdir))
    gb = ddf.groupby(["id"]).agg({"x": ["sum"]}, split_out=split_out)

    expect = df.groupby(["id"]).agg({"x": ["sum"]}).sort_index()
    dd.assert_eq(gb.compute().sort_index(), expect)
Example #23
def test_read_csv_compression(tmp_path):
    df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20)))
    df.to_csv(tmp_path / "data.csv.gz", index=False, compression="gzip")

    with pytest.warns(UserWarning) as w:
        df2 = dask_cudf.read_csv(
            tmp_path / "*.csv.gz", chunksize="50 B", compression="gzip"
        )

    assert len(w) == 1
    msg = str(w[0].message)
    assert "gzip" in msg

    assert df2.npartitions == 1
    dd.assert_eq(df2, df, check_index=False)

    with warnings.catch_warnings(record=True) as record:
        df2 = dask_cudf.read_csv(
            tmp_path / "*.csv.gz", chunksize=None, compression="gzip"
        )

        assert not record
Example #24
def test_multifile_parquet(tmpdir, dataset, df, engine, num_io_threads, nfiles, shuffle):

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]
    columns = cat_names + cont_names + label_names
    workflow = nvt.Workflow(nvt.ColumnGroup(columns))

    outdir = str(tmpdir.mkdir("out"))
    transformed = workflow.transform(nvt.Dataset(df))
    transformed.to_parquet(
        output_path=outdir, num_threads=num_io_threads, shuffle=shuffle, out_files_per_proc=nfiles
    )

    # Check that our output data is exactly the same
    out_paths = glob.glob(os.path.join(outdir, "*.parquet"))
    df_check = cudf.read_parquet(out_paths)
    assert_eq(
        df_check[columns].sort_values(["x", "y"]),
        df[columns].sort_values(["x", "y"]),
        check_index=False,
    )
Example #25
def test_merge_1col_left(
    left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"
):
    chunksize = 3

    np.random.seed(0)

    # cuDF
    left = cudf.DataFrame(
        {
            "x": np.random.randint(0, left_nkeys, size=left_nrows),
            "a": np.arange(left_nrows, dtype=np.float64),
        }
    )
    right = cudf.DataFrame(
        {
            "x": np.random.randint(0, right_nkeys, size=right_nrows),
            "a": 1000 * np.arange(right_nrows, dtype=np.float64),
        }
    )

    expect = left.merge(right, on=["x"], how=how)
    expect = (
        expect.to_pandas()
        .sort_values(["x", "a_x", "a_y"])
        .reset_index(drop=True)
    )

    # dask_cudf
    left = dgd.from_cudf(left, chunksize=chunksize)
    right = dgd.from_cudf(right, chunksize=chunksize)

    joined = left.merge(right, on=["x"], how=how)

    got = joined.compute().to_pandas()

    got = got.sort_values(["x", "a_x", "a_y"]).reset_index(drop=True)

    dd.assert_eq(expect, got)
Example #26
def test_create_metadata_file_inconsistent_schema(tmpdir):

    # NOTE: This test demonstrates that the CudfEngine
    # can be used to generate a global `_metadata` file
    # even if there are inconsistent schemas in the dataset.

    # Write file 0
    df0 = pd.DataFrame({"a": [None] * 10, "b": range(10)})
    p0 = os.path.join(tmpdir, "part.0.parquet")
    df0.to_parquet(p0, engine="pyarrow")

    # Write file 1
    b = list(range(10))
    b[1] = None
    df1 = pd.DataFrame({"a": range(10), "b": b})
    p1 = os.path.join(tmpdir, "part.1.parquet")
    df1.to_parquet(p1, engine="pyarrow")

    # New pyarrow-dataset base can handle an inconsistent
    # schema (even without a _metadata file), but computing
    # and dtype validation may fail
    ddf1 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True)

    # Add global metadata file.
    # Dask-CuDF can do this without requiring schema
    # consistency.
    dask_cudf.io.parquet.create_metadata_file([p0, p1])

    # Check that we can still read the ddf
    # with the _metadata file present
    ddf2 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True)

    # Check that the result is the same with and
    # without the _metadata file.  Note that we must
    # call `compute` on `ddf1`, because the dtype of
    # the inconsistent column ("a") may be "object"
    # before computing, and "int" after
    dd.assert_eq(ddf1.compute(), ddf2)
    dd.assert_eq(ddf1.compute(), ddf2.compute())
Example #27
def test_groupby_basic_aggs(agg):
    pdf = pd.DataFrame({
        "x": np.random.randint(0, 5, size=10000),
        "y": np.random.normal(size=10000),
    })

    gdf = cudf.DataFrame.from_pandas(pdf)

    ddf = dask_cudf.from_cudf(gdf, npartitions=5)

    a = getattr(gdf.groupby("x"), agg)().to_pandas()
    b = getattr(ddf.groupby("x"), agg)().compute().to_pandas()

    a.index.name = None
    a.name = None
    b.index.name = None
    b.name = None

    if agg == "count":
        a["y"] = a["y"].astype(np.int64)

    dd.assert_eq(a, b)
Example #28
def test_row_groups_per_part(tmpdir, row_groups, index):
    nparts = 2
    df_size = 100
    row_group_size = 5
    file_row_groups = 10  # Known a priori
    npartitions_expected = math.ceil(file_row_groups / row_groups) * 2

    df = pd.DataFrame({
        "a": np.random.choice(["apple", "banana", "carrot"], size=df_size),
        "b": np.random.random(size=df_size),
        "c": np.random.randint(1, 5, size=df_size),
        "index": np.arange(0, df_size),
    })
    if index:
        df = df.set_index("index")

    ddf1 = dd.from_pandas(df, npartitions=nparts)
    ddf1.to_parquet(
        str(tmpdir),
        engine="pyarrow",
        row_group_size=row_group_size,
        write_metadata_file=True,
        write_index=index,
    )

    ddf2 = dask_cudf.read_parquet(
        str(tmpdir),
        row_groups_per_part=row_groups,
        index="index" if index else False,
    )

    dd.assert_eq(ddf1, ddf2, check_divisions=False, check_index=index)

    assert ddf2.npartitions == npartitions_expected
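The expected partition count follows from the file layout: 100 rows written as 2 files with row_group_size=5 gives 10 row groups per file, and grouping them row_groups at a time yields ceil(10 / row_groups) * 2 partitions. A worked example, assuming row_groups=3 (the parametrized value is not shown on this page):

import math

file_row_groups = 10  # per file, as noted in the test
row_groups = 3        # assumed example value
print(math.ceil(file_row_groups / row_groups) * 2)  # 8 expected partitions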
Example #29
def test_reset_index_multiindex():
    df = cudf.DataFrame()
    df["id_1"] = ["a", "a", "b"]
    df["id_2"] = [0, 0, 1]
    df["val"] = [1, 2, 3]

    df_lookup = cudf.DataFrame()
    df_lookup["id_1"] = ["a", "b"]
    df_lookup["metadata"] = [0, 1]

    gddf = dask_cudf.from_cudf(df, npartitions=2)
    gddf_lookup = dask_cudf.from_cudf(df_lookup, npartitions=2)

    ddf = dd.from_pandas(df.to_pandas(), npartitions=2)
    ddf_lookup = dd.from_pandas(df_lookup.to_pandas(), npartitions=2)

    # Note: 'id_2' has wrong type (object) until after compute
    dd.assert_eq(
        gddf.groupby(by=["id_1", "id_2"]).val.sum().reset_index().merge(
            gddf_lookup, on="id_1").compute(),
        ddf.groupby(by=["id_1", "id_2"]).val.sum().reset_index().merge(
            ddf_lookup, on="id_1"),
    )
Example #30
def test_create_metadata_file(tmpdir, partition_on):

    tmpdir = str(tmpdir)

    # Write ddf without a _metadata file
    df1 = cudf.DataFrame({"b": range(100), "a": ["A", "B", "C", "D"] * 25})
    df1.index.name = "myindex"
    ddf1 = dask_cudf.from_cudf(df1, npartitions=10)
    ddf1.to_parquet(
        tmpdir,
        write_metadata_file=False,
        partition_on=partition_on,
    )

    # Add global _metadata file
    if partition_on:
        fns = glob.glob(os.path.join(tmpdir, partition_on + "=*/*.parquet"))
    else:
        fns = glob.glob(os.path.join(tmpdir, "*.parquet"))
    dask_cudf.io.parquet.create_metadata_file(
        fns,
        split_every=3,  # Force tree reduction
    )

    # Check that we can now read the ddf
    # with the _metadata file present
    ddf2 = dask_cudf.read_parquet(
        tmpdir,
        gather_statistics=True,
        split_row_groups=False,
        index="myindex",
    )
    if partition_on:
        ddf1 = df1.sort_values("b")
        ddf2 = ddf2.compute().sort_values("b")
        ddf2.a = ddf2.a.astype("object")
    dd.assert_eq(ddf1, ddf2)