Exemplo n.º 1
0
def test_factorize_series_index():
    """Compare ``col1.factorize()`` with pandas, before and after re-indexing.

    Both elements of the factorize result (index 0 and index 1) are checked
    against the result obtained from the ``to_pandas()`` copy of the frame,
    first with the default index and then with ``col2`` set as the index.
    """

    def check_against_pandas(frame):
        # Factorize once on each side, then compare element 0 and element 1.
        gpu_result = frame.col1.factorize()
        cpu_result = frame.to_pandas().col1.factorize()
        assert_eq(gpu_result[0].get(), cpu_result[0])
        assert_eq(
            gpu_result[1].to_pandas().values,
            cpu_result[1].values,
        )

    labels = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
    keys = [
        2992443.0,
        2992447.0,
        2992466.0,
        2992440.0,
        2992441.0,
        2992442.0,
        2992444.0,
        2992445.0,
        2992446.0,
        2992448.0,
    ]

    df = DataFrame()
    df["col1"] = labels
    df["col2"] = keys
    check_against_pandas(df)

    # Same comparison with the float column used as the index.
    check_against_pandas(df.set_index("col2"))
Exemplo n.º 2
0
def test_onehot_inverse_transform(client, drop):
    """Encode a small frame and expect ``inverse_transform`` to restore it.

    ``drop`` is forwarded to ``OneHotEncoder``. ``client`` is not referenced
    in the body (presumably a fixture that provides the Dask client -
    confirm against the test module's fixtures).
    """
    source = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    encoder = OneHotEncoder(drop=drop)

    # Fit and encode the frame split across two partitions.
    encoded = encoder.fit_transform(dask_cudf.from_cudf(source, npartitions=2))

    restored = encoder.inverse_transform(encoded).compute().to_pandas()
    assert_frame_equal(restored, source.to_pandas())
Exemplo n.º 3
0
def test_dataframe_nlargest_nsmallest(nelem, n, op, columns):
    """Call the method named by ``op`` on both frames and compare results.

    Two columns of ``nelem`` seeded random floats are built, and
    ``op(n, columns)`` is invoked on the frame and on its ``to_pandas()``
    copy.
    """
    np.random.seed(0)
    # Dict entries are evaluated in order, so "a" is drawn before "b".
    columns_data = {
        "a": np.random.random(nelem),
        "b": np.random.random(nelem),
    }

    gdf = DataFrame(columns_data)
    pdf = gdf.to_pandas()

    got = getattr(gdf, op)(n, columns)
    expect = getattr(pdf, op)(n, columns)
    assert_eq(got, expect)
Exemplo n.º 4
0
def test_onehot_inverse_transform_handle_unknown(client):
    """Decode a hand-written one-hot array with ``handle_unknown='ignore'``.

    The encoder is fitted on a two-row frame, then asked to invert an array
    that was not produced by it. The expected frame has ``None`` in the
    first ``chars`` entry. ``client`` is not referenced in the body
    (presumably a fixture - confirm).
    """
    training = dask_cudf.from_cudf(
        DataFrame({'chars': ['a', 'b'], 'int': [0, 2]}),
        npartitions=2,
    )
    encoded = da.from_array(cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]]))

    encoder = OneHotEncoder(handle_unknown='ignore').fit(training)
    decoded = encoder.inverse_transform(encoded)

    expected = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})
    assert_frame_equal(decoded.compute().to_pandas(), expected.to_pandas())
Exemplo n.º 5
0
def test_onehot_drop_idx_first(client):
    """Check ``drop='first'`` against the reference encoder and round-trip.

    The dense output of ``OneHotEncoder`` is compared with the output of
    ``SkOneHotEncoder`` on the same rows, then ``inverse_transform`` must
    give back the input frame. ``client`` is not referenced in the body
    (presumably a fixture - confirm).
    """
    rows = [['c', 2, 'a'], ['b', 2, 'b']]
    frame = DataFrame(
        {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
    )
    partitioned = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False, drop='first')
    reference_encoder = SkOneHotEncoder(sparse=False, drop='first')

    encoded = encoder.fit_transform(partitioned)
    expected = reference_encoder.fit_transform(rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    restored = encoder.inverse_transform(encoded)
    assert_frame_equal(restored.compute().to_pandas(), frame.to_pandas())
Exemplo n.º 6
0
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """Sort by every column with ``ignore_index`` and compare with pandas.

    The frame is first re-indexed with ``set_index(index)``; the pandas
    side sorts the ``to_pandas()`` copy by the same columns.
    """
    gdf = DataFrame(
        {"a": [1, 3, 5, 2, 4], "b": [1, 1, 2, 2, 3], "c": [9, 7, 7, 7, 1]}
    ).set_index(index)
    pdf = gdf.to_pandas()

    got = gdf.sort_values(gdf.columns, ignore_index=ignore_index)
    expect = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index)

    assert_eq(expect, got)
Exemplo n.º 7
0
def test_categorical_basic(data):
    """Exercise categorical support on a partitioned series and frame.

    First part: a copy of ``data`` is wrapped in a pandas Series and in a
    ``Series`` split into two partitions; dtype, ``cat.ordered``,
    ``cat.categories``, codes and the leading part of the printed form are
    compared.

    Second part: a string column is cast to ``"category"`` in both a
    ``dgd`` frame and a ``dd`` frame; ``cat.categories`` is expected to
    raise ``NotImplementedError`` until ``categorize()`` has been called,
    after which categories and orderedness must agree.
    """
    cat = data.copy()
    pdsr = pd.Series(cat)
    sr = Series(cat)
    dsr = dgd.from_cudf(sr, npartitions=2)
    result = dsr.compute()
    np.testing.assert_array_equal(cat.codes, result.to_array())

    assert dsr.dtype.to_pandas() == pdsr.dtype
    # Test attributes
    assert pdsr.cat.ordered == dsr.cat.ordered

    assert tuple(pdsr.cat.categories) == tuple(dsr.cat.categories)

    np.testing.assert_array_equal(pdsr.cat.codes.data, result.to_array())
    np.testing.assert_array_equal(pdsr.cat.codes.dtype, dsr.cat.codes.dtype)

    string = str(result)
    expect_str = """
0 a
1 a
2 b
3 c
4 a
"""
    # Only the leading tokens are pinned, so any text printed after the rows
    # is ignored. Slicing instead of zip() means an empty or too-short repr
    # now fails rather than passing vacuously.
    expect_tokens = expect_str.split()
    assert string.split()[: len(expect_tokens)] == expect_tokens

    # Imported here as in the original test; from this point on the name
    # refers to cudf's comparison helper.
    from cudf.tests.utils import assert_eq

    df = DataFrame()
    df["a"] = ["xyz", "abc", "def"] * 10

    pdf = df.to_pandas()
    cddf = dgd.from_cudf(df, 1)
    cddf["b"] = cddf["a"].astype("category")

    ddf = dd.from_pandas(pdf, 1)
    ddf["b"] = ddf["a"].astype("category")

    assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"])

    # Before categorize(), asking for the categories is expected to raise
    # on both the cudf-backed and the pandas-backed collection.
    with pytest.raises(NotImplementedError):
        cddf["b"].cat.categories

    with pytest.raises(NotImplementedError):
        ddf["b"].cat.categories

    cddf = cddf.categorize()
    ddf = ddf.categorize()

    assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories)
    assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered)
Exemplo n.º 8
0
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    """Slice a frame whose columns carry random masks; compare with pandas.

    The same ``slice_start:slice_end`` slice is applied to the
    ``to_pandas()`` copy and to the frame itself; dtypes are not compared.
    """
    gdf = DataFrame()
    gdf["a"] = list(range(nelem))
    gdf["b"] = list(range(nelem, 2 * nelem))
    # Mask "a" first, then "b", so the bitmasks are generated in that order.
    for name in ("a", "b"):
        gdf[name] = gdf[name].set_mask(utils.random_bitmask(nelem))

    window = slice(slice_start, slice_end)

    expect = gdf.to_pandas()[window]
    got = gdf[window].to_pandas()

    assert_eq(expect, got, check_dtype=False)
Exemplo n.º 9
0
def test_str_slice():
    """Compare ``str.split(",", expand=True, n=...)`` for ``n`` of 1 and 2.

    The pandas result on the ``to_pandas()`` copy is the expectation; the
    single-partition ``dgd`` frame provides the value under test.
    """
    gdf = DataFrame({"a": ["abc,def,123", "xyz,hi,bye"]})

    ddf = dgd.from_cudf(gdf, 1)
    pdf = gdf.to_pandas()

    for max_splits in (1, 2):
        dd.assert_eq(
            pdf.a.str.split(",", expand=True, n=max_splits),
            ddf.a.str.split(",", expand=True, n=max_splits),
        )
Exemplo n.º 10
0
def test_onehot_drop_one_of_each(cluster):
    """Drop one named category per column and check encoding and round-trip.

    A ``Client`` is opened on ``cluster`` for the duration of the test. The
    dense output of ``OneHotEncoder`` (``drop`` given as a per-column dict)
    is compared with ``SkOneHotEncoder`` (``drop`` given as a list), then
    ``inverse_transform`` must give back the input frame.
    """
    client = Client(cluster)
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame(
            {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
        )
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])
        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)
        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        # Close the client even when an assertion above fails, so a failing
        # test does not leave a connection open on the shared cluster.
        client.close()
Exemplo n.º 11
0
def test_dataframe_take(ntake):
    """Take ``ntake`` random row positions and compare with pandas ``take``.

    Also asserts that neither resulting column reports any nulls.
    """
    np.random.seed(0)
    nelem = 123

    gdf = DataFrame()
    gdf["ii"] = np.random.randint(0, 20, nelem)
    gdf["ff"] = np.random.random(nelem)

    # Drawn after the column data so the seeded sequence is unchanged.
    positions = np.random.randint(0, len(gdf), ntake)

    got = gdf.take(positions)
    expect = gdf.to_pandas().take(positions)

    for name in ("ii", "ff"):
        assert getattr(got, name).null_count == 0
    assert_eq(got, expect)
Exemplo n.º 12
0
def test_categorical_categories():
    """Compare ``cat.categories`` of a two-partition frame with dask/pandas.

    Column ``a`` is cast to ``"category"`` before partitioning; the
    categories are converted to series on both sides and compared without
    checking the index.
    """
    letters = ["a", "b", "c", "d", "e", "e", "a", "d"]
    gdf = DataFrame({"a": letters, "b": range(8)})
    gdf["a"] = gdf["a"].astype("category")
    pdf = gdf.to_pandas(nullable_pd_dtype=False)

    ddf = dgd.from_cudf(gdf, 2)
    dpdf = dd.from_pandas(pdf, 2)

    got = ddf.a.cat.categories.to_series().to_pandas(nullable_pd_dtype=False)
    expect = dpdf.a.cat.categories.to_series()
    dd.assert_eq(got, expect, check_index=False)
Exemplo n.º 13
0
def test_dataframe_take_with_multiIndex(ntake):
    """Take ``ntake`` random row positions from a MultiIndex-ed frame.

    The frame has nine rows indexed by a two-level ``cudf.MultiIndex``; the
    result of ``take`` is compared with pandas ``take`` on the
    ``to_pandas()`` copy.
    """
    np.random.seed(0)

    outer_labels = ["lama", "cow", "falcon"]
    inner_labels = ["speed", "weight", "length"]
    row_index = cudf.MultiIndex(
        levels=[outer_labels, inner_labels],
        codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    gdf = DataFrame(index=row_index)

    nelem = 9
    gdf["ii"] = np.random.randint(0, 20, nelem)
    gdf["ff"] = np.random.random(nelem)

    # Drawn after the column data so the seeded sequence is unchanged.
    positions = np.random.randint(0, len(gdf), ntake)

    got = gdf.take(positions)
    expect = gdf.to_pandas().take(positions)

    assert_eq(got, expect)
Exemplo n.º 14
0
def test_to_pandas():
    """Check column names, dtypes and lengths after ``to_pandas()``.

    Columns ``a`` (int32) and ``b`` (float64) must keep their dtype. Column
    ``c`` is built from booleans plus a ``None`` and is expected to be
    ``np.bool_`` on the source frame but ``np.object_`` on the pandas copy.
    """
    gdf = DataFrame()
    gdf["a"] = np.arange(5, dtype=np.int32)
    gdf["b"] = np.arange(10, 15, dtype=np.float64)
    gdf["c"] = np.array([True, False, None, True, True])

    pdf = gdf.to_pandas()

    assert tuple(gdf.columns) == tuple(pdf.columns)

    for name in ("a", "b"):
        assert gdf[name].dtype == pdf[name].dtype

    # The two libraries disagree on the dtype of a boolean column that
    # contains None/NaN, so "c" is checked separately on each side.
    assert gdf["c"].dtype == np.bool_
    assert pdf["c"].dtype == np.object_

    for name in ("a", "b", "c"):
        assert len(gdf[name]) == len(pdf[name])