示例#1
0
def test_onehot_categories(client):
    """Check one-hot encoding when the categories are supplied explicitly.

    The categories frame lists three values per column while the input only
    contains two of them, so the expected output has three columns per
    feature (six in total).

    ``client`` is not referenced in the body; presumably a pytest fixture
    that provides the Dask client -- confirm against the conftest.
    """
    categories = DataFrame({'chars': ['a', 'b', 'c'], 'int': [0, 1, 2]})
    encoder = OneHotEncoder(categories=categories, sparse=False)

    frame = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    dist_frame = dask_cudf.from_cudf(frame, npartitions=2)
    encoded = encoder.fit_transform(dist_frame)

    expected = cp.array([
        [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
    ])
    cp.testing.assert_array_equal(encoded.compute(), expected)
示例#2
0
def test_onehot_inverse_transform(client, drop):
    """Round-trip a small frame through encoding and decoding.

    The frame is encoded with ``fit_transform`` and decoded again with
    ``inverse_transform``; the decoded result must equal the original frame.

    ``drop`` is forwarded unchanged to ``OneHotEncoder``; its values are
    presumably supplied by a parametrize decorator outside this view.
    """
    original = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    dist_frame = dask_cudf.from_cudf(original, npartitions=2)

    encoder = OneHotEncoder(drop=drop)
    encoded = encoder.fit_transform(dist_frame)
    recovered = encoder.inverse_transform(encoded).compute()

    assert_frame_equal(recovered.to_pandas(), original.to_pandas())
示例#3
0
def test_onehot_vs_skonehot(client):
    """Compare the distributed encoder's dense output with SkOneHotEncoder.

    Both encoders are fitted on the same three-row frame; the reference
    encoder receives the data converted by ``from_df_to_numpy``.
    """
    frame = DataFrame(
        {'gender': ['Male', 'Female', 'Female'], 'int': [1, 3, 2]}
    )
    host_data = from_df_to_numpy(frame)
    dist_frame = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False)
    reference_encoder = SkOneHotEncoder(sparse=False)

    encoded = encoder.fit_transform(dist_frame)
    expected = reference_encoder.fit_transform(host_data)

    cp.testing.assert_array_equal(encoded.compute(), expected)
示例#4
0
def test_onehot_drop_idx_first(client):
    """Check ``drop='first'`` against SkOneHotEncoder and round-trip it.

    The encoded output must match the reference encoder's output, and
    ``inverse_transform`` must reproduce the original frame.
    """
    frame = DataFrame(
        {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
    )
    # Row-wise copy of the same data for the reference encoder.
    rows = [['c', 2, 'a'], ['b', 2, 'b']]
    dist_frame = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False, drop='first')
    reference_encoder = SkOneHotEncoder(sparse=False, drop='first')

    encoded = encoder.fit_transform(dist_frame)
    expected = reference_encoder.fit_transform(rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    recovered = encoder.inverse_transform(encoded).compute()
    assert_frame_equal(recovered.to_pandas(), frame.to_pandas())
示例#5
0
def test_onehot_drop_one_of_each(cluster):
    """Check dropping one explicit category per feature, then round-trip.

    The encoder under test receives ``drop`` as a mapping from column name
    to the category to drop; the SkOneHotEncoder reference receives the same
    values as a positional list. The encoded output must match the
    reference, and ``inverse_transform`` must reproduce the original frame.

    ``cluster`` is passed straight to ``Client``; presumably a pytest
    fixture providing a running Dask cluster -- confirm against the conftest.
    """
    client = Client(cluster)
    # Close the client even when a call or assertion below raises, so a
    # failing test does not leave an open client behind for later tests.
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame(
            {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
        )
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])
        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)
        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        client.close()
示例#6
0
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    """Compare the encoder with SkOneHotEncoder on generated inputs.

    ``generate_inputs_from_categories`` returns the input for the encoder
    under test and a second object that is fed to the reference encoder.
    The input is wrapped as a Dask array when ``as_array`` is truthy and as
    a dask_cudf frame otherwise. After the outputs are compared,
    ``inverse_transform`` is checked against the original input.

    ``drop``, ``as_array``, ``sparse`` and ``n_samples`` are presumably
    supplied by parametrize decorators outside this view.
    """
    X, ary = generate_inputs_from_categories(n_samples=n_samples,
                                             as_array=as_array)
    dX = (da.from_array(X) if as_array
          else dask_cudf.from_cudf(X, npartitions=1))

    encoder = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    reference_encoder = SkOneHotEncoder(sparse=sparse, drop=drop,
                                        categories='auto')
    encoded = encoder.fit_transform(dX)
    expected = reference_encoder.fit_transform(ary)

    # Sparse results are densified on both sides before the comparison.
    if sparse:
        actual, wanted = encoded.compute().toarray(), expected.toarray()
    else:
        actual, wanted = encoded.compute(), expected
    cp.testing.assert_array_equal(actual, wanted)

    recovered = encoder.inverse_transform(encoded)
    assert_inverse_equal(recovered.compute(), dX.compute())