예제 #1
0
def test_onehot_inverse_transform(client, drop):
    """Check that inverse_transform undoes fit_transform on a small frame.

    A three-row frame is split across two partitions, encoded, decoded
    again, and the decoded result is compared with the original frame.

    Parameters
    ----------
    client
        Fixture that is not referenced in the body; presumably it supplies
        the active Dask client -- confirm against the fixture definition.
    drop
        Forwarded unchanged as the encoder's ``drop`` argument.
    """
    expected = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    distributed = dask_cudf.from_cudf(expected, npartitions=2)

    encoder = OneHotEncoder(drop=drop)
    encoded = encoder.fit_transform(distributed)
    recovered = encoder.inverse_transform(encoded).compute()

    # Both sides are converted to pandas before the frame comparison.
    assert_frame_equal(recovered.to_pandas(), expected.to_pandas())
예제 #2
0
def test_onehot_inverse_transform_handle_unknown(client):
    """Check inverse_transform output when ``handle_unknown='ignore'``.

    The encoder is fitted on a two-row frame and then asked to decode a
    hand-written encoded array. The first encoded row sets neither of its
    first two columns, and the expected frame carries ``None`` for
    ``'chars'`` in that row.

    Parameters
    ----------
    client
        Fixture that is not referenced in the body; presumably it supplies
        the active Dask client -- confirm against the fixture definition.
    """
    training = dask_cudf.from_cudf(
        DataFrame({'chars': ['a', 'b'], 'int': [0, 2]}),
        npartitions=2,
    )
    encoded = da.from_array(
        cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]])
    )

    encoder = OneHotEncoder(handle_unknown='ignore').fit(training)
    decoded = encoder.inverse_transform(encoded)

    expected = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})
    assert_frame_equal(decoded.compute().to_pandas(), expected.to_pandas())
예제 #3
0
def test_onehot_drop_idx_first(client):
    """Compare ``drop='first'`` output with the reference encoder.

    The same two rows are given to the distributed encoder (as a
    partitioned frame) and to ``SkOneHotEncoder`` (as nested lists). The
    dense encodings must be equal, and decoding the distributed encoding
    must give back the input frame.

    Parameters
    ----------
    client
        Fixture that is not referenced in the body; presumably it supplies
        the active Dask client -- confirm against the fixture definition.
    """
    rows = [['c', 2, 'a'], ['b', 2, 'b']]
    frame = DataFrame({
        'chars': ['c', 'b'],
        'int': [2, 2],
        'letters': ['a', 'b'],
    })
    partitioned = dask_cudf.from_cudf(frame, npartitions=2)

    # Both encoders are configured identically.
    encoder_kwargs = {'sparse': False, 'drop': 'first'}
    encoder = OneHotEncoder(**encoder_kwargs)
    reference_encoder = SkOneHotEncoder(**encoder_kwargs)

    encoded = encoder.fit_transform(partitioned)
    expected = reference_encoder.fit_transform(rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    decoded = encoder.inverse_transform(encoded)
    assert_frame_equal(decoded.compute().to_pandas(), frame.to_pandas())
예제 #4
0
def test_onehot_drop_one_of_each(cluster):
    """Compare per-feature ``drop`` output with the reference encoder.

    One category is dropped from each column: the distributed encoder
    takes a ``{column: value}`` dict, the reference encoder takes the
    matching list. The dense encodings must be equal, and decoding the
    distributed encoding must give back the input frame.

    Parameters
    ----------
    cluster
        Fixture passed to ``Client``; this test opens and closes its own
        client on it.
    """
    # The handle is kept only so the client can be closed afterwards.
    # NOTE(review): presumably the new client becomes the default one
    # used by the Dask collections below -- confirm.
    client = Client(cluster)
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame({
            'chars': ['c', 'b'],
            'int': [2, 2],
            'letters': ['a', 'b'],
        })
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])
        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)
        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        # Close the client even when an assertion above fails, so it is
        # not left open after this test.
        client.close()
예제 #5
0
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    """Compare encodings of generated inputs with the reference encoder.

    Inputs come from ``generate_inputs_from_categories``. The distributed
    encoding must equal the reference encoding, and decoding it must match
    the distributed input according to ``assert_inverse_equal``.

    Parameters
    ----------
    client
        Fixture that is not referenced in the body; presumably it supplies
        the active Dask client -- confirm against the fixture definition.
    drop
        Forwarded unchanged as the ``drop`` argument of both encoders.
    as_array
        Passed to the input generator; when truthy the input is wrapped
        with ``da.from_array``, otherwise with ``dask_cudf.from_cudf``.
    sparse
        Forwarded as the ``sparse`` argument of both encoders; when truthy
        both results are densified with ``toarray()`` before comparison.
    n_samples
        Passed to the input generator as ``n_samples``.
    """
    X, ary = generate_inputs_from_categories(n_samples=n_samples,
                                             as_array=as_array)
    dX = (da.from_array(X) if as_array
          else dask_cudf.from_cudf(X, npartitions=1))

    # Both encoders are configured identically.
    encoder_kwargs = dict(sparse=sparse, drop=drop, categories='auto')
    encoder = OneHotEncoder(**encoder_kwargs)
    reference_encoder = SkOneHotEncoder(**encoder_kwargs)

    encoded = encoder.fit_transform(dX)
    expected = reference_encoder.fit_transform(ary)

    actual = encoded.compute()
    if sparse:
        actual = actual.toarray()
        expected = expected.toarray()
    cp.testing.assert_array_equal(actual, expected)

    decoded = encoder.inverse_transform(encoded)
    assert_inverse_equal(decoded.compute(), dX.compute())