def test_onehot_categories(client):
    """Encode with an explicit ``categories`` frame and compare to a fixed array.

    The input frame holds two rows and is split into two partitions. The
    categories frame lists three values for each of the two columns, and the
    hand-written expected result accordingly has six columns.

    ``client`` is not referenced in the body; presumably a fixture that
    provides the active Dask client -- confirm against conftest.
    """
    local_frame = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    dist_frame = dask_cudf.from_cudf(local_frame, npartitions=2)

    categories = DataFrame({'chars': ['a', 'b', 'c'], 'int': [0, 1, 2]})
    encoder = OneHotEncoder(categories=categories, sparse=False)

    expected = cp.array([
        [1., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 1.],
    ])
    encoded = encoder.fit_transform(dist_frame)

    cp.testing.assert_array_equal(encoded.compute(), expected)
def test_onehot_inverse_transform(client, drop):
    """Round-trip a small frame through ``fit_transform``/``inverse_transform``.

    The decoded result, converted to pandas, must equal the original frame
    converted to pandas.

    ``drop`` is forwarded unchanged to ``OneHotEncoder``; its values are
    supplied from outside this block (presumably a parametrization).
    ``client`` is not referenced in the body; presumably a fixture that
    provides the active Dask client -- confirm against conftest.
    """
    original = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    distributed = dask_cudf.from_cudf(original, npartitions=2)

    encoder = OneHotEncoder(drop=drop)
    encoded = encoder.fit_transform(distributed)
    decoded = encoder.inverse_transform(encoded)

    # Compare on the host side as pandas frames.
    decoded_pd = decoded.compute().to_pandas()
    assert_frame_equal(decoded_pd, original.to_pandas())
def test_onehot_vs_skonehot(client):
    """Compare the distributed encoder's dense output with ``SkOneHotEncoder``.

    The same data is given to both encoders: as a two-partition dask_cudf
    frame for ``OneHotEncoder`` and, via ``from_df_to_numpy``, in converted
    form for ``SkOneHotEncoder`` (presumably scikit-learn's encoder).

    ``client`` is not referenced in the body; presumably a fixture that
    provides the active Dask client -- confirm against conftest.
    """
    frame = DataFrame({'gender': ['Male', 'Female', 'Female'],
                       'int': [1, 3, 2]})
    # Build the reference input from the local frame before distributing it.
    reference_input = from_df_to_numpy(frame)
    dist_frame = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False)
    reference_encoder = SkOneHotEncoder(sparse=False)

    encoded = encoder.fit_transform(dist_frame)
    expected = reference_encoder.fit_transform(reference_input)

    cp.testing.assert_array_equal(encoded.compute(), expected)
def test_onehot_drop_idx_first(client):
    """Check ``drop='first'`` against ``SkOneHotEncoder`` and round-trip it.

    The encoded output must equal what ``SkOneHotEncoder`` (presumably
    scikit-learn's encoder) produces for the same rows given as nested lists,
    and ``inverse_transform`` must give back the original frame.

    ``client`` is not referenced in the body; presumably a fixture that
    provides the active Dask client -- confirm against conftest.
    """
    # Same two rows, once as nested lists and once as a column-wise frame.
    rows = [['c', 2, 'a'], ['b', 2, 'b']]
    frame = DataFrame({
        'chars': ['c', 'b'],
        'int': [2, 2],
        'letters': ['a', 'b'],
    })
    dist_frame = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False, drop='first')
    reference_encoder = SkOneHotEncoder(sparse=False, drop='first')

    encoded = encoder.fit_transform(dist_frame)
    expected = reference_encoder.fit_transform(rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    decoded = encoder.inverse_transform(encoded)
    assert_frame_equal(decoded.compute().to_pandas(), frame.to_pandas())
def test_onehot_drop_one_of_each(cluster):
    """Check a per-column ``drop`` mapping against ``SkOneHotEncoder``.

    One category is dropped from each column: ``'b'`` from ``chars``, ``2``
    from ``int`` and ``'b'`` from ``letters``. The distributed encoder takes
    the mapping as a dict keyed by column name, while ``SkOneHotEncoder``
    (presumably scikit-learn's encoder) takes the same values as a list.
    The encoded output must match, and ``inverse_transform`` must give back
    the original frame.

    Unlike the sibling tests, this one builds its own ``Client`` from
    ``cluster``. The client is closed in a ``finally`` clause so that a
    failing assertion does not leave it open for later tests.
    """
    client = Client(cluster)
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame({'chars': ['c', 'b'],
                       'int': [2, 2],
                       'letters': ['a', 'b']})
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])

        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)

        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        # Always release the client, even when an assertion above fails.
        client.close()
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    """Compare both encoders on inputs from ``generate_inputs_from_categories``.

    The helper returns a pair: the first item is wrapped into a Dask
    collection (``da.from_array`` when ``as_array`` is truthy, otherwise
    ``dask_cudf.from_cudf`` with a single partition) and the second is
    passed to ``SkOneHotEncoder`` (presumably scikit-learn's encoder).
    Both encoders get the same ``sparse``, ``drop`` and ``categories='auto'``
    settings. The encoded outputs must match, and ``inverse_transform`` must
    reproduce the computed input according to ``assert_inverse_equal``.

    ``client`` is not referenced in the body; presumably a fixture that
    provides the active Dask client -- confirm against conftest.
    """
    gpu_input, host_input = generate_inputs_from_categories(
        n_samples=n_samples, as_array=as_array)
    dist_input = (da.from_array(gpu_input) if as_array
                  else dask_cudf.from_cudf(gpu_input, npartitions=1))

    # Both encoders are configured identically.
    encoder_kwargs = dict(sparse=sparse, drop=drop, categories='auto')
    encoder = OneHotEncoder(**encoder_kwargs)
    reference_encoder = SkOneHotEncoder(**encoder_kwargs)

    encoded = encoder.fit_transform(dist_input)
    expected = reference_encoder.fit_transform(host_input)

    actual = encoded.compute()
    if sparse:
        # Densify both sides before the element-wise comparison.
        actual = actual.toarray()
        expected = expected.toarray()
    cp.testing.assert_array_equal(actual, expected)

    decoded = encoder.inverse_transform(encoded)
    assert_inverse_equal(decoded.compute(), dist_input.compute())