예제 #1
0
 def test_fit(self):
     """Fit RaslOneHotEncoder on every dataset and compare with scikit-learn.

     A reference pipeline (column projection followed by ``SkOneHotEncoder``)
     is fitted once on the ``"pandas"`` entry of ``self.tgt2creditg``.  The
     equivalent ``RaslOneHotEncoder`` pipeline is then fitted on the training
     features of each entry, and both trained pipelines are handed to
     ``self._check_last_trained`` together with the entry's key.
     """
     (pandas_train_X, _), (_, _) = self.tgt2creditg["pandas"]
     # presumably the names of the categorical columns -- confirm against
     # the ``categorical`` helper.
     categorical_cols = categorical()(pandas_train_X)
     projection = Map(columns={name: it[name] for name in categorical_cols})
     rasl_pipeline = projection >> RaslOneHotEncoder()
     reference_pipeline = projection >> SkOneHotEncoder()
     reference_trained = reference_pipeline.fit(pandas_train_X)
     for target, dataset in self.tgt2creditg.items():
         # Only the training features are needed for fitting the encoder.
         (train_X, _), (_, _) = dataset
         self._check_last_trained(
             reference_trained, rasl_pipeline.fit(train_X), target)
예제 #2
0
def test_onehot_vs_skonehot(client):
    """Dense one-hot output on a dask_cudf frame matches scikit-learn.

    The same three-row frame is encoded by ``OneHotEncoder`` (after being
    split into two partitions) and by ``SkOneHotEncoder`` (after conversion
    with ``from_df_to_numpy``); the computed results must be equal.
    ``client`` is not used in the body; it is only requested as an argument.
    """
    frame = DataFrame({
        'gender': ['Male', 'Female', 'Female'],
        'int': [1, 3, 2],
    })
    host_input = from_df_to_numpy(frame)
    partitioned = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False)
    reference_encoder = SkOneHotEncoder(sparse=False)

    encoded = encoder.fit_transform(partitioned)
    expected = reference_encoder.fit_transform(host_input)

    cp.testing.assert_array_equal(encoded.compute(), expected)
예제 #3
0
def test_onehot_drop_idx_first(client):
    """Check ``drop='first'`` on a dask_cudf frame against scikit-learn.

    Encodes a two-row frame with both encoders, asserts the dense outputs
    are equal, then verifies that ``inverse_transform`` gives back the
    original frame.  ``client`` is not used in the body; it is only
    requested as an argument.
    """
    rows = [['c', 2, 'a'], ['b', 2, 'b']]
    frame = DataFrame({
        'chars': ['c', 'b'],
        'int': [2, 2],
        'letters': ['a', 'b'],
    })
    partitioned = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False, drop='first')
    reference_encoder = SkOneHotEncoder(sparse=False, drop='first')
    encoded = encoder.fit_transform(partitioned)
    expected = reference_encoder.fit_transform(rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    # Round trip: decoding the encoded data must reproduce the input frame.
    restored = encoder.inverse_transform(encoded)
    assert_frame_equal(restored.compute().to_pandas(), frame.to_pandas())
예제 #4
0
def test_onehot_vs_skonehot(as_array):
    """Sparse one-hot output matches scikit-learn for frame or array input.

    When ``as_array`` is truthy the frame is converted with
    ``_from_df_to_cupy`` and the scikit-learn input is the ``cp.asnumpy``
    copy of that array; otherwise the frame itself is encoded and
    scikit-learn receives ``from_df_to_array(frame)``.
    """
    data = DataFrame({'gender': ['M', 'F', 'F'], 'int': [1, 3, 2]})
    host_data = from_df_to_array(data)
    if as_array:
        data = _from_df_to_cupy(data)
        host_data = cp.asnumpy(data)

    encoder = OneHotEncoder(sparse=True)
    reference_encoder = SkOneHotEncoder(sparse=True)

    encoded = encoder.fit_transform(data)
    expected = reference_encoder.fit_transform(host_data)

    # Both results are sparse; densify before comparing element-wise.
    cp.testing.assert_array_equal(encoded.toarray(), expected.toarray())
예제 #5
0
def test_onehot_drop_idx_first(as_array):
    """Check ``drop='first'`` against scikit-learn for frame or array input.

    The dense encodings from both encoders must be equal, and
    ``inverse_transform`` must reproduce the encoder's input (checked with
    ``assert_inverse_equal``).  With a truthy ``as_array`` the frame is
    first converted with ``_from_df_to_cupy`` and scikit-learn is given the
    ``cp.asnumpy`` copy of it.
    """
    host_rows = [['c', 2, 'a'], ['b', 2, 'b']]
    data = DataFrame({
        'chars': ['c', 'b'],
        'int': [2, 2],
        'letters': ['a', 'b'],
    })
    if as_array:
        data = _from_df_to_cupy(data)
        host_rows = cp.asnumpy(data)

    encoder = OneHotEncoder(sparse=False, drop='first', categories='auto')
    reference_encoder = SkOneHotEncoder(
        sparse=False, drop='first', categories='auto')
    encoded = encoder.fit_transform(data)
    expected = reference_encoder.fit_transform(host_rows)
    cp.testing.assert_array_equal(encoded, expected)

    restored = encoder.inverse_transform(encoded)
    assert_inverse_equal(restored, data)
예제 #6
0
def test_onehot_random_inputs(drop, sparse, n_samples, as_array):
    """Compare both encoders on inputs from generate_inputs_from_categories.

    ``generate_inputs_from_categories`` returns the input for
    ``OneHotEncoder`` and the matching input for ``SkOneHotEncoder``.  The
    encodings are compared (densified first when ``sparse`` is truthy) and
    the inverse transform is checked against the original input.
    """
    data, host_data = generate_inputs_from_categories(
        n_samples=n_samples, as_array=as_array)

    encoder = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    reference_encoder = SkOneHotEncoder(
        sparse=sparse, drop=drop, categories='auto')
    encoded = encoder.fit_transform(data)
    expected = reference_encoder.fit_transform(host_data)

    # Sparse results need to be densified before the element-wise check.
    dense_encoded = encoded.toarray() if sparse else encoded
    dense_expected = expected.toarray() if sparse else expected
    cp.testing.assert_array_equal(dense_encoded, dense_expected)

    restored = encoder.inverse_transform(encoded)
    assert_inverse_equal(restored, data)
예제 #7
0
def test_onehot_drop_one_of_each(cluster):
    """Check per-column ``drop`` on a dask_cudf frame against scikit-learn.

    One explicit category is dropped from each column: ``OneHotEncoder``
    receives it as a column-name -> value dict, ``SkOneHotEncoder`` as a
    positional list.  The dense encodings must be equal and
    ``inverse_transform`` must give back the original frame.

    A ``Client`` is opened on ``cluster`` for the duration of the test and
    is always closed, including when an assertion fails.
    """
    client = Client(cluster)
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame({
            'chars': ['c', 'b'],
            'int': [2, 2],
            'letters': ['a', 'b'],
        })
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])
        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)

        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        # Without this, a failing assertion above would skip the close and
        # leave the client open for the rest of the test session.
        client.close()
예제 #8
0
def test_onehot_sparse_drop(as_array):
    """Sparse encoding with per-column ``drop`` values matches scikit-learn.

    For frame input ``OneHotEncoder`` receives ``drop`` as a column-name ->
    value dict while scikit-learn gets the positional list.  With a truthy
    ``as_array`` the frame is converted with ``_from_df_to_cupy`` and both
    encoders receive the result of ``_convert_drop`` on the dict.
    """
    data = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2], 'l': [5, 5, 6]})
    drop_spec = {'g': 'F', 'i': 3, 'l': 6}

    host_data = from_df_to_array(data)
    reference_drop = ['F', 3, 6]
    if as_array:
        data = _from_df_to_cupy(data)
        host_data = cp.asnumpy(data)
        # Both encoders share the same converted drop specification here.
        drop_spec = reference_drop = _convert_drop(drop_spec)

    encoder = OneHotEncoder(sparse=True, drop=drop_spec, categories='auto')
    reference_encoder = SkOneHotEncoder(
        sparse=True, drop=reference_drop, categories='auto')
    encoded = encoder.fit_transform(data)
    expected = reference_encoder.fit_transform(host_data)
    cp.testing.assert_array_equal(encoded.toarray(), expected.toarray())
예제 #9
0
def test_onehot_drop_one_of_each(as_array):
    """Check per-column ``drop`` values against scikit-learn.

    For frame input ``OneHotEncoder`` receives ``drop`` as a column-name ->
    value dict while scikit-learn gets the positional list.  With a truthy
    ``as_array`` the frame is converted with ``_from_df_to_cupy`` and both
    encoders receive the result of ``_convert_drop`` on the dict.  The dense
    encodings must be equal and ``inverse_transform`` must reproduce the
    encoder's input (checked with ``assert_inverse_equal``).
    """
    X = DataFrame({'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']})
    drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
    X_ary = from_df_to_array(X)
    drop_ary = ['b', 2, 'b']
    if as_array:
        X = _from_df_to_cupy(X)
        X_ary = cp.asnumpy(X)
        drop = drop_ary = _convert_drop(drop)

    enc = OneHotEncoder(sparse=False, drop=drop, categories='auto')
    ohe = enc.fit_transform(X)
    ref = SkOneHotEncoder(sparse=False, drop=drop_ary,
                          categories='auto').fit_transform(X_ary)
    cp.testing.assert_array_equal(ohe, ref)
    inv = enc.inverse_transform(ohe)
    assert_inverse_equal(inv, X)
예제 #10
0
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    """Compare distributed and scikit-learn encoders on generated inputs.

    The input from ``generate_inputs_from_categories`` is wrapped with
    ``da.from_array`` when ``as_array`` is truthy, otherwise with
    ``dask_cudf.from_cudf`` using a single partition.  The computed
    encodings are compared (densified first when ``sparse`` is truthy) and
    the computed inverse transform is checked against the computed input.
    ``client`` is not used in the body; it is only requested as an argument.
    """
    data, host_data = generate_inputs_from_categories(
        n_samples=n_samples, as_array=as_array)
    distributed = (
        da.from_array(data)
        if as_array
        else dask_cudf.from_cudf(data, npartitions=1)
    )

    encoder = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    reference_encoder = SkOneHotEncoder(
        sparse=sparse, drop=drop, categories='auto')
    encoded = encoder.fit_transform(distributed)
    expected = reference_encoder.fit_transform(host_data)
    if not sparse:
        cp.testing.assert_array_equal(encoded.compute(), expected)
    else:
        cp.testing.assert_array_equal(
            encoded.compute().toarray(), expected.toarray())

    restored = encoder.inverse_transform(encoded)
    assert_inverse_equal(restored.compute(), distributed.compute())
예제 #11
0
 def test_predict(self):
     """Predictions through RaslOneHotEncoder must equal the sklearn ones.

     A reference pipeline (column projection, ``SkOneHotEncoder``,
     ``LogisticRegression``) is trained and evaluated on the ``"pandas"``
     entry of ``self.tgt2creditg``.  For each entry, the equivalent pipeline
     built on ``RaslOneHotEncoder`` is trained and its predictions are
     compared, by shape and by value, with the reference predictions.
     """
     (pd_train_X, pd_train_y), (pd_test_X, _) = self.tgt2creditg["pandas"]
     categorical_cols = categorical()(pd_train_X)
     projection = Map(columns={name: it[name] for name in categorical_cols})
     # Anything that is not already a pandas DataFrame is converted through
     # its ``toPandas()`` method before it reaches the classifier.
     to_pd = FunctionTransformer(
         func=lambda X: X if isinstance(X, pd.DataFrame) else X.toPandas())
     lr = LogisticRegression()
     reference_pipeline = projection >> SkOneHotEncoder(sparse=False) >> lr
     reference_trained = reference_pipeline.fit(pd_train_X, pd_train_y)
     expected = reference_trained.predict(pd_test_X)
     rasl_pipeline = (
         projection >> RaslOneHotEncoder(sparse=False) >> to_pd >> lr
     )
     for target, dataset in self.tgt2creditg.items():
         (train_X, train_y), (test_X, _) = dataset
         actual = rasl_pipeline.fit(train_X, train_y).predict(test_X)
         self.assertEqual(expected.shape, actual.shape, target)
         self.assertEqual(expected.tolist(), actual.tolist(), target)
예제 #12
0
 def test_transform(self):
     """Transformed output of RaslOneHotEncoder must equal scikit-learn's.

     A reference pipeline (column projection followed by ``SkOneHotEncoder``)
     is fitted on the ``"pandas"`` training features and applied to the
     ``"pandas"`` test features.  For each entry of ``self.tgt2creditg`` the
     ``RaslOneHotEncoder`` pipeline is fitted, checked with
     ``self._check_last_trained``, applied to that entry's test features and
     compared cell by cell with the reference output.
     """
     (pd_train_X, _), (pd_test_X, _) = self.tgt2creditg["pandas"]
     categorical_cols = categorical()(pd_train_X)
     projection = Map(columns={name: it[name] for name in categorical_cols})
     rasl_pipeline = projection >> RaslOneHotEncoder(sparse=False)
     reference_pipeline = projection >> SkOneHotEncoder(sparse=False)
     reference_trained = reference_pipeline.fit(pd_train_X)
     expected = reference_trained.transform(pd_test_X)
     for target, dataset in self.tgt2creditg.items():
         (train_X, _), (test_X, _) = dataset
         rasl_trained = rasl_pipeline.fit(train_X)
         self._check_last_trained(reference_trained, rasl_trained, target)
         actual = rasl_trained.transform(test_X)
         if target == "spark":
             # The spark result is converted so it can be indexed via .iloc.
             actual = actual.toPandas()
         self.assertEqual(expected.shape, actual.shape, target)
         row_count = expected.shape[0]
         col_count = expected.shape[1]
         for r in range(row_count):
             for c in range(col_count):
                 self.assertEqual(
                     expected[r, c], actual.iloc[r, c], (r, c, target))