Exemplo n.º 1
0
    def test_error_handling(self):
        """Check how every encoder in ``encoders.__all__`` reacts to misuse.

        For each encoder three situations are exercised:

        * ``transform`` before ``fit`` must raise ``ValueError``;
        * ``transform`` on a frame restricted to its first three columns
          must raise ``ValueError``;
        * an encoder built with ``cols=[]`` must return the input unchanged.
        """
        excluded = ['unique_str', 'none']
        for encoder_name in encoders.__all__:
            with self.subTest(encoder_name=encoder_name):
                encoder_cls = getattr(encoders, encoder_name)

                # we exclude some columns
                train = tu.create_dataset(n_rows=100)
                train = train.drop(excluded, axis=1)
                test = tu.create_dataset(n_rows=50, extras=True)
                test = test.drop(excluded, axis=1)

                # illegal state, we have to first train the encoder...
                unfitted = encoder_cls()
                with self.assertRaises(ValueError):
                    unfitted.transform(train)

                # wrong count of attributes
                fitted = encoder_cls()
                fitted.fit(train, y)
                with self.assertRaises(ValueError):
                    fitted.transform(test.iloc[:, 0:3])

                # no cols
                passthrough = encoder_cls(cols=[])
                passthrough.fit(train, y)
                unchanged = passthrough.transform(test)
                self.assertTrue(unchanged.equals(test))
Exemplo n.º 2
0
    def test_one_hot(self):
        """Exercise ``OneHotEncoder`` option combinations and its inverse transform.

        Uses the module-level ``X`` (fit data) and ``X_t`` (transform data)
        for the option checks and freshly created datasets for the
        inverse-transform round trip.
        """
        # the number of output columns must not depend on the rows transformed
        encoder = encoders.OneHotEncoder(verbose=1, return_df=False)
        encoder.fit(X)
        full_width = encoder.transform(X_t).shape[1]
        filtered_width = encoder.transform(X_t[X_t['extra'] != 'A']).shape[1]
        self.assertEqual(full_width, filtered_width,
                         'We have to get the same count of columns')

        # impute_missing=True: an 'extra_-1' column is expected
        encoder = encoders.OneHotEncoder(
            verbose=1, return_df=True, impute_missing=True)
        encoder.fit(X)
        encoded = encoder.transform(X_t)
        self.assertIn('extra_-1', encoded.columns.values)

        # handle_unknown='ignore': exactly three 'extra_*' columns are expected
        encoder = encoders.OneHotEncoder(
            verbose=1, return_df=True, impute_missing=True,
            handle_unknown='ignore')
        encoder.fit(X)
        encoded = encoder.transform(X_t)
        extra_columns = [name for name in encoded.columns.values
                         if str(name).startswith('extra_')]
        self.assertEqual(len(extra_columns), 3)

        # handle_unknown='error': transforming X_t must raise
        encoder = encoders.OneHotEncoder(
            verbose=1, return_df=True, impute_missing=True,
            handle_unknown='error')
        encoder.fit(X)
        with self.assertRaises(ValueError):
            encoder.transform(X_t)

        # use_cat_names=True with handle_unknown='ignore'
        encoder = encoders.OneHotEncoder(
            verbose=1, return_df=True, handle_unknown='ignore',
            use_cat_names=True)
        encoder.fit(X)
        encoded = encoder.transform(X_t)
        self.assertIn('extra_A', encoded.columns.values)

        # use_cat_names=True with the default handle_unknown
        encoder = encoders.OneHotEncoder(
            verbose=1, return_df=True, use_cat_names=True)
        encoder.fit(X)
        encoded = encoder.transform(X_t)
        self.assertIn('extra_-1', encoded.columns.values)

        # test inverse_transform
        train = tu.create_dataset(n_rows=100, has_none=False)
        test = tu.create_dataset(n_rows=50, has_none=False)
        # this dataset is not referenced again in this test
        tu.create_dataset(n_rows=50, extras=True, has_none=False)
        selected = ['underscore', 'none', 'extra', 321]

        encoder = encoders.OneHotEncoder(
            verbose=1, use_cat_names=True, cols=selected)
        encoder.fit(train)
        restored = encoder.inverse_transform(encoder.transform(test))
        # numeric columns are incorrectly typed as object...
        restored[321] = restored[321].astype('int64')
        tu.verify_inverse_transform(test, restored)
    def test_one_hot(self):
        """Exercise ``OneHotEncoder`` option combinations and its inverse transform.

        Uses the module-level ``X`` and ``X_t`` for the option checks and
        freshly created datasets for the inverse-transform round trip. In
        this variant the ``handle_unknown='error'`` case expects the
        ``ValueError`` from ``fit`` rather than from ``transform``.
        """
        # the number of output columns must not depend on the rows transformed
        one_hot = encoders.OneHotEncoder(verbose=1, return_df=False)
        one_hot.fit(X)
        width_all_rows = one_hot.transform(X_t).shape[1]
        width_without_a = one_hot.transform(X_t[X_t['extra'] != 'A']).shape[1]
        self.assertEqual(width_all_rows, width_without_a,
                         'We have to get the same count of columns')

        # impute_missing=True: an 'extra_-1' column is expected
        one_hot = encoders.OneHotEncoder(
            verbose=1, return_df=True, impute_missing=True)
        one_hot.fit(X)
        result = one_hot.transform(X_t)
        self.assertIn('extra_-1', result.columns.values)

        # handle_unknown='ignore': exactly three 'extra_*' columns are expected
        one_hot = encoders.OneHotEncoder(
            verbose=1, return_df=True, impute_missing=True,
            handle_unknown='ignore')
        one_hot.fit(X)
        result = one_hot.transform(X_t)
        extra_names = [col for col in result.columns.values
                       if str(col).startswith('extra_')]
        self.assertEqual(len(extra_names), 3)

        one_hot = encoders.OneHotEncoder(
            verbose=1, return_df=True, impute_missing=True,
            handle_unknown='error')
        # The exception is already raised in fit() because transform() is
        # called there to get feature_names right.
        with self.assertRaises(ValueError):
            one_hot.fit(X_t)

        # use_cat_names=True with handle_unknown='ignore'
        one_hot = encoders.OneHotEncoder(
            verbose=1, return_df=True, handle_unknown='ignore',
            use_cat_names=True)
        one_hot.fit(X)
        result = one_hot.transform(X_t)
        self.assertIn('extra_A', result.columns.values)

        # use_cat_names=True with the default handle_unknown
        one_hot = encoders.OneHotEncoder(
            verbose=1, return_df=True, use_cat_names=True)
        one_hot.fit(X)
        result = one_hot.transform(X_t)
        self.assertIn('extra_-1', result.columns.values)

        # test inverse_transform
        fit_frame = tu.create_dataset(n_rows=100, has_none=False)
        eval_frame = tu.create_dataset(n_rows=50, has_none=False)
        # this dataset is not referenced again in this test
        tu.create_dataset(n_rows=50, extras=True, has_none=False)
        encoded_cols = ['underscore', 'none', 'extra', 321, 'categorical']

        one_hot = encoders.OneHotEncoder(
            verbose=1, use_cat_names=True, cols=encoded_cols)
        one_hot.fit(fit_frame)
        round_trip = one_hot.inverse_transform(one_hot.transform(eval_frame))
        tu.verify_inverse_transform(eval_frame, round_trip)
Exemplo n.º 4
0
    def test_handle_unknown_error(self):
        """Check that ``handle_unknown='error'`` makes ``transform`` raise.

        Every encoder in ``encoders.__all__`` except ``HashingEncoder`` is
        fitted on a dataset created without ``extras`` and must raise
        ``ValueError`` when transforming a dataset created with
        ``extras=True``.
        """
        # BaseN has problems with None -> ignore None
        X = tu.create_dataset(n_rows=100, has_none=False)
        X_t = tu.create_dataset(n_rows=50, extras=True, has_none=False)

        # HashingEncoder supports new values by design -> excluded.
        # The names are sorted so the sub-tests run in the same order on every
        # run; iterating the raw set would follow string-hash order, which
        # varies between interpreter runs.
        encoder_names = sorted(set(encoders.__all__) - {'HashingEncoder'})

        for encoder_name in encoder_names:
            with self.subTest(encoder_name=encoder_name):

                # new value during scoring
                enc = getattr(encoders, encoder_name)(handle_unknown='error')
                enc.fit(X, y)
                with self.assertRaises(ValueError):
                    enc.transform(X_t)
    def test_inverse_transform(self):
        """Round-trip data through several encoders and verify the result.

        ``BaseNEncoder``, ``BinaryEncoder``, ``OneHotEncoder`` and
        ``OrdinalEncoder`` are each fitted on one dataset; a second dataset
        is transformed, inverse-transformed and then checked with
        ``tu.verify_inverse_transform``.
        """
        # we do not allow None in these data (but "none" column without any None is ok)
        train = tu.create_dataset(n_rows=100, has_none=False)
        test = tu.create_dataset(n_rows=50, has_none=False)
        # this dataset is not referenced again in this test
        tu.create_dataset(n_rows=50, extras=True, has_none=False)
        selected = ['underscore', 'none', 'extra', 321, 'categorical']
        encoder_names = ['BaseNEncoder', 'BinaryEncoder', 'OneHotEncoder', 'OrdinalEncoder']

        for encoder_name in encoder_names:
            with self.subTest(encoder_name=encoder_name):

                # simple run
                encoder_cls = getattr(encoders, encoder_name)
                encoder = encoder_cls(verbose=1, cols=selected)
                encoder.fit(train)
                restored = encoder.inverse_transform(encoder.transform(test))
                tu.verify_inverse_transform(test, restored)
Exemplo n.º 6
0
import pandas as pd
from unittest2 import TestCase  # or `from unittest import ...` if on Python 3.4+
import category_encoders.tests.test_utils as tu
import numpy as np

import category_encoders as encoders

# Module-level fixtures, built once at import time.
# NOTE(review): tu.create_array / tu.create_dataset are project helpers that
# are not visible here; confirm there what ``extras=True`` adds to the data.
# Inputs from create_array: 100 rows, and 50 rows with extras=True.
np_X = tu.create_array(n_rows=100)
np_X_t = tu.create_array(n_rows=50, extras=True)
# Boolean targets, one per row of the matching input: True where a
# standard-normal sample is greater than 0.5.
np_y = np.random.randn(np_X.shape[0]) > 0.5
np_y_t = np.random.randn(np_X_t.shape[0]) > 0.5
# Inputs from create_dataset with the same row counts as above.
X = tu.create_dataset(n_rows=100)
X_t = tu.create_dataset(n_rows=50, extras=True)
# The boolean targets wrapped in pandas DataFrames.
y = pd.DataFrame(np_y)
y_t = pd.DataFrame(np_y_t)


class TestLeaveOneOutEncoder(TestCase):
    def test_leave_one_out(self):
        """Fit a randomized ``LeaveOneOutEncoder`` and check its outputs.

        The encoder is fitted on the module-level ``X``/``y``; ``X_t`` is
        then transformed first without and then with the target ``y_t``, and
        each result is passed to ``tu.verify_numeric``.
        """
        encoder = encoders.LeaveOneOutEncoder(verbose=1, randomized=True, sigma=0.1)
        encoder.fit(X, y)
        # same order as before: transform without a target, then with one
        for transform_args in ((X_t,), (X_t, y_t)):
            tu.verify_numeric(encoder.transform(*transform_args))

    def test_leave_one_out_values(self):
        df = pd.DataFrame({
            'color': ["a", "a", "a", "b", "b", "b"],
            'outcome': [1, 0, 0, 1, 0, 1]
        })