def test__transform_unknown_nan(self):
    """Test the ``_transform`` with unknown and nans.

    This is an edge case for ``_transform`` where unknowns should be zeros
    and nans should be the last entry in the column.

    Setup:
        - Transformer whose only dummy is ``'a'`` and with ``dummy_na``
          set to ``True``.

    Input:
        - Series with unknown and nans

    Output:
        - one-hot encoding of the input
    """
    # Setup
    ohet = OneHotEncodingTransformer()
    ohet.dummies = ['a']
    ohet.dummy_na = True
    ohet.num_dummies = 1

    # Run
    out = ohet._transform(pd.Series(['b', 'b', np.nan]))

    # Assert
    # 'b' is not among the dummies, so only the trailing nan column is ever set.
    expected = np.array([
        [0, 0],
        [0, 0],
        [0, 1],
    ])
    np.testing.assert_array_equal(out, expected)
def test__transform_zeros_categorical(self):
    """Test the ``_transform`` with unknown category.

    The values passed to ``_transform`` should be returned in a one-hot
    encoding representation using the categorical branch where it should
    be a column of zeros.

    Setup:
        - Transformer whose only dummy is ``'a'`` and with
          ``dummy_encoded`` set to ``True``.

    Input:
        - Series containing only the unknown value ``'b'``

    Output:
        - one-hot encoding of the input
    """
    # Setup
    ohet = OneHotEncodingTransformer()
    ohet.dummies = ['a']
    ohet.indexer = [0]
    ohet.num_dummies = 1
    ohet.dummy_encoded = True

    # Run
    out = ohet._transform(pd.Series(['b', 'b', 'b']))

    # Assert
    expected = np.array([
        [0],
        [0],
        [0],
    ])
    np.testing.assert_array_equal(out, expected)
def test__transform_nans_categorical(self):
    """Check ``_transform`` on the categorical branch when nulls are present.

    Both ``np.nan`` and ``None`` are expected to receive the same encoding:
    a one in the final column and zeros elsewhere.

    Input:
        - Series with categorical values containing nans

    Output:
        - one-hot encoding of the input
    """
    # Setup
    values = pd.Series([np.nan, None, 'a', 'b'])
    transformer = OneHotEncodingTransformer()
    transformer.dummies = ['a', 'b']
    transformer.indexer = [0, 1]
    transformer.dummy_na = True
    transformer.num_dummies = 2
    transformer.dummy_encoded = True

    # Run
    encoded = transformer._transform(values)

    # Assert
    np.testing.assert_array_equal(
        encoded,
        np.array([
            [0, 0, 1],
            [0, 0, 1],
            [1, 0, 0],
            [0, 1, 0],
        ]),
    )
def test__transform_single_categorical(self):
    """Check ``_transform`` on the categorical branch with a single category.

    When every value equals the only dummy, the result should be a single
    column of ones.

    Input:
        - Series with a single category

    Output:
        - one-hot encoding of the input
    """
    # Setup
    values = pd.Series(['a', 'a', 'a'])
    transformer = OneHotEncodingTransformer()
    transformer.dummies = ['a']
    transformer.indexer = [0]
    transformer.num_dummies = 1
    transformer.dummy_encoded = True

    # Run
    encoded = transformer._transform(values)

    # Assert
    np.testing.assert_array_equal(
        encoded,
        np.array([
            [1],
            [1],
            [1],
        ]),
    )
def test__transform_no_nan(self):
    """Check ``_transform`` on data that contains no nans.

    Each of the three distinct values should map to its own column in the
    one-hot encoding.

    Input:
        - Series with values

    Output:
        - one-hot encoding of the input
    """
    # Setup
    values = pd.Series(['a', 'b', 'c'])
    transformer = OneHotEncodingTransformer()
    transformer.dummies = ['a', 'b', 'c']
    transformer.num_dummies = 3

    # Run
    encoded = transformer._transform(values)

    # Assert
    np.testing.assert_array_equal(
        encoded,
        np.array([
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1],
        ]),
    )