def test_robust_ordinal_encoder():
    """Run a fitted RobustOrdinalEncoder through ``_test_model_impl``.

    The encoder is fitted on a small float32 matrix and passed to the shared
    helper together with an input shape of ``(relay.Any(), n_columns)``.
    """
    helper = SklearnTestHelper()
    model = RobustOrdinalEncoder()
    samples = np.array(
        [
            [0, 1],
            [0, 4],
            [1, 2],
            [1, 10],
        ],
        dtype=np.float32,
    )
    model.fit(samples)
    # NOTE(review): relay.Any() in the leading slot presumably marks a
    # dynamic row count -- confirm against the helper's expectations.
    input_shape = (relay.Any(), samples.shape[1])
    _test_model_impl(helper, model, input_shape, samples)
def test_robust_ordinal_encoding_transform():
    """Check transform output for the fitted rows and one appended row.

    The encoder is fitted on ``ordinal_data``; the transformed input is
    ``ordinal_data`` plus one extra row (presumably values never seen during
    fit). Every code for the original rows must be below 3, and every code in
    the extra row must equal 3.
    """
    encoder = RobustOrdinalEncoder()
    encoder.fit(ordinal_data)

    extra_row = np.array([["waffle", 1213, None]])
    test_data = np.concatenate([ordinal_data, extra_row], axis=0)
    encoded = encoder.transform(test_data)

    assert np.all(encoded[:-1] < 3)
    assert np.all(encoded[-1] == 3)
def test_robust_ordinal_encoding_transform(unknown_as_nan):
    """Check transform output with and without ``unknown_as_nan``.

    The encoder is fitted on ``ordinal_data`` and applied to that data plus
    one extra row. Codes for the original rows must all be below 3. The extra
    row must be all NaN when ``unknown_as_nan`` is truthy, otherwise all 3.

    ``unknown_as_nan`` is presumably supplied by a pytest parametrization or
    fixture that is not visible in this block.
    """
    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan)
    encoder.fit(ordinal_data)

    extra_row = np.array([["waffle", 1213, np.nan]])
    test_data = np.concatenate([ordinal_data, extra_row], axis=0)
    encoded = encoder.transform(test_data)

    assert np.all(encoded[:-1] < 3)
    if not unknown_as_nan:
        assert np.all(encoded[-1] == 3)
    else:
        assert np.all(np.isnan(encoded[-1]))
def test_robust_ordinal_encoding_transform_max_categories():
    """Check how ``max_categories`` limits the categories kept per feature.

    Two scenarios are covered: a single column with far more distinct values
    than ``max_categories``, and a two-column input whose distinct-value count
    equals ``max_categories`` exactly.
    """
    # Scenario 1: many more distinct values than max_categories.
    # Values 0..99 occur three times, 100..149 twice and 150..199 once.
    column = list(range(200)) + list(range(150)) + list(range(100))
    data = np.array([column]).T

    encoder = RobustOrdinalEncoder(max_categories=100)
    encoder.fit(data)
    kept = encoder.categories_[0]
    assert len(kept) == 100
    assert np.all(kept <= 100)

    encoded = encoder.transform(data)
    cats, frequencies = np.unique(encoded, return_counts=True)
    assert len(cats) == encoder.max_categories + 1
    assert np.count_nonzero(frequencies == 3) == 100

    # Scenario 2: number of distinct values equals max_categories.
    encoder = RobustOrdinalEncoder(max_categories=2)
    encoder.fit(np.array([["x", "y"], ["y", "x"]]))
    assert len(encoder.categories_[0]) == 2
    assert len(encoder.categories_[1]) == 2

    encoded = encoder.transform([["x", "y"], ["z", "z"]])
    # "z" is not among the fitted values and must map to code 2.
    assert np.all(encoded[1] == 2)
    assert np.all(encoded[0] == [0, 1])
def test_robust_ordinal_encoding_inverse_transform(unknown_as_nan):
    """Check that ``inverse_transform`` round-trips the fitted rows.

    The encoder is fitted on ``ordinal_data`` and applied to that data plus
    one extra row. Inverting the encoding must reproduce ``ordinal_data`` for
    the original rows and yield ``None`` for every entry of the extra row.

    ``unknown_as_nan`` is presumably supplied by a pytest parametrization or
    fixture that is not visible in this block.
    """
    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan)
    encoder.fit(ordinal_data)

    extra_row = np.array([["waffle", 1213, None]])
    test_data = np.concatenate([ordinal_data, extra_row], axis=0)

    encoded = encoder.transform(test_data)
    reverse = encoder.inverse_transform(encoded)

    assert np.array_equal(ordinal_data, reverse[:-1])
    assert all(value is None for value in reverse[-1])
def test_robust_ordinal_encoding_inverse_transform_floatkeys():
    """Check ``inverse_transform`` when the fitted values are float32.

    The fit data is the 3x3 matrix of 0..8. Adding 3 to it gives the rows
    [3, 4, 5], [6, 7, 8] and [9, 10, 11]: the first two match the last two
    fitted rows, and the third holds values that were not fitted.
    """
    data = np.arange(9).astype(np.float32).reshape((3, 3))
    encoder = RobustOrdinalEncoder()
    encoder.fit(data)

    shifted = data + 3
    encoded = encoder.transform(shifted)
    reverse = encoder.inverse_transform(encoded)

    assert reverse.dtype == object
    assert np.array_equal(data[1:], reverse[:-1])
    assert all(value is None for value in reverse[-1])
def test_robust_ordinal_encoding_categories():
    """Check the fitted ``categories_`` against the expected sets.

    For each feature, the categories found by fitting on ``ordinal_data``
    must match the corresponding entry of ``ordinal_expected_categories_``
    as a set, so order is ignored.
    """
    encoder = RobustOrdinalEncoder()
    encoder.fit(ordinal_data)

    for feature_idx, learned in enumerate(encoder.categories_):
        wanted = set(ordinal_expected_categories_[feature_idx])
        assert set(learned) == wanted
def test_robust_ordinal_encoding_inverse_transform(unknown_as_nan):
    """Check ``inverse_transform`` with and without a frequency threshold.

    Three encoders are exercised on ``ordinal_data`` plus one extra row:
    the default configuration, ``threshold=2`` and ``threshold=10``. Entries
    that cannot be mapped back to a fitted category must come back as
    ``None``.

    ``unknown_as_nan`` is presumably supplied by a pytest parametrization or
    fixture that is not visible in this block.
    """
    extra_row = np.array([["waffle", 1213, None]])

    # Default threshold: fitted rows round-trip, the extra row is all None.
    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan)
    encoder.fit(ordinal_data)
    test_data = np.concatenate([ordinal_data, extra_row], axis=0)
    reverse = encoder.inverse_transform(encoder.transform(test_data))
    assert np.array_equal(ordinal_data, reverse[:-1])
    assert all(value is None for value in reverse[-1])

    # Test where some categories are below the threshold
    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan, threshold=2)
    encoder.fit(ordinal_data)
    reverse = encoder.inverse_transform(encoder.transform(test_data))
    # Expected number of None entries in columns 0, 1 and 2 respectively.
    for column, none_count in enumerate((3, 2, 2)):
        assert sum(value is None for value in reverse[:, column]) == none_count

    # Test where all categories are below the threshold
    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan, threshold=10)
    encoder.fit(ordinal_data)
    reverse = encoder.inverse_transform(encoder.transform(test_data))
    assert sum(value is None for value in reverse.flatten()) == reverse.size
def test_robust_ordinal_encoding_transform_threshold():
    """Check transform output when a frequency ``threshold`` is set.

    With ``threshold=2`` the first column's codes must stay below 2 and the
    other columns' codes below 3, both for the fit data and for the fit data
    plus one extra row. With ``threshold=10`` all three features end up in
    ``feature_idxs_no_categories_`` and every code is 0.
    """
    # Test where some categories are below the threshold
    encoder = RobustOrdinalEncoder(threshold=2)
    encoder.fit(ordinal_data)
    encoded = encoder.transform(ordinal_data)
    assert np.all(encoded[:, 0] < 2)
    assert np.all(encoded[:, 1:] < 3)

    # Test where some categories are below the threshold and new categories
    # are introduced in transformation
    extra_row = np.array([["waffle", 1213, np.nan]])
    test_data = np.concatenate([ordinal_data, extra_row], axis=0)
    encoded = encoder.transform(test_data)
    assert np.all(encoded[:, 0] < 2)
    assert np.all(encoded[:, 1:] < 3)

    # Test where all categories are below the threshold
    encoder = RobustOrdinalEncoder(threshold=10)
    encoder.fit(ordinal_data)
    assert len(encoder.feature_idxs_no_categories_) == 3
    encoded = encoder.transform(test_data)
    assert np.all(encoded == 0)
def test_robust_ordinal_encoding_categories(threshold, expected):
    """Check the fitted ``categories_`` for a given ``threshold``.

    For each feature, the categories found by fitting on ``ordinal_data``
    must match ``expected[i]`` as a set, so order is ignored.

    ``threshold`` and ``expected`` are presumably supplied by a pytest
    parametrization that is not visible in this block.
    """
    encoder = RobustOrdinalEncoder(threshold=threshold)
    encoder.fit(ordinal_data)

    for feature_idx, learned in enumerate(encoder.categories_):
        wanted = set(expected[feature_idx])
        assert set(learned) == wanted