Exemplo n.º 1
0
def test_robust_ordinal_encoder():
    """Fit a ``RobustOrdinalEncoder`` on a tiny matrix and pass it to ``_test_model_impl``.

    The fixture is a 4x2 float32 array.  The shape handed to
    ``_test_model_impl`` uses ``relay.Any()`` for the first axis and the
    fixture's column count for the second.
    """
    helper = SklearnTestHelper()
    encoder = RobustOrdinalEncoder()
    samples = np.array([[0, 1], [0, 4], [1, 2], [1, 10]], dtype=np.float32)
    encoder.fit(samples)
    # Only the number of columns is fixed; the row axis is left to relay.Any().
    n_columns = samples.shape[1]
    input_shape = (relay.Any(), n_columns)
    _test_model_impl(helper, encoder, input_shape, samples)
Exemplo n.º 2
0
def test_robust_ordinal_encoding_transform():
    """Check ``transform`` output for fitted rows and for one appended row.

    The encoder is fitted on ``ordinal_data`` with default settings.  The
    row ``["waffle", 1213, None]`` is appended before transforming.  Codes
    for the original rows must all be below 3; codes for the appended row
    must all equal 3.
    """
    encoder = RobustOrdinalEncoder()
    encoder.fit(ordinal_data)

    appended_row = np.array([["waffle", 1213, None]])
    codes = encoder.transform(np.concatenate([ordinal_data, appended_row], axis=0))

    fitted_codes = codes[:-1]
    appended_codes = codes[-1]
    assert np.all(fitted_codes < 3)
    assert np.all(appended_codes == 3)
def test_robust_ordinal_encoding_inverse_transform(unknown_as_nan):
    """Round-trip ``ordinal_data`` plus one appended row through the encoder.

    Args:
        unknown_as_nan: forwarded unchanged to
            ``RobustOrdinalEncoder(unknown_as_nan=...)``.
    """
    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan)
    encoder.fit(ordinal_data)

    appended_row = np.array([["waffle", 1213, None]])
    augmented = np.concatenate([ordinal_data, appended_row], axis=0)
    restored = encoder.inverse_transform(encoder.transform(augmented))

    # The rows that came from ordinal_data must be recovered exactly.
    assert np.array_equal(ordinal_data, restored[:-1])
    # Every entry of the appended row must come back as None.
    assert all(value is None for value in restored[-1])
def test_robust_ordinal_encoding_inverse_transform_floatkeys():
    """Round-trip float32 inputs through ``transform`` / ``inverse_transform``.

    The encoder is fitted on a 3x3 float32 grid holding 0..8 and then
    applied to the same grid shifted by 3.
    """
    grid = np.arange(9).astype(np.float32).reshape((3, 3))
    encoder = RobustOrdinalEncoder()
    encoder.fit(grid)

    # Adding 3 moves each value down one row of the grid: the first two
    # shifted rows equal the last two fitted rows, and the last shifted
    # row (9, 10, 11) holds values that were not present during fit.
    shifted = grid + 3
    restored = encoder.inverse_transform(encoder.transform(shifted))

    assert restored.dtype == object
    assert np.array_equal(grid[1:], restored[:-1])
    assert all(value is None for value in restored[-1])
def test_robust_ordinal_encoding_transform(unknown_as_nan):
    """Check ``transform`` output with and without ``unknown_as_nan``.

    Args:
        unknown_as_nan: forwarded to the encoder.  When truthy, the codes of
            the appended row must all be NaN; otherwise they must all be 3.
    """
    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan)
    encoder.fit(ordinal_data)

    appended_row = np.array([["waffle", 1213, np.nan]])
    codes = encoder.transform(np.concatenate([ordinal_data, appended_row], axis=0))

    # Codes for the rows taken from ordinal_data stay below 3.
    assert np.all(codes[:-1] < 3)

    appended_codes = codes[-1]
    matches_expectation = np.isnan(appended_codes) if unknown_as_nan else appended_codes == 3
    assert np.all(matches_expectation)
def test_robust_ordinal_encoding_transform_max_categories():
    """Exercise ``max_categories`` above and exactly at the distinct-value count."""
    # Case 1: far more distinct values than max_categories.
    # Single column: 0..199 followed by 0..149 followed by 0..99, so the
    # values 0..99 occur three times each.
    column = list(range(200)) + list(range(150)) + list(range(100))
    data = np.array([column]).T

    encoder = RobustOrdinalEncoder(max_categories=100)
    encoder.fit(data)
    kept = encoder.categories_[0]
    assert len(kept) == 100
    assert np.all(kept <= 100)

    codes = encoder.transform(data)
    distinct_codes, counts = np.unique(codes, return_counts=True)
    # One extra code on top of the kept categories.
    assert len(distinct_codes) == encoder.max_categories + 1
    assert np.count_nonzero(counts == 3) == 100

    # Case 2: number of distinct values equals max_categories.
    encoder = RobustOrdinalEncoder(max_categories=2)
    encoder.fit(np.array([["x", "y"], ["y", "x"]]))
    for column_idx in (0, 1):
        assert len(encoder.categories_[column_idx]) == 2

    codes = encoder.transform([["x", "y"], ["z", "z"]])
    # "z" was not present during fit; both of its codes must be 2.
    assert np.all(codes[1] == 2)
    assert np.all(codes[0] == [0, 1])
def test_robust_ordinal_encoding_transform_threshold():
    """Check ``transform`` output for ``threshold=2`` and ``threshold=10``."""

    def assert_code_bounds(codes):
        # First column stays below 2; every other column stays below 3.
        assert np.all(codes[:, 0] < 2)
        assert np.all(codes[:, 1:] < 3)

    # Some categories fall below the threshold.
    encoder = RobustOrdinalEncoder(threshold=2)
    encoder.fit(ordinal_data)
    assert_code_bounds(encoder.transform(ordinal_data))

    # Same encoder, with a row appended at transform time.
    appended_row = np.array([["waffle", 1213, np.nan]])
    test_data = np.concatenate([ordinal_data, appended_row], axis=0)
    assert_code_bounds(encoder.transform(test_data))

    # All categories fall below the threshold.
    encoder = RobustOrdinalEncoder(threshold=10)
    encoder.fit(ordinal_data)
    assert len(encoder.feature_idxs_no_categories_) == 3
    assert np.all(encoder.transform(test_data) == 0)
def test_robust_ordinal_encoding_inverse_transform(unknown_as_nan):
    """Round-trip through the encoder for default, ``threshold=2`` and ``threshold=10``.

    Args:
        unknown_as_nan: forwarded unchanged to every encoder built here.
    """

    def count_none(values):
        # Number of entries that are exactly None.
        return sum(value is None for value in values)

    appended_row = np.array([["waffle", 1213, None]])
    test_data = np.concatenate([ordinal_data, appended_row], axis=0)

    # Default threshold: fitted rows are recovered, the appended row is all None.
    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan)
    encoder.fit(ordinal_data)
    restored = encoder.inverse_transform(encoder.transform(test_data))
    assert np.array_equal(ordinal_data, restored[:-1])
    assert all(value is None for value in restored[-1])

    # Some categories fall below the threshold.
    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan, threshold=2)
    encoder.fit(ordinal_data)
    restored = encoder.inverse_transform(encoder.transform(test_data))
    # Expected number of None entries in columns 0, 1 and 2 respectively.
    for column_idx, expected_nones in enumerate((3, 2, 2)):
        assert count_none(restored[:, column_idx]) == expected_nones

    # All categories fall below the threshold: every entry decodes to None.
    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan, threshold=10)
    encoder.fit(ordinal_data)
    restored = encoder.inverse_transform(encoder.transform(test_data))
    assert count_none(restored.ravel()) == restored.size
def test_robust_ordinal_encoding_categories():
    """Compare fitted ``categories_`` with ``ordinal_expected_categories_`` column by column.

    Order within a column is ignored: both sides are compared as sets.
    """
    encoder = RobustOrdinalEncoder()
    encoder.fit(ordinal_data)
    for column_idx, learned in enumerate(encoder.categories_):
        learned_set = set(learned)
        expected_set = set(ordinal_expected_categories_[column_idx])
        assert learned_set == expected_set
def test_robust_ordinal_encoding_categories(threshold, expected):
    """Compare fitted ``categories_`` with ``expected`` for a given ``threshold``.

    Args:
        threshold: forwarded to ``RobustOrdinalEncoder(threshold=...)``.
        expected: indexable of per-column collections; entry ``i`` is compared
            as a set against the encoder's ``categories_[i]``.
    """
    encoder = RobustOrdinalEncoder(threshold=threshold)
    encoder.fit(ordinal_data)
    for column_idx, learned in enumerate(encoder.categories_):
        learned_set = set(learned)
        expected_set = set(expected[column_idx])
        assert learned_set == expected_set