def test_transform():
    """Transforming a raw numpy row must yield the expected encoded vector."""
    df = _get_clean_dataframe()
    vectorizer = SuperVectorizer()
    vectorizer.fit(df)
    # One sample matching the clean dataframe's column layout.
    row = np.array([34, 5.5, 'private', 'manager', 'yes', '60K+']).reshape(1, -1)
    transformed = vectorizer.transform(row)
    expected = [[1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 34, 5.5]]
    assert (transformed == expected).all()
def test_auto_cast():
    """
    Tests that the SuperVectorizer automatic type detection works as expected.
    """
    vectorizer = SuperVectorizer()

    # Every datetime-like column should be converted to datetime64[ns].
    X = _get_datetimes_dataframe()
    expected_types_datetimes = dict.fromkeys(
        ["pd_datetime", "np_datetime", "dmy-", "ymd/", "ymd/_hms:"],
        "datetime64[ns]",
    )
    X_trans = vectorizer._auto_cast(X)
    for column in X_trans.columns:
        assert expected_types_datetimes[column] == X_trans[column].dtype

    # On a clean dataframe every column keeps its natural dtype.
    expected_types_clean_dataframe = {
        "int": "int64",
        "float": "float64",
        "str1": "object",
        "str2": "object",
        "cat1": "object",
        "cat2": "object",
    }
    X_trans = vectorizer._auto_cast(_get_clean_dataframe())
    for column in X_trans.columns:
        assert type_equality(expected_types_clean_dataframe[column],
                             X_trans[column].dtype)

    # Missing values must not prevent type detection; the int column
    # becomes float64 because the int dtype cannot represent NaNs.
    expected_types_dirty_dataframe = {
        "int": "float64",
        "float": "float64",
        "str1": "object",
        "str2": "object",
        "cat1": "object",
        "cat2": "object",
    }
    X_trans = vectorizer._auto_cast(_get_dirty_dataframe())
    for column in X_trans.columns:
        assert type_equality(expected_types_dirty_dataframe[column],
                             X_trans[column].dtype)
def test_fit():
    """
    Check that sklearn's `check_is_fitted` raises `NotFittedError` when the
    SuperVectorizer is instantiated but not fitted.

    See GH#193.
    """
    sup_vec = SuperVectorizer()
    with pytest.raises(NotFittedError):
        # Do NOT wrap these calls in `assert`: `check_is_fitted` returns
        # None when the estimator is fitted, so the assert would raise an
        # AssertionError (not caught by `pytest.raises(NotFittedError)`)
        # and obscure the real failure if no NotFittedError is raised.
        if LooseVersion(sklearn.__version__) >= LooseVersion('0.22'):
            check_is_fitted(sup_vec)
        else:
            # Older sklearn requires explicit attributes to check.
            check_is_fitted(sup_vec, attributes=dir(sup_vec))
def test_with_arrays():
    """
    Check that the SuperVectorizer works if we input a list of lists or a numpy array.
    """
    # Without column names, transformers are assigned to column indices.
    expected_transformers = {
        'numeric': [0, 1],
        'low_card_cat': [2, 4],
        'high_card_cat': [3, 5],
    }
    vectorizer = SuperVectorizer(
        cardinality_threshold=4,
        # we must have n_samples = 5 >= n_components
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )

    # Both input kinds must produce the same transformer assignment.
    for data in (_get_numpy_array(), _get_list_of_lists()):
        vectorizer.fit_transform(data)
        check_same_transformers(expected_transformers, vectorizer.transformers)
def test_fit_transform_equiv():
    """
    We will test the equivalence between using `.fit_transform(X)`
    and `.fit(X).transform(X).`
    """
    # Run the comparison on both a clean and a dirty dataframe, each time
    # with fresh vectorizer instances so the two paths are independent.
    for X in (_get_clean_dataframe(), _get_dirty_dataframe()):
        enc_chained = SuperVectorizer().fit_transform(X)
        enc_two_step = SuperVectorizer().fit(X).transform(X)
        # Exact equality is required (rtol=atol=0); NaNs must coincide too.
        assert np.allclose(enc_chained, enc_two_step, rtol=0, atol=0,
                           equal_nan=True)
def test_get_feature_names_out():
    X = _get_clean_dataframe()

    vectorizer_w_pass = SuperVectorizer(remainder='passthrough')
    vectorizer_w_pass.fit(X)

    if LooseVersion(sklearn.__version__) < LooseVersion('0.23'):
        with pytest.raises(NotImplementedError):
            # Prior to sklearn 0.23, ColumnTransformer.get_feature_names
            # with "passthrough" transformer(s) raises a NotImplementedError
            assert vectorizer_w_pass.get_feature_names()
            assert vectorizer_w_pass.get_feature_names_out()
    else:
        expected_feature_names_pass = [  # Order matters. If it doesn't, convert to set.
            'str1_private', 'str1_public', 'str2_chef', 'str2_lawyer',
            'str2_manager', 'str2_officer', 'str2_teacher', 'cat1_no',
            'cat1_yes', 'cat2_20K+', 'cat2_30K+', 'cat2_40K+', 'cat2_50K+',
            'cat2_60K+', 'int', 'float'
        ]
        assert vectorizer_w_pass.get_feature_names(
        ) == expected_feature_names_pass
        assert vectorizer_w_pass.get_feature_names_out(
        ) == expected_feature_names_pass

    vectorizer_w_drop = SuperVectorizer(remainder='drop')
    vectorizer_w_drop.fit(X)

    expected_feature_names_drop = [  # Order matters. If it doesn't, convert to set.
        'str1_private', 'str1_public', 'str2_chef', 'str2_lawyer',
        'str2_manager', 'str2_officer', 'str2_teacher', 'cat1_no', 'cat1_yes',
        'cat2_20K+', 'cat2_30K+', 'cat2_40K+', 'cat2_50K+', 'cat2_60K+'
    ]
    assert vectorizer_w_drop.get_feature_names() == expected_feature_names_drop
    assert vectorizer_w_drop.get_feature_names_out(
    ) == expected_feature_names_drop
def _test_possibilities(
    X,
    expected_transformers_df,
    expected_transformers_2,
    expected_transformers_np_no_cast,
    expected_transformers_series,
    expected_transformers_plain,
    expected_transformers_np_cast,
):
    """
    Do a bunch of tests with the SuperVectorizer.
    We take some expected transformers results as argument. They're usually
    lists or dictionaries.
    """
    # Low cardinality threshold and a StandardScaler for the numeric columns.
    base_vec = SuperVectorizer(
        cardinality_threshold=4,
        # we must have n_samples = 5 >= n_components
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )
    # Warning: order-dependant
    base_vec.fit_transform(X)
    check_same_transformers(expected_transformers_df, base_vec.transformers)

    # Defaults: higher cardinality threshold, no numeric transformer.
    default_vec = SuperVectorizer()
    default_vec.fit_transform(X)
    check_same_transformers(expected_transformers_2, default_vec.transformers)

    # Numpy array input: column indices replace the column names.
    base_vec.fit_transform(X.to_numpy())
    check_same_transformers(expected_transformers_np_no_cast,
                            base_vec.transformers)

    # Pandas Series input.
    base_vec.fit_transform(X['cat1'])
    check_same_transformers(expected_transformers_series,
                            base_vec.transformers)

    # Value casting: feed everything in as plain Python objects.
    cast_vec = SuperVectorizer(
        cardinality_threshold=4,
        # we must have n_samples = 5 >= n_components
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )
    X_as_object = X.astype('object')
    # With pandas
    cast_vec.fit_transform(X_as_object)
    check_same_transformers(expected_transformers_plain,
                            cast_vec.transformers)
    # With numpy
    cast_vec.fit_transform(X_as_object.to_numpy())
    check_same_transformers(expected_transformers_np_cast,
                            cast_vec.transformers)
# ---- Example #8 (snippet separator from original scrape) ----
# machine learning.

# %%
# Using the SuperVectorizer in a supervised-learning pipeline
# -----------------------------------------------------------
#
# Assembling the |SV| in a |Pipeline| with a powerful learner,
# such as gradient boosted trees, gives **a machine-learning method that
# can be readily applied to the dataframe**.
#
# The SuperVectorizer requires at least dirty_cat 0.2.0.
#

from dirty_cat import SuperVectorizer

# NOTE(review): this is a fragment of a longer example script — it assumes
# `make_pipeline`, `HistGradientBoostingRegressor`, `np`, `X` and `y` are
# defined earlier in the full script; confirm against the complete file.
pipeline = make_pipeline(SuperVectorizer(auto_cast=True),
                         HistGradientBoostingRegressor())

# %%
# Let's perform a cross-validation to see how well this model predicts

from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline, X, y, scoring='r2')

print(f'scores={scores}')
print(f'mean={np.mean(scores)}')
print(f'std={np.std(scores)}')

# %%
# The prediction performed here is pretty much as good as above
# but the code here is much simpler as it does not involve specifying
# ---- Example #9 (snippet separator from original scrape) ----
#
# Assembling the |SV| in a pipeline with a powerful learner,
# such as gradient boosted trees, gives **a machine-learning method that
# can be readily applied to the dataframe**.
#
# It's the typical and recommended way of using it.

# For scikit-learn 0.24, we need to require the experimental feature
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.pipeline import Pipeline

from dirty_cat import SuperVectorizer

# NOTE(review): fragment of a longer example script — `X` and `y` are
# presumably defined earlier in the full script; confirm against it.
pipeline = Pipeline([('vectorizer', SuperVectorizer(auto_cast=True)),
                     ('clf', HistGradientBoostingRegressor(random_state=42))])

###############################################################################
# Let's perform a cross-validation to see how well this model predicts

from sklearn.model_selection import cross_val_score

scores = cross_val_score(pipeline, X, y, scoring='r2')

import numpy as np

print(f'{scores=}')
print(f'mean={np.mean(scores)}')
print(f'std={np.std(scores)}')
# ---- Example #10 (snippet separator from original scrape) ----
def test_super_vectorizer():
    """End-to-end checks of the SuperVectorizer's transformer assignment."""
    X = _get_dataframe()

    # Low cardinality threshold and a StandardScaler for the numeric columns.
    base_vec = SuperVectorizer(
        cardinality_threshold=3,
        # we must have n_samples = 5 >= n_components
        high_card_str_transformer=GapEncoder(n_components=2),
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )
    # Warning: order-dependant
    expected_df = {
        'numeric': ['int', 'float'],
        'low_card_str': ['str1'],
        'high_card_str': ['str2'],
        'low_card_cat': ['cat1'],
        'high_card_cat': ['cat2'],
    }
    base_vec.fit_transform(X)
    check_same_transformers(expected_df, base_vec.transformers)

    # Defaults: higher cardinality threshold, no numeric transformer.
    default_vec = SuperVectorizer()
    expected_defaults = {
        'low_card_str': ['str1', 'str2'],
        'low_card_cat': ['cat1', 'cat2'],
    }
    default_vec.fit_transform(X)
    check_same_transformers(expected_defaults, default_vec.transformers)

    # Numpy array input: column indices replace the column names.
    expected_np = {
        'numeric': [0, 1],
        'low_card_str': [2, 4],
        'high_card_str': [3, 5],
    }
    base_vec.fit_transform(X.to_numpy())
    check_same_transformers(expected_np, base_vec.transformers)

    # Pandas Series input.
    expected_series = {
        'low_card_cat': ['cat1'],
    }
    base_vec.fit_transform(X['cat1'])
    check_same_transformers(expected_series, base_vec.transformers)

    # auto_cast=True must recover the numeric columns from object dtype.
    cast_vec = SuperVectorizer(
        cardinality_threshold=3,
        auto_cast=True,
        # we must have n_samples = 5 >= n_components
        high_card_str_transformer=GapEncoder(n_components=2),
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )
    X_as_object = X.astype('object')
    expected_cast = {
        'high_card_str': ['str2', 'cat2'],
        'low_card_str': ['str1', 'cat1'],
        'numeric': ['int', 'float']
    }
    # With pandas
    cast_vec.fit_transform(X_as_object)
    check_same_transformers(expected_cast, cast_vec.transformers)
    # With numpy
    cast_vec.fit_transform(X_as_object.to_numpy())
    check_same_transformers(expected_np, cast_vec.transformers)