예제 #1
0
def test_automatically_find_variables_and_return_as_numeric(df_normal_dist):
    """Equal-width binning with auto-detected variables and numeric output.

    With ``variables=None`` the discretiser is expected to pick up ``"var"``
    by itself, learn ten interval edges whose outer limits are infinite, and
    return integer bin labels.
    """
    n_bins = 10
    discretiser = EqualWidthDiscretiser(
        bins=n_bins,
        variables=None,
        return_object=False,
    )
    transformed = discretiser.fit_transform(df_normal_dist)

    # Reference edges: pandas equal-width cut, outer edges opened to +/- inf.
    _, expected_edges = pd.cut(
        x=df_normal_dist["var"],
        bins=n_bins,
        retbins=True,
        duplicates="drop",
    )
    expected_edges[0] = float("-inf")
    expected_edges[-1] = float("inf")

    # Reference transform output: bin labels and per-bin observation counts.
    expected_labels = list(range(10))
    expected_counts = [18, 17, 16, 13, 11, 7, 7, 5, 5, 1]

    # init params
    assert discretiser.bins == 10
    assert discretiser.variables == ["var"]
    assert discretiser.return_object is False

    # fit params
    assert discretiser.input_shape_ == (100, 1)

    # transform params
    assert (discretiser.binner_dict_["var"] == expected_edges).all()
    assert all(label in expected_labels
               for label in transformed["var"].unique())
    # in equal width discretisation, intervals get different number of values
    assert all(count in expected_counts
               for count in transformed["var"].value_counts())
예제 #2
0
def test_custom_models_template(scenario_with_custom_models_template):
    """Every discretiser must be paired with every custom regressor.

    The comparison is done on the string representation of the pipelines.
    """
    aml, pipeline, param_grid = scenario_with_custom_models_template
    produced = aml._make_aml_combinations(pipeline, param_grid)

    discretisers = (('disc1', EqualFrequencyDiscretiser),
                    ('disc2', EqualWidthDiscretiser))
    regressors = (('model1', LinearRegression),
                  ('model2', RandomForestRegressor))

    # Discretisers vary slowest, regressors fastest.
    expected = [
        Pipeline(steps=[(disc_name, disc_cls()), (model_name, model_cls())])
        for disc_name, disc_cls in discretisers
        for model_name, model_cls in regressors
    ]
    assert str(produced) == str(expected)
예제 #3
0
파일: conftest.py 프로젝트: jslomkowski/aml
def scenario_with_default_models_template():
    """Scenario whose last step is a slice of ``aml_basic_regressors``.

    Returns the tuple ``(aml, pipeline, param_grid)`` with an empty grid.
    """
    steps = [
        ('disc1', EqualFrequencyDiscretiser()),
        ('disc2', EqualWidthDiscretiser()),
        # the first two default regressors, passed as a single list-valued step
        aml_basic_regressors[:2],
    ]
    pipeline = Pipeline(steps)
    param_grid = {}
    return AMLGridSearchCV(pipeline, param_grid), pipeline, param_grid
예제 #4
0
파일: conftest.py 프로젝트: jslomkowski/aml
def scenario_with_grid_search_for_one_model():
    """Scenario with two discretisers, two models and a non-empty grid.

    Returns the tuple ``(aml, pipeline, param_grid)``.
    """
    steps = [
        ('disc1', EqualFrequencyDiscretiser()),
        ('disc2', EqualWidthDiscretiser()),
        ('model1', LinearRegression()),
        ('model2', RandomForestRegressor()),
    ]
    pipeline = Pipeline(steps)
    param_grid = {
        'disc1__q': [5, 15],
        'model2__*': [],
    }
    return AMLGridSearchCV(pipeline, param_grid), pipeline, param_grid
예제 #5
0
파일: conftest.py 프로젝트: jslomkowski/aml
def scenario_with_custom_models_template():
    """Scenario whose last step is a user-supplied list of regressors.

    Returns the tuple ``(aml, pipeline, param_grid)`` with an empty grid.
    """
    regressors = [
        ('model1', LinearRegression()),
        ('model2', RandomForestRegressor()),
    ]
    steps = [
        ('disc1', EqualFrequencyDiscretiser()),
        ('disc2', EqualWidthDiscretiser()),
        # the whole list of named regressors is passed as one step
        regressors,
    ]
    pipeline = Pipeline(steps)
    param_grid = {}
    return AMLGridSearchCV(pipeline, param_grid), pipeline, param_grid
예제 #6
0
파일: conftest.py 프로젝트: jslomkowski/aml
def scenario_without_params():
    """Scenario with two discretisers, two models and an empty grid.

    Returns the tuple ``(aml, pipeline, param_grid)``.
    """
    steps = [
        ('disc1', EqualFrequencyDiscretiser()),
        ('disc2', EqualWidthDiscretiser()),
        ('model1', LinearRegression()),
        ('model2', RandomForestRegressor()),
    ]
    pipeline = Pipeline(steps)
    param_grid = {}
    return AMLGridSearchCV(pipeline, param_grid), pipeline, param_grid
예제 #7
0
    def _make_discretiser(self):
        """
        Build the discretiser selected by ``self.strategy``.

        ``"equal_width"`` yields an EqualWidthDiscretiser; any other value
        yields an EqualFrequencyDiscretiser. Both receive ``self.bins``,
        ``self.variables_numerical_`` and ``return_boundaries=True``.
        """
        if self.strategy == "equal_width":
            return EqualWidthDiscretiser(
                bins=self.bins,
                variables=self.variables_numerical_,
                return_boundaries=True,
            )

        # every strategy other than "equal_width" ends up here
        return EqualFrequencyDiscretiser(
            q=self.bins,
            variables=self.variables_numerical_,
            return_boundaries=True,
        )
    def _make_numerical_pipeline(self):
        """
        Build the two-step pipeline applied to the numerical variables.

        Step "discretization" bins ``self.variables_numerical_`` (equal width
        when ``self.strategy == "equal_width"``, equal frequency otherwise)
        with ``return_object=True``; step "encoder" is a MeanEncoder over the
        same variables.
        """
        binning_step = (
            EqualWidthDiscretiser(
                bins=self.bins,
                variables=self.variables_numerical_,
                return_object=True,
            )
            if self.strategy == "equal_width"
            else EqualFrequencyDiscretiser(
                q=self.bins,
                variables=self.variables_numerical_,
                return_object=True,
            )
        )
        encoding_step = MeanEncoder(variables=self.variables_numerical_)

        return Pipeline(
            [
                ("discretization", binning_step),
                ("encoder", encoding_step),
            ]
        )
예제 #9
0
def test_4th_step_in_scenario_without_params(scenario_without_params):
    """The fourth generated pipeline must pair 'disc2' with 'model2'.

    The comparison is done on the string representation of the steps.
    """
    aml, pipeline, param_grid = scenario_without_params
    combinations = aml._make_aml_combinations(pipeline, param_grid)

    expected_steps = [
        ('disc2', EqualWidthDiscretiser()),
        ('model2', RandomForestRegressor()),
    ]
    fourth_pipeline = combinations[3]
    assert str(fourth_pipeline.steps) == str(expected_steps)
def test_non_fitted_error(df_vartypes):
    """Calling transform() before fit() must raise NotFittedError."""
    transformer = EqualWidthDiscretiser()
    # Only the call under test is inside the context manager, so the assertion
    # is tied to transform() and not to object construction.
    with pytest.raises(NotFittedError):
        transformer.transform(df_vartypes)
예제 #11
0
def test_error_if_input_df_contains_na_in_transform(df_vartypes, df_na):
    """transform() must raise ValueError when its input contains NA."""
    # test case 4: when dataset contains na, transform method
    # Construction and fitting happen outside the context manager: a
    # ValueError raised by either of them must fail the test instead of
    # being mistaken for the error expected from transform().
    transformer = EqualWidthDiscretiser()
    transformer.fit(df_vartypes)
    with pytest.raises(ValueError):
        transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]])
예제 #12
0
def test_error_if_input_df_contains_na_in_fit(df_na):
    """fit() must raise ValueError when its input contains NA."""
    # test case 3: when dataset contains na, fit method
    # The constructor is kept outside the context manager so that a
    # ValueError raised at construction cannot satisfy the assertion.
    transformer = EqualWidthDiscretiser()
    with pytest.raises(ValueError):
        transformer.fit(df_na)
예제 #13
0
def test_error_if_return_object_not_bool():
    """A non-boolean ``return_object`` must be rejected at construction."""
    invalid_flag = "other"
    with pytest.raises(ValueError):
        EqualWidthDiscretiser(return_object=invalid_flag)
예제 #14
0
def test_error_when_bins_not_number():
    """A non-numeric ``bins`` value must be rejected at construction."""
    invalid_bins = "other"
    with pytest.raises(ValueError):
        EqualWidthDiscretiser(bins=invalid_bins)
예제 #15
0
def test_automatically_find_variables_and_return_as_object(df_normal_dist):
    """With ``return_object=True`` the binned column must have object dtype."""
    discretiser = EqualWidthDiscretiser(
        bins=10,
        variables=None,
        return_object=True,
    )
    transformed = discretiser.fit_transform(df_normal_dist)
    var_dtype = transformed["var"].dtypes
    assert var_dtype == "O"
예제 #16
0
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find features with high PSI values.

        The input is split into a basis and a test set. Each selected variable
        is discretised with a bucketer fitted on the basis set, and the PSI
        between the two per-bin distributions is stored in ``psi_values_``.
        Features whose PSI is greater than ``threshold`` are collected in
        ``features_to_drop_``.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y : pandas series. Default = None
            y is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If, after the split, the basis or the test set has fewer rows
            than ``bins``.
        """
        # check input dataframe
        X = check_X(X)

        # If required exclude variables that are not in the input dataframe
        self._confirm_variables(X)

        # find numerical variables or check those entered are present in the dataframe
        self.variables_ = _find_or_check_numerical_variables(
            X, self.variables_)

        # Remove the split_col from the variables list. It might be added if the
        # variables are not defined at initialization.
        if self.split_col in self.variables_:
            self.variables_.remove(self.split_col)

        if self.missing_values == "raise":
            # check if dataset contains na or inf
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        # Split the dataframe into basis and test.
        basis_df, test_df = self._split_dataframe(X)

        # Check the shape of the returned dataframes for PSI calculations.
        # The number of observations must be at least equal to the
        # number of bins.
        if min(basis_df.shape[0], test_df.shape[0]) < self.bins:
            raise ValueError(
                "The number of rows in the basis and test datasets that will be used "
                f"in the PSI calculations must be at least equal to {self.bins}. "
                "After splitting the original dataset based on the given cut_off or "
                f"split_frac we have {basis_df.shape[0]} samples in the basis set, "
                f"and {test_df.shape[0]} samples in the test set. "
                "Please adjust the value of the cut_off or split_frac.")

        # Switch basis and test dataframes if required.
        if self.switch:
            test_df, basis_df = basis_df, test_df

        # set up the discretizer
        if self.strategy == "equal_width":
            bucketer = EqualWidthDiscretiser(bins=self.bins)
        else:
            bucketer = EqualFrequencyDiscretiser(q=self.bins)

        # Compute the PSI by looping over the features
        self.psi_values_ = {}
        self.features_to_drop_ = []

        for feature in self.variables_:
            # Discretize the feature: bins are learned on the basis set only
            # and then applied to the test set. Rows with NA are dropped first.
            basis_discrete = bucketer.fit_transform(
                basis_df[[feature]].dropna())
            test_discrete = bucketer.transform(test_df[[feature]].dropna())

            # Determine percentage of observations per bin
            basis_distrib, test_distrib = self._observation_frequency_per_bin(
                basis_discrete, test_discrete)

            # Calculate the PSI value
            self.psi_values_[feature] = np.sum(
                (test_distrib - basis_distrib) *
                np.log(test_distrib / basis_distrib))

            # Assess if feature should be dropped
            if self.psi_values_[feature] > self.threshold:
                self.features_to_drop_.append(feature)

        # save input features
        self._get_feature_names_in(X)

        return self
예제 #17
0
import numpy as np
import pytest
from sklearn.utils.estimator_checks import check_estimator

from feature_engine.discretisation import (
    ArbitraryDiscretiser,
    DecisionTreeDiscretiser,
    EqualFrequencyDiscretiser,
    EqualWidthDiscretiser,
)
from tests.estimator_checks.estimator_checks import check_feature_engine_estimator

# Estimator instances fed to the parametrized compliance tests in this module.
_estimators = [
    DecisionTreeDiscretiser(regression=False),
    EqualFrequencyDiscretiser(),
    EqualWidthDiscretiser(),
    # np.inf is the canonical spelling; the np.Inf alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    ArbitraryDiscretiser(binning_dict={"0": [-np.inf, 0, np.inf]}),
]


@pytest.mark.parametrize("estimator", _estimators)
def test_check_estimator_from_sklearn(estimator):
    """Run scikit-learn's estimator compliance checks on each discretiser."""
    # No return value: pytest flags test functions that return non-None, and
    # the checks signal failure by raising.
    check_estimator(estimator)


@pytest.mark.parametrize("estimator", _estimators)
def test_check_estimator_from_feature_engine(estimator):
    """Run the feature_engine estimator checks on each discretiser."""
    if estimator.__class__.__name__ == "ArbitraryDiscretiser":
        # Re-key the binning dict to "var_1" before running the checks.
        # np.inf replaces the np.Inf alias, which was removed in NumPy 2.0.
        estimator.set_params(binning_dict={"var_1": [-np.inf, 0, np.inf]})
    # No return value: pytest flags test functions that return non-None.
    check_feature_engine_estimator(estimator)
예제 #18
0
                              min_lr=0.01)
# Early-stopping callback keyed on 'val_loss' (mode 'min', patience 20);
# it is handed to the classifier through its `callbacks` list.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0,
    verbose=1,
    patience=20,
)

pump_pipeline = Pipeline(
    steps=[("feature_to_keeper",
            pp.FeatureKeeper(variables_to_keep=config.VARIABLES_TO_KEEP)),
           ("missing_imputer",
            pp.MissingImputer(numerical_variables=config.NUMERICAL_VARIABLES)),
           ("yeoJohnson",
            YeoJohnsonTransformer(variables=config.YEO_JHONSON_VARIABLES)),
           ("discretization",
            EqualWidthDiscretiser(bins=5, variables=config.NUMERICAL_VARIABLES)
            ),
           ("categorical_grouper",
            pp.CategoricalGrouping(config_dict=config.VARIABLES_TO_GROUP)),
           ("rareCategories_grouper",
            pp.RareCategoriesGrouping(threshold=config.VARIABLES_THRESHOLD)),
           ("one_hot_encoder",
            OneHotEncoder(variables=config.REAL_CATEGORICAL_VARIABLES,
                          drop_last=False)), ("scaler", MinMaxScaler()),
           ("model",
            KerasClassifier(build_fn=create_model,
                            epochs=1,
                            validation_split=0.2,
                            batch_size=256,
                            verbose=1,
                            callbacks=[early_stop, reduce_lr],