def test_arbitrary_discretiser(): boston_dataset = load_boston() data = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names) user_dict = {"LSTAT": [0, 10, 20, 30, np.Inf]} data_t1 = data.copy() data_t2 = data.copy() data_t1["LSTAT"] = pd.cut(data["LSTAT"], bins=[0, 10, 20, 30, np.Inf]) data_t2["LSTAT"] = pd.cut(data["LSTAT"], bins=[0, 10, 20, 30, np.Inf], labels=False) transformer = ArbitraryDiscretiser( binning_dict=user_dict, return_object=False, return_boundaries=False ) X = transformer.fit_transform(data) # init params assert transformer.return_object is False assert transformer.return_boundaries is False # fit params assert transformer.variables_ == ["LSTAT"] assert transformer.binner_dict_ == user_dict # transform params pd.testing.assert_frame_equal(X, data_t2) transformer = ArbitraryDiscretiser( binning_dict=user_dict, return_object=False, return_boundaries=True ) X = transformer.fit_transform(data) pd.testing.assert_frame_equal(X, data_t1)
def test_error_if_input_df_contains_na_in_transform(df_vartypes, df_na): # test case 1: when dataset contains na, transform method age_dict = {"Age": [0, 10, 20, 30, np.Inf]} with pytest.raises(ValueError): transformer = ArbitraryDiscretiser(binning_dict=age_dict) transformer.fit(df_vartypes) transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]])
def test_arbitrary_discretiser(): california_dataset = fetch_california_housing() data = pd.DataFrame(california_dataset.data, columns=california_dataset.feature_names) user_dict = {"HouseAge": [0, 20, 40, 60, np.Inf]} data_t1 = data.copy() data_t2 = data.copy() # HouseAge is the median house age in the block group. data_t1["HouseAge"] = pd.cut(data["HouseAge"], bins=[0, 20, 40, 60, np.Inf]) data_t1["HouseAge"] = data_t1["HouseAge"].astype(str) data_t2["HouseAge"] = pd.cut(data["HouseAge"], bins=[0, 20, 40, 60, np.Inf], labels=False) transformer = ArbitraryDiscretiser(binning_dict=user_dict, return_object=False, return_boundaries=False) X = transformer.fit_transform(data) # init params assert transformer.return_object is False assert transformer.return_boundaries is False # fit params assert transformer.variables_ == ["HouseAge"] assert transformer.binner_dict_ == user_dict # transform params pd.testing.assert_frame_equal(X, data_t2) transformer = ArbitraryDiscretiser(binning_dict=user_dict, return_object=False, return_boundaries=True) X = transformer.fit_transform(data) pd.testing.assert_frame_equal(X, data_t1)
import numpy as np import pytest from sklearn.utils.estimator_checks import check_estimator from feature_engine.discretisation import ( ArbitraryDiscretiser, DecisionTreeDiscretiser, EqualFrequencyDiscretiser, EqualWidthDiscretiser, ) from tests.estimator_checks.estimator_checks import check_feature_engine_estimator _estimators = [ DecisionTreeDiscretiser(regression=False), EqualFrequencyDiscretiser(), EqualWidthDiscretiser(), ArbitraryDiscretiser(binning_dict={"0": [-np.Inf, 0, np.Inf]}), ] @pytest.mark.parametrize("estimator", _estimators) def test_check_estimator_from_sklearn(estimator): return check_estimator(estimator) @pytest.mark.parametrize("estimator", _estimators) def test_check_estimator_from_feature_engine(estimator): if estimator.__class__.__name__ == "ArbitraryDiscretiser": estimator.set_params(binning_dict={"var_1": [-np.Inf, 0, np.Inf]}) return check_feature_engine_estimator(estimator)
def test_error_when_nan_introduced_during_transform(): # test error when NA are introduced during the discretisation. rng = default_rng() # create dataframe with 2 variables, 1 normal and 1 skewed random = skewnorm.rvs(a=-50, loc=4, size=100) random = random - min( random) # Shift so the minimum value is equal to zero. train = pd.concat( [ pd.Series(rng.standard_normal(100)), pd.Series(random), ], axis=1, ) train.columns = ["var_a", "var_b"] # create a dataframe with 2 variables normally distributed test = pd.concat( [ pd.Series(rng.standard_normal(100)), pd.Series(rng.standard_normal(100)), ], axis=1, ) test.columns = ["var_a", "var_b"] msg = ("During the discretisation, NaN values were introduced " "in the feature(s) var_b.") limits_dict = {"var_a": [-5, -2, 0, 2, 5], "var_b": [0, 2, 5]} # check for warning when errors equals 'ignore' with pytest.warns(UserWarning) as record: transformer = ArbitraryDiscretiser(binning_dict=limits_dict, errors="ignore") transformer.fit(train) transformer.transform(test) # check that only one warning was returned assert len(record) == 1 # check that message matches assert record[0].message.args[0] == msg # check for error when errors equals 'raise' with pytest.raises(ValueError) as record: transformer = ArbitraryDiscretiser(binning_dict=limits_dict, errors="raise") transformer.fit(train) transformer.transform(test) # check that error message matches assert str(record.value) == msg
def test_error_if_not_permitted_value_is_errors(): age_dict = {"Age": [0, 10, 20, 30, np.Inf]} with pytest.raises(ValueError): ArbitraryDiscretiser(binning_dict=age_dict, errors="medialuna")