def test_classification(df_normal_dist):
    """Check DecisionTreeDiscretiser on a binary target scored with ROC-AUC.

    Verifies that the init parameters are stored, that the fit attributes are
    populated, and that the transformed column only holds the expected values.
    """
    settings = {
        "cv": 3,
        "scoring": "roc_auc",
        "variables": None,
        "param_grid": {"max_depth": [1, 2, 3, 4]},
        "regression": False,
        "random_state": 0,
    }
    transformer = DecisionTreeDiscretiser(**settings)

    # Seed right before sampling so the binary target is reproducible.
    np.random.seed(0)
    y = pd.Series(np.random.binomial(1, 0.7, 100))

    X = transformer.fit_transform(df_normal_dist, y)
    X_t = [1.0, 0.71, 0.93, 0.0]

    # init params
    assert transformer.cv == 3
    assert transformer.variables == ["var"]
    assert transformer.scoring == "roc_auc"
    assert transformer.regression is False

    # fit params
    assert transformer.input_shape_ == (100, 1)

    # transform params: no rounded output value may fall outside X_t
    rounded_values = np.round(X["var"].unique(), 2)
    unexpected = [value for value in rounded_values if value not in X_t]
    assert not unexpected
    assert transformer.scores_dict_ == {"var": 0.717391304347826}
def test_error_when_regression_is_false_and_target_is_continuous(
    df_discretise,
):
    """Expect ValueError when a continuous target meets regression=False."""
    # Continuous target sampled from a normal distribution with a fixed seed.
    np.random.seed(42)
    mean, std = 0, 3
    continuous_target = np.random.normal(mean, std, len(df_discretise))

    with pytest.raises(ValueError):
        classifier_discretiser = DecisionTreeDiscretiser(regression=False)
        classifier_discretiser.fit(
            df_discretise[["var_A", "var_B"]], continuous_target
        )
def test_regression(df_normal_dist):
    """Check DecisionTreeDiscretiser on a continuous target (regression mode).

    Verifies that the init parameters are stored unchanged, that the fit
    attributes are populated, and that every rounded value in the transformed
    column is one of the expected leaf predictions listed in ``X_t``.
    """
    transformer = DecisionTreeDiscretiser(
        cv=3,
        scoring="neg_mean_squared_error",
        variables=None,
        param_grid={"max_depth": [1, 2, 3, 4]},
        regression=True,
        random_state=0,
    )

    # Seed right before sampling so the continuous target is reproducible.
    np.random.seed(0)
    y = pd.Series(np.random.normal(0, 0.1, 100))

    X = transformer.fit_transform(df_normal_dist, y)
    X_t = [
        0.19,
        0.04,
        0.11,
        0.23,
        -0.09,
        -0.02,
        0.01,
        0.15,
        0.07,
        -0.26,
        0.09,
        -0.07,
        -0.16,
        -0.2,
        -0.04,
        -0.12,
    ]

    # init params
    assert transformer.cv == 3
    assert transformer.variables is None
    assert transformer.scoring == "neg_mean_squared_error"
    assert transformer.regression is True

    # fit params
    assert transformer.variables_ == ["var"]
    assert transformer.n_features_in_ == 1
    assert np.round(transformer.scores_dict_["var"], 3) == np.round(
        -4.4373314584616444e-05, 3
    )

    # transform params
    # The previous form, all(x for x in ... if x not in X_t), only tested the
    # truthiness of the *unexpected* values and therefore passed even when
    # unexpected values were present. Check membership directly instead.
    assert all(x in X_t for x in np.round(X["var"].unique(), 2))
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Fit a decision tree per variable.

    The categories are first encoded with an OrdinalEncoder and the encoded
    variables are then passed to a DecisionTreeDiscretiser; both steps are
    chained in a Pipeline stored in ``self.encoder_``.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        Can be the entire dataframe, not just the categorical variables.

    y : pandas series.
        The target variable. Required to train the decision tree and for
        ordered ordinal encoding.

    Raises
    ------
    TypeError
        - If the input is not a Pandas DataFrame.
        - If any user provided variable is not categorical
    ValueError
        - If there are no categorical variables in the df or the df is empty
        - If the variable(s) contain null values

    Returns
    -------
    self
    """
    # validate the input dataframe and the selected variables
    X = self._check_fit_input_and_variables(X)

    # encoder first, discretiser second: the tree is fed the encoder output
    pipeline_steps = [
        (
            "categorical_encoder",
            OrdinalEncoder(
                encoding_method=self.encoding_method,
                variables=self.variables,
            ),
        ),
        (
            "tree_discretiser",
            DecisionTreeDiscretiser(
                cv=self.cv,
                scoring=self.scoring,
                variables=self.variables,
                param_grid=self.param_grid,
                regression=self.regression,
                random_state=self.random_state,
            ),
        ),
    ]

    self.encoder_ = Pipeline(pipeline_steps)
    self.encoder_.fit(X, y)

    # recorded only after a successful fit
    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learns the numbers that should be used to replace the categories in each
    variable.

    An OrdinalEncoder followed by a DecisionTreeDiscretiser are chained in a
    Pipeline, which is fitted on ``X`` and ``y`` and stored in
    ``self.encoder_``.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        Can be the entire dataframe, not just the categorical variables.

    y : pandas series.
        The target variable. Required to train the decision tree and for
        ordered ordinal encoding.
    """
    # validate the input dataframe and the selected variables
    X = self._check_fit_input_and_variables(X)

    # step 1: turn categories into numbers
    ordinal_step = OrdinalEncoder(
        encoding_method=self.encoding_method,
        variables=self.variables,
    )

    # step 2: decision tree fitted on the encoded variables
    tree_step = DecisionTreeDiscretiser(
        cv=self.cv,
        scoring=self.scoring,
        variables=self.variables,
        param_grid=self.param_grid,
        regression=self.regression,
        random_state=self.random_state,
    )

    self.encoder_ = Pipeline(
        [
            ("categorical_encoder", ordinal_step),
            ("tree_discretiser", tree_step),
        ]
    )
    self.encoder_.fit(X, y)

    # recorded only after a successful fit
    self.input_shape_ = X.shape

    return self
import numpy as np
import pytest
from sklearn.base import clone
from sklearn.utils.estimator_checks import check_estimator

from feature_engine.discretisation import (
    ArbitraryDiscretiser,
    DecisionTreeDiscretiser,
    EqualFrequencyDiscretiser,
    EqualWidthDiscretiser,
)
from tests.estimator_checks.estimator_checks import check_feature_engine_estimator

# Estimator instances shared by both parametrized tests below.
_estimators = [
    DecisionTreeDiscretiser(regression=False),
    EqualFrequencyDiscretiser(),
    EqualWidthDiscretiser(),
    # np.inf is used instead of the np.Inf alias, which NumPy 2.0 removed.
    ArbitraryDiscretiser(binning_dict={"0": [-np.inf, 0, np.inf]}),
]


@pytest.mark.parametrize("estimator", _estimators)
def test_check_estimator_from_sklearn(estimator):
    """Run scikit-learn's estimator checks on each discretiser."""
    # No return value: pytest warns when a test returns something other
    # than None.
    check_estimator(estimator)


@pytest.mark.parametrize("estimator", _estimators)
def test_check_estimator_from_feature_engine(estimator):
    """Run the feature_engine estimator checks on each discretiser."""
    # Work on a copy: set_params below would otherwise mutate the instance
    # shared with test_check_estimator_from_sklearn, making the two tests
    # depend on execution order.
    estimator = clone(estimator)
    if estimator.__class__.__name__ == "ArbitraryDiscretiser":
        # These checks use a different binning key ("var_1") than the "0"
        # configured in _estimators for the scikit-learn checks.
        estimator.set_params(binning_dict={"var_1": [-np.inf, 0, np.inf]})
    check_feature_engine_estimator(estimator)
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Fit a decision tree per variable.

    After validating the inputs and the compatibility between the model type
    and the target, an OrdinalEncoder and a DecisionTreeDiscretiser are
    chained in a Pipeline, which is fitted and stored in ``self.encoder_``.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        Can be the entire dataframe, not just the categorical variables.

    y : pandas series.
        The target variable. Required to train the decision tree and for
        ordered ordinal encoding.

    Raises
    ------
    ValueError
        If ``regression`` is True and the target is binary.

    Returns
    -------
    self
    """
    X, y = check_X_y(X, y)

    # confirm model type and target variables are compatible.
    if self.regression is not True:
        check_classification_targets(y)
    elif type_of_target(y) == "binary":
        raise ValueError(
            "Trying to fit a regression to a binary target is not "
            "allowed by this transformer. Check the target values "
            "or set regression to False."
        )

    self._fit(X)
    self._get_feature_names_in(X)

    # fall back to the default search grid when none (or an empty one) is set
    search_grid = self.param_grid or {"max_depth": [1, 2, 3, 4]}

    # encoder first, discretiser second: the tree is fed the encoder output
    pipeline_steps = [
        (
            "categorical_encoder",
            OrdinalEncoder(
                encoding_method=self.encoding_method,
                variables=self.variables_,
                ignore_format=self.ignore_format,
                errors="raise",
            ),
        ),
        (
            "tree_discretiser",
            DecisionTreeDiscretiser(
                cv=self.cv,
                scoring=self.scoring,
                variables=self.variables_,
                param_grid=search_grid,
                regression=self.regression,
                random_state=self.random_state,
            ),
        ),
    ]

    self.encoder_ = Pipeline(pipeline_steps)
    self.encoder_.fit(X, y)

    return self
def test_error_if_y_not_passed(df_normal_dist):
    """Expect TypeError when fit is called without a target."""
    with pytest.raises(TypeError):
        discretiser = DecisionTreeDiscretiser()
        discretiser.fit(df_normal_dist)
def test_error_when_regression_is_not_bool():
    """Expect ValueError when `regression` is given a non-boolean value."""
    invalid_regression = "other"
    with pytest.raises(ValueError):
        DecisionTreeDiscretiser(regression=invalid_regression)
def test_error_when_cv_is_string():
    """Expect ValueError when `cv` is given a string."""
    invalid_cv = "other"
    with pytest.raises(ValueError):
        DecisionTreeDiscretiser(cv=invalid_cv)
print("Separando em base de treino e teste...") X_train, X_test, y_train, y_test = model_selection.train_test_split( df_full[features], df_full[target], random_state=42, test_size=0.1) print("ok.") # %% print("Ajustando modelo em nosso pipeline...") arbitrary_imputer = ArbitraryNumberImputer(arbitrary_number=-999, variables=features) disc = DecisionTreeDiscretiser(cv=3, scoring='roc_auc', variables=features, regression=False, random_state=42) pca = decomposition.PCA(n_components=120, random_state=42) best_pars = { 'subsample': 0.7, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.2 } clf_xgb = xgb.XGBClassifier(nthread=8, eval_metric='auc', random_state=42,
def test_error_when_regression_is_true_and_target_is_binary(df_discretise):
    """Expect ValueError when a binary target meets regression=True."""
    with pytest.raises(ValueError):
        regression_discretiser = DecisionTreeDiscretiser(regression=True)
        regression_discretiser.fit(
            df_discretise[["var_A", "var_B"]],
            df_discretise["target"],
        )
import numpy as np
import pytest
from sklearn.utils.estimator_checks import check_estimator

from feature_engine.discretisation import (
    ArbitraryDiscretiser,
    DecisionTreeDiscretiser,
    EqualFrequencyDiscretiser,
    EqualWidthDiscretiser,
)


@pytest.mark.parametrize(
    "Estimator",
    [
        DecisionTreeDiscretiser(),
        EqualFrequencyDiscretiser(),
        EqualWidthDiscretiser(),
        # np.inf is used instead of the np.Inf alias, which NumPy 2.0 removed.
        ArbitraryDiscretiser(binning_dict={"0": [-np.inf, 0, np.inf]}),
    ],
)
def test_all_transformers(Estimator):
    """Run scikit-learn's estimator checks on each discretiser instance."""
    # No return value: pytest warns when a test returns something other
    # than None.
    check_estimator(Estimator)