def test_check_X_y_raises_error_when_pandas_index_dont_match():
    df = pd.DataFrame(
        {"0": [1, 2, 3, 4], "1": [5, 6, 7, 8]},
        index=[22, 99, 101, 212],
    )
    s = pd.Series([1, 2, 3, 4], index=[22, 99, 101, 999])

    with pytest.raises(ValueError):
        check_X_y(df, s)

def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Learn the mean value of the target for each category of the variable.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to be encoded.

    y: pandas series
        The target.
    """
    X, y = check_X_y(X, y)

    self._fit(X)
    self._get_feature_names_in(X)

    temp = pd.concat([X, y], axis=1)
    temp.columns = list(X.columns) + ["target"]

    self.encoder_dict_ = {}

    for var in self.variables_:
        self.encoder_dict_[var] = temp.groupby(var)["target"].mean().to_dict()

    self._check_encoding_dictionary()

    return self

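# Quick illustration of the mapping learned above: each category is replaced
# by the mean of the target over the rows where it appears. Plain pandas,
# no transformer class assumed.
import pandas as pd

X = pd.DataFrame({"colour": ["blue", "blue", "red", "red"]})
y = pd.Series([10, 20, 30, 50])

temp = pd.concat([X, y.rename("target")], axis=1)
print(temp.groupby("colour")["target"].mean().to_dict())
# {'blue': 15.0, 'red': 40.0}
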
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Learn the WoE.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        categorical variables.

    y: pandas series.
        Target, must be binary.
    """
    X, y = check_X_y(X, y)

    # check that y is binary
    if y.nunique() != 2:
        raise ValueError(
            "This encoder is designed for binary classification. The target "
            "used does not have 2 unique values."
        )

    self._fit(X)
    self._get_feature_names_in(X)

    temp = pd.concat([X, y], axis=1)
    temp.columns = list(X.columns) + ["target"]

    # if the target does not have values 0 and 1, remap it so that the
    # class proportions can be computed.
    if any(x not in [0, 1] for x in y.unique()):
        temp["target"] = np.where(temp["target"] == y.unique()[0], 0, 1)

    self.encoder_dict_ = {}

    total_pos = temp["target"].sum()
    total_neg = len(temp) - total_pos
    temp["non_target"] = np.where(temp["target"] == 1, 0, 1)

    for var in self.variables_:
        pos = temp.groupby([var])["target"].sum() / total_pos
        neg = temp.groupby([var])["non_target"].sum() / total_neg

        t = pd.concat([pos, neg], axis=1)
        t["woe"] = np.log(t["target"] / t["non_target"])

        if (
            not t.loc[t["target"] == 0, :].empty
            or not t.loc[t["non_target"] == 0, :].empty
        ):
            raise ValueError(
                "The proportion of one of the classes for a category in "
                "variable {} is zero, and log of zero is not defined".format(var)
            )

        self.encoder_dict_[var] = t["woe"].to_dict()

    self._check_encoding_dictionary()

    return self

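# Worked example of the WoE computation above. For each category, WoE is
# log( P(category | target=1) / P(category | target=0) ): the share of all
# positives falling in the category over the share of all negatives. Plain
# pandas/numpy, no transformer class assumed.
import numpy as np
import pandas as pd

temp = pd.DataFrame(
    {
        "colour": ["blue", "blue", "blue", "red", "red", "red"],
        "target": [1, 1, 0, 1, 0, 0],
    }
)
temp["non_target"] = 1 - temp["target"]

total_pos = temp["target"].sum()           # 3
total_neg = len(temp) - total_pos          # 3

pos = temp.groupby("colour")["target"].sum() / total_pos      # blue 2/3, red 1/3
neg = temp.groupby("colour")["non_target"].sum() / total_neg  # blue 1/3, red 2/3

print(np.log(pos / neg).to_dict())
# {'blue': 0.693..., 'red': -0.693...}
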
def test_check_X_y_returns_pandas_from_pandas_with_non_typical_index():
    df = pd.DataFrame(
        {"0": [1, 2, 3, 4], "1": [5, 6, 7, 8]},
        index=[22, 99, 101, 212],
    )
    s = pd.Series([1, 2, 3, 4], index=[22, 99, 101, 212])

    x, y = check_X_y(df, s)
    assert_frame_equal(df, x)
    assert_series_equal(s, y)

def test_check_x_y_converts_numpy_to_pandas():
    a2D = np.array([[1, 2], [3, 4], [3, 4], [3, 4]])
    df_2D = pd.DataFrame(a2D, columns=["0", "1"])

    a1D = np.array([1, 2, 3, 4])
    s = pd.Series(a1D)

    # pass the numpy arrays; expect the equivalent pandas objects back
    x, y = check_X_y(a2D, a1D)
    assert_frame_equal(df_2D, x)
    assert_series_equal(s, y)

def test_check_x_y_reassigns_index_when_only_one_input_is_pandas():
    # case 1: X is a dataframe, y is something else
    df = pd.DataFrame(
        {"0": [1, 2, 3, 4], "1": [5, 6, 7, 8]},
        index=[22, 99, 101, 212],
    )
    s = np.array([1, 2, 3, 4])
    s_exp = pd.Series([1, 2, 3, 4], index=[22, 99, 101, 212])

    x, y = check_X_y(df, s)
    assert_frame_equal(df, x)
    assert_series_equal(s_exp.astype(int), y.astype(int))

    # case 2: X is not a dataframe, y is a series
    df = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]).T
    s = pd.Series([1, 2, 3, 4], index=[22, 99, 101, 212])
    df_exp = pd.DataFrame(df, columns=["0", "1"])
    df_exp.index = s.index

    x, y = check_X_y(df, s)
    assert_frame_equal(df_exp, x)
    assert_series_equal(s, y)

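# The tests above pin down the contract of check_X_y: matching pandas inputs
# pass through untouched, mismatched pandas indexes raise a ValueError, numpy
# inputs are converted to pandas, and when only one input is pandas its index
# is propagated to the other. Below is a minimal sketch satisfying that
# contract; it is an illustration of the tested behaviour, not the library's
# actual implementation.
import numpy as np
import pandas as pd


def check_X_y_sketch(X, y):
    X_is_pd = isinstance(X, pd.DataFrame)
    y_is_pd = isinstance(y, pd.Series)

    if len(X) != len(y):
        raise ValueError("X and y have inconsistent lengths.")

    if X_is_pd and y_is_pd:
        if not X.index.equals(y.index):
            raise ValueError("The indexes of X and y do not match.")
        return X, y

    if X_is_pd:
        # propagate the dataframe's index to y
        return X, pd.Series(np.asarray(y), index=X.index)

    X_df = pd.DataFrame(np.asarray(X))
    X_df.columns = [str(c) for c in X_df.columns]

    if y_is_pd:
        # propagate the series' index to X
        X_df.index = y.index
        return X_df, y

    return X_df, pd.Series(np.asarray(y), index=X_df.index)
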
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the numbers to be used to replace the categories in each variable.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to be encoded.

    y: pandas series, default=None
        The target. Can be None if `encoding_method='arbitrary'`.
        Otherwise, y needs to be passed when fitting the transformer.
    """
    if self.encoding_method == "ordered":
        X, y = check_X_y(X, y)
    else:
        X = check_X(X)

    self._fit(X)
    self._get_feature_names_in(X)

    if self.encoding_method == "ordered":
        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ["target"]

    # find mappings
    self.encoder_dict_ = {}

    for var in self.variables_:
        if self.encoding_method == "ordered":
            t = (
                temp.groupby([var])["target"]
                .mean()
                .sort_values(ascending=True)
                .index
            )
        elif self.encoding_method == "arbitrary":
            t = X[var].unique()

        self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)}

    self._check_encoding_dictionary()

    return self

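# Worked example of the two mappings built above, on a toy column. With
# encoding_method="ordered" the categories are ranked by their mean target;
# with "arbitrary" they are numbered in order of appearance. Plain pandas,
# no transformer class assumed.
import pandas as pd

X = pd.DataFrame({"colour": ["blue", "red", "blue", "grey", "red", "red"]})
y = pd.Series([1, 0, 1, 0, 0, 1])

temp = pd.concat([X, y.rename("target")], axis=1)

# ordered: mean target per category, ascending ->
# grey (0.0), red (0.33), blue (1.0)
ordered = temp.groupby(["colour"])["target"].mean().sort_values().index
print({k: i for i, k in enumerate(ordered)})
# {'grey': 0, 'red': 1, 'blue': 2}

# arbitrary: order of appearance in the column
print({k: i for i, k in enumerate(X["colour"].unique())})
# {'blue': 0, 'red': 1, 'grey': 2}
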
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Fit a decision tree per variable.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        categorical variables.

    y : pandas series.
        The target variable. Required to train the decision tree and for
        ordered ordinal encoding.
    """
    X, y = check_X_y(X, y)

    # confirm that the model type and the target variable are compatible.
    if self.regression is True:
        if type_of_target(y) == "binary":
            raise ValueError(
                "Trying to fit a regression to a binary target is not "
                "allowed by this transformer. Check the target values "
                "or set regression to False."
            )
    else:
        check_classification_targets(y)

    self._fit(X)
    self._get_feature_names_in(X)

    if self.param_grid:
        param_grid = self.param_grid
    else:
        param_grid = {"max_depth": [1, 2, 3, 4]}

    # initialize categorical encoder
    cat_encoder = OrdinalEncoder(
        encoding_method=self.encoding_method,
        variables=self.variables_,
        ignore_format=self.ignore_format,
        errors="raise",
    )

    # initialize decision tree discretiser
    tree_discretiser = DecisionTreeDiscretiser(
        cv=self.cv,
        scoring=self.scoring,
        variables=self.variables_,
        param_grid=param_grid,
        regression=self.regression,
        random_state=self.random_state,
    )

    # pipeline for the encoder
    self.encoder_ = Pipeline(
        [
            ("categorical_encoder", cat_encoder),
            ("tree_discretiser", tree_discretiser),
        ]
    )

    self.encoder_.fit(X, y)

    return self

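# The pipeline above first encodes each category as an integer and then
# replaces that integer with the output of a single-feature decision tree.
# A minimal sketch of the same idea with plain scikit-learn, using the
# predicted probability of class 1 as the final encoding; this illustrates
# the concept, not the library's exact transform (the cross-validated depth
# search is omitted).
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

X = pd.DataFrame({"colour": ["blue", "red", "blue", "grey", "red", "red"]})
y = pd.Series([1, 0, 1, 0, 0, 1])

# step 1: arbitrary ordinal encoding
mapping = {k: i for i, k in enumerate(X["colour"].unique())}
encoded = X["colour"].map(mapping).to_frame()

# step 2: fit a shallow tree on the single encoded column and use its
# predicted probability of class 1 as the encoding
tree = DecisionTreeClassifier(max_depth=2).fit(encoded, y)
print(tree.predict_proba(encoded)[:, 1])
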
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Learn the numbers that should be used to replace the categories in each
    variable. That is, the ratio of probabilities.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        categorical variables.

    y: pandas series.
        Target, must be binary.
    """
    X, y = check_X_y(X, y)

    # check that y is binary
    if y.nunique() != 2:
        raise ValueError(
            "This encoder is designed for binary classification. The target "
            "used does not have 2 unique values."
        )

    self._fit(X)
    self._get_feature_names_in(X)

    temp = pd.concat([X, y], axis=1)
    temp.columns = list(X.columns) + ["target"]

    # if the target does not have values 0 and 1, remap it so that the
    # probabilities can be computed.
    if any(x not in [0, 1] for x in y.unique()):
        temp["target"] = np.where(temp["target"] == y.unique()[0], 0, 1)

    self.encoder_dict_ = {}

    for var in self.variables_:
        t = temp.groupby(var)["target"].mean()
        t = pd.concat([t, 1 - t], axis=1)
        t.columns = ["p1", "p0"]

        if self.encoding_method == "log_ratio":
            if not t.loc[t["p0"] == 0, :].empty or not t.loc[t["p1"] == 0, :].empty:
                raise ValueError(
                    "p(0) or p(1) for a category in variable {} is zero, log of "
                    "zero is not defined".format(var)
                )
            else:
                self.encoder_dict_[var] = (np.log(t.p1 / t.p0)).to_dict()

        elif self.encoding_method == "ratio":
            if not t.loc[t["p0"] == 0, :].empty:
                raise ValueError(
                    "p(0) for a category in variable {} is zero, division by 0 is "
                    "not defined".format(var)
                )
            else:
                self.encoder_dict_[var] = (t.p1 / t.p0).to_dict()

    self._check_encoding_dictionary()

    return self

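# Worked example of the two encodings above for one toy column. With "ratio"
# the encoding is p1/p0 within the category; with "log_ratio" it is
# log(p1/p0). Note this differs from WoE, which compares the category's share
# of all positives to its share of all negatives.
import numpy as np
import pandas as pd

temp = pd.DataFrame(
    {
        "colour": ["blue", "blue", "blue", "blue", "red", "red"],
        "target": [1, 1, 1, 0, 1, 0],
    }
)

p1 = temp.groupby("colour")["target"].mean()  # blue 0.75, red 0.50
p0 = 1 - p1                                   # blue 0.25, red 0.50

print((p1 / p0).to_dict())          # {'blue': 3.0, 'red': 1.0}
print(np.log(p1 / p0).to_dict())    # {'blue': 1.0986..., 'red': 0.0}
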
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y: array-like of shape (n_samples)
        Target variable. Required to train the estimator.
    """
    # check input dataframe
    X, y = check_X_y(X, y)

    # if required, exclude variables that are not in the input dataframe
    self._confirm_variables(X)

    # find all variables or check that those entered are in the dataframe
    self.variables_ = _find_all_variables(X, self.variables_, exclude_datetime=True)

    if len(self.variables_) == 1 and self.threshold is None:
        raise ValueError(
            "When evaluating a single feature you need to manually set a value "
            "for the threshold. "
            f"The transformer is evaluating the performance of {self.variables_} "
            f"and the threshold was left to {self.threshold} when initializing "
            f"the transformer."
        )

    # save input features
    self._get_feature_names_in(X)

    # set up the correct estimator
    if self.regression is True:
        est = TargetMeanRegressor(
            bins=self.bins,
            strategy=self.strategy,
        )
    else:
        est = TargetMeanClassifier(
            bins=self.bins,
            strategy=self.strategy,
        )

    self.feature_performance_ = {}

    for variable in self.variables_:
        # clone the estimator
        estimator = clone(est)

        # set the estimator to evaluate the required variable
        estimator.set_params(variables=variable)

        model = cross_validate(
            estimator,
            X,
            y,
            cv=self.cv,
            scoring=self.scoring,
        )

        self.feature_performance_[variable] = model["test_score"].mean()

    # select features
    if not self.threshold:
        threshold = pd.Series(self.feature_performance_).mean()
    else:
        threshold = self.threshold

    self.features_to_drop_ = [
        f for f in self.variables_ if self.feature_performance_[f] < threshold
    ]

    return self

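# The TargetMeanRegressor/TargetMeanClassifier used above predict, for each
# row, the mean target of the category (or bin) the row falls in. A toy
# regression version of that idea, to make the scoring concrete; the real
# estimators also handle binning of numerical variables, unseen categories,
# and so on.
import pandas as pd

train = pd.DataFrame(
    {"city": ["A", "A", "B", "B"], "price": [10.0, 20.0, 30.0, 50.0]}
)
means = train.groupby("city")["price"].mean()  # A -> 15.0, B -> 40.0

test = pd.DataFrame({"city": ["A", "B", "A"]})
print(test["city"].map(means).tolist())  # [15.0, 40.0, 15.0]
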
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Learn the mean target value per category or bin.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y : pandas series of shape = [n_samples,]
        The target variable.
    """
    # check if 'X' is a dataframe
    X, y = check_X_y(X, y)

    # find categorical and numerical variables
    (
        self.variables_categorical_,
        self.variables_numerical_,
    ) = _find_categorical_and_numerical_variables(X, self.variables)

    # check for missing values
    _check_contains_na(X, self.variables_numerical_)
    _check_contains_na(X, self.variables_categorical_)

    # check for inf
    _check_contains_inf(X, self.variables_numerical_)

    # create the pipelines
    if self.variables_categorical_ and self.variables_numerical_:
        self._pipeline = self._make_combined_pipeline()
    elif self.variables_categorical_:
        self._pipeline = self._make_categorical_pipeline()
    else:
        self._pipeline = self._make_numerical_pipeline()

    # train the pipeline
    self._pipeline.fit(X, y)

    # Assign attributes (useful to interpret features).
    # Use dict() to make a copy of the dictionary. Otherwise, like in pandas,
    # it is just another view of the same data, mind-blowing.
    if self.variables_categorical_ and self.variables_numerical_:
        self.binner_dict_ = dict(
            self._pipeline.named_steps["discretiser"].binner_dict_
        )
        self.encoder_dict_ = dict(
            self._pipeline.named_steps["encoder_num"].encoder_dict_
        )
        tmp_dict = dict(self._pipeline.named_steps["encoder_cat"].encoder_dict_)
        self.encoder_dict_.update(tmp_dict)
    elif self.variables_categorical_:
        self.binner_dict_ = {}
        self.encoder_dict_ = dict(self._pipeline.encoder_dict_)
    else:
        self.binner_dict_ = dict(
            self._pipeline.named_steps["discretiser"].binner_dict_
        )
        self.encoder_dict_ = dict(
            self._pipeline.named_steps["encoder"].encoder_dict_
        )

    # store input features
    self.n_features_in_ = X.shape[1]
    self.feature_names_in_ = list(X.columns)

    return self

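# Why the dict() copies above matter: assigning the step's dictionary
# directly would alias the same object, so a later mutation in one place
# would show up in the other. Minimal illustration:
step_dict = {"var": [1, 2, 3]}

alias = step_dict          # same object
copy_ = dict(step_dict)    # shallow copy

step_dict["new"] = [4]
print("new" in alias)   # True: the alias sees the mutation
print("new" in copy_)   # False: the copy is independent
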
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y: array-like of shape (n_samples)
        Target variable. Required to train the estimator.
    """
    X, y = check_X_y(X, y)

    # reset the index
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    # if required, exclude variables that are not in the input dataframe
    self._confirm_variables(X)

    # find numerical variables or check variables entered by user
    self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

    # check that there is more than 1 variable to select from
    self._check_variable_number()

    # train model with all features and cross-validation
    model = cross_validate(
        self.estimator,
        X[self.variables_],
        y,
        cv=self.cv,
        return_estimator=True,
        scoring=self.scoring,
    )

    # store initial model performance
    self.initial_model_performance_ = model["test_score"].mean()

    # extract the validation folds
    cv_ = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))
    validation_indices = [val_index for _, val_index in cv_.split(X, y)]

    # get performance metric
    scorer = get_scorer(self.scoring)

    # seed
    random_state = check_random_state(self.random_state)

    # dict to collect features and their performance drift after shuffling
    self.performance_drifts_ = {}

    # shuffle features and save the performance drift into the dict
    for feature in self.variables_:

        X_shuffled = X[self.variables_].copy()

        # shuffle the individual feature
        X_shuffled[feature] = (
            X_shuffled[feature]
            .sample(frac=1, random_state=random_state)
            .reset_index(drop=True)
        )

        # determine the performance with the shuffled feature
        performance = np.mean(
            [
                scorer(m, X_shuffled.iloc[idx], y.iloc[idx])
                for m, idx in zip(model["estimator"], validation_indices)
            ]
        )

        # determine the drift in performance. Note that sklearn negates log
        # and error scores, so there is no need to invert them manually:
        # https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules
        performance_drift = self.initial_model_performance_ - performance

        # save feature and performance drift
        self.performance_drifts_[feature] = performance_drift

    # select features
    if not self.threshold:
        threshold = pd.Series(self.performance_drifts_).mean()
    else:
        threshold = self.threshold

    self.features_to_drop_ = [
        f
        for f in self.performance_drifts_.keys()
        if self.performance_drifts_[f] < threshold
    ]

    # save input features
    self._get_feature_names_in(X)

    return self

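# Self-contained sketch of the shuffling idea above with plain scikit-learn:
# fit once, then score again with one column permuted; the drop in score is
# that feature's importance. (sklearn also ships this idea as
# sklearn.inspection.permutation_importance.)
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
X = pd.DataFrame(X, columns=["f0", "f1", "f2", "f3"])
y = pd.Series(y)

model = LogisticRegression().fit(X, y)
baseline = model.score(X, y)

rng = np.random.RandomState(0)
for feature in X.columns:
    X_shuffled = X.copy()
    X_shuffled[feature] = rng.permutation(X_shuffled[feature].values)
    drift = baseline - model.score(X_shuffled, y)
    print(feature, round(drift, 4))
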
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Select features.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y: array-like of shape (n_samples)
        Target variable. Required to train the estimator.
    """
    # check input dataframe
    X, y = check_X_y(X, y)

    # if required, exclude variables that are not in the input dataframe
    self._confirm_variables(X)

    # find numerical variables or check variables entered by user
    self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

    if len(self.variables_) == 1 and self.threshold is None:
        raise ValueError(
            "When evaluating a single feature you need to manually set a value "
            "for the threshold. "
            f"The transformer is evaluating the performance of {self.variables_} "
            f"and the threshold was left to {self.threshold} when initializing "
            f"the transformer."
        )

    self.feature_performance_ = {}

    # train a model for every feature and store the performance
    for feature in self.variables_:
        model = cross_validate(
            self.estimator,
            X[feature].to_frame(),
            y,
            cv=self.cv,
            return_estimator=False,
            scoring=self.scoring,
        )

        self.feature_performance_[feature] = model["test_score"].mean()

    # select features
    if not self.threshold:
        threshold = pd.Series(self.feature_performance_).mean()
    else:
        threshold = self.threshold

    self.features_to_drop_ = [
        f
        for f in self.feature_performance_.keys()
        if self.feature_performance_[f] < threshold
    ]

    # check that we are not dropping all the columns in the dataframe
    if len(self.features_to_drop_) == len(X.columns):
        warnings.warn("All features will be dropped, try changing the threshold.")

    # save input features
    self._get_feature_names_in(X)

    return self

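# Sketch of the per-feature evaluation above with plain scikit-learn: one
# cross-validated model per single-column frame, keeping the mean test score
# and dropping features that score below the mean across features.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X, y = make_classification(n_samples=200, n_features=3, random_state=0)
X = pd.DataFrame(X, columns=["f0", "f1", "f2"])

performance = {}
for feature in X.columns:
    cv_results = cross_validate(
        LogisticRegression(), X[feature].to_frame(), y, cv=3, scoring="roc_auc"
    )
    performance[feature] = cv_results["test_score"].mean()

threshold = pd.Series(performance).mean()
print([f for f, p in performance.items() if p < threshold])
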
def test_check_x_y_returns_pandas_from_pandas(df_vartypes):
    s = pd.Series([0, 1, 2, 3])
    x, y = check_X_y(df_vartypes, s)
    assert_frame_equal(df_vartypes, x)
    assert_series_equal(s, y)

def test_check_x_y_raises_error_when_inconsistent_length(df_vartypes):
    s = pd.Series([0, 1, 2, 3, 5])
    with pytest.raises(ValueError):
        check_X_y(df_vartypes, s)

def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the initial model performance. Sort features by importance.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y: array-like of shape (n_samples)
        Target variable. Required to train the estimator.
    """
    # check input dataframe
    X, y = check_X_y(X, y)

    # if required, exclude variables that are not in the input dataframe
    self._confirm_variables(X)

    # find numerical variables or check variables entered by user
    self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

    # check that there is more than 1 variable to select from
    self._check_variable_number()

    # save input features
    self._get_feature_names_in(X)

    # train model with all features and cross-validation
    model = cross_validate(
        self.estimator,
        X[self.variables_],
        y,
        cv=self.cv,
        scoring=self.scoring,
        return_estimator=True,
    )

    # store initial model performance
    self.initial_model_performance_ = model["test_score"].mean()

    # Initialize a dataframe that will contain the feature importance
    # (or coefficients) obtained in each cross-validation fold.
    feature_importances_cv = pd.DataFrame()

    # Populate feature_importances_cv with one column per model returned by
    # the cross-validation, i.e., one column per fold.
    for i in range(len(model["estimator"])):
        m = model["estimator"][i]
        feature_importances_cv[i] = get_feature_importances(m)

    # add the variables as the index of feature_importances_cv
    feature_importances_cv.index = self.variables_

    # aggregate the feature importance returned in each fold
    self.feature_importances_ = feature_importances_cv.mean(axis=1)

    return X, y

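# Sketch of the per-fold importance aggregation above: cross_validate with
# return_estimator=True yields one fitted model per fold; collecting each
# model's coefficients (or feature_importances_) as a column and averaging
# across columns gives one importance value per feature. Plain scikit-learn;
# the get_feature_importances helper is not assumed here.
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_validate

X, y = make_regression(n_samples=100, n_features=3, random_state=0)
X = pd.DataFrame(X, columns=["f0", "f1", "f2"])

model = cross_validate(Lasso(), X, y, cv=3, return_estimator=True)

importances_cv = pd.DataFrame()
for i, m in enumerate(model["estimator"]):
    importances_cv[i] = abs(m.coef_)   # one column per fold

importances_cv.index = X.columns
print(importances_cv.mean(axis=1))     # averaged importance per feature
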