def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Validate the input dataframe and return it.

    This method does not modify any value itself; it only runs the common
    input checks and hands back the object returned by ``_is_dataframe``.

    Parameters
    ----------
    X : pandas dataframe
        The data to validate.

    Returns
    -------
    X : pandas dataframe
        The validated dataframe.
    """
    # the transformer must have been fitted before transform is called
    check_is_fitted(self)

    # the input has to be a dataframe
    X = _is_dataframe(X)

    # look for missing data in the variables handled by this transformer
    _check_contains_na(X, self.variables)

    # the number of columns must match that of the dataframe used in fit
    n_training_columns = self.input_shape_[1]
    _check_input_matches_training_df(X, n_training_columns)

    return X
def inverse_transform(self, X):
    """
    Map the encoded values back to the original categories.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features].
        The transformed dataframe.

    Returns
    -------
    X : pandas dataframe of shape = [n_samples, n_features].
        The dataframe in which every variable listed in ``encoder_dict_``
        holds the original category values again.
    """
    # run the usual checks: fitted state, input type, missing data and
    # number of columns
    check_is_fitted(self)
    X = _is_dataframe(X)
    _check_contains_na(X, self.variables)
    _check_input_matches_training_df(X, self.input_shape_[1])

    # invert each {category: code} mapping and apply it to its column
    for feature, mapping in self.encoder_dict_.items():
        inverse_mapping = {code: category for category, code in mapping.items()}
        X[feature] = X[feature].map(inverse_mapping)

    return X
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find groups of correlated features.

    Features are visited in the column order of the correlation matrix.
    Every feature that does not belong to a group yet opens a new group and
    is compared with the features not examined so far; those whose absolute
    correlation coefficient exceeds ``self.threshold`` join the group.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y : pandas series. Default = None
        Not used by this method. You can pass y or None.

    Returns
    -------
    self
    """
    X = _is_dataframe(X)

    # numerical variables: found automatically or validated if user-supplied
    self.variables = _find_or_check_numerical_variables(X, self.variables)

    if self.missing_values == "raise":
        _check_contains_na(X, self.variables)

    # every group member except the feature that opened the group
    self.correlated_features_ = set()

    # one set per group of mutually grouped features
    self.correlated_feature_sets_ = []

    self.correlated_matrix_ = X[self.variables].corr(method=self.method)

    # features already assigned to a group (or that opened one)
    seen = set()

    for feature in self.correlated_matrix_.columns:
        if feature in seen:
            continue
        seen.add(feature)

        group = {feature}

        # snapshot of the features still available for comparison; it is
        # taken before the inner loop starts updating ``seen``
        candidates = [
            name for name in self.correlated_matrix_.columns if name not in seen
        ]

        for candidate in candidates:
            coefficient = self.correlated_matrix_.loc[candidate, feature]
            # only the magnitude of the correlation matters
            if abs(coefficient) > self.threshold:
                self.correlated_features_.add(candidate)
                group.add(candidate)
                seen.add(candidate)

        # a group of one means nothing was correlated with ``feature``
        if len(group) > 1:
            self.correlated_feature_sets_.append(group)

    self.input_shape_ = X.shape

    return self
def fit(self, X, y=None):
    """
    Find the correlated features.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just
        the variables to examine.

    y : pandas series. Default = None
        Not used by this method. You can pass y or None.

    Returns
    -------
    self
    """
    X = _is_dataframe(X)

    # numerical variables: found automatically or validated if user-supplied
    self.variables = _find_numerical_variables(X, self.variables)

    # members of a correlated group, excluding the feature that opened it
    self.correlated_features_ = set()

    # groups of correlated features, one set per group
    self.correlated_feature_sets_ = []

    self.correlated_matrix_ = X[self.variables].corr(method=self.method)
    matrix_columns = self.correlated_matrix_.columns

    # features that should no longer open or join a group
    already_examined = set()

    for anchor in matrix_columns:
        if anchor in already_examined:
            continue
        already_examined.add(anchor)

        correlated_group = {anchor}

        # list built up-front, so additions to ``already_examined`` made in
        # the loop below do not shorten it
        remaining = [col for col in matrix_columns if col not in already_examined]

        for other in remaining:
            # absolute value: negative correlations count as well
            if abs(self.correlated_matrix_.loc[other, anchor]) > self.threshold:
                self.correlated_features_.add(other)
                correlated_group.add(other)
                already_examined.add(other)

        # keep the group only when something joined the anchor
        if len(correlated_group) > 1:
            self.correlated_feature_sets_.append(correlated_group)

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features by recursive feature addition.

    A model is first trained on all variables to rank them by importance.
    Then, starting from the most important one, features are added one at a
    time and kept only when the cross-validated score improves by more than
    ``self.threshold``. A model is trained with cross-validation at every
    step, so this can take a while.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y : array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    X = _is_dataframe(X)

    # numerical variables: found automatically or validated if user-supplied
    self.variables = _find_or_check_numerical_variables(X, self.variables)

    # model trained on all the variables, used to rank them
    full_model = cross_validate(
        self.estimator,
        X[self.variables],
        y,
        cv=self.cv,
        scoring=self.scoring,
        return_estimator=True,
    )
    self.initial_model_performance_ = full_model["test_score"].mean()

    # one column of importances per estimator returned by cross-validation
    feature_importances_cv = pd.DataFrame()
    for fold_estimator in full_model["estimator"]:
        feature_importances_cv[fold_estimator] = get_feature_importances(
            fold_estimator
        )
    feature_importances_cv.index = self.variables

    # average over the folds, most important feature first
    self.feature_importances_ = feature_importances_cv.mean(axis=1).sort_values(
        ascending=False
    )
    ranked_features = list(self.feature_importances_.index)
    top_feature = ranked_features[0]

    # reference score: model trained on the most important feature alone
    single_feature_model = cross_validate(
        self.estimator,
        X[top_feature].to_frame(),
        y,
        cv=self.cv,
        scoring=self.scoring,
        return_estimator=True,
    )
    reference_score = single_feature_model["test_score"].mean()

    kept_features = [top_feature]

    # the most important feature has, by construction, no drift
    self.performance_drifts_ = {top_feature: 0}

    for candidate in ranked_features[1:]:
        candidate_model = cross_validate(
            self.estimator,
            X[kept_features + [candidate]],
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=True,
        )
        candidate_score = candidate_model["test_score"].mean()

        drift = candidate_score - reference_score
        self.performance_drifts_[candidate] = drift

        if drift > self.threshold:
            # the feature helps: keep it and raise the bar for the next one
            kept_features.append(candidate)
            reference_score = candidate_score

    self.features_to_drop_ = [
        name for name in self.variables if name not in kept_features
    ]

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find the correlated feature groups. Determine which feature should be
    selected from each group.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas series. Default = None
        y is needed if selection_method == 'model_performance'.

    Raises
    ------
    ValueError
        If selection_method is 'model_performance' and y is None.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find all numerical variables or check those entered are in the dataframe
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    if self.missing_values == "raise":
        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

    if self.selection_method == "model_performance" and y is None:
        raise ValueError("y is needed to fit the transformer")

    # FIND CORRELATED FEATURES
    # ========================

    # groups of correlated features, one set per group
    self.correlated_feature_sets_ = []

    _correlated_matrix = X[self.variables_].corr(method=self.method)

    # features that already opened or joined a group
    _examined_features = set()

    for feature in _correlated_matrix.columns:
        if feature in _examined_features:
            continue
        _examined_features.add(feature)

        _temp_set = {feature}

        # snapshot taken before the inner loop updates _examined_features
        _features_to_compare = [
            f for f in _correlated_matrix.columns if f not in _examined_features
        ]

        for f2 in _features_to_compare:
            # we are interested in the absolute correlation coefficient
            if abs(_correlated_matrix.loc[f2, feature]) > self.threshold:
                _temp_set.add(f2)
                _examined_features.add(f2)

        # keep the group only if something was correlated with feature
        if len(_temp_set) > 1:
            self.correlated_feature_sets_.append(_temp_set)

    # SELECT 1 FEATURE FROM EACH GROUP
    # ================================

    # start with all the features that belong to no group, i.e., features
    # that were not examined or were not found correlated. The union is
    # computed once instead of once per column.
    _grouped_features = set().union(*self.correlated_feature_sets_)
    _selected_features = [f for f in X.columns if f not in _grouped_features]

    for feature_group in self.correlated_feature_sets_:
        # pandas does not accept a set as a column indexer (TypeError from
        # pandas 2.0), so each group is converted to a list once. The list
        # keeps the iteration order of the set.
        group = list(feature_group)

        if self.selection_method == "missing_values":
            # feature with least missing values
            f = X[group].isnull().sum().sort_values(ascending=True).index[0]

        elif self.selection_method == "cardinality":
            # feature with most unique values
            f = X[group].nunique().sort_values(ascending=False).index[0]

        elif self.selection_method == "variance":
            # feature with the biggest standard deviation
            f = X[group].std().sort_values(ascending=False).index[0]

        else:
            # best performing feature according to the estimator: train a
            # single-feature model for every member of the group
            temp_perf = []
            for feature in group:
                model = cross_validate(
                    self.estimator,
                    X[feature].to_frame(),
                    y,
                    cv=self.cv,
                    return_estimator=False,
                    scoring=self.scoring,
                )
                temp_perf.append(model["test_score"].mean())

            # first feature that reaches the best score
            f = group[temp_perf.index(max(temp_perf))]

        _selected_features.append(f)

    self.features_to_drop_ = [
        f for f in self.variables_ if f not in _selected_features
    ]

    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X, y):
    """
    Learn the numbers that should be used to replace the categories in each
    variable, that is, the WoE or the ratio of probability.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just
        the categorical variables.

    y : pandas series.
        Target, must be binary [0,1].

    Raises
    ------
    ValueError
        If y is None, if y contains values other than 0 and 1, or if a
        category makes the requested encoding undefined (log of zero or
        division by zero).

    Attributes
    ----------
    encoder_dict_: dictionary
        The dictionary containing the {category: WoE / ratio} pairs per
        variable.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find categorical variables or check those entered by the user
    self.variables = _find_categorical_variables(X, self.variables)

    # check if dataset contains na
    _check_contains_na(X, self.variables)

    if y is None:
        raise ValueError('Please provide a target y for this encoding method')

    # check that y is binary
    if any(value not in (0, 1) for value in y.unique()):
        raise ValueError(
            "This encoder is only designed for binary classification, values of y can be only 0 or 1"
        )

    temp = pd.concat([X, y], axis=1)
    temp.columns = list(X.columns) + ['target']

    self.encoder_dict_ = {}

    if self.encoding_method == 'woe':
        total_pos = temp['target'].sum()
        total_neg = len(temp) - total_pos
        temp['non_target'] = np.where(temp['target'] == 1, 0, 1)

        for var in self.variables:
            # share of all positives / negatives that falls in each category
            pos = temp.groupby([var])['target'].sum() / total_pos
            neg = temp.groupby([var])['non_target'].sum() / total_neg

            t = pd.concat([pos, neg], axis=1)

            # validate before taking the log, so that the undefined case
            # raises the ValueError below without first triggering numpy's
            # divide-by-zero warning
            if (t['target'] == 0).any() or (t['non_target'] == 0).any():
                raise ValueError(
                    "The proportion of 1 of the classes for a category in variable {} is zero, and log of zero is "
                    "not defined".format(var))

            t['woe'] = np.log(t['target'] / t['non_target'])

            self.encoder_dict_[var] = t['woe'].to_dict()

    else:
        for var in self.variables:
            # p1: mean of the target per category; p0: its complement
            t = temp.groupby(var)['target'].mean()
            t = pd.concat([t, 1 - t], axis=1)
            t.columns = ['p1', 'p0']

            if self.encoding_method == 'log_ratio':
                if (t['p0'] == 0).any() or (t['p1'] == 0).any():
                    raise ValueError(
                        "p(0) or p(1) for a category in variable {} is zero, log of zero is not defined"
                        .format(var))

                self.encoder_dict_[var] = (np.log(t.p1 / t.p0)).to_dict()

            elif self.encoding_method == 'ratio':
                if (t['p0'] == 0).any():
                    raise ValueError(
                        "p(0) for a category in variable {} is zero, division by 0 is not defined"
                        .format(var))

                self.encoder_dict_[var] = (t.p1 / t.p0).to_dict()

    self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

    self.input_shape_ = X.shape

    return self
def fit(self, X, y=None):
    """
    Learn the frequent categories for each variable.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just
        selected variables.

    y : None
        y is not required. You can pass y or None.

    Attributes
    ----------
    encoder_dict_: dictionary
        The dictionary containing the frequent categories (that will be
        kept) for each variable. Categories not present in this list will
        be replaced by 'Rare' or by the user defined value.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find categorical variables or check those entered by the user
    self.variables = _find_categorical_variables(X, self.variables)

    # check if dataset contains na
    _check_contains_na(X, self.variables)

    self.encoder_dict_ = {}

    for var in self.variables:
        if len(X[var].unique()) > self.n_categories:
            # the variable has more than the indicated number of categories:
            # learn the most frequent ones. Plain true division replaces the
            # former np.float() cast, an alias removed in NumPy 1.24.
            t = X[var].value_counts() / len(X)

            # non-rare labels; value_counts sorts by decreasing frequency,
            # so the slice below keeps the most frequent ones
            freq_idx = t[t >= self.tol].index

            if self.max_n_categories:
                self.encoder_dict_[var] = freq_idx[:self.max_n_categories]
            else:
                self.encoder_dict_[var] = freq_idx

        else:
            # the variable does not have more categories than n_categories:
            # all of them are considered frequent
            warnings.warn("The number of unique categories for variable {} is less than that indicated in "
                          "n_categories. Thus, all categories will be considered frequent".format(var))
            self.encoder_dict_[var] = X[var].unique()

    self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

    self.input_shape_ = X.shape

    return self
def fit(self, X, y):
    """
    Find the important features by recursive feature elimination.

    A model trained on all variables ranks them by importance. Features are
    then removed one at a time, least important first. A feature is kept
    when removing it lowers the cross-validated score by more than
    ``self.threshold``; otherwise it is discarded and the reference score
    is recomputed without it.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y : array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    X = _is_dataframe(X)

    # numerical variables: found automatically or validated if user-supplied
    self.variables = _find_numerical_variables(X, self.variables)

    # model trained on all the variables, used to rank them
    full_model = cross_validate(
        self.estimator,
        X[self.variables],
        y,
        cv=self.cv,
        scoring=self.scoring,
        return_estimator=True,
    )
    self.initial_model_performance_ = full_model["test_score"].mean()

    # one column of importances per estimator returned by cross-validation
    feature_importances_cv = pd.DataFrame()
    for fold_estimator in full_model["estimator"]:
        feature_importances_cv[fold_estimator] = get_feature_importances(
            fold_estimator
        )
    feature_importances_cv.index = self.variables

    # average over the folds, least important feature first
    self.feature_importances_ = feature_importances_cv.mean(axis=1).sort_values(
        ascending=True
    )

    self.selected_features_ = []

    # working copy from which the discarded features are removed
    X_tmp = X[self.variables].copy()

    # reference score, updated every time a feature is discarded
    reference_score = self.initial_model_performance_

    self.performance_drifts_ = {}

    for feature in self.feature_importances_.index:
        without_feature = X_tmp.drop(columns=feature)

        trial_model = cross_validate(
            self.estimator,
            without_feature,
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=False,
        )
        trial_score = trial_model["test_score"].mean()

        drift = reference_score - trial_score
        self.performance_drifts_[feature] = drift

        if drift > self.threshold:
            # removing the feature hurts: keep it
            self.selected_features_.append(feature)
            continue

        # discard the feature and re-run cross-validation on the reduced
        # data to obtain the new reference score
        X_tmp = without_feature
        reference_model = cross_validate(
            self.estimator,
            X_tmp,
            y,
            cv=self.cv,
            return_estimator=False,
            scoring=self.scoring,
        )
        reference_score = reference_model["test_score"].mean()

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features by shuffling.

    A model is trained with cross-validation on the whole dataframe. Each
    variable is then shuffled in turn, and the fitted estimators are scored
    again on the shuffled data. Variables whose shuffling lowers the score
    by more than ``self.threshold`` are selected.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y : array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    X = _is_dataframe(X)

    # both indexes are reset so that the shuffled column, whose index is
    # also reset below, lines up with the remaining data
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    # numerical variables: found automatically or validated if user-supplied
    self.variables = _find_or_check_numerical_variables(X, self.variables)

    # the model is trained on all the columns of X
    model = cross_validate(
        self.estimator,
        X,
        y,
        cv=self.cv,
        return_estimator=True,
        scoring=self.scoring,
    )
    self.initial_model_performance_ = model["test_score"].mean()

    scorer = get_scorer(self.scoring)

    self.performance_drifts_ = {}
    self.selected_features_ = []

    for feature in self.variables:
        X_shuffled = X.copy()

        # shuffle the values of this feature only
        shuffled_values = X_shuffled[feature].sample(frac=1)
        X_shuffled[feature] = shuffled_values.reset_index(drop=True)

        # average score of the fitted estimators on the shuffled data
        fold_scores = [
            scorer(estimator, X_shuffled, y) for estimator in model["estimator"]
        ]
        shuffled_performance = np.mean(fold_scores)

        # sklearn negates the log and error scores, so the drift does not
        # need to be inverted manually:
        # https://scikit-learn.org/stable/modules/model_evaluation.html
        self.performance_drifts_[feature] = (
            self.initial_model_performance_ - shuffled_performance
        )

    # keep the features whose shuffling degraded the performance enough
    for feature, drift in self.performance_drifts_.items():
        if drift > self.threshold:
            self.selected_features_.append(feature)

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the values that should be used to replace outliers.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y : pandas Series, default=None
        y is not needed in this transformer. You can pass y or None.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame

    Returns
    -------
    self
    """
    X = _is_dataframe(X)

    # numerical variables: found automatically or validated if user-supplied
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    if self.missing_values == "raise":
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

    # both dictionaries stay empty unless the matching tail is requested
    self.right_tail_caps_ = {}
    self.left_tail_caps_ = {}

    data = X[self.variables_]
    cap_right = self.tail in ["right", "both"]
    cap_left = self.tail in ["left", "both"]

    if self.capping_method == "gaussian":
        # mean plus / minus fold times the standard deviation
        if cap_right:
            self.right_tail_caps_ = (
                data.mean() + self.fold * data.std()
            ).to_dict()
        if cap_left:
            self.left_tail_caps_ = (
                data.mean() - self.fold * data.std()
            ).to_dict()

    elif self.capping_method == "iqr":
        # 75th / 25th percentile plus / minus fold times the IQR
        if cap_right:
            upper_quartile = data.quantile(0.75)
            IQR = upper_quartile - data.quantile(0.25)
            self.right_tail_caps_ = (upper_quartile + (IQR * self.fold)).to_dict()
        if cap_left:
            lower_quartile = data.quantile(0.25)
            IQR = data.quantile(0.75) - lower_quartile
            self.left_tail_caps_ = (lower_quartile - (IQR * self.fold)).to_dict()

    elif self.capping_method == "quantiles":
        # here fold is used directly as the quantile
        if cap_right:
            self.right_tail_caps_ = data.quantile(1 - self.fold).to_dict()
        if cap_left:
            self.left_tail_caps_ = data.quantile(self.fold).to_dict()

    self.n_features_in_ = X.shape[1]

    return self
def test_is_dataframe(df_vartypes):
    """Check that _is_dataframe returns dataframes and rejects a list."""
    # a dataframe comes back equal to the one passed in
    returned = _is_dataframe(df_vartypes)
    assert_frame_equal(returned, df_vartypes)

    # anything that is not a dataframe raises a TypeError
    not_a_dataframe = [1, 2, 4]
    with pytest.raises(TypeError):
        assert _is_dataframe(not_a_dataframe)
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the values that should be used to replace outliers.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y : None
        y is not needed in this transformer. You can pass y or None.

    Attributes
    ----------
    right_tail_caps_: dictionary
        The dictionary containing the maximum values at which variables
        will be capped.

    left_tail_caps_ : dictionary
        The dictionary containing the minimum values at which variables
        will be capped.

    Returns
    -------
    self
    """
    X = _is_dataframe(X)

    # numerical variables: found automatically or validated if user-supplied
    self.variables = _find_numerical_variables(X, self.variables)

    if self.missing_values == "raise":
        _check_contains_na(X, self.variables)

    # both dictionaries stay empty unless the matching tail is requested
    self.right_tail_caps_ = {}
    self.left_tail_caps_ = {}

    method = self.capping_method
    numeric = X[self.variables]

    # upper limits
    if self.tail in ["right", "both"]:
        if method == "gaussian":
            upper = numeric.mean() + self.fold * numeric.std()
            self.right_tail_caps_ = upper.to_dict()

        elif method == "iqr":
            IQR = numeric.quantile(0.75) - numeric.quantile(0.25)
            upper = numeric.quantile(0.75) + (IQR * self.fold)
            self.right_tail_caps_ = upper.to_dict()

        elif method == "quantiles":
            # here fold is used directly as the quantile
            self.right_tail_caps_ = numeric.quantile(1 - self.fold).to_dict()

    # lower limits
    if self.tail in ["left", "both"]:
        if method == "gaussian":
            lower = numeric.mean() - self.fold * numeric.std()
            self.left_tail_caps_ = lower.to_dict()

        elif method == "iqr":
            IQR = numeric.quantile(0.75) - numeric.quantile(0.25)
            lower = numeric.quantile(0.25) - (IQR * self.fold)
            self.left_tail_caps_ = lower.to_dict()

        elif method == "quantiles":
            self.left_tail_caps_ = numeric.quantile(self.fold).to_dict()

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find duplicated features.

    Two features are duplicates when ``Series.equals`` is True for their
    columns. For every group of duplicates, the first feature in the order
    of ``self.variables`` is retained and the rest are added to
    ``features_to_drop_``.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y : None
        y is not needed for this transformer. You can pass y or None.

    Returns
    -------
    self
    """
    X = _is_dataframe(X)

    # all variables, or those entered by the user once validated
    self.variables = _find_all_variables(X, self.variables)

    if self.missing_values == "raise":
        _check_contains_na(X, self.variables)

    # groups of duplicated features, one set per group
    self.duplicated_feature_sets_ = []

    # duplicates of a feature that appears earlier in self.variables
    self.features_to_drop_ = set()  # type: ignore

    # features that were already used as the reference of a comparison
    visited = set()

    for feature in self.variables:
        visited.add(feature)

        # a known duplicate does not need to be compared again
        if feature in self.features_to_drop_:
            continue

        duplicates_group = {feature}

        # features not visited yet and not flagged as duplicates; the list
        # is built before the loop below starts flagging new ones
        pending = [
            name
            for name in self.variables
            if name not in visited and name not in self.features_to_drop_
        ]

        for other in pending:
            if X[feature].equals(X[other]):
                self.features_to_drop_.add(other)
                duplicates_group.add(other)

        # keep the group only when a duplicate was found
        if len(duplicates_group) > 1:
            self.duplicated_feature_sets_.append(duplicates_group)

    self.input_shape_ = X.shape

    return self
def fit(self, X, y=None):
    """
    Find duplicated features.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y: None
        y is not needed for this transformer. You can pass y or None.

    Attributes
    ----------
    duplicated_features_: set
        The duplicated features, that is, every feature identical to one
        that appears earlier in the list of variables.

    duplicated_feature_sets_: list
        Groups of duplicated features. Each set contains features that are
        duplicates of each other.

    Returns
    -------
    self
    """
    X = _is_dataframe(X)

    # all variables, or those entered by the user once validated
    self.variables = _find_all_variables(X, self.variables)

    self.duplicated_feature_sets_ = []
    self.duplicated_features_ = set()

    # features that were already used as the reference of a comparison
    compared = set()

    for reference in self.variables:
        compared.add(reference)

        # a known duplicate does not need to be compared again
        if reference in self.duplicated_features_:
            continue

        same_as_reference = {reference}

        # everything that is neither compared nor flagged so far; computed
        # once, before the loop below starts flagging new duplicates
        excluded = compared.union(self.duplicated_features_)
        to_compare = [name for name in self.variables if name not in excluded]

        for name in to_compare:
            if X[reference].equals(X[name]):
                self.duplicated_features_.add(name)
                same_as_reference.add(name)

        # keep the group only when a duplicate was found
        if len(same_as_reference) > 1:
            self.duplicated_feature_sets_.append(same_as_reference)

    self.input_shape_ = X.shape

    return self
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the transformation to the dataframe. Only the selected variables
    will be modified.

    If transformer is the OneHotEncoder, the dummy features will be
    concatenated to the input dataset. Note that the original categorical
    variables will not be removed from the dataset after encoding. If this
    is the desired effect, please use Feature-engine's OneHotEncoder
    instead.

    If transformer is SelectKBest, SelectPercentile or SelectFromModel, the
    returned dataframe contains the selected variables followed by the
    variables that are not in ``variables_``.

    Parameters
    ----------
    X: Pandas DataFrame
        The data to transform

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame

    Returns
    -------
    X: Pandas DataFrame
        The transformed dataset.
    """
    # check that input is a dataframe
    X = _is_dataframe(X)

    # check that the input has the same number of columns as the dataframe
    # used in fit
    _check_input_matches_training_df(X, self.n_features_in_)

    transformer_name = self.transformer_.__class__.__name__

    if transformer_name == "OneHotEncoder":
        # The dummy frame must carry the index of X. With the default
        # RangeIndex, the column-wise concat below would align on labels and
        # produce misaligned rows / NaN for any X whose index is not
        # 0..n-1 (e.g. after a train-test split).
        # NOTE(review): get_feature_names was removed from scikit-learn in
        # version 1.2 in favour of get_feature_names_out - confirm which
        # versions must be supported.
        ohe_results_as_df = pd.DataFrame(
            data=self.transformer_.transform(X[self.variables_]),
            columns=self.transformer_.get_feature_names(self.variables_),
            index=X.index,
        )
        X = pd.concat([X, ohe_results_as_df], axis=1)

    elif transformer_name in [
        "SelectKBest",
        "SelectPercentile",
        "SelectFromModel",
    ]:
        # the variables selected by the transformer
        # NOTE(review): the positions returned by get_support are applied to
        # X.columns; presumably this assumes variables_ are the leading
        # columns of X - verify against fit.
        selected_variables = X.columns[self.transformer_.get_support(indices=True)]

        # the variables that were not examined, in case there are any
        remaining_variables = [
            var for var in X.columns if var not in self.variables_
        ]

        X = X[list(selected_variables) + list(remaining_variables)]

    else:
        # any other transformer: overwrite the selected variables in place
        X[self.variables_] = self.transformer_.transform(X[self.variables_])

    return X
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features.

    Each variable is passed through a pipeline fitted on the training part
    of every cross-validation split, and the transformed test values are
    scored against the target. Variables whose average score is below
    ``self.threshold`` are flagged to be dropped.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y : array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    X = _is_dataframe(X)

    # all variables, or those entered by the user once validated
    self.variables = _find_all_variables(X, self.variables)

    _check_contains_na(X, self.variables)

    # from here on only the selected variables are used
    # NOTE(review): input_shape_ below is therefore the shape of this
    # subset, not of the dataframe passed in - confirm that is intended.
    X = X[self.variables].copy()

    # split the variables by type
    self.variables_categorical_ = list(X.select_dtypes(include="O").columns)
    self.variables_numerical_ = list(
        X.select_dtypes(include=["float", "integer"]).columns
    )

    # stratified, shuffled cross-validation splits
    skf = StratifiedKFold(
        n_splits=self.cv, shuffle=True, random_state=self.random_state
    )
    skf.get_n_splits(X, y)

    # pick the pipeline that matches the types of variables present
    has_categorical = bool(self.variables_categorical_)
    has_numerical = bool(self.variables_numerical_)
    if has_categorical and has_numerical:
        _pipeline = self._make_combined_pipeline()
    elif has_categorical:
        _pipeline = self._make_categorical_pipeline()
    else:
        _pipeline = self._make_numerical_pipeline()

    # one Series of per-feature scores for every split
    scores_per_split = []

    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        _pipeline.fit(X_train, y_train)
        X_test = _pipeline.transform(X_test)

        # each transformed feature is scored on its own against the target
        metric = roc_auc_score if self.scoring == "roc_auc_score" else r2_score
        split_scores = {f: metric(y_test, X_test[f]) for f in self.variables}

        scores_per_split.append(pd.Series(split_scores))

    # features as rows, splits as columns; average across the splits
    scores_frame = pd.concat(scores_per_split, axis=1)
    self.feature_performance_ = scores_frame.mean(axis=1).to_dict()

    self.features_to_drop_ = [
        f for f in self.variables if self.feature_performance_[f] < self.threshold
    ]

    self.input_shape_ = X.shape

    return self
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Combine the variables with the reference variables.

    For every requested operation and every reference variable, one new
    column per variable to combine is appended to the dataframe. Operations
    are always evaluated in the order sub, div, add, mul, regardless of the
    order in which they appear in ``self.operations``.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Raises
    ------
    ValueError
        If "div" is requested and any reference variable contains 0.

    Returns
    -------
    X: Pandas dataframe, shape = [n_samples, n_features + n_operations]
        The dataframe with the operations results added as columns.
    """
    # the transformer must have been fitted
    check_is_fitted(self)

    # input must be a dataframe
    X = _is_dataframe(X)

    # input must have as many columns as the dataframe seen in fit
    _check_input_matches_training_df(X, self.n_features_in_)

    # validate the variables involved only when the user asked for it
    if self.missing_values == "raise":
        _check_contains_na(X, self.reference_variables)
        _check_contains_na(X, self.variables_to_combine)

        _check_contains_inf(X, self.reference_variables)
        _check_contains_inf(X, self.variables_to_combine)

    # a zero in a reference variable would be used as divisor
    if "div" in self.operations and (
        X[self.reference_variables].isin([0]).any().any()
    ):
        raise ValueError(
            "Some of the reference variables contain 0 as values. Check and "
            "remove those before using this transformer.")

    # column names before any feature is appended
    original_col_names = list(X.columns)

    # (operation, infix used in the new column name), in evaluation order
    operation_infixes = (
        ("sub", "_sub_"),
        ("div", "_div_"),
        ("add", "_add_"),
        ("mul", "_mul_"),
    )

    for operation, infix in operation_infixes:
        if operation not in self.operations:
            continue

        for reference in self.reference_variables:
            created_names = [
                str(var) + infix + str(reference)
                for var in self.variables_to_combine
            ]
            # DataFrame.sub / .div / .add / .mul, applied row-wise
            combine = getattr(X[self.variables_to_combine], operation)
            X[created_names] = combine(X[reference], axis=0)

    # user supplied names replace the generated ones, position by position
    if self.new_variables_names:
        X.columns = original_col_names + self.new_variables_names

    return X
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn parameters.

    Perform dataframe checks. Creates dictionary of operation to new feature
    name pairs.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        Can be the entire dataframe, not just the variables to transform.

    y : pandas Series, or np.array. Defaults to None.
        It is not needed in this transformer. You can pass y or None.

    Raises
    ------
    TypeError
       - If the input is not a Pandas DataFrame
       - If any user provided variables in variables_to_combine are not numerical
    ValueError
       - If the variable(s) contain null values
       - If the number of new_variables_names differs from the number of
         mathematical operations

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # check variables to combine are numerical
    self.variables_to_combine = _find_or_check_numerical_variables(
        X, self.variables_to_combine
    )

    # check if dataset contains na
    _check_contains_na(X, self.variables_to_combine)

    if self.math_operations is None:
        self.math_operations_ = ["sum", "prod", "mean", "std", "max", "min"]
    else:
        self.math_operations_ = self.math_operations

    # dictionary of new_variable_name to operation pairs
    if self.new_variables_names:
        # zip() stops at the shortest input, so a length mismatch would
        # silently drop operations or names; fail loudly instead.
        if len(self.new_variables_names) != len(self.math_operations_):
            raise ValueError(
                "The number of new_variables_names must coincide with the "
                "number of mathematical operations. Got "
                f"{len(self.new_variables_names)} names and "
                f"{len(self.math_operations_)} operations."
            )
        self.combination_dict_ = dict(
            zip(self.new_variables_names, self.math_operations_)
        )
    else:
        # variable names may not be strings; str() leaves strings untouched
        vars_ls = [str(var) for var in self.variables_to_combine]

        self.combination_dict_ = {
            f"{operation}({'-'.join(vars_ls)})": operation  # type: ignore
            for operation in self.math_operations_
        }

    self.input_shape_ = X.shape

    return self
def transform(self, X):
    """
    Fill missing data with values drawn at random from the train set.

    With ``seed == 'general'`` all the missing values of a variable are drawn
    in a single sampling call seeded with ``self.random_state``. With
    ``seed == 'observation'`` one value is drawn per missing observation,
    seeded with the value returned by ``_define_seed`` for that row.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataframe to be transformed.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features]
        The dataframe without missing values in the transformed variables.
    """
    # the imputer must have been fitted
    check_is_fitted(self)

    # input must be a dataframe
    X = _is_dataframe(X)

    # input must have as many columns as the dataframe used to fit the imputer
    _check_input_matches_training_df(X, self.input_shape_[1])

    if self.seed == 'general':
        # one sampling call per variable
        for feature in self.variables:
            missing_mask = X[feature].isnull()
            n_missing = missing_mask.sum()

            if n_missing > 0:
                drawn = self.X_[feature].dropna().sample(
                    n_missing, replace=True, random_state=self.random_state)

                # label the drawn values with the rows they will fill, so the
                # assignment below aligns them with the right observations
                drawn.index = X.index[missing_mask.values]

                X.loc[missing_mask, feature] = drawn

    elif self.seed == 'observation':
        # one sampling call per missing observation
        for feature in self.variables:
            missing_mask = X[feature].isnull()

            if missing_mask.sum() > 0:
                for row_label in X.index[missing_mask.values]:
                    # seed derived from the row through additional variables
                    internal_seed = _define_seed(
                        X, row_label, self.random_state,
                        how=self.seeding_method)

                    drawn = self.X_[feature].dropna().sample(
                        1, replace=True, random_state=internal_seed)

                    # fill in the single missing data point
                    X.loc[row_label, feature] = drawn.values[0]

    return X