def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This method does not learn any parameter.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: None
        y is not needed in this imputation. You can pass None or y.
    """
    # validate the input
    X = check_X(X)

    if self.imputer_dict:
        # the user supplied an explicit variable -> value mapping; the keys
        # are the variables to impute and must be numerical
        self.variables_ = _find_or_check_numerical_variables(
            X, self.imputer_dict.keys()  # type: ignore
        )
        self.imputer_dict_ = self.imputer_dict
    else:
        # impute every indicated (or discovered) numerical variable with the
        # same arbitrary number
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)
        self.imputer_dict_ = dict.fromkeys(self.variables_, self.arbitrary_number)

    self._get_feature_names_in(X)

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn any parameter. Performs dataframe checks.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to transform.

    y: pandas Series, or np.array. Default=None.
        It is not needed in this transformer. You can pass y or None.

    Raises
    ------
    TypeError
        - If the input is not a Pandas DataFrame
        - If any user provided variables are not numerical
    ValueError
        If any of the reference variables contain null values and the
        mathematical operation is 'div'.

    Returns
    -------
    self
    """
    # validate the input
    X = _is_dataframe(X)

    # both the variables to combine and the reference variables must be
    # numerical
    self.variables_to_combine = _find_or_check_numerical_variables(
        X, self.variables_to_combine
    )
    self.reference_variables = _find_or_check_numerical_variables(
        X, self.reference_variables
    )

    # optionally refuse missing or infinite values
    if self.missing_values == "raise":
        _check_contains_na(X, self.reference_variables)
        _check_contains_na(X, self.variables_to_combine)
        _check_contains_inf(X, self.reference_variables)
        _check_contains_inf(X, self.variables_to_combine)

    # a zero-valued reference would lead to division by zero
    if "div" in self.operations:
        if X[self.reference_variables].isin([0]).any().any():
            raise ValueError(
                "Some of the reference variables contain 0 as values. Check and "
                "remove those before using this transformer with div.")

    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This method does not learn any parameter. Checks dataframe and finds
    numerical variables, or checks that the variables entered by user are
    numerical.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y : None
        y is not needed in this imputation. You can pass None or y.

    Raises
    ------
    TypeError
        - If the input is not a Pandas DataFrame
        - If any of the user provided variables are not numerical
    ValueError
        If there are no numerical variables in the df or the df is empty

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # Find (or validate) the variables to impute and build the imputation
    # dictionary in a single branch. The previous implementation evaluated
    # `if self.imputer_dict` twice — once to find the variables and again to
    # build the dictionary — for no benefit.
    if self.imputer_dict:
        # variables and values come straight from the user's dictionary
        self.variables = _find_or_check_numerical_variables(
            X, self.imputer_dict.keys()  # type: ignore
        )
        self.imputer_dict_ = self.imputer_dict
    else:
        # impute every indicated (or discovered) variable with the same number
        self.variables = _find_or_check_numerical_variables(X, self.variables)
        self.imputer_dict_ = {
            var: self.arbitrary_number for var in self.variables
        }

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the mean or median values.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas series or None, default=None
        y is not needed in this imputation. You can pass None or y.
    """
    # validate the input
    X = check_X(X)

    # locate (or validate) the numerical variables to impute
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    # learn the per-variable statistic used for the imputation
    subset = X[self.variables_]
    if self.imputation_method == "mean":
        self.imputer_dict_ = subset.mean().to_dict()
    elif self.imputation_method == "median":
        self.imputer_dict_ = subset.median().to_dict()

    self._get_feature_names_in(X)

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Common set-up of creation transformers.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y: pandas Series, or np.array. Defaults to None.
        It is not needed in this transformer. You can pass y or None.
    """
    # validate the input
    X = check_X(X)

    # the variables to work on must be numerical
    self.variables: List[Union[str, int]] = _find_or_check_numerical_variables(
        X, self.variables
    )

    # optionally refuse missing or infinite values
    if self.missing_values == "raise":
        _check_contains_na(X, self.variables)
        _check_contains_inf(X, self.variables)

    # record the input feature names and the train set width
    self.feature_names_in_ = X.columns.tolist()
    self.n_features_in_ = X.shape[1]

    # note: this base-class helper returns the checked dataframe so that
    # subclasses can keep working with it
    return X
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn parameters.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas Series, default=None
        y is not needed in this transformer. You can pass None or y.
    """
    # validate the input
    X = check_X(X)

    # The index must be unique and free of missing values: otherwise, merging
    # the newly created features back would duplicate rows.
    self._check_index(X)

    # locate (or validate) the numerical variables
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    # optionally refuse missing / infinite values
    if self.missing_values == "raise":
        self._check_na_and_inf(X)

    self._get_feature_names_in(X)

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn any parameter.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to transform.

    y: pandas Series, or np.array. Default=None.
        It is not needed in this transformer. You can pass y or None.
    """
    # validate the input
    X = check_X(X)

    # both variable groups must be numerical
    self.variables_to_combine = _find_or_check_numerical_variables(
        X, self.variables_to_combine
    )
    self.reference_variables = _find_or_check_numerical_variables(
        X, self.reference_variables
    )

    # optionally refuse missing / infinite values
    if self.missing_values == "raise":
        _check_contains_na(X, self.reference_variables)
        _check_contains_na(X, self.variables_to_combine)
        _check_contains_inf(X, self.reference_variables)
        _check_contains_inf(X, self.variables_to_combine)

    # a zero-valued reference would lead to division by zero
    if "div" in self.operations:
        if X[self.reference_variables].isin([0]).any().any():
            raise ValueError(
                "Some of the reference variables contain 0 as values. Check and "
                "remove those before using this transformer with div.")

    # record the input feature names and the train set width
    self.feature_names_in_ = X.columns.tolist()
    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn parameters. Perform dataframe checks.
    Creates dictionary of operation to new feature name pairs.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to transform.

    y: pandas Series, or np.array. Defaults to None.
        It is not needed in this transformer. You can pass y or None.
    """
    # validate the input
    X = check_X(X)

    # the variables to combine must be numerical
    self.variables_to_combine = _find_or_check_numerical_variables(
        X, self.variables_to_combine
    )

    # optionally refuse missing / infinite values
    if self.missing_values == "raise":
        _check_contains_na(X, self.variables_to_combine)
        _check_contains_inf(X, self.variables_to_combine)

    # default to the full battery of operations when none was given
    if self.math_operations is None:
        self.math_operations_ = ["sum", "prod", "mean", "std", "max", "min"]
    else:
        self.math_operations_ = self.math_operations

    # map each new variable name to the operation that creates it
    if self.new_variables_names:
        self.combination_dict_ = {
            name: operation
            for name, operation in zip(
                self.new_variables_names, self.math_operations_
            )
        }
    else:
        # auto-generate names such as "sum(var1-var2)"
        vars_ls = [
            var if isinstance(var, str) else str(var)
            for var in self.variables_to_combine
        ]
        self.combination_dict_ = {
            f"{operation}({'-'.join(vars_ls)})": operation  # type: ignore
            for operation in self.math_operations_
        }

    # record the input feature names and the train set width
    self.feature_names_in_ = X.columns.tolist()
    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the values at the end of the variable distribution.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y : pandas Series, default=None
        y is not needed in this imputation. You can pass None or y.

    Raises
    ------
    TypeError
        - If the input is not a Pandas DataFrame
        - If any of the user provided variables are not numerical
    ValueError
        If there are no numerical variables in the df or the df is empty

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find or check for numerical variables
    self.variables = _find_or_check_numerical_variables(X, self.variables)

    # learn the per-variable value at the tail of the distribution
    cols = X[self.variables]

    if self.imputation_method == "max":
        self.imputer_dict_ = (cols.max() * self.fold).to_dict()

    elif self.imputation_method == "gaussian":
        offset = self.fold * cols.std()
        if self.tail == "right":
            self.imputer_dict_ = (cols.mean() + offset).to_dict()
        elif self.tail == "left":
            self.imputer_dict_ = (cols.mean() - offset).to_dict()

    elif self.imputation_method == "iqr":
        IQR = cols.quantile(0.75) - cols.quantile(0.25)
        if self.tail == "right":
            self.imputer_dict_ = (cols.quantile(0.75) + (IQR * self.fold)).to_dict()
        elif self.tail == "left":
            self.imputer_dict_ = (cols.quantile(0.25) - (IQR * self.fold)).to_dict()

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[str] = None):
    """
    Fits the Scikit-learn transformer to the selected variables.

    If the variables parameter is None, the OneHotEncoder, OrdinalEncoder and
    SimpleImputer are applied to all variables, and every other transformer to
    the numerical variables only. If a list of variables is given, it is
    validated against the dataframe, and checked to be numerical for all
    transformers other than the three above, which also accept categorical
    variables.

    Parameters
    ----------
    X: Pandas DataFrame
        The dataset to fit the transformer

    y: pandas Series, default=None
        The target variable.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame

    Returns
    -------
    self
    """
    # validate the input
    X = _is_dataframe(X)

    # fit a clone, so the transformer given by the user is left untouched
    self.transformer_ = clone(self.transformer)

    # a sparse one-hot output cannot be inserted back into a dataframe
    if (self.transformer_.__class__.__name__ == "OneHotEncoder"
            and self.transformer_.sparse):
        raise AttributeError(
            "The SklearnTransformerWrapper can only wrap the OneHotEncoder if you "
            "set its sparse attribute to False")

    # the encoders and the imputer accept variables of any type; the rest
    # are restricted to numerical variables
    if self.transformer_.__class__.__name__ in [
            "OneHotEncoder",
            "OrdinalEncoder",
            "SimpleImputer",
    ]:
        self.variables_ = _find_all_variables(X, self.variables)
    else:
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    self.transformer_.fit(X[self.variables_], y)

    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Select features.

    Trains one estimator per feature with cross-validation, and flags for
    removal the features whose individual performance falls below the
    threshold (or below the mean performance, when no threshold was given).

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe

    y : array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find numerical variables or check variables entered by user
    self.variables = _find_or_check_numerical_variables(X, self.variables)

    self.feature_performance_ = {}

    # train a model for every feature and store the performance
    for feature in self.variables:
        model = cross_validate(
            self.estimator,
            X[feature].to_frame(),
            y,
            cv=self.cv,
            return_estimator=False,
            scoring=self.scoring,
        )

        self.feature_performance_[feature] = model["test_score"].mean()

    # select features.
    # BUG FIX: use an explicit `is None` check. The previous truthiness test
    # (`if not self.threshold`) silently discarded a user-provided threshold
    # of 0 and replaced it with the mean performance.
    if self.threshold is None:
        threshold = pd.Series(self.feature_performance_).mean()
    else:
        threshold = self.threshold

    self.features_to_drop_ = [
        f for f in self.feature_performance_.keys()
        if self.feature_performance_[f] < threshold
    ]

    # check we are not dropping all the columns in the df
    if len(self.features_to_drop_) == len(X.columns):
        warnings.warn(
            "All features will be dropped, try changing the threshold.")

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn any parameter.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y: pandas Series, default=None
        y is not needed in this transformer. You can pass y or None.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame

    Returns
    -------
    self
    """
    # validate the input
    X = _is_dataframe(X)

    # the variables to cap are the keys of the capping dictionaries
    if self.min_capping_dict is None and self.max_capping_dict:
        self.variables_ = list(self.max_capping_dict)
    elif self.max_capping_dict is None and self.min_capping_dict:
        self.variables_ = list(self.min_capping_dict)
    elif self.min_capping_dict and self.max_capping_dict:
        # merged view preserves min-dict key order first, then new max keys
        self.variables_ = list({**self.min_capping_dict, **self.max_capping_dict})

    if self.missing_values == "raise":
        # refuse missing or infinite values
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

    # the variables to cap must be numerical
    self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

    # store the capping values per tail (empty dict when no cap on that side)
    self.right_tail_caps_ = (
        self.max_capping_dict if self.max_capping_dict is not None else {}
    )
    self.left_tail_caps_ = (
        self.min_capping_dict if self.min_capping_dict is not None else {}
    )

    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features.

    Trains one model per feature with cross-validation and keeps the features
    whose individual performance beats the threshold.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe

    y : array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    # validate the input
    X = _is_dataframe(X)

    # the candidate variables must be numerical
    self.variables = _find_or_check_numerical_variables(X, self.variables)

    # collectors for the selected features and their performance
    self.selected_features_ = []
    self.feature_performance_ = {}

    # train a model for every feature
    for feature in self.variables:
        model = cross_validate(
            self.estimator,
            X[feature].to_frame(),
            y,
            cv=self.cv,
            return_estimator=False,
            scoring=self.scoring,
        )

        performance = model["test_score"].mean()
        if performance > self.threshold:
            self.selected_features_.append(feature)

        self.feature_performance_[feature] = performance

    # check we are not dropping all the columns in the df
    if len(self.selected_features_) == 0:
        raise ValueError(
            "No features were selected, try changing the threshold.")

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    The `fit` method allows Scikit-learn transformers to learn the required
    parameters from the training data set.

    If transformer is OneHotEncoder, OrdinalEncoder or SimpleImputer, all
    variables indicated in the ```variables``` parameter will be transformed.
    When the variables parameter is None, the SklearnWrapper will
    automatically select and transform all features in the dataset, numerical
    or otherwise.

    For all other Scikit-learn transformers only numerical variables will be
    transformed. The SklearnWrapper will check that the variables indicated in
    the variables parameter are numerical, or alternatively, if variables is
    None, it will automatically select the numerical variables in the data
    set.

    Parameters
    ----------
    X : Pandas DataFrame
        The dataset to fit the transformer

    y : pandas Series, default=None
        The target variable. It is not required by unsupervised transformers,
        in which case you can pass None.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # encoders and the imputer can handle variables of any type
    if isinstance(self.transformer,
                  (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
        self.variables = _find_all_variables(X, self.variables)
    else:
        # all remaining transformers work on numerical variables only
        self.variables = _find_or_check_numerical_variables(X, self.variables)

    # FIX: forward y to the wrapped transformer, consistent with the newer
    # versions of this wrapper. Unsupervised transformers ignore it (y
    # defaults to None), while supervised ones now receive the target.
    self.transformer.fit(X[self.variables], y)

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[str] = None):
    """
    Fits the Scikit-learn transformer to the selected variables.

    Parameters
    ----------
    X: Pandas DataFrame
        The dataset to fit the transformer.

    y: pandas Series, default=None
        The target variable.
    """
    # validate the input
    X = check_X(X)

    # fit a clone, so the transformer given by the user is left untouched
    self.transformer_ = clone(self.transformer)

    # the encoders, the imputer and the function transformer accept variables
    # of any type; the rest are restricted to numerical variables
    if self.transformer_.__class__.__name__ in [
            "OneHotEncoder",
            "OrdinalEncoder",
            "SimpleImputer",
            "FunctionTransformer",
    ]:
        self.variables_ = _find_all_variables(X, self.variables)
    else:
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    self.transformer_.fit(X[self.variables_], y)

    # for feature selectors, record which of the variables were discarded
    if self.transformer_.__class__.__name__ in _SELECTORS:
        selected = X[self.variables_].columns[self.transformer_.get_support()]
        self.features_to_drop_ = [
            f for f in self.variables_ if f not in selected
        ]

    # record the input feature names and the train set width
    self.feature_names_in_ = X.columns.tolist()
    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn any parameter. Check dataframe and
    variables. Checks that the user entered variables are in the train set and
    cast as numerical.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset. Can be the entire dataframe, not just the
        variables to be transformed.

    y: None
        y is not needed in this transformer. You can pass y or None.

    Raises
    ------
    TypeError
        - If the input is not a Pandas DataFrame
        - If any of the user provided variables are not numerical
    ValueError
        - If there are no numerical variables in the df or the df is empty
        - If the variable(s) contain null values

    Returns
    -------
    self
    """
    # validate the input
    X = _is_dataframe(X)

    # the keys of the binning dictionary are the variables to discretise;
    # they must exist in the dataframe and be numerical
    variables = list(self.binning_dict.keys())
    self.variables_ = _find_or_check_numerical_variables(X, variables)

    # refuse missing or infinite values
    _check_contains_na(X, self.variables_)
    _check_contains_inf(X, self.variables_)

    # for consistency with the rest of the discretisers, we add this attribute
    self.binner_dict_ = self.binning_dict

    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the values at the end of the variable distribution.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas Series, default=None
        y is not needed in this imputation. You can pass None or y.
    """
    # validate the input
    X = check_X(X)

    # locate (or validate) the numerical variables
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    # learn the per-variable value at the tail of the distribution
    cols = X[self.variables_]

    if self.imputation_method == "max":
        self.imputer_dict_ = (cols.max() * self.fold).to_dict()

    elif self.imputation_method == "gaussian":
        offset = self.fold * cols.std()
        if self.tail == "right":
            self.imputer_dict_ = (cols.mean() + offset).to_dict()
        elif self.tail == "left":
            self.imputer_dict_ = (cols.mean() - offset).to_dict()

    elif self.imputation_method == "iqr":
        IQR = cols.quantile(0.75) - cols.quantile(0.25)
        if self.tail == "right":
            self.imputer_dict_ = (cols.quantile(0.75) + (IQR * self.fold)).to_dict()
        elif self.tail == "left":
            self.imputer_dict_ = (cols.quantile(0.25) - (IQR * self.fold)).to_dict()

    self._get_feature_names_in(X)

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
    """
    Checks that input is a dataframe, finds numerical variables, or
    alternatively checks that variables entered by the user are of type
    numerical.

    Parameters
    ----------
    X : Pandas DataFrame

    y : Pandas Series, np.array. Default = None
        Parameter is necessary for compatibility with sklearn Pipeline.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame or a numpy array
        If any of the user provided variables are not numerical
    ValueError
        If there are no numerical variables in the df or the df is empty
        If the variable(s) contain null values

    Returns
    -------
    X : Pandas DataFrame
        The same dataframe entered as parameter
    """
    # validate the input
    X = _is_dataframe(X)

    # locate (or validate) the numerical variables
    self.variables_: List[Union[str, int]] = _find_or_check_numerical_variables(
        X, self.variables
    )

    # refuse missing or infinite values
    _check_contains_na(X, self.variables_)
    _check_contains_inf(X, self.variables_)

    # the checked dataframe is returned so subclasses can keep using it
    return X
def test_find_or_check_numerical_variables(df_vartypes, df_numeric_columns):
    numerical = ["Age", "Marks"]

    # a list of numerical variables, or a single one, is returned as a list
    assert _find_or_check_numerical_variables(df_vartypes, numerical) == numerical
    assert _find_or_check_numerical_variables(df_vartypes, "Age") == ["Age"]

    # when None, all numerical variables in the dataframe are found
    assert _find_or_check_numerical_variables(df_vartypes, None) == numerical

    # a non-numerical variable in the list raises
    with pytest.raises(TypeError):
        _find_or_check_numerical_variables(df_vartypes, ["Age", "Marks", "Name"])

    # a dataframe without numerical variables raises
    with pytest.raises(ValueError):
        _find_or_check_numerical_variables(df_vartypes[["Name", "City"]], None)

    # integer column names are supported
    assert _find_or_check_numerical_variables(df_numeric_columns, [2, 3]) == [2, 3]
    assert _find_or_check_numerical_variables(df_numeric_columns, 2) == [2]
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn any parameter.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to transform.

    y: pandas Series, or np.array. Default=None.
        It is not needed in this transformer. You can pass y or None.
    """
    # the shared checks and attributes live in the parent class, which
    # returns the validated dataframe
    X = super().fit(X, y)

    # the reference variables must be numerical too
    self.reference = _find_or_check_numerical_variables(X, self.reference)

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the mean or median values.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas series or None, default=None
        y is not needed in this imputation. You can pass None or y.

    Raises
    ------
    TypeError
        - If the input is not a Pandas DataFrame
        - If any of the user provided variables are not numerical
    ValueError
        If there are no numerical variables in the df or the df is empty

    Returns
    -------
    self
    """
    # validate the input
    X = _is_dataframe(X)

    # locate (or validate) the numerical variables to impute
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    # learn the per-variable statistic used for the imputation
    subset = X[self.variables_]
    if self.imputation_method == "mean":
        self.imputer_dict_ = subset.mean().to_dict()
    elif self.imputation_method == "median":
        self.imputer_dict_ = subset.median().to_dict()

    self.n_features_in_ = X.shape[1]

    return self
def _select_variables_from_dict(self, X: pd.DataFrame,
                                user_dict_: Dict) -> pd.DataFrame:
    """
    Checks that input is a dataframe, checks that variables in the dictionary
    entered by the user are of type numerical.

    Parameters
    ----------
    X : Pandas DataFrame

    user_dict_ : Dictionary. Default = None
        Any dictionary allowed by the transformer and entered by user.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame or a numpy array
        If any of the variables in the dictionary are not numerical
    ValueError
        If there are no numerical variables in the df or the df is empty
        If the variable(s) contain null values

    Returns
    -------
    X : Pandas DataFrame
        The same dataframe entered as parameter
    """
    # validate the input
    X = _is_dataframe(X)

    # the keys of the user dictionary are the variables of interest; they
    # must exist in the dataframe and be numerical
    variables = list(user_dict_.keys())
    self.variables_ = _find_or_check_numerical_variables(X, variables)

    # refuse missing or infinite values
    _check_contains_na(X, self.variables_)
    _check_contains_inf(X, self.variables_)

    return X
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find features with high PSI values.

    Splits the dataframe into a basis and a test set, discretises every
    variable, computes the Population Stability Index between the two sets and
    flags for removal the variables whose PSI exceeds the threshold.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y : pandas series. Default = None
        y is not needed in this transformer. You can pass y or None.
    """
    # check input dataframe
    X = check_X(X)

    # If required exclude variables that are not in the input dataframe
    self._confirm_variables(X)

    # find numerical variables or check those entered are present in the dataframe
    self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

    # Remove the split_col from the variables list. It might be added if the
    # variables are not defined at initialization.
    if self.split_col in self.variables_:
        self.variables_.remove(self.split_col)

    if self.missing_values == "raise":
        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

    # Split the dataframe into basis and test.
    basis_df, test_df = self._split_dataframe(X)

    # Check the shape of the returned dataframes for PSI calculations.
    # The number of observations must be at least equal to the
    # number of bins.
    if min(basis_df.shape[0], test_df.shape[0]) < self.bins:
        # FIX: the original message concatenated "...cut_off or" and
        # "split_frac..." without a separating space, misspelled "splitting",
        # and said "at least larger than" where the check is `< self.bins`
        # (i.e. at least equal to).
        raise ValueError(
            "The number of rows in the basis and test datasets that will be used "
            f"in the PSI calculations must be at least equal to {self.bins}. "
            "After splitting the original dataset based on the given cut_off or "
            f"split_frac we have {basis_df.shape[0]} samples in the basis set, "
            f"and {test_df.shape[0]} samples in the test set. "
            "Please adjust the value of the cut_off or split_frac.")

    # Switch basis and test dataframes if required.
    if self.switch:
        test_df, basis_df = basis_df, test_df

    # set up the discretizer
    if self.strategy == "equal_width":
        bucketer = EqualWidthDiscretiser(bins=self.bins)
    else:
        bucketer = EqualFrequencyDiscretiser(q=self.bins)

    # Compute the PSI by looping over the features
    self.psi_values_ = {}
    self.features_to_drop_ = []

    for feature in self.variables_:
        # Discretize the features.
        basis_discrete = bucketer.fit_transform(basis_df[[feature]].dropna())
        test_discrete = bucketer.transform(test_df[[feature]].dropna())

        # Determine percentage of observations per bin
        basis_distrib, test_distrib = self._observation_frequency_per_bin(
            basis_discrete, test_discrete)

        # Calculate the PSI value
        self.psi_values_[feature] = np.sum(
            (test_distrib - basis_distrib)
            * np.log(test_distrib / basis_distrib))

        # Assess if feature should be dropped
        if self.psi_values_[feature] > self.threshold:
            self.features_to_drop_.append(feature)

    # save input features
    self._get_feature_names_in(X)

    return self
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features.

    Fits the estimator on all features with cross-validation, then shuffles
    one feature at a time and records the resulting drop in performance.
    Features whose shuffling degrades performance by more than the threshold
    are selected.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe

    y : array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # reset the index, so rows align by position after single columns are
    # shuffled and re-indexed below
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    # find numerical variables or check variables entered by user
    self.variables = _find_or_check_numerical_variables(X, self.variables)

    # train model with all features and cross-validation; keep the fitted
    # estimators so they can re-score the shuffled data
    model = cross_validate(
        self.estimator,
        X,
        y,
        cv=self.cv,
        return_estimator=True,
        scoring=self.scoring,
    )

    # store initial model performance
    self.initial_model_performance_ = model["test_score"].mean()

    # get performance metric
    scorer = get_scorer(self.scoring)

    # dict to collect features and their performance_drift after shuffling
    self.performance_drifts_ = {}

    # list to collect selected features
    self.selected_features_ = []

    # shuffle features and save feature performance drift into a dict
    for feature in self.variables:

        X_shuffled = X.copy()

        # shuffle individual feature
        X_shuffled[feature] = (
            X_shuffled[feature].sample(frac=1).reset_index(drop=True)
        )

        # determine the performance with the shuffled feature, averaged over
        # every estimator fitted during cross-validation
        performance = np.mean(
            [scorer(m, X_shuffled, y) for m in model["estimator"]]
        )

        # determine drift in performance
        # Note, sklearn negates the log and error scores, so no need to
        # manually do the inversion
        # https://scikit-learn.org/stable/modules/model_evaluation.html
        # #the-scoring-parameter-defining-model-evaluation-rules
        performance_drift = self.initial_model_performance_ - performance

        # Save feature and performance drift
        self.performance_drifts_[feature] = performance_drift

    # select the features whose shuffling degraded performance beyond the
    # tolerated threshold
    for feature in self.performance_drifts_.keys():

        if self.performance_drifts_[feature] > self.threshold:

            self.selected_features_.append(feature)

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find the correlated features.

    Builds the correlation matrix of the numerical variables and groups
    together features whose absolute pairwise correlation with a group's
    first-seen feature exceeds the threshold. All group members other than the
    first-seen one are flagged as correlated.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y : pandas series. Default = None
        y is not needed in this transformer. You can pass y or None.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find all numerical variables or check those entered are in the dataframe
    self.variables = _find_or_check_numerical_variables(X, self.variables)

    if self.missing_values == "raise":
        # check if dataset contains na
        _check_contains_na(X, self.variables)

    # set to collect features that are correlated
    self.correlated_features_ = set()

    # create tuples of correlated feature groups
    self.correlated_feature_sets_ = []

    # the correlation matrix
    self.correlated_matrix_ = X[self.variables].corr(method=self.method)

    # create set of examined features, helps to determine feature combinations
    # to evaluate below
    _examined_features = set()

    # for each feature in the dataset (columns of the correlation matrix)
    for feature in self.correlated_matrix_.columns:

        if feature not in _examined_features:

            # append so we can exclude when we create the combinations
            _examined_features.add(feature)

            # here we collect potentially correlated features
            # we need this for the correlated groups sets
            _temp_set = set([feature])

            # features that have not been examined, are not currently examined and
            # were not found correlated
            _features_to_compare = [
                f for f in self.correlated_matrix_.columns
                if f not in _examined_features
            ]

            # create combinations:
            for f2 in _features_to_compare:

                # if the correlation is higher than the threshold
                # we are interested in absolute correlation coefficient value
                if abs(self.correlated_matrix_.loc[f2, feature]) > self.threshold:

                    # add feature (f2) to our correlated set
                    self.correlated_features_.add(f2)
                    _temp_set.add(f2)
                    # marking f2 examined prevents it from seeding its own group
                    _examined_features.add(f2)

            # if there are correlated features, record the whole group
            if len(_temp_set) > 1:

                self.correlated_feature_sets_.append(_temp_set)

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn parameters. Perform dataframe checks.
    Creates dictionary of operation to new feature name pairs.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to transform.

    y : pandas Series, or np.array. Defaults to None.
        It is not needed in this transformer. You can pass y or None.

    Raises
    ------
    TypeError
        - If the input is not a Pandas DataFrame
        - If any user provided variables in variables_to_combine are not numerical
    ValueError
        If the variable(s) contain null values

    Returns
    -------
    self
    """
    # validate the input
    X = _is_dataframe(X)

    # the variables to combine must be numerical
    self.variables_to_combine = _find_or_check_numerical_variables(
        X, self.variables_to_combine
    )

    # refuse missing values
    _check_contains_na(X, self.variables_to_combine)

    # default to the full battery of operations when none was given
    if self.math_operations is None:
        self.math_operations_ = ["sum", "prod", "mean", "std", "max", "min"]
    else:
        self.math_operations_ = self.math_operations

    # map each new variable name to the operation that creates it
    if self.new_variables_names:
        self.combination_dict_ = {
            name: operation
            for name, operation in zip(
                self.new_variables_names, self.math_operations_
            )
        }
    else:
        # auto-generate names such as "sum(var1-var2)"
        vars_ls = [
            var if isinstance(var, str) else str(var)
            for var in self.variables_to_combine
        ]
        self.combination_dict_ = {
            f"{operation}({'-'.join(vars_ls)})": operation  # type: ignore
            for operation in self.math_operations_
        }

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features. Note that the selector trains various models
    at each round of selection, so it might take a while.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe

    y : array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find numerical variables or check variables entered by user
    self.variables = _find_or_check_numerical_variables(X, self.variables)

    # train model with all features and cross-validation
    model = cross_validate(
        self.estimator,
        X[self.variables],
        y,
        cv=self.cv,
        scoring=self.scoring,
        return_estimator=True,
    )

    # store initial model performance
    self.initial_model_performance_ = model["test_score"].mean()

    # Initialize a dataframe that will contain the list of the feature/coeff
    # importance for each cross validation fold
    feature_importances_cv = pd.DataFrame()

    # Populate the feature_importances_cv dataframe with columns containing
    # the feature importance values for each model returned by the cross
    # validation. There are as many columns as folds.
    for m in model["estimator"]:
        feature_importances_cv[m] = get_feature_importances(m)

    # Add the variables as index to feature_importances_cv
    feature_importances_cv.index = self.variables

    # Aggregate the feature importance returned in each fold
    self.feature_importances_ = feature_importances_cv.mean(axis=1)

    # Sort so that the least important feature is evaluated first
    self.feature_importances_.sort_values(ascending=True, inplace=True)

    # to collect selected features
    _selected_features = []

    # temporary copy where we will remove features recursively
    X_tmp = X[self.variables].copy()

    # we need to update the performance as we remove features
    baseline_model_performance = self.initial_model_performance_

    # dict to collect features and their performance_drift after removal
    self.performance_drifts_ = {}

    # evaluate every feature, starting from the least important
    # remember that feature_importances_ is ordered already
    for feature in list(self.feature_importances_.index):

        # remove feature and train new model
        model_tmp = cross_validate(
            self.estimator,
            X_tmp.drop(columns=feature),
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=False,
        )

        # assign new model performance
        model_tmp_performance = model_tmp["test_score"].mean()

        # Calculate performance drift
        performance_drift = baseline_model_performance - model_tmp_performance

        # Save feature and performance drift
        self.performance_drifts_[feature] = performance_drift

        if performance_drift > self.threshold:
            # removing the feature hurts performance: keep it
            _selected_features.append(feature)
        else:
            # drop the feature for good and adjust the baseline.
            # NOTE: the score of the reduced dataset was just computed above
            # (model_tmp was trained on X_tmp without this feature), so we
            # reuse it instead of re-running an identical cross-validation,
            # which halves the training cost of this branch.
            X_tmp = X_tmp.drop(columns=feature)
            baseline_model_performance = model_tmp_performance

    self.features_to_drop_ = [
        f for f in self.variables if f not in _selected_features
    ]

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find the correlated feature groups. Determine which feature should be
    selected from each group.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas series. Default = None
        y is needed if selection_method == 'model_performance'.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find all numerical variables or check those entered are in the dataframe
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    if self.missing_values == "raise":
        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

    # model_performance needs a target to cross-validate each candidate feature
    if self.selection_method == "model_performance" and y is None:
        raise ValueError("y is needed to fit the transformer")

    # FIND CORRELATED FEATURES
    # ========================
    # create tuples of correlated feature groups
    self.correlated_feature_sets_ = []

    # the correlation matrix
    _correlated_matrix = X[self.variables_].corr(method=self.method)

    # create set of examined features, helps to determine feature combinations
    # to evaluate below
    _examined_features = set()

    # for each feature in the dataset (columns of the correlation matrix)
    for feature in _correlated_matrix.columns:

        if feature not in _examined_features:

            # append so we can exclude when we create the combinations
            _examined_features.add(feature)

            # here we collect potentially correlated features
            # we need this for the correlated groups sets
            _temp_set = set([feature])

            # features that have not been examined, are not currently examined
            # and were not found correlated
            _features_to_compare = [
                f for f in _correlated_matrix.columns
                if f not in _examined_features
            ]

            # create combinations:
            for f2 in _features_to_compare:

                # if the correlation is higher than the threshold
                # we are interested in absolute correlation coefficient value
                if abs(_correlated_matrix.loc[f2, feature]) > self.threshold:

                    # add feature (f2) to our correlated set
                    _temp_set.add(f2)
                    _examined_features.add(f2)

            # if there are correlated features
            if len(_temp_set) > 1:
                self.correlated_feature_sets_.append(_temp_set)

    # SELECT 1 FEATURE FROM EACH GROUP
    # ================================

    # list to collect selected features
    # we start it with all features that were either not examined, i.e.,
    # categorical variables, or not found correlated
    _selected_features = [
        f for f in X.columns
        if f not in set().union(*self.correlated_feature_sets_)
    ]

    # select the feature with least missing values
    if self.selection_method == "missing_values":
        for feature_group in self.correlated_feature_sets_:
            f = X[feature_group].isnull().sum().sort_values(
                ascending=True).index[0]
            _selected_features.append(f)

    # select the feature with most unique values
    elif self.selection_method == "cardinality":
        for feature_group in self.correlated_feature_sets_:
            f = X[feature_group].nunique().sort_values(
                ascending=False).index[0]
            _selected_features.append(f)

    # select the feature with biggest variance
    elif self.selection_method == "variance":
        for feature_group in self.correlated_feature_sets_:
            f = X[feature_group].std().sort_values(
                ascending=False).index[0]
            _selected_features.append(f)

    # select best performing feature according to estimator
    else:
        for feature_group in self.correlated_feature_sets_:

            # cross-validated score of a model trained on each single feature
            temp_perf = []

            # train a model for every feature
            for feature in feature_group:
                model = cross_validate(
                    self.estimator,
                    X[feature].to_frame(),
                    y,
                    cv=self.cv,
                    return_estimator=False,
                    scoring=self.scoring,
                )

                temp_perf.append(model["test_score"].mean())

            # select best performing feature from group
            f = list(feature_group)[temp_perf.index(max(temp_perf))]
            _selected_features.append(f)

    # everything numerical that was not selected gets dropped
    self.features_to_drop_ = [
        f for f in self.variables_ if f not in _selected_features
    ]

    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the values that should be used to replace outliers.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y : pandas Series, default=None
        y is not needed in this transformer. You can pass y or None.
    """
    # check input dataframe
    X = check_X(X)

    # find or check for numerical variables
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    if self.missing_values == "raise":
        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

    self.right_tail_caps_ = {}
    self.left_tail_caps_ = {}

    # work on the numerical columns only
    data = X[self.variables_]

    # estimate the capping value for the right tail
    if self.tail in ("right", "both"):
        if self.capping_method == "gaussian":
            caps = data.mean() + self.fold * data.std()
            self.right_tail_caps_ = caps.to_dict()
        elif self.capping_method == "iqr":
            inter_quartile = data.quantile(0.75) - data.quantile(0.25)
            caps = data.quantile(0.75) + inter_quartile * self.fold
            self.right_tail_caps_ = caps.to_dict()
        elif self.capping_method == "quantiles":
            self.right_tail_caps_ = data.quantile(1 - self.fold).to_dict()

    # estimate the capping value for the left tail
    if self.tail in ("left", "both"):
        if self.capping_method == "gaussian":
            caps = data.mean() - self.fold * data.std()
            self.left_tail_caps_ = caps.to_dict()
        elif self.capping_method == "iqr":
            inter_quartile = data.quantile(0.75) - data.quantile(0.25)
            caps = data.quantile(0.25) - inter_quartile * self.fold
            self.left_tail_caps_ = caps.to_dict()
        elif self.capping_method == "quantiles":
            self.left_tail_caps_ = data.quantile(self.fold).to_dict()

    self.feature_names_in_ = X.columns.to_list()
    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features. Note that the selector trains various models
    at each round of selection, so it might take a while.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe

    y : array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find numerical variables or check variables entered by user
    self.variables = _find_or_check_numerical_variables(X, self.variables)

    # train model with all features and cross-validation
    model = cross_validate(
        self.estimator,
        X[self.variables],
        y,
        cv=self.cv,
        scoring=self.scoring,
        return_estimator=True,
    )

    # store initial model performance
    self.initial_model_performance_ = model["test_score"].mean()

    # Initialize a dataframe that will contain the list of the feature/coeff
    # importance for each cross validation fold
    feature_importances_cv = pd.DataFrame()

    # Populate the feature_importances_cv dataframe with columns containing
    # the feature importance values for each model returned by the cross
    # validation.
    # There are as many columns as folds.
    for m in model["estimator"]:
        feature_importances_cv[m] = get_feature_importances(m)

    # Add the variables as index to feature_importances_cv
    feature_importances_cv.index = self.variables

    # Aggregate the feature importance returned in each fold
    self.feature_importances_ = feature_importances_cv.mean(axis=1)

    # Sort the feature importance values decreasingly, so features are added
    # starting from the most important one
    self.feature_importances_.sort_values(ascending=False, inplace=True)

    # Extract most important feature from the ordered list of features
    first_most_important_feature = list(self.feature_importances_.index)[0]

    # Run baseline model using only the most important feature
    baseline_model = cross_validate(
        self.estimator,
        X[first_most_important_feature].to_frame(),
        y,
        cv=self.cv,
        scoring=self.scoring,
        return_estimator=True,
    )

    # Save baseline model performance
    baseline_model_performance = baseline_model["test_score"].mean()

    # list to collect selected features
    # It is initialized with the most important feature
    _selected_features = [first_most_important_feature]

    # dict to collect features and their performance_drift
    # It is initialized with the performance drift of
    # the most important feature (zero by definition)
    self.performance_drifts_ = {first_most_important_feature: 0}

    # loop over the ordered list of features by feature importance starting
    # from the second element in the list.
    for feature in list(self.feature_importances_.index)[1:]:

        # Add feature and train new model
        model_tmp = cross_validate(
            self.estimator,
            X[_selected_features + [feature]],
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=True,
        )

        # assign new model performance
        model_tmp_performance = model_tmp["test_score"].mean()

        # Calculate performance drift
        performance_drift = model_tmp_performance - baseline_model_performance

        # Save feature and performance drift
        self.performance_drifts_[feature] = performance_drift

        # If adding the feature improves performance beyond the threshold,
        # keep it and use the improved score as the new baseline
        if performance_drift > self.threshold:

            # add feature to the list of selected features
            _selected_features.append(feature)

            # Update new baseline model performance
            baseline_model_performance = model_tmp_performance

    self.features_to_drop_ = [
        f for f in self.variables if f not in _selected_features
    ]

    self.input_shape_ = X.shape

    return self