def test_find_numerical_variables(dataframe_vartypes):
    vars_num = ['Age', 'Marks']
    vars_mix = ['Age', 'Marks', 'Name']
    vars_none = None

    assert _find_numerical_variables(dataframe_vartypes, vars_num) == vars_num
    assert _find_numerical_variables(dataframe_vartypes, vars_none) == vars_num

    with pytest.raises(TypeError):
        assert _find_numerical_variables(dataframe_vartypes, vars_mix)
def test_find_numerical_variables(df_vartypes):
    vars_num = ["Age", "Marks"]
    vars_mix = ["Age", "Marks", "Name"]
    vars_none = None

    assert _find_numerical_variables(df_vartypes, vars_num) == vars_num
    assert _find_numerical_variables(df_vartypes, vars_none) == vars_num

    with pytest.raises(TypeError):
        assert _find_numerical_variables(df_vartypes, vars_mix)

    with pytest.raises(ValueError):
        assert _find_numerical_variables(df_vartypes[["Name", "City"]], None)
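# A minimal sketch of the `df_vartypes` fixture the tests above appear to rely
# on: two numerical columns ("Age", "Marks") plus non-numerical columns
# ("Name", "City"). The real fixture lives in the test suite's conftest.py;
# the values below are illustrative only.
import pandas as pd
import pytest


@pytest.fixture
def df_vartypes():
    return pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
    })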
def fit(self, X, y=None):
    """
    The `fit` method allows Scikit-learn transformers to learn the required
    parameters from the training data set.

    If the transformer is the OneHotEncoder, the OrdinalEncoder or the
    SimpleImputer, all variables indicated in the `variables` parameter will
    be transformed. When the `variables` parameter is None, the SklearnWrapper
    will automatically select and transform all features in the dataset,
    numerical or otherwise.

    For all other Scikit-learn transformers, only numerical variables will be
    transformed. The SklearnWrapper will check that the variables indicated in
    the `variables` parameter are numerical, or alternatively, if `variables`
    is None, it will automatically select the numerical variables in the data
    set.
    """
    # check input dataframe
    X = _is_dataframe(X)

    if isinstance(self.transformer, (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
        self.variables = _find_all_variables(X, self.variables)
    else:
        self.variables = _find_numerical_variables(X, self.variables)

    self.transformer.fit(X[self.variables])

    self.input_shape_ = X.shape

    return self
def fit(self, X, y=None):
    """
    Checks that the variables are numerical and creates the imputer
    dictionary.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        The user can pass the entire dataframe, not just the variables to
        impute.

    y : None
        y is not needed in this imputation. You can pass None or y.
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find or check for numerical variables
    self.variables = _find_numerical_variables(X, self.variables)

    # create the imputer dictionary
    self.imputer_dict_ = {var: self.arbitrary_number for var in self.variables}

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Fits the transformer to the DataFrame: checks that the indicated
    variables are numerical and contain no missing data.

    Args:
        X: Pandas DataFrame to fit the transformer.
        y: This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline. Defaults to None.
            Alternatively takes a Pandas Series.

    Returns:
        self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find or check for numerical variables
    self.variables = _find_numerical_variables(X, self.variables)

    # check if dataset contains na
    _check_contains_na(X, self.variables)

    return self
def fit(self, X, y=None):
    """
    Learns the values at the end of the variable distribution.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        The user can pass the entire dataframe, not just the variables that
        need imputation.

    y : None
        y is not needed in this imputation. You can pass None or y.

    Attributes
    ----------
    imputer_dict_ : dictionary
        The dictionary containing the values at the end of the distribution
        per variable. These values will be used by the imputer to replace
        missing data.
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find or check for numerical variables
    self.variables = _find_numerical_variables(X, self.variables)

    # estimate imputation values
    if self.distribution == 'max':
        self.imputer_dict_ = (X[self.variables].max() * self.fold).to_dict()

    elif self.distribution == 'gaussian':
        if self.tail == 'right':
            self.imputer_dict_ = (
                X[self.variables].mean() + self.fold * X[self.variables].std()
            ).to_dict()
        elif self.tail == 'left':
            self.imputer_dict_ = (
                X[self.variables].mean() - self.fold * X[self.variables].std()
            ).to_dict()

    elif self.distribution == 'skewed':
        IQR = X[self.variables].quantile(0.75) - X[self.variables].quantile(0.25)
        if self.tail == 'right':
            self.imputer_dict_ = (
                X[self.variables].quantile(0.75) + IQR * self.fold
            ).to_dict()
        elif self.tail == 'left':
            self.imputer_dict_ = (
                X[self.variables].quantile(0.25) - IQR * self.fold
            ).to_dict()

    self.input_shape_ = X.shape

    return self
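# A hand-worked sketch of the values the fit method above computes, assuming
# fold=3 and the right tail; the data below is illustrative only.
import pandas as pd

X = pd.DataFrame({"Age": [18.0, 19.0, 20.0, 21.0]})
fold = 3

# distribution='gaussian', tail='right': mean + fold * std
gaussian_right = X["Age"].mean() + fold * X["Age"].std()    # 19.5 + 3 * 1.29 ≈ 23.37

# distribution='skewed', tail='right': Q3 + fold * IQR
IQR = X["Age"].quantile(0.75) - X["Age"].quantile(0.25)     # 20.25 - 18.75 = 1.5
skewed_right = X["Age"].quantile(0.75) + IQR * fold         # 20.25 + 4.5 = 24.75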
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Checks that the variables are numerical and creates the imputer
    dictionary.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        The user can pass the entire dataframe, not just the variables to
        impute.

    y : None
        y is not needed in this imputation. You can pass None or y.

    Attributes
    ----------
    imputer_dict_ : dictionary
        The dictionary containing the values that will replace each variable.
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find or check for numerical variables
    if self.imputer_dict:
        self.variables = _find_numerical_variables(X, self.imputer_dict.keys())
    else:
        self.variables = _find_numerical_variables(X, self.variables)

    # create the imputer dictionary
    if self.imputer_dict:
        self.imputer_dict_ = self.imputer_dict
    else:
        self.imputer_dict_ = {var: self.arbitrary_number for var in self.variables}

    self.input_shape_ = X.shape

    return self
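# A hypothetical usage sketch for the imputer whose fit method is shown above,
# assuming it is feature_engine's ArbitraryNumberImputer; the import path and
# the data are illustrative.
import numpy as np
import pandas as pd
from feature_engine.imputation import ArbitraryNumberImputer

X = pd.DataFrame({"Age": [20.0, np.nan, 19.0], "Marks": [0.9, 0.8, np.nan]})

# a different replacement value per variable via imputer_dict
imputer = ArbitraryNumberImputer(imputer_dict={"Age": -1, "Marks": 0.0})
imputer.fit(X)
print(imputer.imputer_dict_)   # {'Age': -1, 'Marks': 0.0}
X_t = imputer.transform(X)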
def fit(self, X, y=None):
    # check input dataframe
    X = _is_dataframe(X)

    # find or check for numerical variables
    self.variables = _find_numerical_variables(X, self.variables)

    # check if dataset contains na
    _check_contains_na(X, self.variables)

    return self
def fit(self, X, y):
    """
    Args
    ----
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe

    y: array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find numerical variables or check variables entered by user
    self.variables = _find_numerical_variables(X, self.variables)

    # list to collect selected features
    self.selected_features_ = []

    self.feature_performance_ = {}

    # train a model for every feature and store its performance
    for feature in self.variables:
        model = cross_validate(
            self.estimator,
            X[feature].to_frame(),
            y,
            cv=self.cv,
            return_estimator=False,
            scoring=self.scoring,
        )

        if model["test_score"].mean() > self.threshold:
            self.selected_features_.append(feature)

        self.feature_performance_[feature] = model["test_score"].mean()

    self.input_shape_ = X.shape

    return self
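# A hypothetical usage sketch for the selector above, assuming it is
# feature_engine's SelectBySingleFeaturePerformance; the estimator, data and
# threshold are illustrative.
import pandas as pd
from sklearn.linear_model import LinearRegression
from feature_engine.selection import SelectBySingleFeaturePerformance

X = pd.DataFrame({"x1": range(20), "x2": [1.0] * 20})
y = pd.Series([2.0 * i + 1.0 for i in range(20)])

sel = SelectBySingleFeaturePerformance(
    estimator=LinearRegression(), scoring="r2", cv=3, threshold=0.5,
)
sel.fit(X, y)
print(sel.selected_features_)     # features whose single-feature r2 beats 0.5
print(sel.feature_performance_)   # mean cross-validated score per feature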
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y : None
        y is not needed in this transformer. You can pass y or None.

    Attributes
    ----------
    right_tail_caps_ : dictionary
        The dictionary containing the maximum values at which variables will
        be capped.

    left_tail_caps_ : dictionary
        The dictionary containing the minimum values at which variables will
        be capped.
    """
    X = _is_dataframe(X)

    if self.missing_values == "raise":
        # check if dataset contains na
        _check_contains_na(X, self.variables)

    # find or check for numerical variables
    self.variables = _find_numerical_variables(X, self.variables)

    if self.max_capping_dict is not None:
        self.right_tail_caps_ = self.max_capping_dict
    else:
        self.right_tail_caps_ = {}

    if self.min_capping_dict is not None:
        self.left_tail_caps_ = self.min_capping_dict
    else:
        self.left_tail_caps_ = {}

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    The `fit` method allows Scikit-learn transformers to learn the required
    parameters from the training data set.

    If the transformer is the OneHotEncoder, the OrdinalEncoder or the
    SimpleImputer, all variables indicated in the `variables` parameter will
    be transformed. When the `variables` parameter is None, the SklearnWrapper
    will automatically select and transform all features in the dataset,
    numerical or otherwise.

    For all other Scikit-learn transformers, only numerical variables will be
    transformed. The SklearnWrapper will check that the variables indicated in
    the `variables` parameter are numerical, or alternatively, if `variables`
    is None, it will automatically select the numerical variables in the data
    set.

    Args:
        X: Pandas DataFrame to fit the transformer.
        y: This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline. Defaults to None.

    Returns:
        self
    """
    # check input dataframe
    X = _is_dataframe(X)

    if isinstance(self.transformer, (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
        self.variables = _find_all_variables(X, self.variables)
    else:
        self.variables = _find_numerical_variables(X, self.variables)

    self.transformer.fit(X[self.variables])

    self.input_shape_ = X.shape

    return self
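# A hypothetical usage sketch for the wrapper above, assuming the enclosing
# class is feature_engine's SklearnTransformerWrapper; the import path, data
# and parameters are illustrative.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from feature_engine.wrappers import SklearnTransformerWrapper

df = pd.DataFrame({
    "Age": [20, 21, 19, 18],
    "Name": ["tom", "nick", "krish", "jack"],
})

# StandardScaler is not one of the special-cased transformers, so with
# variables=None only the numerical column "Age" should be selected
wrapper = SklearnTransformerWrapper(transformer=StandardScaler(), variables=None)
wrapper.fit(df)
print(wrapper.variables)   # ['Age']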
def fit(self, X, y=None):
    """
    Learns the mean or median values.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        The user can pass the entire dataframe, not just the variables that
        need imputation.

    y : pandas series or None, default=None
        y is not needed in this imputation. You can pass None or y.

    Attributes
    ----------
    imputer_dict_ : dictionary
        The dictionary containing the mean / median values per variable.
        These values will be used by the imputer to replace missing data.
        The imputer_dict_ is created when fitting the imputer.
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find or check for numerical variables
    self.variables = _find_numerical_variables(X, self.variables)

    # find imputation parameters: mean or median
    if self.imputation_method == 'mean':
        self.imputer_dict_ = X[self.variables].mean().to_dict()
    elif self.imputation_method == 'median':
        self.imputer_dict_ = X[self.variables].median().to_dict()

    self.input_shape_ = X.shape

    return self
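# A hand-worked sketch of the dictionary the fit method above builds, assuming
# imputation_method='mean'; the data is illustrative only.
import numpy as np
import pandas as pd

X = pd.DataFrame({
    "Age": [20.0, np.nan, 19.0, 21.0],
    "Marks": [0.9, 0.8, np.nan, 0.7],
})

# pandas' mean() skips NaN by default, so only observed values count
imputer_dict_ = X[["Age", "Marks"]].mean().to_dict()
print(imputer_dict_)   # {'Age': 20.0, 'Marks': 0.8} (approximately)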
def fit(self, X, y):
    """
    Args
    ----
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe

    y: array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find numerical variables or check variables entered by user
    self.variables = _find_numerical_variables(X, self.variables)

    # train model with all features and cross-validation
    model = cross_validate(
        self.estimator,
        X[self.variables],
        y,
        cv=self.cv,
        scoring=self.scoring,
        return_estimator=True,
    )

    # store initial model performance
    self.initial_model_performance_ = model["test_score"].mean()

    # dataframe that will contain the feature importance / coefficient values
    # for each cross-validation fold: one column per fold
    feature_importances_cv = pd.DataFrame()

    # populate feature_importances_cv with the importance values of each
    # model returned by the cross-validation
    for m in model["estimator"]:
        feature_importances_cv[m] = get_feature_importances(m)

    # add the variables as index to feature_importances_cv
    feature_importances_cv.index = self.variables

    # aggregate the feature importance returned in each fold
    self.feature_importances_ = feature_importances_cv.mean(axis=1)

    # sort the feature importance values, least important first
    self.feature_importances_.sort_values(ascending=True, inplace=True)

    # list to collect selected features
    self.selected_features_ = []

    # temporary copy where we will remove features recursively
    X_tmp = X[self.variables].copy()

    # we need to update the performance as we remove features
    baseline_model_performance = self.initial_model_performance_

    # dict to collect features and their performance drift after removal
    self.performance_drifts_ = {}

    # evaluate every feature, starting from the least important;
    # feature_importances_ is already sorted
    for feature in list(self.feature_importances_.index):

        # remove the feature and train a new model
        model_tmp = cross_validate(
            self.estimator,
            X_tmp.drop(columns=feature),
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=False,
        )

        # new model performance
        model_tmp_performance = model_tmp["test_score"].mean()

        # calculate performance drift
        performance_drift = baseline_model_performance - model_tmp_performance

        # save feature and performance drift
        self.performance_drifts_[feature] = performance_drift

        if performance_drift > self.threshold:
            # removing the feature hurts performance: keep it
            self.selected_features_.append(feature)
        else:
            # drop the feature and refresh the baseline performance
            X_tmp = X_tmp.drop(columns=feature)

            baseline_model = cross_validate(
                self.estimator,
                X_tmp,
                y,
                cv=self.cv,
                return_estimator=False,
                scoring=self.scoring,
            )

            baseline_model_performance = baseline_model["test_score"].mean()

    self.input_shape_ = X.shape

    return self
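# A hypothetical usage sketch for the recursive selector above, assuming it is
# feature_engine's RecursiveFeatureElimination; the estimator, dataset and
# threshold are illustrative.
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from feature_engine.selection import RecursiveFeatureElimination

data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

sel = RecursiveFeatureElimination(
    estimator=LinearRegression(), scoring="r2", cv=3, threshold=0.01,
)
sel.fit(X, y)
print(sel.initial_model_performance_)   # baseline cross-validated r2
print(sel.performance_drifts_)          # r2 drop caused by removing each feature
print(sel.selected_features_)           # features whose removal hurt more than 0.01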
def fit(self, X, y=None):
    """
    Finds the correlated features.

    Args:
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just
            the variables to transform.
        y: It is not needed in this transformer. Defaults to None.
            Alternatively takes a Pandas Series.

    Returns:
        self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find all numerical variables or check those entered are in the dataframe
    self.variables = _find_numerical_variables(X, self.variables)

    # set to collect features that are correlated
    self.correlated_features_ = set()

    # sets of correlated feature groups
    self.correlated_feature_sets_ = []

    # the correlation matrix
    self.correlated_matrix_ = X[self.variables].corr(method=self.method)

    # set of examined features, used to determine the feature combinations
    # to evaluate below
    _examined_features = set()

    # for each feature in the dataset (columns of the correlation matrix)
    for feature in self.correlated_matrix_.columns:

        if feature not in _examined_features:

            # add it so we can exclude it when we create the combinations
            _examined_features.add(feature)

            # here we collect potentially correlated features; we need this
            # for the correlated group sets
            _temp_set = set([feature])

            # features that have not been examined, are not currently examined
            # and were not found correlated
            _features_to_compare = [
                f for f in self.correlated_matrix_.columns
                if f not in _examined_features
            ]

            # create combinations
            for f2 in _features_to_compare:

                # if the absolute correlation coefficient is above the
                # threshold, flag the feature (f2) as correlated
                if abs(self.correlated_matrix_.loc[f2, feature]) > self.threshold:
                    self.correlated_features_.add(f2)
                    _temp_set.add(f2)
                    _examined_features.add(f2)

            # if there are correlated features, save the group
            if len(_temp_set) > 1:
                self.correlated_feature_sets_.append(_temp_set)

    self.input_shape_ = X.shape

    return self
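# A tiny, hand-checkable sketch of the grouping logic above: x2 is an exact
# copy of x1, so with a threshold of 0.8 they form one correlated group and x2
# is flagged for removal, while the independent x3 stays on its own. Data and
# names are illustrative only.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame({"x1": rng.normal(size=100)})
X["x2"] = X["x1"]                  # perfectly correlated with x1
X["x3"] = rng.normal(size=100)     # independent noise

corr = X.corr(method="pearson")
print(abs(corr.loc["x2", "x1"]) > 0.8)   # True  -> group {'x1', 'x2'}, drop x2
print(abs(corr.loc["x3", "x1"]) > 0.8)   # False -> x3 is kept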
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learns the values that should be used to replace outliers.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y : None
        y is not needed in this transformer. You can pass y or None.

    Attributes
    ----------
    right_tail_caps_ : dictionary
        The dictionary containing the maximum values at which variables will
        be capped.

    left_tail_caps_ : dictionary
        The dictionary containing the minimum values at which variables will
        be capped.
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find or check for numerical variables
    self.variables = _find_numerical_variables(X, self.variables)

    if self.missing_values == "raise":
        # check if dataset contains na
        _check_contains_na(X, self.variables)

    self.right_tail_caps_ = {}
    self.left_tail_caps_ = {}

    # estimate the right-tail caps
    if self.tail in ["right", "both"]:
        if self.capping_method == "gaussian":
            self.right_tail_caps_ = (
                X[self.variables].mean() + self.fold * X[self.variables].std()
            ).to_dict()

        elif self.capping_method == "iqr":
            IQR = X[self.variables].quantile(0.75) - X[self.variables].quantile(0.25)
            self.right_tail_caps_ = (
                X[self.variables].quantile(0.75) + IQR * self.fold
            ).to_dict()

        elif self.capping_method == "quantiles":
            self.right_tail_caps_ = X[self.variables].quantile(1 - self.fold).to_dict()

    # estimate the left-tail caps
    if self.tail in ["left", "both"]:
        if self.capping_method == "gaussian":
            self.left_tail_caps_ = (
                X[self.variables].mean() - self.fold * X[self.variables].std()
            ).to_dict()

        elif self.capping_method == "iqr":
            IQR = X[self.variables].quantile(0.75) - X[self.variables].quantile(0.25)
            self.left_tail_caps_ = (
                X[self.variables].quantile(0.25) - IQR * self.fold
            ).to_dict()

        elif self.capping_method == "quantiles":
            self.left_tail_caps_ = X[self.variables].quantile(self.fold).to_dict()

    self.input_shape_ = X.shape

    return self
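# A hand-worked sketch of the 'quantiles' branch above, assuming fold=0.05:
# the caps sit at the 5th and 95th percentiles of each variable. The data is
# illustrative only.
import pandas as pd

X = pd.DataFrame({"Age": list(range(1, 101))})   # values 1..100
fold = 0.05

right_cap = X["Age"].quantile(1 - fold)   # 95.05
left_cap = X["Age"].quantile(fold)        # 5.95

# at transform time, values beyond the caps would be clipped, e.g.:
X_capped = X["Age"].clip(lower=left_cap, upper=right_cap)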
def fit(self, X, y):
    """
    Args
    ----
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe

    y: array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # reset the index
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    # find numerical variables or check variables entered by user
    self.variables = _find_numerical_variables(X, self.variables)

    # train model with all features and cross-validation
    model = cross_validate(
        self.estimator,
        X,
        y,
        cv=self.cv,
        return_estimator=True,
        scoring=self.scoring,
    )

    # store initial model performance
    self.initial_model_performance_ = model["test_score"].mean()

    # get performance metric
    scorer = get_scorer(self.scoring)

    # dict to collect features and their performance drift after shuffling
    self.performance_drifts_ = {}

    # list to collect selected features
    self.selected_features_ = []

    # shuffle features and save the performance drift of each into a dict
    for feature in self.variables:

        X_shuffled = X.copy()

        # shuffle the individual feature
        X_shuffled[feature] = (
            X_shuffled[feature].sample(frac=1).reset_index(drop=True)
        )

        # determine the performance with the shuffled feature
        performance = np.mean(
            [scorer(m, X_shuffled, y) for m in model["estimator"]]
        )

        # determine drift in performance.
        # Note: sklearn negates log and error scores, so there is no need to
        # invert them manually. See "The scoring parameter: defining model
        # evaluation rules" at
        # https://scikit-learn.org/stable/modules/model_evaluation.html
        performance_drift = self.initial_model_performance_ - performance

        # save feature and performance drift
        self.performance_drifts_[feature] = performance_drift

    # select the features whose drift exceeds the threshold
    for feature in self.performance_drifts_.keys():
        if self.performance_drifts_[feature] > self.threshold:
            self.selected_features_.append(feature)

    self.input_shape_ = X.shape

    return self
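# A miniature, self-contained sketch of the shuffling idea implemented above:
# permute one column, re-score the already-fitted model, and read the score
# drop as that feature's importance. The estimator and data are illustrative.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

rng = np.random.RandomState(42)
X = pd.DataFrame({
    "signal": rng.normal(size=200),
    "noise": rng.normal(size=200),
})
y = 3 * X["signal"] + rng.normal(scale=0.1, size=200)

model = LinearRegression().fit(X, y)
baseline = r2_score(y, model.predict(X))

for feature in X.columns:
    X_shuffled = X.copy()
    X_shuffled[feature] = (
        X_shuffled[feature].sample(frac=1, random_state=0).reset_index(drop=True)
    )
    drift = baseline - r2_score(y, model.predict(X_shuffled))
    print(feature, round(drift, 3))   # large drift for "signal", near 0 for "noise"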