def test_find_all_variables(df_vartypes):
    """Exercise `_find_all_variables` with no selection, a valid one and a bad one."""
    every_column = ["Name", "City", "Age", "Marks", "dob"]
    requested = ["Name", "City"]
    absent_columns = ["Grades"]

    # without a selection the five fixture columns are expected back
    assert _find_all_variables(df_vartypes) == every_column

    # an explicit selection is expected back unchanged
    assert _find_all_variables(df_vartypes, ["Name", "City"]) == requested

    # a column that is not in the fixture is expected to raise TypeError
    with pytest.raises(TypeError):
        assert _find_all_variables(df_vartypes, absent_columns)
def test_find_all_variables(df_vartypes):
    """Exercise `_find_all_variables`, including the `exclude_datetime` flag."""
    every_column = ["Name", "City", "Age", "Marks", "dob"]
    without_datetime = ["Name", "City", "Age", "Marks"]
    requested = ["Name", "City"]
    absent_columns = ["Grades"]

    # without a selection the five fixture columns are expected back
    assert _find_all_variables(df_vartypes) == every_column

    # with exclude_datetime=True the "dob" column is expected to be left out
    assert (
        _find_all_variables(df_vartypes, exclude_datetime=True) == without_datetime
    )

    # an explicit selection is expected back unchanged
    assert _find_all_variables(df_vartypes, ["Name", "City"]) == requested

    # a column that is not in the fixture is expected to raise KeyError
    with pytest.raises(KeyError):
        assert _find_all_variables(df_vartypes, absent_columns)
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the variables for which missing indicators will be created.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas Series, default=None
        Not used by this method. You can pass None or y.

    Returns
    -------
    self
    """
    # validate the input through the shared helper
    X = check_X(X)

    # start from all variables, or from the ones the user selected
    self.variables_ = _find_all_variables(X, self.variables)

    # optionally narrow the selection to columns that have at least one null
    if self.missing_only is True:
        self.variables_ = [
            column for column in self.variables_ if X[column].isnull().sum() > 0
        ]

    self._get_feature_names_in(X)

    return self
def _check_or_select_variables(self, X: pd.DataFrame):
    """
    Populate `self.variables_` with the variables to work on.

    When `ignore_format` is falsy the selection is delegated to
    `_find_or_check_categorical_variables`; otherwise it is delegated to
    `_find_all_variables`. Both helpers receive `X` and `self.variables`.

    Parameters
    ----------
    X: Pandas DataFrame
        The dataframe in which the variables are looked up.
    """
    if self.ignore_format:
        # any variable type is accepted
        self.variables_: List[Union[str, int]] = _find_all_variables(
            X, self.variables
        )
    else:
        # restrict to (or validate) categorical variables
        self.variables_ = _find_or_check_categorical_variables(X, self.variables)
def fit(self, X, y=None):
    """
    Fit the wrapped Scikit-learn transformer on the selected variables.

    If the transformer is a OneHotEncoder, OrdinalEncoder or SimpleImputer, the
    variables are resolved with `_find_all_variables`; for every other
    transformer they are resolved with `_find_numerical_variables`. The wrapped
    transformer is then fitted on those columns only.

    Parameters
    ----------
    X:
        The dataset used to fit the transformer.

    y: default=None
        Not used by this method.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # pick the variable-selection helper that matches the wrapped transformer
    takes_any_variable = isinstance(
        self.transformer, (OneHotEncoder, OrdinalEncoder, SimpleImputer)
    )
    if takes_any_variable:
        chosen = _find_all_variables(X, self.variables)
    else:
        chosen = _find_numerical_variables(X, self.variables)
    self.variables = chosen

    self.transformer.fit(X[self.variables])

    # remember the shape of the frame seen during fit
    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Find the variables in which missing data is evaluated to decide whether a
    row should be dropped.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training data set.

    y: pandas Series, default=None
        Not used by this method. You can pass None or y.

    Returns
    -------
    self
    """
    # validate the input through the shared helper
    X = check_X(X)

    # start from all variables, or from the ones the user selected
    self.variables_ = _find_all_variables(X, self.variables)

    # missing_only is only honoured when no threshold was given
    restrict_to_columns_with_na = (
        self.threshold is None and self.missing_only is True
    )
    if restrict_to_columns_with_na:
        self.variables_ = [
            column for column in self.variables_ if X[column].isnull().sum() > 0
        ]

    self._get_feature_names_in(X)

    return self
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find constant and quasi-constant features.

    A feature is flagged when the share of rows taken by its most frequent
    value is greater than or equal to `tol`. When `tol == 1` only features with
    exactly one unique value are flagged.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y : None
        y is not needed for this transformer. You can pass y or None.

    Raises
    ------
    ValueError
        If every column of the dataframe would be dropped.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find all variables or check those entered are present in the dataframe
    self.variables = _find_all_variables(X, self.variables)

    if self.missing_values == "raise":
        # check if dataset contains na
        _check_contains_na(X, self.variables)

    if self.missing_values == "include":
        # treat NaN as a category of its own so it takes part in the counts
        # NOTE(review): this writes into X; whether the caller's frame is
        # affected depends on whether _is_dataframe returns a copy - confirm.
        X[self.variables] = X[self.variables].fillna("missing_values")

    # find constant features
    if self.tol == 1:
        self.features_to_drop_ = [
            feature for feature in self.variables if X[feature].nunique() == 1
        ]

    # find constant and quasi-constant features
    else:
        self.features_to_drop_ = []

        for feature in self.variables:
            # share of rows taken by the most frequent value / category.
            # Builtin float replaces the np.float alias, which NumPy removed.
            predominant = (
                (X[feature].value_counts() / float(len(X)))
                .sort_values(ascending=False)
                .values[0]
            )

            if predominant >= self.tol:
                self.features_to_drop_.append(feature)

    # check we are not dropping all the columns in the df
    if len(self.features_to_drop_) == len(X.columns):
        raise ValueError(
            "The resulting dataframe will have no columns after dropping all "
            "constant or quasi-constant features. Try changing the tol value."
        )

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the value used to impute each variable.

    With `imputation_method == "missing"` every variable is mapped to
    `fill_value`. With `imputation_method == "frequent"` every variable is
    mapped to its single mode.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas Series, default=None
        Not used by this method. You can pass None or y.

    Raises
    ------
    ValueError
        If the imputation method is "frequent" and a variable does not have
        exactly one mode.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # resolve the variables to impute
    if self.ignore_format:
        # any variable type is accepted
        self.variables_: List[Union[str, int]] = _find_all_variables(
            X, self.variables
        )
    else:
        # restrict to (or validate) categorical variables
        self.variables_ = _find_or_check_categorical_variables(X, self.variables)

    if self.imputation_method == "missing":
        self.imputer_dict_ = dict.fromkeys(self.variables_, self.fill_value)

    elif self.imputation_method == "frequent":
        self.imputer_dict_ = {}

        for column in self.variables_:
            modes = X[column].mode()

            # a variable may have several modes; only a unique one is accepted
            if len(modes) != 1:
                raise ValueError(
                    "Variable {} contains multiple frequent categories.".format(
                        column
                    )
                )
            self.imputer_dict_[column] = modes[0]

    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[str] = None):
    """
    Fit a clone of the wrapped Scikit-learn transformer to the selected variables.

    For OneHotEncoder, OrdinalEncoder and SimpleImputer the variables are
    resolved with `_find_all_variables`; for any other transformer they are
    resolved with `_find_or_check_numerical_variables`.

    Parameters
    ----------
    X: Pandas DataFrame
        The dataset to fit the transformer.

    y: pandas Series, default=None
        The target variable. It is forwarded to the transformer's `fit`.

    Raises
    ------
    AttributeError
        If the wrapped transformer is a OneHotEncoder whose `sparse` attribute
        is truthy.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # work on a clone so the transformer passed by the user is left untouched
    self.transformer_ = clone(self.transformer)
    wrapped_name = self.transformer_.__class__.__name__

    if wrapped_name == "OneHotEncoder" and self.transformer_.sparse:
        raise AttributeError(
            "The SklearnTransformerWrapper can only wrap the OneHotEncoder if you "
            "set its sparse attribute to False"
        )

    # these three transformers are allowed to receive non-numerical variables
    if wrapped_name in ("OneHotEncoder", "OrdinalEncoder", "SimpleImputer"):
        self.variables_ = _find_all_variables(X, self.variables)
    else:
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    self.transformer_.fit(X[self.variables_], y)

    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X, y=None):
    """
    Find constant and quasi-constant features.

    A feature is flagged when the share of rows taken by its most frequent
    value is greater than or equal to `tol`.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y: None
        y is not needed for this transformer. You can pass y or None.

    Attributes
    ----------
    constant_features_: list
        The list of constant and quasi-constant features.

    Raises
    ------
    ValueError
        If every column of the dataframe is flagged.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find all variables or check those entered are present in the dataframe
    self.variables = _find_all_variables(X, self.variables)

    # find constant and quasi-constant
    self.constant_features_ = []

    for feature in self.variables:
        # share of rows taken by the most frequent value.
        # Builtin float replaces the np.float alias, which NumPy removed.
        predominant = (
            (X[feature].value_counts() / float(len(X)))
            .sort_values(ascending=False)
            .values[0]
        )

        if predominant >= self.tol:
            self.constant_features_.append(feature)

    # if total constant features is equal to total features raise an error
    if len(self.constant_features_) == len(X.columns):
        raise ValueError(
            "The resulting dataframe will have no columns after dropping all "
            "constant features."
        )

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[str] = None):
    """
    Fit the wrapped Scikit-learn transformer on the selected variables.

    If the transformer is a OneHotEncoder, OrdinalEncoder or SimpleImputer, the
    variables are resolved with `_find_all_variables`; for every other
    transformer they are resolved with `_find_or_check_numerical_variables`.
    The wrapped transformer is then fitted on those columns only.

    Parameters
    ----------
    X : Pandas DataFrame
        The dataset to fit the transformer.

    y : pandas Series, default=None
        Not used by this method; it is not forwarded to the transformer.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # pick the variable-selection helper that matches the wrapped transformer
    takes_any_variable = isinstance(
        self.transformer, (OneHotEncoder, OrdinalEncoder, SimpleImputer)
    )
    if takes_any_variable:
        chosen = _find_all_variables(X, self.variables)
    else:
        chosen = _find_or_check_numerical_variables(X, self.variables)
    self.variables = chosen

    self.transformer.fit(X[self.variables])

    # remember the shape of the frame seen during fit
    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: Optional[str] = None):
    """
    Fit a clone of the wrapped Scikit-learn transformer to the selected variables.

    For OneHotEncoder, OrdinalEncoder, SimpleImputer and FunctionTransformer
    the variables are resolved with `_find_all_variables`; for any other
    transformer they are resolved with `_find_or_check_numerical_variables`.
    When the transformer's class name is listed in `_SELECTORS`, the variables
    not reported by `get_support()` are stored in `features_to_drop_`.

    Parameters
    ----------
    X: Pandas DataFrame
        The dataset to fit the transformer.

    y: pandas Series, default=None
        The target variable. It is forwarded to the transformer's `fit`.

    Returns
    -------
    self
    """
    # validate the input through the shared helper
    X = check_X(X)

    # work on a clone so the transformer passed by the user is left untouched
    self.transformer_ = clone(self.transformer)
    wrapped_name = self.transformer_.__class__.__name__

    # these transformers are allowed to receive non-numerical variables
    any_dtype_transformers = (
        "OneHotEncoder",
        "OrdinalEncoder",
        "SimpleImputer",
        "FunctionTransformer",
    )
    if wrapped_name in any_dtype_transformers:
        self.variables_ = _find_all_variables(X, self.variables)
    else:
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    self.transformer_.fit(X[self.variables_], y)

    if self.transformer_.__class__.__name__ in _SELECTORS:
        # columns kept according to the selector's support mask
        kept = X[self.variables_].columns[self.transformer_.get_support()]
        self.features_to_drop_ = [
            name for name in self.variables_ if name not in kept
        ]

    # save input features
    self.feature_names_in_ = X.columns.tolist()
    self.n_features_in_ = X.shape[1]

    return self
def _check_fit_input_and_variables(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Validate the input, resolve the variables to work on and check for NA.

    The input is passed through `_is_dataframe`. `self.variables_` is then set
    with `_find_or_check_categorical_variables` when `ignore_format` is falsy,
    or with `_find_all_variables` otherwise. Finally `_check_contains_na` is
    called on the selected variables.

    Parameters
    ----------
    X: Pandas DataFrame
        The dataframe to validate.

    Returns
    -------
    X: Pandas DataFrame
        The value returned by `_is_dataframe`.
    """
    # check input dataframe
    X = _is_dataframe(X)

    if self.ignore_format:
        # any variable type is accepted
        self.variables_: List[Union[str, int]] = _find_all_variables(
            X, self.variables
        )
    else:
        # restrict to (or validate) categorical variables
        self.variables_ = _find_or_check_categorical_variables(X, self.variables)

    # the selected variables are checked for missing data
    _check_contains_na(X, self.variables_)

    return X
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Store a copy of the variables to impute.

    The copy is kept in `self.X_`. When `seed == "observation"`, the variables
    given in `random_state` are also validated against the columns of X.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: None
        y is not needed in this imputation. You can pass None or y.

    Raises
    ------
    ValueError
        If `seed == "observation"` and any variable in `random_state` is not a
        column of X.

    Returns
    -------
    self
    """
    # check input dataframe
    X = check_X(X)

    # find variables to impute
    self.variables_ = _find_all_variables(X, self.variables)

    # take a copy of the selected variables
    self.X_ = X[self.variables_].copy()

    # check the variables assigned to the random state
    if self.seed == "observation":
        self.random_state = _check_input_parameter_variables(self.random_state)

        # a single variable name is wrapped so it can be iterated below
        if isinstance(self.random_state, (int, str)):
            self.random_state = [self.random_state]

        # Test the membership condition itself. Testing the truthiness of the
        # missing name would let a falsy name such as 0 or "" go undetected.
        if self.random_state and any(
            var not in X.columns for var in self.random_state
        ):
            raise ValueError(
                "There are variables assigned as random state which are not part "
                "of the training dataframe."
            )

    self._get_feature_names_in(X)

    return self
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features.

    Each selected variable is scored on the held-out fold of a stratified
    cross-validation after passing through a pipeline built by one of the
    `_make_*_pipeline` methods. Scores are averaged over folds, and variables
    whose mean score is below `threshold` are stored in `features_to_drop_`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y : array-like of shape (n_samples)
        Target variable. It is indexed with `.iloc`, so it needs to support it.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # check variables
    self.variables = _find_all_variables(X, self.variables)

    # check if df contains na
    _check_contains_na(X, self.variables)

    # from here on only the selected variables are used
    X = X[self.variables].copy()

    # split the variables by dtype
    self.variables_categorical_ = list(X.select_dtypes(include="O").columns)
    self.variables_numerical_ = list(
        X.select_dtypes(include=["float", "integer"]).columns
    )

    # cross-validation splitter
    skf = StratifiedKFold(
        n_splits=self.cv, shuffle=True, random_state=self.random_state
    )
    skf.get_n_splits(X, y)

    # choose the pipeline that matches the variable types found
    if not self.variables_categorical_:
        _pipeline = self._make_numerical_pipeline()
    elif self.variables_numerical_:
        _pipeline = self._make_combined_pipeline()
    else:
        _pipeline = self._make_categorical_pipeline()

    # metric applied to every variable on every fold
    metric = roc_auc_score if self.scoring == "roc_auc_score" else r2_score

    # one Series of per-variable scores for each fold
    scores_per_fold = []
    for train_index, test_index in skf.split(X, y):
        _pipeline.fit(X.iloc[train_index], y.iloc[train_index])

        held_out = _pipeline.transform(X.iloc[test_index])
        y_held_out = y.iloc[test_index]

        fold_scores = {
            name: metric(y_held_out, held_out[name]) for name in self.variables
        }
        scores_per_fold.append(pd.Series(fold_scores))

    # variables as rows, folds as columns; average across folds
    scores_table = pd.concat(scores_per_fold, axis=1)
    self.feature_performance_ = scores_table.mean(axis=1).to_dict()  # type: ignore

    self.features_to_drop_ = [
        name
        for name in self.variables
        if self.feature_performance_[name] < self.threshold
    ]

    # NOTE(review): X was narrowed to self.variables above, so this is the
    # shape of the subset rather than of the frame passed in - confirm intended.
    self.input_shape_ = X.shape

    return self
def fit(self, X, y=None):
    """
    Find duplicated features.

    Two features are considered duplicated when `Series.equals` returns True
    for their columns. For each group of duplicates, the first feature
    encountered is kept out of `duplicated_features_` and the others are
    added to it.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y: None
        y is not needed for this transformer. You can pass y or None.

    Attributes
    ----------
    duplicated_features_: set
        The duplicated features.

    duplicated_feature_sets_: list
        Groups of duplicated features. Each set holds features that are
        duplicates of each other.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find all variables or check those entered are in the dataframe
    self.variables = _find_all_variables(X, self.variables)

    # groups of duplicated features
    self.duplicated_feature_sets_ = []

    # set to collect features that are duplicated
    self.duplicated_features_ = set()

    # features already used as the reference of a comparison
    _examined_features = set()

    for feature in self.variables:
        # mark as examined so it is not compared against itself or revisited
        _examined_features.add(feature)

        if feature in self.duplicated_features_:
            # already known to duplicate an earlier feature
            continue

        _temp_set = {feature}

        # Build the exclusion set once per reference feature instead of
        # recomputing the union for every candidate in the comprehension.
        _excluded = _examined_features.union(self.duplicated_features_)

        # features that have not been examined and were not found duplicates
        _features_to_compare = [f for f in self.variables if f not in _excluded]

        for f2 in _features_to_compare:
            if X[feature].equals(X[f2]):
                self.duplicated_features_.add(f2)
                _temp_set.add(f2)

        # record the group only if at least one duplicate was found
        if len(_temp_set) > 1:
            self.duplicated_feature_sets_.append(_temp_set)

    self.input_shape_ = X.shape

    return self
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features.

    Each variable is evaluated on its own: a target-mean estimator restricted
    to that variable is cross-validated and the mean test score is stored in
    `feature_performance_`. Variables scoring below the threshold are stored in
    `features_to_drop_`. When `threshold` is None, the mean performance across
    all evaluated variables is used as the threshold.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y: array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Raises
    ------
    ValueError
        If a single variable is evaluated and `threshold` is None.

    Returns
    -------
    self
    """
    # check input dataframe
    X, y = check_X_y(X, y)

    # If required exclude variables that are not in the input dataframe
    self._confirm_variables(X)

    # find all variables or check those entered are present in the dataframe
    self.variables_ = _find_all_variables(X, self.variables_, exclude_datetime=True)

    if len(self.variables_) == 1 and self.threshold is None:
        raise ValueError(
            "When evaluating a single feature you need to manually set a value "
            "for the threshold. "
            f"The transformer is evaluating the performance of {self.variables_} "
            f"and the threshold was left to {self.threshold} when initializing "
            f"the transformer."
        )

    # save input features
    self._get_feature_names_in(X)

    # set up the correct estimator
    if self.regression is True:
        est = TargetMeanRegressor(
            bins=self.bins,
            strategy=self.strategy,
        )
    else:
        est = TargetMeanClassifier(
            bins=self.bins,
            strategy=self.strategy,
        )

    self.feature_performance_ = {}

    for variable in self.variables_:
        # clone estimator
        estimator = clone(est)

        # set the estimator to evaluate the required variable
        estimator.set_params(variables=variable)

        model = cross_validate(
            estimator,
            X,
            y,
            cv=self.cv,
            scoring=self.scoring,
        )

        self.feature_performance_[variable] = model["test_score"].mean()

    # Select features. Compare against None explicitly so that a user-supplied
    # threshold of 0 is honoured instead of being replaced by the mean.
    if self.threshold is None:
        threshold = pd.Series(self.feature_performance_).mean()
    else:
        threshold = self.threshold

    self.features_to_drop_ = [
        f for f in self.variables_ if self.feature_performance_[f] < threshold
    ]

    return self
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find duplicated features.

    Two features are considered duplicated when `Series.equals` returns True
    for their columns. For each group of duplicates, the first feature
    encountered is kept and the others are added to `features_to_drop_`. The
    groups themselves are stored in `duplicated_feature_sets_`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y : None
        y is not needed for this transformer. You can pass y or None.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find all variables or check those entered are in the dataframe
    self.variables = _find_all_variables(X, self.variables)

    if self.missing_values == "raise":
        # check if dataset contains na
        _check_contains_na(X, self.variables)

    # groups of duplicated features
    self.duplicated_feature_sets_ = []

    # set to collect features that are duplicated
    self.features_to_drop_ = set()  # type: ignore

    # features already used as the reference of a comparison
    _examined_features = set()

    for feature in self.variables:
        # mark as examined so it is not compared against itself or revisited
        _examined_features.add(feature)

        if feature in self.features_to_drop_:
            # already known to duplicate an earlier feature
            continue

        _temp_set = {feature}

        # Build the exclusion set once per reference feature instead of
        # recomputing the union for every candidate in the comprehension.
        _excluded = _examined_features.union(self.features_to_drop_)

        # features that have not been examined and were not found duplicates
        _features_to_compare = [f for f in self.variables if f not in _excluded]

        for f2 in _features_to_compare:
            if X[feature].equals(X[f2]):
                self.features_to_drop_.add(f2)
                _temp_set.add(f2)

        # record the group only if at least one duplicate was found
        if len(_temp_set) > 1:
            self.duplicated_feature_sets_.append(_temp_set)

    self.input_shape_ = X.shape

    return self