def test_error_if_input_df_contains_na_in_transform(df_vartypes, df_na):
    """Case 4: transform() must raise a ValueError when the dataset has NA."""
    with pytest.raises(ValueError):
        discretiser = EqualWidthDiscretiser()
        discretiser.fit(df_vartypes)
        discretiser.transform(df_na[["Name", "City", "Age", "Marks", "dob"]])
def test_non_fitted_error(df_vartypes):
    """Calling transform() before fit() must raise a NotFittedError."""
    with pytest.raises(NotFittedError):
        discretiser = EqualWidthDiscretiser()
        discretiser.transform(df_vartypes)
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find features with high PSI values.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y : pandas series. Default = None
        y is not needed in this transformer. You can pass y or None.

    Raises
    ------
    ValueError
        If, after splitting, the basis or test dataframe has fewer rows than
        the number of bins requested for the PSI computation.
    """
    # check input dataframe
    X = check_X(X)

    # If required exclude variables that are not in the input dataframe
    self._confirm_variables(X)

    # find numerical variables or check those entered are present in the dataframe
    self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

    # Remove the split_col from the variables list. It might be added if the
    # variables are not defined at initialization.
    if self.split_col in self.variables_:
        self.variables_.remove(self.split_col)

    if self.missing_values == "raise":
        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

    # Split the dataframe into basis and test.
    basis_df, test_df = self._split_dataframe(X)

    # Check the shape of the returned dataframes for PSI calculations.
    # The number of observations must be at least equal to the
    # number of bins.
    if min(basis_df.shape[0], test_df.shape[0]) < self.bins:
        # NOTE: the check is `< self.bins`, so the requirement is
        # "at least equal to bins" (fixed the previous message, which read
        # "at least larger than" and was missing a space before "split_frac").
        raise ValueError(
            "The number of rows in the basis and test datasets that will be used "
            f"in the PSI calculations must be at least equal to {self.bins}. "
            "After splitting the original dataset based on the given cut_off or "
            f"split_frac we have {basis_df.shape[0]} samples in the basis set, "
            f"and {test_df.shape[0]} samples in the test set. "
            "Please adjust the value of the cut_off or split_frac."
        )

    # Switch basis and test dataframes if required.
    if self.switch:
        test_df, basis_df = basis_df, test_df

    # set up the discretizer
    if self.strategy == "equal_width":
        bucketer = EqualWidthDiscretiser(bins=self.bins)
    else:
        bucketer = EqualFrequencyDiscretiser(q=self.bins)

    # Compute the PSI by looping over the features
    self.psi_values_ = {}
    self.features_to_drop_ = []

    for feature in self.variables_:
        # Discretize the features. NA rows are dropped because the
        # discretisers cannot handle missing data.
        basis_discrete = bucketer.fit_transform(basis_df[[feature]].dropna())
        test_discrete = bucketer.transform(test_df[[feature]].dropna())

        # Determine percentage of observations per bin
        basis_distrib, test_distrib = self._observation_frequency_per_bin(
            basis_discrete, test_discrete
        )

        # Calculate the PSI value
        self.psi_values_[feature] = np.sum(
            (test_distrib - basis_distrib) * np.log(test_distrib / basis_distrib)
        )

        # Assess if feature should be dropped
        if self.psi_values_[feature] > self.threshold:
            self.features_to_drop_.append(feature)

    # save input features
    self._get_feature_names_in(X)

    return self