def test_check_X_converts_numpy_to_pandas():
    """check_X turns a 2D numpy array into a DataFrame and rejects 1D/3D input."""
    arr_2d = np.array([[1, 2], [3, 4]])
    expected = pd.DataFrame(arr_2d, columns=["0", "1"])
    assert_frame_equal(expected, check_X(arr_2d))

    # Arrays that are not two-dimensional cannot become a dataframe.
    arr_3d = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
    with pytest.raises(ValueError):
        check_X(arr_3d)

    arr_1d = np.array([1, 2, 3, 4])
    with pytest.raises(ValueError):
        check_X(arr_1d)
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Common set-up of creation transformers.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y: pandas Series, or np.array. Defaults to None.
        It is not needed in this transformer. You can pass y or None.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # this transformer operates on numerical variables only
    self.variables: List[Union[str, int]] = _find_or_check_numerical_variables(
        X, self.variables
    )

    # optionally refuse datasets that contain NaN or inf
    if self.missing_values == "raise":
        _check_contains_na(X, self.variables)
        _check_contains_inf(X, self.variables)

    # record the input feature names and the train-set width
    self.feature_names_in_ = X.columns.tolist()
    self.n_features_in_ = X.shape[1]

    # NOTE: this common set-up returns the checked dataframe, not self.
    return X
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Return dataframe with selected features.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features].
        The input dataframe.

    Returns
    -------
    X_new: pandas dataframe of shape = [n_samples, n_selected_features]
        Pandas dataframe with the selected features.
    """
    # fit must have run before transform
    check_is_fitted(self)

    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # the test set must have as many columns as the train set
    _check_X_matches_training_df(X, self.n_features_in_)

    # align column order with the train set, then drop the selected features
    X = X[self.feature_names_in_]
    return X.drop(columns=self.features_to_drop_)
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the encodings or levels to use for representing categorical variables.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset. Can be the entire dataframe, not just the
        variables to be transformed.

    y: pandas Series, default = None
        y is not needed in this encoder. You can pass y or None.
    """
    X = check_X(X)
    self._check_or_select_variables(X)

    if self.missing_values == "raise":
        _check_contains_na(X, self.variables_)

    self._get_feature_names_in(X)

    # remember the categorical levels observed per variable during training
    self.category_dict_ = {
        var: pd.Categorical(X[var]).categories for var in self.variables_
    }

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This method does not learn any parameter.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: None
        y is not needed in this imputation. You can pass None or y.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # build the imputation dictionary: either the user-provided mapping,
    # or the same arbitrary number for every (found or checked) variable
    if self.imputer_dict:
        self.variables_ = _find_or_check_numerical_variables(
            X, self.imputer_dict.keys()  # type: ignore
        )
        self.imputer_dict_ = self.imputer_dict
    else:
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)
        self.imputer_dict_ = {
            var: self.arbitrary_number for var in self.variables_
        }

    self._get_feature_names_in(X)
    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn parameters.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas Series, default=None
        y is not needed in this transformer. You can pass None or y.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # the index must be unique and free of NaN; otherwise merging the new
    # features back would duplicate rows
    self._check_index(X)

    # this transformer operates on numerical variables only
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    # optionally refuse datasets that contain NaN or inf
    if self.missing_values == "raise":
        self._check_na_and_inf(X)

    self._get_feature_names_in(X)
    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the variables for which the missing indicators will be created.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas Series, default=None
        y is not needed in this imputation. You can pass None or y.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # variables for which an indicator should be added
    self.variables_ = _find_all_variables(X, self.variables)

    # optionally restrict to variables that actually show NaN in the train set
    if self.missing_only is True:
        self.variables_ = [
            var for var in self.variables_ if X[var].isnull().any()
        ]

    self._get_feature_names_in(X)
    return self
def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Common checks before transforming data:

    - Check transformer was fit
    - Check that the input is a dataframe
    - Check that input has same size than the train set used in fit()
    - Re-orders dataframe features if necessary

    Parameters
    ----------
    X: Pandas DataFrame

    Returns
    -------
    X: Pandas DataFrame
        The same dataframe entered by the user.
    """
    # fit must have run before transform
    check_is_fitted(self)

    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # the test set must have as many columns as the train set
    _check_X_matches_training_df(X, self.n_features_in_)

    # align column order with the train set
    X = X[self.feature_names_in_]

    return X
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the counts or frequencies which will be used to replace the categories.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset. Can be the entire dataframe, not just the
        variables to be transformed.

    y: pandas Series, default = None
        y is not needed in this encoder. You can pass y or None.
    """
    X = check_X(X)
    self._fit(X)
    self._get_feature_names_in(X)

    # with errors == "encode", categories unseen during fit must map to 0,
    # hence the mapping is backed by a defaultdict
    dct_init = defaultdict(lambda: 0) if self.errors == "encode" else {}

    # learn one mapping per variable
    self.encoder_dict_ = {}
    for var in self.variables_:
        if self.encoding_method == "count":
            self.encoder_dict_[var] = X[var].value_counts().to_dict(dct_init)
        elif self.encoding_method == "frequency":
            self.encoder_dict_[var] = (
                X[var].value_counts(normalize=True).to_dict(dct_init)
            )

    self._check_encoding_dictionary()
    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Find the variables for which missing data should be evaluated to decide if a
    row should be dropped.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training data set.

    y: pandas Series, default=None
        y is not needed in this imputation. You can pass None or y.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # variables for which missing data will be examined
    self.variables_ = _find_all_variables(X, self.variables)

    # missing_only applies only when no threshold was given
    if self.threshold is None and self.missing_only is True:
        self.variables_ = [
            var for var in self.variables_ if X[var].isnull().any()
        ]

    self._get_feature_names_in(X)
    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the mean or median values.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas series or None, default=None
        y is not needed in this imputation. You can pass None or y.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # this imputer operates on numerical variables only
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    # learn one imputation value per variable
    if self.imputation_method == "mean":
        self.imputer_dict_ = X[self.variables_].mean().to_dict()
    elif self.imputation_method == "median":
        self.imputer_dict_ = X[self.variables_].median().to_dict()

    self._get_feature_names_in(X)
    return self
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    This transformer does not learn any parameter.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe

    y : pandas Series, default = None
        y is not needed for this transformer. You can pass y or None.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # indexing X with features_to_drop delegates to pandas the check that
    # all the listed columns exist in the dataframe
    X[self.features_to_drop]

    self.features_to_drop_ = self.features_to_drop

    # refuse to drop every single column
    if len(self.features_to_drop) == len(X.columns):
        raise ValueError(
            "The resulting dataframe will have no columns after dropping all "
            "existing variables"
        )

    # record the input features
    self._get_feature_names_in(X)

    return self
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Common input and transformer checks.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X_new: Pandas dataframe
        The dataframe with the original variables plus the new variables.
    """
    # fit must have run before transform
    check_is_fitted(self)

    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # the test set must have as many columns as the train set
    _check_X_matches_training_df(X, self.n_features_in_)

    # optionally refuse datasets that contain NaN or inf
    if self.missing_values == "raise":
        _check_contains_na(X, self.variables)
        _check_contains_inf(X, self.variables)

    # align column order with the train set
    X = X[self.feature_names_in_]

    return X
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find constant and quasi-constant features.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The input dataframe.

    y: None
        y is not needed for this transformer. You can pass y or None.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # drop from scope any variables absent from this dataframe, if required
    self._confirm_variables(X)

    # find all variables, or check that the entered ones are present
    self.variables_ = _find_all_variables(X, self.variables_)

    if self.missing_values == "raise":
        _check_contains_na(X, self.variables_)

    # treat NaN as its own category so it participates in the counts
    if self.missing_values == "include":
        X[self.variables_] = X[self.variables_].fillna("missing_values")

    if self.tol == 1:
        # constant features only: a single unique value
        self.features_to_drop_ = [
            var for var in self.variables_ if X[var].nunique() == 1
        ]
    else:
        # constant and quasi-constant: share of the predominant value >= tol.
        # Note: dividing by len(X) (not the non-NaN count) is intentional.
        self.features_to_drop_ = []
        for var in self.variables_:
            predominant = (
                (X[var].value_counts() / float(len(X)))
                .sort_values(ascending=False)
                .values[0]
            )
            if predominant >= self.tol:
                self.features_to_drop_.append(var)

    # refuse to drop every single column
    if len(self.features_to_drop_) == len(X.columns):
        raise ValueError(
            "The resulting dataframe will have no columns after dropping all "
            "constant or quasi-constant features. Try changing the tol value."
        )

    # record the input features
    self._get_feature_names_in(X)

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn parameters. Perform dataframe checks.
    Creates dictionary of operation to new feature name pairs.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to transform.

    y: pandas Series, or np.array. Defaults to None.
        It is not needed in this transformer. You can pass y or None.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # the variables to combine must be numerical
    self.variables_to_combine = _find_or_check_numerical_variables(
        X, self.variables_to_combine
    )

    # optionally refuse datasets that contain NaN or inf
    if self.missing_values == "raise":
        _check_contains_na(X, self.variables_to_combine)
        _check_contains_inf(X, self.variables_to_combine)

    # default to the full set of supported aggregations
    if self.math_operations is None:
        self.math_operations_ = ["sum", "prod", "mean", "std", "max", "min"]
    else:
        self.math_operations_ = self.math_operations

    # map each new variable name to the operation that produces it
    if self.new_variables_names:
        self.combination_dict_ = dict(
            zip(self.new_variables_names, self.math_operations_)
        )
    else:
        # auto-generate names such as "sum(var1-var2)"
        if all(isinstance(var, str) for var in self.variables_to_combine):
            vars_ls = self.variables_to_combine
        else:
            vars_ls = [str(var) for var in self.variables_to_combine]

        self.combination_dict_ = {
            f"{operation}({'-'.join(vars_ls)})": operation  # type: ignore
            for operation in self.math_operations_
        }

    # record the input feature names and the train-set width
    self.feature_names_in_ = X.columns.tolist()
    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn any parameter. Finds datetime variables or
    checks that the variables selected by the user can be converted to datetime.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to transform.

    y: pandas Series, default=None
        It is not needed in this transformer. You can pass y or None.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    if self.variables == "index":
        # special case: extract the features from the dataframe index,
        # which must be datetime or castable to datetime
        index_is_datetime = is_datetime(X.index) or (
            not is_numeric(X.index) and _is_categorical_and_is_datetime(X.index)
        )
        if not index_is_datetime:
            raise TypeError("The dataframe index is not datetime.")

        if self.missing_values == "raise":
            self._check_index_contains_na(X.index)

        self.variables_ = None
    else:
        # find or check for datetime variables
        self.variables_ = _find_or_check_datetime_variables(X, self.variables)

        # optionally refuse datetime variables that contain NaN
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables_)

    # resolve which datetime features to extract
    if self.features_to_extract is None:
        self.features_to_extract_ = FEATURES_DEFAULT
    elif isinstance(self.features_to_extract, str):
        self.features_to_extract_ = FEATURES_SUPPORTED
    else:
        self.features_to_extract_ = self.features_to_extract

    # record the input feature names and the train-set width
    self.feature_names_in_ = X.columns.tolist()
    self.n_features_in_ = X.shape[1]

    return self
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Cap the variable values. Optionally, add outlier indicators.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to be transformed.

    Returns
    -------
    X_new: pandas dataframe of shape = [n_samples, n_features + n_ind]
        The dataframe with the capped variables and indicators.
        The number of output variables depends on the values for 'tail' and
        'add_indicators': if passing 'add_indicators=False', will be equal
        to 'n_features', otherwise, will have an additional indicator column
        per processed feature for each tail.
    """
    if not self.add_indicators:
        return super().transform(X)

    # keep a pre-capping copy so we can tell which values were modified
    X_orig = check_X(X)
    X_out = super().transform(X_orig)
    X_orig = X_orig[self.variables_]
    X_out_filtered = X_out[self.variables_]

    # a capped value differs from the original in the direction of the tail
    if self.tail in ["left", "both"]:
        X_left = X_out_filtered > X_orig
        X_left.columns = [f"{col}_left" for col in self.variables_]
    if self.tail in ["right", "both"]:
        X_right = X_out_filtered < X_orig
        X_right.columns = [f"{col}_right" for col in self.variables_]

    if self.tail == "left":
        X_out = pd.concat([X_out, X_left.astype(np.float64)], axis=1)
    elif self.tail == "right":
        X_out = pd.concat([X_out, X_right.astype(np.float64)], axis=1)
    else:
        # interleave the left/right indicators per variable
        X_both = pd.concat([X_left, X_right], axis=1).astype(np.float64)
        interleaved = [
            col
            for pair in zip(X_left.columns.values, X_right.columns.values)
            for col in pair
        ]
        X_both = X_both[interleaved]
        X_out = pd.concat([X_out, X_both], axis=1)

    return X_out
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the frequent categories for each variable.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just
        selected variables

    y: None
        y is not required. You can pass y or None.
    """
    X = check_X(X)
    self._fit(X)
    self._get_feature_names_in(X)

    self.encoder_dict_ = {}

    for var in self.variables_:
        if len(X[var].unique()) > self.n_categories:
            # enough distinct categories: keep only those whose relative
            # frequency reaches the tolerance
            freq = pd.Series(X[var].value_counts() / float(len(X)))
            frequent_idx = freq[freq >= self.tol].index

            if self.max_n_categories:
                self.encoder_dict_[var] = frequent_idx[: self.max_n_categories]
            else:
                self.encoder_dict_[var] = frequent_idx
        else:
            # too few categories overall: treat every category as frequent
            warnings.warn(
                "The number of unique categories for variable {} is less than that "
                "indicated in n_categories. Thus, all categories will be "
                "considered frequent".format(var)
            )
            self.encoder_dict_[var] = X[var].unique()

    self._check_encoding_dictionary()

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn any parameter.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to transform.

    y: pandas Series, or np.array. Default=None.
        It is not needed in this transformer. You can pass y or None.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # both the variables to combine and the references must be numerical
    self.variables_to_combine = _find_or_check_numerical_variables(
        X, self.variables_to_combine
    )
    self.reference_variables = _find_or_check_numerical_variables(
        X, self.reference_variables
    )

    # optionally refuse datasets that contain NaN or inf
    if self.missing_values == "raise":
        _check_contains_na(X, self.reference_variables)
        _check_contains_na(X, self.variables_to_combine)
        _check_contains_inf(X, self.reference_variables)
        _check_contains_inf(X, self.variables_to_combine)

    # division by zero would fail at transform time, so refuse it up front
    if "div" in self.operations:
        if X[self.reference_variables].isin([0]).any().any():
            raise ValueError(
                "Some of the reference variables contain 0 as values. Check and "
                "remove those before using this transformer with div."
            )

    # record the input feature names and the train-set width
    self.feature_names_in_ = X.columns.tolist()
    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Learn the values at the end of the variable distribution.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: pandas Series, default=None
        y is not needed in this imputation. You can pass None or y.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # this imputer operates on numerical variables only
    self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    # estimate one imputation value per variable at the requested tail
    if self.imputation_method == "max":
        self.imputer_dict_ = (X[self.variables_].max() * self.fold).to_dict()

    elif self.imputation_method == "gaussian":
        mean = X[self.variables_].mean()
        spread = self.fold * X[self.variables_].std()
        if self.tail == "right":
            self.imputer_dict_ = (mean + spread).to_dict()
        elif self.tail == "left":
            self.imputer_dict_ = (mean - spread).to_dict()

    elif self.imputation_method == "iqr":
        q1 = X[self.variables_].quantile(0.25)
        q3 = X[self.variables_].quantile(0.75)
        iqr = q3 - q1
        if self.tail == "right":
            self.imputer_dict_ = (q3 + iqr * self.fold).to_dict()
        elif self.tail == "left":
            self.imputer_dict_ = (q1 - iqr * self.fold).to_dict()

    self._get_feature_names_in(X)
    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn any parameter.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y: pandas Series, default=None
        y is not needed in this transformer. You can pass y or None.
    """
    X = check_X(X)

    # the variables to cap are the keys of whichever capping dicts were given
    if self.min_capping_dict is None and self.max_capping_dict:
        self.variables_ = list(self.max_capping_dict)
    elif self.max_capping_dict is None and self.min_capping_dict:
        self.variables_ = list(self.min_capping_dict)
    elif self.min_capping_dict and self.max_capping_dict:
        self.variables_ = list({**self.min_capping_dict, **self.max_capping_dict})

    # optionally refuse datasets that contain NaN or inf
    if self.missing_values == "raise":
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

    # the variables to cap must be numerical
    self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

    # store the capping values per tail (empty dict when no cap was given)
    self.right_tail_caps_ = (
        self.max_capping_dict if self.max_capping_dict is not None else {}
    )
    self.left_tail_caps_ = (
        self.min_capping_dict if self.min_capping_dict is not None else {}
    )

    self.feature_names_in_ = X.columns.to_list()
    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[str] = None):
    """
    Fits the Scikit-learn transformer to the selected variables.

    Parameters
    ----------
    X: Pandas DataFrame
        The dataset to fit the transformer.

    y: pandas Series, default=None
        The target variable.
    """
    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # work on a clone so the user's transformer instance stays untouched
    self.transformer_ = clone(self.transformer)

    # these transformers can take any variable type; all others need numericals
    if self.transformer_.__class__.__name__ in [
        "OneHotEncoder",
        "OrdinalEncoder",
        "SimpleImputer",
        "FunctionTransformer",
    ]:
        self.variables_ = _find_all_variables(X, self.variables)
    else:
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

    self.transformer_.fit(X[self.variables_], y)

    # for selectors, record which of the fitted variables were rejected
    if self.transformer_.__class__.__name__ in _SELECTORS:
        selected = X[self.variables_].columns[self.transformer_.get_support()]
        self.features_to_drop_ = [
            feat for feat in self.variables_ if feat not in selected
        ]

    # record the input feature names and the train-set width
    self.feature_names_in_ = X.columns.tolist()
    self.n_features_in_ = X.shape[1]

    return self
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """Learn the numbers to be used to replace the categories in each
    variable.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to be encoded.

    y: pandas series, default=None
        The Target. Can be None if `encoding_method='arbitrary'`.
        Otherwise, y needs to be passed when fitting the transformer.
    """
    # "ordered" needs the target; "arbitrary" only needs the features
    if self.encoding_method == "ordered":
        X, y = check_X_y(X, y)
    else:
        X = check_X(X)

    self._fit(X)
    self._get_feature_names_in(X)

    if self.encoding_method == "ordered":
        # join features and target under a known column name
        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ["target"]

    # build one category -> ordinal mapping per variable
    self.encoder_dict_ = {}
    for var in self.variables_:
        if self.encoding_method == "ordered":
            # categories ranked by ascending target mean
            ordered_cats = (
                temp.groupby([var])["target"]
                .mean()
                .sort_values(ascending=True)
                .index
            )
        elif self.encoding_method == "arbitrary":
            # categories in order of appearance
            ordered_cats = X[var].unique()

        self.encoder_dict_[var] = {
            cat: i for i, cat in enumerate(ordered_cats, 0)
        }

    self._check_encoding_dictionary()

    return self
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Drops variables that were not seen in the train set and adds variables that
    were in the train set but not in the data to transform. In other words, it
    returns a dataframe with matching columns.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X_new: Pandas dataframe, shape = [n_samples, n_features]
        The dataframe with variables that match those observed in the train set.
    """
    check_is_fitted(self)
    X = check_X(X)

    if self.missing_values == "raise":
        _check_contains_na(X, self.feature_names_in_)

    # columns seen now but not during fit, and vice versa
    _columns_to_drop = list(set(X.columns) - set(self.feature_names_in_))
    _columns_to_add = list(set(self.feature_names_in_) - set(X.columns))

    if self.verbose:
        if len(_columns_to_add) > 0:
            print(
                "The following variables are added to the DataFrame: "
                f"{_columns_to_add}"
            )
        if len(_columns_to_drop) > 0:
            print(
                "The following variables are dropped from the DataFrame: "
                f"{_columns_to_drop}"
            )

    # drop the unseen columns, then reindex to reproduce the train-set
    # columns, filling the missing ones with the configured fill value
    X = X.drop(_columns_to_drop, axis=1)
    X = X.reindex(columns=self.feature_names_in_, fill_value=self.fill_value)

    return X
def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """Convert the transformed variables back to the original values. Only
    implemented for the following Scikit-learn transformers:

    PowerTransformer, QuantileTransformer, OrdinalEncoder, MaxAbsScaler,
    MinMaxScaler, StandardScaler, RobustScaler.

    If you would like this method implemented for additional transformers,
    please check if they have the inverse_transform method in Scikit-learn and
    then raise an issue in our repo.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features].
        The transformed dataframe.

    Returns
    -------
    X_tr: pandas dataframe of shape = [n_samples, n_features].
        The dataframe with the original values.
    """
    # fit must have run before inverse_transform
    check_is_fitted(self)

    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # only a whitelisted subset of transformers supports inversion here
    if self.transformer_.__class__.__name__ not in _INVERSE_TRANSFORM:
        raise NotImplementedError(
            "The method `inverse_transform` is not implemented for this "
            "transformer. Supported transformers are {}.".format(
                ", ".join(_INVERSE_TRANSFORM)
            )
        )

    # belt-and-braces: confirm the wrapped transformer really exposes it
    if hasattr(self.transformer_, "inverse_transform") and callable(
        self.transformer_.inverse_transform
    ):
        X[self.variables_] = self.transformer_.inverse_transform(
            X[self.variables_]
        )
    else:
        raise NotImplementedError(
            "This Scikit-learn transformer does not have the method "
            "`inverse_transform` implemented."
        )

    return X
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Makes a copy of the train set. Only stores a copy of the variables to
    impute. This copy is then used to randomly extract the values to fill the
    missing data during transform.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y: None
        y is not needed in this imputation. You can pass None or y.
    """
    # check input dataframe
    X = check_X(X)

    # find variables to impute
    self.variables_ = _find_all_variables(X, self.variables)

    # take a copy of the selected variables
    self.X_ = X[self.variables_].copy()

    # check the variables assigned to the random state
    if self.seed == "observation":
        self.random_state = _check_input_parameter_variables(self.random_state)
        if isinstance(self.random_state, (int, str)):
            self.random_state = [self.random_state]
        # BUGFIX: the previous check `any(var for var in self.random_state
        # if var not in X.columns)` tested the truthiness of the variable
        # names themselves, so a missing but falsy name (e.g. column 0 or "")
        # slipped through without raising. Test membership explicitly instead.
        if self.random_state and any(
            var not in X.columns for var in self.random_state
        ):
            raise ValueError(
                "There are variables assigned as random state which are not part "
                "of the training dataframe."
            )

    self._get_feature_names_in(X)
    return self
def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
    """Checks that the input is a dataframe and of the same size than the one
    used in the fit method. Checks absence of NA.

    Parameters
    ----------
    X: Pandas DataFrame

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame
    ValueError
        If the dataframe is not of same size as that used in fit()

    Returns
    -------
    X: Pandas DataFrame
        The same dataframe entered by the user.
    """
    # fit must have run before transform
    check_is_fitted(self)

    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # the dataframe must be as wide as the one used during fit
    _check_X_matches_training_df(X, self.n_features_in_)

    # optionally refuse datasets that contain NaN or inf
    if self.missing_values == "raise":
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

    # align column order with the train set
    X = X[self.feature_names_in_]

    return X
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Common checks performed before the feature transformation.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.
    """
    # fit must have run before transform
    check_is_fitted(self)

    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # the dataframe must be as wide as the one used during fit
    _check_X_matches_training_df(X, self.n_features_in_)

    # the index must be unique and free of NaN; otherwise merging the
    # created features would duplicate rows
    self._check_index(X)

    # optionally refuse datasets that contain NaN or inf
    if self.missing_values == "raise":
        self._check_na_and_inf(X)

    # align column order with the train set
    X = X[self.feature_names_in_]

    if self.sort_index is True:
        X.sort_index(inplace=True)

    return X
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Combine the variables with the mathematical operations.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X_new: Pandas dataframe, shape = [n_samples, n_features + n_operations]
        The dataframe with the original variables plus the new variables.
    """
    # fit must have run before transform
    check_is_fitted(self)

    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # the dataframe must be as wide as the one used during fit
    _check_X_matches_training_df(X, self.n_features_in_)

    # optionally refuse datasets that contain NaN or inf
    if self.missing_values == "raise":
        _check_contains_na(X, self.variables_to_combine)
        _check_contains_inf(X, self.variables_to_combine)

    # create one new column per learned (name, operation) pair
    for new_var, operation in self.combination_dict_.items():
        X[new_var] = X[self.variables_to_combine].agg(operation, axis=1)

    if self.drop_original:
        X.drop(columns=self.variables_to_combine, inplace=True)

    return X
def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Replace original values by the average of the target mean value per bin or
    category in each one of the variables.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input samples.

    Return
    -------
    X_new: pandas dataframe of shape = [n_samples, n_features]
        The transformed data with the discrete variables.
    """
    # fit must have run before transform
    check_is_fitted(self)

    # the input must be (convertible to) a dataframe
    X = check_X(X)

    # the dataframe must be as wide as the one used during fit
    _check_X_matches_training_df(X, self.n_features_in_)

    # refuse NaN in both variable groups, and inf in the numerical ones
    _check_contains_na(X, self.variables_numerical_)
    _check_contains_na(X, self.variables_categorical_)
    _check_contains_inf(X, self.variables_numerical_)

    # align column order with the train set, then apply the fitted pipeline
    X = X[self.feature_names_in_]
    return self._pipeline.transform(X)