def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Run the shared pre-transform validation and align the column order.

    Steps, in order:

    - Verify that the transformer has been fitted.
    - Validate the input through ``check_X``.
    - Compare the input against ``n_features_in_`` stored on the transformer.
    - Select the columns in the order given by ``feature_names_in_``.

    Parameters
    ----------
    X: Pandas DataFrame
        The data to validate.

    Returns
    -------
    X: Pandas DataFrame
        The validated data, with its columns selected in the order of
        ``feature_names_in_``.
    """
    # Fail early when fit() has not been called.
    check_is_fitted(self)

    # From here on, work with whatever check_X hands back.
    validated = check_X(X)

    # The input must be compatible with the frame seen during fit().
    _check_X_matches_training_df(validated, self.n_features_in_)

    # Column order follows the training set.
    return validated[self.feature_names_in_]
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Validate the transformer state and the input data.

    No feature is created here: the method only runs the common checks and
    returns the input with its columns in the training order.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X: Pandas dataframe
        The validated dataframe, with columns ordered as in
        ``feature_names_in_``.
    """
    # The transformer must have been fitted.
    check_is_fitted(self)

    # Validate the input container.
    data = check_X(X)

    # The input must match the dataframe used in fit().
    _check_X_matches_training_df(data, self.n_features_in_)

    # NA and inf checks only run when the transformer is configured with
    # missing_values="raise".
    if self.missing_values == "raise":
        for validator in (_check_contains_na, _check_contains_inf):
            validator(data, self.variables)

    # Align the column order with the train set.
    return data[self.feature_names_in_]
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Remove the features listed in ``features_to_drop_`` from the dataframe.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features].
        The input dataframe.

    Returns
    -------
    X_new: pandas dataframe of shape = [n_samples, n_selected_features]
        Pandas dataframe with the selected features.
    """
    # transform() is only valid after fit().
    check_is_fitted(self)

    # Validate the input container.
    checked = check_X(X)

    # The input must match the dataframe used in fit().
    _check_X_matches_training_df(checked, self.n_features_in_)

    # Put the columns in training order, then discard the unwanted ones.
    ordered = checked[self.feature_names_in_]
    return ordered.drop(columns=self.features_to_drop_)
def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Validate the fitted state and the input before transforming.

    Checks that the transformer was fitted, validates the input with
    ``check_X`` and compares it against ``n_features_in_``. When
    ``missing_values`` is "raise", the variables in ``variables_`` are also
    checked for NA and for infinite values. Finally, the columns are put in
    the order of ``feature_names_in_``.

    Parameters
    ----------
    X: Pandas DataFrame
        The data to validate.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame
    ValueError
        If the dataframe is not of same size as that used in fit()

    Returns
    -------
    X: Pandas DataFrame
        The validated dataframe, with columns ordered as in
        ``feature_names_in_``.
    """
    # The transformer must have been fitted.
    check_is_fitted(self)

    # Validate the input container.
    frame = check_X(X)

    # Same number of columns as the dataframe used in fit().
    _check_X_matches_training_df(frame, self.n_features_in_)

    # Missing and infinite values are only rejected on request.
    reject_missing = self.missing_values == "raise"
    if reject_missing:
        _check_contains_na(frame, self.variables_)
        _check_contains_inf(frame, self.variables_)

    # Align the column order with the training set.
    return frame[self.feature_names_in_]
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Run the common checks that precede the feature transformation.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X: pandas dataframe of shape = [n_samples, n_features]
        The validated data, with columns ordered as in ``feature_names_in_``
        and, when ``sort_index`` is True, rows sorted by index.
    """
    # The transformer must have been fitted.
    check_is_fitted(self)

    # Validate the input container.
    X = check_X(X)

    # The input must match the dataframe used in fit().
    _check_X_matches_training_df(X, self.n_features_in_)

    # The index needs unique, non-missing values: otherwise merging the
    # created features back would duplicate rows.
    self._check_index(X)

    # NA / inf are only rejected when explicitly requested.
    if self.missing_values == "raise":
        self._check_na_and_inf(X)

    # Align the column order with the train set.
    X = X[self.feature_names_in_]

    # Only the literal True triggers the sort (identity check, as before).
    must_sort = self.sort_index is True
    if must_sort:
        X.sort_index(inplace=True)

    return X
def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Validate the input and pass it through the fitted ``_pipeline``.

    The original values are replaced by the average of the target mean value
    per bin or category in each one of the variables.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    X_new: pandas dataframe of shape = [n_samples, n_features]
        The transformed data with the discrete variables.
    """
    # The transformer must have been fitted.
    check_is_fitted(self)

    # Validate the input container.
    X = check_X(X)

    # Same number of columns as the dataframe used in fit().
    _check_X_matches_training_df(X, self.n_features_in_)

    # Missing values are checked in numerical, then categorical variables.
    for variable_group in (self.variables_numerical_, self.variables_categorical_):
        _check_contains_na(X, variable_group)

    # Infinite values are only checked in the numerical variables.
    _check_contains_inf(X, self.variables_numerical_)

    # Align the column order with the train set, then delegate the actual
    # transformation to the pipeline.
    ordered = X[self.feature_names_in_]
    return self._pipeline.transform(ordered)
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Combine the variables with the mathematical operations.

    One new column is added per entry of ``combination_dict_``: the key is
    the name of the new column and the value is the aggregation applied
    row-wise over ``variables_to_combine``.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X_new: Pandas dataframe, shape = [n_samples, n_features + n_operations]
        The dataframe with the original variables plus the new variables.
        The original variables are removed when ``drop_original`` is set.
    """
    # The transformer must have been fitted.
    check_is_fitted(self)

    # Validate the input container.
    X = check_X(X)

    # Same number of columns as the dataframe used in fit().
    _check_X_matches_training_df(X, self.n_features_in_)

    # NA / inf are only rejected when explicitly requested.
    if self.missing_values == "raise":
        for validator in (_check_contains_na, _check_contains_inf):
            validator(X, self.variables_to_combine)

    # NOTE(review): the columns are not reordered to the training order in
    # this method - confirm that this is intentional.

    # Row-wise aggregation (axis=1) of the source variables, one new column
    # per configured operation.
    for created_name, aggregation in self.combination_dict_.items():
        source_values = X[self.variables_to_combine]
        X[created_name] = source_values.agg(aggregation, axis=1)

    if self.drop_original:
        X.drop(columns=self.variables_to_combine, inplace=True)

    return X
def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Validate the fitted state and the input before transforming.

    Checks that the transformer was fitted, validates the input with
    ``check_X``, compares it against ``n_features_in_`` and puts the columns
    in the order of ``feature_names_in_``. No check for missing values is
    performed in this method.

    Parameters
    ----------
    X: Pandas DataFrame
        The data to validate.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame
    ValueError
        If the df has different number of features than the df used in fit()

    Returns
    -------
    X: Pandas DataFrame
        The validated dataframe, with columns ordered as in
        ``feature_names_in_``.
    """
    # The transformer must have been fitted.
    check_is_fitted(self)

    # Validate the input container.
    frame = check_X(X)

    # Same number of columns as the dataframe used in fit().
    _check_X_matches_training_df(frame, self.n_features_in_)

    # Align the column order with the train set.
    return frame[self.feature_names_in_]
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the wrapped Scikit-learn transformer to the dataframe.

    Only the selected variables will be modified. The outcome depends on the
    class name of the wrapped transformer:

    - OneHotEncoder or PolynomialFeatures: the new features are concatenated
      to the input dataset.
    - A feature selector (class name listed in ``_SELECTORS``): the
      non-selected features are dropped from the dataframe.
    - Any other transformer: the original variables are replaced by the
      transformed ones.

    Parameters
    ----------
    X: Pandas DataFrame
        The data to transform.

    Returns
    -------
    X_new: Pandas DataFrame
        The transformed dataset.
    """
    # The wrapper must have been fitted.
    check_is_fitted(self)

    # Validate the input container.
    X = check_X(X)

    # Same number of columns as the dataframe used in fit().
    _check_X_matches_training_df(X, self.n_features_in_)

    # Align the column order with the train set.
    X = X[self.feature_names_in_]

    # Dispatch on the class name of the wrapped transformer.
    transformer_name = self.transformer_.__class__.__name__

    if transformer_name in ("OneHotEncoder", "PolynomialFeatures"):
        # Creators: build a frame with the new features, sharing the index
        # of X, and append it column-wise.
        created_values = self.transformer_.transform(X[self.variables_])
        created_names = self.transformer_.get_feature_names_out(self.variables_)
        new_features_df = pd.DataFrame(
            data=created_values,
            columns=created_names,
            index=X.index,
        )
        X = pd.concat([X, new_features_df], axis=1)

    elif transformer_name in _SELECTORS:
        # Selectors: remove the features that were not selected.
        X.drop(columns=self.features_to_drop_, inplace=True)

    else:
        # Everything else: overwrite the variables with their transformed
        # values.
        X[self.variables_] = self.transformer_.transform(X[self.variables_])

    return X
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Extract the date and time features and add them to the dataframe.

    When ``variables_`` is None the features are extracted from the index of
    the dataframe; otherwise they are extracted from each variable listed in
    ``variables_``.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X_new: Pandas dataframe, shape = [n_samples, n_features x n_df_features]
        The dataframe with the original variables plus the new variables.

    Raises
    ------
    ValueError
        If one or more of the variables could not be converted to datetime.
    """
    # The transformer must have been fitted.
    check_is_fitted(self)

    # Validate the input container.
    X = check_X(X)

    # Same number of columns as the dataframe used in fit().
    _check_X_matches_training_df(X, self.n_features_in_)

    # Align the column order with the train set.
    X = X[self.feature_names_in_]

    raise_on_na = self.missing_values == "raise"

    # Parsing options shared by every pd.to_datetime call below.
    parse_options = {
        "dayfirst": self.dayfirst,
        "yearfirst": self.yearfirst,
        "utc": self.utc,
    }

    # Special case: the features come from the index.
    if self.variables_ is None:
        if raise_on_na:
            self._check_index_contains_na(X.index)

        # Wrap the parsed index in a Series that keeps the original index,
        # so that the extracted features align with the rows of X.
        parsed_index = pd.Series(
            pd.to_datetime(X.index, **parse_options),
            index=X.index,
        )

        for feature in self.features_to_extract_:
            # No variable name is prepended here, so the first character of
            # the suffix is dropped (presumably a leading separator).
            X[FEATURES_SUFFIXES[feature][1:]] = FEATURES_FUNCTIONS[feature](
                parsed_index
            )
        return X

    # General case: the features come from the variables.
    if raise_on_na:
        _check_contains_na(X, self.variables_)

    parsed_columns = [
        pd.to_datetime(X[name], **parse_options) for name in self.variables_
    ]
    datetime_df = pd.concat(parsed_columns, axis=1)

    # Every converted column must satisfy is_datetime; report those that do
    # not.
    datetime_mask = datetime_df.apply(is_datetime)
    non_dt_columns = datetime_df.columns[~datetime_mask].tolist()
    if non_dt_columns:
        raise ValueError(
            "ValueError: variable(s) "
            + (len(non_dt_columns) * "{} ").format(*non_dt_columns)
            + "could not be converted to datetime. Try setting utc=True"
        )

    # New columns are named <variable><suffix>.
    for var in self.variables_:
        parsed = datetime_df[var]
        for feature in self.features_to_extract_:
            X[str(var) + FEATURES_SUFFIXES[feature]] = FEATURES_FUNCTIONS[feature](
                parsed
            )

    if self.drop_original:
        X.drop(self.variables_, axis=1, inplace=True)

    return X
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Combine the variables with the reference variables.

    For every requested operation and every reference variable, one new
    column per variable in ``variables_to_combine`` is added. The operations
    are always applied in the order sub, div, add, mul, regardless of their
    order in ``operations``. New columns are named
    ``<variable>_<operation>_<reference>`` unless ``new_variables_names`` is
    set, in which case those names are used instead.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X_new: Pandas dataframe, shape = [n_samples, n_features + n_operations]
        The dataframe with the new variables. The original variables are
        removed when ``drop_original`` is set.

    Raises
    ------
    ValueError
        If "div" is among the operations and any of the reference variables
        contains 0.
    """
    # Check method fit has been called
    check_is_fitted(self)

    # check that input is a dataframe
    X = check_X(X)

    # Check if input data contains same number of columns as dataframe used to fit.
    _check_X_matches_training_df(X, self.n_features_in_)

    # check if dataset contains na
    if self.missing_values == "raise":
        _check_contains_na(X, self.reference_variables)
        _check_contains_na(X, self.variables_to_combine)

        _check_contains_inf(X, self.reference_variables)
        _check_contains_inf(X, self.variables_to_combine)

    # Reorder variables to match the train set. This is required for the
    # positional renaming further down: the new column labels are built from
    # feature_names_in_, so the original columns must be in that exact order
    # or they would end up with the wrong names.
    X = X[self.feature_names_in_]

    # cannot divide by 0, as will result in error
    if "div" in self.operations:
        if X[self.reference_variables].isin([0]).any().any():
            raise ValueError(
                "Some of the reference variables contain 0 as values. Check and "
                "remove those before using this transformer."
            )

    # Add new features and values into the data frame. The tuple fixes the
    # order in which the new columns are created; each name is also the name
    # of the DataFrame method that performs the operation.
    for operation in ("sub", "div", "add", "mul"):
        if operation not in self.operations:
            continue
        for reference in self.reference_variables:
            varname = [
                str(var) + "_" + operation + "_" + str(reference)
                for var in self.variables_to_combine
            ]
            combine = getattr(X[self.variables_to_combine], operation)
            # axis=0 aligns the reference column with the rows of X.
            X[varname] = combine(X[reference], axis=0)

    # replace created variable names with user ones.
    if self.new_variables_names:
        X.columns = list(self.feature_names_in_) + list(self.new_variables_names)

    if self.drop_original:
        X.drop(
            columns=set(self.variables_to_combine + self.reference_variables),
            inplace=True,
        )

    return X
def test_check_X_matches_training_df(df_vartypes):
    """
    _check_X_matches_training_df raises ValueError on a feature-count mismatch.

    The ``df_vartypes`` fixture is passed with an expected feature count of 4,
    which is presumably different from its real number of columns - verify
    against the fixture definition.
    """
    # The call itself must raise. It is not wrapped in ``assert``: if it ever
    # stopped raising, pytest.raises should report the missing exception,
    # rather than the test failing on an AssertionError over the return value.
    with pytest.raises(ValueError):
        _check_X_matches_training_df(df_vartypes, 4)