示例#1
0
    def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Validate the transformer state and the input ahead of a transformation.

        The steps run in this order:

        - Confirm that the transformer was fitted.
        - Run the input through ``check_X`` (dataframe validation).
        - Compare the input against ``n_features_in_``, stored during fit().
        - Select the columns in the order given by ``feature_names_in_``.

        Parameters
        ----------
        X: Pandas DataFrame
            The data to validate.

        Returns
        -------
        X: Pandas DataFrame
            The validated data, with its columns arranged as in
            ``feature_names_in_``.
        """
        check_is_fitted(self)

        validated = check_X(X)
        _check_X_matches_training_df(validated, self.n_features_in_)

        # selecting by name also puts the columns in the training order
        return validated[self.feature_names_in_]
示例#2
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Run the input and transformer checks shared by the transformations.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_new: Pandas dataframe
            The validated data, with its columns arranged as in
            ``feature_names_in_``.
        """
        # the transformer must have been fitted first
        check_is_fitted(self)

        data = check_X(X)

        # the input must be compatible with the dataframe seen in fit()
        _check_X_matches_training_df(data, self.n_features_in_)

        # NA and inf are only rejected when the user asked for it
        if self.missing_values == "raise":
            for validate in (_check_contains_na, _check_contains_inf):
                validate(data, self.variables)

        # put the columns in the same order as the train set
        return data[self.feature_names_in_]
示例#3
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Drop the features listed in ``features_to_drop_`` from the dataframe.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features].
            The input dataframe.

        Returns
        -------
        X_new: pandas dataframe of shape = [n_samples, n_selected_features]
            Pandas dataframe with the selected features.
        """
        # fit() must have been called before transform()
        check_is_fitted(self)

        frame = check_X(X)

        # the input must be compatible with the dataframe seen in fit()
        _check_X_matches_training_df(frame, self.n_features_in_)

        # align the column order with the train set before dropping
        ordered = frame[self.feature_names_in_]

        return ordered.drop(columns=self.features_to_drop_)
示例#4
0
    def _check_transform_input_and_state(self,
                                         X: pd.DataFrame) -> pd.DataFrame:
        """Validate the fitted state and the input before transforming.

        Confirms that the transformer was fitted, runs the input through
        ``check_X`` and compares it against ``n_features_in_``. When
        ``missing_values`` is "raise", the variables in ``variables_`` are also
        checked for NA and inf values.

        Parameters
        ----------
        X: Pandas DataFrame

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame
        ValueError
            If the dataframe is not of same size as that used in fit()

        Returns
        -------
        X: Pandas DataFrame
            The validated data, with its columns arranged as in
            ``feature_names_in_``.
        """
        check_is_fitted(self)

        checked = check_X(X)

        # same number of columns as the dataframe used in fit()
        _check_X_matches_training_df(checked, self.n_features_in_)

        reject_missing = self.missing_values == "raise"
        if reject_missing:
            _check_contains_na(checked, self.variables_)
            _check_contains_inf(checked, self.variables_)

        # put the columns in the training order
        return checked[self.feature_names_in_]
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Run the checks that precede the feature transformation.

        Besides the usual fitted-state and input validation, the index is
        validated through ``_check_index`` and, when ``missing_values`` is
        "raise", ``_check_na_and_inf`` is applied. The columns are then put in
        the order of ``feature_names_in_`` and, if ``sort_index`` is exactly
        ``True``, the rows are sorted by index.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X: pandas dataframe of shape = [n_samples, n_features]
            The validated, reordered (and optionally index-sorted) data.
        """
        check_is_fitted(self)

        frame = check_X(X)

        # must be compatible with the dataframe seen in fit()
        _check_X_matches_training_df(frame, self.n_features_in_)

        # The index needs unique values and no missing data; otherwise merging
        # the created features back would duplicate rows.
        self._check_index(frame)

        if self.missing_values == "raise":
            self._check_na_and_inf(frame)

        # column order of the train set
        frame = frame[self.feature_names_in_]

        # identity test on purpose: only a literal True triggers the sort
        if self.sort_index is True:
            frame.sort_index(inplace=True)

        return frame
示例#6
0
    def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Validate the input and pass it through the fitted internal pipeline.

        NOTE(review): the pipeline is built outside this method; according to
        the previous docstring it replaces the original values by the mean
        target value per bin or category - confirm against fit().

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_new: pandas dataframe of shape = [n_samples, n_features]
            The output of ``self._pipeline.transform``.
        """
        check_is_fitted(self)

        data = check_X(X)

        # must be compatible with the dataframe seen in fit()
        _check_X_matches_training_df(data, self.n_features_in_)

        # missing values are rejected in numerical and categorical variables
        _check_contains_na(data, self.variables_numerical_)
        _check_contains_na(data, self.variables_categorical_)

        # inf is only checked in the numerical variables
        _check_contains_inf(data, self.variables_numerical_)

        # hand the pipeline the columns in the training order
        return self._pipeline.transform(data[self.feature_names_in_])
示例#7
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Add one new variable per entry in ``combination_dict_``.

        Each new variable is the row-wise aggregation (``axis=1``) of the
        columns in ``variables_to_combine`` with the operation stored under
        that name. When ``drop_original`` is truthy, the combined columns are
        removed afterwards.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_new: Pandas dataframe, shape = [n_samples, n_features + n_operations]
            The dataframe with the original variables plus the new variables.
        """
        check_is_fitted(self)

        X = check_X(X)

        # must be compatible with the dataframe seen in fit()
        _check_X_matches_training_df(X, self.n_features_in_)

        # NA and inf are only rejected when the user asked for it
        if self.missing_values == "raise":
            for validate in (_check_contains_na, _check_contains_inf):
                validate(X, self.variables_to_combine)

        # one aggregated column per configured operation
        for created_name, aggregation in self.combination_dict_.items():
            combined = X[self.variables_to_combine].agg(aggregation, axis=1)
            X[created_name] = combined

        if self.drop_original:
            X.drop(columns=self.variables_to_combine, inplace=True)

        return X
示例#8
0
    def _check_transform_input_and_state(self,
                                         X: pd.DataFrame) -> pd.DataFrame:
        """
        Validate the fitted state and the input before transforming.

        Confirms that the transformer was fitted, runs the input through
        ``check_X``, compares it against ``n_features_in_`` and returns it with
        the columns ordered as in ``feature_names_in_``. No NA check is carried
        out in this method itself.

        Parameters
        ----------
        X: Pandas DataFrame

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame
        ValueError
            If the df has different number of features than the df used in fit()

        Returns
        -------
        X: Pandas DataFrame
            The validated data, with its columns arranged as in
            ``feature_names_in_``.
        """
        check_is_fitted(self)

        checked = check_X(X)

        # must be compatible with the dataframe seen in fit()
        _check_X_matches_training_df(checked, self.n_features_in_)

        # selecting by name puts the columns in the training order
        return checked[self.feature_names_in_]
示例#9
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Apply the wrapped transformer to the selected variables.

        The behaviour depends on the class name of ``self.transformer_``:

        - "OneHotEncoder" or "PolynomialFeatures": the output is appended to
          the dataframe as new columns, named via ``get_feature_names_out``.
        - a name listed in ``_SELECTORS``: the columns in ``features_to_drop_``
          are removed from the dataframe.
        - anything else: the columns in ``variables_`` are overwritten with the
          transformed values.

        Parameters
        ----------
        X: Pandas DataFrame
            The data to transform.

        Returns
        -------
        X_new: Pandas DataFrame
            The transformed dataset.
        """
        check_is_fitted(self)

        X = check_X(X)

        # must be compatible with the dataframe seen in fit()
        _check_X_matches_training_df(X, self.n_features_in_)

        # column order of the train set
        X = X[self.feature_names_in_]

        wrapped_name = self.transformer_.__class__.__name__

        # creators: transformers that add features
        if wrapped_name in ("OneHotEncoder", "PolynomialFeatures"):
            created_values = self.transformer_.transform(X[self.variables_])
            created_names = self.transformer_.get_feature_names_out(
                self.variables_)
            new_features_df = pd.DataFrame(
                data=created_values,
                columns=created_names,
                index=X.index,
            )
            return pd.concat([X, new_features_df], axis=1)

        # selectors: transformers that remove features
        if wrapped_name in _SELECTORS:
            X.drop(columns=self.features_to_drop_, inplace=True)
            return X

        # everything else modifies the existing features in place
        X[self.variables_] = self.transformer_.transform(X[self.variables_])
        return X
示例#10
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Extract the date and time features and add them to the dataframe.

        When ``variables_`` is None the features are derived from the index;
        otherwise they are derived from each variable in ``variables_`` and, if
        ``drop_original`` is truthy, those variables are dropped afterwards.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Raises
        ------
        ValueError
            If any variable in ``variables_`` is not flagged as datetime by
            ``is_datetime`` after the conversion.

        Returns
        -------
        X_new: Pandas dataframe, shape = [n_samples, n_features x n_df_features]
            The dataframe with the original variables plus the new variables.
        """

        def _as_datetime(values):
            # one place for the parsing options used by both cases below
            return pd.to_datetime(
                values,
                dayfirst=self.dayfirst,
                yearfirst=self.yearfirst,
                utc=self.utc,
            )

        check_is_fitted(self)

        X = check_X(X)

        # must be compatible with the dataframe seen in fit()
        _check_X_matches_training_df(X, self.n_features_in_)

        # column order of the train set
        X = X[self.feature_names_in_]

        # special case: the features are taken from the index
        if self.variables_ is None:
            if self.missing_values == "raise":
                self._check_index_contains_na(X.index)

            # keep the original index so the new columns align on assignment
            index_as_datetime = pd.Series(_as_datetime(X.index), index=X.index)

            for feat in self.features_to_extract_:
                # the first character of the suffix is dropped for the name
                X[FEATURES_SUFFIXES[feat][1:]] = FEATURES_FUNCTIONS[feat](
                    index_as_datetime
                )

            return X

        if self.missing_values == "raise":
            _check_contains_na(X, self.variables_)

        # convert every selected variable, side by side in one frame
        converted = pd.concat(
            [_as_datetime(X[variable]) for variable in self.variables_],
            axis=1,
        )

        datetime_mask = converted.apply(is_datetime)
        failed_columns = converted.columns[~datetime_mask].tolist()
        if failed_columns:
            raise ValueError(
                "ValueError: variable(s) "
                + (len(failed_columns) * "{} ").format(*failed_columns)
                + "could not be converted to datetime. Try setting utc=True"
            )

        # one new column per (variable, feature) pair
        for var in self.variables_:
            source = converted[var]
            for feat in self.features_to_extract_:
                X[str(var) + FEATURES_SUFFIXES[feat]] = FEATURES_FUNCTIONS[feat](
                    source
                )

        if self.drop_original:
            X.drop(self.variables_, axis=1, inplace=True)

        return X
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Combine the variables with the mathematical operations.

        For every operation in ``operations`` (always processed in the order
        sub, div, add, mul) and every reference variable, one new column per
        variable in ``variables_to_combine`` is added, named
        ``<variable>_<operation>_<reference>``. If ``new_variables_names`` is
        set, the columns are then renamed by position to
        ``feature_names_in_ + new_variables_names``.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Raises
        ------
        ValueError
            If "div" is requested and a reference variable contains 0.

        Returns
        -------
        X_new: Pandas dataframe, shape = [n_samples, n_features + n_operations]
            The dataframe with the new variables.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = check_X(X)

        # Check if input data contains same number of columns as dataframe used to fit.
        _check_X_matches_training_df(X, self.n_features_in_)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.reference_variables)
            _check_contains_na(X, self.variables_to_combine)

            _check_contains_inf(X, self.reference_variables)
            _check_contains_inf(X, self.variables_to_combine)

        # cannot divide by 0, as will result in error
        if "div" in self.operations:
            if X[self.reference_variables].isin([0]).any().any():
                raise ValueError(
                    "Some of the reference variables contain 0 as values. Check and "
                    "remove those before using this transformer.")

        # Reorder variables to match train set. The positional renaming below
        # pairs the leading columns with feature_names_in_, so an input with
        # shuffled columns would otherwise get its original columns mislabelled.
        X = X[self.feature_names_in_]

        # Add new features and values into the data frame. The order of the
        # operations is fixed (not taken from self.operations) because the
        # user-supplied names are matched to the new columns by position.
        for operation in ("sub", "div", "add", "mul"):
            if operation not in self.operations:
                continue
            for reference in self.reference_variables:
                varname = [
                    str(var) + "_" + operation + "_" + str(reference)
                    for var in self.variables_to_combine
                ]
                # DataFrame.sub / .div / .add / .mul, looked up by name
                combine = getattr(X[self.variables_to_combine], operation)
                X[varname] = combine(X[reference], axis=0)

        # replace created variable names with user ones.
        if self.new_variables_names:
            X.columns = list(self.feature_names_in_) + list(
                self.new_variables_names)

        if self.drop_original:
            X.drop(
                columns=set(self.variables_to_combine +
                            self.reference_variables),
                inplace=True,
            )

        return X
def test_check_X_matches_training_df(df_vartypes):
    """A column count that differs from the one passed in raises ValueError.

    NOTE(review): assumes the ``df_vartypes`` fixture does not have exactly 4
    columns - confirm against the fixture definition.
    """
    # No `assert` around the call: the exception is the expected outcome, and
    # an assert here would only turn a "did not raise" failure into a
    # misleading AssertionError on the return value.
    with pytest.raises(ValueError):
        _check_X_matches_training_df(df_vartypes, 4)