示例#1
0
    def transform(self, X):
        """
        Apply the wrapped transformer to the selected variables of the dataframe.

        Only the variables in ``self.variables`` are passed to the transformer.

        If the transformer is a OneHotEncoder, the dummy features are concatenated
        to the source dataset. Note that the original categorical variables are
        not removed from the dataset after encoding. If this is the desired
        effect, please use Feature-engine's OneHotCategoricalEncoder instead.

        Parameters
        ----------

        X : pandas dataframe
            The input samples. Must have the same number of columns as
            ``self.input_shape_[1]``.

        Returns
        -------

        X : pandas dataframe
            The dataframe with the selected variables transformed or, for a
            OneHotEncoder, with the dummy columns appended.
        """

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check that the input data contains the same number of columns as
        # the dataframe used to fit the transformer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        if isinstance(self.transformer, OneHotEncoder):
            # Newer scikit-learn releases expose get_feature_names_out and have
            # dropped get_feature_names; fall back to the old name when the
            # new one is not available.
            get_names = getattr(self.transformer, "get_feature_names_out", None)
            if get_names is None:
                get_names = self.transformer.get_feature_names

            ohe_results_as_df = pd.DataFrame(
                data=self.transformer.transform(X[self.variables]),
                columns=get_names(self.variables),
                # Reuse the index of X: concat aligns on the index, so a default
                # RangeIndex would misalign rows (and introduce NaN) whenever X
                # is not indexed 0..n-1.
                index=X.index,
            )
            X = pd.concat([X, ohe_results_as_df], axis=1)
        else:
            X[self.variables] = self.transformer.transform(X[self.variables])

        return X
示例#2
0
    def transform(self, X):
        """
        Replace infrequent categories with the label stored in ``replace_with``.

        For every variable in ``self.variables``, values that are listed in
        ``self.encoder_dict_`` for that variable are kept; every other value is
        substituted by ``self.replace_with``.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe where rare categories have been grouped.
        """
        # the transformer must have been fitted
        check_is_fitted(self)

        # input must be a dataframe
        X = _is_dataframe(X)

        # the variables to encode must not contain missing data
        _check_contains_na(X, self.variables)

        # the number of columns must match that of the dataframe seen in fit
        _check_input_matches_training_df(X, self.input_shape_[1])

        for var_name in self.variables:
            # True where the value is one of the categories retained for this variable
            is_retained = X[var_name].isin(self.encoder_dict_[var_name])
            X[var_name] = np.where(is_retained, X[var_name], self.replace_with)

        return X
示例#3
0
    def transform(self, X):
        """
        Encode the variables by delegating to the fitted ``self.encoder_``.

        The input is validated and then handed to ``self.encoder_.transform``,
        whose output is returned unchanged.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features].
                        Dataframe with variables encoded with decision tree predictions.
        """
        # the transformer must have been fitted
        check_is_fitted(self)

        # input must be a dataframe
        X = _is_dataframe(X)

        # the variables to encode must not contain missing data
        _check_contains_na(X, self.variables)

        # the number of columns must match that of the dataframe seen in fit
        _check_input_matches_training_df(X, self.input_shape_[1])

        return self.encoder_.transform(X)
示例#4
0
    def transform(self, X):
        """
        Replaces missing data with the learned parameters.

        Every variable that is a key of ``self.imputer_dict_`` has its missing
        values filled with the value stored for it in that dictionary.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe without missing values in the selected variables.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check that the input data contains the same number of columns as
        # the dataframe used to fit the imputer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        # Replace missing data with the learned parameters. The result is
        # assigned back explicitly: calling fillna(inplace=True) on a column
        # selection is a chained in-place operation that pandas warns about and
        # that does not update the parent dataframe under Copy-on-Write.
        for variable, fill_value in self.imputer_dict_.items():
            X[variable] = X[variable].fillna(fill_value)

        return X
示例#5
0
    def transform(self, X):
        """
        Remove the columns listed in ``self.features_to_drop`` from the dataframe.

        Parameters
        ----------
        X: pandas dataframe
            The input dataframe from which features will be dropped

        Returns
        -------
        X_transformed: pandas dataframe of shape = [n_samples, n_features - len(features_to_drop)]
            The dataframe returned by ``drop``, holding the remaining variables.

        """
        # the transformer must have been fitted
        check_is_fitted(self)

        # input must be a dataframe
        X = _is_dataframe(X)

        # the number of columns must match that of the dataframe seen in fit
        _check_input_matches_training_df(X, self.input_shape_[1])

        return X.drop(columns=self.features_to_drop)
示例#6
0
    def transform(self, X):
        """
        Append one binary missing-data indicator per variable in ``self.variables_``.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The dataframe to be transformed.

        Returns
        -------

        X_transformed : pandas dataframe
            A copy of the input containing the additional binary variables.
            Each indicator is named with the original variable name plus
            '_na' and holds 1 where the original value is null, 0 otherwise.
        """

        # the transformer must have been fitted
        check_is_fitted(self)

        # input must be a dataframe
        X = _is_dataframe(X)

        # the number of columns must match that of the dataframe seen in fit
        _check_input_matches_training_df(X, self.input_shape_[1])

        # work on a copy so the new columns are not added to the caller's object
        X = X.copy()
        for var_name in self.variables_:
            is_missing = X[var_name].isnull()
            X[var_name + '_na'] = np.where(is_missing, 1, 0)

        return X
示例#7
0
    def transform(self, X):
        """
        Run the input validation steps and return the validated dataframe.

        No values are modified here; the method only applies the fitted-state,
        dataframe, missing-data and column-count checks, in that order.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------

        X : pandas dataframe
            The object returned by ``_is_dataframe`` once all checks have passed.
        """
        # the transformer must have been fitted
        check_is_fitted(self)

        # input must be a dataframe
        X = _is_dataframe(X)

        # the selected variables must not contain missing data
        _check_contains_na(X, self.variables)

        # the number of columns must match that of the dataframe seen in fit
        n_training_columns = self.input_shape_[1]
        _check_input_matches_training_df(X, n_training_columns)

        return X
示例#8
0
    def transform(self, X):
        """ Replaces categories with the learned parameters.

        Each variable that is a key of ``self.encoder_dict_`` is mapped through
        the dictionary stored for it. Values without an entry in the mapping
        become NaN, in which case a warning is issued.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features].
            The input samples.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features].
            The dataframe containing categories replaced by numbers.
       """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit the encoder.
        _check_input_matches_training_df(X, self.input_shape_[1])

        # replace categories by the learned parameters
        for feature, mapping in self.encoder_dict_.items():
            X[feature] = X[feature].map(mapping)

        # Check if NaN values were introduced by the encoding. The columns are
        # selected with an explicit list rather than a dict view.
        encoded_columns = list(self.encoder_dict_)
        if X[encoded_columns].isnull().any().any():
            warnings.warn(
                "NaN values were introduced in the returned dataframe by the encoder. "
                "This means that some of the categories in the input dataframe were "
                "not present in the training set used when the fit method was called. "
                "Thus, mappings for those categories does not exist. Try using the "
                "RareLabelCategoricalEncoder to remove infrequent categories before "
                "calling this encoder."
            )

        return X
示例#9
0
    def transform(self, X):
        """
        Drop the observations whose values lie beyond the learned caps.

        Rows with a value above the cap in ``self.right_tail_caps_`` or below
        the cap in ``self.left_tail_caps_`` for any capped variable are removed.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------

        X_transformed : pandas dataframe
            The dataframe without outlier observations; it has the same columns
            as the input and only the rows that were not flagged.
        """
        # the transformer must have been fitted
        check_is_fitted(self)

        # input must be a dataframe
        X = _is_dataframe(X)

        # missing data is only checked when the transformer is set to 'raise'
        if self.missing_values == 'raise':
            _check_contains_na(X, self.variables)

        # the number of columns must match that of the dataframe seen in fit
        _check_input_matches_training_df(X, self.input_shape_[1])

        # remove rows above the upper cap, one variable at a time
        for var_name, upper_cap in self.right_tail_caps_.items():
            above_cap = np.where(X[var_name] > upper_cap, True, False)
            X = X.loc[~above_cap]

        # remove rows below the lower cap, one variable at a time
        for var_name, lower_cap in self.left_tail_caps_.items():
            below_cap = np.where(X[var_name] < lower_cap, True, False)
            X = X.loc[~below_cap]

        return X
示例#10
0
    def transform(self, X):
        """
        Censor outliers by replacing them with the learned cap values.

        Values above the cap in ``self.right_tail_caps_`` are set to that cap,
        and values below the cap in ``self.left_tail_caps_`` are set to that cap.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe with the capped variables.
        """

        # the transformer must have been fitted
        check_is_fitted(self)

        # input must be a dataframe
        X = _is_dataframe(X)

        # missing data is only checked when the transformer is set to 'raise'
        if self.missing_values == 'raise':
            _check_contains_na(X, self.variables)

        # the number of columns must match that of the dataframe seen in fit
        _check_input_matches_training_df(X, self.input_shape_[1])

        # cap the right tail
        for var_name, upper_cap in self.right_tail_caps_.items():
            exceeds_upper = X[var_name] > upper_cap
            X[var_name] = np.where(exceeds_upper, upper_cap, X[var_name])

        # cap the left tail
        for var_name, lower_cap in self.left_tail_caps_.items():
            below_lower = X[var_name] < lower_cap
            X[var_name] = np.where(below_lower, lower_cap, X[var_name])

        return X
示例#11
0
    def transform(self, X):
        """
        Replace each selected variable with its dummy / binary variables.

        For every variable in ``self.variables`` and every category stored for
        it in ``self.encoder_dict_``, a column named ``<variable>_<category>`` is
        added holding 1 where the variable equals the category and 0 otherwise.
        The original variables are then dropped.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------

        X_transformed : pandas dataframe.
            The shape of the dataframe will be different from the original as it includes the dummy variables.
        """
        # the transformer must have been fitted
        check_is_fitted(self)

        # input must be a dataframe
        X = _is_dataframe(X)

        # the variables to encode must not contain missing data
        _check_contains_na(X, self.variables)

        # the number of columns must match that of the dataframe seen in fit
        _check_input_matches_training_df(X, self.input_shape_[1])

        for var_name in self.variables:
            for level in self.encoder_dict_[var_name]:
                dummy_name = '_'.join((str(var_name), str(level)))
                X[dummy_name] = np.where(X[var_name] == level, 1, 0)

        # the original, non-encoded variables are removed in place
        X.drop(labels=self.variables, axis=1, inplace=True)

        return X
示例#12
0
    def inverse_transform(self, X):
        """ Map encoded values back to the original categories.

        For each variable in ``self.encoder_dict_`` the stored mapping is
        inverted (encoded value -> category) and applied to the column.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features].
            The transformed dataframe.

        Returns
        -------

        X : pandas dataframe of shape = [n_samples, n_features].
            The un-transformed dataframe, that is, containing the original values
            of the categorical variables.
       """
        # the transformer must have been fitted
        check_is_fitted(self)

        # input must be a dataframe
        X = _is_dataframe(X)

        # the encoded variables must not contain missing data
        _check_contains_na(X, self.variables)

        # the number of columns must match that of the dataframe seen in fit
        _check_input_matches_training_df(X, self.input_shape_[1])

        # swap keys and values of each learned mapping and apply it
        for var_name, forward_map in self.encoder_dict_.items():
            reverse_map = dict(zip(forward_map.values(), forward_map.keys()))
            X[var_name] = X[var_name].map(reverse_map)

        return X
示例#13
0
def test_check_input_matches_training_df(dataframe_vartypes):
    """The helper must raise ValueError when the expected column count (4) is wrong."""
    # No `assert` around the call: pytest.raises already fails the test when
    # nothing is raised, and asserting on the return value would hide that
    # behind an unrelated AssertionError.
    with pytest.raises(ValueError):
        _check_input_matches_training_df(dataframe_vartypes, 4)
示例#14
0
    def transform(self, X):
        """
        Replaces missing data with random values taken from the train set.

        The replacement values are sampled, with replacement, from the non-null
        values of the same variable in ``self.X_``.

        * ``seed == 'general'``: all missing values of a variable are drawn in a
          single call, seeded with ``self.random_state``.
        * ``seed == 'observation'``: one value is drawn per missing observation,
          seeded with the value returned by ``_define_seed`` for that row.

        For any other value of ``seed`` the dataframe is returned without
        imputation.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The dataframe to be transformed.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe without missing values in the transformed variables.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check that the dataframe contains the same number of columns as the
        # dataframe used to fit the imputer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        # random sampling with a general seed
        if self.seed == 'general':
            for feature in self.variables:
                # compute the null mask once and reuse it below
                null_mask = X[feature].isnull()

                # number of data points to extract at random
                n_samples = null_mask.sum()
                if n_samples > 0:
                    # extract values
                    random_sample = self.X_[feature].dropna().sample(
                        n_samples,
                        replace=True,
                        random_state=self.random_state)

                    # re-index: pandas needs this to add values in the correct observations
                    random_sample.index = X.index[null_mask]

                    # replace na
                    X.loc[null_mask, feature] = random_sample

        # random sampling observation per observation
        elif self.seed == 'observation':
            for feature in self.variables:
                null_mask = X[feature].isnull()
                if null_mask.sum() > 0:
                    # The pool of candidate values does not change between
                    # observations, so build it once per feature instead of
                    # once per missing row.
                    donor_values = self.X_[feature].dropna()

                    # loop over each observation with missing data
                    for i in X.index[null_mask]:
                        # find the seed using additional variables
                        internal_seed = _define_seed(X,
                                                     i,
                                                     self.random_state,
                                                     how=self.seeding_method)

                        # extract 1 value at random
                        random_sample = donor_values.sample(
                            1, replace=True, random_state=internal_seed)

                        # replace the missing data point
                        X.loc[i, feature] = random_sample.values[0]
        return X