Example #1
    def transform(self, X):
        """
        Replaces missing data with the learned parameters.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe without missing values in the selected variables.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check that the input data contains the same number of columns
        # as the dataframe used to fit the imputer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        # replaces missing data with the learned parameters
        for variable in self.imputer_dict_:
            X[variable] = X[variable].fillna(self.imputer_dict_[variable])

        return X
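
The transform above only fills the gaps with values already stored in imputer_dict_ during fit. A minimal standalone sketch of the same fill pattern, using plain pandas instead of Feature-engine's internal helpers (the toy data and the choice of medians as the learned parameters are illustrative):

import numpy as np
import pandas as pd

# toy training and test frames with missing values
train = pd.DataFrame({"age": [20, 30, np.nan, 40], "fare": [7.0, np.nan, 9.0, 8.0]})
test = pd.DataFrame({"age": [np.nan, 25], "fare": [8.5, np.nan]})

# "fit": learn one value per variable (here, the median)
imputer_dict_ = train[["age", "fare"]].median().to_dict()

# "transform": replace missing data with the learned parameters
test_tr = test.copy()
for variable, value in imputer_dict_.items():
    test_tr[variable] = test_tr[variable].fillna(value)

print(imputer_dict_)   # {'age': 30.0, 'fare': 8.0}
print(test_tr)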
Example #2
    def transform(self, X):
        """
        Apply the transformation to the dataframe. Only the selected features will be modified. 

        If transformer is OneHotEncoder, dummy features are concatenated to the source dataset.
        Note that the original categorical variables will not be removed from the dataset
        after encoding. If this is the desired effect, please use Feature-engine's 
        OneHotCategoricalEncoder instead.
        """

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check that the input data contains the same number of columns
        # as the dataframe used to fit this transformer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        if isinstance(self.transformer, OneHotEncoder):
            ohe_results_as_df = pd.DataFrame(
                data=self.transformer.transform(X[self.variables]),
                columns=self.transformer.get_feature_names(self.variables)
            )
            X = pd.concat([X, ohe_results_as_df], axis=1)
        else:
            X[self.variables] = self.transformer.transform(X[self.variables])

        return X
Example #3
    def transform(self, X):
        """
        Drops the variable or list of variables indicated by the user from the original dataframe
        and returns a new dataframe with the remaining subset of variables.

        Parameters
        ----------
        X: pandas dataframe
            The input dataframe from which features will be dropped

        Returns
        -------
        X_transformed: pandas dataframe of shape = [n_samples, n_features - len(features_to_drop)]
            The transformed dataframe with the remaining subset of variables.

        """
        # check if fit is called prior
        check_is_fitted(self)

        # check input dataframe
        X = _is_dataframe(X)

        # check for input consistency
        _check_input_matches_training_df(X, self.input_shape_[1])

        X = X.drop(columns=self.features_to_drop)

        return X
Example #4
    def transform(self, X):
        """
        Groups rare labels under the separate group 'Rare', or any other name provided
        by the user.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.
        
        Returns
        -------
        
        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe where rare categories have been grouped.
        """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit this encoder.
        _check_input_matches_training_df(X, self.input_shape_[1])

        for feature in self.variables:
            X[feature] = np.where(X[feature].isin(self.encoder_dict_[feature]),
                                  X[feature], self.replace_with)

        return X
Example #5
    def fit(self, X, y=None):
        """
        Verifies that the input X is a pandas dataframe and that the features to drop exist in it.

        Parameters
        ----------

        X: pandas dataframe of shape = [n_samples, n_features]
            The input dataframe

        y: None
            y is not needed for this transformer. You can pass y or None.

        """
        # check input dataframe
        X = _is_dataframe(X)

        # check for non-existent columns
        non_existent = [x for x in self.features_to_drop if x not in X.columns]
        if non_existent:
            raise KeyError(
                f"Columns '{', '.join(non_existent)}' not present in the input dataframe, "
                f"please check the columns and enter a new list of features to drop"
            )

        # check that user does not drop all columns returning empty dataframe
        if len(self.features_to_drop) == len(X.columns):
            raise ValueError(
                "The resulting dataframe will have no columns after dropping all existing variables"
            )

        # add input shape
        self.input_shape_ = X.shape

        return self
Example #6
    def transform(self, X):
        """
        Adds the binary missing indicators.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The dataframe to be transformed.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe containing the additional binary variables.
            Binary variables are named with the original variable name plus
            '_na'.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit the imputer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        X = X.copy()
        for feature in self.variables_:
            X[feature + '_na'] = np.where(X[feature].isnull(), 1, 0)

        return X
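
On a small frame, the indicator columns added by the loop above look like this (toy data; the '_na' suffix follows the docstring):

import numpy as np
import pandas as pd

X = pd.DataFrame({"age": [20, np.nan, 31], "fare": [7.0, 8.0, np.nan]})

for feature in ["age", "fare"]:
    X[feature + "_na"] = np.where(X[feature].isnull(), 1, 0)

print(X)
#     age  fare  age_na  fare_na
# 0  20.0   7.0       0        0
# 1   NaN   8.0       1        0
# 2  31.0   NaN       0        1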
Example #7
    def transform(self, X):
        """
        Returns the predictions of the decision tree, based on the variable's original value.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.
        
        Returns
        -------
        
        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe with the variables encoded as decision tree predictions.
        """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit this encoder.
        _check_input_matches_training_df(X, self.input_shape_[1])

        X = self.encoder_.transform(X)

        return X
Example #8
    def fit(self, X, y=None):
        """
        The `fit` method allows Scikit-learn transformers to learn the required parameters
        from the training data set.

        If transformer is OneHotEncoder, OrdinalEncoder or SimpleImputer, all variables indicated
        in the variables parameter will be transformed. When the variables parameter is None, the
        SklearnWrapper will automatically select and transform all features in the dataset,
        numerical or otherwise.

        For all other Scikit-learn transformers only numerical variables will be transformed.
        The SklearnWrapper will check that the variables indicated in the variables parameter
        are numerical, or alternatively, if variables is None, it will automatically select
        the numerical variables in the data set.
        """

        # check input dataframe
        X = _is_dataframe(X)

        if isinstance(self.transformer, (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
            self.variables = _find_all_variables(X, self.variables)
        else:
            self.variables = _find_numerical_variables(X, self.variables)

        self.transformer.fit(X[self.variables])

        self.input_shape_ = X.shape

        return self
Example #9
    def fit(self, X, y=None):
        """
        Learns the values at the end of the variable distribution.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            The user can pass the entire dataframe, not just the variables that need imputation.

        y : None
            y is not needed in this imputation. You can pass None or y.

        Attributes
        ----------

        imputer_dict_: dictionary
            The dictionary containing the values at the end of the distribution
            per variable. These values will be used by the imputer to replace missing
            data.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        # estimate imputation values
        if self.distribution == 'max':
            self.imputer_dict_ = (X[self.variables].max() *
                                  self.fold).to_dict()

        elif self.distribution == 'gaussian':
            if self.tail == 'right':
                self.imputer_dict_ = (
                    X[self.variables].mean() +
                    self.fold * X[self.variables].std()).to_dict()
            elif self.tail == 'left':
                self.imputer_dict_ = (
                    X[self.variables].mean() -
                    self.fold * X[self.variables].std()).to_dict()

        elif self.distribution == 'skewed':
            IQR = X[self.variables].quantile(0.75) - X[
                self.variables].quantile(0.25)
            if self.tail == 'right':
                self.imputer_dict_ = (X[self.variables].quantile(0.75) +
                                      (IQR * self.fold)).to_dict()
            elif self.tail == 'left':
                self.imputer_dict_ = (X[self.variables].quantile(0.25) -
                                      (IQR * self.fold)).to_dict()

        self.input_shape_ = X.shape

        return self
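
To make the three branches concrete, the same end-of-distribution values can be computed for a single numerical column as follows (the column, data and fold value are illustrative):

import pandas as pd

s = pd.Series([2, 3, 3, 4, 5, 6, 7, 8, 9, 40], name="income")
fold = 3

# 'gaussian', right tail: mean + fold * std
gaussian_right = s.mean() + fold * s.std()

# 'skewed', right tail: 75th percentile + fold * IQR
iqr = s.quantile(0.75) - s.quantile(0.25)
skewed_right = s.quantile(0.75) + fold * iqr

# 'max': maximum value times fold
max_based = s.max() * fold

print(gaussian_right, skewed_right, max_based)

The left-tail variants mirror these formulas with subtraction and the 25th percentile, as in the code above.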
Example #10
    def fit(self, X, y=None):
        """ Learns the numbers to be used to replace the categories in each
        variable.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the variables to be
            encoded.

        y : pandas series, default=None
            The Target. Can be None if encoding_method = 'arbitrary'.
            Otherwise, y needs to be passed when fitting the transformer.
       
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # join target to predictor variables
        if self.encoding_method == 'ordered':
            if y is None:
                raise ValueError(
                    'Please provide a target y for this encoding method')

            temp = pd.concat([X, y], axis=1)
            temp.columns = list(X.columns) + ['target']

        # find mappings
        self.encoder_dict_ = {}

        for var in self.variables:

            if self.encoding_method == 'ordered':
                t = temp.groupby(
                    [var])['target'].mean().sort_values(ascending=True).index

            elif self.encoding_method == 'arbitrary':
                t = X[var].unique()

            self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)}

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
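
A compact illustration of the two mappings learned above, on a toy categorical column and binary target:

import pandas as pd

X = pd.DataFrame({"colour": ["blue", "red", "blue", "green", "red", "green"]})
y = pd.Series([1, 0, 1, 0, 1, 0], name="target")

temp = pd.concat([X, y], axis=1)

# 'ordered': categories sorted by mean target, then numbered 0, 1, 2, ...
ordered_cats = temp.groupby("colour")["target"].mean().sort_values().index
ordered_map = {cat: i for i, cat in enumerate(ordered_cats)}

# 'arbitrary': categories numbered in order of appearance
arbitrary_map = {cat: i for i, cat in enumerate(X["colour"].unique())}

print(ordered_map)    # {'green': 0, 'red': 1, 'blue': 2}
print(arbitrary_map)  # {'blue': 0, 'red': 1, 'green': 2}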
Example #11
    def fit(self, X, y=None):
        """
        Learns the unique categories per variable. If top_categories is indicated,
        it will learn the most popular categories. Alternatively, it learns all
        unique categories per variable.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the selected variables.

        y : pandas series, default=None
            The target. It is not needed in this encoder. You can pass y or
            None.

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the categories for which dummy variables
            will be created.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        self.encoder_dict_ = {}

        for var in self.variables:
            if not self.top_categories:
                if self.drop_last:
                    category_ls = [x for x in X[var].unique()]
                    self.encoder_dict_[var] = category_ls[:-1]
                else:
                    self.encoder_dict_[var] = X[var].unique()

            else:
                self.encoder_dict_[var] = [
                    x for x in X[var].value_counts().sort_values(
                        ascending=False).head(self.top_categories).index
                ]

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
Example #12
    def fit(self, X, y=None):
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        return X
Example #13
    def fit(self, X, y=None):
        """
        Learns the most frequent category if the imputation method is set to frequent.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the selected variables.

        y : None
            y is not needed in this imputation. You can pass None or y.

        Attributes
        ----------

        imputer_dict_: dictionary
            The dictionary mapping each variable to the most frequent category, or to
            the indicated fill value, depending on the imputation_method. The most
            frequent category is calculated when fitting the transformer.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for categorical variables
        self.variables = _find_categorical_variables(X, self.variables)

        if self.imputation_method == 'missing':
            self.imputer_dict_ = {
                var: self.fill_value
                for var in self.variables
            }

        elif self.imputation_method == 'frequent':
            self.imputer_dict_ = {}

            for var in self.variables:
                mode_vals = X[var].mode()

                # careful: some variables contain multiple modes
                if len(mode_vals) == 1:
                    self.imputer_dict_[var] = mode_vals[0]
                else:
                    raise ValueError(
                        'The variable {} contains multiple frequent categories.'
                        .format(var))

        self.input_shape_ = X.shape

        return self
Example #14
    def fit(self, X, y=None):
        """
        Learns the counts or frequencies which will be used to replace the categories.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            The user can pass the entire dataframe.

        y : None
            y is not needed in this encoder. You can pass y or None.

        Attributes
        ----------

        encoder_dict_: dictionary
            Dictionary containing the {category: count / frequency} pairs for
            each variable.
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        self.encoder_dict_ = {}

        # learn encoding maps
        for var in self.variables:
            if self.encoding_method == 'count':
                self.encoder_dict_[var] = X[var].value_counts().to_dict()

            elif self.encoding_method == 'frequency':
                n_obs = float(len(X))
                self.encoder_dict_[var] = (X[var].value_counts() /
                                           n_obs).to_dict()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
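
The two encoding maps built above can be reproduced on a single toy column like this:

import pandas as pd

s = pd.Series(["dog", "cat", "dog", "dog", "bird"], name="pet")

count_map = s.value_counts().to_dict()                 # {'dog': 3, 'cat': 1, 'bird': 1}
frequency_map = (s.value_counts() / len(s)).to_dict()  # {'dog': 0.6, 'cat': 0.2, 'bird': 0.2}

# during transform, categories are replaced with their count or frequency
print(s.map(count_map).tolist())   # [3, 1, 3, 3, 1]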
Example #15
    def fit(self, X, y):
        """
        Learns the mean value of the target for each category of the variable.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the variables to be encoded.

        y : pandas series
            The target.

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the {category: target mean} pairs used
            to replace categories in every variable.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        if y is None:
            raise ValueError(
                'Please provide a target y for this encoding method')

        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ['target']

        self.encoder_dict_ = {}

        for var in self.variables:
            self.encoder_dict_[var] = temp.groupby(
                var)['target'].mean().to_dict()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
Example #16
    def transform(self, X):
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit this transformer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        return X
Example #17
    def fit(self, X, y=None):
        """
        Learns the numbers that should be used to replace the categories in each
        variable.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y : pandas series.
            The target variable. Required to train the decision tree and for
            ordered ordinal encoding.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # initialize categorical encoder
        cat_encoder = OrdinalCategoricalEncoder(
            encoding_method=self.encoding_method, variables=self.variables)

        # initialize decision tree discretiser
        tree_discretiser = DecisionTreeDiscretiser(
            cv=self.cv,
            scoring=self.scoring,
            variables=self.variables,
            param_grid=self.param_grid,
            regression=self.regression,
            random_state=self.random_state)

        # pipeline for the encoder
        self.encoder_ = Pipeline([('categorical_encoder', cat_encoder),
                                  ('tree_discretiser', tree_discretiser)])

        self.encoder_.fit(X, y)

        self.input_shape_ = X.shape

        return self
Example #18
    def fit(self, X, y=None):
        """
        Makes a copy of the variables to impute in the training dataframe from
        which it will randomly extract the values to fill the missing data
        during transform.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the variables to impute.

        y : None
            y is not needed in this imputation. You can pass None or y.

        Attributes
        ----------

        X_ : dataframe.
            Copy of the training dataframe from which to extract the random samples.
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find variables to impute
        if not self.variables:
            self.variables = list(X.columns)

        # take a copy of the selected variables
        self.X_ = X[self.variables].copy()

        # check the variables assigned to the random state
        if self.seed == 'observation':
            self.random_state = _define_variables(self.random_state)
            if len([var
                    for var in self.random_state if var not in X.columns]) > 0:
                raise ValueError(
                    "There are variables assigned as random state which are not part of the training "
                    "dataframe.")
        self.input_shape_ = X.shape

        return self
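
According to the docstring, the stored copy X_ is later used to draw random values that fill the gaps during transform. A rough sketch of that idea with plain pandas (the per-observation seeding of the real transformer is more elaborate than this):

import numpy as np
import pandas as pd

train = pd.DataFrame({"age": [20, 25, 30, 35, 40]})
X_ = train[["age"]].copy()                     # what fit() keeps

test = pd.DataFrame({"age": [22, np.nan, np.nan]})

# draw as many random observations as there are missing values
n_missing = test["age"].isnull().sum()
sampled = X_["age"].dropna().sample(n=n_missing, random_state=0)
sampled.index = test.loc[test["age"].isnull()].index

test.loc[test["age"].isnull(), "age"] = sampled
print(test)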
Example #19
    def transform(self, X):
        """ Replaces categories with the learned parameters.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features].
            The input samples.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features].
            The dataframe containing categories replaced by numbers.
       """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit this encoder.
        _check_input_matches_training_df(X, self.input_shape_[1])

        # replace categories by the learned parameters
        for feature in self.encoder_dict_.keys():
            X[feature] = X[feature].map(self.encoder_dict_[feature])

        # check if NaN values were introduced by the encoding
        if X[self.encoder_dict_.keys()].isnull().sum().sum() > 0:
            warnings.warn(
                "NaN values were introduced in the returned dataframe by the encoder."
                "This means that some of the categories in the input dataframe were "
                "not present in the training set used when the fit method was called. "
                "Thus, mappings for those categories does not exist. Try using the "
                "RareLabelCategoricalEncoder to remove infrequent categories before "
                "calling this encoder."
            )

        return X
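
The NaN check at the end guards against categories that were not seen during fit: pandas' map returns NaN for any key missing from the mapping dictionary, as the toy example below shows.

import pandas as pd

encoder_dict_ = {"colour": {"blue": 0, "red": 1}}

X = pd.DataFrame({"colour": ["blue", "red", "purple"]})   # 'purple' was not seen at fit time
X["colour"] = X["colour"].map(encoder_dict_["colour"])

print(X["colour"].tolist())         # [0.0, 1.0, nan]
print(X["colour"].isnull().sum())   # 1 -> this is what triggers the warning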
Example #20
    def fit(self, X, y=None):
        """
        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : None
            y is not needed in this transformer. You can pass y or None.

        Attributes
        ----------

        right_tail_caps_: dictionary
            The dictionary containing the maximum values at which variables
            will be capped.

        left_tail_caps_ : dictionary
            The dictionary containing the minimum values at which variables
            will be capped.
        """
        X = _is_dataframe(X)

        if self.missing_values == 'raise':
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        if self.max_capping_dict is not None:
            self.right_tail_caps_ = self.max_capping_dict
        else:
            self.right_tail_caps_ = {}

        if self.min_capping_dict is not None:
            self.left_tail_caps_ = self.min_capping_dict
        else:
            self.left_tail_caps_ = {}

        self.input_shape_ = X.shape

        return self
Example #21
    def fit(self, X, y=None):
        """
        Learns the variables for which the missing indicators will be created.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : None
            y is not needed in this imputation. You can pass None or y.

        Attributes
        ----------

        variables_: list
            The list of variables for which the missing indicators will be created.
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find variables for which indicator should be added
        if self.how == 'missing_only':
            if not self.variables:
                self.variables_ = [
                    var for var in X.columns if X[var].isnull().sum() > 0
                ]
            else:
                self.variables_ = [
                    var for var in self.variables if X[var].isnull().sum() > 0
                ]

        elif self.how == 'all':
            if not self.variables:
                self.variables_ = [var for var in X.columns]
            else:
                self.variables_ = self.variables

        self.input_shape_ = X.shape

        return self
Example #22
    def fit(self, X, y=None):
        """
        Checks that the variables are numerical and creates the dictionary of imputation values.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            User can pass the entire dataframe, not just the variables to impute.

        y : None
            y is not needed in this imputation. You can pass None or y.


        Attributes
        ----------

        imputer_dict_: dictionary
            The dictionary containing the values that will be used to replace each variable.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        if self.imputer_dict:
            self.variables = _find_numerical_variables(
                X, self.imputer_dict.keys())
        else:
            self.variables = _find_numerical_variables(X, self.variables)

        # create the imputer dictionary
        if self.imputer_dict:
            self.imputer_dict_ = self.imputer_dict
        else:
            self.imputer_dict_ = {
                var: self.arbitrary_number
                for var in self.variables
            }

        self.input_shape_ = X.shape

        return self
Example #23
    def transform(self, X):
        """
        Removes observations with outliers from the dataframe.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe without outlier observations.
        """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        if self.missing_values == 'raise':
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit this transformer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        for feature in self.right_tail_caps_.keys():
            outliers = np.where(X[feature] > self.right_tail_caps_[feature],
                                True, False)
            X = X.loc[~outliers]

        for feature in self.left_tail_caps_.keys():
            outliers = np.where(X[feature] < self.left_tail_caps_[feature],
                                True, False)
            X = X.loc[~outliers]

        return X
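
In contrast to capping, the trimmer above removes the offending rows entirely. A one-column sketch of the same masking logic (toy values):

import pandas as pd

X = pd.DataFrame({"fare": [1, 5, 50, 200]})
right_cap = 100

outliers = X["fare"] > right_cap
X = X.loc[~outliers]

print(X["fare"].tolist())   # [1, 5, 50]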
Example #24
    def transform(self, X):
        """
        Caps the variable values, that is, censors outliers.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe with the capped variables.
        """

        # check if class was fitted
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        if self.missing_values == 'raise':
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit this transformer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        # replace outliers
        for feature in self.right_tail_caps_.keys():
            X[feature] = np.where(X[feature] > self.right_tail_caps_[feature],
                                  self.right_tail_caps_[feature], X[feature])

        for feature in self.left_tail_caps_.keys():
            X[feature] = np.where(X[feature] < self.left_tail_caps_[feature],
                                  self.left_tail_caps_[feature], X[feature])

        return X
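
The np.where calls above implement simple value censoring. For a single variable, the same capping can also be written with pandas' clip, which some may find easier to read (toy numbers; clip is shown only as an equivalent formulation, not as what the transformer uses):

import numpy as np
import pandas as pd

s = pd.Series([1, 5, 50, 200])
right_cap, left_cap = 100, 2

capped_np = np.where(s > right_cap, right_cap, s)
capped_np = np.where(capped_np < left_cap, left_cap, capped_np)

capped_pd = s.clip(lower=left_cap, upper=right_cap)

print(capped_np.tolist())   # [2, 5, 50, 100]
print(capped_pd.tolist())   # [2, 5, 50, 100]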
Example #25
    def transform(self, X):
        """
        Creates the dummy / binary variables.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The data to transform.
        
        Returns
        -------
        
        X_transformed : pandas dataframe.
            The shape of the dataframe will be different from the original as it includes the dummy variables.
        """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit this encoder.
        _check_input_matches_training_df(X, self.input_shape_[1])

        for feature in self.variables:
            for category in self.encoder_dict_[feature]:
                X[str(feature) + '_' + str(category)] = np.where(
                    X[feature] == category, 1, 0)

        # drop the original non-encoded variables.
        X.drop(labels=self.variables, axis=1, inplace=True)

        return X
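
The loop above creates one 0/1 indicator column per category learned during fit, similar in spirit to pandas' get_dummies restricted to those categories. On a toy column:

import numpy as np
import pandas as pd

X = pd.DataFrame({"colour": ["blue", "red", "blue"]})
encoder_dict_ = {"colour": ["blue", "red"]}   # categories learned during fit

for category in encoder_dict_["colour"]:
    X["colour_" + category] = np.where(X["colour"] == category, 1, 0)

X = X.drop(columns=["colour"])
print(X)
#    colour_blue  colour_red
# 0            1           0
# 1            0           1
# 2            1           0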
Example #26
    def fit(self, X, y=None):
        """
        Learns the mean or median values.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            User can pass the entire dataframe, not just the variables that need imputation.

        y : pandas series or None, default=None
            y is not needed in this imputation. You can pass None or y.

        Attributes
        ----------

        imputer_dict_ : dictionary
            The dictionary containing the mean / median values per variable. These
            values will be used by the imputer to replace missing data.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        # find imputation parameters: mean or median
        if self.imputation_method == 'mean':
            self.imputer_dict_ = X[self.variables].mean().to_dict()

        elif self.imputation_method == 'median':
            self.imputer_dict_ = X[self.variables].median().to_dict()

        self.input_shape_ = X.shape

        return self
Example #27
    def inverse_transform(self, X):
        """ Convert the data back to the original representation.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The transformed dataframe, that is, the dataframe with the encoded variables.

        Returns
        -------

        X_original : pandas dataframe of shape = [n_samples, n_features]
            The un-transformed dataframe, that is, containing the original values
            of the categorical variables.
       """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit this encoder.
        _check_input_matches_training_df(X, self.input_shape_[1])

        # replace encoded categories by the original values
        for feature in self.encoder_dict_.keys():
            inv_map = {v: k for k, v in self.encoder_dict_[feature].items()}
            X[feature] = X[feature].map(inv_map)

        return X
Example #28
def test_is_dataframe(dataframe_vartypes):
    assert_frame_equal(_is_dataframe(dataframe_vartypes), dataframe_vartypes)
    with pytest.raises(TypeError):
        assert _is_dataframe([1, 2, 4])
Example #29
    def fit(self, X, y):
        """
        Learns the numbers that should be used to replace the categories in each
        variable. That is, the WoE or the ratio of probabilities.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y : pandas series.
            Target, must be binary [0,1].

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the {category: WoE / ratio} pairs per variable.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        if y is None:
            raise ValueError(
                'Please provide a target y for this encoding method')

        # check that y is binary
        if len([x for x in y.unique() if x not in [0, 1]]) > 0:
            raise ValueError(
                "This encoder is only designed for binary classification, values of y can be only 0 or 1"
            )

        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ['target']

        self.encoder_dict_ = {}

        if self.encoding_method == 'woe':
            total_pos = temp['target'].sum()
            total_neg = len(temp) - total_pos
            temp['non_target'] = np.where(temp['target'] == 1, 0, 1)

            for var in self.variables:
                pos = temp.groupby([var])['target'].sum() / total_pos
                neg = temp.groupby([var])['non_target'].sum() / total_neg

                t = pd.concat([pos, neg], axis=1)
                t['woe'] = np.log(t['target'] / t['non_target'])

                if not t.loc[t['target'] == 0, :].empty or not t.loc[
                        t['non_target'] == 0, :].empty:
                    raise ValueError(
                        "The proportion of 1 of the classes for a category in variable {} is zero, and log of zero is "
                        "not defined".format(var))

                self.encoder_dict_[var] = t['woe'].to_dict()

        else:
            for var in self.variables:
                t = temp.groupby(var)['target'].mean()
                t = pd.concat([t, 1 - t], axis=1)
                t.columns = ['p1', 'p0']

                if self.encoding_method == 'log_ratio':
                    if not t.loc[t['p0'] == 0, :].empty or not t.loc[
                            t['p1'] == 0, :].empty:
                        raise ValueError(
                            "p(0) or p(1) for a category in variable {} is zero, log of zero is not defined"
                            .format(var))
                    else:
                        self.encoder_dict_[var] = (np.log(t.p1 /
                                                          t.p0)).to_dict()

                elif self.encoding_method == 'ratio':
                    if not t.loc[t['p0'] == 0, :].empty:
                        raise ValueError(
                            "p(0) for a category in variable {} is zero, division by 0 is not defined"
                            .format(var))
                    else:
                        self.encoder_dict_[var] = (t.p1 / t.p0).to_dict()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
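
The weight-of-evidence branch is easiest to follow on a tiny example: for each category, WoE = log(P(category | target = 1) / P(category | target = 0)). A minimal standalone computation with toy data:

import numpy as np
import pandas as pd

temp = pd.DataFrame({
    "colour": ["blue", "blue", "blue", "red", "red", "red"],
    "target": [1, 1, 0, 1, 0, 0],
})

total_pos = temp["target"].sum()        # 3
total_neg = len(temp) - total_pos       # 3

grouped = temp.groupby("colour")["target"]
pos = grouped.sum() / total_pos                      # share of positives per category
neg = (grouped.count() - grouped.sum()) / total_neg  # share of negatives per category

woe = np.log(pos / neg)
print(woe.to_dict())   # {'blue': 0.693..., 'red': -0.693...}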
Example #30
    def fit(self, X, y=None):
        """
        Learns the frequent categories for each variable.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the selected variables.

        y : None
            y is not required. You can pass y or None.

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the frequent categories (that will be kept)
            for each variable. Categories not present in this list will be replaced
            by 'Rare' or by the user defined value.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        self.encoder_dict_ = {}

        for var in self.variables:
            if len(X[var].unique()) > self.n_categories:

                # if the variable has more than the indicated number of categories
                # the encoder will learn the most frequent categories
                t = pd.Series(X[var].value_counts() / float(len(X)))

                # non-rare labels:
                freq_idx = t[t >= self.tol].index

                if self.max_n_categories:
                    self.encoder_dict_[var] = freq_idx[:self.max_n_categories]
                else:
                    self.encoder_dict_[var] = freq_idx

            else:
                # if the total number of categories is smaller than the indicated
                # the encoder will consider all categories as frequent.
                warnings.warn(
                    "The number of unique categories for variable {} is less than that indicated in "
                    "n_categories. Thus, all categories will be considered frequent"
                    .format(var))
                self.encoder_dict_[var] = X[var].unique()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
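
A quick illustration of the frequency threshold used above: categories whose relative frequency falls below tol are the ones that will later be grouped under 'Rare' (data and tol are illustrative):

import pandas as pd

s = pd.Series(["a"] * 6 + ["b"] * 3 + ["c"], name="var")
tol = 0.2

freqs = s.value_counts() / len(s)        # a: 0.6, b: 0.3, c: 0.1
frequent = freqs[freqs >= tol].index     # 'c' falls below tol and would be replaced

print(freqs.to_dict())    # {'a': 0.6, 'b': 0.3, 'c': 0.1}
print(list(frequent))     # ['a', 'b']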