def test_check_X_y_raises_error_when_pandas_index_dont_match():
    df = pd.DataFrame(
        {"0": [1, 2, 3, 4], "1": [5, 6, 7, 8]},
        index=[22, 99, 101, 212],
    )
    s = pd.Series([1, 2, 3, 4], index=[22, 99, 101, 999])
    with pytest.raises(ValueError):
        check_X_y(df, s)
Example #2
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Learn the mean value of the target for each category of each variable.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to be encoded.

        y: pandas series
            The target.
        """

        X, y = check_X_y(X, y)
        self._fit(X)
        self._get_feature_names_in(X)

        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ["target"]

        self.encoder_dict_ = {}

        for var in self.variables_:
            self.encoder_dict_[var] = (
                temp.groupby(var)["target"].mean().to_dict()
            )

        self._check_encoding_dictionary()

        return self
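
A minimal usage sketch for the fit method above, assuming it belongs to feature_engine's MeanEncoder; the class name and import path are assumptions, not confirmed by the listing.

# hedged sketch: MeanEncoder and its import path are assumed
import pandas as pd
from feature_engine.encoding import MeanEncoder

X = pd.DataFrame({
    "colour": ["blue", "blue", "red", "red"],
    "size": ["S", "M", "S", "L"],
})
y = pd.Series([1, 0, 1, 1])

encoder = MeanEncoder(variables=["colour"])
encoder.fit(X, y)

# each category maps to the mean target observed for it
print(encoder.encoder_dict_)  # {'colour': {'blue': 0.5, 'red': 1.0}}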
Example #3
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Learn the WoE.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y: pandas series.
            Target, must be binary.
        """

        X, y = check_X_y(X, y)

        # check that y is binary
        if y.nunique() != 2:
            raise ValueError(
                "This encoder is designed for binary classification. The target "
                "does not have exactly 2 unique values.")

        self._fit(X)
        self._get_feature_names_in(X)

        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ["target"]

        # if the target does not take the values 0 and 1, remap it so that the
        # class proportions can be computed.
        if any(x not in [0, 1] for x in y.unique()):
            temp["target"] = np.where(temp["target"] == y.unique()[0], 0, 1)

        self.encoder_dict_ = {}

        total_pos = temp["target"].sum()
        total_neg = len(temp) - total_pos
        temp["non_target"] = np.where(temp["target"] == 1, 0, 1)

        for var in self.variables_:
            pos = temp.groupby([var])["target"].sum() / total_pos
            neg = temp.groupby([var])["non_target"].sum() / total_neg

            t = pd.concat([pos, neg], axis=1)

            # log(0) is undefined, so raise before taking the log if any
            # category has a zero class proportion.
            if (not t.loc[t["target"] == 0, :].empty
                    or not t.loc[t["non_target"] == 0, :].empty):
                raise ValueError(
                    "The proportion of one of the classes for a category in "
                    "variable {} is zero, and log of zero is not defined".format(var))

            t["woe"] = np.log(t["target"] / t["non_target"])

            self.encoder_dict_[var] = t["woe"].to_dict()

        self._check_encoding_dictionary()

        return self
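
A hedged usage sketch for the method above, assuming it is feature_engine's WoEEncoder; the class name and import path are assumptions.

# hedged sketch: WoEEncoder and its import path are assumed
import pandas as pd
from feature_engine.encoding import WoEEncoder

X = pd.DataFrame({"colour": ["blue", "blue", "red", "red", "red", "blue"]})
y = pd.Series([0, 1, 1, 1, 0, 0])  # the target must be binary

encoder = WoEEncoder(variables=["colour"])
encoder.fit(X, y)

# WoE = log( p(category | y=1) / p(category | y=0) )
print(encoder.encoder_dict_)
# {'colour': {'blue': -0.693..., 'red': 0.693...}}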
def test_check_X_y_returns_pandas_from_pandas_with_non_typical_index():
    df = pd.DataFrame(
        {"0": [1, 2, 3, 4], "1": [5, 6, 7, 8]},
        index=[22, 99, 101, 212],
    )
    s = pd.Series([1, 2, 3, 4], index=[22, 99, 101, 212])
    x, y = check_X_y(df, s)
    assert_frame_equal(df, x)
    assert_series_equal(s, y)
def test_check_x_y_converts_numpy_to_pandas():
    a2D = np.array([[1, 2], [3, 4], [3, 4], [3, 4]])
    df_2D = pd.DataFrame(a2D, columns=["0", "1"])

    a1D = np.array([1, 2, 3, 4])
    s = pd.Series(a1D)

    x, y = check_X_y(df_2D, s)
    assert_frame_equal(df_2D, x)
    assert_series_equal(s, y)
def test_check_x_y_reassigns_index_when_only_one_input_is_pandas():
    # case 1: X is dataframe, y is something else
    df = pd.DataFrame(
        {"0": [1, 2, 3, 4], "1": [5, 6, 7, 8]},
        index=[22, 99, 101, 212],
    )
    s = np.array([1, 2, 3, 4])
    s_exp = pd.Series([1, 2, 3, 4], index=[22, 99, 101, 212])
    x, y = check_X_y(df, s)
    assert_frame_equal(df, x)
    assert_series_equal(s_exp.astype(int), y.astype(int))

    # case 2: X is not a df, y is a series
    df = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]).T
    s = pd.Series([1, 2, 3, 4], index=[22, 99, 101, 212])
    df_exp = pd.DataFrame(df, columns=["0", "1"])
    df_exp.index = s.index

    x, y = check_X_y(df, s)
    assert_frame_equal(df_exp, x)
    assert_series_equal(s, y)
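
The behaviour these tests pin down, shown as a hedged sketch; the feature_engine.dataframe_checks import path is an assumption.

import numpy as np
import pandas as pd
from feature_engine.dataframe_checks import check_X_y  # path assumed

X = np.array([[1, 2], [3, 4]])
y = pd.Series([0, 1], index=[10, 20])

x_checked, y_checked = check_X_y(X, y)
print(type(x_checked))           # numpy input comes back as a DataFrame
print(x_checked.index.tolist())  # [10, 20], copied from the pandas input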
Example #7
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """Learn the numbers to be used to replace the categories in each
        variable.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to be encoded.

        y: pandas series, default=None
            The Target. Can be None if `encoding_method='arbitrary'`.
            Otherwise, y needs to be passed when fitting the transformer.
        """

        if self.encoding_method == "ordered":
            X, y = check_X_y(X, y)
        else:
            X = check_X(X)

        self._fit(X)
        self._get_feature_names_in(X)

        if self.encoding_method == "ordered":
            temp = pd.concat([X, y], axis=1)
            temp.columns = list(X.columns) + ["target"]

        # find mappings
        self.encoder_dict_ = {}

        for var in self.variables_:

            if self.encoding_method == "ordered":
                t = (temp.groupby([var])["target"].mean()
                     .sort_values(ascending=True).index)

            elif self.encoding_method == "arbitrary":
                t = X[var].unique()

            self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)}

        self._check_encoding_dictionary()

        return self
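
A minimal sketch, assuming the method above belongs to feature_engine's OrdinalEncoder; the class name and import path are assumptions.

# hedged sketch: OrdinalEncoder and its import path are assumed
import pandas as pd
from feature_engine.encoding import OrdinalEncoder

X = pd.DataFrame({"city": ["London", "Paris", "London", "Rome"]})
y = pd.Series([1, 0, 1, 1])

# 'ordered' ranks categories by mean target, so y is required;
# 'arbitrary' numbers them in order of appearance and needs no y.
encoder = OrdinalEncoder(encoding_method="ordered", variables=["city"])
encoder.fit(X, y)
print(encoder.encoder_dict_)  # e.g. {'city': {'Paris': 0, 'London': 1, 'Rome': 2}}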
Example #8
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Fit a decision tree per variable.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            categorical variables.

        y : pandas series.
            The target variable. Required to train the decision tree and for
            ordered ordinal encoding.
        """
        X, y = check_X_y(X, y)

        # confirm model type and target variables are compatible.
        if self.regression is True:
            if type_of_target(y) == "binary":
                raise ValueError(
                    "Trying to fit a regression to a binary target is not "
                    "allowed by this transformer. Check the target values "
                    "or set regression to False.")

        else:
            check_classification_targets(y)

        self._fit(X)
        self._get_feature_names_in(X)

        if self.param_grid:
            param_grid = self.param_grid
        else:
            param_grid = {"max_depth": [1, 2, 3, 4]}

        # initialize categorical encoder
        cat_encoder = OrdinalEncoder(
            encoding_method=self.encoding_method,
            variables=self.variables_,
            ignore_format=self.ignore_format,
            errors="raise",
        )

        # initialize decision tree discretiser
        tree_discretiser = DecisionTreeDiscretiser(
            cv=self.cv,
            scoring=self.scoring,
            variables=self.variables_,
            param_grid=param_grid,
            regression=self.regression,
            random_state=self.random_state,
        )

        # pipeline for the encoder
        self.encoder_ = Pipeline([
            ("categorical_encoder", cat_encoder),
            ("tree_discretiser", tree_discretiser),
        ])

        self.encoder_.fit(X, y)

        return self
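
A hedged sketch, assuming the class above is feature_engine's DecisionTreeEncoder; the class name, import path, and parameters are assumptions.

# hedged sketch: DecisionTreeEncoder and its import path are assumed
import pandas as pd
from feature_engine.encoding import DecisionTreeEncoder

X = pd.DataFrame({"colour": ["blue", "red", "blue", "red", "green", "green"]})
y = pd.Series([1, 0, 1, 0, 1, 0])

encoder = DecisionTreeEncoder(regression=False, cv=2, scoring="accuracy")
encoder.fit(X, y)

# categories are replaced by the predictions of a tree fit per variable
print(encoder.transform(X))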
Example #9
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Learn the numbers that should be used to replace the categories in each
        variable, that is, the ratio (or log ratio) of the target probabilities.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            categorical variables.

        y: pandas series.
            Target, must be binary.
        """

        X, y = check_X_y(X, y)

        # check that y is binary
        if y.nunique() != 2:
            raise ValueError(
                "This encoder is designed for binary classification. The target "
                "does not have exactly 2 unique values.")

        self._fit(X)
        self._get_feature_names_in(X)

        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ["target"]

        # if the target does not take the values 0 and 1, remap it so that the
        # class proportions can be computed.
        if any(x not in [0, 1] for x in y.unique()):
            temp["target"] = np.where(temp["target"] == y.unique()[0], 0, 1)

        self.encoder_dict_ = {}

        for var in self.variables_:

            t = temp.groupby(var)["target"].mean()
            t = pd.concat([t, 1 - t], axis=1)
            t.columns = ["p1", "p0"]

            if self.encoding_method == "log_ratio":
                if not t.loc[t["p0"] == 0, :].empty or not t.loc[t["p1"] ==
                                                                 0, :].empty:
                    raise ValueError(
                        "p(0) or p(1) for a category in variable {} is zero, log of "
                        "zero is not defined".format(var))
                else:
                    self.encoder_dict_[var] = (np.log(t.p1 / t.p0)).to_dict()

            elif self.encoding_method == "ratio":
                if not t.loc[t["p0"] == 0, :].empty:
                    raise ValueError(
                        "p(0) for a category in variable {} is zero, division by 0 is "
                        "not defined".format(var))

                else:
                    self.encoder_dict_[var] = (t.p1 / t.p0).to_dict()

        self._check_encoding_dictionary()

        return self
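
A sketch assuming the class above is feature_engine's PRatioEncoder; that class shipped with older feature_engine releases and may be absent from recent ones, so both the name and import path are assumptions.

# hedged sketch: PRatioEncoder is assumed and may not exist in your version
import pandas as pd
from feature_engine.encoding import PRatioEncoder

X = pd.DataFrame({"colour": ["blue"] * 3 + ["red"] * 3})
y = pd.Series([0, 1, 1, 0, 0, 1])

encoder = PRatioEncoder(encoding_method="ratio", variables=["colour"])
encoder.fit(X, y)

# 'ratio' stores p(1)/p(0) per category; 'log_ratio' stores its log
print(encoder.encoder_dict_)  # {'colour': {'blue': 2.0, 'red': 0.5}}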
Example #10
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
           The input dataframe.

        y: array-like of shape (n_samples)
           Target variable. Required to train the estimator.
        """
        # check input dataframe
        X, y = check_X_y(X, y)

        # If required, exclude variables that are not in the input dataframe.
        self._confirm_variables(X)

        # find all variables or check those entered are present in the dataframe
        self.variables_ = _find_all_variables(X, self.variables_, exclude_datetime=True)

        if len(self.variables_) == 1 and self.threshold is None:
            raise ValueError(
                "When evaluating a single feature you need to manually set a value "
                "for the threshold. "
                f"The transformer is evaluating the performance of {self.variables_} "
                f"and the threshold was left to {self.threshold} when initializing "
                f"the transformer."
            )

        # save input features
        self._get_feature_names_in(X)

        # set up the correct estimator
        if self.regression is True:
            est = TargetMeanRegressor(
                bins=self.bins,
                strategy=self.strategy,
            )
        else:
            est = TargetMeanClassifier(
                bins=self.bins,
                strategy=self.strategy,
            )

        self.feature_performance_ = {}

        for variable in self.variables_:
            # clone estimator
            estimator = clone(est)

            # set the estimator to evaluate the required variable
            estimator.set_params(variables=variable)

            model = cross_validate(
                estimator,
                X,
                y,
                cv=self.cv,
                scoring=self.scoring,
            )

            self.feature_performance_[variable] = model["test_score"].mean()

        # select features
        if self.threshold is None:
            threshold = pd.Series(self.feature_performance_).mean()
        else:
            threshold = self.threshold

        self.features_to_drop_ = [
            f for f in self.variables_ if self.feature_performance_[f] < threshold
        ]

        return self
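
A hedged sketch, assuming the selector above is feature_engine's SelectByTargetMeanPerformance: each feature is scored with a target-mean model under cross-validation, and features scoring below the threshold are dropped. Class name, import path, and parameters are assumptions.

# hedged sketch: SelectByTargetMeanPerformance is assumed
import numpy as np
import pandas as pd
from feature_engine.selection import SelectByTargetMeanPerformance

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "informative": rng.normal(size=200),
    "noise": rng.normal(size=200),
})
y = (X["informative"] > 0).astype(int)

sel = SelectByTargetMeanPerformance(scoring="roc_auc", cv=3, bins=3)
sel.fit(X, y)
print(sel.feature_performance_)  # per-feature cross-validated score
print(sel.features_to_drop_)     # likely ['noise']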
Example #11
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Learn the mean target value per category or bin.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : pandas series of shape = [n_samples,]
            The target variable.
        """
        # check if 'X' is a dataframe
        X, y = check_X_y(X, y)

        # find categorical and numerical variables
        (
            self.variables_categorical_,
            self.variables_numerical_,
        ) = _find_categorical_and_numerical_variables(X, self.variables)

        # check for missing values
        _check_contains_na(X, self.variables_numerical_)
        _check_contains_na(X, self.variables_categorical_)

        # check inf
        _check_contains_inf(X, self.variables_numerical_)

        # Create pipelines
        if self.variables_categorical_ and self.variables_numerical_:
            self._pipeline = self._make_combined_pipeline()

        elif self.variables_categorical_:
            self._pipeline = self._make_categorical_pipeline()

        else:
            self._pipeline = self._make_numerical_pipeline()

        # Train pipeline
        self._pipeline.fit(X, y)

        # Assign attributes (useful to interpret features).
        # Use dict() to take a copy; otherwise the attribute would be just
        # another reference to the pipeline's own dictionary.
        if self.variables_categorical_ and self.variables_numerical_:
            self.binner_dict_ = dict(
                self._pipeline.named_steps["discretiser"].binner_dict_
            )
            self.encoder_dict_ = dict(
                self._pipeline.named_steps["encoder_num"].encoder_dict_
            )
            tmp_dict = dict(self._pipeline.named_steps["encoder_cat"].encoder_dict_)
            self.encoder_dict_.update(tmp_dict)

        elif self.variables_categorical_:
            self.binner_dict_ = {}
            self.encoder_dict_ = dict(self._pipeline.encoder_dict_)

        else:
            self.binner_dict_ = dict(
                self._pipeline.named_steps["discretiser"].binner_dict_
            )
            self.encoder_dict_ = dict(
                self._pipeline.named_steps["encoder"].encoder_dict_
            )

        # store input features
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = list(X.columns)

        return self
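
A hedged sketch of the estimator whose fit is shown above, assumed to be feature_engine's TargetMeanClassifier; the feature_engine.prediction module path is an assumption and may differ across versions.

# hedged sketch: TargetMeanClassifier and its module path are assumed
import pandas as pd
from feature_engine.prediction import TargetMeanClassifier

X = pd.DataFrame({
    "city": ["London", "Paris", "London", "Rome", "Paris", "Rome"],
    "age": [20, 30, 40, 50, 60, 70],
})
y = pd.Series([1, 0, 1, 1, 0, 1])

clf = TargetMeanClassifier(bins=2, strategy="equal_width")
clf.fit(X, y)  # categoricals are mean-encoded, numericals binned then encoded
print(clf.binner_dict_)   # interval limits per numerical variable
print(clf.encoder_dict_)  # mean target per category / bin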
Example #12
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
           The input dataframe.
        y: array-like of shape (n_samples)
           Target variable. Required to train the estimator.
        """

        X, y = check_X_y(X, y)

        # reset the index
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        # If required, exclude variables that are not in the input dataframe.
        self._confirm_variables(X)

        # find numerical variables or check variables entered by user
        self.variables_ = _find_or_check_numerical_variables(
            X, self.variables_)

        # check that there are more than 1 variable to select from
        self._check_variable_number()

        # train model with all features and cross-validation
        model = cross_validate(
            self.estimator,
            X[self.variables_],
            y,
            cv=self.cv,
            return_estimator=True,
            scoring=self.scoring,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # extract the validation folds
        cv_ = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))
        validation_indices = [val_index for _, val_index in cv_.split(X, y)]

        # get performance metric
        scorer = get_scorer(self.scoring)

        # seed
        random_state = check_random_state(self.random_state)

        # dict to collect features and their performance_drift after shuffling
        self.performance_drifts_ = {}

        # shuffle features and save feature performance drift into a dict
        for feature in self.variables_:

            X_shuffled = X[self.variables_].copy()

            # shuffle individual feature
            X_shuffled[feature] = (
                X_shuffled[feature]
                .sample(frac=1, random_state=random_state)
                .reset_index(drop=True)
            )

            # determine the performance with the shuffled feature
            performance = np.mean([
                scorer(m, X_shuffled.iloc[idx], y.iloc[idx])
                for m, idx in zip(model["estimator"], validation_indices)
            ])

            # determine the drift in performance.
            # Note: sklearn negates error and loss scores, so there is no need
            # to invert them manually, see:
            # https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules
            performance_drift = self.initial_model_performance_ - performance

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift

        # select features
        if self.threshold is None:
            threshold = pd.Series(self.performance_drifts_).mean()
        else:
            threshold = self.threshold

        self.features_to_drop_ = [
            f for f in self.performance_drifts_.keys()
            if self.performance_drifts_[f] < threshold
        ]

        # save input features
        self._get_feature_names_in(X)

        return self
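
A hedged sketch, assuming the selector above is feature_engine's SelectByShuffling: the performance drift after permuting each feature decides whether it is kept. Class name and import path are assumptions.

# hedged sketch: SelectByShuffling and its import path are assumed
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from feature_engine.selection import SelectByShuffling

rng = np.random.default_rng(42)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=["f0", "f1", "f2"])
y = (X["f0"] + 0.1 * rng.normal(size=100) > 0).astype(int)

sel = SelectByShuffling(
    estimator=LogisticRegression(),
    scoring="roc_auc",
    cv=3,
    random_state=0,
)
sel.fit(X, y)
print(sel.performance_drifts_)  # large drift for f0, near zero for f1, f2
print(sel.features_to_drop_)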
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Select features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y: array-like of shape (n_samples)
           Target variable. Required to train the estimator.
        """

        # check input dataframe
        X, y = check_X_y(X, y)

        # If required, exclude variables that are not in the input dataframe.
        self._confirm_variables(X)

        # find numerical variables or check variables entered by user
        self.variables_ = _find_or_check_numerical_variables(
            X, self.variables_)

        if len(self.variables_) == 1 and self.threshold is None:
            raise ValueError(
                "When evaluating a single feature you need to manually set a value "
                "for the threshold. "
                f"The transformer is evaluating the performance of {self.variables_} "
                f"and the threshold was left to {self.threshold} when initializing "
                f"the transformer.")

        self.feature_performance_ = {}

        # train a model for every feature and store the performance
        for feature in self.variables_:
            model = cross_validate(
                self.estimator,
                X[feature].to_frame(),
                y,
                cv=self.cv,
                return_estimator=False,
                scoring=self.scoring,
            )

            self.feature_performance_[feature] = model["test_score"].mean()

        # select features
        if self.threshold is None:
            threshold = pd.Series(self.feature_performance_).mean()
        else:
            threshold = self.threshold

        self.features_to_drop_ = [
            f for f in self.feature_performance_.keys()
            if self.feature_performance_[f] < threshold
        ]

        # check we are not dropping all the columns in the df
        if len(self.features_to_drop_) == len(X.columns):
            warnings.warn(
                "All features will be dropped, try changing the threshold.")

        # save input features
        self._get_feature_names_in(X)

        return self
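
A hedged sketch, assuming the selector above is feature_engine's SelectBySingleFeaturePerformance: one model per feature, keeping features whose score clears the threshold. Class name and import path are assumptions.

# hedged sketch: SelectBySingleFeaturePerformance is assumed
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from feature_engine.selection import SelectBySingleFeaturePerformance

rng = np.random.default_rng(7)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=["f0", "f1", "f2"])
y = (X["f0"] > 0).astype(int)

sel = SelectBySingleFeaturePerformance(
    estimator=LogisticRegression(), scoring="roc_auc", cv=3, threshold=0.6
)
sel.fit(X, y)
print(sel.feature_performance_)  # cross-validated score per single feature
print(sel.features_to_drop_)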
def test_check_x_y_returns_pandas_from_pandas(df_vartypes):
    s = pd.Series([0, 1, 2, 3])
    x, y = check_X_y(df_vartypes, s)
    assert_frame_equal(df_vartypes, x)
    assert_series_equal(s, y)
def test_check_x_y_raises_error_when_inconsistent_length(df_vartypes):
    s = pd.Series([0, 1, 2, 3, 5])
    with pytest.raises(ValueError):
        check_X_y(df_vartypes, s)
Example #16
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find initial model performance. Sort features by importance.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y: array-like of shape (n_samples)
           Target variable. Required to train the estimator.
        """

        # check input dataframe
        X, y = check_X_y(X, y)

        # If required, exclude variables that are not in the input dataframe.
        self._confirm_variables(X)

        # find numerical variables or check variables entered by user
        self.variables_ = _find_or_check_numerical_variables(
            X, self.variables_)

        # check that there are more than 1 variable to select from
        self._check_variable_number()

        # save input features
        self._get_feature_names_in(X)

        # train model with all features and cross-validation
        model = cross_validate(
            self.estimator,
            X[self.variables_],
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=True,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # Initialize a dataframe that will contain the list of the feature/coeff
        # importance for each cross validation fold
        feature_importances_cv = pd.DataFrame()

        # Populate the feature_importances_cv dataframe with one column of
        # feature importance values per model returned by the cross-validation;
        # there are as many columns as folds.
        for i, m in enumerate(model["estimator"]):
            feature_importances_cv[i] = get_feature_importances(m)

        # Add the variables as index to feature_importances_cv
        feature_importances_cv.index = self.variables_

        # Aggregate the feature importance returned in each fold
        self.feature_importances_ = feature_importances_cv.mean(axis=1)

        return X, y
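
This fit returns X and y rather than self, which suggests it is the shared base of the recursive selectors. A hedged usage sketch of one assumed concrete subclass, feature_engine's RecursiveFeatureElimination; the class name and parameters are assumptions.

# hedged sketch: RecursiveFeatureElimination is assumed
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from feature_engine.selection import RecursiveFeatureElimination

rng = np.random.default_rng(1)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=["f0", "f1", "f2"])
y = 2 * X["f0"] + 0.1 * rng.normal(size=100)

sel = RecursiveFeatureElimination(
    estimator=LinearRegression(), scoring="r2", cv=3, threshold=0.01
)
sel.fit(X, y)
print(sel.feature_importances_)  # mean importance across CV folds
print(sel.features_to_drop_)     # likely ['f1', 'f2']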