Example #1
def test_find_numerical_variables(dataframe_vartypes):
    vars_num = ['Age', 'Marks']
    vars_mix = ['Age', 'Marks', 'Name']
    vars_none = None
    assert _find_numerical_variables(dataframe_vartypes, vars_num) == vars_num
    assert _find_numerical_variables(dataframe_vartypes, vars_none) == vars_num
    with pytest.raises(TypeError):
        assert _find_numerical_variables(dataframe_vartypes, vars_mix)
def test_find_numerical_variables(df_vartypes):
    vars_num = ["Age", "Marks"]
    vars_mix = ["Age", "Marks", "Name"]
    vars_none = None

    assert _find_numerical_variables(df_vartypes, vars_num) == vars_num
    assert _find_numerical_variables(df_vartypes, vars_none) == vars_num

    with pytest.raises(TypeError):
        assert _find_numerical_variables(df_vartypes, vars_mix)

    with pytest.raises(ValueError):
        assert _find_numerical_variables(df_vartypes[["Name", "City"]], None)
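
For reference, the helper under test can be approximated as follows. This is a minimal sketch, not feature_engine's actual implementation; it only reproduces the two error types exercised by the assertions above.

import pandas as pd
from pandas.api.types import is_numeric_dtype

def find_numerical_variables(X: pd.DataFrame, variables=None):
    if variables is None:
        # select every numerical column in the dataframe
        variables = X.select_dtypes(include="number").columns.tolist()
        if not variables:
            raise ValueError("No numerical variables found in this dataframe.")
    elif not all(is_numeric_dtype(X[var]) for var in variables):
        raise TypeError("Some of the indicated variables are not numerical.")
    return list(variables)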
Example #3
    def fit(self, X, y=None):
        """
        The `fit` method allows Scikit-learn transformers to learn the required parameters
        from the training data set.

        If the transformer is OneHotEncoder, OrdinalEncoder or SimpleImputer, all the
        variables indicated in the variables parameter will be transformed. When the
        variables parameter is None, the SklearnWrapper will automatically select and
        transform all features in the dataset, numerical or otherwise.

        For all other Scikit-learn transformers, only numerical variables will be
        transformed. The SklearnWrapper will check that the variables indicated in the
        variables parameter are numerical, or alternatively, if variables is None, it
        will automatically select the numerical variables in the data set.
        """

        # check input dataframe
        X = _is_dataframe(X)

        if isinstance(self.transformer, (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
            self.variables = _find_all_variables(X, self.variables)
        else:
            self.variables = _find_numerical_variables(X, self.variables)

        self.transformer.fit(X[self.variables])

        self.input_shape_ = X.shape

        return self
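
The dispatch above can be reproduced standalone. A hedged sketch using only scikit-learn classes; select_variables_for is a hypothetical helper name, and the else branch is simplified (it skips the numerical-type validation the real helper performs).

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

def select_variables_for(transformer, X, variables=None):
    # encoders and SimpleImputer handle any dtype, so all columns qualify;
    # every other transformer is restricted to numerical columns
    if isinstance(transformer, (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
        return list(variables) if variables is not None else list(X.columns)
    if variables is None:
        return X.select_dtypes(include="number").columns.tolist()
    return list(variables)  # a full check would verify these are numerical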
    def fit(self, X, y=None):
        """
        Checks that the variables are numerical.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            The user can pass the entire dataframe, not just the variables to impute.

        y : None
            y is not needed in this imputation. You can pass None or y.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        # create the imputer dictionary
        self.imputer_dict_ = {
            var: self.arbitrary_number
            for var in self.variables
        }

        self.input_shape_ = X.shape

        return self
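
A transform step consistent with this fit replaces missing values per column from imputer_dict_. A minimal sketch on a toy dataframe; the column names and arbitrary number are illustrative only.

import numpy as np
import pandas as pd

X = pd.DataFrame({"Age": [20.0, np.nan, 31.0], "Marks": [0.9, 0.8, np.nan]})
imputer_dict_ = {var: -999 for var in ["Age", "Marks"]}  # arbitrary_number = -999

# fillna accepts a column-to-value mapping, so the whole dictionary applies at once
print(X.fillna(imputer_dict_))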
Example #5
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Fits the transformation to the DataFrame.

        Args:
            X: Pandas DataFrame to fit the transformation

            y: This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.
            Defaults to None. Alternatively takes Pandas Series.

        Returns:
            DataFrame with fitted transformation

        """

        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        return X
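
_check_contains_na can likewise be approximated; again a sketch, not the library's exact code.

import pandas as pd

def check_contains_na(X: pd.DataFrame, variables):
    # raise if any of the indicated columns holds missing values
    if X[variables].isnull().any().any():
        raise ValueError(
            "Some of the variables contain NA. "
            "Impute them before using this transformer."
        )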
    def fit(self, X, y=None):
        """
        Learns the values at the end of the variable distribution.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            The user can pass the entire dataframe, not just the variables that need imputation.

        y : None
            y is not needed in this imputation. You can pass None or y.

        Attributes
        ----------

        imputer_dict_: dictionary
            The dictionary containing the values at the end of the distribution
            per variable. These values will be used by the imputer to replace missing
            data.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        # estimate imputation values
        if self.distribution == 'max':
            self.imputer_dict_ = (X[self.variables].max() *
                                  self.fold).to_dict()

        elif self.distribution == 'gaussian':
            if self.tail == 'right':
                self.imputer_dict_ = (
                    X[self.variables].mean() +
                    self.fold * X[self.variables].std()).to_dict()
            elif self.tail == 'left':
                self.imputer_dict_ = (
                    X[self.variables].mean() -
                    self.fold * X[self.variables].std()).to_dict()

        elif self.distribution == 'skewed':
            IQR = (X[self.variables].quantile(0.75)
                   - X[self.variables].quantile(0.25))
            if self.tail == 'right':
                self.imputer_dict_ = (X[self.variables].quantile(0.75) +
                                      (IQR * self.fold)).to_dict()
            elif self.tail == 'left':
                self.imputer_dict_ = (X[self.variables].quantile(0.25) -
                                      (IQR * self.fold)).to_dict()

        self.input_shape_ = X.shape

        return self
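
On a toy column, the three estimation rules above give concrete values (right tail shown; the fold values are illustrative):

import pandas as pd

s = pd.Series([2.0, 4.0, 6.0, 8.0, 10.0], name="Age")

# distribution='gaussian', tail='right': mean + fold * std
print(s.mean() + 3 * s.std())        # 6.0 + 3 * 3.1623 ≈ 15.49

# distribution='skewed', tail='right': Q3 + fold * IQR
iqr = s.quantile(0.75) - s.quantile(0.25)
print(s.quantile(0.75) + 1.5 * iqr)  # 8.0 + 1.5 * 4.0 = 14.0

# distribution='max': max * fold
print(s.max() * 3)                   # 10.0 * 3 = 30.0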
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Checks that the variables are numerical.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            The user can pass the entire dataframe, not just the variables to impute.

        y : None
            y is not needed in this imputation. You can pass None or y.


        Attributes
        ----------

        imputer_dict_: dictionary
            The dictionary containing the values that will replace each variable.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        if self.imputer_dict:
            self.variables = _find_numerical_variables(
                X, self.imputer_dict.keys())
        else:
            self.variables = _find_numerical_variables(X, self.variables)

        # create the imputer dictionary
        if self.imputer_dict:
            self.imputer_dict_ = self.imputer_dict
        else:
            self.imputer_dict_ = {
                var: self.arbitrary_number
                for var in self.variables
            }

        self.input_shape_ = X.shape

        return self
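
With a user-supplied imputer_dict, the variables come from the dictionary keys and the values are used verbatim. A hedged usage sketch with illustrative column names:

import numpy as np
import pandas as pd

X = pd.DataFrame({"Age": [20.0, np.nan], "Marks": [np.nan, 0.8], "Name": ["a", "b"]})
imputer_dict = {"Age": -1.0, "Marks": 0.0}  # per-variable replacement values

# fit would set variables = ['Age', 'Marks'] and imputer_dict_ = imputer_dict,
# after checking that both columns are numerical
print(X.fillna(imputer_dict))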
Example #8
    def fit(self, X, y=None):
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        return X
    def fit(self, X, y):
        """

        Args
        ----

        X: pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y: array-like of shape (n_samples)
           Target variable. Required to train the estimator.


        Returns
        -------

        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find numerical variables or check variables entered by user
        self.variables = _find_numerical_variables(X, self.variables)

        # list to collect selected features
        self.selected_features_ = []

        self.feature_performance_ = {}

        # train a model for every feature
        for feature in self.variables:
            model = cross_validate(
                self.estimator,
                X[feature].to_frame(),
                y,
                cv=self.cv,
                return_estimator=False,
                scoring=self.scoring,
            )

            if model["test_score"].mean() > self.threshold:
                self.selected_features_.append(feature)

            self.feature_performance_[feature] = model["test_score"].mean()

        self.input_shape_ = X.shape

        return self
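
The per-feature loop can be exercised end to end with plain scikit-learn. A sketch on a built-in dataset; the threshold value is illustrative.

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

X, y = load_diabetes(return_X_y=True, as_frame=True)
threshold, selected, performance = 0.01, [], {}

# one model per feature; keep features whose mean CV score clears the threshold
for feature in X.columns:
    cv_out = cross_validate(LinearRegression(), X[[feature]], y, cv=3, scoring="r2")
    performance[feature] = cv_out["test_score"].mean()
    if performance[feature] > threshold:
        selected.append(feature)

print(selected)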
Example #10
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : None
            y is not needed in this transformer. You can pass y or None.

        Attributes
        ----------

        right_tail_caps_: dictionary
            The dictionary containing the maximum values at which variables
            will be capped.

        left_tail_caps_ : dictionary
            The dictionary containing the minimum values at which variables
            will be capped.
        """
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        if self.max_capping_dict is not None:
            self.right_tail_caps_ = self.max_capping_dict
        else:
            self.right_tail_caps_ = {}

        if self.min_capping_dict is not None:
            self.left_tail_caps_ = self.min_capping_dict
        else:
            self.left_tail_caps_ = {}

        self.input_shape_ = X.shape

        return self
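
At transform time the two dictionaries simply clip each column. A minimal sketch with illustrative cap values:

import pandas as pd

X = pd.DataFrame({"Age": [5, 40, 95]})
right_tail_caps_ = {"Age": 80}  # from max_capping_dict
left_tail_caps_ = {"Age": 10}   # from min_capping_dict

for var, cap in right_tail_caps_.items():
    X[var] = X[var].clip(upper=cap)
for var, cap in left_tail_caps_.items():
    X[var] = X[var].clip(lower=cap)

print(X)  # Age becomes [10, 40, 80]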
Example #11
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        The `fit` method allows Scikit-learn transformers to learn
        the required parameters from the training data set.

        If the transformer is OneHotEncoder, OrdinalEncoder or SimpleImputer,
        all the variables indicated in the ``variables`` parameter will be transformed.
        When the variables parameter is None, the SklearnWrapper will automatically
        select and transform all features in the dataset, numerical or otherwise.

        For all other Scikit-learn transformers, only numerical variables
        will be transformed. The SklearnWrapper will check that the variables
        indicated in the variables parameter are numerical, or alternatively,
        if variables is None, it will automatically select
        the numerical variables in the data set.

        Args:
            X: Pandas DataFrame to fit the transformer
            y: This parameter exists only for compatibility
            with sklearn.pipeline.Pipeline.
            Defaults to None.

        Returns:
            self
        """

        # check input dataframe
        X = _is_dataframe(X)

        if isinstance(self.transformer,
                      (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
            self.variables = _find_all_variables(X, self.variables)

        else:
            self.variables = _find_numerical_variables(X, self.variables)

        self.transformer.fit(X[self.variables])

        self.input_shape_ = X.shape

        return self
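
The effect of this fit can be reproduced directly with scikit-learn. A sketch, not the wrapper itself: a StandardScaler is neither an encoder nor a SimpleImputer, so it only ever sees the numerical columns.

import pandas as pd
from sklearn.preprocessing import StandardScaler

X = pd.DataFrame(
    {"Age": [20, 30, 40], "Marks": [0.9, 0.8, 0.7], "Name": ["a", "b", "c"]}
)

# only numerical columns are selected for a non-encoder, non-imputer transformer
variables = X.select_dtypes(include="number").columns.tolist()  # ['Age', 'Marks']
scaler = StandardScaler().fit(X[variables])
print(scaler.mean_)  # one mean per selected column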
    def fit(self, X, y=None):
        """
        Learns the mean or median values.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            The user can pass the entire dataframe, not just the variables that need imputation.

        y : pandas series or None, default=None
            y is not needed in this imputation. You can pass None or y.

        Attributes
        ----------

        imputer_dict_: dictionary
            The dictionary containing the mean / median values per variable. These
            values will be used by the imputer to replace missing data.
            The imputer_dict_ is created when fitting the imputer.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        # find imputation parameters: mean or median
        if self.imputation_method == 'mean':
            self.imputer_dict_ = X[self.variables].mean().to_dict()

        elif self.imputation_method == 'median':
            self.imputer_dict_ = X[self.variables].median().to_dict()

        self.input_shape_ = X.shape

        return self
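
A worked example of the median branch on a toy dataframe; pandas skips NaN when computing the statistic, and the same dictionary drives the transform step:

import numpy as np
import pandas as pd

X = pd.DataFrame({"Age": [20.0, np.nan, 40.0], "Marks": [0.9, 0.8, np.nan]})

# imputation_method='median': one value per variable
imputer_dict_ = X[["Age", "Marks"]].median().to_dict()
print(imputer_dict_)            # {'Age': 30.0, 'Marks': 0.85}
print(X.fillna(imputer_dict_))  # transform step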
    def fit(self, X, y):
        """

        Args
        ----

        X: pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y: array-like of shape (n_samples)
           Target variable. Required to train the estimator.


        Returns
        -------

        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find numerical variables or check variables entered by user
        self.variables = _find_numerical_variables(X, self.variables)

        # train model with all features and cross-validation
        model = cross_validate(
            self.estimator,
            X[self.variables],
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=True,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # Initialize a dataframe that will contain the list of the feature/coeff
        # importance for each cross validation fold
        feature_importances_cv = pd.DataFrame()

        # Populate the feature_importances_cv dataframe with columns containing
        # the feature importance values for each model returned by the cross
        # validation.
        # There are as many columns as folds.
        for m in model["estimator"]:

            feature_importances_cv[m] = get_feature_importances(m)

        # Add the variables as index to feature_importances_cv
        feature_importances_cv.index = self.variables

        # Aggregate the feature importance returned in each fold
        self.feature_importances_ = feature_importances_cv.mean(axis=1)

        # Sort the feature importance values
        self.feature_importances_.sort_values(ascending=True, inplace=True)

        # list to collect selected features
        self.selected_features_ = []

        # temporary copy where we will remove features recursively
        X_tmp = X[self.variables].copy()

        # we need to update the performance as we remove features
        baseline_model_performance = self.initial_model_performance_

        # dict to collect features and their performance_drift after shuffling
        self.performance_drifts_ = {}

        # evaluate every feature, starting from the least important
        # remember that feature_importances_ is ordered already
        for feature in list(self.feature_importances_.index):

            # remove feature and train new model
            model_tmp = cross_validate(
                self.estimator,
                X_tmp.drop(columns=feature),
                y,
                cv=self.cv,
                scoring=self.scoring,
                return_estimator=False,
            )

            # assign new model performance
            model_tmp_performance = model_tmp["test_score"].mean()

            # Calculate performance drift
            performance_drift = baseline_model_performance - model_tmp_performance

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift

            if performance_drift > self.threshold:

                self.selected_features_.append(feature)

            else:
                # remove feature and adjust initial performance
                X_tmp = X_tmp.drop(columns=feature)

                baseline_model = cross_validate(
                    self.estimator,
                    X_tmp,
                    y,
                    cv=self.cv,
                    return_estimator=False,
                    scoring=self.scoring,
                )

                # update the baseline performance now that the feature is removed
                baseline_model_performance = baseline_model["test_score"].mean()

        self.input_shape_ = X.shape

        return self
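
Stripped of the cross-validation, the selection rule is a comparison of each drift against the threshold. A toy sketch with made-up drift values; note it omits the baseline re-fit that the code above performs after each removal.

# toy drifts: performance lost when each feature is removed, least important first
performance_drifts = {"f3": 0.001, "f2": 0.004, "f1": 0.09}
threshold = 0.01

# a feature is kept only if removing it costs more than the threshold
selected = [f for f, drift in performance_drifts.items() if drift > threshold]
print(selected)  # ['f1']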
    def fit(self, X, y=None):
        """
        Finds the correlated features.

        Args:
            X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the variables to transform.

            y: It is not needed in this transformer. Defaults to None.
            Alternatively takes Pandas Series.

        Returns:
            self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all numerical variables or check those entered are in the dataframe
        self.variables = _find_numerical_variables(X, self.variables)

        # set to collect features that are correlated
        self.correlated_features_ = set()

        # create tuples of correlated feature groups
        self.correlated_feature_sets_ = []

        # the correlation matrix
        self.correlated_matrix_ = X[self.variables].corr(method=self.method)

        # create set of examined features, helps to determine feature combinations
        # to evaluate below
        _examined_features = set()

        # for each feature in the dataset (columns of the correlation matrix)
        for feature in self.correlated_matrix_.columns:

            if feature not in _examined_features:

                # append so we can exclude when we create the combinations
                _examined_features.add(feature)

                # here we collect potentially correlated features
                # we need this for the correlated groups sets
                _temp_set = set([feature])

                # features that have not yet been examined and
                # were not already found to be correlated
                _features_to_compare = [
                    f for f in self.correlated_matrix_.columns
                    if f not in _examined_features
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    # if the correlation is higher than the threshold
                    # we are interested in absolute correlation coefficient value
                    if abs(self.correlated_matrix_.loc[
                            f2, feature]) > self.threshold:

                        # add feature (f2) to our correlated set
                        self.correlated_features_.add(f2)
                        _temp_set.add(f2)
                        _examined_features.add(f2)

                # if there are correlated features
                if len(_temp_set) > 1:
                    self.correlated_feature_sets_.append(_temp_set)

        self.input_shape_ = X.shape

        return self
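
The grouping logic reduces to thresholding the absolute correlation matrix. A toy sketch with synthetic data:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
a = rng.normal(size=200)
X = pd.DataFrame({
    "a": a,
    "b": a + rng.normal(scale=0.01, size=200),  # near-duplicate of 'a'
    "c": rng.normal(size=200),                  # independent
})

corr = X.corr(method="pearson")
threshold = 0.8
print(abs(corr.loc["b", "a"]) > threshold)  # True: 'b' joins the group of 'a'
print(abs(corr.loc["c", "a"]) > threshold)  # False: 'c' stays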
Example #15
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learns the values that should be used to replace outliers.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : None
            y is not needed in this transformer. You can pass y or None.

        Attributes
        ----------

        right_tail_caps_: dictionary
            The dictionary containing the maximum values at which variables
            will be capped.

        left_tail_caps_ : dictionary
            The dictionary containing the minimum values at which variables
            will be capped.
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        self.right_tail_caps_ = {}
        self.left_tail_caps_ = {}

        # estimate the end values
        if self.tail in ["right", "both"]:
            if self.capping_method == "gaussian":
                self.right_tail_caps_ = (
                    X[self.variables].mean() +
                    self.fold * X[self.variables].std()).to_dict()

            elif self.capping_method == "iqr":
                IQR = (X[self.variables].quantile(0.75)
                       - X[self.variables].quantile(0.25))
                self.right_tail_caps_ = (X[self.variables].quantile(0.75) +
                                         (IQR * self.fold)).to_dict()

            elif self.capping_method == "quantiles":
                self.right_tail_caps_ = (
                    X[self.variables].quantile(1 - self.fold).to_dict())

        if self.tail in ["left", "both"]:
            if self.capping_method == "gaussian":
                self.left_tail_caps_ = (
                    X[self.variables].mean() -
                    self.fold * X[self.variables].std()).to_dict()

            elif self.capping_method == "iqr":
                IQR = (X[self.variables].quantile(0.75)
                       - X[self.variables].quantile(0.25))
                self.left_tail_caps_ = (X[self.variables].quantile(0.25) -
                                        (IQR * self.fold)).to_dict()

            elif self.capping_method == "quantiles":
                self.left_tail_caps_ = X[self.variables].quantile(
                    self.fold).to_dict()

        self.input_shape_ = X.shape

        return self
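
For capping_method='quantiles' with tail='both', fold is interpreted as the quantile itself. A worked toy example:

import pandas as pd

s = pd.Series(range(1, 101), dtype=float, name="Age")  # values 1..100
fold = 0.05

right_cap = s.quantile(1 - fold)  # 95.05
left_cap = s.quantile(fold)       # 5.95
print(s.clip(lower=left_cap, upper=right_cap).agg(["min", "max"]))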
Example #16
    def fit(self, X, y):
        """

        Args
        ----

        X: pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y: array-like of shape (n_samples)
           Target variable. Required to train the estimator.


        Returns
        -------

        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # reset the index
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        # find numerical variables or check variables entered by user
        self.variables = _find_numerical_variables(X, self.variables)

        # train model with all features and cross-validation
        model = cross_validate(
            self.estimator,
            X,
            y,
            cv=self.cv,
            return_estimator=True,
            scoring=self.scoring,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # get performance metric
        scorer = get_scorer(self.scoring)

        # dict to collect features and their performance_drift after shuffling
        self.performance_drifts_ = {}

        # list to collect selected features
        self.selected_features_ = []

        # shuffle features and save feature performance drift into a dict
        for feature in self.variables:

            X_shuffled = X.copy()

            # shuffle individual feature
            X_shuffled[feature] = (X_shuffled[feature].sample(
                frac=1).reset_index(drop=True))

            # determine the performance with the shuffled feature
            performance = np.mean(
                [scorer(m, X_shuffled, y) for m in model["estimator"]])

            # determine drift in performance
            # Note: sklearn negates the log and error scores, so there is no
            # need to invert them manually. See:
            # https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules
            performance_drift = self.initial_model_performance_ - performance

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift

        # select features
        for feature in self.performance_drifts_.keys():

            if self.performance_drifts_[feature] > self.threshold:

                self.selected_features_.append(feature)

        self.input_shape_ = X.shape

        return self
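
The shuffling loop can be reproduced with plain scikit-learn for a single feature. A sketch on a built-in dataset; 'bmi' is one of its columns, and random_state is added only for reproducibility.

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_validate

X, y = load_diabetes(return_X_y=True, as_frame=True)
model = cross_validate(
    LinearRegression(), X, y, cv=3, scoring="r2", return_estimator=True
)
baseline = model["test_score"].mean()
scorer = get_scorer("r2")

# shuffle one feature and measure the drop in performance across the fold models
X_shuffled = X.copy()
X_shuffled["bmi"] = (
    X_shuffled["bmi"].sample(frac=1, random_state=0).reset_index(drop=True)
)
shuffled = np.mean([scorer(m, X_shuffled, y) for m in model["estimator"]])
print(baseline - shuffled)  # positive drift -> 'bmi' is informative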