示例#1
0
    def _fit_model(self, model_type, regressor, X, y):
        """Fit a regressor with the configured model library.

        Predictors are passed through `_one_hot_encode` and the resulting
        column names are stored on `self.new_X_columns`. When `model_type`
        is "logistic", the response is replaced by its integer category codes
        and the original categories are kept on `self._response_categories`.

        Args:
            model_type (str): kind of model being fit. Only "logistic"
                changes behavior here (the response gets encoded).
            regressor (callable): model class or factory. Called as
                `regressor(y, X, **kwargs)` for statsmodels and as
                `regressor(**kwargs)` for sklearn.
            X (pd.DataFrame): predictors. Assumed to be free of missing
                values, since imputation is expected to happen beforehand.
            y (pd.Series): response variable.

        Returns:
            The fitted model: the object returned by `.fit()` for
            statsmodels, or the fitted estimator instance for sklearn.

        Raises:
            ValueError: if `self.model_lib` is neither "statsmodels" nor
                "sklearn".
        """

        # encoding for predictor variables; remember the encoded layout
        X = _one_hot_encode(X)
        self.new_X_columns = X.columns.tolist()

        # encoding for response variable (logistic case only)
        if model_type == "logistic":
            ycat = y.astype("category").cat
            y = ycat.codes
            self._response_categories = ycat.categories

        # a missing / empty kwargs setting means "call with defaults"
        kwgs = self.model_kwgs or {}

        # statsmodels: data goes into the constructor, response first, and
        # `fit` returns the object we want to keep
        if self.model_lib == "statsmodels":
            X = add_constant(X)
            return regressor(y, X, **kwgs).fit()

        # sklearn: data goes into `fit`, and the estimator itself is kept
        if self.model_lib == "sklearn":
            model = regressor(**kwgs)
            model.fit(X, y)
            return model

        # fail loudly instead of hitting an unbound local at return time
        err = (
            f"Unsupported model_lib '{self.model_lib}'. "
            "Expected 'statsmodels' or 'sklearn'."
        )
        raise ValueError(err)
示例#2
0
    def _predict_strategy_validator(self, instance, X):
        """Check prediction data against a fitted instance, then encode it.

        Args:
            instance: fitted object; must pass `check_is_fitted` for
                `statistics_` and expose `fit_X_columns`.
            X (pd.DataFrame): data on which predictions will be made.

        Returns:
            The result of `_one_hot_encode(X)`.

        Raises:
            ValueError: if a column used in fit is absent from X, or if X
                contains any missing values.
        """

        # the instance must be fitted before anything else is checked
        check_is_fitted(instance, "statistics_")

        # every column seen during fit has to be present at predict time
        absent = set(instance.fit_X_columns).difference(X.columns.tolist())
        if absent:
            raise ValueError(
                "Same columns that were fit must appear in predict."
            )

        # predictors must already be imputed; refuse any missingness
        has_missing = X.isnull().sum().any()
        if has_missing:
            raise ValueError(
                "Data passed to make predictions can't contain missingness."
            )

        return _one_hot_encode(X)
示例#3
0
    def fit(self, X, y=None, imp_ixs=None):
        """Fit specified imputation methods to each column within a DataFrame.

        The fit method calculates the `statistics` necessary to later
        transform a dataset (i.e. perform actual imputations). Inductive
        methods calculate statistic on the fit data, then impute new missing
        data with that value. Most currently supported methods are inductive.

        It's important to note that we have to fit X regardless of whether any
        data is missing. Transform step may have missing data if new data is
        used, so fit each column that appears in the given strategies.

        When `imp_ixs` is given, the flagged entries of a predictively
        imputed column are treated as missing while fitting. The masking is
        done on a copy of that column, so this step does not write NaN
        into the caller's DataFrame.

        Args:
            X (pd.DataFrame): pandas DataFrame on which imputer is fit.
            y (pd.Series, pd.DataFrame Optional): response. Default is None.
                Determined internally in fit method. Arg is present to remain
                compatible with sklearn Pipelines.
            imp_ixs (dict): Dictionary of lists of index labels that indicate
                which data elements to impute per column or None to identify
                from missing elements per column

        Returns:
            self: instance of the SingleImputer class.

        Raises:
            ValueError: error in specification of strategies. Raised through
                `check_strategy_fit`. See its docstrings for more info.
            ValueError: error in specification of predictors. Raised through
                `check_predictors_fit`. See its docstrings for more info.
            ValueError: an imputer could not be constructed from the
                arguments supplied for it.
        """

        # first, prep columns we plan to use and make sure they are valid
        self._fit_strategy_validator(X)
        self.statistics_ = {}

        # perform fit on each column, depending on that column's strategy
        # note that right now, operations are COLUMN-by-COLUMN, iteratively
        if self.seed is not None:
            np.random.seed(self.seed)
        for column, method in self._strats.items():
            imp = self.strategies[method]
            imp_params = self._fit_init_params(column, method, self.imp_kwgs)

            # try to create an instance of the imputer, given the args
            try:
                if imp_params is None:
                    imputer = imp()
                else:
                    imputer = imp(**imp_params)
            except TypeError as te:
                name = imp.__name__
                err = f"Invalid arguments passed to {name} __init__ method."
                raise ValueError(err) from te

            # identify the column for imputation
            ys = X[column]

            # the fit depends on what type of strategy we use.
            # first, fit univariate methods, which are straightforward.
            if method in self.univariate_strategies:
                imputer.fit(ys, None)

            # now, fit on predictive methods, which are more complex.
            if method in self.predictive_strategies:
                preds = self._preds[column]
                if preds == "all":
                    xs = X.drop(column, axis=1)
                else:
                    xs = X[preds]

                if imp_ixs is not None:
                    # mask a private copy: `X[column]` may share memory with
                    # X, and fitting must not alter the caller's data.
                    # `.loc` keeps the indices label-based, as in transform.
                    ys = ys.copy()
                    ys.loc[imp_ixs[column]] = np.nan

                # fit the data on observed values only.
                x_, y_ = _get_observed(xs, ys)

                # before imputing, need to encode categoricals
                x_ = _one_hot_encode(x_)

                imputer.fit(x_, y_)

            # finally, store imputer for each column as statistics
            self.statistics_[column] = imputer
        return self
示例#4
0
    def transform(self, X, imp_ixs=None):
        """Apply the fitted imputers to a DataFrame, column by column.

        For each column held in `statistics_`, the rows to impute are either
        the rows where that column is null or, when `imp_ixs` is supplied,
        the index labels listed for that column. Univariate imputers are
        handed the column itself. Predictive imputers are handed the
        predictor columns restricted to the rows being imputed; missing
        predictor values are filled first (with `DefaultUnivarImputer`, or
        with 0 when a predictor is missing in every such row) and the result
        is one-hot encoded. The imputed labels per column are recorded on
        `self.imputed_`.

        Args:
            X (pd.DataFrame): DataFrame to impute (same as fit or new data).
            imp_ixs (dict): maps each column to the list of index labels to
                impute. None means: impute wherever the column is null.

        Returns:
            X (pd.DataFrame): a copy with imputations when `self.copy` is
                truthy, otherwise the DataFrame passed in, imputed in place.

        Raises:
            ValueError: same columns must appear in fit and transform.
                Raised through _transform_strategy_validator.
        """

        # work on a copy when requested, then validate the incoming data
        if self.copy:
            X = X.copy()
        self._transform_strategy_validator(X)

        self.imputed_ = {}
        if self.seed is not None:
            np.random.seed(self.seed)

        for column, imputer in self.statistics_.items():
            # decide which rows of this column get imputed
            if imp_ixs is not None:
                imp_ix = pd.Index(imp_ixs[column])
            else:
                target = X[column]
                imp_ix = target[target.isnull()].index
            self.imputed_[column] = imp_ix.tolist()

            # nothing to do for this column
            if imp_ix.empty:
                continue

            # univariate imputers work off the column itself
            if imputer.strategy in self.univariate_strategies:
                x_ = X[column]

            # predictive imputers work off the predictor columns
            if imputer.strategy in self.predictive_strategies:
                preds = self._preds[column]
                x_ = X.drop(column, axis=1) if preds == "all" else X[preds]

                # keep only the rows being imputed, always as a DataFrame
                if isinstance(x_, pd.Series):
                    x_ = x_.to_frame().loc[imp_ix]
                else:
                    x_ = x_.loc[imp_ix, :]

                # predictors with gaps get a default univariate fill first;
                # looping over an empty index is a no-op when none have gaps
                n_rows = x_.shape[0]
                gaps = pd.isnull(x_).sum()
                gaps = gaps[gaps > 0]
                for col in gaps.index:
                    d = DefaultUnivarImputer()
                    if gaps[col] == n_rows:
                        # no observed value to learn from: fall back to 0
                        fill = 0
                    else:
                        fill = d.fit_impute(x_[col], None)
                    null_rows = x_[col][x_[col].isnull()].index
                    x_.loc[null_rows, col] = fill

                # encode categoricals the same way as before prediction
                x_ = _one_hot_encode(x_)

            # write the imputations back into the selected rows
            X.loc[imp_ix, column] = imputer.impute(x_)
        return X
示例#5
0
    def transform(self, X):
        """Apply the fitted imputers to a DataFrame, column by column.

        For each column held in `statistics_`, the rows where that column is
        null are imputed. Univariate imputers are handed the column itself.
        Predictive imputers are handed the predictor columns restricted to
        the rows being imputed; missing predictor values are filled first
        with `DefaultUnivarImputer` and the result is one-hot encoded. The
        imputed labels per column are recorded on `self.imputed_`. Progress
        is printed to the console when `self.verbose` is truthy.

        Args:
            X (pd.DataFrame): DataFrame to impute (same as fit or new data).

        Returns:
            X (pd.DataFrame): a copy with imputations when `self.copy` is
                truthy, otherwise the DataFrame passed in, imputed in place.

        Raises:
            ValueError: same columns must appear in fit and transform.
                Raised through _transform_strategy_validator.
        """

        # work on a copy when requested, then validate the incoming data
        if self.copy:
            X = X.copy()
        self._transform_strategy_validator(X)
        if self.verbose:
            trans = "PERFORMING IMPUTATIONS ON DATA BASED ON FIT..."
            print(f"{trans}\n{'-'*len(trans)}")

        self.imputed_ = {}
        if self.seed is not None:
            np.random.seed(self.seed)

        for column, imputer in self.statistics_.items():
            # rows of this column that are currently missing
            target = X[column]
            imp_ix = target[target.isnull()].index
            self.imputed_[column] = imp_ix.tolist()
            nothing_to_impute = imp_ix.empty

            # report progress for this column when verbose
            if self.verbose:
                strat = imputer.statistics_["strategy"]
                print(f"Transforming {column} with strategy '{strat}'")
                if nothing_to_impute:
                    print("No imputations, moving to next column...")
                else:
                    print(f"Numer of imputations to perform: {imp_ix.size}")

            if nothing_to_impute:
                continue

            # univariate imputers work off the column itself
            if imputer.strategy in self.univariate_strategies:
                x_ = X[column]

            # predictive imputers work off the predictor columns
            if imputer.strategy in self.predictive_strategies:
                preds = self._preds[column]
                x_ = X.drop(column, axis=1) if preds == "all" else X[preds]

                # keep only the rows being imputed, always as a DataFrame
                if isinstance(x_, pd.Series):
                    x_ = x_.to_frame().loc[imp_ix]
                else:
                    x_ = x_.loc[imp_ix, :]

                # predictors with gaps get a default univariate fill first
                gaps = pd.isnull(x_).sum()
                gaps = gaps[gaps > 0]
                if not gaps.empty:
                    if self.verbose:
                        print(f"Missing Covariates:\n{gaps}\n")
                        print("Using single imputer for missing covariates...")
                    for col in gaps.index:
                        filler = DefaultUnivarImputer()
                        fill = filler.fit_impute(x_[col], None)
                        null_rows = x_[col][x_[col].isnull()].index
                        x_.loc[null_rows, col] = fill

                # encode categoricals the same way as before prediction
                x_ = _one_hot_encode(x_)

            # write the imputations back into the missing rows
            X.loc[imp_ix, column] = imputer.impute(x_)
        return X