def test_bayesian_logistic_imputer():
    """Test bayesian works for binary column of PredictiveImputer."""
    imp_b = MiceImputer(
        strategy={"y": "bayesian binary logistic"},
        imp_kwgs={"y": {"fill_value": "random"}}
    )
    imp_b.fit_transform(dfs.df_bayes_log)
def test_stochastic_predictive_imputer():
    """Test stochastic works for numerical columns of PredictiveImputer."""
    # generate linear, then stochastic
    imp_p = MiceImputer(strategy={"A": "least squares"})
    imp_s = MiceImputer(strategy={"A": "stochastic"})
    # make sure both work
    _ = imp_p.fit_transform(dfs.df_num)
    _ = imp_s.fit_transform(dfs.df_num)
    assert imp_p.imputed_["A"] == imp_s.imputed_["A"]
def test_bayesian_reg_imputer():
    """Test bayesian works for numerical column of PredictiveImputer."""
    # test designed first - test kwargs and params
    imp_b = MiceImputer(
        strategy={"y": "bayesian least squares"},
        imp_kwgs={"y": {"fill_value": "random", "am": 11, "cores": 2}}
    )
    imp_b.fit_transform(dfs.df_bayes_reg)
    # test on numerical in general
    imp_n = MiceImputer(strategy="bayesian least squares")
    imp_n.fit_transform(dfs.df_num)
def test_pmm_lrd_imputer():
    """Test pmm and lrd work for numerical column of PredictiveImputer."""
    # test pmm first - test kwargs and params
    imp_pmm = MiceImputer(
        strategy={"y": "pmm"},
        imp_kwgs={"y": {"fill_value": "random", "copy_x": False}}
    )
    imp_pmm.fit_transform(dfs.df_bayes_reg)
    # test lrd second - test kwargs and params
    imp_lrd = MiceImputer(
        strategy={"y": "lrd"},
        imp_kwgs={"y": {"fill_value": "random", "copy_x": False}}
    )
    imp_lrd.fit_transform(dfs.df_bayes_reg)
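# A minimal end-to-end sketch of the MiceImputer API exercised by the tests
# above (illustrative only; the toy frame below is an assumption, not one of
# the shared `dfs` fixtures). With return_list=True, fit_transform yields
# (imputation index, completed dataframe) pairs, each free of missingness.
def test_mice_imputer_returns_complete_datasets():
    import numpy as np
    import pandas as pd
    toy = pd.DataFrame({
        "x": np.arange(10, dtype=float),
        "y": [1.0, 2.0, np.nan, 4.0, 5.0, np.nan, 7.0, 8.0, 9.0, np.nan],
    })
    imp = MiceImputer(n=3, strategy={"y": "stochastic"}, return_list=True)
    for _, completed in imp.fit_transform(toy):
        assert not completed.isnull().any().any()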
class MiBaseRegressor:
    """Building blocks to create an Autoimpute regressor.

    Every Autoimpute regressor inherits from the MiBaseRegressor. The class
    provides the functionality necessary for Autoimpute regressors to wrap
    sklearn or statsmodels libraries and apply them to multiply imputed
    datasets. It also creates the MiceImputer used to impute data if the
    user does not specify a custom MiceImputer during instantiation.

    Attributes:
        model_libs (tuple): libraries supported by Autoimpute regressors.
    """

    model_libs = ("sklearn", "statsmodels")

    def __init__(self, mi, model_lib, mi_kwgs, model_kwgs):
        """Create an instance of the MiBaseRegressor class.

        The MiBaseRegressor class is not a stand-alone class and should not
        be used other than as a parent class to an Autoimpute regressor. An
        Autoimpute regressor wraps either sklearn or statsmodels regressors
        to apply them on multiply imputed datasets. The MiBaseRegressor
        contains the logic Autoimpute regressors share. In addition, it
        creates an instance of the MiceImputer to impute missing data.

        Args:
            mi (MiceImputer): An instance of a MiceImputer. If `mi` is
                passed explicitly, it is used for multiple imputation.
                Can use `mi_kwgs` instead, although `mi` is cleaner/preferred.
            model_lib (str): library the regressor will use to implement
                regression. Options are sklearn and statsmodels.
                Default is statsmodels.
            mi_kwgs (dict): keyword args to instantiate MiceImputer. If a
                valid MiceImputer is passed to `mi`, `mi_kwgs` is ignored.
                If `mi_kwgs` is None and `mi` is None, MiBaseRegressor
                creates a default instance of MiceImputer.
            model_kwgs (dict): keyword args to instantiate regressor. Arg is
                passed along to either the sklearn or statsmodels regressor.
                If `model_kwgs` is None, a default instance of the regressor
                is created.

        Returns:
            self. Instance of MiBaseRegressor class.
        """
        # Order important. `mi_kwgs` validation first b/c it's used in `mi`.
        # Also note - encoder is not an argument, b/c one-hot only right now.
        self.mi_kwgs = mi_kwgs
        self.mi = mi
        self.model_kwgs = model_kwgs
        self.model_lib = model_lib

    @property
    def mi_kwgs(self):
        """Property getter to return the value of mi_kwgs."""
        return self._mi_kwgs

    @mi_kwgs.setter
    def mi_kwgs(self, kwgs):
        """Validate the mi_kwgs and set default properties.

        The MiBaseRegressor validates the mi_kwgs argument. mi_kwgs contains
        optional keyword arguments to create a MiceImputer. The argument is
        optional, and its default is None.

        Args:
            kwgs (dict, None): None or dictionary of keywords.

        Raises:
            ValueError: mi_kwgs not correctly specified as argument.
        """
        if not isinstance(kwgs, (type(None), dict)):
            err = "mi_kwgs must be None or dict of args for MiceImputer."
            raise ValueError(err)
        self._mi_kwgs = kwgs

    @property
    def mi(self):
        """Property getter to return the value of mi."""
        return self._mi

    @mi.setter
    def mi(self, m):
        """Validate mi and set default properties.

        The MiBaseRegressor validates the mi argument. mi must be a valid
        instance of a MiceImputer. It can also be None. If None, the
        MiBaseRegressor will create a MiceImputer on its own, either by
        default or with any key values passed to the mi_kwgs args dict.

        Args:
            m (MiceImputer, None): Instance of a MiceImputer.

        Raises:
            ValueError: mi is not an instance of a MiceImputer.
        """
        # check if m is None or a MiceImputer
        if not isinstance(m, (type(None), MiceImputer)):
            err = f"{m} must be None or a valid instance of MiceImputer."
            raise ValueError(err)

        # handle each case, whether None or MiceImputer
        if m is not None:
            self._mi = m
        else:
            # handle whether or not mi_kwgs should be passed
            if self.mi_kwgs:
                self._mi = MiceImputer(**self.mi_kwgs)
            else:
                self._mi = MiceImputer()

    @property
    def model_kwgs(self):
        """Property getter to return the value of model_kwgs."""
        return self._model_kwgs

    @model_kwgs.setter
    def model_kwgs(self, kwgs):
        """Validate the model_kwgs and set default properties.

        The MiBaseRegressor validates the model_kwgs argument. model_kwgs
        contains optional keyword arguments passed to a regression model.
        The argument is optional, and its default is None.

        Args:
            kwgs (dict, None): None or dictionary of keywords.

        Raises:
            ValueError: model_kwgs not correctly specified as argument.
        """
        if not isinstance(kwgs, (type(None), dict)):
            err = "model_kwgs must be dict of args used to instantiate model."
            raise ValueError(err)
        self._model_kwgs = kwgs

    @property
    def model_lib(self):
        """Property getter to return the value of model_lib."""
        return self._model_lib

    @model_lib.setter
    def model_lib(self, lib):
        """Validate model_lib and set default properties.

        The MiBaseRegressor validates the model_lib argument. model_lib
        should be in the MiBaseRegressor.model_libs tuple, which contains
        the libs to use for regression of multiply imputed datasets. The
        library chosen is important. Only statsmodels (the default) provides
        proper parameter pooling using Rubin's rules. sklearn provides mean
        estimate pooling only. sklearn variance parameter pooling and
        diagnostics are in dev, TBD.

        Args:
            lib (str): library to use.

        Raises:
            ValueError: lib not a valid library to use.
        """
        if lib not in self.model_libs:
            err = f"{lib} not valid `model_lib`. Must be {self.model_libs}."
            raise ValueError(err)
        self._model_lib = lib

    def _fit_strategy_validator(self, X, y):
        """Private method to validate data before fitting model."""
        # y must be a series or dataframe
        if not isinstance(y, (pd.Series, pd.DataFrame)):
            err = "y must be a Series or DataFrame"
            raise ValueError(err)

        # y must have a name if series.
        if isinstance(y, pd.Series):
            self._yn = y.name
            if self._yn is None:
                err = "series y must have a name"
                raise ValueError(err)

        # y must have one column if dataframe.
        if isinstance(y, pd.DataFrame):
            yc = y.shape[1]
            if yc != 1:
                err = "y should only have one column"
                raise ValueError(err)
            y = y.iloc[:, 0]
            self._yn = y.name

        # y and X must have the same number of rows
        if X.shape[0] != y.shape[0]:
            err = "y and X must have the same number of records"
            raise ValueError(err)

        # if no errors thus far, add y to X for imputation
        X[self._yn] = y

        # return the multiply imputed datasets
        return self.mi.fit_transform(X)

    def _fit_model(self, model_type, regressor, X, y):
        """Private method to fit a model using sklearn or statsmodels."""
        # encoding for predictor variables
        # we enforce that predictors were imputed in imputation phase.
        X = _one_hot_encode(X)
        self.new_X_columns = X.columns.tolist()

        # encoding for response variable
        if model_type == "logistic":
            ycat = y.astype("category").cat
            y = ycat.codes
            self._response_categories = ycat.categories

        # statsmodels fit case, which requires different logic than sklearn
        if self.model_lib == "statsmodels":
            X = add_constant(X)
            if self.model_kwgs:
                model = regressor(y, X, **self.model_kwgs)
            else:
                model = regressor(y, X)
            model = model.fit()

        # sklearn fit case, which requires different logic than statsmodels
        if self.model_lib == "sklearn":
            if self.model_kwgs:
                model = regressor(**self.model_kwgs)
            else:
                model = regressor()
            # sklearn doesn't need encoding for response
            model.fit(X, y)

        # return the model after fitting it to a given dataset
        return model

    def _apply_models_to_mi_data(self, model_dict, X, y):
        """Private method to apply analysis model to multiply imputed data."""
        # find regressor based on model lib, then get multiply imputed data
        model_type = model_dict["type"]
        regressor = model_dict[self.model_lib]
        mi_data = self._fit_strategy_validator(X, y)
        models = {}

        # then perform analysis models. Sequential only right now.
        for dataset in mi_data:
            ind, X = dataset
            y = X.pop(self._yn)
            model = self._fit_model(model_type, regressor, X, y)
            models[ind] = model

        # returns a dictionary: k=imp #; v=analysis model applied to imp #
        return models

    def _predict_strategy_validator(self, instance, X):
        """Private method to validate before prediction."""
        # first check that model is fitted, then check columns are the same
        check_is_fitted(instance, "statistics_")
        X_cols = X.columns.tolist()
        fit_cols = set(instance.fit_X_columns)
        diff_fit = set(fit_cols).difference(X_cols)
        if diff_fit:
            err = "Same columns that were fit must appear in predict."
            raise ValueError(err)

        # encoding for predictor variables
        # we enforce that predictors were imputed in imputation phase.
        if X.isnull().sum().any():
            me = "Data passed to make predictions can't contain missingness."
            raise ValueError(me)
        X = _one_hot_encode(X)
        return X

    def _var_ratios(self, imps, num, denom):
        """Private method for the variance ratios."""
        return (num + (num / imps)) / denom

    def _degrees_freedom(self, imps, lambda_, v_com):
        """Private method to calculate degrees of freedom for estimates."""
        # note we nudge lambda if zero b/c we need lambda for other stats
        # see source code barnard.rubin.R from MICE for more
        lambda_ = np.maximum(1e-04, lambda_)
        v_old = (imps - 1) / lambda_**2
        v_obs = ((v_com + 1) / (v_com + 3)) * v_com * (1 - lambda_)
        v = (v_old * v_obs) / (v_old + v_obs)
        return v

    def _get_stats_from_models(self, models):
        """Private method to generate statistics given the model lib chosen."""
        # initial setup - get items from models and get number of models
        items = models.items()
        m = self.mi.n

        # pooling phase: sklearn - coefficients only, no variance
        if self.model_lib == "sklearn":
            # find basic parameters, but can't return much more than coeffs
            # sklearn does not implement inference out of the box
            # will have to write methods to do so from scratch, so TBD
            self.mi_alphas_ = [j.intercept_ for i, j in items]
            self.mi_params_ = [j.coef_ for i, j in items]
            alpha = sum(self.mi_alphas_) / m
            params = sum(self.mi_params_) / m
            coefs = pd.Series(np.insert(params, 0, alpha))
            coefs.index = ["const"] + self.new_X_columns
            statistics = OrderedDict(coefs=coefs)

        # pooling phase: statsmodels - coefficients and variance possible
        if self.model_lib == "statsmodels":
            # data and model parameters
            self.mi_params_ = [j.params for i, j in items]
            self.mi_std_errors_ = [j.bse for i, j in items]
            coefs = sum(self.mi_params_) / m
            k = coefs.index.size
            n = list(items)[0][1].nobs
            df_com = n - k

            # variance metrics (See VB Ch 2.3)
            vw = sum(map(lambda x: x**2, self.mi_std_errors_)) / m
            vb = sum(map(lambda p: (p - coefs)**2, self.mi_params_)) / max(1, m - 1)
            vt = vw + vb + (vb / m)
            stdt = np.sqrt(vt)

            # variance ratios (See VB Ch 2.3)
            # efficiency as specified in stats manual
            lambda_ = self._var_ratios(m, vb, vt)
            r_ = self._var_ratios(m, vb, vw)
            v_ = self._degrees_freedom(m, lambda_, df_com)
            fmi_ = ((v_ + 1) / (v_ + 3)) * lambda_ + 2 / (v_ + 3)
            eff_ = (1 + (np.maximum(1e-04, fmi_) / m))**-1

            # create statistics with pooled metrics from above
            statistics = OrderedDict(
                coefs=coefs, std=stdt, vw=vw, vb=vb, vt=vt,
                dfcom=df_com, dfadj=v_, lambda_=lambda_,
                riv=r_, fmi=fmi_, eff=eff_
            )

        # finally, return dictionary with stats from fit used in transform
        return statistics
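# A worked sketch of the Rubin's rules pooling performed in
# _get_stats_from_models above (illustrative only; the numbers are made up,
# not drawn from any dataset in this repo). With m imputed fits, the
# within-imputation variance vw averages the squared standard errors, the
# between-imputation variance vb measures spread of the point estimates,
# and the total variance is vt = vw + vb + vb / m.
def _rubin_pooling_sketch():
    import numpy as np
    params = np.array([1.02, 0.97, 1.05])      # coefficient from each fit
    std_errors = np.array([0.11, 0.12, 0.10])  # its standard error per fit
    m = len(params)
    pooled = params.mean()                     # pooled point estimate
    vw = np.mean(std_errors ** 2)              # within-imputation variance
    vb = np.sum((params - pooled) ** 2) / (m - 1)  # between-imputation variance
    vt = vw + vb + vb / m                      # total variance
    return pooled, np.sqrt(vt)                 # estimate and pooled std error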
def test_partial_dependence_imputer():
    """Test to ensure that the edge case for partial dependence is handled."""
    imp = MiceImputer(strategy='stochastic')
    imp.fit_transform(dfs.df_partial_dependence)
def test_normal_unit_variance_imputer():
    """Test normal unit variance imputer for numerical column."""
    imp_norm = MiceImputer(strategy={"y": "normal unit variance"})
    imp_norm.fit_transform(dfs.df_bayes_reg)
def main(args):
    '''Main function for UCI letter and spam datasets.

    Args:
        - data_name: letter or spam
        - miss_rate: probability of missing components
        - batch_size: batch size
        - hint_rate: hint rate
        - alpha: hyperparameter
        - iterations: iterations

    Returns:
        - imputed_data_x: imputed data
        - rmse: Root Mean Squared Error
    '''
    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate)

    # Impute missing data with GAIN
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Report the RMSE performance
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)

    print()
    mi_data = miss_data_x.astype(float)
    no, dim = imputed_data_x.shape
    miss_data = np.reshape(mi_data, (no, dim))
    np.savetxt("data/missing_data.csv", mi_data, delimiter=',', fmt='%1.2f')
    print('Shape of miss data: ', miss_data.shape)
    print('Saved results to data/missing_data.csv')

    print()
    print('=== GAIN RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse, 6)))
    # print('Shape of the output data: ', imputed_data_x.shape)
    np.savetxt("data/imputed_data.csv", imputed_data_x, delimiter=',', fmt='%d')
    print('Saved results to data/imputed_data.csv')

    # MissForest
    print()
    print('=== MissForest RMSE ===')
    data = miss_data_x
    imp_mf = MissForest(max_iter=5)
    miss_f = imp_mf.fit_transform(data)
    rmse_MF = rmse_loss(ori_data_x, miss_f, data_m)
    print('RMSE Performance: ' + str(np.round(rmse_MF, 6)))
    np.savetxt("data/imputed_data_MF.csv", miss_f, delimiter=',', fmt='%d')
    print('Saved results to data/imputed_data_MF.csv')

    # MICE from Autoimpute
    print()
    print('=== MICE of Auto Impute RMSE ===')
    data_mice = pd.DataFrame(miss_data_x)
    mi = MiceImputer(k=1, imp_kwgs=None, n=1, predictors='all',
                     return_list=True, seed=None,
                     strategy='default predictive', visit='default')
    mice_out = mi.fit_transform(data_mice)
    # fit_transform returns a list of (imputation index, dataframe) pairs;
    # take the completed dataframe from the first (and only) imputation
    mice_x = np.asarray(list(mice_out)[0][1])
    rmse_MICE = rmse_loss(ori_data_x, mice_x, data_m)
    print('RMSE Performance: ' + str(np.round(rmse_MICE, 6)))
    np.savetxt("data/imputed_data_MICE.csv", mice_x, delimiter=',', fmt='%d')
    print('Saved results to data/imputed_data_MICE.csv')

    return imputed_data_x, rmse
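# A minimal sketch of driving main() programmatically (an assumption: the
# Namespace attribute names mirror the CLI flags documented in the docstring
# above; adjust or remove if this file already defines its own argparse
# entry point).
if __name__ == '__main__':
    from argparse import Namespace
    demo_args = Namespace(data_name='letter', miss_rate=0.2, batch_size=128,
                          hint_rate=0.9, alpha=100, iterations=10000)
    imputed_data, gain_rmse = main(demo_args)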