def test_bayesian_logistic_imputer():
    """Smoke-test the bayesian binary logistic strategy on column "y".

    The test passes when fitting and transforming ``dfs.df_bayes_log``
    completes without raising.
    """
    strategy = {"y": "bayesian binary logistic"}
    y_kwgs = {"fill_value": "random"}
    imputer = SingleImputer(strategy=strategy, imp_kwgs={"y": y_kwgs})
    imputer.fit_transform(dfs.df_bayes_log)
def test_categorical_univar_imputers():
    """Test categorical methods when not using the _default.

    Each strategy in ``dfs.cat_strategies`` is applied to the "cats" column
    of ``dfs.df_ts_mixed``; every fitted per-column imputer must then report
    that same strategy in its ``statistics_``.
    """
    for expected in dfs.cat_strategies:
        single = SingleImputer(strategy={"cats": expected})
        single.fit_transform(dfs.df_ts_mixed)
        for fitted in single.statistics_.values():
            assert fitted.statistics_["strategy"] == expected
def test_bayesian_reg_imputer():
    """Smoke-test the bayesian least squares strategy on numerical data.

    Passes when both fit/transform calls complete without raising.
    """
    name = "bayesian least squares"

    # purpose-built frame first: only column "y" gets the strategy
    designed = SingleImputer(strategy={"y": name})
    designed.fit_transform(dfs.df_bayes_reg)

    # then the general numerical frame, strategy applied as a plain string
    general = SingleImputer(strategy=name)
    general.fit_transform(dfs.df_num)
def test_numerical_univar_imputers():
    """Test numerical methods when not using the _default.

    Each strategy in ``dfs.num_strategies`` is applied to ``dfs.df_num``;
    every fitted per-column imputer must then report that same strategy in
    its ``statistics_``.
    """
    for expected in dfs.num_strategies:
        single = SingleImputer(strategy=expected)
        single.fit_transform(dfs.df_num)
        for fitted in single.statistics_.values():
            assert fitted.statistics_["strategy"] == expected
def test_stochastic_predictive_imputer():
    """Test stochastic works for numerical columns of PredictiveImputer.

    Column "A" of ``dfs.df_num`` is imputed once with least squares and once
    with the stochastic strategy; both imputers must record the same
    ``imputed_`` entry for "A".
    """
    least_squares = SingleImputer(strategy={"A": "least squares"})
    stochastic = SingleImputer(strategy={"A": "stochastic"})

    # both strategies must at least run end to end
    least_squares.fit_transform(dfs.df_num)
    stochastic.fit_transform(dfs.df_num)

    assert least_squares.imputed_["A"] == stochastic.imputed_["A"]
def test_bayesian_reg_imputer():
    """Smoke-test bayesian least squares, including per-column kwargs.

    Passes when both fit/transform calls complete without raising.
    """
    name = "bayesian least squares"

    # purpose-built frame first, exercising imp_kwgs for column "y"
    y_kwgs = {"fill_value": "random", "am": 11, "cores": 2}
    designed = SingleImputer(strategy={"y": name}, imp_kwgs={"y": y_kwgs})
    designed.fit_transform(dfs.df_bayes_reg)

    # then the general numerical frame with no extra kwargs
    general = SingleImputer(strategy=name)
    general.fit_transform(dfs.df_num)
def test_pmm_lrd_imputer():
    """Smoke-test the pmm and lrd strategies on column "y".

    Both strategies receive the same per-column kwargs and are run, pmm
    first and lrd second, against ``dfs.df_bayes_reg``. The test passes when
    neither run raises.
    """
    for name in ("pmm", "lrd"):
        # a fresh kwargs dict per imputer, as each run builds its own
        y_kwgs = {"fill_value": "random", "copy_x": False}
        imputer = SingleImputer(strategy={"y": name}, imp_kwgs={"y": y_kwgs})
        imputer.fit_transform(dfs.df_bayes_reg)
def test_default_single_imputer():
    """Test the _default method and results for SingleImputer()."""
    default_imp = SingleImputer()

    # dfs.df_num: every fitted column imputer is expected to report pmm
    default_imp.fit_transform(dfs.df_num)
    for fitted in default_imp.statistics_.values():
        assert fitted.statistics_["strategy"] == "pmm"

    # dfs.df_mix: "salary" is expected to report pmm and "gender" is
    # expected to report multinomial logistic
    default_imp.fit_transform(dfs.df_mix)
    by_column = default_imp.statistics_
    gender_fit = by_column["gender"]
    salary_fit = by_column["salary"]
    assert salary_fit.statistics_["strategy"] == "pmm"
    assert gender_fit.statistics_["strategy"] == "multinomial logistic"
def model(self, method, col, predictors):
    """Impute one column of ``self.df`` with a ``SingleImputer`` strategy.

    Args:
        method: strategy passed to ``SingleImputer`` for ``col``.
        col: name of the column to impute.
        predictors: columns used to predict ``col``. When falsy, the
            columns of ``self.df`` selected by ``self.numerics`` are used.

    Side effects:
        Prints the final predictor collection and rebinds ``self.df`` to
        the result of ``fit_transform``. The caller's ``predictors`` list
        is never modified.
    """
    if predictors:
        # Copy list inputs: removing `col` below must not mutate the list
        # the caller passed in. Other types are forwarded untouched.
        if isinstance(predictors, list):
            params = list(predictors)
        else:
            params = predictors
    else:
        numeric_frame = self.df.select_dtypes(include=self.numerics)
        params = numeric_frame.columns.to_list()

    # the target column cannot be used to predict itself
    if col in params:
        params.remove(col)
    print(params)

    imputer = SingleImputer(strategy={col: method}, predictors={col: params})
    self.df = imputer.fit_transform(self.df)
np.array -- imputed dataset. """ # check if fitted then impute with mean check_is_fitted(self, "statistics_") _not_num_series(self.strategy, X) omu = self.statistics_["param"] # mean of observed data idx = X.isnull() # missing data nO = sum(~idx) # number of observed m = sum(idx) # number to impute muhatk = stats.norm(omu, np.sqrt(1 / nO)) # imputation cross-terms *NOT* uncorrelated Ymi = stats.multivariate_normal( np.ones(m) * muhatk.rvs(), np.ones((m, m)) / nO + np.eye(m)).rvs() out = X.copy() out[idx] = Ymi return out def fit_impute(self, X, y=None): """Convenience method to perform fit and imputation in one go.""" return self.fit(X, y).impute(X) if __name__ == '__main__': from autoimpute.imputations import SingleImputer si = SingleImputer('normal unit variance') Yo = stats.norm(0, 1).rvs(100) df = pd.DataFrame(columns=['Yo'], index=range(200), dtype=float) df.loc[range(100), 'Yo'] = Yo si.fit_transform(df)
def test_wrong_categorical_type():
    """Check that a categorical strategy on numerical data raises TypeError.

    The "categorical" strategy is supported, but ``dfs.df_num`` does not
    hold the column type it requires.
    """
    imputer = SingleImputer(strategy="categorical")
    with pytest.raises(TypeError):
        imputer.fit_transform(dfs.df_num)
def test_wrong_numerical_type():
    """Check that a numerical strategy on column "cats" raises TypeError.

    The "mean" strategy is supported, but the "cats" column of
    ``dfs.df_ts_mixed`` is not of a type it accepts.
    """
    imputer = SingleImputer(strategy={"cats": "mean"})
    with pytest.raises(TypeError):
        imputer.fit_transform(dfs.df_ts_mixed)
def test_bad_strategy():
    """Test that strategies not supported throw a ValueError."""
    # construction and fitting both sit inside the context manager, so the
    # error may come from either step
    with pytest.raises(ValueError):
        SingleImputer(strategy="not_a_strategy").fit_transform(dfs.df_num)
def test_single_missing_column():
    """Test that imputing ``dfs.df_col_miss`` raises a ValueError."""
    # construction and fitting both sit inside the context manager, so the
    # error may come from either step
    with pytest.raises(ValueError):
        SingleImputer().fit_transform(dfs.df_col_miss)
""" Imputation (1st Round): Univariate Imputation - Imputation method: Quadratic spline interpolation - Impute selected k-features """
# NOTE(review): the header above says "Quadratic" while fill_strategy below
# is "cubic" -- confirm which one is intended.
strategy = "interpolate"
fill_strategy = "cubic"
dict_strategy = dict()
dict_imp_kwgs = dict()
# Map every selected column to the interpolate strategy and its kwargs.
for i in idx_selected:
    dict_strategy.update({df.columns[i]: strategy})
    dict_imp_kwgs.update({df.columns[i]: {'fill_strategy': fill_strategy}})
imp_x = SingleImputer(strategy=dict_strategy, imp_kwgs=dict_imp_kwgs)
df_imputed = imp_x.fit_transform(df)
# Overlay the first selected column after and before imputation.
plt.plot(df_imputed[df.columns[idx_selected[0]]], label='Imputed')
plt.plot(df[df.columns[idx_selected[0]]], label='Actual')
# Split the first 80% of rows from the rest.
# NOTE(review): the split reads from `df`, not `df_imputed` -- confirm.
# NOTE(review): `.values[1:]` / `.values[0]` index rows of each split, not
# columns -- confirm this is the intended feature/target separation.
train_ratio = 0.8
split_idx = int(len(df) * train_ratio)
x_train = df[:split_idx].values[1:]
y_train = df[:split_idx].values[0]
x_test = df[split_idx:].values[1:]
y_test = df[split_idx:].values[0]
def plot_imp_scatter(d, x, y, strategy, color=None,
                     title="Jointplot after Imputation", h=8.27,
                     imp_kwgs=None, a=0.5, marginals=None,
                     obs_color="navy", imp_color="red", **plot_kwgs):
    """Plot the joint scatter and density plot after single imputation.

    Runs a SingleImputer over ``d`` and draws a seaborn jointplot of ``x``
    against ``y``, coloring observed points and imputed points differently.
    Intended for inspecting how an imputation strategy behaves when ``y`` is
    imputed and ``x`` is one of its predictors.

    Args:
        d (pd.DataFrame): DataFrame with data to impute and plot.
        x (str): column to plot on x axis.
        y (str): column to plot on y axis.
        strategy (str): imputation method for SingleImputer.
        color (str, Optional): column whose imputed rows are highlighted.
            Default is None, which means ``y``. Must equal ``x`` or ``y``.
        title (str, Optional): title of plot.
            Default is "Jointplot after Imputation".
        h (float, Optional): height of the jointplot. Default is 8.27.
        imp_kwgs (dict, Optional): imp kwgs for SingleImputer procedure.
            Default is None.
        a (float, Optional): alpha for plot color. Default is 0.5.
        marginals (dict, Optional): dictionary of marginal plot args.
            Default is None, which becomes ``dict(rug=True, kde=True)``.
        obs_color (str, Optional): color of observed. Default is navy.
        imp_color (str, Optional): color of imputations. Default is red.
        **plot_kwgs: keyword arguments forwarded to the default plot setup.

    Raises:
        ValueError: x and y must be names of columns in data.
        ValueError: color must be the same as ``y`` or ``x``.
    """
    # plot setup and arg validation
    _default_plot_args(**plot_kwgs)
    _validate_kwgs(marginals)
    _validate_kwgs(imp_kwgs)
    marginal_kws = dict(rug=True, kde=True) if marginals is None else marginals

    # both axes must name real columns
    if x not in d.columns or y not in d.columns:
        raise ValueError("x and y must be names of columns in data")

    # only forward imp_kwgs when the caller actually supplied them
    imputer_args = {"strategy": strategy}
    if imp_kwgs is not None:
        imputer_args["imp_kwgs"] = imp_kwgs
    imputer = SingleImputer(**imputer_args)

    # resolve which column drives the highlight color
    if color is None:
        color = y
    elif not (color == y or color == x):
        raise ValueError("color must be the same as `y` or `x`")

    # impute, then tag each row as observed or imputed
    imputed = imputer.fit_transform(d)
    imputed["colors"] = obs_color
    imputed.loc[imputer.imputed_[color], "colors"] = imp_color
    point_colors = imputed["colors"]

    # create the joint plot
    grid = sns.jointplot(
        x=x, y=y, data=imputed, alpha=a, height=h,
        joint_kws=dict(facecolor=point_colors, edgecolor=point_colors),
        marginal_kws=marginal_kws,
    )

    # leave room above the axes for the title
    plt.subplots_adjust(top=0.925)
    grid.fig.suptitle(title)
# Multiple imputation of `data` with n=3; the result is printed below.
imp = MultipleImputer(n=3)
res = imp.fit_transform(data)
print(res)
# Bare expressions: these only display a value in an interactive session.
res.shape
data.shape
from autoimpute.imputations import SingleImputer
# Single imputation with one strategy per column.
single = SingleImputer(
    strategy={
        'status': "categorical",
        'release_year': "median",
        'runtime': 'norm',
        'release_month': 'random'
    })
data_imputed_once = single.fit_transform(data)
# Inspect remaining missing values and the imputed release_month column.
data_imputed_once.isna().sum()
data_imputed_once.release_month.unique()
data_imputed_once['release_month']
# NOTE(review): `triple` is not used in the visible part of the script.
triple = MultipleImputer(n=3)
sns.scatterplot(x='release_year', y='runtime', data=data_imputed_once)
sns.catplot(x='release_year', y='rating', kind='box', data=data_imputed_once)
# From here on `data` refers to the singly-imputed frame.
data = data_imputed_once
# Start modeling: one-hot encode `status`, dropping the first level.
x = pd.get_dummies(data, columns=['status'], drop_first=True)
def test_normal_unit_variance_imputer():
    """Smoke-test the normal unit variance strategy on column "y".

    Passes when fitting and transforming ``dfs.df_bayes_reg`` completes
    without raising.
    """
    strategy = {"y": "normal unit variance"}
    imp_nuv = SingleImputer(strategy=strategy)
    imp_nuv.fit_transform(dfs.df_bayes_reg)
def test_partial_dependence_imputer():
    """Test that the partial dependence edge case is handled.

    Passes when the stochastic strategy fits and transforms
    ``dfs.df_partial_dependence`` without raising.
    """
    stochastic = SingleImputer(strategy="stochastic")
    stochastic.fit_transform(dfs.df_partial_dependence)