Exemplo n.º 1
0
def test_bayesian_logistic_imputer():
    """Fit and transform column "y" with the bayesian binary logistic strategy."""
    strategy = {"y": "bayesian binary logistic"}
    kwargs = {"y": {"fill_value": "random"}}
    imputer = SingleImputer(strategy=strategy, imp_kwgs=kwargs)
    # No assertion: the test passes when fit_transform raises nothing.
    imputer.fit_transform(dfs.df_bayes_log)
Exemplo n.º 2
0
def test_categorical_univar_imputers():
    """Check each categorical strategy is the one recorded after fitting."""
    for expected in dfs.cat_strategies:
        imputer = SingleImputer(strategy={"cats": expected})
        imputer.fit_transform(dfs.df_ts_mixed)
        # every fitted per-column imputer must report the requested strategy
        for fitted in imputer.statistics_.values():
            assert fitted.statistics_["strategy"] == expected
Exemplo n.º 3
0
def test_bayesian_reg_imputer():
    """Run the bayesian least squares strategy on two numerical frames."""
    strategy_name = "bayesian least squares"
    # purpose-built frame: strategy applied to column "y" only
    designed = SingleImputer(strategy={"y": strategy_name})
    designed.fit_transform(dfs.df_bayes_reg)
    # general numerical frame: same strategy given as a plain string
    general = SingleImputer(strategy=strategy_name)
    general.fit_transform(dfs.df_num)
Exemplo n.º 4
0
def test_numerical_univar_imputers():
    """Check each numerical strategy is the one recorded after fitting."""
    for expected in dfs.num_strategies:
        imputer = SingleImputer(strategy=expected)
        imputer.fit_transform(dfs.df_num)
        # every fitted per-column imputer must report the requested strategy
        for fitted in imputer.statistics_.values():
            assert fitted.statistics_["strategy"] == expected
Exemplo n.º 5
0
def test_stochastic_predictive_imputer():
    """Compare the imputed entries of column "A" for two strategies.

    A least squares imputer and a stochastic imputer are both fit on the
    numerical frame; their ``imputed_["A"]`` values must be equal.
    """
    # build both imputers before fitting either one
    least_squares, stochastic = [
        SingleImputer(strategy={"A": name})
        for name in ("least squares", "stochastic")
    ]
    # the transformed frames are not needed, only the fitted state
    for imputer in (least_squares, stochastic):
        imputer.fit_transform(dfs.df_num)
    assert least_squares.imputed_["A"] == stochastic.imputed_["A"]
Exemplo n.º 6
0
def test_bayesian_reg_imputer():
    """Run bayesian least squares with custom kwargs, then without any."""
    strategy_name = "bayesian least squares"
    # purpose-built frame: column "y" with explicit imputer kwargs
    y_kwargs = {"fill_value": "random", "am": 11, "cores": 2}
    designed = SingleImputer(strategy={"y": strategy_name},
                             imp_kwgs={"y": y_kwargs})
    designed.fit_transform(dfs.df_bayes_reg)
    # general numerical frame: same strategy as a plain string, no kwargs
    general = SingleImputer(strategy=strategy_name)
    general.fit_transform(dfs.df_num)
Exemplo n.º 7
0
def test_pmm_lrd_imputer():
    """Run the pmm and lrd strategies on column "y" with custom kwargs."""
    # pmm first, lrd second; each gets its own imputer and kwargs dict
    for strategy_name in ("pmm", "lrd"):
        y_kwargs = {"fill_value": "random", "copy_x": False}
        imputer = SingleImputer(strategy={"y": strategy_name},
                                imp_kwgs={"y": y_kwargs})
        imputer.fit_transform(dfs.df_bayes_reg)
Exemplo n.º 8
0
def test_default_single_imputer():
    """Check the strategies a default SingleImputer() records after fitting."""
    imputer = SingleImputer()

    # Numerical frame: every fitted column must report "pmm".
    imputer.fit_transform(dfs.df_num)
    for fitted in imputer.statistics_.values():
        assert fitted.statistics_["strategy"] == "pmm"

    # Mixed frame: "salary" must report "pmm" and "gender" must report
    # "multinomial logistic".
    imputer.fit_transform(dfs.df_mix)
    per_column = imputer.statistics_
    gender, salary = per_column["gender"], per_column["salary"]
    assert salary.statistics_["strategy"] == "pmm"
    assert gender.statistics_["strategy"] == "multinomial logistic"
Exemplo n.º 9
0
 def model(self, method, col, predictors):
     """Impute ``col`` of ``self.df`` with the given SingleImputer strategy.

     Args:
         method: strategy name assigned to ``col``.
         col: name of the column to impute.
         predictors: predictors to use for ``col``. When falsy, the columns
             of ``self.df`` selected by ``self.numerics`` are used instead,
             with ``col`` itself removed; that derived list is printed.

     The result of ``fit_transform`` replaces ``self.df``; nothing is
     returned.
     """
     if not predictors:
         # fall back to the columns matching self.numerics, minus the target
         numeric = self.df.select_dtypes(include=self.numerics)
         predictors = numeric.columns.to_list()
         if col in predictors:
             predictors.remove(col)
         print(predictors)
     strategy = {col: method}
     imputer = SingleImputer(strategy=strategy, predictors={col: predictors})
     self.df = imputer.fit_transform(self.df)
Exemplo n.º 10
0
            np.array -- imputed dataset.
        """
        # check if fitted then impute with mean
        check_is_fitted(self, "statistics_")
        _not_num_series(self.strategy, X)
        omu = self.statistics_["param"]  # mean of observed data
        idx = X.isnull()  # missing data
        nO = sum(~idx)  # number of observed
        m = sum(idx)  # number to impute
        muhatk = stats.norm(omu, np.sqrt(1 / nO))
        # imputation cross-terms *NOT* uncorrelated
        Ymi = stats.multivariate_normal(
            np.ones(m) * muhatk.rvs(),
            np.ones((m, m)) / nO + np.eye(m)).rvs()
        out = X.copy()
        out[idx] = Ymi
        return out

    def fit_impute(self, X, y=None):
        """Fit on X (and optional y), then impute X in a single call.

        Returns whatever ``impute(X)`` returns when called on the object
        that ``fit(X, y)`` hands back.
        """
        fitted = self.fit(X, y)
        return fitted.impute(X)


if __name__ == '__main__':
    # Manual smoke run: a 200-row float column 'Yo' whose first 100 rows
    # hold standard normal draws and whose remaining rows are left empty,
    # pushed through SingleImputer with the 'normal unit variance' strategy.
    from autoimpute.imputations import SingleImputer
    n_observed, n_rows = 100, 200
    imputer = SingleImputer('normal unit variance')
    observed = stats.norm(0, 1).rvs(n_observed)
    frame = pd.DataFrame(columns=['Yo'], index=range(n_rows), dtype=float)
    frame.loc[range(n_observed), 'Yo'] = observed
    imputer.fit_transform(frame)
Exemplo n.º 11
0
def test_wrong_categorical_type():
    """Expect a TypeError when "categorical" is fit on the numerical frame."""
    imputer = SingleImputer(strategy="categorical")
    # only the fit/transform call is expected to raise
    with pytest.raises(TypeError):
        imputer.fit_transform(dfs.df_num)
Exemplo n.º 12
0
def test_wrong_numerical_type():
    """Expect a TypeError when "mean" is requested for the "cats" column."""
    imputer = SingleImputer(strategy={"cats": "mean"})
    # only the fit/transform call is expected to raise
    with pytest.raises(TypeError):
        imputer.fit_transform(dfs.df_ts_mixed)
Exemplo n.º 13
0
def test_bad_strategy():
    """Expect a ValueError when the strategy name is not supported."""
    unsupported = "not_a_strategy"
    # construction and fitting both sit inside the block, so the error may
    # come from either step
    with pytest.raises(ValueError):
        SingleImputer(strategy=unsupported).fit_transform(dfs.df_num)
Exemplo n.º 14
0
def test_single_missing_column():
    """Expect a ValueError when a default imputer is fit on df_col_miss."""
    # presumably df_col_miss contains a fully missing column (per the
    # fixture name) -- confirm against the fixture definition
    with pytest.raises(ValueError):
        SingleImputer().fit_transform(dfs.df_col_miss)
Exemplo n.º 15
0
"""
Imputation (1st Round): Univariate Imputation
    - Imputation method: Quadratic spline interpolation
    - Impute selected k-features
"""

strategy = "interpolate"
fill_strategy = "cubic"

dict_strategy = dict()
dict_imp_kwgs = dict()

for i in idx_selected:
    dict_strategy.update({df.columns[i]: strategy})
    dict_imp_kwgs.update({df.columns[i]: {'fill_strategy': fill_strategy}})

imp_x = SingleImputer(strategy=dict_strategy, imp_kwgs=dict_imp_kwgs)

df_imputed = imp_x.fit_transform(df)

plt.plot(df_imputed[df.columns[idx_selected[0]]], label='Imputed')
plt.plot(df[df.columns[idx_selected[0]]], label='Actual')

train_ratio = 0.8
split_idx = int(len(df) * train_ratio)

x_train = df[:split_idx].values[1:]
y_train = df[:split_idx].values[0]
x_test = df[split_idx:].values[1:]
y_test = df[split_idx:].values[0]
Exemplo n.º 16
0
def plot_imp_scatter(d,
                     x,
                     y,
                     strategy,
                     color=None,
                     title="Jointplot after Imputation",
                     h=8.27,
                     imp_kwgs=None,
                     a=0.5,
                     marginals=None,
                     obs_color="navy",
                     imp_color="red",
                     **plot_kwgs):
    """Draw a seaborn jointplot of x against y after single imputation.

    The data is run through a SingleImputer built from ``strategy`` (and
    ``imp_kwgs`` when given). Rows listed as imputed for the ``color``
    column are drawn in ``imp_color``; all other rows use ``obs_color``.
    Intended for inspecting how an imputation method behaves when y is
    imputed and x is one of its predictors.

    Args:
        d (pd.DataFrame): DataFrame with data to impute and plot.
        x (str): column to plot on the x axis.
        y (str): column to plot on the y axis.
        strategy (str): imputation method for SingleImputer.
        color (str, Optional): column whose imputed rows are highlighted.
            Default is None, which highlights the imputations of y. Must
            otherwise equal x or y.
        title (str, Optional): title of the plot.
            Default is "Jointplot after Imputation".
        h (float, Optional): height of the jointplot. Default is 8.27.
        imp_kwgs (dict, Optional): imp kwgs for the SingleImputer.
            Default is None, in which case none are passed.
        a (float, Optional): alpha for plot color. Default is 0.5.
        marginals (dict, Optional): marginal plot arguments.
            Default is None, which becomes ``dict(rug=True, kde=True)``.
        obs_color (str, Optional): color of observed. Default is navy.
        imp_color (str, Optional): color of imputations. Default is red.
        **plot_kwgs: forwarded to ``_default_plot_args`` (described in the
            original documentation as keyword arguments used by sns.set).

    Raises:
        ValueError: if x or y is not a column of d, or if color is given
            and equals neither x nor y.
    """

    # plot defaults first, then validation of the optional dict arguments
    _default_plot_args(**plot_kwgs)
    for optional_kwgs in (marginals, imp_kwgs):
        _validate_kwgs(optional_kwgs)
    if marginals is None:
        marginals = {"rug": True, "kde": True}

    # both axes must name existing columns
    if x not in d.columns or y not in d.columns:
        raise ValueError("x and y must be names of columns in data")

    # only hand imp_kwgs to the imputer when the caller supplied them
    imputer_args = {"strategy": strategy}
    if imp_kwgs is not None:
        imputer_args["imp_kwgs"] = imp_kwgs
    imp = SingleImputer(**imputer_args)

    # resolve which column's imputations get highlighted
    if color is None:
        color = y
    elif not (color == y or color == x):
        raise ValueError("color must be the same as `y` or `x`")

    # impute, then tag each row with the color it should be drawn in
    imputed = imp.fit_transform(d)
    imputed["colors"] = obs_color
    imputed.loc[imp.imputed_[color], "colors"] = imp_color
    point_colors = imputed["colors"]

    # joint scatter plus marginals
    grid = sns.jointplot(x=x,
                         y=y,
                         data=imputed,
                         alpha=a,
                         height=h,
                         joint_kws={"facecolor": point_colors,
                                    "edgecolor": point_colors},
                         marginal_kws=marginals)

    # leave room at the top for the figure title
    plt.subplots_adjust(top=0.925)
    grid.fig.suptitle(title)
Exemplo n.º 17
0
# Multiple imputation: fit and transform `data` with n=3, then print the
# result.
imp = MultipleImputer(n=3)

res = imp.fit_transform(data)
print(res)
# Bare expressions: their values are only displayed in an interactive
# session and have no effect when this runs as a script.
res.shape
data.shape

# Single imputation with one strategy per column.
from autoimpute.imputations import SingleImputer
single = SingleImputer(
    strategy={
        'status': "categorical",
        'release_year': "median",
        'runtime': 'norm',
        'release_month': 'random'
    })
data_imputed_once = single.fit_transform(data)
# Interactive inspection of the imputed frame (bare expressions again).
data_imputed_once.isna().sum()
data_imputed_once.release_month.unique()
data_imputed_once['release_month']

# NOTE(review): `triple` is not used anywhere in the visible code.
triple = MultipleImputer(n=3)

# Quick visual checks on the singly imputed data.
sns.scatterplot(x='release_year', y='runtime', data=data_imputed_once)
sns.catplot(x='release_year', y='rating', kind='box', data=data_imputed_once)

# From here on `data` refers to the singly imputed frame.
data = data_imputed_once

# Start modeling.

# One-hot encode 'status', dropping the first level.
x = pd.get_dummies(data, columns=['status'], drop_first=True)
Exemplo n.º 18
0
def test_normal_unit_variance_imputer():
    """Fit and transform column "y" with the normal unit variance strategy."""
    strategy = {"y": "normal unit variance"}
    imputer = SingleImputer(strategy=strategy)
    # No assertion: the test passes when fit_transform raises nothing.
    imputer.fit_transform(dfs.df_bayes_reg)
Exemplo n.º 19
0
def test_partial_dependence_imputer():
    """Ensure the partial dependence edge case is handled by "stochastic"."""
    imputer = SingleImputer(strategy="stochastic")
    # No assertion: the test passes when fit_transform raises nothing.
    imputer.fit_transform(dfs.df_partial_dependence)