Exemplo n.º 1
0
def test_wrong_categorical_type():
    """Expect a TypeError when the categorical strategy is fit on `dfs.df_num`.

    NOTE(review): `df_num` is presumably an all-numeric fixture, which is why
    the categorical strategy should be rejected -- confirm in the fixtures.
    """
    imputer = SingleImputer(strategy="categorical")
    with pytest.raises(TypeError):
        imputer.fit_transform(dfs.df_num)
Exemplo n.º 2
0
def test_bad_strategy():
    """Expect a ValueError when an unknown strategy name is requested.

    Construction and fitting both stay inside the `raises` context, so the
    test passes whichever of the two steps rejects the strategy name.
    """
    with pytest.raises(ValueError):
        SingleImputer(strategy="not_a_strategy").fit_transform(dfs.df_num)
Exemplo n.º 3
0
def test_wrong_numerical_type():
    """Expect a TypeError when `mean` is requested for the `cats` column.

    NOTE(review): `cats` is presumably a categorical column of the
    `df_ts_mixed` fixture -- confirm in the fixtures.
    """
    column_strategies = {"cats": "mean"}
    imputer = SingleImputer(strategy=column_strategies)
    with pytest.raises(TypeError):
        imputer.fit_transform(dfs.df_ts_mixed)
Exemplo n.º 4
0
def test_partial_dependence_imputer():
    """Smoke-test stochastic imputation on the partial-dependence fixture.

    There is no explicit assertion: the test passes as long as
    `fit_transform` completes without raising.
    """
    SingleImputer(strategy='stochastic').fit_transform(
        dfs.df_partial_dependence)
Exemplo n.º 5
0
def test_single_missing_column():
    """Expect a ValueError when a default imputer is fit on `dfs.df_col_miss`.

    NOTE(review): the fixture presumably contains a fully missing column;
    the test checks that this is rejected, not that the column is dropped.
    """
    with pytest.raises(ValueError):
        SingleImputer().fit_transform(dfs.df_col_miss)
Exemplo n.º 6
0
def test_normal_unit_variance_imputer():
    """Smoke-test the normal unit variance strategy applied to column `y`.

    No assertion is made; the test passes if `fit_transform` does not raise.
    """
    strategy_by_column = {"y": "normal unit variance"}
    imputer = SingleImputer(strategy=strategy_by_column)
    imputer.fit_transform(dfs.df_bayes_reg)
Exemplo n.º 7
0
"""
Imputation (1st Round): Univariate Imputation
    - Imputation method: Quadratic spline interpolation
    - Impute selected k-features
"""

strategy = "interpolate"
fill_strategy = "cubic"

dict_strategy = dict()
dict_imp_kwgs = dict()

for i in idx_selected:
    dict_strategy.update({df.columns[i]: strategy})
    dict_imp_kwgs.update({df.columns[i]: {'fill_strategy': fill_strategy}})

imp_x = SingleImputer(strategy=dict_strategy, imp_kwgs=dict_imp_kwgs)

df_imputed = imp_x.fit_transform(df)

plt.plot(df_imputed[df.columns[idx_selected[0]]], label='Imputed')
plt.plot(df[df.columns[idx_selected[0]]], label='Actual')

train_ratio = 0.8
split_idx = int(len(df) * train_ratio)

x_train = df[:split_idx].values[1:]
y_train = df[:split_idx].values[0]
x_test = df[split_idx:].values[1:]
y_test = df[split_idx:].values[0]
Exemplo n.º 8
0
# One tick per column of X, labelled with `indices`; the x-range is widened
# by one unit on each side of the tick positions before showing the figure.
n_columns = X.shape[1]
plt.xticks(range(n_columns), indices)
plt.xlim([-1, n_columns])
plt.show()

import os
# For each training id: print it, then inspect the first entry of the `text`
# directory. If that entry is a .gz file, load it as a gzip CSV and print
# the row labels whose 'id' equals the current id.
# NOTE(review): only the first directory entry is ever examined, and the
# same file is re-read for every id -- confirm that scanning all .gz files
# (and loading each once) was not the intent.
for ids in train['id']:
    print(ids)
    for file in os.listdir(text)[:1]:
        if not file.endswith(".gz"):
            continue
        print(file)
        data = pd.read_csv(text + file, compression='gzip')
        print(data.index[data['id'] == ids].tolist())

from autoimpute.imputations import SingleImputer, MultipleImputer
si = SingleImputer()  # imputation methods, passing through the data once
mi = MultipleImputer()  # imputation methods, passing through the data multiple times

# Training features: drop the identifier and the target, then impute.
X = train.drop(['id', 'target'], axis=1)
X = MICE().fit_transform(X)

# Test features: drop the identifier, then impute the frame just built.
# The imputer must be fed `X_test1`; passing a different variable would
# silently discard the `drop` result above.
# NOTE(review): a fresh MICE is fit on the test features instead of reusing
# the one fit on the training features -- confirm this is intended.
X_test1 = test.drop(['id'], axis=1)
X_test1 = MICE().fit_transform(X_test1)

def scorer(true, pred):
    """Turn the root-mean-squared error into a score via exp(-RMSE).

    Args:
        true: reference values.
        pred: predicted values.

    Returns:
        float: exp(-RMSE); 1.0 for a zero error, smaller as the error grows.
    """
    rmse = math.sqrt(mean_squared_error(pred, true))
    return math.exp(-rmse)


# Wrap the function as a scorer object; larger values are treated as better.
score = make_scorer(scorer, greater_is_better=True)

# Rebuild the training features from `train` without 'id' and 'target'.
# NOTE(review): this rebinds X and so replaces the MICE-imputed value that
# was assigned to X earlier in this script -- confirm that is intended.
X = train.drop(['id', 'target'], axis=1)
Exemplo n.º 9
0
def plot_imp_scatter(d,
                     x,
                     y,
                     strategy,
                     color=None,
                     title="Jointplot after Imputation",
                     h=8.27,
                     imp_kwgs=None,
                     a=0.5,
                     marginals=None,
                     obs_color="navy",
                     imp_color="red",
                     **plot_kwgs):
    """Draw a joint scatter/density plot of x against y after imputation.

    The data is run through a SingleImputer once, and every point is drawn
    in one of two colors depending on whether the `color` column was
    observed or imputed for that row. Intended for inspecting what a given
    imputation strategy does to the relationship between two features,
    where y is imputed and x is a predictor used to impute y.

    Args:
        d (pd.DataFrame): DataFrame with data to impute and plot.
        x (str): column to plot on the x axis.
        y (str): column to plot on the y axis.
        strategy (str): imputation method passed to SingleImputer.
        color (str, Optional): column whose imputed rows are highlighted.
            Default is None, which highlights y. Must equal x or y.
        title (str, Optional): title of the plot.
            Default is "Jointplot after Imputation".
        h (float, Optional): height of the jointplot. Default is 8.27.
        imp_kwgs (dict, Optional): imp kwgs for the SingleImputer.
            Default is None, in which case none are passed.
        a (float, Optional): alpha for the plot color. Default is 0.5.
        marginals (dict, Optional): arguments for the marginal plots.
            Default is None, which becomes rug=True, kde=True.
        obs_color (str, Optional): color of observed points.
            Default is navy.
        imp_color (str, Optional): color of imputed points. Default is red.
        **plot_kwgs: forwarded to `_default_plot_args` (presumably consumed
            by sns.set).

    Returns:
        None. The figure is drawn through seaborn / matplotlib.

    Raises:
        ValueError: x or y is not a column of d.
        ValueError: color is given but equals neither x nor y.
    """

    # global plot defaults first, then sanity checks on the optional dicts
    _default_plot_args(**plot_kwgs)
    for optional_kwgs in (marginals, imp_kwgs):
        _validate_kwgs(optional_kwgs)
    if marginals is None:
        marginals = {"rug": True, "kde": True}

    # both axes have to name existing columns
    if not (x in d.columns and y in d.columns):
        raise ValueError("x and y must be names of columns in data")

    # forward imp_kwgs only when the caller actually supplied them
    imputer_args = {"strategy": strategy}
    if imp_kwgs is not None:
        imputer_args["imp_kwgs"] = imp_kwgs
    imp = SingleImputer(**imputer_args)

    # the highlighted column defaults to y and may otherwise only be x or y
    if color is None:
        color = y
    elif color not in (y, x):
        raise ValueError("color must be the same as `y` or `x`")

    # impute, then mark every row as observed unless the imputer lists it
    # under the highlighted column
    impute = imp.fit_transform(d)
    impute["colors"] = obs_color
    imputed_rows = imp.imputed_[color]
    impute.loc[imputed_rows, "colors"] = imp_color
    point_colors = impute["colors"]

    # joint plot with the per-point colors for both face and edge
    g = sns.jointplot(
        x=x,
        y=y,
        data=impute,
        alpha=a,
        height=h,
        joint_kws={"facecolor": point_colors, "edgecolor": point_colors},
        marginal_kws=marginals,
    )

    # leave room at the top of the figure for the title
    plt.subplots_adjust(top=0.925)
    g.fig.suptitle(title)
Exemplo n.º 10
0
            np.array -- imputed dataset.
        """
        # check if fitted then impute with mean
        check_is_fitted(self, "statistics_")
        _not_num_series(self.strategy, X)
        omu = self.statistics_["param"]  # mean of observed data
        idx = X.isnull()  # missing data
        nO = sum(~idx)  # number of observed
        m = sum(idx)  # number to impute
        muhatk = stats.norm(omu, np.sqrt(1 / nO))
        # imputation cross-terms *NOT* uncorrelated
        Ymi = stats.multivariate_normal(
            np.ones(m) * muhatk.rvs(),
            np.ones((m, m)) / nO + np.eye(m)).rvs()
        out = X.copy()
        out[idx] = Ymi
        return out

    def fit_impute(self, X, y=None):
        """Fit on X (and the optional y), then impute X with the fitted object.

        Args:
            X: data to fit on and to impute.
            y: optional second argument handed to `fit`. Default is None.

        Returns:
            Whatever `impute(X)` returns on the object returned by `fit`.
        """
        fitted = self.fit(X, y)
        return fitted.impute(X)


if __name__ == '__main__':
    # Ad-hoc demo: build one float column of 200 rows, fill the first 100
    # with standard-normal draws, leave the rest empty, and run the
    # 'normal unit variance' strategy over it. The result is discarded.
    from autoimpute.imputations import SingleImputer
    si = SingleImputer('normal unit variance')
    n_observed, n_total = 100, 200
    Yo = stats.norm(0, 1).rvs(n_observed)
    df = pd.DataFrame(columns=['Yo'], index=range(n_total), dtype=float)
    df.loc[range(n_observed), 'Yo'] = Yo
    si.fit_transform(df)