def Imputer(method):
    """Build and return an ``impute(data)`` callable for *method*.

    Parameters
    ----------
    method : str
        One of 'MICE', 'fancymean', 'zero', 'fancymedian', 'min',
        'random', 'mean', 'median', 'most_frequent', or 'drop'.

    Returns
    -------
    callable
        A function mapping a 2-D array containing NaNs to a completed
        array (or raising NotImplementedError for 'drop').

    Raises
    ------
    ValueError
        If *method* is not a recognized name.  (The original fell
        through to ``return impute`` with ``impute`` unbound and died
        with an UnboundLocalError instead.)
    """
    if method == 'MICE':
        # NOTE(review): `n_nearest_columns` is read from enclosing/global
        # scope -- confirm it is defined at module level.
        imputer = fancyimpute.MICE(
            n_nearest_columns=n_nearest_columns,
            min_value=0.0,
            verbose=False,
        )

        def impute(data):
            return imputer.complete(data)
    elif method in ('fancymean', 'zero', 'fancymedian', 'min', 'random'):
        imputer = fancyimpute.SimpleFill(
            min_value=0.0,
            fill_method=method,
        )

        def impute(data):
            return imputer.complete(data)
    elif method in ('mean', 'median', 'most_frequent'):
        import sklearn.preprocessing
        imputer = sklearn.preprocessing.Imputer(
            strategy=method,
        )

        def impute(data):
            return imputer.fit_transform(data)
    elif method == 'drop':
        def impute(data):
            # Deliberate placeholder: 'drop' is not implemented yet.
            raise NotImplementedError
    else:
        raise ValueError(f"unknown imputation method: {method!r}")

    return impute
# Exemplo n.º 2
# 0
def mean_fill(perc, c):
    """Score mean-fill imputation by summed negative log probability.

    Fits a BayesianRidge model on mean-imputed training data and returns
    the total negative log-density of the held-out drug responses under
    the model's predictive Gaussian (lower is better).
    """
    model = BayesianRidge()
    train_df, train_y = glm_testing.create_missing(perc=perc, c=c)
    test_X, test_y = glm_testing.test_drug(c=c)
    design = fancyimpute.SimpleFill(fill_method='mean').fit_transform(train_df)
    model.fit(design, train_y)
    mu, sigma = model.predict(test_X, return_std=True)
    return -scipy.stats.norm(mu, sigma).logpdf(test_y).sum()
# Exemplo n.º 3
# 0
 def impute_value(self, df, method="MICE"):
     """Impute missing values in *df* with the requested fancyimpute model.

     method: "MICE" (default), "KNN" (k=4 neighbors), or anything else
     to fall back to a plain SimpleFill.
     """
     builders = {
         "MICE": lambda: fi.MICE(verbose=False),
         "KNN": lambda: fi.KNN(k=4, verbose=False),
     }
     model = builders.get(method, fi.SimpleFill)()
     return model.complete(df)
def impute_targetdata(targetdata):
    """Fill the few missing values in *targetdata* with a simple fill.

    Returns the input unchanged when it contains no NaNs; otherwise
    returns a new DataFrame (same index and columns) with NaNs imputed.
    """
    if numpy.sum(numpy.isnan(targetdata.values)) == 0:
        return targetdata
    filled = fancyimpute.SimpleFill().complete(targetdata.values)
    return pandas.DataFrame(filled,
                            index=targetdata.index,
                            columns=targetdata.columns)
# Exemplo n.º 5
# 0
def dec_mean(perc, c):
    """Log-likelihood of a DecisionTreeClassifier on mean-imputed data."""
    def to_binary(v):
        return 1 if v > 1 else 0

    model = DecisionTreeClassifier()
    train_df, train_y = glm_testing.create_missing(perc=perc, c=c)
    train_y = train_y.apply(to_binary)
    test_X, test_y = glm_testing.test_drug(c=c)
    test_y = test_y.apply(to_binary)
    design = fancyimpute.SimpleFill(fill_method='mean').fit_transform(train_df)
    model.fit(design, train_y)
    proba = model.predict_proba(test_X)
    positives = proba[:, 1][test_y == 1].prod()
    negatives = proba[:, 0][test_y == 0].prod()
    return np.log(positives * negatives)
# Exemplo n.º 6
# 0
def log_mean(perc, c, extras=0):
    """Log-likelihood of a liblinear LogisticRegression on mean-imputed data."""
    def to_binary(v):
        return 1 if v > 1 else 0

    model = LogisticRegression(solver="liblinear")
    train_df, train_y = glm_testing.create_missing(perc=perc, c=c, extras=extras)
    train_y = train_y.apply(to_binary)
    test_X, test_y = glm_testing.test_drug(c=c, extras=extras)
    test_y = test_y.apply(to_binary)
    design = fancyimpute.SimpleFill(fill_method='mean').fit_transform(train_df)
    model.fit(design, train_y)
    proba = model.predict_proba(test_X)
    positives = proba[:, 1][test_y == 1].prod()
    negatives = proba[:, 0][test_y == 0].prod()
    return np.log(positives * negatives)
# Exemplo n.º 7
# 0
def mean_fill_conf(perc, c):
    """Count held-out points falling outside the 95% confidence band.

    Fits a BayesianRidge model on mean-imputed training data and counts
    how many drug responses have predictive density below 0.05.
    """
    model = BayesianRidge()
    train_df, train_y = glm_testing.create_missing(perc=perc, c=c)
    test_X, test_y = glm_testing.test_drug(c=c)
    design = fancyimpute.SimpleFill(fill_method='mean').fit_transform(train_df)
    model.fit(design, train_y)
    mu, sigma = model.predict(test_X, return_std=True)
    densities = scipy.stats.norm(mu, sigma).pdf(test_y)
    return sum(densities < 0.05)  # 95 percent confidence
# Exemplo n.º 8
# 0
def impute(df, method, verbose=False):
    """
    Impute missing data using specified imputation method.

    Parameters
    ----------
    df: pd.DataFrame
        Stat DataFrame with source columns and player/team multi-index.
    method: str/bool
        Imputation method for missing data.
            - False: Do not impute missing data.
            - None: Do not impute missing data.
            - 'BiScaler'
            - 'IterativeImpute'
            - 'IterativeSVD'
            - 'KNN': Impute with nearest neighbors.
            - 'MatrixFactorization'
            - 'Mean': Impute missing with average of other sources.
            - 'Median': Impute missing with median of other sources.
            - 'NuclearNorm'
            - 'SoftImpute'
    verbose: bool, default=False
        If True, print debugging information.

    Returns
    -------
    df: pd.DataFrame
        Imputed DataFrame with no NaNs.

    Raises
    ------
    ValueError
        If `method` is not a supported option and imputation is actually
        needed.  (The original fell through with `imputed_vals` unbound
        and crashed with an UnboundLocalError at the recombination step.)
    """
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    # Subset DataFrame to include only projection columns; fancyimpute
    # models operate on the transposed numeric matrix.
    ignored_cols = ['Player', 'Team', 'Pos', 'Week', 'STATS']
    impute_cols = [col for col in list(df) if col not in ignored_cols]
    X = df[impute_cols].copy().T

    # Impute DataFrame.
    v = verbose
    if method in [None, False]:
        imputed_vals = X.values
    elif np.sum(np.sum(X.isnull())) == 0:
        # No missing values.
        imputed_vals = X.values
    elif method == 'BiScaler':
        imputed_vals = fi.BiScaler(verbose=v).fit_transform(X)
    elif method == 'IterativeImpute':
        imputed_vals = fi.IterativeImputer(verbose=v).fit_transform(X)
    elif method == 'IterativeSVD':
        imputed_vals = fi.IterativeSVD(verbose=v).fit_transform(X)
    elif method == 'KNN':
        imputed_vals = fi.KNN(k=3, verbose=v).fit_transform(X)
    elif method == 'MatrixFactorization':
        imputed_vals = fi.MatrixFactorization(verbose=v).fit_transform(X)
    elif method == 'Mean':
        imputed_vals = fi.SimpleFill('mean').fit_transform(X)
    elif method == 'Median':
        imputed_vals = fi.SimpleFill('median').fit_transform(X)
    elif method == 'NuclearNorm':
        imputed_vals = fi.NuclearNormMinimization(verbose=v).fit_transform(X)
    elif method == 'SoftImpute':
        imputed_vals = fi.SoftImpute(verbose=v).fit_transform(X)
    else:
        # Fail loudly instead of reaching the recombination step with
        # `imputed_vals` undefined.
        raise ValueError(f'Unknown imputation method: {method!r}')

    # Recombine ignored columns with imputed data.
    imputed_df = pd.DataFrame(imputed_vals.T, columns=X.index)
    for col in impute_cols:
        if len(imputed_df[col]) != len(df[col]):
            print(f'df: {len(df[col])}\nimp: {len(imputed_df[col])}')
        df[col] = imputed_df[col].values

    return df
# Exemplo n.º 9
# 0
 def __init__(self, imputer=None):
     """Store the imputation backend.

     Defaults to a fancyimpute.SimpleFill instance; otherwise *imputer*
     is treated as a factory and instantiated with verbose=False.
     """
     if imputer is None:
         backend = fancyimpute.SimpleFill()
     else:
         backend = imputer(verbose=False)
     self.imputer = backend
# Exemplo n.º 10
# 0
    label = Y[mask]

    mae = np.abs(pred - label).sum() / (1e-5 + np.sum(mask))
    mre = np.abs(pred - label).sum() / (1e-5 + np.sum(np.abs(label)))

    return {'mae': mae, 'mre': mre}


# Algo1: Mean imputation

X_mean = []

print(len(X))

# Mean-impute each sample matrix independently.
for sample, _ in zip(X, Y):
    X_mean.append(fancyimpute.SimpleFill().fit_transform(sample))

X_c = np.concatenate(X, axis=0).reshape(-1, 48, 35)
Y_c = np.concatenate(Y, axis=0).reshape(-1, 48, 35)
Z_c = np.array(Z)
X_mean = np.concatenate(X_mean, axis=0).reshape(-1, 48, 35)

print('Mean imputation:')
print(get_loss(X_c, X_mean, Y_c))

# Save mean imputation results.
print(X_c.shape, Y_c.shape, Z_c.shape)
np.save('./result/mean_data.npy', X_mean)
np.save('./result/mean_label.npy', Z_c)
# Exemplo n.º 11
# 0
    pred = X_pred[mask]
    label = Y[mask]

    mae = np.abs(pred - label).sum() / (1e-5 + np.sum(mask))
    mre = np.abs(pred - label).sum() / (1e-5 + np.sum(np.abs(label)))

    return {'mae': mae, 'mre': mre}

# Algo1: Mean imputation

X_mean = []

print(len(X))

for x, y in zip(X, Y):
    # NOTE(review): .complete() is the older fancyimpute API; newer
    # releases renamed it to .fit_transform() -- confirm installed version.
    X_mean.append(fancyimpute.SimpleFill().complete(x))

X_c = np.concatenate(X, axis=0).reshape(-1, 48, 35)
Y_c = np.concatenate(Y, axis=0).reshape(-1, 48, 35)
Z_c = np.array(Z)
X_mean = np.concatenate(X_mean, axis=0).reshape(-1, 48, 35)

print('Mean imputation:')
print(get_loss(X_c, X_mean, Y_c))

# save mean imputation results
print(X_c.shape, Y_c.shape, Z_c.shape)
# Pause so the printed shapes can be inspected before saving.
# raw_input() is Python 2 only and raises NameError on Python 3
# (this file uses f-strings elsewhere, so it targets Python 3).
input()
np.save('./result/mean_data.npy', X_mean)
np.save('./result/mean_label.npy', Z_c)