def Imputer(method):
    """Build and return an ``impute(data)`` callable for the given strategy.

    Parameters
    ----------
    method : str
        One of 'MICE' (fancyimpute MICE), 'fancymean', 'zero',
        'fancymedian', 'min', 'random' (fancyimpute SimpleFill),
        'mean', 'median', 'most_frequent' (sklearn), or 'drop'.

    Returns
    -------
    callable
        A function mapping a matrix with missing values to a completed
        matrix ('drop' returns a stub that raises NotImplementedError).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategies.
    """
    if method == 'MICE':
        # NOTE(review): `n_nearest_columns` is a free variable — it must be
        # defined at module level elsewhere in this file; confirm.
        # NOTE(review): fancyimpute.MICE / .complete() belong to the old
        # fancyimpute API; newer releases renamed these — confirm pinning.
        imputer = fancyimpute.MICE(
            n_nearest_columns=n_nearest_columns,
            min_value=0.0,
            verbose=False,
        )

        def impute(data):
            return imputer.complete(data)
    elif method in ('fancymean', 'zero', 'fancymedian', 'min', 'random'):
        imputer = fancyimpute.SimpleFill(
            min_value=0.0,
            fill_method=method,
        )

        def impute(data):
            return imputer.complete(data)
    elif method in ('mean', 'median', 'most_frequent'):
        # NOTE(review): sklearn.preprocessing.Imputer was removed in
        # sklearn 0.22 (replaced by sklearn.impute.SimpleImputer) —
        # confirm the pinned sklearn version still ships it.
        import sklearn.preprocessing
        imputer = sklearn.preprocessing.Imputer(
            strategy=method,
        )

        def impute(data):
            return imputer.fit_transform(data)
    elif method == 'drop':
        def impute(data):
            raise NotImplementedError
    else:
        # Previously an unknown method fell through every branch and
        # crashed with a confusing NameError on `return impute`;
        # fail fast with a clear message instead.
        raise ValueError('unknown imputation method: {!r}'.format(method))
    return impute
def mean_fill(perc, c):
    """Mean fill that provides the log probability.

    Fits a BayesianRidge model on mean-imputed training data and scores
    the held-out drug observations under the model's predictive Gaussian.

    Parameters
    ----------
    perc : missingness fraction forwarded to glm_testing.create_missing.
    c : configuration value forwarded to the glm_testing helpers.

    Returns
    -------
    float
        Negative summed log-density of the true drug values (lower is a
        better fit).
    """
    clf = BayesianRidge()
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    # Mean-impute missing entries before fitting.
    design = fancyimpute.SimpleFill(fill_method='mean').fit_transform(df)
    clf.fit(design, y)
    drug_preds, std = clf.predict(drug_vals, return_std=True)
    # Fixed docstring typo ("provies") and dropped the no-op `std * 1`.
    return -scipy.stats.norm(drug_preds, std).logpdf(drug_true).sum()
def impute_value(self, df, method="MICE"):
    """Complete missing values in *df* with the requested fancyimpute
    backend.

    method: "MICE" (default), "KNN" (k=4 neighbors), or any other value
    for a plain SimpleFill. Returns the completed matrix.
    """
    factories = {
        "MICE": lambda: fi.MICE(verbose=False),
        "KNN": lambda: fi.KNN(k=4, verbose=False),
    }
    make_imputer = factories.get(method, lambda: fi.SimpleFill())
    return make_imputer().complete(df)
def impute_targetdata(targetdata):
    """Impute the (rare) missing entries of *targetdata*.

    There are very few missing values, so a fast but simple fill is good
    enough here. The frame is returned unchanged when already complete.
    """
    values = targetdata.values
    if numpy.isnan(values).sum() == 0:
        # Nothing missing — hand back the original frame untouched.
        return targetdata
    filled = fancyimpute.SimpleFill().complete(values)
    return pandas.DataFrame(filled,
                            index=targetdata.index,
                            columns=targetdata.columns)
def dec_mean(perc, c):
    """Decision tree classifier with mean imputation.

    Trains on mean-imputed data with binarized targets and returns the
    summed log-probability the tree assigns to the true drug labels.
    """
    tree = DecisionTreeClassifier()

    def to_binary(v):
        return 1 if v > 1 else 0

    df, y = glm_testing.create_missing(perc=perc, c=c)
    y = y.apply(to_binary)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    drug_true = drug_true.apply(to_binary)
    design = fancyimpute.SimpleFill(fill_method='mean').fit_transform(df)
    tree.fit(design, y)
    proba = tree.predict_proba(drug_vals)
    pos = proba[:, 1][drug_true == 1].prod()
    neg = proba[:, 0][drug_true == 0].prod()
    return np.log(pos * neg)
def log_mean(perc, c, extras=0):
    """Logistic regression with mean imputation.

    Trains on mean-imputed data with binarized targets and returns the
    summed log-probability the model assigns to the true drug labels.
    """
    model = LogisticRegression(solver="liblinear")

    def to_binary(v):
        return 1 if v > 1 else 0

    df, y = glm_testing.create_missing(perc=perc, c=c, extras=extras)
    y = y.apply(to_binary)
    drug_vals, drug_true = glm_testing.test_drug(c=c, extras=extras)
    drug_true = drug_true.apply(to_binary)
    design = fancyimpute.SimpleFill(fill_method='mean').fit_transform(df)
    model.fit(design, y)
    proba = model.predict_proba(drug_vals)
    hit = proba[:, 1][drug_true == 1].prod()
    miss = proba[:, 0][drug_true == 0].prod()
    return np.log(hit * miss)
def mean_fill_conf(perc, c):
    """Mean fill that provides the conf intervals.

    Fits a BayesianRidge model on mean-imputed data and counts how many
    held-out drug observations receive a low predictive density.
    """
    model = BayesianRidge()
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    design = fancyimpute.SimpleFill(fill_method='mean').fit_transform(df)
    model.fit(design, y)
    preds, std = model.predict(drug_vals, return_std=True)
    # NOTE(review): this compares the Gaussian *density* at each true value
    # against 0.05 — that is not a standard 95% confidence-interval test;
    # confirm the thresholding is intentional.
    densities = scipy.stats.norm(preds, std * 1).pdf(drug_true)
    return sum(densities < 0.05)  # 95 percent confidence
def impute(df, method, verbose=False):
    """
    Impute missing data using specified imputation method.

    Parameters
    ----------
    df: pd.DataFrame
        Stat DataFrame with source columns and player/team multi-index.
    method: str/bool
        Imputation method for missing data.
        - False: Do not impute missing data.
        - None: Do not impute missing data.
        - 'BiScaler'
        - 'IterativeImpute'
        - 'IterativeSVD'
        - 'KNN': Impute with nearest neighbors.
        - 'MatrixFactorization'
        - 'Mean': Impute missing with average of other sources.
        - 'Median': Impute missing with median of other sources.
        - 'NuclearNorm'
        - 'SoftImpute'
    verbose: bool, default=False
        If True, print debugging information.

    Returns
    -------
    df: pd.DataFrame
        Imputed DataFrame with no NaNs. Note: the input frame is modified
        in place (imputed columns are assigned back onto it).

    Raises
    ------
    ValueError
        If `method` is not one of the supported options.
    """
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    # Subset DataFrame to only include only projection columns.
    ignored_cols = ['Player', 'Team', 'Pos', 'Week', 'STATS']
    impute_cols = [col for col in list(df) if col not in ignored_cols]
    # Transpose so imputation runs across sources for each player.
    X = df[impute_cols].copy().T

    # Impute DataFrame.
    v = verbose
    if method in [None, False]:
        imputed_vals = X.values
    elif np.sum(np.sum(X.isnull())) == 0:
        # No missing values.
        imputed_vals = X.values
    elif method == 'BiScaler':
        imputed_vals = fi.BiScaler(verbose=v).fit_transform(X)
    elif method == 'IterativeImpute':
        imputed_vals = fi.IterativeImputer(verbose=v).fit_transform(X)
    elif method == 'IterativeSVD':
        imputed_vals = fi.IterativeSVD(verbose=v).fit_transform(X)
    elif method == 'KNN':
        imputed_vals = fi.KNN(k=3, verbose=v).fit_transform(X)
    elif method == 'MatrixFactorization':
        imputed_vals = fi.MatrixFactorization(verbose=v).fit_transform(X)
    elif method == 'Mean':
        imputed_vals = fi.SimpleFill('mean').fit_transform(X)
    elif method == 'Median':
        imputed_vals = fi.SimpleFill('median').fit_transform(X)
    elif method == 'NuclearNorm':
        imputed_vals = fi.NuclearNormMinimization(verbose=v).fit_transform(X)
    elif method == 'SoftImpute':
        imputed_vals = fi.SoftImpute(verbose=v).fit_transform(X)
    else:
        # Previously an unrecognized method fell through the chain and
        # crashed with an unbound-local error below; fail fast instead.
        raise ValueError(f'Unknown imputation method: {method!r}')

    # Recombine ignored columns with imputed data.
    imputed_df = pd.DataFrame(imputed_vals.T, columns=X.index)
    for col in impute_cols:
        if len(imputed_df[col]) != len(df[col]):
            print(f'df: {len(df[col])}\nimp: {len(imputed_df[col])}')
        df[col] = imputed_df[col].values
    return df
def __init__(self, imputer=None):
    """Set up the imputation backend.

    imputer: optional imputer *class*; when supplied it is instantiated
    with verbose=False. When omitted, a fancyimpute.SimpleFill instance
    is used.
    NOTE(review): the branches are asymmetric — the default is a ready
    instance while the argument is invoked as a factory; confirm callers
    always pass a class, never an instance.
    """
    if imputer is None:
        self.imputer = fancyimpute.SimpleFill()
        return
    self.imputer = imputer(verbose=False)
# NOTE(review): this chunk opens mid-function — the enclosing `def` header
# is outside the visible source. Judging by the `get_loss(...)` call below,
# this tail likely belongs to `get_loss` — confirm against the full file.
    label = Y[mask]
    # Mean absolute error over the masked entries; 1e-5 guards against
    # division by zero when the mask selects nothing.
    mae = np.abs(pred - label).sum() / (1e-5 + np.sum(mask))
    # Mean relative error: absolute error normalized by label magnitude.
    mre = np.abs(pred - label).sum() / (1e-5 + np.sum(np.abs(label)))
    return {'mae': mae, 'mre': mre}


# Algo1: Mean imputation
X_mean = []
print(len(X))
for x, y in zip(X, Y):
    # Mean-fill each sample independently (y is unused in the fill).
    X_mean.append(fancyimpute.SimpleFill().fit_transform(x))

# Reshape to (-1, 48, 35) — presumably (samples, timesteps, features);
# TODO confirm against where X/Y/Z are built.
X_c = np.concatenate(X, axis=0).reshape(-1, 48, 35)
Y_c = np.concatenate(Y, axis=0).reshape(-1, 48, 35)
Z_c = np.array(Z)
X_mean = np.concatenate(X_mean, axis=0).reshape(-1, 48, 35)

print('Mean imputation:')
print(get_loss(X_c, X_mean, Y_c))

# save mean imputation results
print(X_c.shape, Y_c.shape, Z_c.shape)
# raw_input()
np.save('./result/mean_data.npy', X_mean)
np.save('./result/mean_label.npy', Z_c)
# NOTE(review): this chunk also opens mid-function and appears to be an
# older Python 2 copy of the script above — `raw_input()` and the legacy
# fancyimpute `.complete()` API are used here; confirm which copy is live.
    pred = X_pred[mask]
    label = Y[mask]
    # Mean absolute error over the masked entries; 1e-5 guards against
    # division by zero when the mask selects nothing.
    mae = np.abs(pred - label).sum() / (1e-5 + np.sum(mask))
    # Mean relative error: absolute error normalized by label magnitude.
    mre = np.abs(pred - label).sum() / (1e-5 + np.sum(np.abs(label)))
    return {'mae': mae, 'mre': mre}


# Algo1: Mean imputation
X_mean = []
print(len(X))
for x, y in zip(X, Y):
    # `.complete()` is the deprecated fancyimpute entry point (replaced by
    # fit_transform in newer releases) — y is unused in the fill.
    X_mean.append(fancyimpute.SimpleFill().complete(x))

# Reshape to (-1, 48, 35) — presumably (samples, timesteps, features);
# TODO confirm against where X/Y/Z are built.
X_c = np.concatenate(X, axis=0).reshape(-1, 48, 35)
Y_c = np.concatenate(Y, axis=0).reshape(-1, 48, 35)
Z_c = np.array(Z)
X_mean = np.concatenate(X_mean, axis=0).reshape(-1, 48, 35)

print('Mean imputation:')
print(get_loss(X_c, X_mean, Y_c))

# save mean imputation results
print(X_c.shape, Y_c.shape, Z_c.shape)
# Python 2 only — pauses until the user hits enter before saving.
raw_input()
np.save('./result/mean_data.npy', X_mean)
np.save('./result/mean_label.npy', Z_c)