def MICE_imputation(self, dataset):
    # Only for numerical values.
    # Multivariate Imputation by Chained Equations is only suitable for data
    # that is Missing At Random (MAR), i.e. the probability that a value is
    # missing depends only on observed values and not on unobserved values.
    import impyute as imp
    import pandas as pd

    df = dataset
    if dataset.select_dtypes(['number']).isnull().sum().sum() > 0:
        X = imp.mice(dataset.select_dtypes(['number']).values)
        Z = dataset.select_dtypes(include=['object'])
        df = pd.DataFrame.from_records(
            X, columns=dataset.select_dtypes(['number']).columns)
        df = df.join(Z)
    return df
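# Minimal usage sketch of the idea above (not part of the original class):
# impute only the numeric columns with impyute's mice() and re-attach the
# object columns afterwards. `demo_df` and `imputed_df` are illustrative names.
import numpy as np
import pandas as pd
import impyute as imp

demo_df = pd.DataFrame({
    "age":    [23.0, np.nan, 31.0, 40.0, 36.0, np.nan, 29.0, 33.0],
    "income": [52.0, 48.0, np.nan, 61.0, 58.0, 47.0, np.nan, 55.0],
    "city":   ["ams", "ber", "ams", "par", "ber", "ams", "par", "ber"],
})

numeric = demo_df.select_dtypes(['number'])
imputed_df = pd.DataFrame.from_records(
    imp.mice(numeric.values), columns=numeric.columns)
imputed_df = imputed_df.join(demo_df.select_dtypes(include=['object']))
print(imputed_df)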
def data_quality_verification(self):
    try:
        cols = ["bcg_wastage_rate", "totalbirths"]
        self.df[cols] = self.df[cols].replace({0: np.nan})
        self.df["bcg_wastage_rate"] = self.df["bcg_wastage_rate"].replace(
            {1: np.nan})
        # Dealing with the missing data
        # imputed_training = fast_knn(org_unit_group.values, k=3)
        # print(self.df.values)
        imputed_training = mice(self.df.values)
        self.df[cols] = imputed_training
        # Convert column "totalbirths" to int64 dtype
        self.df = self.df.astype({"totalbirths": int})
    except Exception as e:
        self.metadata['report_status'] = 'ERROR'
        self.metadata['report_description'] = \
            'Data <-> Data Quality Verification Failed. => ' + str(e)
        self.has_validation_error = True
    return
def test_impute_missing_values(self):
    """After imputation, no NaNs should exist."""
    imputed = impy.mice(self.data_m)
    self.assertFalse(np.isnan(imputed).any())
def test_return_type(self):
    """Check the return type; mice() should return an np.ndarray."""
    imputed = impy.mice(self.data_m)
    self.assertTrue(isinstance(imputed, np.ndarray))
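# Standalone sketch of what these two tests exercise; the fixture name
# `data_m`, its shape, and the NaN positions are assumptions, not the
# original setUp().
import numpy as np
import impyute as impy

rng = np.random.default_rng(0)
data_m = rng.normal(size=(50, 4))
data_m[rng.integers(0, 50, size=10), rng.integers(0, 4, size=10)] = np.nan

imputed = impy.mice(data_m)
assert isinstance(imputed, np.ndarray)   # return type check
assert not np.isnan(imputed).any()       # no NaNs remain after imputation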
def run_experiment_k_paper(X_test, y_test, clf, NB, a, setting):
    """Compare classifier predictions on test rows with k features hidden,
    using mean/median/max/min imputation, optional EM/MICE/kNN imputation
    (via impyute), a naive Bayes baseline, and the method(s) in `a`."""
    X_impute_mean = np.mean(X_test, axis=0)
    X_impute_median = np.median(X_test, axis=0)
    X_impute_max = np.max(X_test, axis=0)
    X_impute_min = np.min(X_test, axis=0)
    X_impute_flip = np.copy(1 - X_test)

    k_all = []
    missing_err_nb_all = []
    missing_err_lr_mean_all = []
    missing_err_lr_median_all = []
    missing_err_lr_max_all = []
    missing_err_lr_min_all = []
    missing_err_lr_flip_all = []
    missing_err_lr_em_impute_all = []
    missing_err_lr_mice_impute_all = []
    missing_err_lr_knn_impute_all = []

    useEM = setting["em"] if "em" in setting else False
    discreteFeatures = setting["discreteFeatures"] if "discreteFeatures" in setting else 1
    featureEncoding = setting["feature_encoding"] if "feature_encoding" in setting else None
    do_emImpute = setting["emImpute"] if "emImpute" in setting else False
    do_miceImpute = setting["miceImpute"] if "miceImpute" in setting else False
    do_knnImpute = setting["knnImpute"] if "knnImpute" in setting else False

    if useEM:
        missing_err_ours_all = {}
        for i in range(len(a)):
            missing_err_ours_all["ours_" + str(i)] = []
    else:
        missing_err_ours_all = []

    useProb = setting["prob"] if "prob" in setting else True
    function = setting["function"] if "function" in setting else None
    if function is None:
        function = conditional_likelihood_k if useProb else f1_score
    print("Using following function: ")
    print(function)

    repeat = setting["repeat"] if "repeat" in setting else 1
    FEATURES = setting["features"] if "features" in setting else None
    if FEATURES is None:
        NNN = X_test.shape[1]
        if featureEncoding is not None:
            NNN = len(featureEncoding)
        # Integer division so that range() receives an int.
        FEATURES = np.array([i for i in range(NNN // discreteFeatures)])
    else:
        FEATURES = np.array(FEATURES)
    print("Possible features to remove: {}".format(FEATURES.shape[0]))

    K = setting["k"]
    for k in K:
        print("K = {}".format(k))
        if k > FEATURES.shape[0]:
            print("Early stop: Only had {} features possible to remove".format(FEATURES.shape[0]))
            break

        cur_nb = []
        cur_lr_mean = []
        cur_lr_median = []
        cur_lr_max = []
        cur_lr_min = []
        cur_flip = []
        cur_em_impute = []
        cur_mice_impute = []
        cur_knn_impute = []
        if useEM:
            cur_ours = {}
            for i in range(len(a)):
                cur_ours["ours_" + str(i)] = []
        else:
            cur_ours = []

        for R in range(repeat):
            if R % 10 == 0:
                print("\t R = {}".format(R))

            X_test_mean = np.array(X_test, dtype='float')
            X_test_median = np.array(X_test, dtype='float')
            X_test_max = np.array(X_test, dtype='float')
            X_test_min = np.array(X_test, dtype='float')
            X_test_flip = np.array(X_test, dtype='float')
            X_test_em_impute = np.array(X_test, dtype='float')
            X_test_mice_impute = np.array(X_test, dtype='float')
            X_test_knn_impute = np.array(X_test, dtype='float')

            # Hide k features per test row and fill them in different ways.
            missing = np.zeros(X_test.shape, dtype=bool)
            for i in range(X_test.shape[0]):
                miss = np.random.choice(FEATURES, k, replace=False)
                if featureEncoding is not None and k > 0:
                    missK = []
                    for m in miss:
                        for z in featureEncoding[m]:
                            missK.append(z)
                    miss = np.copy(np.array(missK))
                elif discreteFeatures != 1 and k > 0:
                    missK = []
                    for m in miss:
                        for z in range(discreteFeatures):
                            missK.append(m * discreteFeatures + z)
                    miss = np.copy(np.array(missK))
                missing[i][miss] = True
                # if k > 0:
                #     print(missing[i])
                #     print(np.sum(missing[i]))

                X_test_mean[i][miss] = X_impute_mean[miss]
                X_test_median[i][miss] = X_impute_median[miss]
                X_test_max[i][miss] = X_impute_max[miss]
                X_test_min[i][miss] = X_impute_min[miss]
                X_test_flip[i][miss] = X_impute_flip[i][miss]
                X_test_em_impute[i][miss] = np.nan
                X_test_mice_impute[i][miss] = np.nan
                X_test_knn_impute[i][miss] = np.nan

            if do_emImpute:
                import time
                start = time.time()
                loops = 6
                print("\tStarting to em impute with loops = {}".format(loops))
                X_test_em_impute = impyute.em(X_test_em_impute, loops=loops)
                end = time.time()
                print("\tDone imputing! " + str(end - start))
            else:
                X_test_em_impute = np.zeros(X_test.shape)

            if do_miceImpute:
                import time
                start = time.time()
                print("\tStarting to mice impute")
                X_test_mice_impute = impyute.mice(X_test_mice_impute)
                end = time.time()
                print("\tDone imputing! " + str(end - start))
            else:
                X_test_mice_impute = np.zeros(X_test.shape)

            if do_knnImpute:
                import time
                start = time.time()
                print("\tStarting to knn impute")
                X_test_knn_impute = impyute.fast_knn(X_test_knn_impute)
                end = time.time()
                print("\tDone imputing! " + str(end - start))
            else:
                X_test_knn_impute = np.zeros(X_test.shape)

            lr_prob = clf.predict_proba(X_test)
            if useProb:
                cur_nb.append(function(lr_prob, predict_nbk_with_missing(X_test_mean, NB, missing, prob=True)))
                cur_lr_mean.append(function(lr_prob, clf.predict_proba(X_test_mean)))
                cur_lr_median.append(function(lr_prob, clf.predict_proba(X_test_median)))
                cur_lr_max.append(function(lr_prob, clf.predict_proba(X_test_max)))
                cur_lr_min.append(function(lr_prob, clf.predict_proba(X_test_min)))
                cur_em_impute.append(function(lr_prob, clf.predict_proba(X_test_em_impute)))
                cur_mice_impute.append(function(lr_prob, clf.predict_proba(X_test_mice_impute)))
                cur_knn_impute.append(function(lr_prob, clf.predict_proba(X_test_knn_impute)))
                # cur_flip.append(function(lr_prob, clf.predict_proba(X_test_flip)))
                if not useEM:
                    cur_ours.append(function(lr_prob, a.classify(X_test, missing, prob=True)))
                else:
                    for z in range(len(a)):
                        cur_ours["ours_" + str(z)].append(function(lr_prob, a[z].classify(X_test, missing, prob=True)))
            else:
                cur_nb.append(function(y_test, predict_nbk_with_missing(X_test_mean, NB, missing)))
                cur_lr_mean.append(function(y_test, clf.predict(X_test_mean)))
                cur_lr_median.append(function(y_test, clf.predict(X_test_median)))
                cur_lr_max.append(function(y_test, clf.predict(X_test_max)))
                cur_lr_min.append(function(y_test, clf.predict(X_test_min)))
                cur_em_impute.append(function(y_test, clf.predict(X_test_em_impute)))
                cur_mice_impute.append(function(y_test, clf.predict(X_test_mice_impute)))
                cur_knn_impute.append(function(y_test, clf.predict(X_test_knn_impute)))
                # cur_flip.append(function(y_test, clf.predict(X_test_flip)))
                if not useEM:
                    cur_ours.append(function(y_test, a.classify(X_test_mean, missing)))
                else:
                    for z in range(len(a)):
                        cur_ours["ours_" + str(z)].append(function(y_test, a[z].classify(X_test_mean, missing)))

        k_all.append(k)
        missing_err_nb_all.append(cur_nb)
        missing_err_lr_mean_all.append(cur_lr_mean)
        missing_err_lr_median_all.append(cur_lr_median)
        missing_err_lr_max_all.append(cur_lr_max)
        missing_err_lr_min_all.append(cur_lr_min)
        missing_err_lr_flip_all.append(cur_flip)
        missing_err_lr_em_impute_all.append(cur_em_impute)
        missing_err_lr_mice_impute_all.append(cur_mice_impute)
        missing_err_lr_knn_impute_all.append(cur_knn_impute)
        if useEM:
            for i in cur_ours:
                missing_err_ours_all[i].append(cur_ours[i])
        else:
            missing_err_ours_all.append(cur_ours)

    if not useEM:
        missing_err_ours_all = np.array(missing_err_ours_all)

    data = {
        "features_count": FEATURES.shape[0],
        "k": np.array(k_all),
        "nb": np.array(missing_err_nb_all),
        "mean": np.array(missing_err_lr_mean_all),
        "median": np.array(missing_err_lr_median_all),
        "max": np.array(missing_err_lr_max_all),
        "min": np.array(missing_err_lr_min_all),
        "ours": missing_err_ours_all,
        "flip": np.array(missing_err_lr_flip_all),
        "em_impute": np.array(missing_err_lr_em_impute_all),
        "mice_impute": np.array(missing_err_lr_mice_impute_all),
        "knn_impute": np.array(missing_err_lr_knn_impute_all),
    }
    return data
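# The behaviour of run_experiment_k_paper() is driven by the `setting` dict.
# A sketch of a plausible configuration, based only on the keys read above;
# all values here are illustrative, not taken from the original experiments.
setting = {
    "k": [0, 1, 2, 4],       # numbers of features to hide per test row
    "repeat": 10,            # random missingness patterns tried per k
    "prob": True,            # compare predicted probabilities, not labels
    "em": False,             # `a` is a single model rather than a list
    "emImpute": False,       # also run impyute.em on the masked rows
    "miceImpute": True,      # also run impyute.mice on the masked rows
    "knnImpute": False,      # also run impyute.fast_knn on the masked rows
    "discreteFeatures": 1,   # columns per original (one-hot encoded) feature
    # Optional keys: "feature_encoding", "features", "function"
}
# data = run_experiment_k_paper(X_test, y_test, clf, NB, a, setting)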
df2 = dataframe.replace(r'\s+', np.nan, regex=True)
data_ = df2.drop(['PASI.END.WEEK.7', 'PASI.END.WEEK.8', 'PASI.END.WEEK.9',
                  'PASI.END.WEEK.10', 'PASI.END.WEEK.11'], axis=1)
# data_ = data_.dropna()
ds = data_  # .iloc[:99]

# # Drop missing data
# print(ds.shape)
# ds = ds.dropna()
# print(ds.shape)

# Split data into X and y
X6 = data_.iloc[:, 0:-1].values  # W0-2
X6 = impy.mice(X6)
X6 = normalize(X6)
y = data_.iloc[:, -1]

# Reduce the number of classes to 2: classes 1 and 2 are merged together to
# form a new class (0) and class 3 becomes class 1.
y = np.array([data_.iloc[:, -1]])
b = Binarizer(threshold=2)
b_scaled = b.fit_transform(y)[0]
y = b_scaled

# Form a dataframe with the imputed x_values and binarized y_values.
d_x = pd.DataFrame(
msk = (imputed + np.random.randn(*imputed.shape) - imputed) < 0.8
imputed[~msk] = 0

# Initializing NMF imputation model
nmf_model = NMF()  # n_components: num. of features
nmf_model.fit(imputed)

# Iterative imputation process
# while nmf_model.reconstruction_err_**2 > 10:
while nmf_model.reconstruction_err_ > 2.5:
    W = nmf_model.fit_transform(imputed)
    imputed[~msk] = W.dot(nmf_model.components_)[~msk]
    print(nmf_model.reconstruction_err_)

# [Imputation mode: MICE]
imputed = impy.mice(df.values[:split_idx])

# [Imputation mode: k-NN]
imputer = KNNImputer(n_neighbors=10)  # default: 2
imputed = imputer.fit_transform(df.values[:split_idx])

# [Imputation mode: EM]
imputed = impy.em(df.values[:split_idx], loops=50)

# [Imputation mode: LOCF]
imputed = df.copy().iloc[:split_idx].ffill()
imputed = imputed.fillna(0)
imputed = imputed.values

# [Imputation mode: NOCB]
imputed = df.copy().iloc[:split_idx].bfill()
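# A small dispatch sketch tying the interchangeable modes above together.
# The helper name `impute_train_block` and the mode strings are assumptions
# for illustration; the iterative NMF loop above is not included.
import impyute as impy
from sklearn.impute import KNNImputer


def impute_train_block(df, split_idx, mode='mice'):
    block = df.values[:split_idx]
    if mode == 'mice':
        return impy.mice(block)
    if mode == 'em':
        return impy.em(block, loops=50)
    if mode == 'knn':
        return KNNImputer(n_neighbors=10).fit_transform(block)
    if mode == 'locf':
        return df.iloc[:split_idx].ffill().fillna(0).values
    if mode == 'nocb':
        return df.iloc[:split_idx].bfill().values
    raise ValueError('unknown imputation mode: ' + mode)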
plt.subplot(140 + i + 1)
dataload3.boxplot(column=i)
plt.title(l[i])
plt.show()

# Handling missing data
import impyute as impy

data_drop = dataload1.dropna()  # drop rows with missing values
data_drop.hist(layout=(1, 3), bins=40, figsize=(15, 3))

data_mode = dataload1.fillna(dataload1.mode().iloc[0])  # fill missing values with the most frequent value
data_mode.hist(layout=(1, 3), bins=40, figsize=(15, 3))

data = dataload1[['Unnamed: 0', 'points', 'price']]
nd = np.array(data)

filled_mice = impy.mice(nd)  # fill missing values using correlations between attributes
data_mice = pd.DataFrame(filled_mice)
data_mice.hist(layout=(1, 3), bins=40, figsize=(15, 3))

filled_knn = impy.fast_knn(nd, k=3)  # fill missing values using similarity between data objects
data_knn = pd.DataFrame(filled_knn)
data_knn.hist(layout=(1, 3), bins=40, figsize=(15, 3))
plt.show()

data_drop = dataload2.dropna()  # drop rows with missing values
data_drop.hist(layout=(1, 3), bins=40, figsize=(15, 3))

data_mode = dataload2.fillna(dataload2.mode().iloc[0])  # fill missing values with the most frequent value
data_mode.hist(layout=(1, 3), bins=40, figsize=(15, 3))
def __init__(self, T, mask, algo, miss_info, kf, notobj, obj, target):
    try:
        self.miss_info = miss_info
        self.columns = notobj
        self.ord_num_col = self.miss_info["ord_col"] + self.miss_info["num_col"]
        metric = {"rmse": {}, "nrmse": {}}
        self.rawT = T
        self.target = target
        if target is not None:
            self.target_y = T[target]
        else:
            self.target_y = None
        self.cv = {}
        self.cv.update(deepcopy(metric))
        self.kf = kf
        self.MSE = {}
        self.MSE.update(deepcopy(metric))
        self.result = {}
        self.time_ck = {}

        X = deepcopy(T)
        mask = pd.DataFrame(mask, columns=T.columns.tolist())
        self.rawmask = mask
        X[(mask == 1).values] = np.nan
        if obj in [None, []]:
            obj = None

        ##########################################
        self.X = X[notobj]
        self.T = T[notobj]
        self.mask = mask[notobj]
        self.notobj = notobj
        ##########################################
        if obj is not None:
            ############ Numeric + Category #################
            cat_impute = SimpleImputer(strategy="most_frequent")
            X[obj] = cat_impute.fit_transform(X[obj])
            self.true_obj = T[obj]
            self.pd_obj = X[obj]
            ###################################################
            TT = deepcopy(T)
            cat_encoder = miss_info["ce_encoder"]
            for k in cat_encoder.category_mapping:
                col, map_ = k["col"], k["mapping"]
                TT[col] = TT[col].replace(
                    dict(zip(k["mapping"].index, k["mapping"].values)))
            self.full_miss_data = TT
            self.full_miss_data[(mask == 1).values] = np.nan

            mice_data = deepcopy(T)
            for obj_col in obj:
                mice_data[obj_col] = "Cols_" + mice_data[obj_col]
            self.full_mice_data = mice_data
            self.full_mice_data[(mask == 1).values] = np.nan
        else:
            ########## Numeric ###############################
            num_data = deepcopy(self.X)
            num_data[(self.mask == 1).values] = np.nan
            self.full_miss_data = deepcopy(num_data)
            self.full_mice_data = deepcopy(num_data)
            ###################################################
        self.algo = algo
        self.method = {
            "MissForest": lambda x: MissForest(verbose=0, n_jobs=-1).fit(x),
            "mean": lambda x: impy.mean(x),
            "median": lambda x: impy.median(x),
            "mode": lambda x: impy.mode(x),
            "knn": lambda x: impy.fast_knn(x),
            "MICE": lambda x: impy.mice(x),
            "EM": lambda x: impy.em(x),
            "MultipleImputer": lambda x: MultipleImputer(
                n=1, return_list=True).fit_transform(pd.DataFrame(x)).values,
        }
    except Exception as e:
        print(e)
        pass
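# Sketch of how a registry like `self.method` above dispatches by name; the
# toy array and the chosen key are illustrative, not from the original class.
# Each entry takes a 2-D float array containing NaNs and returns an imputed
# array (the MissForest entry returns a fitted estimator instead).
import numpy as np
import impyute as impy

method = {
    "mean": lambda x: impy.mean(x),
    "MICE": lambda x: impy.mice(x),
    "EM": lambda x: impy.em(x),
}

toy = np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, np.nan],
                [2.0, 5.0], [3.0, 4.0], [5.0, 6.0]])
print(method["MICE"](toy))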
def perf_mice_imput(dfs_arg):
    mice_data = [
        impy.mice(dfs_arg[i].values, dtype='cont')
        for i in range(len(dfs_arg))
    ]
    return [pd.DataFrame(data=mice_data[i]) for i in range(len(dfs_arg))]
import pandas as pd
from datetime import datetime
import impyute as impy
from matplotlib import pyplot as plt


def parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')


input_file = './data/AirQualityUCI_refined.csv'
df = pd.read_csv(input_file, index_col=[0], parse_dates=[0],
                 date_parser=parser)

# Imputation mode: MICE
imputed_mice = impy.mice(df.values)
imputed_mice = pd.DataFrame(imputed_mice, index=df.index, columns=df.columns)

# Visualizing a comparison between actual and imputed values
plt.plot(imputed_mice[df.columns[0]], label='imputed')
plt.plot(df[df.columns[0]], label='actual')
plt.legend(loc='best')
plt.show()

# Save the data set with imputed values
imputed_mice.to_csv('./data/AirQualityUCI_MICE.csv', index_label='Datetime')