def _mean_std(values):
    """Return ``"<mean> ± <std>"`` for a list of per-run scores.

    The mean is rounded to 2 decimals and the standard deviation
    (``np.std`` with its default arguments) to 4.
    """
    return f"{round(np.mean(values), 2)} ± {round(np.std(values), 4)}"


def main(args):
    """Benchmark six imputers on one or more datasets and tabulate the scores.

    ``args.data_name`` may hold several dataset names joined with ``+``.
    Every recognised dataset is run ``args.time`` times.  Each run loads the
    data with ``data_loader3``, imputes it with ``gain``, ``egain``, a mean
    imputer, a KNN imputer and two ``IterativeImputer`` variants
    (ExtraTrees, reported as ``M.FORE``; BayesianRidge, reported as
    ``MICE``), scores the imputations with ``rmse_loss`` / ``rmspe_loss``
    and finally scores the ``clf_*`` classifiers on the normalised
    imputations.  The per-dataset ``mean ± std`` figures are collected in a
    table that is printed and written to ``result.csv`` (and, when an Excel
    writer is available, ``result.xls``).

    NOTE(review): if another ``def main`` appears later in the same module
    it rebinds the name and this version is never called - confirm which
    one is meant to be the entry point.

    Args:
        args: namespace providing ``data_name``, ``miss_rate``, ``time``,
            ``batch_size``, ``hint_rate``, ``alpha`` and ``iterations``.

    Returns:
        ``(imputed_data_mf, rmse_mf)``: the normalised ``M.FORE`` imputation
        and its RMSE from the last completed run, or ``(None, None)`` when
        no run completed (no valid dataset name, or ``args.time == 0``).
    """
    import os

    miss_rate = args.miss_rate
    n_runs = args.time
    dataset_names = args.data_name.split("+")
    print(dataset_names)

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations,
        'time': args.time,
    }

    known_datasets = (
        'obesity', 'hepatitisC', 'audit', 'letter', 'spam', 'breast',
        'credit', 'news', 'blood', 'vowel', 'ecoli', 'ionosphere',
        'parkinsons', 'seedst', 'vehicle', 'vertebral', 'wine', 'banknote',
        'balance', 'yeast', 'bean', 'shill', 'phishing', 'firewall',
        'iBeacon', 'steel',
    )

    # One description of the report drives the two label columns AND every
    # per-dataset score column, so the rows cannot get out of step.
    # ``None`` marks the blank separator row.
    all_methods = ('EGAIN', 'GAIN', 'MEAN', 'KNN', 'MICE', 'M.FORE')
    gan_methods = ('EGAIN', 'GAIN')
    report_layout = (
        ('RMSE', all_methods),
        ('RMSPE', all_methods),
        None,
        ('MLP', gan_methods),
        ('D.Tree', gan_methods),
        ('LogisticR', all_methods),
        ('SVC', all_methods),
        ('SGD', all_methods),
    )

    # The order in which each classifier is applied to the imputations is
    # kept exactly as before, in case the classifiers draw from a shared
    # random state.
    gan_first = ('GAIN', 'EGAIN', 'MEAN', 'KNN', 'M.FORE', 'MICE')
    gan_last = ('MEAN', 'KNN', 'M.FORE', 'MICE', 'GAIN', 'EGAIN')
    classifier_plan = (
        ('MLP', clf_MLP, ('GAIN', 'EGAIN')),
        ('D.Tree', clf_DT, ('GAIN', 'EGAIN')),
        ('LogisticR', clf_LR, gan_first),
        ('SVC', clf_SVC, gan_last),
        ('SGD', clf_SGD, gan_last),
    )

    col1 = ["{}% Missing".format(int(miss_rate * 100))]
    col2 = ['Method']
    for section in report_layout:
        if section is None:
            col1.append('')
            col2.append('')
            continue
        metric, methods = section
        col1.extend([metric] * len(methods))
        col2.extend(methods)
    result = [col1, col2]

    # np.savetxt does not create directories; make sure the target exists
    # before any expensive training starts.
    os.makedirs("data", exist_ok=True)

    # Defined up front so the final ``return`` works even if no run happens.
    imputed_data_mf = None
    rmse_mf = None

    for data_name in dataset_names:
        if data_name not in known_datasets:
            print("Wrong name: {} Dataset. Skip this datasets".format(
                data_name))
            # Skip only this entry; the remaining datasets are still run.
            continue

        print("****** {} Dataset ******".format(data_name))

        # scores[metric][method] -> one value per run.
        scores = {
            section[0]: {method: [] for method in section[1]}
            for section in report_layout if section is not None
        }

        for i in range(n_runs):
            # The loader seed is pinned to 42; the train/test split is
            # seeded with the run index.
            ori_data_x, miss_data_x, data_m, y = data_loader3(
                data_name, miss_rate, 42)
            train_idx, test_idx = train_test_split(
                range(len(y)), test_size=0.2, stratify=y, random_state=i)

            if i % 5 == 0:
                print('=== Working on {}/{} ==='.format(i, n_runs))

            # Impute missing data.  Insertion order fixes the order of the
            # scoring and normalisation passes below.
            imputed = {}
            imputed['GAIN'] = gain(miss_data_x, gain_parameters)
            imputed['EGAIN'] = egain(miss_data_x, gain_parameters)

            imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
            imputed['MEAN'] = imp_mean.fit_transform(miss_data_x).round()

            imp_knn = KNNImputer(missing_values=np.nan, n_neighbors=3)
            imputed['KNN'] = imp_knn.fit_transform(miss_data_x).round()

            # ExtraTreesRegressor: similar to missForest in R.
            imp_mf = IterativeImputer(estimator=ExtraTreesRegressor(),
                                      max_iter=1,
                                      initial_strategy="constant",
                                      n_nearest_features=1,
                                      imputation_order='descending')
            imputed['M.FORE'] = imp_mf.fit_transform(miss_data_x).round()

            imp_mice = IterativeImputer(estimator=BayesianRidge(),
                                        max_iter=1,
                                        initial_strategy='constant',
                                        n_nearest_features=1,
                                        imputation_order='descending')
            imputed['MICE'] = imp_mice.fit_transform(miss_data_x).round()

            # Imputation error against the original data.
            for method, data in imputed.items():
                scores['RMSE'][method].append(
                    rmse_loss(ori_data_x, data, data_m))
            for method, data in imputed.items():
                scores['RMSPE'][method].append(
                    rmspe_loss(ori_data_x, data, data_m))
            rmse_mf = scores['RMSE']['M.FORE'][-1]

            # NOTE: file names carry only the run index, so a later dataset
            # overwrites the files written for an earlier one.
            np.savetxt("data/{}missing_data.csv".format(i),
                       miss_data_x.astype(float),
                       delimiter=',', fmt='%1.2f')
            np.savetxt("data/{}imputed_data_gain.csv".format(i),
                       imputed['GAIN'], delimiter=',', fmt='%d')
            np.savetxt("data/{}imputed_data_egain.csv".format(i),
                       imputed['EGAIN'], delimiter=',', fmt='%d')

            # Classifiers are scored on the normalised imputations.
            normalized = {}
            for method, data in imputed.items():
                normalized[method], _ = normalization(data)
            imputed_data_mf = normalized['M.FORE']

            for metric, classify, call_order in classifier_plan:
                for method in call_order:
                    scores[metric][method].append(
                        classify(normalized[method], y, train_idx, test_idx))

        print()
        print("Datasets: ", data_name)

        col3 = [data_name]
        for section in report_layout:
            if section is None:
                # Plain empty cell, matching col1/col2; a nested list here
                # would make the table ragged.
                col3.append('')
                continue
            metric, methods = section
            col3.extend(_mean_std(scores[metric][method])
                        for method in methods)
        result.append(col3)

        # Rewritten after every dataset so partial results survive an
        # interrupted run; each inner list becomes one table column.
        df_result_tran = pd.DataFrame(result).transpose()
        print(df_result_tran.to_string(index=False, header=False))
        df_result_tran.to_csv("result.csv", index=False, header=False)
        try:
            # ``encoding`` is no longer accepted by ``to_excel``.  Writing
            # ``.xls`` needs an engine that recent pandas no longer ships,
            # so this export is best-effort: the CSV holds the same table.
            df_result_tran.to_excel("result.xls", index=False, header=False)
        except (ImportError, ValueError) as err:
            print("Excel export skipped: {}".format(err))

    return imputed_data_mf, rmse_mf
def _mean_std(values):
    """Return ``"<mean> ± <std>"`` for a list of per-run scores.

    The mean is rounded to 2 decimals and the standard deviation
    (``np.std`` with its default arguments) to 4.
    """
    return f"{round(np.mean(values), 2)} ± {round(np.std(values), 4)}"


def main(args):
    """Benchmark six imputers on a single dataset and print the scores.

    The experiment is repeated ``args.time`` times.  Each run loads
    ``args.data_name`` with ``data_loader3`` (seeded with the run index),
    imputes it with ``gain`` and ``egain`` (fed the data multiplied by
    10000, with their output divided by 10000 again), a mean imputer, a KNN
    imputer and two ``IterativeImputer`` variants (DecisionTree and
    BayesianRidge), scores every imputation with ``rmse_loss`` and then
    scores ``clf_LR``, ``clf_SVC`` and ``clf_SGD`` on the normalised
    imputations.  A ``mean ± std`` summary over all runs is printed.

    NOTE(review): if an earlier ``def main`` exists in the same module,
    this definition rebinds the name and the earlier one becomes
    unreachable - confirm which one is meant to be the entry point.

    Args:
        args: namespace providing ``data_name``, ``miss_rate``, ``time``,
            ``batch_size``, ``hint_rate``, ``alpha`` and ``iterations``.

    Returns:
        ``(imputed_data_mf, rmse_mf)``: the normalised DecisionTree-based
        imputation and its RMSE from the last run, or ``(None, None)`` when
        ``args.time`` is 0.
    """
    import os

    data_name = args.data_name
    miss_rate = args.miss_rate
    n_runs = args.time

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations,
        'time': args.time,
    }

    # Internal method keys, listed in the order the report prints them.
    report_order = ('GAIN', 'EGAIN', 'MEAN', 'KNN', 'MICE', 'MF')
    # Printed labels per metric; only the tree-based imputer's label differs
    # between sections.
    report_rows = (
        ('RMSE', ('GAIN', 'EGAIN', 'MEAN', 'KNN', 'MICE', 'MFORE')),
        ('LR', ('GAIN', 'EGAIN', 'MEAN', 'KNN', 'MICE', 'MISSFOR')),
        ('SVC', ('GAIN', 'EGAIN', 'MEAN', 'KNN', 'MICE', 'MISS')),
        ('SGD', ('GAIN', 'EGAIN', 'MEAN', 'KNN', 'MICE', 'MISS')),
    )

    # The order in which each classifier is applied to the imputations is
    # kept exactly as before, in case the classifiers draw from a shared
    # random state.
    gan_first = ('GAIN', 'EGAIN', 'MEAN', 'KNN', 'MF', 'MICE')
    gan_last = ('MEAN', 'KNN', 'MF', 'MICE', 'GAIN', 'EGAIN')
    classifier_plan = (
        ('LR', clf_LR, gan_first),
        ('SVC', clf_SVC, gan_last),
        ('SGD', clf_SGD, gan_last),
    )

    # scores[metric][method] -> one value per run.
    scores = {
        metric: {method: [] for method in report_order}
        for metric, _ in report_rows
    }

    # np.savetxt does not create directories; make sure the target exists
    # before any expensive training starts.
    os.makedirs("data", exist_ok=True)

    # Defined up front so the final ``return`` works when n_runs == 0.
    imputed_data_mf = None
    rmse_mf = None

    for i in range(n_runs):
        # Load data and introduce missingness; the loader is seeded with
        # the run index while the train/test split seed stays at 42.
        ori_data_x, miss_data_x, data_m, y = data_loader3(
            data_name, miss_rate, i)
        train_idx, test_idx = train_test_split(
            range(len(y)), test_size=0.3, stratify=y, random_state=42)

        scaled_miss_data = miss_data_x * 10000

        if i % 5 == 0:
            print('=== Working on {}/{} ==='.format(i, n_runs))

        # Impute missing data.  Insertion order fixes the order of the
        # scoring and normalisation passes below.
        imputed = {}
        imputed['GAIN'] = gain(scaled_miss_data, gain_parameters) / 10000
        imputed['EGAIN'] = egain(scaled_miss_data, gain_parameters) / 10000

        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        imputed['MEAN'] = imp_mean.fit_transform(miss_data_x)

        imp_knn = KNNImputer(missing_values=np.nan)
        imputed['KNN'] = imp_knn.fit_transform(miss_data_x)

        imp_mf = IterativeImputer(estimator=DecisionTreeRegressor(),
                                  max_iter=3)
        imputed['MF'] = imp_mf.fit_transform(miss_data_x)

        imp_mice = IterativeImputer(estimator=BayesianRidge(), max_iter=3)
        imputed['MICE'] = imp_mice.fit_transform(miss_data_x)

        # Imputation error against the original data.
        for method, data in imputed.items():
            scores['RMSE'][method].append(
                rmse_loss(ori_data_x, data, data_m))
        rmse_mf = scores['RMSE']['MF'][-1]

        # These files are overwritten on every run, so only the last run's
        # data remains on disk.
        np.savetxt("data/missing_data.csv", miss_data_x.astype(float),
                   delimiter=',', fmt='%1.2f')
        np.savetxt("data/imputed_data_gain.csv", imputed['GAIN'],
                   delimiter=',', fmt='%d')
        np.savetxt("data/imputed_data_egain.csv", imputed['EGAIN'],
                   delimiter=',', fmt='%d')

        # Classifiers are scored on the normalised imputations.
        normalized = {}
        for method, data in imputed.items():
            normalized[method], _ = normalization(data)
        imputed_data_mf = normalized['MF']

        for metric, classify, call_order in classifier_plan:
            for method in call_order:
                scores[metric][method].append(
                    classify(normalized[method], y, train_idx, test_idx))

    print()
    print("Datasets: ", data_name)
    for position, (metric, labels) in enumerate(report_rows):
        if position:
            # Blank line between metric sections.
            print()
        for label, method in zip(labels, report_order):
            print(f"{metric} {label}: {_mean_std(scores[metric][method])}")

    return imputed_data_mf, rmse_mf