def main_ide(base_path, params):
    '''Run GAIN, median and EM imputation on one UCI dataset and report RMSE.

    Args:
      - base_path: root directory handed through to the data loader
      - params: dict with keys
          data_name: dataset name (letter or spam)
          miss_rate: probability of missing components
          batch_size: batch size
          hint_rate: hint rate
          alpha: GAIN hyperparameter
          iterations: training iterations

    Returns:
      - imputed_data_x: dict mapping method name -> imputed data
      - data_m: missingness mask produced by the loader
      - rmse_dict: dict mapping method name -> RMSE against the original data
    '''
    # (removed a no-op `base_path = base_path` self-assignment)
    data_name = params['data_name']
    miss_rate = params['miss_rate']
    gain_parameters = {
        'batch_size': params['batch_size'],
        'hint_rate': params['hint_rate'],
        'alpha': params['alpha'],
        'iterations': params['iterations']
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(base_path, data_name, miss_rate)

    # Impute missing data with each method
    imputed_data_x = {}
    imputed_data_x['GAIN'] = gain(miss_data_x, gain_parameters)
    imputed_data_x['Median'] = Impute_med(miss_data_x)
    imputed_data_x['EM'] = Impute_EM(miss_data_x)

    # Report the RMSE performance on the originally-missing entries.
    # Dict insertion order keeps the same GAIN/Median/EM ordering as before.
    rmse_dict = {}
    for method, imputed in imputed_data_x.items():
        rmse_dict[method] = rmse_loss(ori_data_x, imputed, data_m)

    print()
    print('Parameters:')
    print(params)
    print('RMSE Performance:')
    print(rmse_dict)
    return imputed_data_x, data_m, rmse_dict
def evaluation_step(generator, data_m, norm_data_x, data_x, ori_data_x, normalizer):
    """Impute the current data with the generator and score it with RMSE.

    The original paper implementation has no validation schema; for
    convenience the RMSE value is used as the metric for early stopping
    and for monitoring during-training performance of the model.
    """
    rows, cols = data_m.shape[0], data_m.shape[1]

    # Low-amplitude uniform noise fills the missing slots fed to the generator.
    noise = uniform_sampler(0, 0.01, rows, cols).astype('float32')
    mask = data_m.astype('float32')
    observed = norm_data_x.astype('float32')
    observed = mask * observed + (1 - mask) * noise

    generator_input = tf.concat([observed.values, mask.values], axis=1)
    imputed_data = generator.predict(generator_input)[0]

    # Keep the truly observed entries; only missing cells come from the model.
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    imputed_data = normalizer.denormalize(imputed_data)

    # Rounding
    imputed_data_values = rounding(imputed_data.values, data_x.values)

    rmse = rmse_loss(ori_data_x.values, imputed_data_values, data_m.values)
    imputed_and_rounded_df_to_use_for_downstream_task = pd.DataFrame(
        data=imputed_data_values, columns=imputed_data.columns)
    return rmse, imputed_and_rounded_df_to_use_for_downstream_task
def main(args):
    """Run 5-fold PC-GAIN imputation on a UCI dataset, printing RMSE per fold."""
    # Force CPU execution.
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    data_name = args.data_name
    miss_rate = args.miss_rate
    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'beta': args.beta,
        'lambda_': args.lambda_,
        'k': args.k,
        'iterations': args.iterations,
        'cluster_species': args.cluster_species,
    }

    # Load data and introduce missingness
    data_x, miss_data_x, data_M = data_loader(data_name, miss_rate)
    row, col = miss_data_x.shape
    five_len = row // 5

    ## 5-cross validations: impute with each fold's rows held out
    for i in range(5):
        lo = i * five_len
        hi = (i + 1) * five_len
        incomplete_data_x = np.vstack((miss_data_x[:lo], miss_data_x[hi:row]))
        complete_data_x = np.vstack((data_x[:lo], data_x[hi:row]))
        data_m = np.vstack((data_M[:lo], data_M[hi:row]))

        imputed_data = PC_GAIN(incomplete_data_x, gain_parameters, data_m)
        rmse = str(np.round(rmse_loss(complete_data_x, imputed_data, data_m), 4))
        print('RMSE Performance: ', rmse)
def test(data_m, data_x, dim, generator, no, norm_data_x, norm_parameters,
         ori_data_x, test_index):
    """Impute with the trained generator and report test / full RMSE."""
    # Return imputed data: missing slots start as low-amplitude uniform noise.
    noise = uniform_sampler(0, 0.01, no, dim)
    generator_input = data_m * norm_data_x + (1 - data_m) * noise

    imputed_data = generator(torch.Tensor(generator_input),
                             torch.Tensor(data_m)).detach().numpy()

    # Keep observed values; only missing cells come from the generator.
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    rmse, rmse_mean = rmse_loss(ori_data_x[test_index],
                                imputed_data[test_index],
                                data_m[test_index])
    rmse_full, rmse_full_mean = rmse_loss(ori_data_x, imputed_data, data_m)
    print(f'RMSE Performance (mean): {rmse_mean:.4f} (test), '
          f'{rmse_full_mean:.4f} (full).')
    # print(f'RMSE Performance: {rmse:.4f} (test), {rmse_full:.4f} (full).')
    return rmse
def main(args):
    '''Main function for UCI letter and spam datasets.

    Args:
      - data_name: letter or spam
      - miss_rate: probability of missing components
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations

    Returns:
      - imputed_data_x: imputed data
      - rmse: Root Mean Squared Error
    '''
    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations,
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(args.data_name, args.miss_rate)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Report imputation quality on the originally-missing entries
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    mae = mae_loss(ori_data_x, imputed_data_x, data_m)

    print()
    print('RMSE Performance: ' + str(np.round(rmse, 4)))
    print('MAE Performance: ' + str(np.round(mae, 4)))

    return imputed_data_x, rmse
def main(args):
    '''Benchmark GAIN/EGAIN against mean, KNN, MICE and tree-based imputation.

    `args.data_name` may hold several dataset names joined with '+'. For each
    dataset, runs `args.time` repetitions: imputes with every method, collects
    RMSE/RMSPE per imputer plus downstream classifier accuracies (MLP,
    decision tree, logistic regression, SVC, SGD) on the imputed data, then
    appends a "mean ± std" column to the summary table, which is printed and
    written to result.csv / result.xls.

    Args:
      - data_name: '+'-separated dataset names, e.g. 'letter+spam'
      - miss_rate: probability of missing components
      - batch_size / hint_rate / alpha / iterations: GAIN hyperparameters
      - seed: random seed (unused here; the loader seed is fixed to 42 below)
      - time: number of repetitions per dataset

    Returns:
      - imputed_data_mf: tree-imputed data from the last repetition
      - rmse_mf: its RMSE (last repetition of the last dataset run)
    '''
    miss_rate = args.miss_rate
    time = args.time

    # A '+'-separated list selects multiple datasets in one run.
    datasets_to_run = args.data_name.split("+")
    print(datasets_to_run)

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations,
        'time': args.time
    }

    # Header columns of the result table: metric rows then classifier rows.
    miss_rate_caption = "{}% Missing".format(int(miss_rate * 100))
    col1 = [
        miss_rate_caption, 'RMSE', 'RMSE', 'RMSE', 'RMSE', 'RMSE', 'RMSE',
        'RMSPE', 'RMSPE', 'RMSPE', 'RMSPE', 'RMSPE', 'RMSPE', '', 'MLP',
        'MLP', 'D.Tree', 'D.Tree', 'LogisticR', 'LogisticR', 'LogisticR',
        'LogisticR', 'LogisticR', 'LogisticR', 'SVC', 'SVC', 'SVC', 'SVC',
        'SVC', 'SVC', 'SGD', 'SGD', 'SGD', 'SGD', 'SGD', 'SGD'
    ]
    col2 = [
        'Method', 'EGAIN', 'GAIN', 'MEAN', 'KNN', 'MICE', 'M.FORE', 'EGAIN',
        'GAIN', 'MEAN', 'KNN', 'MICE', 'M.FORE', '', 'EGAIN', 'GAIN',
        'EGAIN', 'GAIN', 'EGAIN', 'GAIN', 'MEAN', 'KNN', 'MICE', 'M.FORE',
        'EGAIN', 'GAIN', 'MEAN', 'KNN', 'MICE', 'M.FORE', 'EGAIN', 'GAIN',
        'MEAN', 'KNN', 'MICE', 'M.FORE'
    ]
    result = [col1, col2]

    # Known dataset names (loop-invariant, so built once).
    dataset = [
        'obesity', 'hepatitisC', 'audit', 'letter', 'spam', 'breast',
        'credit', 'news', 'blood', 'vowel', 'ecoli', 'ionosphere',
        'parkinsons', 'seedst', 'vehicle', 'vertebral', 'wine', 'banknote',
        'balance', 'yeast', 'bean', 'shill', 'phishing', 'firewall',
        'iBeacon', 'steel'
    ]

    def _summary(scores):
        # One "mean ± std" table cell, with the original rounding.
        return f"{round(np.mean(scores), 2)} ± {round(np.std(scores), 4)}"

    for data_train in datasets_to_run:
        data_name = data_train
        if data_name not in dataset:
            # BUG FIX: this was `break`, which aborted ALL remaining datasets;
            # an unknown name should only skip this one, as the message says.
            print("Wrong name: {} Dataset. \nSkip this datasets".format(
                data_train))
            continue

        col3 = [data_name]
        print("****** {} Dataset ******".format(data_train))

        # Per-repetition accumulators: RMSE/RMSPE per imputer and downstream
        # classifier scores per imputer.
        gan_rs, egain_rs = [], []
        gan_mlp, gan_dt, egan_mlp, egan_dt = [], [], [], []
        gan_svc, egan_svc, gan_lr, egan_lr, gan_sgd, egan_sgd = [], [], [], [], [], []
        knn_rmse, mean_rmse, miss_rmse, mice_rmse = [], [], [], []
        gan_rmspe, egan_rmspe, knn_rmspe, mean_rmspe, miss_rmspe, mice_rmspe = [], [], [], [], [], []
        knn_lr, knn_svc, knn_sgd, mean_lr, mean_svc, mean_sgd = [], [], [], [], [], []
        miss_lr, miss_svc, miss_sgd, mice_lr, mice_svc, mice_sgd = [], [], [], [], [], []

        for i in range(time):
            # Load data and introduce missingness.  NOTE: the loader seed is
            # deliberately pinned to 42 (original comment: "Fix loader i=42");
            # only the train/test split varies with the repetition index.
            ori_data_x, miss_data_x, data_m, y = data_loader3(
                data_name, miss_rate, 42)
            train_idx, test_idx = train_test_split(range(len(y)),
                                                   test_size=0.2,
                                                   stratify=y,
                                                   random_state=i)
            miss_data_x2 = miss_data_x  # (scaling by 10000 disabled)

            if i % 5 == 0:
                print('=== Working on {}/{} ==='.format(i, time))

            # GAN-based imputers.
            imputed_data_x = gain(miss_data_x2, gain_parameters)
            imputed_data_x_e = egain(miss_data_x2, gain_parameters)

            # Mean imputation (rounded like the other imputers' outputs).
            imp_MEAN = SimpleImputer(missing_values=np.nan, strategy='mean')
            imputed_data_x_mean = imp_MEAN.fit_transform(miss_data_x).round()

            # KNN imputation.
            imp_KNN = KNNImputer(missing_values=np.nan, n_neighbors=3)
            imputed_data_x_knn = imp_KNN.fit_transform(miss_data_x).round()

            # ExtraTreesRegressor: similar to missForest in R.
            imp_mf = IterativeImputer(estimator=ExtraTreesRegressor(),
                                      max_iter=1,
                                      initial_strategy="constant",
                                      n_nearest_features=1,
                                      imputation_order='descending')
            imputed_data_mf = imp_mf.fit_transform(miss_data_x).round()

            # MICE-style chained equations with a Bayesian ridge estimator.
            imp_mice = IterativeImputer(estimator=BayesianRidge(),
                                        max_iter=1,
                                        initial_strategy='constant',
                                        n_nearest_features=1,
                                        imputation_order='descending')
            imputed_data_mice = imp_mice.fit_transform(miss_data_x).round()

            # Report the RMSE performance.
            gan_rs.append(rmse_loss(ori_data_x, imputed_data_x, data_m))
            egain_rs.append(rmse_loss(ori_data_x, imputed_data_x_e, data_m))
            mean_rmse.append(rmse_loss(ori_data_x, imputed_data_x_mean, data_m))
            knn_rmse.append(rmse_loss(ori_data_x, imputed_data_x_knn, data_m))
            mice_rmse.append(rmse_loss(ori_data_x, imputed_data_mice, data_m))
            miss_rmse.append(rmse_loss(ori_data_x, imputed_data_mf, data_m))

            # Report the RMSPE performance.
            gan_rmspe.append(rmspe_loss(ori_data_x, imputed_data_x, data_m))
            egan_rmspe.append(rmspe_loss(ori_data_x, imputed_data_x_e, data_m))
            mean_rmspe.append(rmspe_loss(ori_data_x, imputed_data_x_mean, data_m))
            knn_rmspe.append(rmspe_loss(ori_data_x, imputed_data_x_knn, data_m))
            mice_rmspe.append(rmspe_loss(ori_data_x, imputed_data_mice, data_m))
            miss_rmspe.append(rmspe_loss(ori_data_x, imputed_data_mf, data_m))

            # Persist this repetition's matrices for later inspection.
            mi_data = miss_data_x.astype(float)
            np.savetxt("data/{}missing_data.csv".format(i), mi_data,
                       delimiter=',', fmt='%1.2f')
            np.savetxt("data/{}imputed_data_gain.csv".format(i),
                       imputed_data_x, delimiter=',', fmt='%d')
            np.savetxt("data/{}imputed_data_egain.csv".format(i),
                       imputed_data_x_e, delimiter=',', fmt='%d')

            # Normalize before training the downstream classifiers.
            imputed_data_x, _ = normalization(imputed_data_x)
            imputed_data_x_e, _ = normalization(imputed_data_x_e)
            imputed_data_x_mean, _ = normalization(imputed_data_x_mean)
            imputed_data_x_knn, _ = normalization(imputed_data_x_knn)
            imputed_data_mf, _ = normalization(imputed_data_mf)
            imputed_data_mice, _ = normalization(imputed_data_mice)

            # Downstream classification scores per imputer.
            gan_mlp.append(clf_MLP(imputed_data_x, y, train_idx, test_idx))
            egan_mlp.append(clf_MLP(imputed_data_x_e, y, train_idx, test_idx))

            gan_dt.append(clf_DT(imputed_data_x, y, train_idx, test_idx))
            egan_dt.append(clf_DT(imputed_data_x_e, y, train_idx, test_idx))

            gan_lr.append(clf_LR(imputed_data_x, y, train_idx, test_idx))
            egan_lr.append(clf_LR(imputed_data_x_e, y, train_idx, test_idx))
            mean_lr.append(clf_LR(imputed_data_x_mean, y, train_idx, test_idx))
            knn_lr.append(clf_LR(imputed_data_x_knn, y, train_idx, test_idx))
            miss_lr.append(clf_LR(imputed_data_mf, y, train_idx, test_idx))
            mice_lr.append(clf_LR(imputed_data_mice, y, train_idx, test_idx))

            mean_svc.append(clf_SVC(imputed_data_x_mean, y, train_idx, test_idx))
            knn_svc.append(clf_SVC(imputed_data_x_knn, y, train_idx, test_idx))
            miss_svc.append(clf_SVC(imputed_data_mf, y, train_idx, test_idx))
            mice_svc.append(clf_SVC(imputed_data_mice, y, train_idx, test_idx))
            gan_svc.append(clf_SVC(imputed_data_x, y, train_idx, test_idx))
            egan_svc.append(clf_SVC(imputed_data_x_e, y, train_idx, test_idx))

            mean_sgd.append(clf_SGD(imputed_data_x_mean, y, train_idx, test_idx))
            knn_sgd.append(clf_SGD(imputed_data_x_knn, y, train_idx, test_idx))
            miss_sgd.append(clf_SGD(imputed_data_mf, y, train_idx, test_idx))
            mice_sgd.append(clf_SGD(imputed_data_mice, y, train_idx, test_idx))
            gan_sgd.append(clf_SGD(imputed_data_x, y, train_idx, test_idx))
            egan_sgd.append(clf_SGD(imputed_data_x_e, y, train_idx, test_idx))

        print()
        print("Datasets: ", data_name)

        # Build this dataset's table column; the order must match col1/col2.
        for scores in (egain_rs, gan_rs, mean_rmse, knn_rmse, mice_rmse,
                       miss_rmse):
            col3.append(_summary(scores))
        for scores in (egan_rmspe, gan_rmspe, mean_rmspe, knn_rmspe,
                       mice_rmspe, miss_rmspe):
            col3.append(_summary(scores))
        col3.append([])  # spacer between the metric and classifier blocks
        for scores in (egan_mlp, gan_mlp, egan_dt, gan_dt,
                       egan_lr, gan_lr, mean_lr, knn_lr, mice_lr, miss_lr,
                       egan_svc, gan_svc, mean_svc, knn_svc, mice_svc,
                       miss_svc,
                       egan_sgd, gan_sgd, mean_sgd, knn_sgd, mice_sgd,
                       miss_sgd):
            col3.append(_summary(scores))

        result.append(col3)

    # Emit the summary table (transpose: one row per table line).
    df_result_tran = pd.DataFrame(np.asarray(result)).transpose()
    print(df_result_tran.to_string(index=False, header=False))
    df_result_tran.to_csv("result.csv", index=False, header=False)
    # BUG FIX: `encoding=` was removed from DataFrame.to_excel in pandas 2.0
    # and raised TypeError; the argument is dropped.
    df_result_tran.to_excel("result.xls", index=False, header=False)

    # Values from the last repetition of the last dataset (NameError if no
    # valid dataset was run — same as the original behavior).
    return imputed_data_mf, rmse_mf
def main(args):
    '''Compare GAIN/EGAIN with mean, KNN, MICE and tree-based imputation.

    Repeats `args.time` times: reloads the data with a fresh missingness
    pattern, imputes with every method, and accumulates RMSE plus downstream
    LR/SVC/SGD classification scores, finally printed as "mean ± std".

    Args:
      - data_name: dataset name (e.g. letter or spam)
      - miss_rate: probability of missing components
      - batch_size / hint_rate / alpha / iterations: GAIN hyperparameters
      - seed: random seed (unused here; the split seed is fixed at 42)
      - time: number of repetitions

    Returns:
      - imputed_data_mf: tree-imputed data from the last repetition
      - rmse_mf: its RMSE
    '''
    data_name = args.data_name
    miss_rate = args.miss_rate
    time = args.time

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations,
        'time': args.time
    }

    # Accumulators over repetitions: RMSE and classifier scores per imputer.
    # (Removed accumulators that were never filled or read: MLP/DT/GAU lists.)
    gan_rs, egain_rs = [], []
    knn_rmse, mean_rmse, miss_rmse, mice_rmse = [], [], [], []
    gan_svc, egan_svc, gan_lr, egan_lr, gan_sgd, egan_sgd = [], [], [], [], [], []
    knn_lr, knn_svc, knn_sgd, mean_lr, mean_svc, mean_sgd = [], [], [], [], [], []
    miss_lr, miss_svc, miss_sgd, mice_lr, mice_svc, mice_sgd = [], [], [], [], [], []

    for i in range(time):
        # Load data and introduce missingness (loader seed varies per rep;
        # the train/test split is fixed at 42).
        ori_data_x, miss_data_x, data_m, y = data_loader3(
            data_name, miss_rate, i)
        train_idx, test_idx = train_test_split(range(len(y)),
                                               test_size=0.3,
                                               stratify=y,
                                               random_state=42)

        # GAIN/EGAIN train on a scaled copy; their outputs are scaled back.
        miss_data_x2 = miss_data_x * 10000

        if i % 5 == 0:
            print('=== Working on {}/{} ==='.format(i, time))

        # Impute missing data.
        imputed_data_x = gain(miss_data_x2, gain_parameters) * 1 / 10000
        imputed_data_x_e = egain(miss_data_x2, gain_parameters) * 1 / 10000

        imp_MEAN = SimpleImputer(missing_values=np.nan, strategy='mean')
        imputed_data_x_mean = imp_MEAN.fit_transform(miss_data_x)

        imp_KNN = KNNImputer(missing_values=np.nan)
        imputed_data_x_knn = imp_KNN.fit_transform(miss_data_x)

        # DecisionTreeRegressor chained imputer: stand-in for R's missForest.
        imp_mf = IterativeImputer(estimator=DecisionTreeRegressor(),
                                  max_iter=3)
        imputed_data_mf = imp_mf.fit_transform(miss_data_x)

        imp_mice = IterativeImputer(estimator=BayesianRidge(), max_iter=3)
        imputed_data_mice = imp_mice.fit_transform(miss_data_x)

        # Report the RMSE performance.
        rmse_mf = rmse_loss(ori_data_x, imputed_data_mf, data_m)
        gan_rs.append(rmse_loss(ori_data_x, imputed_data_x, data_m))
        egain_rs.append(rmse_loss(ori_data_x, imputed_data_x_e, data_m))
        mean_rmse.append(rmse_loss(ori_data_x, imputed_data_x_mean, data_m))
        knn_rmse.append(rmse_loss(ori_data_x, imputed_data_x_knn, data_m))
        mice_rmse.append(rmse_loss(ori_data_x, imputed_data_mice, data_m))
        miss_rmse.append(rmse_mf)

        # Persist matrices for inspection (overwritten each repetition).
        mi_data = miss_data_x.astype(float)
        np.savetxt("data/missing_data.csv", mi_data, delimiter=',',
                   fmt='%1.2f')
        np.savetxt("data/imputed_data_gain.csv", imputed_data_x,
                   delimiter=',', fmt='%d')
        np.savetxt("data/imputed_data_egain.csv", imputed_data_x_e,
                   delimiter=',', fmt='%d')

        # Normalize before training the downstream classifiers.
        imputed_data_x, _ = normalization(imputed_data_x)
        imputed_data_x_e, _ = normalization(imputed_data_x_e)
        imputed_data_x_mean, _ = normalization(imputed_data_x_mean)
        imputed_data_x_knn, _ = normalization(imputed_data_x_knn)
        imputed_data_mf, _ = normalization(imputed_data_mf)
        imputed_data_mice, _ = normalization(imputed_data_mice)

        # Logistic regression scores.
        gan_lr.append(clf_LR(imputed_data_x, y, train_idx, test_idx))
        egan_lr.append(clf_LR(imputed_data_x_e, y, train_idx, test_idx))
        mean_lr.append(clf_LR(imputed_data_x_mean, y, train_idx, test_idx))
        knn_lr.append(clf_LR(imputed_data_x_knn, y, train_idx, test_idx))
        miss_lr.append(clf_LR(imputed_data_mf, y, train_idx, test_idx))
        mice_lr.append(clf_LR(imputed_data_mice, y, train_idx, test_idx))

        # SVC scores.
        mean_svc.append(clf_SVC(imputed_data_x_mean, y, train_idx, test_idx))
        knn_svc.append(clf_SVC(imputed_data_x_knn, y, train_idx, test_idx))
        miss_svc.append(clf_SVC(imputed_data_mf, y, train_idx, test_idx))
        mice_svc.append(clf_SVC(imputed_data_mice, y, train_idx, test_idx))
        gan_svc.append(clf_SVC(imputed_data_x, y, train_idx, test_idx))
        egan_svc.append(clf_SVC(imputed_data_x_e, y, train_idx, test_idx))

        # SGD scores.
        mean_sgd.append(clf_SGD(imputed_data_x_mean, y, train_idx, test_idx))
        knn_sgd.append(clf_SGD(imputed_data_x_knn, y, train_idx, test_idx))
        miss_sgd.append(clf_SGD(imputed_data_mf, y, train_idx, test_idx))
        mice_sgd.append(clf_SGD(imputed_data_mice, y, train_idx, test_idx))
        gan_sgd.append(clf_SGD(imputed_data_x, y, train_idx, test_idx))
        egan_sgd.append(clf_SGD(imputed_data_x_e, y, train_idx, test_idx))

    def _report(label, scores):
        # Print "LABEL: mean ± std" with the original rounding.
        print('{}: {} ± {}'.format(label, round(np.mean(scores), 2),
                                   round(np.std(scores), 4)))

    print()
    print("Datasets: ", data_name)
    _report('RMSE GAIN', gan_rs)
    _report('RMSE EGAIN', egain_rs)
    _report('RMSE MEAN', mean_rmse)
    _report('RMSE KNN', knn_rmse)
    _report('RMSE MICE', mice_rmse)
    _report('RMSE MFORE', miss_rmse)
    print()
    _report('LR GAIN', gan_lr)
    _report('LR EGAIN', egan_lr)
    _report('LR MEAN', mean_lr)
    _report('LR KNN', knn_lr)
    _report('LR MICE', mice_lr)
    _report('LR MISSFOR', miss_lr)
    print()
    _report('SVC GAIN', gan_svc)
    _report('SVC EGAIN', egan_svc)
    _report('SVC MEAN', mean_svc)
    _report('SVC KNN', knn_svc)
    _report('SVC MICE', mice_svc)
    _report('SVC MISS', miss_svc)
    print()
    _report('SGD GAIN', gan_sgd)
    _report('SGD EGAIN', egan_sgd)
    _report('SGD MEAN', mean_sgd)
    _report('SGD KNN', knn_sgd)
    _report('SGD MICE', mice_sgd)
    _report('SGD MISS', miss_sgd)

    # Last repetition's tree-imputed data and RMSE, as before.
    return imputed_data_mf, rmse_mf
def main():
    """Run GAIN imputation repeatedly on the configured UCI datasets and
    record results in an Excel workbook.

    For each dataset: impute `n_times` times, writing per-run RMSE to the
    "GAIN_rmse" sheet; for datasets other than 'letter'/'spam', also record
    10-fold stratified-shuffle cross-validation accuracy of four downstream
    classifiers trained on the imputed data.

    Side effect: saves 'GAIN_results_15.xls' in the working directory.
    """
    data_names = ['letter', 'spam']
    print(len(data_names))

    miss_rate = 0.2
    # Hyperparameters are the same for every dataset, so build the dict once.
    gain_parameters = {'batch_size': 64, 'alpha': 100, 'iterations': 1000}
    n_times = 3  # independent imputation repetitions per dataset

    wb = xlwt.Workbook()
    sh_rmse = wb.add_sheet("GAIN_rmse")
    sh_acc_dct = wb.add_sheet("GAIN_acc_dct")
    sh_acc_knn = wb.add_sheet("GAIN_acc_knn")
    sh_acc_nb = wb.add_sheet("GAIN_acc_nb")
    sh_acc_lr = wb.add_sheet("GAIN_acc_lr")
    acc_sheets = (sh_acc_dct, sh_acc_knn, sh_acc_nb, sh_acc_lr)

    for k, data_name in enumerate(data_names):
        print("Dataset: ", data_name)
        rmse = []
        ori_data_x, y, miss_data_x, m = data_loader(data_name, miss_rate)

        # Column header: one column per dataset on every sheet.
        sh_rmse.write(0, k, data_name)
        for sheet in acc_sheets:
            sheet.write(0, k, data_name)

        # 'letter'/'spam' are evaluated on RMSE only, never on classifiers;
        # hoist the check out of the inner loop (it is loop-invariant).
        run_classifiers = data_name not in ('letter', 'spam')

        for i in range(n_times):
            # Impute missing data, then normalize before evaluation.
            imputed_data_x = gain(miss_data_x, gain_parameters)
            imputed_data_x, _ = normalization(imputed_data_x)

            # Calculate and record RMSE for this run.
            rmse.append(rmse_loss(ori_data_x, imputed_data_x, m))
            print('{:2d}/{:2d}'.format(i + 1, n_times), end=':')
            print('RMSE = ' + str(np.round(rmse[-1], 4)))
            sh_rmse.write(i + 1, k, str(np.round(rmse[-1], 4)))

            if not run_classifiers:
                continue

            # Downstream-task evaluation: mean CV accuracy per classifier,
            # one sheet per classifier (deduplicated from four copy-pasted
            # stanzas in the original).
            scf = StratifiedShuffleSplit(n_splits=10)
            classifier_sheets = (
                (DecisionTreeClassifier(), sh_acc_dct),
                (KNeighborsClassifier(), sh_acc_knn),
                (GaussianNB(), sh_acc_nb),
                (LogisticRegression(max_iter=1000), sh_acc_lr),
            )
            for clf, sheet in classifier_sheets:
                scores = cross_val_score(clf, imputed_data_x, y,
                                         cv=scf, scoring='accuracy')
                print(scores)
                sheet.write(i + 1, k, str(np.round(np.mean(scores), 4)))

        print("---------------------------")

    wb.save('GAIN_results_15.xls')
def main(args): '''Main function for UCI letter and spam datasets. Args: - data_name: letter or spam - miss_rate: probability of missing components - batch:size: batch size - hint_rate: hint rate - alpha: hyperparameter - iterations: iterations Returns: - imputed_data_x: imputed data - rmse: Root Mean Squared Error ''' data_name = args.data_name miss_rate = args.miss_rate gain_parameters = { 'batch_size': args.batch_size, 'hint_rate': args.hint_rate, 'alpha': args.alpha, 'iterations': args.iterations } # Load data and introduce missingness ori_data_x, miss_data_x, data_m, data_y = data_loader(data_name, miss_rate) imputed_data_x = gain(miss_data_x, gain_parameters) #pd.DataFrame(data_y, imputed_data_x, axis = 1) # Step- craete data_m using testdata # Step - combine train and missing_test_data # Step - retrun total missing and original data_m # Step - while calculating RMSE # use original as test_original # fetch testing imputed datset 934 to last # data_m as missing_test_data if data_name == 'vals_test_df': imputed_data_x = imputed_data_x[range(918, 1311), :] elif data_name == 'vals_test_df_test_type1': imputed_data_x = imputed_data_x[range(495, 1311), :] elif data_name == 'vals_test_df_test_type2': imputed_data_x = imputed_data_x[range(816, 1311), :] else: imputed_data_x = imputed_data_x imputed_data_x_df = pd.DataFrame(imputed_data_x) data_y_df = pd.DataFrame(data_y) imputed_data_df = pd.concat([data_y_df, imputed_data_x_df], ignore_index=True, axis=1) imputed_data_df.to_csv("GAN_imputated_catalogueData1.csv", index=False) # Report the RMSE performance rmse = rmse_loss(ori_data_x, imputed_data_x, data_m) print() print('RMSE Performance: ' + str(np.round(rmse, 4))) return imputed_data_x, rmse
def _add_result_sheets(workbook, prefix):
    """Create the five standard result sheets (RMSE + four classifier
    accuracies) in `workbook`, returning them keyed by suffix."""
    suffixes = ('rmse', 'acc_dct', 'acc_knn', 'acc_nb', 'acc_lr')
    return {s: workbook.add_sheet('{}_{}'.format(prefix, s)) for s in suffixes}


def _write_dataset_header(sheets, col, data_name):
    """Write the dataset name into row 0, column `col` of every sheet."""
    for sheet in sheets.values():
        sheet.write(0, col, data_name)


def main():
    """Benchmark Mean, KNN, GAIN and EGAIN imputation on the configured UCI
    datasets, saving per-run RMSE values into one .xls workbook per method.

    Mean/KNN imputers are deterministic and run once per dataset; the
    GAN-based imputers (GAIN, EGAIN) are stochastic and run `n_times` each.

    Side effect: saves GAIN_test.xls, EGAIN_test.xls, MEAN_test.xls and
    KNN_test.xls in the working directory.
    """
    data_names = ['letter', 'spam']
    print(len(data_names))

    miss_rate = 0.2
    # Same hyperparameters for every dataset; build the dict once.
    gain_parameters = {'batch_size': 64, 'alpha': 100, 'iterations': 1000}
    n_times = 30  # repetitions for the stochastic (GAN-based) imputers

    # One workbook + five sheets per method (deduplicated from four
    # copy-pasted creation stanzas in the original).
    workbooks = {}
    sheets = {}
    for method, prefix in (('gain', 'GAIN'), ('egain', 'EGAIN'),
                           ('mean', 'MEAN'), ('knn', 'KNN')):
        wb = xlwt.Workbook()
        workbooks[method] = wb
        sheets[method] = _add_result_sheets(wb, prefix)

    for k, data_name in enumerate(data_names):
        ori_data_x, y, miss_data_x, m = data_loader(data_name, miss_rate)
        print("Dataset: ", data_name)

        ########################### Mean imputation ###########################
        print('Mean imputation')
        _write_dataset_header(sheets['mean'], k, data_name)
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        imputed_data_x = imp.fit_transform(miss_data_x)
        sheets['mean']['rmse'].write(
            1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m), 4))

        ########################### KNN imputation ############################
        print('KNN imputation')
        _write_dataset_header(sheets['knn'], k, data_name)
        imp = KNNImputer(missing_values=np.nan)
        imputed_data_x = imp.fit_transform(miss_data_x)
        sheets['knn']['rmse'].write(
            1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m), 4))

        ########################### GAIN imputation ###########################
        print('GAIN imputation')
        _write_dataset_header(sheets['gain'], k, data_name)
        for i in tqdm(range(n_times)):
            imputed_data_x = gain(miss_data_x, gain_parameters)
            sheets['gain']['rmse'].write(
                i + 1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m), 4))

        ########################### EGAIN imputation ##########################
        print('EGAIN imputation')
        _write_dataset_header(sheets['egain'], k, data_name)
        for i in tqdm(range(n_times)):
            imputed_data_x = Egain(miss_data_x, gain_parameters)
            sheets['egain']['rmse'].write(
                i + 1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m), 4))

    workbooks['gain'].save('GAIN_test.xls')
    workbooks['egain'].save('EGAIN_test.xls')
    workbooks['mean'].save('MEAN_test.xls')
    workbooks['knn'].save('KNN_test.xls')
def main(args):
    """Main function for UCI letter and spam datasets.

    Imputes the dataset with GAIN, then with MissForest and with MICE
    (autoimpute), reporting RMSE for each and dumping every imputed matrix
    (plus the raw missing matrix) to CSV files under data/.

    Args:
      args: namespace with
        - data_name: letter or spam
        - miss_rate: probability of missing components
        - batch_size: batch size
        - hint_rate: hint rate
        - alpha: hyperparameter
        - iterations: iterations

    Returns:
      - imputed_data_x: GAIN-imputed data
      - rmse: GAIN's Root Mean Squared Error
    """
    data_name = args.data_name
    miss_rate = args.miss_rate
    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness.
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate)

    # ----- GAIN -----
    imputed_data_x = gain(miss_data_x, gain_parameters)
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    print()

    # Dump the raw matrix-with-missing-values (already the right shape; the
    # original's np.reshape to imputed_data_x.shape was a redundant no-op).
    mi_data = miss_data_x.astype(float)
    np.savetxt("data/missing_data.csv", mi_data, delimiter=',', fmt='%1.2f')
    print('Shape of miss data: ', mi_data.shape)
    print('Save results in missing_data.csv')

    print()
    print('=== GAIN RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse, 6)))
    # NOTE(review): fmt='%d' truncates imputed floats to integers on disk.
    np.savetxt("data/imputed_data.csv", imputed_data_x, delimiter=',', fmt='%d')
    print('Save results in Imputed_data.csv')

    # ----- MissForest -----
    print()
    print('=== MissForest RMSE ===')
    imp_mean = MissForest(max_iter=5)
    miss_f = imp_mean.fit_transform(miss_data_x)
    rmse_MF = rmse_loss(ori_data_x, miss_f, data_m)
    print('RMSE Performance: ' + str(np.round(rmse_MF, 6)))
    np.savetxt("data/imputed_data_MF.csv", miss_f, delimiter=',', fmt='%d')
    print('Save results in Imputed_data_MF.csv')

    # ----- MICE (autoimpute) -----
    print()
    print('=== MICE of Auto Impute RMSE ===')
    data_mice = pd.DataFrame(miss_data_x)
    mi = MiceImputer(k=1, imp_kwgs=None, n=1, predictors='all',
                     return_list=True, seed=None,
                     strategy='default predictive', visit='default')
    mice_out = mi.fit_transform(data_mice)
    # return_list=True yields a list of (index, imputed_frame) pairs; with
    # n=1 only the first pair exists.  (Replaces the original c/c1/c2/c3
    # intermediate-variable chain.)
    mice_x = np.asarray(mice_out[0][1])
    rmse_MICE = rmse_loss(ori_data_x, mice_x, data_m)
    print('=== MICE of Auto Impute RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse_MICE, 6)))
    np.savetxt("data/imputed_data_MICE.csv", mice_x, delimiter=',', fmt='%d')
    print('Save results in Imputed_data_MICE.csv')

    return imputed_data_x, rmse
# np.loadtxt(os.path.join(os.path.join(os.getcwd(), '[10] data/' + data_name + '_miss' + '.csv'), delimiter=",", skiprows=1) rmse_gain = rmse #%% # Mean imputation from sklearn.impute import SimpleImputer med_imputer = SimpleImputer(missing_values = np.nan, strategy = 'median') med_imputer = med_imputer.fit(miss_data_x) imputed_data_med = med_imputer.transform(miss_data_x) # Report the RMSE performance rmse_med = rmse_loss(ori_data_x, imputed_data_med, data_m) print() print('RMSE Performance: ' + str(np.round(rmse_med, 4))) #%% # EM imputation import impyute as impy data_missing = pd.DataFrame(miss_data_x) em_imputed = impy.em(miss_data_x) rmse_em = rmse_loss(ori_data_x, em_imputed, data_m) print()