def vae_gan_model():
    """Run the VAE->CGAN pipeline with a fixed 'best' hyper-parameter set and
    evaluate classifiers on (a) the CGAN-augmented training set and (b) the
    CGAN+VAE-augmented training set.

    Relies on module-level globals: train_main, test_main, n_samples,
    random_neg, random_pos.
    """
    epochs, latent_size = 1500, 50
    name = "best_test-2_vae_gan_gantrain_200-128_64_50_0005"
    n_layer = 3
    hidden_neurons = 128
    learning_rate = 0.0002
    cgan_sample, vae_sample, ks_result1, ks_result2, fid = run_VAECGAN_Generator(
        train_main, test_main, epochs, latent_size, n_samples, n_layer,
        learning_rate, hidden_neurons, name, random_neg, random_pos)
    print("VAE sample shape", vae_sample.shape)
    tsne_data_comparision(random_pos, random_neg, cgan_sample, name,
                          vae_sample, method='VAE_CGAN')
    get_combined_generated = get_combined_generated_real(
        cgan_sample, train_main, name)
    (train_X, train_y) = split_XY(get_combined_generated)
    (test_X, test_y) = split_XY(test_main)
    multiple_classifier(train_X, train_y.values.ravel(), test_X,
                        test_y.values.ravel(), name, ks_result1, ks_result2,
                        fid)
    print(
        "======================Both GAN and VAE=============================")
    name = name + "VAE_GAN-both"
    get_combined_generated = get_combined_generated_real(
        vae_sample, get_combined_generated, name)
    (train_X, train_y) = split_XY(get_combined_generated)
    (test_X, test_y) = split_XY(test_main)
    # BUGFIX: the combined (GAN+VAE) split above was computed but never
    # evaluated; run the same classifier battery on it as on the first set.
    multiple_classifier(train_X, train_y.values.ravel(), test_X,
                        test_y.values.ravel(), name, ks_result1, ks_result2,
                        fid)
def run_VanilaGANGenerator(train, test, epochs, latent_size, hidden_neurons,
                           n_samples, learning_rate, random_neg_sample,
                           real_pos, name):
    """Train a vanilla GAN on *train*, draw *n_samples* synthetic rows, and
    compare their distribution (KS tests, FID, t-SNE) against the real
    positive class.

    Returns (gan_sample, ks_result1, ks_result2, fid).

    NOTE(review): the train/test splits prepared at the end feed a classifier
    run that is currently disabled; they are kept to preserve behaviour.
    """
    generator = VanilaGANGenerator(train, test, epochs, latent_size, name)
    generator.define_models_GAN(learning_rate, hidden_neurons, type=None)
    generator.train_model()
    synthetic = generator.generate_samples(n_samples)
    # Quantitative comparison of generated vs. real positive samples.
    ks_result1, ks_result2 = compare_attributes_distribution(
        real_pos, synthetic, name)
    fid = calculate_fid(real_pos, synthetic)
    print("Frechet Inception Distance:", fid)
    tsne_data_comparision(real_pos, random_neg_sample, synthetic, name,
                          None, 'VGAN')
    augmented = get_combined_generated_real(synthetic, train, name)
    # Splits prepared for the (currently disabled) classifier evaluation.
    train_X, train_y = split_XY(augmented)
    test_X, test_y = split_XY(test)
    return synthetic, ks_result1, ks_result2, fid
def cgan_vae_model():
    """Run the CGAN->VAE pipeline with fixed hyper-parameters and evaluate
    classifiers on (a) the VAE-augmented training set and (b) the combined
    VAE+CGAN-augmented training set.

    Relies on module-level globals: train_main, test_main, n_samples,
    random_neg, random_pos.
    """
    epochs, latent_size = 200, 50
    name = "best_test_cgan-vae_100-128_64_50_0005"
    n_layer = 2
    hidden_neurons = 64
    learning_rate = 0.0005
    vae_sample, cgan_sample, ks_result1, ks_result2, fid = run_CGAN_VAE_Generator(
        train_main, test_main, epochs, latent_size, n_samples, n_layer,
        learning_rate, hidden_neurons, name, random_neg, random_pos)
    tsne_data_comparision(random_pos, random_neg, vae_sample, name,
                          cgan_sample, 'CGAN_VAE')
    get_combined_generated = get_combined_generated_real(
        vae_sample, train_main, name)
    (train_X, train_y) = split_XY(get_combined_generated)
    (test_X, test_y) = split_XY(test_main)
    multiple_classifier(train_X, train_y.values.ravel(), test_X,
                        test_y.values.ravel(), name, ks_result1, ks_result2,
                        fid)
    print(
        "======================Both GAN and VAE=============================")
    name = name + "CGAN-VAE-both"
    # BUGFIX: the 'both' set previously re-merged vae_sample (already merged
    # above), so it never actually contained the CGAN samples; merge
    # cgan_sample instead so the combined set holds both generators' output.
    get_combined_generated = get_combined_generated_real(
        cgan_sample, get_combined_generated, name)
    (train_X, train_y) = split_XY(get_combined_generated)
    (test_X, test_y) = split_XY(test_main)
    # BUGFIX: the combined split was computed but never evaluated.
    multiple_classifier(train_X, train_y.values.ravel(), test_X,
                        test_y.values.ravel(), name, ks_result1, ks_result2,
                        fid)
def run_without_sampling(train, test, id):
    """Baseline: train and evaluate a random forest on the raw data with no
    oversampling. Results are reported under '<id>-Without-Sampling'."""
    name = id + '-Without-Sampling'
    features, labels = split_XY(train)
    test_features, test_labels = split_XY(test)
    rf_classifier(features, labels.values.ravel(), test_features,
                  test_labels.values.ravel(), name)
def train_cgan():
    """Grid-search CGAN hyper-parameters over imputation strategies and
    feature-engineering variants; for every configuration, train a classifier
    on the CGAN-augmented data and log its metrics.

    The six nested hyper-parameter loops of the original are flattened with
    itertools.product — identical iteration order, same name construction.
    """
    from itertools import product  # local import: file header not in view

    datasets_dir = 'datasets/'
    imputation_types = ['NA', 'mean', 'knn']
    feature_engineerings = ['Features-selection', 'ALL', 'PCA']
    for imputation_type in imputation_types:
        run_id = "APS-CGAN" + imputation_type + "-"
        train_main, test_main = preprocess(data_dir=datasets_dir,
                                           imputation_type=imputation_type)
        for feature_engineering in feature_engineerings:
            if feature_engineering == 'PCA':
                train, test = load_PCA_data(train_main, test_main,
                                            datasets_dir, imputation_type)
                feature_n = '-PCA='
            elif feature_engineering == 'Features-selection':
                train, test = feature_selection(train_main, test_main, 120)
                feature_n = '-Select_Features_K=' + str(120)
            else:
                feature_n = '-All_features='
                train, test = train_main, test_main
            random_neg = get_random_sample(train, which='neg')
            random_pos = get_random_sample(train, which='pos')
            print(train.shape, test.shape)
            # Flattened hyper-parameter grid (was six nested for-loops).
            for (epochs, latent_size, n_samples, n_layer, hidden_neurons,
                 learning_rate) in product([100, 300], [32, 64, 128],
                                           [2000, 5000], [2, 3],
                                           [32, 64, 128], [0.0005, 0.0002]):
                name = (run_id + feature_n
                        + '-epochs=' + str(epochs)
                        + '-latent_size=' + str(latent_size)
                        + '-n_samples=' + str(n_samples)
                        + '-n_layer=' + str(n_layer)
                        + '-hidden_neurons_base=' + str(hidden_neurons)
                        + '-learning_rate=' + str(learning_rate))
                cgan_sample, ks_result1, ks_result2, fid = run_CGANGenerator(
                    train, test, epochs, latent_size, n_samples, n_layer,
                    learning_rate, hidden_neurons, name, random_neg,
                    random_pos)
                combined = get_combined_generated_real(cgan_sample, train,
                                                       name)
                train_X, train_y = split_XY(combined)
                test_X, test_y = split_XY(test)
                multiple_classifier(train_X, train_y.values.ravel(),
                                    test_X, test_y.values.ravel(),
                                    name, ks_result1, ks_result2, fid)
def train_vae():
    """Grid-search VAE hyper-parameters over imputation strategies and
    feature-engineering variants; for every configuration, train a classifier
    on the VAE-augmented data and log its metrics.

    The six nested hyper-parameter loops of the original are flattened with
    itertools.product — identical iteration order, same name construction.
    """
    from itertools import product  # local import: file header not in view

    datasets_dir = 'datasets/'
    imputation_types = ['mean', 'NA', 'knn']
    feature_engineerings = ['All', 'Features-selection', 'PCA']
    for imputation_type in imputation_types:
        train_main, test_main = preprocess(data_dir=datasets_dir,
                                           imputation_type=imputation_type)
        run_id = "APS-VAE" + imputation_type + "-"
        for feature_engineering in feature_engineerings:
            if feature_engineering == 'PCA':
                train, test = load_PCA_data(train_main, test_main,
                                            datasets_dir, imputation_type)
                feature_n = '-PCA='
            elif feature_engineering == 'Features-selection':
                # Default k is used here (name suggests 80 features).
                train, test = feature_selection(train_main, test_main)
                feature_n = '-Select_Features_80='
            else:
                feature_n = '-All_features='
                train, test = train_main, test_main
            random_neg = get_random_sample(train)
            print(train.shape, test.shape)
            # Flattened hyper-parameter grid (was six nested for-loops).
            for (epochs, latent_size, n_samples, n_layer, hidden_neurons,
                 learning_rate) in product([50, 100], [32, 64, 128],
                                           [5000], [2, 3],
                                           [32, 64, 128], [0.0005, 0.0002]):
                name = (run_id + feature_n
                        + '-epochs=' + str(epochs)
                        + '-latent_size=' + str(latent_size)
                        + '-n_samples=' + str(n_samples)
                        + '-n_layer=' + str(n_layer)
                        + '-hidden_neurons_base=' + str(hidden_neurons)
                        + '-learning_rate=' + str(learning_rate))
                vae_sample, ks_result1, ks_result2, fid = run_VAEGenerator(
                    train, test, epochs, latent_size, n_samples, n_layer,
                    hidden_neurons, learning_rate, name, random_neg)
                combined = get_combined_generated_real(vae_sample, train,
                                                       name)
                train_X, train_y = split_XY(combined)
                test_X, test_y = split_XY(test)
                multiple_classifier(train_X, train_y.values.ravel(),
                                    test_X, test_y.values.ravel(),
                                    name, ks_result1, ks_result2, fid)
def vanillaGAN_model():
    """Run the fixed-hyper-parameter vanilla-GAN experiment end to end:
    generate samples, augment the training set, and evaluate classifiers.

    Relies on module-level globals: train_main, test_main, n_samples,
    random_neg, random_pos.
    """
    epochs, latent_size, name = 200, 50, "test_500-50-64-0001_Vanilla-GAN"
    learning_rate = 0.0005
    hidden_neurons = 64
    gan_sample, ks_result1, ks_result2, fid = run_VanilaGANGenerator(
        train_main, test_main, epochs, latent_size, hidden_neurons,
        n_samples, learning_rate, random_neg, random_pos, name)
    augmented = get_combined_generated_real(gan_sample, train_main, name)
    train_X, train_y = split_XY(augmented)
    test_X, test_y = split_XY(test_main)
    multiple_classifier(train_X, train_y.values.ravel(),
                        test_X, test_y.values.ravel(),
                        name, ks_result1, ks_result2, fid)
def run_VAECGAN_Generator(train, test, epochs, latent_size, n_samples,
                          n_layer, learning_rate, hidden_neurons, name,
                          random_neg_sample, random_pos):
    """Chain pre-generated VAE samples into a CGAN run.

    Loads 1000 pre-generated VAE samples from disk (training the VAE live is
    disabled upstream), merges them into *train*, trains a CGAN on the
    augmented set, plots a t-SNE comparison, and returns
    (cgan_sample, vae_sample, ks_result3, ks_result4, fid2).
    """
    # Fixed random_state keeps the drawn VAE subset reproducible across runs.
    vae_sample = pd.read_csv(r'generated_data/VAE_no_mmd.csv').sample(
        n=1000, random_state=123)
    vae_combine = get_combined_generated_real(vae_sample, train, name)
    cgan_sample, ks_result3, ks_result4, fid2 = run_CGANGenerator(
        vae_combine, test, epochs, latent_size, n_samples, n_layer,
        learning_rate, hidden_neurons, name, random_neg_sample, random_pos,
        model_type='VAE_CGAN')
    # cgan_sample[:, :-1]: presumably drops a trailing label column before
    # plotting — TODO confirm against run_CGANGenerator's output layout.
    tsne_data_comparision(random_pos, random_neg_sample, cgan_sample[:, :-1],
                          name, vae_sample, 'VAE_CGAN')
    # BUGFIX: removed the unreachable code that followed this return — it
    # referenced an undefined variable (get_combined_generated) and could
    # never execute.
    return cgan_sample, vae_sample, ks_result3, ks_result4, fid2
def vae_model():
    """Single VAE experiment with fixed 'best' hyper-parameters: generate
    samples, augment the training set, and evaluate classifiers.

    Relies on module-level globals: train_main, test_main, n_samples,
    random_neg.
    """
    epochs_base, latent_size, name = 50, 64, "test_250_64_64_2_0005_VAE_no_mmd"
    n_layer = 2
    hidden_neurons = 64
    learning_rate = 0.0005
    vae_sample, ks_result1, ks_result2, fid = run_VAEGenerator(
        train_main, test_main, epochs_base, latent_size, n_samples,
        n_layer, hidden_neurons, learning_rate, name, random_neg)
    augmented = get_combined_generated_real(vae_sample, train_main, name)
    train_X, train_y = split_XY(augmented)
    test_X, test_y = split_XY(test_main)
    multiple_classifier(train_X, train_y.values.ravel(),
                        test_X, test_y.values.ravel(),
                        name, ks_result1, ks_result2, fid)
def cgan_model():
    """Single CGAN experiment with fixed 'best' hyper-parameters: generate
    samples, augment the training set, and evaluate classifiers.

    Relies on module-level globals: train_main, test_main, n_samples,
    random_neg, random_pos.
    """
    epochs, latent_size = 500, 50
    name = "best_with_cgan_best"
    n_layer = 3
    hidden_neurons = 128
    learning_rate = 0.0005
    cgan_sample, ks_result1, ks_result2, fid = run_CGANGenerator(
        train_main, test_main, epochs, latent_size, n_samples, n_layer,
        learning_rate, hidden_neurons, name, random_neg, random_pos)
    augmented = get_combined_generated_real(cgan_sample, train_main, name)
    train_X, train_y = split_XY(augmented)
    test_X, test_y = split_XY(test_main)
    multiple_classifier(train_X, train_y.values.ravel(),
                        test_X, test_y.values.ravel(),
                        name, ks_result1, ks_result2, fid)
def run_baseline_model(train, test, n_sample, name):
    """SMOTE baseline: oversample the training set via SMOTEGenerator and
    evaluate the classifier battery on *test*.

    NOTE(review): the `name` parameter is accepted but unused; results are
    reported under the hard-coded label 'smote_train'.
    """
    smote = SMOTEGenerator(train, test)
    train_X, train_y = smote.generate_samples(n_sample)
    tsne_plot(train_X, "smoted_samples")
    test_X, test_y = split_XY(test)
    multiple_classifier(train_X, train_y.values.ravel(),
                        test_X, test_y.values.ravel(), 'smote_train')
def train_classify(method):
    """Evaluate a classifier on training data augmented with pre-generated
    synthetic samples loaded from disk.

    method: one of 'VAE', 'CGAN', 'VAE_CGAN'; anything else raises
    ValueError. Relies on module-level globals train_main and test_main.
    """
    if method == 'VAE':
        name = 'classify-vae'
        sample = pd.read_csv(r'pre_trained/data/VAE_no_mmd.csv')
        combined = get_combined_generated_real(sample, train_main, name)
        train_X, train_y = split_XY(combined)
        test_X, test_y = split_XY(test_main)
        threshold, threshold_cost, cm = xgb_classifier(train_X, train_y,
                                                       test_X, test_y, name)
        result(cm, 'VAE', 'classify-vae')
    elif method == 'CGAN':
        name = 'classify-cgan'
        cgan_sample = pd.read_csv(
            r'pre_trained/data/cgan_800_50_64_2_0002_cluster_test.csv')
        print(cgan_sample.shape)
        combined = get_combined_generated_real(cgan_sample, train_main, name)
        train_X, train_y = split_XY(combined)
        test_X, test_y = split_XY(test_main)
        # Random forest is used for CGAN (the XGBoost variant was disabled).
        threshold, threshold_cost, cm = rf_classifier(train_X, train_y,
                                                      test_X, test_y, name)
        result(cm, 'CGAN', 'rf_classify-cgan')
    elif method == 'VAE_CGAN':
        name = 'classify-VAE_CGAN'
        cgan_sample = pd.read_csv(
            r'pre_trained/data/vae_gan_gantrain_200_32_50_0002_test.csv')
        print(cgan_sample.shape)
        combined = get_combined_generated_real(cgan_sample, train_main, name)
        train_X, train_y = split_XY(combined)
        test_X, test_y = split_XY(test_main)
        threshold, threshold_cost, cm = xgb_classifier(train_X, train_y,
                                                       test_X, test_y, name)
        result(cm, 'VAE_CGAN', 'classify-VAE_CGAN')
    else:
        # BUGFIX(robustness): unknown methods previously fell through
        # silently, doing nothing.
        raise ValueError(f"unknown method: {method!r}")
def train_vae_cgan():
    """Grid-search the chained VAE->CGAN pipeline over imputation strategies,
    feature-engineering variants, and model hyper-parameters; train a
    classifier on the CGAN-augmented data for every configuration.

    NOTE(review): the classifier is scored with the VAE-stage metrics
    (ks_result1/ks_result2/fid) rather than the CGAN-stage metrics
    (ks_result3/ks_result4/fid2) — confirm this is intended.
    """
    datasets_dir = 'datasets/'
    imputation_types = ['mean', 'knn', 'NA']
    feature_engineerings = ['All', 'PCA', 'Features-selection']
    for imputation_type in imputation_types:
        run_id = "APS-VAE-CGAN_Both-" + imputation_type + "-"
        train_main, test_main = preprocess(data_dir=datasets_dir,
                                           imputation_type=imputation_type)
        for feature_engineering in feature_engineerings:
            if feature_engineering == 'PCA':
                train, test = load_PCA_data(train_main, test_main,
                                            datasets_dir, imputation_type)
                feature_n = '-PCA-'
            elif feature_engineering == 'Features-selection':
                train, test = feature_selection(train_main, test_main, 120)
                feature_n = '-Select_Features=' + str(120)
            else:
                feature_n = '-All_features-'
                train, test = train_main, test_main
            random_neg = get_random_sample(train, which='neg')
            random_pos = get_random_sample(train, which='pos')
            print(train.shape, test.shape)
            for i in [200, 400]:
                epochN = '-epochs=' + str(i)
                for j in [32, 64, 128]:
                    latentN = '-latent_size=' + str(j)
                    for n_samples in [2000, 5000]:
                        n_sampleN = '-n_samples=' + str(n_samples)
                        for n_layer in [2, 3]:
                            layer_N = '-n_layer=' + str(n_layer)
                            for hidden_neurons in [32, 64, 128]:
                                hidden_N = ('-hidden_neurons_base='
                                            + str(hidden_neurons))
                                for learning_rate in [0.0005, 0.0002]:
                                    lr_N = ('-learning_rate='
                                            + str(learning_rate))
                                    name = (run_id + feature_n + epochN
                                            + latentN + n_sampleN + layer_N
                                            + hidden_N + lr_N)
                                    # Stage 1: VAE generates minority samples.
                                    vae_sample, ks_result1, ks_result2, fid = run_VAEGenerator(
                                        train, test, i, j, n_samples,
                                        n_layer, hidden_neurons,
                                        learning_rate, name, random_neg)
                                    vae_augmented = get_combined_generated_real(
                                        vae_sample, train, "vae_gan")
                                    # Stage 2: CGAN trained on the
                                    # VAE-augmented data.
                                    cgan_sample, ks_result3, ks_result4, fid2 = run_CGANGenerator(
                                        vae_augmented, test, i, j, n_samples,
                                        n_layer, learning_rate,
                                        hidden_neurons, name, random_neg,
                                        random_pos)
                                    combined = get_combined_generated_real(
                                        cgan_sample, train, name)
                                    print("Count of Failure and non failure",
                                          combined.shape,
                                          combined.failure.value_counts())
                                    train_X, train_y = split_XY(combined)
                                    test_X, test_y = split_XY(test)
                                    multiple_classifier(
                                        train_X, train_y.values.ravel(),
                                        test_X, test_y.values.ravel(),
                                        name, ks_result1, ks_result2, fid)
def run_with_pretrained_model(method, model_flag):
    """Classify using pre-trained generators or pre-generated data.

    method: 'VAE', 'CGAN', or 'VAE_CGAN' — selects which synthetic-data
        source augments the training set.
    model_flag: when truthy, load the pre-trained generator model, sample
        5000 rows from it, and report distribution metrics; when falsy, load
        previously saved (already combined, except for VAE) data from CSV.

    Each branch trains both an XGBoost and a random-forest classifier and
    reports their confusion matrices via result().
    Relies on module-level globals: train_main, test_main, random_pos.
    """
    if method == 'VAE':
        print("========================Start VAE==========================")
        name = 'classify-vae'
        if model_flag:
            vae = get_model('pre_trained/model/VAE')
            # 32 is presumably the VAE's latent size — TODO confirm.
            vae_sample = base_generator.generate_samples(vae, 32, 5000)
            ks_result1, ks_result2 = compare_distribution(
                random_pos, vae_sample, name)
            fid = calculate_fid(random_pos, vae_sample)
            print("## VAE Quantative Analysis##")
            print("KS-Test 1 ", ks_result1)
            print("KS-Test 2 ", ks_result2)
            print(" FID ", fid)
            vae_combine = get_combined_generated_real(vae_sample, train_main,
                                                      name)
        else:
            vae_sample = pd.read_csv(r'pre_trained/data/VAE.csv')
            vae_combine = get_combined_generated_real(vae_sample, train_main,
                                                      name)
        (train_X, train_y) = split_XY(vae_combine)
        (test_X, test_y) = split_XY(test_main)
        threshold, threshold_cost, cm = xgb_classifier(train_X, train_y,
                                                       test_X, test_y, name)
        result(cm, 'VAE', 'XGB-classify-vae')
        threshold_rf, threshold_cost_rf, cm_rf = rf_classifier(
            train_X, train_y, test_X, test_y, name)
        result(cm_rf, 'VAE', 'RF-classify-vae')
        print("========================End VAE==========================")
    elif method == 'CGAN':
        print(
            "\n========================Start CGAN==========================\n")
        name = 'classify-cgan'
        if model_flag:
            cgan = get_model('pre_trained/model/CGAN')
            # 64 is presumably the CGAN's latent size — TODO confirm.
            cgan_sample = base_generator.generate_samples(cgan, 64, 5000)
            ks_result1, ks_result2 = compare_distribution(
                random_pos, cgan_sample, name)
            fid = calculate_fid(random_pos, cgan_sample)
            print("## CGAN Quantative Analysis##")
            print("KS-Test 1 ", ks_result1)
            print("KS-Test 2 ", ks_result2)
            print(" FID ", fid)
            cgan_combine = get_combined_generated_real(cgan_sample,
                                                       train_main, name)
        else:
            # Pre-combined CGAN + train data saved earlier.
            cgan_combine = pd.read_csv(r'pre_trained/data/CGAN.csv',
                                       index_col=0)
        (train_X, train_y) = split_XY(cgan_combine)
        (test_X, test_y) = split_XY(test_main)
        # Align test columns with the (possibly reduced) training columns.
        test_X = test_X[train_X.columns]
        threshold, threshold_cost, cm = xgb_classifier(train_X, train_y,
                                                       test_X, test_y, name)
        # BUGFIX: this XGBoost run was mislabelled 'XGB-classify-vae'.
        result(cm, 'CGAN', 'XGB-classify-cgan')
        threshold_rf, threshold_cost_rf, cm_rf = rf_classifier(
            train_X, train_y, test_X, test_y, name)
        result(cm_rf, 'CGAN', 'RF-classify-cgan')
        print("\n========================End CGAN==========================")
    elif method == 'VAE_CGAN':
        print(
            "\n========================Start VAE-CGAN==========================\n"
        )
        name = 'classify-VAE_CGAN'
        if model_flag:
            # First report VAE-stage metrics, then VAE-CGAN metrics.
            vae = get_model('pre_trained/model/VAE')
            vae_sample = base_generator.generate_samples(vae, 50, 5000)
            ks_result1, ks_result2 = compare_distribution(
                random_pos, vae_sample, name)
            fid = calculate_fid(random_pos, vae_sample)
            print("##VAE Quantative Analysis##")
            print("KS-Test 1 ", ks_result1)
            print("KS-Test 2 ", ks_result2)
            print(" FID ", fid)
            cgan = get_model('pre_trained/model/VAE_CGAN')
            cgan_sample = base_generator.generate_samples(cgan, 32, 5000)
            ks_result1, ks_result2 = compare_distribution(
                random_pos, cgan_sample, name)
            fid = calculate_fid(random_pos, cgan_sample)
            print("##VAE-CGAN Quantative Analysis##")
            print("KS-Test 1 ", ks_result1)
            print("KS-Test 2 ", ks_result2)
            print(" FID ", fid)
            vae_cgan_combine = get_combined_generated_real(
                cgan_sample, train_main, name)
        else:
            # Pre-combined VAE-CGAN + train data saved earlier.
            vae_cgan_combine = pd.read_csv(r'pre_trained/data/VAE_CGAN.csv',
                                           index_col=0)
            print("VAE CGAN combine shape::", vae_cgan_combine.shape)
        (train_X, train_y) = split_XY(vae_cgan_combine)
        (test_X, test_y) = split_XY(test_main)
        threshold, threshold_cost, cm = xgb_classifier(train_X, train_y,
                                                       test_X, test_y, name)
        result(cm, 'VAE_CGAN', 'XGB-classify-VAE_CGAN')
        threshold_rf, threshold_cost_rf, cm_rf = rf_classifier(
            train_X, train_y, test_X, test_y, name)
        result(cm_rf, 'VAE_CGAN', 'RF-classify-VAE-CGAN')
        print("========================End VAE-CGAN==========================")
def generate_samples(self, n_samples):
    """Return SMOTE-balanced (X, y) built from self.train_data.

    NOTE(review): `n_samples` is accepted but ignored — SMOTE() runs with its
    default sampling strategy; confirm whether the requested count should be
    honoured.
    """
    features, labels = split_XY(self.train_data)
    balanced_X, balanced_y = SMOTE().fit_resample(features, labels)
    return balanced_X, balanced_y