def apply_aec(data_type, with_noise):  # auto-encode cluster
    """Cluster every (setting, repeat) data set of the global ``DATA`` dict
    in an auto-encoder latent space via ``run_cluster_latents``.

    The feature matrix is first pre-processed according to the module-level
    global ``type_of_data`` ('Q' quantitative, 'C' categorical, 'M' mixed),
    and the selected representation / P-matrix combination is picked by
    ``data_type``.  Only the setting named by the global ``setting_`` is
    processed, unless ``setting_ == 'all'``.

    Parameters
    ----------
    data_type : str
        Lower-case selector: "np" (raw Y/P), "z-u"/"z-m"/"z-l" (z-scored Y
        with the Pu/Pm/Pl variant of P), or "rng-u"/"rng-m"/"rng-l"
        (range-normalised Y with Pu/Pm/Pl).  Pu/Pm/Pl are the three P
        variants returned by ``ds.preprocess_P`` (exact semantics defined
        there -- TODO confirm).
    with_noise : int
        0 to use the noise-free matrix Y, 1 to use the noisy matrix Yn.

    Returns
    -------
    tuple of dict
        ``(kmeans_ms, agg_ms, gmm_ms, GT_ms)`` -- K-means, Agglomerative
        and GMM cluster labels plus ground-truth test labels, each keyed by
        setting and then by repeat.

    Notes
    -----
    Relies on the module-level globals ``DATA``, ``setting_``,
    ``type_of_data``, ``name``, ``data_name``, ``ds``, ``n_epochs`` and
    ``latent_dim_ratio``.
    """
    # Global initialization: one nested dict per clustering method.
    kmeans_ms = {}  # K-means results
    agg_ms = {}  # Agglomerative results
    gmm_ms = {}  # Gaussian Mixture Model
    GT_ms = {}  # Ground Truth
    if setting_ != 'all':
        # Process only the single setting whose name matches the global `setting_`.
        for setting, repeats in DATA.items():
            if str(setting) == setting_:
                print("setting:", setting, )
                kmeans_ms[setting] = {}
                agg_ms[setting] = {}
                gmm_ms[setting] = {}
                GT_ms[setting] = {}
                for repeat, matrices in repeats.items():
                    print("repeat:", repeat)
                    GT = matrices['GT']
                    Y = matrices['Y'].astype("float32")
                    P = matrices['P'].astype("float32")
                    Yn = matrices['Yn']
                    # Yn may be an empty placeholder when no noisy variant exists.
                    if len(Yn) != 0:
                        Yn = Yn.astype('float32')
                    N, V = Y.shape  # N entities, V features
                    # Quantitative case (a name ending in '(r' is also treated
                    # as quantitative -- presumably a "real-valued" tag; TODO confirm).
                    if type_of_data == 'Q' or name.split('(')[-1] == 'r':
                        _, _, Yz, _, Yrng, _, = ds.preprocess_Y(Yin=Y, data_type='Q')
                        if with_noise == 1:
                            Yn, _, Ynz, _, Ynrng, _, = ds.preprocess_Y(Yin=Yn, data_type='Q')
                    # Because there is no Yn in the case of categorical features.
                    if type_of_data == 'C':
                        enc = OneHotEncoder(sparse=False, categories='auto')
                        Y_oneHot = enc.fit_transform(Y)  # .astype("float32")  # oneHot encoding
                        # for WITHOUT follow-up rescale Y_oneHot and for WITH follow-up
                        # Y_oneHot should be replaced with Y
                        Y, _, Yz, _, Yrng, _, = ds.preprocess_Y(Yin=Y_oneHot, data_type='C')
                    # Mixed case: first half of the columns quantitative,
                    # second half categorical.
                    if type_of_data == 'M':
                        Vq = int(np.ceil(V / 2))  # number of quantitative features -- Y[:, :Vq]
                        Vc = int(np.floor(V / 2))  # number of categorical features -- Y[:, Vq:]
                        Y_q, _, Yz_q, _, Yrng_q, _, = ds.preprocess_Y(Yin=Y[:, :Vq], data_type='Q')
                        enc = OneHotEncoder(sparse=False, categories='auto',)
                        Y_oneHot = enc.fit_transform(Y[:, Vq:])  # oneHot encoding
                        # for WITHOUT follow-up rescale Y_oneHot and for WITH follow-up
                        # Y_oneHot should be replaced with Y
                        Y_c, _, Yz_c, _, Yrng_c, _, = ds.preprocess_Y(Yin=Y_oneHot, data_type='C')
                        # Re-assemble: quantitative block followed by one-hot categorical block.
                        Y = np.concatenate([Y[:, :Vq], Y_oneHot], axis=1)
                        Yrng = np.concatenate([Yrng_q, Yrng_c], axis=1)
                        Yz = np.concatenate([Yz_q, Yz_c], axis=1)
                        if with_noise == 1:
                            Vq = int(np.ceil(V / 2))  # number of quantitative features -- Y[:, :Vq]
                            Vc = int(np.floor(V / 2))  # number of categorical features -- Y[:, Vq:]
                            Vqn = (Vq + Vc)  # the column index of which noise model1 starts
                            _, _, Ynz_q, _, Ynrng_q, _, = ds.preprocess_Y(Yin=Yn[:, :Vq], data_type='Q')
                            enc = OneHotEncoder(sparse=False, categories='auto',)
                            Yn_oneHot = enc.fit_transform(Yn[:, Vq:Vqn])  # oneHot encoding
                            # for WITHOUT follow-up rescale Yn_oneHot and for WITH
                            # follow-up Yn_oneHot should be replaced with Y
                            Yn_c, _, Ynz_c, _, Ynrng_c, _, = ds.preprocess_Y(Yin=Yn_oneHot, data_type='C')
                            # NOTE(review): Yz/Yrng are overwritten here with
                            # noisy statistics and reused in the concatenations
                            # below -- verify this is intentional.
                            Y_ = np.concatenate([Yn[:, :Vq], Yn_c], axis=1)
                            Yrng = np.concatenate([Ynrng_q, Ynrng_c], axis=1)
                            Yz = np.concatenate([Ynz_q, Ynz_c], axis=1)
                            # Columns from Vqn onward are the appended noise
                            # features, treated as quantitative.
                            _, _, Ynz_, _, Ynrng_, _, = ds.preprocess_Y(Yin=Yn[:, Vqn:], data_type='Q')
                            # NOTE(review): Yn_ is assigned but never used
                            # afterwards; the "np"-with-noise branch below
                            # passes the raw Yn instead -- possible oversight.
                            Yn_ = np.concatenate([Y_, Yn[:, Vqn:]], axis=1)
                            Ynrng = np.concatenate([Yrng, Ynrng_], axis=1)
                            Ynz = np.concatenate([Yz, Ynz_], axis=1)
                    # Pu/Pm/Pl: three pre-processed variants of P from ds.preprocess_P.
                    P, _, _, Pu, _, _, Pm, _, _, Pl, _, _ = ds.preprocess_P(P=P)
                    # Pre-processing - Without Noise
                    if data_type == "NP".lower() and with_noise == 0:
                        print("NP")
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y, P, GT, n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "z-u".lower() and with_noise == 0:
                        print("z-u")
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Yz, P=Pu, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "z-m".lower() and with_noise == 0:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Yz, P=Pm, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "z-l".lower() and with_noise == 0:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Yz, P=Pl, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "rng-u".lower() and with_noise == 0:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Yrng, P=Pu, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "rng-m".lower() and with_noise == 0:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Yrng, P=Pm, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "rng-l".lower() and with_noise == 0:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Yrng, P=Pl, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    # Pre-processing - With Noise
                    if data_type == "NP".lower() and with_noise == 1:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Yn, P=P, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "z-u".lower() and with_noise == 1:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Ynz, P=Pu, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "z-m".lower() and with_noise == 1:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Ynz, P=Pm, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "z-l".lower() and with_noise == 1:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Ynz, P=Pl, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "rng-u".lower() and with_noise == 1:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Ynrng, P=Pu, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "rng-m".lower() and with_noise == 1:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Ynrng, P=Pm, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    elif data_type == "rng-l".lower() and with_noise == 1:
                        kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                            Y=Ynrng, P=Pl, GT=GT, n_epochs=n_epochs,
                            latent_dim_ratio=latent_dim_ratio,
                            repeat=repeat, name=data_name, setting=setting)
                    kmeans_ms[setting][repeat] = kmeans_labels
                    agg_ms[setting][repeat] = agg_labels
                    gmm_ms[setting][repeat] = gmm_labels
                    GT_ms[setting][repeat] = y_test
        # NOTE(review): message is missing spaces around setting_
        # (prints e.g. "...on thefirstdata set!").
        print("Algorithm is applied on the" + setting_ + "data set!")
    if setting_ == 'all':
        # Same pipeline applied to every setting in DATA.
        # NOTE(review): this branch's 'C' and 'M' pre-processing differs
        # from the single-setting branch above (different OneHotEncoder
        # construction, different preprocess_Y data_type codes, reversed
        # concatenation order, raw one-hot blocks concatenated into
        # Yz/Yrng) -- verify which variant is the intended one.
        for setting, repeats in DATA.items():
            print("setting:", setting, )
            kmeans_ms[setting] = {}
            agg_ms[setting] = {}
            gmm_ms[setting] = {}
            GT_ms[setting] = {}
            for repeat, matrices in repeats.items():
                print("repeat:", repeat)
                GT = matrices['GT']
                Y = matrices['Y'].astype('float32')
                P = matrices['P'].astype('float32')
                Yn = matrices['Yn']
                # Yn may be an empty placeholder when no noisy variant exists.
                if len(Yn) != 0:
                    Yn = Yn.astype('float32')
                N, V = Y.shape  # N entities, V features
                # Quantitative case
                if type_of_data == 'Q' or name.split('(')[-1] == 'r':
                    _, _, Yz, _, Yrng, _, = ds.preprocess_Y(Yin=Y, data_type='Q')
                    if with_noise == 1:
                        Yn, _, Ynz, _, Ynrng, _, = ds.preprocess_Y(Yin=Yn, data_type='Q')
                # Because there is no Yn in the case of categorical features.
                if type_of_data == 'C':
                    enc = OneHotEncoder()  # categories='auto')
                    Y = enc.fit_transform(Y)  # oneHot encoding
                    # Default OneHotEncoder output is sparse here, hence the
                    # densification before preprocessing.
                    Y = Y.toarray()
                    # Boris's Theory
                    Y, _, Yz, _, Yrng, _, = ds.preprocess_Y(Yin=Y, data_type='C')
                # Mixed case: first half quantitative, second half categorical.
                if type_of_data == 'M':
                    Vq = int(np.ceil(V / 2))  # number of quantitative features -- Y[:, :Vq]
                    Vc = int(np.floor(V / 2))  # number of categorical features -- Y[:, Vq:]
                    Y_, _, Yz_, _, Yrng_, _, = ds.preprocess_Y(Yin=Y[:, :Vq], data_type='M')
                    enc = OneHotEncoder(sparse=False, )  # categories='auto', )
                    Y_oneHot = enc.fit_transform(Y[:, Vq:])  # oneHot encoding
                    # NOTE(review): unlike the branch above, the raw one-hot
                    # block is concatenated into Yz/Yrng (not a rescaled
                    # version) and the block order is categorical-first.
                    Y = np.concatenate([Y_oneHot, Y[:, :Vq]], axis=1)
                    Yrng = np.concatenate([Y_oneHot, Yrng_], axis=1)
                    Yz = np.concatenate([Y_oneHot, Yz_], axis=1)
                    if with_noise == 1:
                        Vq = int(np.ceil(V / 2))  # number of quantitative features -- Y[:, :Vq]
                        Vc = int(np.floor(V / 2))  # number of categorical features -- Y[:, Vq:]
                        Vqn = (Vq + Vc)  # the column index of which noise model1 starts
                        _, _, Yz_, _, Yrng_, _, = ds.preprocess_Y(Yin=Yn[:, :Vq], data_type='M')
                        enc = OneHotEncoder(sparse=False, )  # categories='auto',)
                        Yn_oneHot = enc.fit_transform(Yn[:, Vq:Vqn])  # oneHot encoding
                        Y_ = np.concatenate([Yn_oneHot, Yn[:, :Vq]], axis=1)
                        Yrng = np.concatenate([Yn_oneHot, Yrng_], axis=1)
                        Yz = np.concatenate([Yn_oneHot, Yz_], axis=1)
                        # Columns from Vqn onward are the appended noise features.
                        _, _, Ynz_, _, Ynrng_, _, = ds.preprocess_Y(Yin=Yn[:, Vqn:], data_type='M')
                        # NOTE(review): Yn_ is assigned but never used
                        # afterwards -- possible oversight.
                        Yn_ = np.concatenate([Y_, Yn[:, Vqn:]], axis=1)
                        Ynrng = np.concatenate([Yrng, Ynrng_], axis=1)
                        Ynz = np.concatenate([Yz, Ynz_], axis=1)
                # Pu/Pm/Pl: three pre-processed variants of P from ds.preprocess_P.
                P, _, _, Pu, _, _, Pm, _, _, Pl, _, _ = ds.preprocess_P(P=P)
                # Pre-processing - Without Noise
                if data_type == "NP".lower() and with_noise == 0:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Y, P=P, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "z-u".lower() and with_noise == 0:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Yz, P=Pu, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "z-m".lower() and with_noise == 0:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Yz, P=Pm, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "z-l".lower() and with_noise == 0:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Yz, P=Pl, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "rng-u".lower() and with_noise == 0:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Yrng, P=Pu, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "rng-m".lower() and with_noise == 0:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Yrng, P=Pm, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "rng-l".lower() and with_noise == 0:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Yrng, P=Pl, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                # Pre-processing - With Noise
                if data_type == "NP".lower() and with_noise == 1:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Yn, P=P, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "z-u".lower() and with_noise == 1:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Ynz, P=Pu, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "z-m".lower() and with_noise == 1:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Ynz, P=Pm, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "z-l".lower() and with_noise == 1:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Ynz, P=Pl, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "rng-u".lower() and with_noise == 1:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Ynrng, P=Pu, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "rng-m".lower() and with_noise == 1:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Ynrng, P=Pm, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                elif data_type == "rng-l".lower() and with_noise == 1:
                    kmeans_labels, agg_labels, gmm_labels, y_test = run_cluster_latents(
                        Y=Ynrng, P=Pl, GT=GT, n_epochs=n_epochs,
                        latent_dim_ratio=latent_dim_ratio,
                        repeat=repeat, name=data_name, setting=setting)
                kmeans_ms[setting][repeat] = kmeans_labels
                agg_ms[setting][repeat] = agg_labels
                gmm_ms[setting][repeat] = gmm_labels
                GT_ms[setting][repeat] = y_test
        print("Algorithm is applied on the entire data set!")
    return kmeans_ms, agg_ms, gmm_ms, GT_ms
def apply_the_algorithm(data_type, ):  # auto-encode cluster
    """Run ``run_the_algorithm`` on every (setting, repeat) split in DATA.

    Parameters
    ----------
    data_type : str
        Pre-processing selector (lower-case): "np" for no pre-processing,
        "z" for z-scored features, "rng" for range-normalised features.

    Returns
    -------
    tuple of dict
        ``(alg_ms, gt_ms)``: predicted test labels and ground-truth test
        labels, keyed by setting and then by repeat.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of "np", "z", "rng".  (The original
        code fell through every branch and crashed with a NameError.)

    Notes
    -----
    Relies on the module-level globals ``DATA``, ``ds``, ``n_epochs``
    and ``name``.
    """
    # Validate up front instead of crashing on an unbound variable later.
    if data_type not in ("np", "z", "rng"):
        raise ValueError("unknown data_type: %r" % (data_type,))
    # Global initialization
    alg_ms = {}  # the algorithm results, per (setting, repeat)
    gt_ms = {}  # Ground Truth, per (setting, repeat)
    for setting, repeats in DATA.items():
        print("setting:", setting, )
        alg_ms[setting] = {}
        gt_ms[setting] = {}
        for repeat, matrices in repeats.items():
            print("repeat:", repeat)
            X_tr = matrices['X_tr'].astype('float32')
            X_vl = matrices['X_vl'].astype('float32')
            X_ts = matrices['X_ts'].astype('float32')
            y_tr = matrices['y_tr'].astype('float32')
            y_vl = matrices['y_vl'].astype('float32')
            y_ts = matrices['y_ts'].astype('float32')
            # z-scored (Xz_*) and range-normalised (Xr_*) variants of each split.
            _, _, Xz_tr, _, Xr_tr, _, = ds.preprocess_Y(Yin=X_tr, data_type='Q')
            _, _, Xz_vl, _, Xr_vl, _, = ds.preprocess_Y(Yin=X_vl, data_type='Q')
            _, _, Xz_ts, _, Xr_ts, _, = ds.preprocess_Y(Yin=X_ts, data_type='Q')
            # Dispatch table replaces the three duplicated if/elif branches
            # that differed only in which X variant was passed along.
            variants = {
                "np": ("No Pre-Proc.", X_tr, X_vl, X_ts),
                "z": ("Z-score", Xz_tr, Xz_vl, Xz_ts),
                "rng": ("Rng", Xr_tr, Xr_vl, Xr_ts),
            }
            label, X_train, X_val, X_test = variants[data_type]
            print(label)
            alg_x_test_labels, y_test = run_the_algorithm(
                X_train=X_train, y_train=y_tr,
                X_val=X_val, y_val=y_vl,
                X_test=X_test, y_test=y_ts,
                n_epochs=n_epochs, repeat=repeat,
                ds_name=name, setting=setting)
            alg_ms[setting][repeat] = alg_x_test_labels
            gt_ms[setting][repeat] = y_test
        # Fixed: the original message lacked spaces around the setting name.
        print("Algorithm is applied on the " + str(setting) + " data set!")
    return alg_ms, gt_ms
def apply_aec(data_type, with_noise):  # auto-encode cluster
    """Train the plain auto-encoder (``run_ae``) on every selected split.

    NOTE(review): this re-definition shadows the earlier ``apply_aec``
    defined above in this module; only this version is reachable by name.

    Processes every setting in the global ``DATA``, or only the one named
    by the global ``setting_`` when it is not ``'all'``.

    Parameters
    ----------
    data_type : str
        Pre-processing selector (lower-case): "np" for no pre-processing,
        "z" for z-scored features, "rng" for range-normalised features.
    with_noise : int
        Only 0 is supported here.  (The original code matched no branch
        for other values and crashed with a NameError.)

    Returns
    -------
    tuple of dict
        ``(AE_ms, GT_ms)``: auto-encoder test labels and ground-truth
        test labels, keyed by setting and then by repeat.

    Raises
    ------
    ValueError
        For an unknown ``data_type`` or a non-zero ``with_noise``.

    Notes
    -----
    Relies on the module-level globals ``DATA``, ``setting_``, ``ds``,
    ``n_epochs``, ``latent_dim_ratio`` and ``name``.
    """
    # Validate up front instead of crashing on an unbound variable later.
    if with_noise != 0:
        raise ValueError("apply_aec only supports with_noise == 0")
    if data_type not in ("np", "z", "rng"):
        raise ValueError("unknown data_type: %r" % (data_type,))
    # Global initialization
    AE_ms = {}  # auto-encoder test labels, per (setting, repeat)
    GT_ms = {}  # Ground Truth, per (setting, repeat)
    for setting, repeats in DATA.items():
        # The original duplicated the whole body for `setting_ == 'all'`
        # and a single named setting; the two copies were identical, so
        # they are merged here behind a skip filter.
        if setting_ != 'all' and str(setting) != setting_:
            continue
        print("setting:", setting, )
        AE_ms[setting] = {}
        GT_ms[setting] = {}
        for repeat, matrices in repeats.items():
            print("repeat:", repeat)
            X_tr = matrices['X_tr'].astype('float32')
            X_vl = matrices['X_vl'].astype('float32')
            X_ts = matrices['X_ts'].astype('float32')
            y_tr = matrices['y_tr'].astype('float32')
            y_vl = matrices['y_vl'].astype('float32')
            y_ts = matrices['y_ts'].astype('float32')
            # z-scored (Xz_*) and range-normalised (Xr_*) variants of each split.
            _, _, Xz_tr, _, Xr_tr, _, = ds.preprocess_Y(Yin=X_tr, data_type='Q')
            _, _, Xz_vl, _, Xr_vl, _, = ds.preprocess_Y(Yin=X_vl, data_type='Q')
            _, _, Xz_ts, _, Xr_ts, _, = ds.preprocess_Y(Yin=X_ts, data_type='Q')
            # Dispatch table replaces the duplicated if/elif branches.
            variants = {
                "np": ("No Pre-Proc.", X_tr, X_vl, X_ts),
                "z": ("Z-score", Xz_tr, Xz_vl, Xz_ts),
                "rng": ("Rng", Xr_tr, Xr_vl, Xr_ts),
            }
            label, X_train, X_val, X_test = variants[data_type]
            print(label)
            AE_X_test_labels, y_test = run_ae(
                X_train=X_train, y_train=y_tr,
                X_val=X_val, y_val=y_vl,
                X_test=X_test, y_test=y_ts,
                n_epochs=n_epochs, latent_dim_ratio=latent_dim_ratio,
                repeat=repeat, name=name, setting=setting)
            AE_ms[setting][repeat] = AE_X_test_labels
            GT_ms[setting][repeat] = y_test
    if setting_ == 'all':
        print("Algorithm is applied on the entire data set!")
    else:
        # Fixed: the original message lacked spaces around the setting name.
        print("Algorithm is applied on the " + setting_ + " data set!")
    return AE_ms, GT_ms