# Rank the components by descending `kurt`: argsort of the negated values
# puts the largest entry first. (`kurt` is computed above this excerpt --
# presumably the per-component kurtosis; confirm.)
top_kurt_ind = (-kurt).argsort()

# IC: hand the two top-ranked columns of X_ica to the cluster plot helper.
examine_credit_cluster(
    X_ica[:, top_kurt_ind[:2]],
    y,
    title=f"ICA transformation on {data_key} data",
    xylabel=["IC1", "IC2"],
    fname=f"output/ica_ic_cluster_{data_key}.png",
)

if data_key == "fashion":
    # IC: the 25 top-ranked rows of ica.components_, labelled 0..24.
    plot_fashion_cluster(
        ica.components_[top_kurt_ind[:25], :],
        range(25),
        fname="output/ica_ic_fashion.png",
    )

    # reconstructed image: map the ICA representation back with the fitted
    # model and plot it against the true labels.
    X_recon = ica.inverse_transform(X_ica)
    plot_fashion_cluster(X_recon, y, fname="output/ica_reconstructed_fashion.png")

    # recon vs k: refit with nc = 2, 10, ..., 194 components (25 settings).
    # NOTE: `ica` and `X_recon` are rebound on every iteration.
    sample = []
    for nc in range(2, 201, 8):
        # This loop fits FastICA, so the progress message says "IC"
        # (it previously said "PC", apparently copied from the PCA section).
        print(f"Reconstructing with {nc} IC...")
        # NOTE(review): recent scikit-learn releases expect a string value for
        # FastICA's `whiten` -- confirm against the installed version.
        ica = FastICA(n_components=nc, whiten=True, random_state=0)
        X_recon = ica.inverse_transform(ica.fit_transform(X))
# Write out the current matplotlib figure (built above this excerpt) and
# close it so later plots start from a clean figure.
plt.savefig(f"output/pca_eigenval_{data_key}.png")
plt.close()

# PC: pass the first two columns of X_pca to the cluster plot helper.
examine_credit_cluster(
    X_pca[:, :2],
    y,
    title=f"PCA transformation on {data_key} data",
    xylabel=["PC1", "PC2"],
    fname=f"output/pca_pc_cluster_{data_key}.png",
)

if data_key == "fashion":
    # PC image: the first 25 rows of pca.components_, labelled 0..24.
    plot_fashion_cluster(
        pca.components_[:25, :], range(25), fname="output/pca_pc_fashion.png"
    )

    # reconstructed image
    # `pca` is rebound here. Assuming PCA is scikit-learn's, a fractional
    # n_components keeps as many components as needed to explain that share
    # of the variance; the resulting count is printed below.
    pca = PCA(n_components=0.95, whiten=True, random_state=0)
    X_recon = pca.inverse_transform(pca.fit_transform(X))
    print(f"...Keep {pca.n_components_} components for {data_key} data...")
    plot_fashion_cluster(X_recon, y, fname="output/pca_reconstructed_fashion.png")

    # recon vs k: refit with nc = 2, 10, ..., 194 components (25 settings)
    # and keep the reconstruction of the first row of X for each setting.
    sample = []
    for nc in range(2, 201, 8):
        print(f"Reconstructing with {nc} PC...")
        pca = PCA(n_components=nc, whiten=True, random_state=0)
        X_recon = pca.inverse_transform(pca.fit_transform(X))
        sample.append(X_recon[0])
# Persist the label table (built above this excerpt) without the index column.
k_labels_df.to_csv(f"data/kmeans_labels_{data_key}.csv", index=False)

# One subplot per column of res_df, then save and close the figure.
res_df.plot(
    subplots=True,
    style=".-",
    title=f"KMeans performance vs n_clusters on {data_key} data",
)
plt.xlabel("n_clusters")
plt.savefig(f"output/kmeans_{data_key}.png")
plt.close()

if "credit" in data_key:
    # Substring test: any data_key containing "credit" takes this branch.
    # Both plots use the same two feature columns (the first two entries of
    # TOP_FEATURES); only the labels differ.
    examine_credit_cluster(
        X.values[:, TOP_FEATURES[:2]],
        y,
        title="True Label",
        xylabel=DATA["credit"][0].columns[TOP_FEATURES[:2]],
        fname=f"output/true_cluster_{data_key}.png",
    )
    # k_labels_df is indexed by the number of distinct values in y --
    # presumably its columns are keyed by n_clusters; verify where it is built.
    examine_credit_cluster(
        X.values[:, TOP_FEATURES[:2]],
        k_labels_df[y.nunique()],
        title="KMeans",
        xylabel=DATA["credit"][0].columns[TOP_FEATURES[:2]],
        fname=f"output/kmeans_cluster_{data_key}.png",
    )

if data_key == "fashion":
    # Same column selection as above, via np.unique instead of Series.nunique.
    plot_fashion_cluster(X, k_labels_df[len(np.unique(y))], fname="output/kmeans_cluster_fashion.png")
# Trailing keyword arguments of a call whose opening lies above this excerpt.
title=f"RCA transformation on {data_key} data",
xylabel=["RC1", "RC2"],
fname=f"output/rca_rc_cluster_{data_key}.png",
)

if data_key == "fashion":
    # recon vs k: 25 component counts spaced evenly from 2 up to the number
    # of columns of X (truncated to int by astype).
    sample = []
    for nc in np.linspace(2, X.shape[1], 25).astype(int):
        print(f"Reconstructing with {nc} RC...")
        rca = GaussianRandomProjection(n_components=nc, random_state=0)
        X_rca = rca.fit_transform(X)
        # Approximate inverse of the projection via the pseudo-inverse of
        # components_.T (assumes the transform is X @ components_.T, as in
        # scikit-learn's random projection -- confirm).
        X_recon = np.dot(X_rca, np.linalg.pinv(rca.components_.T))
        # Keep the reconstruction of the first row for each setting.
        sample.append(X_recon[0])
    plot_fashion_cluster(np.array(sample), range(25), fname="output/rca_recon_vs_k_fashion.png")
elif data_key == "credit":
    # Mean / standard deviation of the reconstruction correlation for each
    # component count from 1 to n_features - 1.
    corr_mean = []
    corr_std = []
    for nc in range(1, X.shape[1]):
        print(f"Reconstructing with {nc} RC...")
        corr = []
        for i in range(100):
            # No random_state is passed here (unlike the fashion branch), so
            # presumably each of the 100 repetitions draws a new projection.
            rca = GaussianRandomProjection(n_components=nc)
            X_rca = rca.fit_transform(X)
            X_recon = np.dot(X_rca, np.linalg.pinv(rca.components_.T))
            # Correlation coefficient between the flattened original and the
            # flattened reconstruction ([0, 1] is the off-diagonal entry).
            corr.append(
                np.corrcoef(X.values.flatten(), X_recon.flatten())[0, 1])
        corr_mean.append(np.mean(corr))
        corr_std.append(np.std(corr))