plt.ylabel("$Gap(k)-Gap(k_1)+s_{k+1}$") plt.savefig(images+"/tibshirani_gap/"+data_info+"_tibshirani_compare_gap_gaussian.png") plt.close() # looks like 3 clusters win here... X_plotable = np.tile(X_used,(2,1)) X_plotable[X_used.shape[0]:,:]=X_mc coloring = np.ones(X_used.shape[0]*2) coloring[X_used.shape[0]:]=0 three_d_plot_funct(X_plotable,predictions=coloring,save=False) plt.title("Null Guassian vs actual, Gap approach") plt.savefig(images+"/tibshirani_gap/"+data_info+"_tibshirani_null_vs_actual_guassian.png") plt.close() ###### # null distribution is uniform ###### run_uniform_mc = True if run_uniform_mc == True: def bounding_box(X): range_features = [] for feature_num in np.arange(X.shape[-1]):
"bad_SALT2","clare_mega_bad","clare_probably_bad"] # visualizing bad ones coloring_bad = np.zeros(len(names_full)) names_in_mine = {} for i,lists in enumerate(special_look): for element in lists: if element in names_full: names_in_mine[element]=special_look_names[i] coloring_bad[names_full==element]=(i+1) coloring_standardized = coloring_bad.copy() for i,cluster in enumerate(set(coloring_bad)): coloring_standardized[coloring_bad==cluster]=i three_d_plot_funct(X_full,coloring_standardized,save=False) # my suggested bad SN leverage,rank=leverage_make(X_full) bad=names_full[np.max(leverage)==leverage] sorted(leverage)[1] second = names_full[sorted(leverage)[1]==leverage]
storage_prediction = prediction if X.shape[0]!=X_full.shape[0]: storage_prediction = np.ones((X_full.shape[0]))*-1 # for the outliers storage_prediction[kept_truth] = prediction kmean_predictions[num_data][:,index_clusters,kk]=storage_prediction # imaging image_extension = "kmeans"+"("+str(num_clusters)+")_"+data_names[num_data]+"_"+grey_output+".png" # pairs plots pair_plot_funct(pairplotsX,image_extension=image_extension) # 3d plots three_d_plot_funct(X,prediction,np.arange(X.shape[-1])[-3:], image_extension=image_extension) sys.stdout.write("-") sys.stdout.flush() sys.stdout.write("\n") ################### #### Dirichlet #### ################### cluster_choices = [2,3] dirichlet_predictions= [np.zeros((X_full.shape[0],len(cluster_choices),2)) for x in np.arange(4)]
# ---- Pham et al. f(K) distortion curve and the 3-cluster solution it suggests ----
plt.plot(cluster_num_options, quantiles_5_95[i, :], linestyle="-", label=ld, alpha=.3)
plt.legend(loc=2)
plt.title("Pham distortions")
# Horizontal reference line at the conventional 0.85 decision threshold.
plt.plot(cluster_num_options, .85 * np.ones(cluster_num_options.shape[0]))
plt.savefig(images + "/pham_improvement/" + data_info + "_pham_distortions.png")
plt.close()

# so Pham suggests 3 clusters might be best:
kmean = sklearn.cluster.KMeans(3)
pred = kmean.fit_predict(X_used)

# 3-D scatter of the last three features, coloured by the 3-cluster labels.
plt.close()
plt.figure()
three_d_plot_funct(X_used[:, -3:], pred, rotation=134, save=False)
plt.title("3 clusters")
plt.savefig(images + "/pham_improvement/" + data_info + "_pham_3_clusters_3d.png")
plt.close()

# Pair plot of all features with the cluster label appended as a final column.
pairplotsX = pd.concat([pd.DataFrame(X_used), pd.DataFrame(pred)], axis=1)
pairplotsX.columns = ["x" + str(i) for i in range(X_used.shape[-1])] + ["prediction"]
plt.close()
plt.figure()
pair_plot_funct(pairplotsX, save=False)
plt.savefig(images + "/pham_improvement/" + data_info + "_pham_3_clusters_pairs.png")
plt.close()