Example #1
	plt.ylabel("$Gap(k)-Gap(k_1)+s_{k+1}$")
	plt.savefig(images+"/tibshirani_gap/"+data_info+"_tibshirani_compare_gap_gaussian.png")
	plt.close()

	

	# looks like 3 clusters win here...

	# stack the actual data on top of the Monte Carlo (null) sample so both can be plotted together
	X_plotable = np.tile(X_used,(2,1))

	X_plotable[X_used.shape[0]:,:] = X_mc

	# color the actual points 1 and the null points 0
	coloring = np.ones(X_used.shape[0]*2)
	coloring[X_used.shape[0]:] = 0

	three_d_plot_funct(X_plotable,predictions=coloring,save=False)
	plt.title("Null Guassian vs actual, Gap approach")
	plt.savefig(images+"/tibshirani_gap/"+data_info+"_tibshirani_null_vs_actual_gaussian.png")
	plt.close()
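	# A hedged sketch of the Tibshirani gap-statistic decision rule the plot above
	# visualizes: pick the smallest k with Gap(k) >= Gap(k+1) - s_{k+1}, i.e. the
	# first k where Gap(k) - Gap(k+1) + s_{k+1} >= 0.  The arrays here are
	# illustrative stand-ins, not the script's own variables.
	example_ks  = np.arange(1,6)                           # candidate numbers of clusters
	example_gap = np.array([0.20,0.50,0.90,0.93,0.94])     # Gap(k) for each candidate
	example_s   = np.array([0.05,0.05,0.05,0.05,0.05])     # simulation standard errors s_k
	criterion   = example_gap[:-1]-example_gap[1:]+example_s[1:]
	example_k   = example_ks[:-1][criterion>=0][0]         # -> 3, consistent with the comment above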

######
# null distribution is uniform
######

run_uniform_mc = True

if run_uniform_mc:

	def bounding_box(X):
		range_features = []
		for feature_num in np.arange(X.shape[-1]):
			# (min, max) of each feature defines the bounding box
			range_features.append((np.min(X[:,feature_num]),np.max(X[:,feature_num])))
		return range_features
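	# A minimal sketch (not shown in the original snippet) of drawing a uniform
	# Monte Carlo reference sample inside this bounding box, as the uniform null
	# for the gap statistic requires; `X_mc_unif` is a hypothetical name.
	box_ranges = bounding_box(X_used)
	X_mc_unif  = np.column_stack([np.random.uniform(low,high,X_used.shape[0])
	                              for (low,high) in box_ranges])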
						"bad_SALT2","clare_mega_bad","clare_probably_bad"]


# visualizing bad ones
coloring_bad  = np.zeros(len(names_full))
names_in_mine = {}
for i,lists in enumerate(special_look):

	for element in lists:
		if element in names_full:
			names_in_mine[element]=special_look_names[i]
			coloring_bad[names_full==element]=(i+1)


coloring_standardized = coloring_bad.copy()
for i,cluster in enumerate(set(coloring_bad)):
	coloring_standardized[coloring_bad==cluster]=i
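# Design note (hedged alternative, not the original code): np.unique can perform
# the same label remapping in a single call, with labels in sorted order:
#   _, coloring_standardized = np.unique(coloring_bad, return_inverse=True)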

three_d_plot_funct(X_full,coloring_standardized,save=False)


# my suggested bad SN
leverage,rank=leverage_make(X_full)
bad=names_full[np.max(leverage)==leverage]


# second-highest leverage point
second = names_full[sorted(leverage)[-2]==leverage]
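# `leverage_make` is defined elsewhere in the original script; the sketch below
# shows one standard way such a helper could compute leverage (hat-matrix
# diagonal) and rank -- an assumption for illustration, not the author's code.
def leverage_make_sketch(X):
	"""Return h_ii = [X (X'X)^+ X']_ii for each row of X, plus the rank of X."""
	XtX_pinv = np.linalg.pinv(X.T @ X)                    # pseudo-inverse guards against rank deficiency
	h        = np.einsum("ij,jk,ik->i", X, XtX_pinv, X)   # diagonal of the hat matrix
	return h, np.linalg.matrix_rank(X)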

Example #3
	storage_prediction = prediction
	if X.shape[0] != X_full.shape[0]:
		# pad back to full length: rows dropped as outliers keep the placeholder label -1
		storage_prediction             = np.ones(X_full.shape[0])*-1
		storage_prediction[kept_truth] = prediction

	kmean_predictions[num_data][:,index_clusters,kk]=storage_prediction

	# imaging 
	image_extension    = "kmeans"+"("+str(num_clusters)+")_"+data_names[num_data]+"_"+grey_output+".png"

	# pairs plots
	pair_plot_funct(pairplotsX,image_extension=image_extension)
	
	# 3d plots
	three_d_plot_funct(X,prediction,np.arange(X.shape[-1])[-3:],
		image_extension=image_extension)


	sys.stdout.write("-")
	sys.stdout.flush()

sys.stdout.write("\n")


###################
#### Dirichlet ####
###################

cluster_choices = [2,3]
dirichlet_predictions = [np.zeros((X_full.shape[0],len(cluster_choices),2)) for x in np.arange(4)]
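# The fitting loop that fills `dirichlet_predictions` is not shown in this
# snippet.  A hedged sketch of one way such labels could be produced with
# scikit-learn's Dirichlet-process mixture (an assumption about the approach,
# not the original code; `example_dp_labels` is a hypothetical name):
from sklearn.mixture import BayesianGaussianMixture
dpgmm             = BayesianGaussianMixture(n_components=max(cluster_choices),
                                            weight_concentration_prior_type="dirichlet_process")
example_dp_labels = dpgmm.fit_predict(X_full)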
Example #4
	plt.plot(cluster_num_options,quantiles_5_95[i,:],linestyle="-",label=ld,alpha=.3)
plt.legend(loc=2)
plt.title("Pham distortions")
plt.plot(cluster_num_options,.85*np.ones(cluster_num_options.shape[0]))
plt.savefig(images+"/pham_improvement/"+data_info+"_pham_distortions.png")
plt.close()
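# Hedged sketch of the Pham-Dimov-Nguyen f(K) score that the 0.85 reference line
# above refers to: f(K) = S_K / (alpha_K * S_{K-1}), where S_K is the k-means
# distortion (within-cluster sum of squared distances) for K clusters and
# alpha_K is a dimension-dependent weight; f(K) below roughly 0.85 favors K
# clusters.  The names here are illustrative, not the script's own.
def pham_f_sketch(distortions, n_dims):
	"""distortions[K-1] = S_K for K = 1..len(distortions); returns f(K) for K >= 2."""
	f_vals = []
	alpha  = 1.0 - 3.0/(4.0*n_dims)                  # alpha_2
	for K in range(2, len(distortions)+1):
		S_K, S_Km1 = distortions[K-1], distortions[K-2]
		f_vals.append(1.0 if S_Km1 == 0 else S_K/(alpha*S_Km1))
		alpha = alpha + (1.0-alpha)/6.0              # alpha_{K+1}
	return f_vals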


# so Pham suggests 3 clusters might be best:

kmean = sklearn.cluster.KMeans(3)

pred = kmean.fit_predict(X_used)

plt.close()
plt.figure()
three_d_plot_funct(X_used[:,-3:],pred,rotation=134,save=False)
plt.title("3 clusters")
plt.savefig(images+"/pham_improvement/"+data_info+"_pham_3_clusters_3d.png")
plt.close()

pairplotsX         = pd.concat([pd.DataFrame(X_used),pd.DataFrame(pred)],axis=1)
pairplotsX.columns = ["x"+str(i) for i in range(X_used.shape[-1])]+["prediction"]

plt.close()
plt.figure()
pair_plot_funct(pairplotsX,save=False)
plt.savefig(images+"/pham_improvement/"+data_info+"_pham_3_clusters_pairs.png")
plt.close()