def elbow_clustering_analysis():
    """Plot the k-means elbow curve for the data returned by ``readData()``.

    Runs ``kmeans`` for k = 1..19, computes for every k the percentage of
    the total sum of squares that is explained between clusters, plots that
    curve, circles one fixed candidate elbow, saves the figure to
    'admissions_elbow_klustering_analysis.eps' and shows it.
    """
    _, X = readData()
    X = np.array(X)

    KK = range(1, 20)
    # kmeans() results are unpacked as (centroids, second value); only the
    # centroids are used here.
    centroids = [kmeans(X, k)[0] for k in KK]

    # Distance from every sample to its nearest centroid, one array per k.
    nearest = [np.min(cdist(X, cent, 'euclidean'), axis=1)
               for cent in centroids]

    # Total within-cluster sum of squares for each k (array so that the
    # arithmetic below is element-wise by construction).
    tot_withinss = np.array([np.sum(d ** 2) for d in nearest])
    totss = np.sum(pdist(X) ** 2) / X.shape[0]  # the total sum of squares
    betweenss = totss - tot_withinss  # the between-cluster sum of squares
    explained = betweenss / totss * 100

    kIdx = 3  # index into KK, i.e. the highlighted point is K = 4

    # elbow curve
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(KK, explained, 'b*-')
    ax.plot(KK[kIdx], explained[kIdx], marker='o', markersize=12,
            markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    ax.set_ylim((0, 100))
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Percentage of variance explained (%)')
    plt.title('Elbow for KMeans clustering')
    plt.savefig('admissions_elbow_klustering_analysis.eps')
    plt.show()
def setup_figure():
    """Create figure 1 and the (initially empty) artists drawn on it.

    Reads the module-level settings ``rho``, ``N``, ``pairs``, ``pairs2``,
    ``plot_springs`` and ``plot_voronoi``.

    Returns:
        tuple: ``(fig, cells, springs, borders, ang_mom)`` where ``cells``
        holds ``N`` circle artists, ``springs`` one empty line per entry of
        ``pairs`` (only if ``plot_springs``), ``borders`` one empty line per
        row of ``pairs2`` (only if ``plot_voronoi``) and ``ang_mom`` is the
        arrow patch.
    """
    fig = plt.figure(1)
    plt.clf()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlim([-rho - 1, rho + 1])
    ax.set_ylim([-rho - 1, rho + 1])
    ax.set_aspect('equal')

    cells = [
        ax.add_artist(plt.Circle((-0, 0), 0.5, color=cm.copper(0)))
        for _ in range(N)
    ]

    springs = []
    if plot_springs:
        for _ in range(len(pairs)):
            # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is
            # the same colormap under its current name.
            springs += ax.plot([], [], color=cm.nipy_spectral(0))

    borders = []
    if plot_voronoi:
        for _ in range(pairs2.shape[0]):
            borders += ax.plot([], [], color='k')

    ang_mom = ax.add_patch(
        FancyArrowPatch((0, 0), (1, 1), ec='r', fc='r', zorder=0,
                        arrowstyle=u'simple,head_width=20, head_length=10'))
    return (fig, cells, springs, borders, ang_mom)
def bar_graph(data, bar_names, x_label='', y_label='', title='', axis=None,
              colors=None, legend_place='lower right'):
    """Create horizontal bar chart with lists of data values.

    Plots a bar chart given a dictionary of *data* with a type as key, and a
    sequence of values corresponding to elements in *bar_names* as value.

    *colors*, when given, is cycled through to colour each data series;
    otherwise colours are taken from the ``nipy_spectral`` colormap.
    *axis* is accepted for backward compatibility and is not used.

    Place legend with *legend_place* as string argument matching
    /(lower|center|upper) (right|center|left)/; pass ``None`` for no legend.
    """
    from matplotlib import cm

    # Materialise the views once: dict views are not indexable on Python 3.
    labels = list(data.keys())
    series = list(data.values())
    num_groups = len(series[0])
    group_size = len(series)
    yvals = np.arange(num_groups)
    width = 0.8 / group_size

    # Create the axes first and label *that* axes; labelling through pyplot
    # before add_subplot() can decorate a different, implicitly created axes.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)

    ps = []
    for i, vals in enumerate(series):
        if colors is None:
            # colormaps: gist_rainbow, jet, hsv, nipy_spectral, ..
            color = cm.nipy_spectral(1. * i / group_size)
        else:
            color = colors[i % len(colors)]
        p = ax.barh(yvals + (width * i), vals, width, color=color)
        ps.append(p[0])

    ax.set_yticks(yvals + width)
    ax.set_yticklabels(bar_names)
    if legend_place is not None:
        ax.legend(ps, labels, loc=legend_place)
    plt.show()
def get_colors(self, qty):
    """Map *qty* to RGBA colours with the colormap selected by ``COLORMAP``.

    *qty* is divided by its maximum and raised to ``1.0 / CONTRAST`` before
    the lookup; the module-level ``ALPHA`` is passed as the alpha value.

    Raises:
        ValueError: if ``COLORMAP`` is not one of the supported codes 0..9.
    """
    # Position in this tuple == COLORMAP code.  Code 9 used to be
    # ``spectral``, which Matplotlib 2.2 removed in favour of the identical
    # ``nipy_spectral``.
    colormap_names = (
        'gray', 'afmhot', 'hot', 'gist_heat', 'copper',
        'gnuplot2', 'gnuplot', 'gist_stern', 'gist_earth', 'nipy_spectral',
    )
    if COLORMAP not in range(len(colormap_names)):
        raise ValueError(
            'COLORMAP must be an integer in 0..%d, got %r'
            % (len(colormap_names) - 1, COLORMAP))

    qty = np.power(qty / qty.max(), 1.0 / CONTRAST)
    colormap = getattr(cm, colormap_names[int(COLORMAP)])
    return colormap(qty, alpha=ALPHA)
def plot_board(self, custom_text=''):
    """Plot the points (and, when available, the clusters) and save a PNG.

    When both ``self.mu`` and ``self.clusters`` are set, every cluster is
    drawn in its own colour with its centre as a star; otherwise the raw
    points ``self.X`` are drawn.  The figure is saved as
    ``kpp<custom_text>_N<N>_K<K>.png``.
    """
    X = self.X
    # zip() returns an iterator on Python 3, so build the columns once.
    x_columns = list(zip(*X))
    xs, ys = x_columns[0], x_columns[1]

    plt.figure(figsize=(5, 5))
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)

    if self.mu and self.clusters:
        mu = self.mu
        for m, members in self.clusters.items():
            cs = cm.nipy_spectral(1. * m / self.K)
            # Only one marker spec: the old "'o'" format string conflicted
            # with marker='*' (the keyword won, with a warning).
            plt.plot(mu[m][0], mu[m][1], marker='*', linestyle='None',
                     markersize=12, color=cs)
            member_columns = list(zip(*members))
            plt.plot(member_columns[0], member_columns[1], '.',
                     markersize=8, color=cs, alpha=0.5)
    else:
        plt.plot(xs, ys, '.', alpha=0.5)

    if self.method == '++':
        tit = 'K-means++'
    else:
        tit = 'K-means with random initialization'

    # Scale the plot to the data extent.
    plt.xlim([min(xs), max(xs)])
    plt.ylim([min(ys), max(ys)])

    pars = 'N=%s, K=%s' % (str(self.N), str(self.K))
    plt.title('\n'.join([pars, tit]), fontsize=16)
    plt.savefig('kpp%s_N%s_K%s.png' % (custom_text, str(self.N), str(self.K)),
                bbox_inches='tight', dpi=200)
def __call__(self, event):
    """Handle a click: show the nearest point's stats and its cluster means.

    Each row of ``self.data`` is read as
    ``(x, y, cluster, dist, dur, pace, cal, fuel)``.  The row closest to the
    click (according to ``self.distance``) is looked up, the module-level
    text artists are updated with its values and with the mean values of
    all rows in the same cluster, and the figure is redrawn.

    Clicks outside any axes, clicks in an axes other than ``self.axis``
    (when one is set) and clicks while there is no data are ignored.
    """
    if not event.inaxes:
        return
    if self.axis is not None and not (self.axis == event.inaxes):
        return
    if len(self.data) == 0:
        # Nothing to pick; previously this raised IndexError.
        return

    click_x, click_y = event.xdata, event.ydata
    # min() returns the first of equally distant rows, like the original
    # strict "<" scan, but without an arbitrary upper bound on the distance.
    closest = min(
        self.data,
        key=lambda row: self.distance(click_x, row[0], click_y, row[1]))
    cluster_num, di, du, pa, cal, fu = closest[2:8]

    a.set_bbox(dict(
        facecolor=cm.nipy_spectral(float(cluster_num) / n_clusters, 1),
        alpha=.5))
    dist_text.set_text("DIST (km) = %.3f" % di)
    dur_text.set_text("DUR (min) = %.3f" % du)
    pace_text.set_text("PACE (min/mi) = %.3f" % pa)
    cal_text.set_text("CAL = %.3f" % cal)
    fuel_text.set_text("FUEL = %.3f" % fu)

    # Mean of columns 3..7 over every row of the selected cluster.  The
    # clicked row itself is a member, so the count is never zero.
    members = [item for item in self.data if item[2] == cluster_num]
    count = float(len(members))
    clust_di, clust_du, clust_pa, clust_cal, clust_fu = [
        sum(item[col] for item in members) / count for col in range(3, 8)
    ]

    clust_dist_text.set_text("DIST (km) = %.3f" % clust_di)
    clust_dur_text.set_text("DUR (min) = %.3f" % clust_du)
    clust_pace_text.set_text("PACE (min/mi) = %.3f" % clust_pa)
    clust_cal_text.set_text("CAL = %.3f" % clust_cal)
    clust_fuel_text.set_text("FUEL = %.3f" % clust_fu)
    figsrc.canvas.draw()
def plot_silhouette(sample_silhouette_values, cluster_labels):
    """
    Generate silhouette plot to elucidate number of clusters in data

    Source: http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

    Arguments
    =========
    sample_silhouette_values - silhouette value for every observation
    cluster_labels - sequential numeric cluster numbers (any starting value,
        e.g. 0-based or 1-based), one per observation

    Returns
    =========
    None - the plot is drawn on a newly created figure
    """
    # Accept plain sequences as well as arrays; the boolean masks below
    # need array semantics.
    sample_silhouette_values = np.asarray(sample_silhouette_values)
    cluster_labels = np.asarray(cluster_labels)

    # Assume cluster numbers are sequential.
    first_label = int(cluster_labels.min())
    n_clusters = int(cluster_labels.max()) - first_label + 1
    xMin = sample_silhouette_values.min()
    xMax = 1

    plt.figure()
    ax1 = plt.gca()
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax1.set_title('Silhouette Plot (k=%d)' % n_clusters)

    ax1.set_xlim([xMin, xMax])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(cluster_labels) + (n_clusters + 1) * 10])

    y_lower = 10
    for offset in range(n_clusters):
        # Iterate over the actual label values so that labelings that do
        # not start at 0 are plotted completely.
        label = first_label + offset

        # Silhouette scores of this cluster, sorted ascending.
        cluster_values = np.sort(
            sample_silhouette_values[cluster_labels == label])
        size_cluster = cluster_values.shape[0]
        y_upper = y_lower + size_cluster

        color = cm.nipy_spectral(float(offset) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster, str(label))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    silhouette_avg = sample_silhouette_values.mean()
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")  # average line
def silhouette_analysis(self):
    """Draw silhouette and cluster scatter plots for k = 2..9.

    Uses ``self.pca_reduced`` (computed through ``self.pc_analysis()`` when
    it is not available yet).  For every k a KMeans model with
    ``random_state=10`` is fitted, the average silhouette score is printed
    and a two-panel figure is created: the per-sample silhouette plot on
    the left and the first two columns of the data, coloured by cluster,
    on the right.
    """
    reduced = self.pca_reduced
    # ``not ndarray`` raises for multi-element arrays, so test explicitly
    # for "nothing computed yet": None, an empty container or a falsy scalar.
    missing = (
        reduced is None
        or np.size(reduced) == 0
        or (np.ndim(reduced) == 0 and not reduced)
    )
    if missing:
        self.pc_analysis()
    reduced = self.pca_reduced

    for n_clusters in range(2, 10):
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        ax1.set_xlim([-0.1, 1])
        # (n_clusters + 1) * 10 leaves blank space between the clusters.
        ax1.set_ylim([0, len(reduced) + (n_clusters + 1) * 10])

        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(reduced)

        silhouette_avg = silhouette_score(reduced, cluster_labels)
        print("For n_clusters =", n_clusters,
              "the average silhouette_score is :", silhouette_avg)

        sample_silhouette_values = silhouette_samples(reduced, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            cluster_values = np.sort(
                sample_silhouette_values[cluster_labels == i])
            size_cluster = cluster_values.shape[0]
            y_upper = y_lower + size_cluster

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_values,
                              facecolor=color, edgecolor=color, alpha=0.7)
            ax1.text(-0.05, y_lower + 0.5 * size_cluster, str(i))
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax1.set_yticks([])
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(reduced[:, 0], reduced[:, 1], marker='.', s=30, lw=0,
                    alpha=0.7, c=colors)

        # White discs at the cluster centres, numbered with the cluster index.
        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o', c="white",
                    alpha=1, s=200)
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')
def __init__(self):
    """Create the window, the shader program and the full-screen quad.

    The window's display and mouse callbacks are bound to ``self.display``
    and ``self.mouse``.  The shader receives a 256-entry colormap texture
    and the initial value range ``minval``/``maxval``; ``self.history``
    starts empty.
    """
    self.window = GlutWindow(double=True, multisample=True)
    self.window.display_callback = self.display
    self.window.mouse_callback = self.mouse

    self.shader = ShaderProgram(vertex=vertex_shader,
                                fragment=fragment_shader)
    # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the same
    # colormap under its current name.
    self.shader.colormap = Texture1D(cm.nipy_spectral(linspace(0, 1, 256)),
                                     wrap_s="MIRRORED_REPEAT")
    self.shader.minval = (-2.5, -1.75)
    self.shader.maxval = (1.0, 1.75)

    self.vao = get_fullscreen_quad()
    self.history = []
def intraday_exec_curve(data=None, step_sec=60*30,
                        group_var='strategy_name_mapped'):
    """
    intraday_exec_curve : Plot the daily exec curve in turnover cross by group_var

    *data* is grouped by the time grid returned by ``st_data.gridTime``
    (step *step_sec* seconds, rounded up) and by the column *group_var*.
    For every (time bucket, group) the turnover
    ``sum(rate_to_euro * price * volume) * 1e-6`` is drawn as a rectangle;
    groups that share a time bucket are stacked on top of each other.

    Raises:
        NameError: if *data* is None.
    """
    ##############################################################
    # input handling
    ##############################################################
    if data is None:
        raise NameError('plot:intraday_exec_curve - data is missing')

    ##############################################################
    # aggregate data
    ##############################################################
    grouped = data.groupby(
        [st_data.gridTime(date=data.index, step_sec=step_sec, out_mode='ceil'),
         group_var])
    grouped_data = pd.DataFrame(
        [{'date': k[0],
          group_var: k[1],
          'mturnover_euro': np.sum(v.rate_to_euro * v.price * v.volume) * 1e-6}
         for k, v in grouped])
    grouped_data = grouped_data.set_index('date')

    # Sort on a string rendering of the index plus the group (the original
    # author found that sorting on the index directly did not work).
    grouped_data['tmpindex'] = [
        stamp.to_pydatetime().strftime('%Y%m%d-%H:%M:%S.%f')
        for stamp in grouped_data.index]
    grouped_data = (grouped_data
                    .sort_values(by=['tmpindex', group_var])
                    .drop(['tmpindex'], axis=1))

    ##############################################################
    # plot
    ##############################################################
    uni_strat = np.sort(np.unique(grouped_data[group_var].values).tolist())
    colors_strat = cm.nipy_spectral(np.linspace(0, 1.0, len(uni_strat)))

    plt.figure()
    axes = plt.gca()
    bucket = timedelta(seconds=step_sec)
    labelled = set()  # strategy indices that already own a legend entry
    prev_date = None
    prev_date_cum = 0

    rows = zip(grouped_data.index,
               grouped_data[group_var],
               grouped_data['mturnover_euro'])
    for stamp, strat, turnover in rows:
        date = stamp.to_pydatetime()
        idx_uni_strat = np.nonzero(uni_strat == strat)[0][0]

        # A new time bucket starts at 0; otherwise stack on the previous bar.
        base = prev_date_cum if date == prev_date else 0
        top = base + turnover

        kwargs = {'facecolor': colors_strat[idx_uni_strat], 'alpha': 0.5}
        # Label the first patch of each strategy only, so that every legend
        # entry is paired with the colour of its own strategy.
        if idx_uni_strat not in labelled:
            kwargs['label'] = uni_strat[idx_uni_strat]
            labelled.add(idx_uni_strat)

        axes.fill([date - bucket, date, date, date - bucket],
                  [base, base, top, top], **kwargs)
        prev_date_cum = top
        prev_date = date

    plt.legend()
    plt.show()
def plot_intraday_exec_curve(self, duration="", step_sec=60*30,
                             group_var='strategy_name_mapped'):
    """
    intraday_exec_curve : Plot the daily exec curve in turnover cross by group_var

    Calls ``self.get_agg_deals(step_sec=step_sec)`` and then draws every row
    of ``self.data_agg_deals`` as a rectangle whose height is the row's
    ``mturnover_euro``; consecutive rows with the same timestamp are stacked.
    *duration* is appended to the figure title.

    Returns:
        The newly created matplotlib figure.
    """
    self.get_agg_deals(step_sec=step_sec)
    deals = self.data_agg_deals

    ##############################################################
    # plot
    ##############################################################
    uni_strat = np.sort(np.unique(deals[group_var].values).tolist())
    colors_strat = cm.nipy_spectral(np.linspace(0, 1.0, len(uni_strat)))
    uni_strat_islabeled = np.array([False] * len(uni_strat))

    h = plt.figure(figsize=DEFAULT_FIGSIZE)
    axes = plt.gca()
    axes.grid(True)

    bucket = timedelta(seconds=step_sec)
    prev_date = None
    prev_date_cum = 0
    rows = zip(deals.index, deals[group_var], deals['mturnover_euro'])
    for stamp, strat, turnover in rows:
        date = stamp.to_pydatetime()
        idx_uni_strat = np.nonzero(uni_strat == strat)[0][0]

        # A new timestamp starts at 0; otherwise stack on the previous bar.
        base = prev_date_cum if date == prev_date else 0
        top = base + turnover

        kwargs = {'facecolor': colors_strat[idx_uni_strat], 'alpha': 0.85}
        # One legend entry per strategy: label its first patch only.
        if not uni_strat_islabeled[idx_uni_strat]:
            kwargs['label'] = uni_strat[idx_uni_strat]
            uni_strat_islabeled[idx_uni_strat] = True

        axes.fill([date - bucket, date, date, date - bucket],
                  [base, base, top, top], **kwargs)
        prev_date_cum = top
        prev_date = date

    plt.ylabel('Turnover (,000,000) euros')
    plt.title('Intraday traded curve: ' + duration, size='large')
    plt.legend()
    return h
def timedomain(ycsb, toplt):
    """Save a 3-D plot of runs 1..8 of ``ycsb[toplt]`` as a PNG.

    ``splitbyrecordcount`` splits each of the three series in *ycsb* into
    per-run key and value arrays.  Runs 1..8 of the selected series are
    drawn as lines side by side along the y axis; the z axis is scaled to
    the largest value found in runs 1..8 of all three series.  The output
    file name comes from ``getfilename("timeseries", checktype)``.
    """
    arrays_k, arrays_v = splitbyrecordcount(ycsb[toplt])
    _, arrays_vu = splitbyrecordcount(ycsb[2])
    _, arrays_vr = splitbyrecordcount(ycsb[1])
    _, arrays_vv = splitbyrecordcount(ycsb[0])

    # Common z limit so that plots of the three series are comparable.
    maxheight = max(
        max(max(run) for run in values[1:9])
        for values in (arrays_vu, arrays_vr, arrays_vv))

    K = list(arrays_k)
    V = list(arrays_v)

    # NOTE(review): index 0 is named "Update" here while ycsb[0] is unpacked
    # above into the *v variables and ycsb[2] into the *u ones; confirm the
    # intended order of these names.
    checktype = ("Update", "Read", "Verification")[toplt]

    fig = plt.figure()
    # Integer subplot spec: the string form '111' is not accepted by
    # current Matplotlib releases.
    ax = fig.add_subplot(111, projection='3d')

    for z in np.arange(1, 9):
        xs = K[z]
        ys = V[z]
        # spectral was removed in Matplotlib 2.2; nipy_spectral is the
        # same colormap.  Presumably ``colmap`` is matplotlib.cm - verify.
        c = colmap.nipy_spectral(z / 9., 1)
        ax.plot(xs, z * np.ones(xs.shape), zs=ys, zdir='z', color=c,
                zorder=-z)

    # Plot formatting
    font = {'family': 'serif', 'weight': 'normal', 'size': 12}
    plt.rc('font', **font)

    ax.set_zlim3d(0, maxheight)
    ax.set_xlabel('Time (ms)')
    ax.set_ylabel('Test Run')
    ax.set_zlabel('Runtime')
    ax.tick_params(axis='both', labelsize=8)

    plt.savefig(getfilename("timeseries", checktype), format='png', dpi=300,
                bbox_inches='tight', transparent=True)
def animate(k):
    """Draw animation frame *k*.

    Every three frames advance the trajectories in ``ys`` by one segment
    (``i = int(k / 3)``).  Frame 1 additionally sets the initial view and
    draws the four reference markers.  Once ``i`` reaches ``N`` the axes
    are hidden and only the camera is rotated.
    """
    i = int(k / 3)
    n_series = float(len(ys))  # float: colour fractions must not truncate

    if k == 1:
        ax.view_init(20, 215)
        for j, y in enumerate(ys):
            plot2(y[0:2], fig, cm.nipy_spectral(j / n_series))
        ax.scatter(0.16, 0.16, 0.16, c="g", alpha=0.4, s=500)
        ax.scatter(0.82, 0.17, 0.17, c="b", alpha=0.4, s=500)
        ax.scatter(0.17, 0.82, 0.17, c="r", alpha=0.4, s=500)
        ax.scatter(0.17, 0.17, 0.82, c="k", alpha=0.4, s=500)
        set_title("Decision Space")

    if 0 < i < N:
        ax.view_init(20, 215 + ANGLE1 * k / N / 3)
        for j, y in enumerate(ys):
            plot2(y[i - 1:i + 1], fig, cm.nipy_spectral(j / n_series))
        set_title("Decision Space")
    elif i >= N:
        ax.set_axis_off()
        j = k - 3 * N
        print("rotate" + str(j))
        ax.view_init(20, (215 + ANGLE1 + ANGLE2 * 3 * j / int(ANGLE2)) % 360)
def Silhouette(D, labels, k):
    """Draw the silhouette plot of a clustering given a distance matrix.

    Taken from SKlearn's plot kmeans example.

    D = precomputed distance matrix
    labels = cluster label of every sample; clusters 0..k-1 are plotted
    k = number of clusters
    """
    labels = np.asarray(labels)  # boolean masks below need array semantics

    plt.ion()
    fig, ax1 = plt.subplots()
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # (k + 1) * 10 leaves blank space between the clusters.
    ax1.set_ylim([0, len(D) + (k + 1) * 10])

    sample_silhouette_values = metrics.silhouette_samples(
        D, labels, metric='precomputed')

    y_lower = 10
    for i in range(k):
        cluster_values = np.sort(sample_silhouette_values[labels == i])
        size_cluster = cluster_values.shape[0]
        y_upper = y_lower + size_cluster

        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        ax1.text(-0.05, y_lower + 0.5 * size_cluster, str(i))
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    silhouette_avg = metrics.silhouette_score(D, labels, metric='precomputed')
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # suptitle() needs a string; a tuple was passed here before.
    plt.suptitle(
        "Silhouette analysis with n_clusters = %s and average = %.3f"
        % (k, silhouette_avg),
        fontsize=14, fontweight='bold')
    plt.show()
def plot_partial_factors(ds, sts, x=0, y=1, cmap=None, axes='off',
                         nude=False):
    """Plot per-table partial factor scores around their row centers.

    ``ds.samples`` is reshaped into one slab per table (the number of tables
    is the number of unique values in ``ds.chunks``).  For every row the
    mean over tables (its center) is drawn, with one line from the center
    to the row's position in each table, using components *x* and *y*.

    Parameters
    ----------
    ds : dataset with ``samples``, ``chunks``, ``targets`` and ``shape``
    sts : object with ``eigv`` and ``inertia`` sequences (used for labels)
    x, y : indices of the components shown on the two axes
    cmap : sequence of colors, one per table; defaults to ``nipy_spectral``
    axes : value passed to ``plt.axis`` (e.g. 'off' or 'on')
    nude : if True, skip the row annotations and the text labels
    """
    mx = np.max(np.abs(ds.samples))
    xmx = mx * 1.1
    hw = .05 * xmx
    w = .01 * xmx

    # Gray arrows marking the two axes.
    plt.arrow(-xmx, 0, 2 * xmx, 0, color='gray', alpha=.7, width=w,
              head_width=hw, length_includes_head=True)
    plt.arrow(0, -mx, 0, 2 * mx, color='gray', alpha=.7, width=w,
              head_width=hw, length_includes_head=True)

    ntables = len(np.unique(ds.chunks))
    if cmap is None:
        cmap = cm.nipy_spectral(np.linspace(.2, .85, ntables))

    m, ncol = ds.shape
    # Integer division: reshape() rejects the float that "/" yields on
    # Python 3.
    nrows = m // ntables
    data = ds.samples.T.reshape((ncol, nrows, ntables), order='F')

    centers = np.mean(data, 2).T[:, [x, y]]
    plt.scatter(centers[:, 0], centers[:, 1])

    for t in range(ntables):
        tab = data[:, :, t].T[:, [x, y]]
        for r in range(nrows):
            a, b = centers[r, :]
            j, k = tab[r, :]
            plt.plot([a, j], [b, k], c=cmap[t], lw=2, alpha=.5)

    plt.axis('equal')
    plt.axis((-mx, mx, -mx, mx))
    plt.axis(axes)

    if not nude:
        for t in range(nrows):
            plt.annotate(ds.targets[t], xy=(centers[t, 0], centers[t, 1]))

        # Raw strings keep the TeX backslashes without invalid-escape
        # warnings; the resulting text is unchanged.
        plt.text(-xmx, .05 * mx,
                 r'$\lambda = %s$' % np.round(sts.eigv[x], 2))
        plt.text(mx * .05, mx * .9,
                 r'$\lambda = %s$' % np.round(sts.eigv[y], 2))

        tau = r'$\tau = $'
        perc = r'$\%$'
        mpl.rcParams['text.usetex'] = False
        plt.text(-xmx, -.1 * mx,
                 '%s $%s$%s' % (tau, np.round(100 * sts.inertia[x], 0), perc))
        plt.text(xmx * .05, mx * .8,
                 '%s $%s$%s' % (tau, np.round(100 * sts.inertia[y], 0), perc))

        plt.text(-.15 * xmx, .8 * mx, '$%s$' % (y + 1), fontsize=20)
        plt.text(xmx * .85, -mx * .2, '$%s$' % (x + 1), fontsize=20)

    plt.axis('scaled')
    plt.axis([-xmx, xmx, -mx, mx])
def clst_hier(Data_matrix, Linkage_Method, Pdist, n_clusters):
    """Hierarchically cluster *Data_matrix* and cut the tree into clusters.

    The pairwise distances (metric *Pdist*) are linked with
    *Linkage_Method*, the dendrogram is drawn on the current figure and
    saved as ``Case_loc + 'fig_dendrogram.png'``.  The merge heights of the
    linkage are then tried one after another as ``fcluster`` thresholds
    (criterion ``FCluster_Criterion``) until exactly *n_clusters* clusters
    result.

    Returns:
        list: ``[cluster_labels, color_matrix, SD]`` with 0-based labels, an
        (n_samples, 4) RGBA array holding one colour per cluster, and
        ``SD = 0`` (no inertia is computed for this method).  If no
        threshold yields *n_clusters* clusters a message is printed and the
        labels of the last threshold tried are returned; rows whose label
        is >= *n_clusters* keep an all-zero colour.
    """
    # linkage() expects the *condensed* distance vector produced by pdist.
    # A square matrix would be interpreted as a set of observation vectors
    # and silently produce a different clustering.
    condensed = dt.pdist(Data_matrix, Pdist)
    Hier_clustering = hr.linkage(condensed, method=Linkage_Method,
                                 metric=Pdist)

    # Draw the dendrogram on the current figure and save it.
    hr.dendrogram(Hier_clustering)
    plt.gcf().savefig(Case_loc + 'fig_dendrogram.png')

    cluster_labels = None
    found = False
    for ith_t in Hier_clustering[:, 2]:
        # fcluster labels start from 1; shift them to start from 0.
        cluster_labels = hr.fcluster(Hier_clustering, ith_t,
                                     criterion=FCluster_Criterion) - 1
        # cluster index = {0,...,N-1} --> N clusters
        if cluster_labels.max() + 1 == n_clusters:
            found = True
            break
    if not found:
        print('unable to find %d clusters in clst_hier' % n_clusters)

    color_matrix = np.zeros((len(cluster_labels), 4))
    for i in range(n_clusters):
        color_matrix[cluster_labels == i] = cm.nipy_spectral(
            float(i) / n_clusters)

    SD = 0  # inertia is not available for the hierarchical method
    return [cluster_labels, color_matrix, SD]
def animate(f):
    """Update all artists for animation frame *f* and return them.

    Positions are read from ``r_vs_t[f]`` and the angular momentum from
    ``p_angular[f]``.  Cells are moved and coloured by their ``z_plane``
    coordinate; springs and Voronoi borders are only drawn when both of
    their end points have a positive ``z_plane`` coordinate.  The figure is
    saved as 'test.png' at frame 20.

    Returns:
        tuple: ``(cells, springs, borders, ang_mom)``.
    """
    global pairs, pairs2

    # load data
    r = r_vs_t[f]

    # Angular-momentum arrow, normalised to length rho + 0.9.
    p = (rho + 0.9) * p_angular[f] / np.sqrt(np.sum(p_angular[f] ** 2))
    ang_mom.set_positions((0, 0), (p[x_plane], p[y_plane]))

    if update_nn:
        pairs = simforces.get_all_pairs(getDelaunayTrianglesOnSphere(r) + 1)

    for i in range(N):
        shade = int((r[z_plane, i] + 1) / 2 * 256)
        cells[i].center = (r[x_plane, i], r[y_plane, i])
        cells[i].set_facecolor(cm.copper(shade))
        cells[i].set_zorder(r[z_plane, i])

    if plot_springs:
        for i in range(len(pairs)):
            # pairs holds 1-based indices.
            i1 = pairs[i, 0] - 1
            i2 = pairs[i, 1] - 1
            if (r[z_plane, i1] > 0) and (r[z_plane, i2] > 0):
                dist = np.sqrt(np.sum((r[:, i1] - r[:, i2]) ** 2))
                shade = int((dist - 1) * 128)
                springs[i].set_data([r[x_plane, i1], r[x_plane, i2]],
                                    [r[y_plane, i1], r[y_plane, i2]])
                # cm.spectral was removed in Matplotlib 2.2.
                springs[i].set_color(cm.nipy_spectral(shade))
            else:
                springs[i].set_data([], [])

    if plot_voronoi:
        _, baricenters, _, pairs2, _ = getVoronoiOnSphere(r)
        b = rho * baricenters
        for i in range(len(pairs2)):
            i1 = pairs2[i, 0]
            i2 = pairs2[i, 1]
            if (b[z_plane, i1] > 0) and (b[z_plane, i2] > 0):
                borders[i].set_data([b[x_plane, i1], b[x_plane, i2]],
                                    [b[y_plane, i1], b[y_plane, i2]])
            else:
                borders[i].set_data([], [])

    if f == 20:
        fig.savefig('test.png')
    return (cells, springs, borders, ang_mom)
def display_climate_radial(geography, year, tempdata, raindata):
    """Show a polar "climate clock" for one location and year.

    Parameters:
        geography: name used for the figure label and the centre caption.
        year: value printed just outside the temperature range.
        tempdata: iterable of ``(tmin, tmax, tmean)``; entry ``i`` is drawn
            at angle ``2*pi*i/365`` as a radial bar from tmin to tmax,
            coloured by tmean.
        raindata: mapping with the keys 'rainydays' (converted to angles
            like the temperature index), 'tcenters' (radial positions) and
            'rainfalls' (marker sizes, scaled by 100).
    """
    # Create figure and polar axis
    fig = plt.figure('%s_radial' % geography, facecolor='white',
                     figsize=(8, 8))
    ax = fig.add_subplot(111, polar=True, frameon=False)

    mintemp = -30
    maxtemp = 40
    day_angle = 2 * np.pi / 365.0

    ax.text(0, mintemp, geography.upper(), color='#555555',
            horizontalalignment='center', size=30)
    ax.text(0, maxtemp + 1, str(year), color='#555555',
            horizontalalignment='center', size=10)

    # Min/Max temps as bars
    for i, (tmin, tmax, tmean) in enumerate(tempdata):
        if np.abs(tmax - tmin) < 1:
            # Widen near-degenerate bars so that they stay visible.
            tmin = tmin - 0.5
            tmax = tmax + 0.5
        ax.plot([day_angle * i] * 2, [tmin, tmax],
                color=cm.nipy_spectral((tmean + 5) / 45.0),
                linewidth=1.5, alpha=0.6)

    # plot rainfall as scatters
    ax.scatter([2 * np.pi * r / 365. for r in raindata['rainydays']],
               raindata['tcenters'],
               s=[100 * r for r in raindata['rainfalls']],
               alpha=0.5, facecolor='#99aacc', linewidth=0)

    # tweak ranges and orientation of polar plot
    ax.set_rmax(maxtemp)
    ax.set_rmin(mintemp)
    ax.set_theta_direction(-1)
    ax.set_theta_zero_location("N")

    # Tweak polar axes, gridding, labels
    ax.tick_params(axis='both', colors='#bbbbbb')
    ax.set_xticks([m * 2 * np.pi / 12 for m in range(12)])
    months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
              'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
    ax.set_xticklabels(months, fontsize=10)
    ax.get_xaxis().grid(False)

    # radii only positive here, but override later
    plt.rgrids((0.01, 10, 20, 30, 40),
               labels=('0 C', '', '20 C', '', '40 C'), angle=180)
    ax.get_yaxis().grid(which='minor', linestyle='-', color='#bbbbbb',
                        alpha=0.3)
    ax.get_yaxis().grid(which='major', linestyle='-', color='#bbbbbb',
                        alpha=0.4, linewidth=1.4)
    ax.set_yticks([10, 30], minor=True)
    ax.set_yticks([0, 20, 40])
    ax.set_yticklabels(['0 C', '20 C', '40 C'], fontsize=10)
    plt.show()
def Cluster(self, event=None):
    """Re-cluster the selected features and redraw the cluster view.

    The rows of ``self.master`` are reduced to the features enabled in
    ``self.bool_vec``, clustered with MiniBatchKMeans into ``n_clusters``
    groups and projected to two dimensions with PCA for plotting.  The
    module-level lists ``x``, ``y`` and ``c`` are rebuilt with the plotted
    coordinates and cluster numbers, the axis limits are remembered and a
    new ``AnnoteFinder`` is connected to mouse clicks.  *event* is unused.
    """
    global x, y, c, dist, dur, pace, calories, fuel, figsrc
    x = []
    y = []
    c = []
    print(self.bool_vec)

    delaxes(self.axsrc)
    self.axsrc = figsrc.add_subplot(211, autoscale_on=True)
    self.axsrc.set_title('Right Click to Zoom')

    def select(vec):
        # Keep only the entries whose flag in bool_vec is set.
        return [elem for elem, b in zip(vec, self.bool_vec) if b]

    X = [select(elem) for elem in self.master]
    print(X[0])

    km_options = dict(init='random', n_init=10, random_state=random_state)
    try:
        km = MiniBatchKMeans(n_clusters=n_clusters, **km_options)
    except TypeError:
        # Very old scikit-learn releases called this argument ``k``.
        km = MiniBatchKMeans(k=n_clusters, **km_options)
    km = km.fit(X)

    pca = decomposition.PCA(n_components=2)
    pca.fit(X)
    X = pca.transform(X)

    for k in range(n_clusters):
        members = X[km.labels_ == k]
        color = cm.nipy_spectral(float(k) / n_clusters, 1)

        x.extend(members[:, 0])
        y.extend(members[:, 1])
        c.extend([k] * len(members))

        # Single marker spec (the old "'o'" format string conflicted with
        # marker='.'; the keyword won).
        plot(members[:, 0], members[:, 1], linestyle='None', marker='.',
             c=color)

        cluster_center = find_center(members)
        print("Center: ")
        plot(cluster_center[0], cluster_center[1], 'o',
             markerfacecolor=color, markeredgecolor='k', markersize=7)

    title("Cluster View")

    self.master_lx_lim = self.axsrc.get_xlim()[0]
    self.master_ly_lim = self.axsrc.get_ylim()[0]
    self.master_ux_lim = self.axsrc.get_xlim()[1]
    self.master_uy_lim = self.axsrc.get_ylim()[1]

    self.af = AnnoteFinder(x, y, c, dist, dur, pace, calories, fuel,
                           self.axsrc)
    figsrc.canvas.mpl_connect('button_press_event', self.af)
def get_contrib_colour(self, contrib_key):
    """Return the colour assigned to *contrib_key*.

    Colours are served from ``self._contrib_colour_dict``.  When that cache
    does not exist yet or lacks the key, it is rebuilt: one colour per
    possible node contribution (``self.ctl.get_max_node_contribs()``) is
    taken from the ``nipy_spectral`` colormap and paired, in order, with
    ``self.ctl.get_contrib_keys()``.

    :param contrib_key: key to look up
    :return: the colour stored for *contrib_key*
    :raise KeyError: if the key is still unknown after the rebuild
    """
    try:
        return self._contrib_colour_dict[contrib_key]
    except (AttributeError, KeyError):
        pass  # no cache yet, or a key added since the last rebuild

    self.contrib_colourmap = [
        cm.nipy_spectral(i)
        for i in np.linspace(0, 0.9, self.ctl.get_max_node_contribs())]
    # Build the mapping without reusing the name ``contrib_key``: the old
    # loop variable overwrote the argument, so the colour of the *last*
    # known key was returned after every rebuild.
    self._contrib_colour_dict = dict(
        zip(self.ctl.get_contrib_keys(), self.contrib_colourmap))

    try:
        return self._contrib_colour_dict[contrib_key]
    except KeyError:
        logging.error("CDict:{0!s}".format(self._contrib_colour_dict))
        raise KeyError("CDict:{0!s}".format(self._contrib_colour_dict))
def drawLimits(useObjects, filt="Ks", useGrayscale=False):
    """Plot the limit curve of every object in *useObjects* for one filter.

    For each object that has an entry in ``limitDict[filt]`` the values
    whose second character is not "-" are plotted against the matching
    entries of ``apertures`` on a logarithmic x axis.  Objects without an
    entry are reported on stdout and skipped.

    Returns:
        dict: maps each plotted object to the colour used for its curve.
    """
    colorDict = {}
    limits = limitDict[filt]
    n_objects = float(len(useObjects))
    # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the same
    # colormap under its current name.
    palette = cm.gray if useGrayscale else cm.nipy_spectral

    for ii, obj in enumerate(useObjects):
        if obj not in limits:
            print(obj + " not found")
            continue

        colormap = palette(ii / n_objects, 1)
        dists = []
        deltaMag = []
        for aperture, entry in zip(apertures, limits[obj]):
            if entry[1] != "-":
                # NOTE(review): eval() executes arbitrary expressions; this
                # is only safe while apertures/limitDict come from trusted
                # sources.
                dists.append(eval(aperture))
                deltaMag.append(eval(entry))

        pylab.semilogx(dists, deltaMag, linewidth=2, linestyle='solid',
                       color=colormap, label=obj + " " + instrUsed)
        colorDict[obj] = colormap
    return colorDict
def plot_embedding(x, y, selected=None, group=None, dpi=80, **extra):
    ''' Plot an embedding

    :Parameters:

    x : array
        X coordindates
    y : array
        Y coordindates
    selected : array, optional
               Plot selected points
    group : array, optional
            Group identifier of every point; each group gets its own color
    dpi : int
          Figure resolution
    extra : dict
            Additional keyword arguments, passed on to every plot call

    :Returns:

    fig : Figure
          Matplotlib figure
    ax : Axes
         Matplotlib axes
    '''
    fig = pylab.figure(dpi=dpi)
    ax = fig.add_subplot(111)

    # Markers only.  The previous ls='.' is not a valid line style and is
    # rejected by current Matplotlib releases.
    if group is not None:
        refs = numpy.unique(group)
        for i, r in enumerate(refs):
            sel = r == group
            color = cm.nipy_spectral(float(i) / len(refs))
            ax.plot(x[sel], y[sel], 'o', ls='None', markersize=3, c=color,
                    **extra)
    else:
        ax.plot(x, y, 'ro', ls='None', markersize=3, **extra)

    if selected is not None:
        ax.plot(x[selected], y[selected], 'k+', ls='None', markersize=2,
                **extra)
    return fig, ax
def clst_kmeans(Data_matrix, n_clusters):
    """Cluster *Data_matrix* into *n_clusters* groups with KMeans.

    The model is seeded with the module-level ``rand_seed`` and the fit
    duration is printed.

    Returns:
        list: ``[cluster_labels, color_matrix, SD]`` where ``color_matrix``
        is an (n_samples, 4) RGBA array holding one colour per cluster and
        ``SD`` is the fitted model's ``inertia_``.
    """
    print('kmeans starts')
    t0 = time()
    try:
        clusterer = KMeans(n_clusters=n_clusters, random_state=rand_seed)
    except TypeError:
        # Very old scikit-learn releases called this argument ``k``; detect
        # that from the library itself instead of from the operating system.
        clusterer = KMeans(k=n_clusters, random_state=rand_seed)
    clusterer.fit(Data_matrix)
    t1 = time()
    print('\tkmeans finishes with %.2g sec' % (t1 - t0))

    cluster_labels = clusterer.labels_
    SD = clusterer.inertia_  # value of the fitted model's inertia_ attribute

    color_matrix = np.zeros((len(cluster_labels), 4))
    for i in range(n_clusters):
        color_matrix[cluster_labels == i] = cm.nipy_spectral(
            float(i) / n_clusters)

    return [cluster_labels, color_matrix, SD]
std_rough_times = np.std(rough_times, axis=0) ratios[n,:] = avg_rough_times/smooth_timescales all_Vrands[n,:] = Vrand + Vsmooth all_avg_rough_times[n,:] = avg_rough_times all_std_rough_times[n,:] = std_rough_times if not os.path.exists("double_plots"): os.mkdir("double_plots") os.chdir("double_plots") fig_temp = plt.figure(2) for n in range(ndEs): c_val = float(n)/ndEs plt.plot(ratios[n,:], 'o', ms=10, c=cm.spectral(c_val)) plt.semilogy() plt.xlabel("index") plt.ylabel("Ratio of timescales $\\frac{t_i}{t_i^0}$") fig_temp.savefig("ti_norm_vs_index.png",bbox_inches="tight") fig_temp.savefig("ti_norm_vs_index.pdf",bbox_inches="tight") fig1 = plt.figure() for n in range(ndEs): c_val = float(n)/ndEs if n == 0: plt.plot(all_avg_rough_times[n,:], 'o', ms=10, c=cm.spectral(c_val),label="matrix") else: plt.plot(all_avg_rough_times[n,:], 'o', ms=10, c=cm.spectral(c_val)) plt.plot(smooth_timescales, 'k') plt.ylabel("Implied timescales")
# y_savg.append(silhouette_avg) y_lower = 10 for i in range(cluster): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = \ sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.spectral(float(i) / cluster) ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) # Label the silhouette plots with their cluster numbers at the middle ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) # Compute the new y_lower for next plot y_lower = y_upper + 10 # 10 for the 0 samples ax.set_title("The silhouette plot for the various clusters.") ax.set_xlabel("The silhouette coefficient values") ax.set_ylabel("Cluster label") # The vertical line for average silhoutte score of all the values
#settings for axes/ticks widths, etc. -> Called for any plotting
################################################################
plt.rc('axes', linewidth=0.75)  # axis border widths (I tend to like bolder than the default)
plt.rc('xtick.major', width=0.75)  # tick widths (I like them the same width as the border)
plt.rc('ytick.major', width=0.75)
plt.rc('xtick.minor', width=0.75)
plt.rc('ytick.minor', width=0.75)
plt.rc('lines', markersize=4, markeredgewidth=0.0)  # size of markers, no outline

#If you want a range of colors, this is a useful way to generate an equally
#spaced range
import matplotlib.cm as cm
num_colors = 9
# One RGBA row per color (the four columns are constant); alpha is fixed
# to 1.  cm.spectral was removed in Matplotlib 2.2, nipy_spectral is the
# same colormap under its current name.
colors = np.array(cm.nipy_spectral(np.arange(num_colors) / float(num_colors), 1))
#then, call color=colors[x,:] in the plot routine

#These are possible marker styles
points = ['o', 'v', 's', 'p', '*', 'h', '^', 'D', '+', '>', 'H', 'd', 'x', '<']

###########################################################
#Two subplotting options: call subplot or manually set axes
###########################################################

####Using Subplot####
#default settings for margin (can be tweaked accordingly)
left = 0.2  # the left side of the subplots of the figure
right = 0.95  # the right side of the subplots of the figure
def bench_k_means(self, data, name, save=False, path='', plot=True):
    """Scan KMeans cluster counts and stop at the first low silhouette score.

    KMeans (``random_state=10``) is fitted for ``n_clusters`` = 2..49 in turn.
    The scan returns the first ``n_clusters`` whose average silhouette score
    (``sqeuclidean`` metric) falls below 0.69; if none does, the last value
    tried is returned.  For every ``n_clusters`` that stays at or above the
    threshold a two-panel figure is drawn when *plot* is true: the per-cluster
    silhouette plot on the left and a scatter of the first two columns of
    *data*, with the cluster centers, on the right.

    :param data: 2-D array passed to KMeans (rows are clustered; the first
        two columns are used for the scatter plot)
    :param name: component name (gravity or body); used in the file name
    :param save: if true, each drawn figure is written to
        ``<path>/<name>_c_<n_clusters>.png``
    :param path: directory where the plots will be saved
    :param plot: if true, draw the figures
    :return Koptimal: number of clusters at which the scan stopped
    """
    threshold = 0.69
    X = data
    cmin = 2
    cmax = 50

    for n_clusters in range(cmin, cmax):
        # Fixed seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # Average silhouette over all samples: a view of the density and
        # separation of the formed clusters.
        silhouette_avg = silhouette_score(X, cluster_labels,
                                          metric='sqeuclidean')
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        Koptimal = n_clusters
        if silhouette_avg < threshold:
            # Decide before any figure is created, so that the early return
            # does not leave an empty figure behind.
            return Koptimal

        if not plot:
            continue

        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The silhouette coefficient can range from -1 to 1; only
        # [-0.1, 1] is shown.
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters + 1) * 10 inserts blank space between the
        # silhouette plots of individual clusters.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette scores of the samples in cluster i.
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # `cm.spectral` was removed from matplotlib; `nipy_spectral`
            # is the same colormap.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle.
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # 10 blank rows before the next cluster's plot.
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for the average silhouette score of all values.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the y-axis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot showing the actual clusters formed.
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors)

        # Draw white circles at the cluster centers and number them.
        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

        # Honor the documented `save` switch (it used to be ignored).
        if save:
            fig.savefig(path + '/' + str(name) + "_c_" + str(n_clusters) + '.png')

    return Koptimal
def plot_insertions_two_panels(fname, seqs, gene, domain, tax, id2name):
    """
    plot insertions wrt model positions and save them to a PDF

    2 panels:
        1. insertions with ORF
            - triangle if has intron
            - circle if no intron
        2. insertion does not have ORF
            - triangle if has intron
            - circle if no intron

    Points are colored by taxon; sequences without a taxon are 'n/a'.

    # seqs[id] = [gene, model, [[i-gene_pos, i-model_pos, i-length, orf, intron], ...]]

    fname: input file name; everything before its last '.' prefixes the
        output name '<prefix>.<domain>-<gene>rRNAgene-insertions.pdf'
    gene, domain: only sequences whose first two fields match are plotted
    tax: mapping name -> taxon, or False for no taxonomy
    id2name: mapping sequence id -> name used to look up tax
    """
    # imports are local, as in the original function
    import matplotlib.patches
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.backends.backend_pdf import PdfPages
    import matplotlib.cm as cm

    plt.rcParams['pdf.fonttype'] = 42  # illustrator pdf

    # axis
    max_insert = max_insertion(seqs, gene, domain)
    height = max_insert + max_insert * 0.10
    xmax = model_length(gene, domain)
    f, axarr = plt.subplots(2, sharex=True, sharey=True)
    plt.axis([0, xmax, 0, height])
    plt.xticks(np.arange(0, xmax, 100), rotation=45)
    plt.yticks(np.arange(0, height, 100))

    # labels
    axarr[0].set_title('encodes ORF')
    axarr[1].set_title('does not encode ORF')
    plt.suptitle('%s %s rRNA gene insertions' % (domain, gene))
    plt.ylabel('insertion length (bp)')
    plt.xlabel('position on %s %s rRNA gene model' % (domain, gene))

    # colors: one per taxon, with 'n/a' always present
    if tax is False:
        taxa = ['n/a']
    else:
        taxa = sorted({
            tax[id2name[i]]
            for i, j in seqs.items()
            if j[2] != [] and id2name[i] in tax
        })
        if 'n/a' not in taxa:
            taxa.append('n/a')
    # `cm.spectral` was removed from matplotlib; `nipy_spectral` is the same
    # colormap.
    color2tax = dict(zip(taxa, cm.nipy_spectral(np.linspace(0, 1, len(taxa)))))

    # plot
    for name, seq in seqs.items():
        g, d = seq[0], seq[1]
        if g != gene or d != domain or seq[2] == []:
            continue
        if tax is False or id2name[name] not in tax:
            t = 'n/a'
        else:
            t = tax[id2name[name]]
        c = color2tax[t]
        for ins in seq[2]:
            x, y = int(ins[1]), int(ins[2])
            # `== True` is kept on purpose: the flags' exact type is not
            # visible here, and plain truthiness could change which branch
            # is taken.
            marker = '^' if ins[4] == True else 'o'  # noqa: E712
            size = 30
            # panel 0: has ORF, panel 1: no ORF
            panel = axarr[0] if ins[3] == True else axarr[1]  # noqa: E712
            panel.scatter(x, y, marker=marker, s=size, facecolors='none',
                          clip_on=False, edgecolors=c, label=t)

    # legend
    boxes = [
        matplotlib.patches.Rectangle((0, 0), 1, 1, fc=color2tax[t])
        for t in taxa
    ]
    names = list(taxa)
    plt.legend(boxes, names, prop={'size': 10}, loc='center left',
               bbox_to_anchor=(1, 0.5), scatterpoints=1)

    # save
    figure = plt.gcf()
    figure.set_size_inches(20, 12)
    # Strip only the final extension (rsplit without a limit cut the name at
    # its first dot); this matches plot_insertions.
    pdf = PdfPages('%s.%s-%srRNAgene-insertions.pdf' %
                   (fname.rsplit('.', 1)[0], domain, gene))
    try:
        pdf.savefig()
    finally:
        # release the figure and the file even if rendering fails
        plt.close()
        pdf.close()
def run_silhouette_analysis(**kargs):
    """Run a KMeans silhouette analysis and return the best cluster count.

    For each candidate number of clusters, KMeans (``random_state=10``) is
    fitted, the average silhouette score is recorded, and a two-panel figure
    (silhouette plot + scatter of the first two columns) is saved as
    ``silhouette_test-<identifier>-nC<n_clusters>.tif`` in the output
    directory.

    Keyword arguments:
        X: 2-D data array with more than one row (required).
        y: optional; read but not used.
        range_n_clusters: iterable of cluster counts to try.  Defaults to
            ``range(2, max(2, N // 2), 5)`` with N the number of rows of X.
            The caller's object is never modified.
        n_clusters: optional extra cluster count added to the candidates.
        identifier: tag used in output file names.
        reduce_dimension: if true and the ``learn_manifold`` module can be
            imported, X is first reduced with ``learn_manifold.tsne``.
        outputdir: directory for the figures (default ``<cwd>/plot``);
            created if missing.

    Returns:
        The candidate n_clusters with the highest average silhouette score
        (1 is the best possible score, -1 the worst, values near 0 indicate
        overlapping clusters).

    Raises:
        KeyError: if X is not supplied.
        ValueError: if there is no candidate cluster count to evaluate.
    """
    from sklearn.metrics import silhouette_samples, silhouette_score

    # Optional dependency: only a failed import is tolerated.
    try:
        import learn_manifold
    except ImportError:
        learn_manifold = None

    # [params] input
    X = kargs['X']
    y = kargs.get('y', None)  # accepted for API compatibility; unused
    assert X is not None and X.shape[0] > 1

    N = X.shape[0]
    # Integer division: range() rejects floats on Python 3.
    n_clusters_max = max(2, N // 2)
    # Copy into a list so that a range object can be extended below and the
    # caller's sequence is left untouched.
    range_n_clusters = list(
        kargs.get('range_n_clusters', range(2, n_clusters_max, 5)))
    if not range_n_clusters:
        raise ValueError(
            'run_silhouette_analysis> no candidate n_clusters to evaluate '
            '(N=%d)' % N)

    n_clusters_min, n_clusters_max = min(range_n_clusters), max(
        range_n_clusters)
    identifier = kargs.get('identifier',
                           'nCm%d_M%d' % (n_clusters_min, n_clusters_max))

    dim0 = X.shape[1]
    if kargs.get('reduce_dimension', False) and learn_manifold is not None:
        # dimensionality reduction prior to the analysis; t-SNE by default
        Xp = learn_manifold.tsne(X, identifier=identifier)
        print('run_silhouette_analysis> dim of X from %d to %d' %
              (dim0, Xp.shape[1]))
    else:
        Xp = X
    # Use 'Xp' from this point onwards

    n_clusters_requested = kargs.get('n_clusters', None)
    if n_clusters_requested is not None:
        if n_clusters_requested not in range_n_clusters:
            range_n_clusters.append(n_clusters_requested)
        print('param> input n_clusters (requested): %s > range_n_clusters: %s'
              % (n_clusters_requested, range_n_clusters))

    # identifier (recomputed: the candidate list may have grown)
    identifier = kargs.get(
        'identifier',
        'nR%s-%s' % (min(range_n_clusters), max(range_n_clusters)))
    outputdir = kargs.get('outputdir', os.path.join(os.getcwd(), 'plot'))
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)  # base directory

    ranked_scores = []
    for n_clusters in range_n_clusters:
        # 1 row, 2 columns: silhouette plot and cluster scatter.
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The silhouette coefficient can range from -1 to 1; only
        # [-0.1, 1] is shown.
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters + 1) * 10 inserts blank space between the
        # silhouette plots of individual clusters.
        ax1.set_ylim([0, len(Xp) + (n_clusters + 1) * 10])

        # Fixed seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(Xp)

        # Average silhouette over all samples.
        silhouette_avg = silhouette_score(Xp, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        ranked_scores.append((n_clusters, silhouette_avg))

        # Silhouette score of each sample.
        sample_silhouette_values = silhouette_samples(Xp, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette scores of the samples in cluster i.
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # `cm.spectral` was removed from matplotlib; `nipy_spectral`
            # is the same colormap.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle.
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # 10 blank rows before the next cluster's plot.
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for the average silhouette score of all values.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the y-axis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot showing the actual clusters formed.
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(Xp[:, 0], Xp[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors)

        # Draw white circles at the cluster centers and number them.
        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % n_clusters),
            fontsize=14, fontweight='bold')

        graph_ext = 'tif'
        fpath = os.path.join(
            outputdir,
            'silhouette_test-%s-nC%s.%s' % (identifier, n_clusters, graph_ext))
        print('output> saving silhouette test result to %s' % fpath)
        fig.savefig(fpath)
        # The figure only exists to be written to disk; close it so that a
        # long scan does not accumulate open figures.
        plt.close(fig)
    ### end range of n_clusters

    # Best clustering first: a higher silhouette score is better.  (Sorting
    # by ascending |score| would have returned the worst candidate.)
    ranked_scores = sorted(ranked_scores, key=lambda x: x[1], reverse=True)
    print('output> ranked scores (n_clusters vs average score):\n%s\n' %
          ranked_scores)

    return ranked_scores[0][0]
def run_kmodes(syms, X, n, alpha):
    """Plot the silhouette analysis of a cached n-cluster result.

    Loads ``(X_ENC, clusters, centroids)`` from ``"<n>_CLUSTERS.pkl"`` in the
    current directory, computes silhouette scores with the module-level
    ``simple_compare`` metric, and shows the per-cluster silhouette plot.

    Args:
        syms: unused in this function.
        X: unused in this function (the encoded data comes from the cache).
        n: number of clusters; selects the cache file and the plot layout.
        alpha: unused in this function.

    Returns:
        Tuple ``(np.amax(clusters) + 1, sil_avg)``: the highest cluster label
        plus one, and the average silhouette score.

    Raises:
        IOError: if the cache file does not exist.
    """
    cache_path = "%d_CLUSTERS.pkl" % (n, )
    # NOTE(review): only the cached path is visible in this function; without
    # the cache there is no data to plot, so fail with a clear message
    # instead of an UnboundLocalError further down.
    if not os.path.isfile(cache_path):
        raise IOError("cluster cache not found: %s" % cache_path)
    # Pickles are binary: open in 'rb' and close the handle deterministically.
    # SECURITY: pickle.load executes arbitrary code; only load trusted caches.
    with open(cache_path, "rb") as fh:
        X_ENC, clusters, centroids = pickle.load(fh)

    # A single panel: the silhouette plot.
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    # The silhouette coefficient can range from -1 to 1; only [-0.1, 1] is
    # shown.
    ax1.set_xlim([-0.1, 1])
    # The (n + 1) * 10 inserts blank space between the silhouette plots of
    # individual clusters.
    ax1.set_ylim([0, len(X_ENC) + (n + 1) * 10])

    sil_avg = silhouette_score(X_ENC, clusters, metric=simple_compare)
    print("For n_clusters =", n, "The average silhouette_score is :", sil_avg)
    sample_silhouette_values = silhouette_samples(X_ENC, clusters,
                                                  metric=simple_compare)

    y_lower = 10
    for i in range(n):
        # Sorted silhouette scores of the samples in cluster i.
        ith_cluster_silhouette_values = sample_silhouette_values[clusters == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # `cm.spectral` was removed from matplotlib; `nipy_spectral` is the
        # same colormap.
        color = cm.nipy_spectral(float(i) / n)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # 10 blank rows before the next cluster's plot.
        y_lower = y_upper + 10

    # Function-call form: valid on Python 2 and 3 with identical output.
    print(sample_silhouette_values)

    ax1.set_title("The silhouette plot for %d clusters." % (n, ))
    ax1.set_xlabel("The silhouette coefficient values (AVF = %f)" % (sil_avg, ))
    ax1.set_ylabel("Cluster label")

    # The vertical line for the average silhouette score of all values.
    ax1.axvline(x=sil_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # NOTE(review): `window.state('zoomed')` looks backend/platform specific
    # (Tk) — confirm it is available wherever this runs.
    mng = plt.get_current_fig_manager()
    mng.window.state('zoomed')
    plt.show()
    # fig.savefig('%d_SILS.png' % (np.amax(clusters)+1,))
    # plt.close(fig)
    return (np.amax(clusters) + 1, sil_avg)
def plot_insertions(fname, seqs, gene, domain, tax, id2name):
    """
    plot insertions wrt model positions and save them to a PDF

    4 panels, selected by the ORF / intron flags of each insertion:
        0. encodes ORF and intron
        1. encodes ORF, no intron
        2. encodes intron, no ORF
        3. no intron, no ORF

    Points are colored by taxon ('n/a' when unknown).  Insertions with an
    ORF take their marker and size from setup_markers(seqs), keyed by the
    family found in the insertion's last field; all others are circles.

    # seqs[id] = [gene, model, [[i-gene_pos, i-model_pos, i-length, ..., orf, intron, families], ...]]

    fname: input file name; everything before its last '.' prefixes the
        output name '<prefix>.<domain>-<gene>rRNAgene-insertions.pdf'
    gene, domain: only sequences whose first two fields match are plotted
    tax: mapping name -> taxon, or False for no taxonomy
    id2name: mapping sequence id -> name used to look up tax
    """
    # imports are local, as in the original function
    import matplotlib.patches
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.backends.backend_pdf import PdfPages
    import matplotlib.cm as cm

    plt.rcParams['pdf.fonttype'] = 42  # illustrator pdf

    # axis
    max_insert = max_insertion(seqs, gene, domain)
    height = max_insert + max_insert * 0.10
    xmax = model_length(gene, domain)
    f, axarr = plt.subplots(4, sharex=True, sharey=True)
    plt.axis([0, xmax, 0, height])
    plt.xticks(np.arange(0, xmax, 100), rotation=45)
    plt.yticks(np.arange(0, height, 200))

    # labels
    axarr[0].set_title('encodes ORF and intron')
    axarr[1].set_title('encodes ORF, no intron')
    axarr[2].set_title('encodes intron, no ORF')
    axarr[3].set_title('no intron, no ORF')
    plt.suptitle('%s %s rRNA gene insertions' % (domain, gene))
    plt.ylabel('insertion length (bp)')
    plt.xlabel('position on %s %s rRNA gene model' % (domain, gene))

    # colors: one per taxon, with 'n/a' always present
    if tax is False:
        taxa = ['n/a']
    else:
        taxa = sorted({
            tax[id2name[i]]
            for i, j in seqs.items()
            if j[2] != [] and id2name[i] in tax
        })
        if 'n/a' not in taxa:
            taxa.append('n/a')
    # `cm.spectral` was removed from matplotlib; `nipy_spectral` is the same
    # colormap.  Pairing taxa with colors directly also removes the need for
    # itertools.cycle, which this function never imported.
    color2tax = dict(zip(taxa, cm.nipy_spectral(np.linspace(0, 1, len(taxa)))))

    # markers
    markers = setup_markers(seqs)

    # plot
    for name, seq in seqs.items():
        g, d = seq[0], seq[1]
        if g != gene or d != domain or seq[2] == []:
            continue
        if tax is False or id2name[name] not in tax:
            t = 'n/a'
        else:
            t = tax[id2name[name]]
        c = color2tax[t]
        for ins in seq[2]:
            # exactly one non-'n/a' family identifies the insertion
            families = [fam for fam in ins[-1].values() if fam != 'n/a']
            family = families[0] if len(families) == 1 else 'n/a'
            x, y = int(ins[1]), int(ins[2])
            orf, intron = ins[-3], ins[-2]
            # panel index: see the docstring (identity checks kept as-is)
            if orf is True:
                p = 0 if intron is True else 1
            else:
                p = 2 if intron is True else 3
            marker, size = 'o', 30
            if orf is True:
                marker, size = markers[family]
            axarr[p].scatter(x, y,
                             edgecolors=c, marker=marker, s=size,
                             label=family, facecolors='none', clip_on=False)

    # legend 1: one entry per family, taken from the two ORF panels
    handles, labels = [], []
    for ax in axarr[0:2]:
        hs, ls = ax.get_legend_handles_labels()
        for h, l in zip(hs, ls):
            if l in labels:
                continue
            handles.append(h)
            labels.append(l)
    l1 = plt.legend(handles, labels, scatterpoints=1,
                    prop={'size': 10}, loc='upper left',
                    bbox_to_anchor=(1, 0.5))

    # legend 2: one colored box per taxon
    names = list(taxa)
    boxes = [
        matplotlib.patches.Rectangle((0, 0), 1, 1, fc=color2tax[t])
        for t in taxa
    ]
    plt.legend(boxes, names, scatterpoints=1,
               prop={'size': 10}, loc='lower left', bbox_to_anchor=(1, 0.5))
    plt.gca().add_artist(l1)  # add l1 as a separate legend

    # save
    # plt.tight_layout()
    figure = plt.gcf()
    figure.set_size_inches(12, 12)
    pdf = PdfPages('%s.%s-%srRNAgene-insertions.pdf' %
                   (fname.rsplit('.', 1)[0], domain, gene))
    try:
        pdf.savefig()
    finally:
        # release the figure and the file even if rendering fails
        plt.close()
        pdf.close()
def plot_silhouette(self, reduced_data, **kwargs):
    """Fit ``self.clusterer`` on *reduced_data* and draw its silhouette plot.

    Draws a two-panel figure: the per-cluster silhouette plot (left) and a
    scatter of the first two columns of *reduced_data* with the cluster
    centers (right).

    Code from http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

    :param reduced_data: 2-D array passed to ``self.clusterer.fit_predict``;
        its first two columns are plotted
    :param kwargs: ``no_display`` (optional, default False): if true the
        figure is saved to ``kwargs['img_name']`` instead of being shown
    """
    # 1 row, 2 columns: silhouette plot and cluster scatter.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The silhouette coefficient can range from -1 to 1; only [-0.1, 1] is
    # shown.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters + 1) * 10 inserts blank space between the silhouette
    # plots of individual clusters.
    ax1.set_ylim([0, len(reduced_data) + (self.n_clusters + 1) * 10])

    cluster_labels = self.clusterer.fit_predict(reduced_data)

    # Average silhouette over all samples: a view of the density and
    # separation of the formed clusters.
    silhouette_avg = silhouette_score(reduced_data, cluster_labels)
    print("For n_clusters =", self.n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Silhouette score of each sample.
    sample_silhouette_values = silhouette_samples(reduced_data, cluster_labels)

    y_lower = 10
    for i in range(self.n_clusters):
        # Sorted silhouette scores of the samples in cluster i.
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # `cm.spectral` was removed from matplotlib; `nipy_spectral` is the
        # same colormap.
        color = cm.nipy_spectral(float(i) / self.n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # 10 blank rows before the next cluster's plot.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for the average silhouette score of all values.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot showing the actual clusters formed.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / self.n_clusters)
    ax2.scatter(reduced_data[:, 0], reduced_data[:, 1], marker='.', s=30,
                lw=0, alpha=0.7, c=colors)

    # Draw white circles at the cluster centers and number them.
    centers = self.clusterer.cluster_centers_
    ax2.scatter(centers[:, 0], centers[:, 1],
                marker='o', c="white", alpha=1, s=200)
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(
        ("Silhouette analysis for KMeans clustering on sample data "
         "with n_clusters = %d" % self.n_clusters),
        fontsize=14, fontweight='bold')

    # `no_display` is optional: a missing key means "show the figure"
    # instead of raising KeyError.
    if kwargs.get('no_display', False):
        plt.savefig(kwargs['img_name'])
    else:
        plt.show()
def compare_silhoutte_scores(dfi, samples, range_n_clusters,
                             cluster_dim='features'):
    """Compare silhouette scores of KMeans for several cluster numbers.

    For every entry of *range_n_clusters* a KMeans model
    (``random_state=10``) is fitted, its average silhouette score is printed,
    and a two-panel figure is drawn: the per-cluster silhouette plot and a
    scatter of the first two columns of the clustered matrix with the
    cluster centers.  Missing values are replaced by 0 before clustering.

    Source code obtained and modified from :-
    http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

    Parameters
    ----------
    dfi : pandas dataframe
        input dataframe with features as rows and samples as columns
    samples : list of str
        Names of samples (columns of *dfi* to use)
    range_n_clusters : list of int
        The list of cluster numbers for which the silhouette score is to be
        computed.
    cluster_dim : Optional[str]
        Dimension along which data is to be clustered.  Default is along
        features.  To cluster samples, set cluster_dim='samples'

    Returns
    -------
    None
        The figures are displayed with ``plt.show()``.
    """
    df = dfi.fillna(0).copy()
    X = df[samples].values
    if cluster_dim == 'samples':
        X = X.T

    for n_clusters in range_n_clusters:
        # 1 row, 2 columns: silhouette plot and cluster scatter.
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The silhouette coefficient can range from -1 to 1; only
        # [-0.1, 1] is shown.
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters + 1) * 10 inserts blank space between the
        # silhouette plots of individual clusters.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Fixed seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # Average silhouette over all samples.
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Silhouette score of each sample.
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette scores of the samples in cluster i.
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # `cm.spectral` was removed from matplotlib; `nipy_spectral`
            # is the same colormap.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle.
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # 10 blank rows before the next cluster's plot.
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for the average silhouette score of all values.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the y-axis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot showing the actual clusters formed.
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors, edgecolor='k')

        # Draw white circles at the cluster centers and number them.
        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                        s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % n_clusters),
            fontsize=14, fontweight='bold')

    # As in the scikit-learn example this is adapted from, all figures are
    # shown together once every cluster number has been processed.
    plt.show()
# Report the cluster count and its average silhouette score.  Written as a
# single formatted string so the output is the same on Python 2 and 3.
print('%s %s' % (n_clusters, silhouette_avg))

# 1 row, 2 columns; only the left panel (silhouette plot) is filled here.
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)
ax1.set_xlim([-0.1, 1])
# The (n_clusters + 1) * 10 inserts blank space between the silhouette plots
# of individual clusters.
ax1.set_ylim([0, len(all) + (n_clusters + 1) * 10])

y_lower = 10
for i in range(n_clusters):
    # Sorted silhouette scores of the samples in cluster i.
    ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
    ith_cluster_silhouette_values.sort()

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    # `cm.spectral` was removed from matplotlib; `nipy_spectral` is the same
    # colormap.
    color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(np.arange(y_lower, y_upper),
                      0, ith_cluster_silhouette_values,
                      facecolor=color, edgecolor=color, alpha=0.7)

    # Label the silhouette plots with their cluster numbers at the middle
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Compute the new y_lower for next plot
    y_lower = y_upper + 10  # 10 for the 0 samples

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")

# The vertical line for average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(scaledX, cluster_labels)

y_lower = 10
for i in range(n_clusters):
    # Aggregate the silhouette scores for samples belonging to
    # cluster i, and sort them
    ith_cluster_silhouette_values = \
        sample_silhouette_values[cluster_labels == i]
    ith_cluster_silhouette_values.sort()

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    # `cm.spectral` was removed from matplotlib; `nipy_spectral` is the same
    # colormap.
    color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(np.arange(y_lower, y_upper),
                      0, ith_cluster_silhouette_values,
                      facecolor=color, edgecolor=color, alpha=0.7)

    # Label the silhouette plots with their cluster numbers at the middle
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Compute the new y_lower for next plot
    y_lower = y_upper + 10  # 10 for the 0 samples

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
def plot_silhouette(clf, X, title='Silhouette Analysis', metric='euclidean',
                    copy=True, ax=None, cmap='nipy_spectral'):
    """Plots silhouette analysis of clusters using fit_predict.

    Args:
        clf: Clusterer instance that implements ``fit`` and ``fit_predict``
            methods.

        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features.

        title (string, optional): Title of the generated plot. Defaults to
            "Silhouette Analysis"

        metric (string or callable, optional): The metric to use when
            calculating distance between instances in a feature array.
            If metric is a string, it must be one of the options allowed by
            sklearn.metrics.pairwise.pairwise_distances. If X is
            the distance array itself, use "precomputed" as the metric.

        copy (boolean, optional): Determines whether ``fit`` is used on
            **clf** or on a copy of **clf**.

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
            draw the silhouette plot. If None, the plot is drawn on a new
            set of axes.

        cmap (string or :class:`matplotlib.colors.Colormap`, optional):
            Colormap used to color the clusters. Defaults to
            "nipy_spectral", the current name of the colormap formerly
            available as "spectral".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
            drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> kmeans = KMeans(n_clusters=4, random_state=1)
        >>> skplt.plot_silhouette(kmeans, X)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_silhouette.png
           :align: center
           :alt: Silhouette Plot
    """
    if copy:
        clf = clone(clf)

    cluster_labels = clf.fit_predict(X)

    # Cluster labels are assumed to be 0..n_clusters-1 (see the loop below).
    n_clusters = len(set(cluster_labels))

    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)

    sample_silhouette_values = silhouette_samples(X, cluster_labels,
                                                  metric=metric)

    if ax is None:
        fig, ax = plt.subplots(1, 1)

    ax.set_title(title)
    ax.set_xlim([-0.1, 1])

    # (n_clusters + 1) * 10 leaves blank rows between the clusters' plots.
    ax.set_ylim([0, len(X) + (n_clusters + 1) * 10 + 10])

    ax.set_xlabel('Silhouette coefficient values')
    ax.set_ylabel('Cluster label')

    # Resolve the colormap once; `cm.spectral` no longer exists in
    # matplotlib, so the colormap is looked up by name instead.
    colormap = plt.get_cmap(cmap)

    y_lower = 10

    for i in range(n_clusters):
        # Sorted silhouette scores of the samples in cluster i.
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = colormap(float(i) / n_clusters)

        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)

        # Cluster number at the middle of its silhouette.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        y_lower = y_upper + 10

    ax.axvline(x=silhouette_avg, color="red", linestyle="--",
               label='Silhouette score: {0:0.3f}'.format(silhouette_avg))

    ax.set_yticks([])  # Clear the y-axis labels / ticks
    ax.set_xticks(np.arange(-0.1, 1.0, 0.2))

    ax.legend(loc='best')

    return ax
# Arrays from the previously loaded data file (IC8 scan).
PEsx1_ic8 = datafile['PEsx1']
SCsx1_ic8 = datafile['SCsx1']

# Load the IC16 scan and pull the same two arrays.
fileheader = 'PE_SC_DPDoubPen_LsEq1_MsEq1_g9p81_tstep001_icscanIC16_embeddelay5_999_delays'
datafile = loadnpzfile(datadir + fileheader + npz)
PEsx1_ic16 = datafile['PEsx1']
SCsx1_ic16 = datafile['SCsx1']

# Time axis: 0..tmax inclusive in steps of dt.
tmax, dt = 100, 0.001
t = np.arange(0, tmax + dt, dt)
timeindex = delayindex * 0.001

# 20 equally spaced RGBA colors (alpha fixed at 1).
# NOTE: the `spectral` colormap alias was removed from matplotlib (2.2);
# `nipy_spectral` is the same colormap under its current name.
import matplotlib.cm as cm
colors = np.zeros([20, 4])
for i in np.arange(20):
    c = cm.nipy_spectral(i / 20., 1)
    colors[i, :] = c

# Possible marker styles.
points = ['o', 'v', 's', 'p', '*', 'h', '^', 'D', '+', '>', 'H', 'd', 'x', '<']

# Bold axes, ticks and lines for this figure.
plt.rc('axes', linewidth=2.0)
plt.rc('xtick.major', width=2.0)
plt.rc('ytick.major', width=2.0)
plt.rc('xtick.minor', width=2.0)
plt.rc('ytick.minor', width=2.0)
plt.rc('lines', markersize=8, markeredgewidth=0.0, linewidth=2.0)
#plt.rcParams['ps.fonttype'] = 3

fig = plt.figure(num=1, figsize=(7, 6), dpi=600, facecolor='w', edgecolor='k')
left = 0.15  # the left side of the subplots of the figure
right = 0.94  # the right side of the subplots of the figure
bottom = 0.1  # the bottom of the subplots of the figure
top = 0.96  # the top of the subplots of the figure
def plot_scores_and_clusters_from_pca(X, corpus, range_n_clusters=None):
    """
    Plotting silhouette scores and the clusters

    For each number of clusters a KMeans model is fitted on X, the predicted
    labels are written next to the texts of `corpus` to
    'rd-pca-labeled-clusters-<n>.csv', and a two-panel figure (silhouette
    plot + scatter of the first two columns of X with the cluster centers)
    is shown without blocking and saved to 'rd-pca-clusters-<n>.png'.

    Adapted from
    http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

    X: 2-D array passed to KMeans; its first two columns are plotted
    corpus: texts stored in the 'text' column of the CSV, one per row of X
    range_n_clusters: optional iterable of cluster counts to try; defaults
        to [2, 3, 4, 5, 6]
    """
    if range_n_clusters is None:
        range_n_clusters = [2, 3, 4, 5, 6]

    for n_clusters in range_n_clusters:
        # 1 row, 2 columns: silhouette plot and cluster scatter.
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)
        fig.subplots_adjust(wspace=0.3)  # Adjust width space between subplots

        # The silhouette coefficient can range from -1 to 1; only
        # [-0.2, 1] is shown.
        ax1.set_xlim([-0.2, 1])
        # The (n_clusters + 1) * 10 inserts blank space between the
        # silhouette plots of individual clusters.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # No fixed seed here, so labels may differ between runs.
        clusterer = KMeans(n_clusters=n_clusters)
        cluster_labels = clusterer.fit_predict(X)

        # Saving the data with predicted classes
        newdata = pd.DataFrame({'label': cluster_labels, 'text': corpus})
        newdata.to_csv('rd-pca-labeled-clusters-' + str(n_clusters) + '.csv',
                       encoding='utf-8')

        # Average silhouette over all samples: a view of the density and
        # separation of the formed clusters.
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("PCA: For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Silhouette score of each sample.
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette scores of the samples in cluster i.
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # `cm.spectral` was removed from matplotlib; `nipy_spectral`
            # is the same colormap.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle.
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # 10 blank rows before the next cluster's plot.
            y_lower = y_upper + 10

        ax1.set_title("Silhouette plot for the various clusters", fontsize=11)
        ax1.set_xlabel("Silhouette coefficient values, Avg: " +
                       str(round(silhouette_avg, 4)), fontsize=11)
        ax1.set_ylabel("Cluster label", fontsize=11)

        # The vertical line for the average silhouette score of all values.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the y-axis labels / ticks
        ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot showing the actual clusters formed.
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors)

        # Draw white circles at the cluster centers and number them.
        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$% d$' % i, alpha=1, s=50)

        ax2.set_title("Visualization of the clustered data", fontsize=11)
        ax2.set_xlabel("Feature space for the 1st feature", fontsize=11)
        ax2.set_ylabel("Feature space for the 2nd feature", fontsize=11)

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=12, fontweight='bold')

        # Non-blocking show, so the figure can still be saved afterwards.
        plt.show(block=False)
        plt.savefig('rd-pca-clusters-' + str((n_clusters)) + '.png', dpi=600)
def plot_ellipses(ds, sts, x=0, y=1, ci=.95, labels=None, cmap=None,
                  scat=False, linestyle=None, nude=False, axes='off',
                  fig=None, **kwargs):
    """Draw one confidence ellipse per group on a pair of factor axes.

    Parameters
    ----------
    ds : dataset-like
        ``ds.samples`` is indexed as ``samples[group, factor, replicate]``
        (presumably bootstrap factor scores -- confirm with the caller).
        ``ds.sa`` may provide a ``'linestyle'`` entry and ``ds.targets``
        provides the default labels.
    sts : object
        Must expose ``eigv`` and ``inertia`` sequences indexed by factor;
        they are only read when ``nude`` is false.
    x, y : int
        Indices of the factors plotted on the horizontal / vertical axis.
    ci : float
        Confidence level; converted to an ellipse size through the
        chi-square quantile with 2 degrees of freedom.
    labels : sequence, optional
        One annotation per group; defaults to ``list(ds.targets)``.
    cmap : sequence of colors, optional
        One color per group; defaults to samples of ``nipy_spectral``.
    scat : bool
        Also scatter the individual points of every group.
    linestyle : sequence, optional
        One line style per group.
    nude : bool
        If true, draw no annotations or axis texts.
    axes : str
        Passed to ``plt.axis`` last (``'off'`` hides the axes).
    fig : int or None
        Figure number handed to ``plt.figure``.
    **kwargs
        Forwarded to ``plt.plot`` for every ellipse outline.

    Returns
    -------
    int
        The number of the figure that was drawn on.
    """
    f = plt.figure(fig)
    f.gca()  # make sure the figure owns an axes before the pyplot calls
    boot = ds.samples
    n_groups = boot.shape[0]

    # `cmap == None` is evaluated element-wise for array input and then
    # fails in the `if`; an identity test is what is meant.
    if cmap is None:
        # `cm.spectral` was removed from Matplotlib; `nipy_spectral` is the
        # same colormap under its current name.
        cmap = cm.nipy_spectral(np.linspace(.2, .85, n_groups))
    if linestyle is None:
        # `in` works on Python 2 and 3, `has_key` only on Python 2.
        if 'linestyle' in ds.sa:
            linestyle = list(ds.sa['linestyle'])
        else:
            linestyle = ['solid'] * n_groups
    if labels is None:
        labels = list(ds.targets)

    mx = np.max(abs(boot[:, [x, y], :]))
    xmx = mx * 1.2
    arrow_width = .01 * xmx
    head_width = .05 * xmx

    # Axis arrows through the origin.
    plt.arrow(-xmx, 0, 2 * xmx, 0, color='gray', alpha=.7,
              width=arrow_width, head_width=head_width,
              length_includes_head=True)
    plt.arrow(0, -mx, 0, mx * 2, color='gray', alpha=.7,
              width=arrow_width, head_width=head_width,
              length_includes_head=True)

    # Loop invariants: chi-square scale for the requested level and the
    # parameterisation of the unit circle.
    level = chi2.ppf(ci, 2)
    theta = np.linspace(0, 2 * np.pi, 128)

    for l in range(n_groups):
        points = np.column_stack((boot[l, x, :], boot[l, y, :]))
        center = np.mean(points, 0)
        eigvals, rot = np.linalg.eigh(np.cov(points.T))

        # Semi-axes of the ellipse for the requested confidence level.
        a = np.sqrt(eigvals[0] * level)
        b = np.sqrt(eigvals[1] * level)

        outline = np.column_stack((np.cos(theta) * a, np.sin(theta) * b))
        outline = outline.dot(rot.T) + center
        # Repeat the first vertex so the outline is closed. Plain arrays
        # replace the former `np.mat`, which NumPy 2.0 no longer provides.
        outline = np.vstack((outline, outline[:1]))
        plt.plot(outline[:, 0], outline[:, 1],
                 c=cmap[l], ls=linestyle[l], **kwargs)

        if scat:
            plt.scatter(points[:, 0], points[:, 1], c=cmap[l])
        if not nude:
            plt.annotate(labels[l], xy=(center[0], center[1]))

    if not nude:
        # NOTE(review): the flattened source does not show which of the
        # following statements were nested under `if not nude`; all text
        # labels are treated as part of it. usetex is only switched on for
        # the eigenvalue labels and is always switched off again.
        mpl.rcParams['text.usetex'] = True
        try:
            plt.text(-xmx, .05 * mx,
                     r'$\lambda = %s$' % np.round(sts.eigv[x], 2))
            plt.text(xmx * .05, mx * .9,
                     r'$\lambda = %s$' % np.round(sts.eigv[y], 2))
        finally:
            mpl.rcParams['text.usetex'] = False

        tau = r'$\tau = $'
        perc = r'$\%$'
        plt.text(-xmx, -.1 * mx,
                 '%s $%s$%s' % (tau, np.round(100 * sts.inertia[x], 0), perc))
        plt.text(xmx * .05, mx * .8,
                 '%s $%s$%s' % (tau, np.round(100 * sts.inertia[y], 0), perc))

        # Factor numbers (1-based) next to the axis arrows.
        plt.text(-.15 * xmx, .8 * mx, '$%s$' % (y + 1), fontsize=20)
        plt.text(xmx * .85, -mx * .2, '$%s$' % (x + 1), fontsize=20)

    plt.axis('scaled')
    plt.axis([-xmx, xmx, -mx, mx])
    plt.axis(axes)
    return f.number
# Rescale the delay indices by 1e5 / 1e6.
timeindex = (delayindex*1e5)/(1e6)

# One row per entry of `timestep_arr`.
# NOTE(review): the row count 34 is hard-coded; confirm it matches
# len(timestep_arr).
PEs_1 = np.zeros([34,500])
SCs_1 = np.zeros([34,500])
for file in np.arange(len(timestep_arr)):
    fileheader = 'PE_SC_IDdatabase_Type_1_data_3000_499_delays_3227orbits_'+str(timestep_arr[file])+'_timesteps'
    datafile = loadnpzfile(datadir+fileheader+npy)
    PEs_1[file,:]=datafile['PEs']
    SCs_1[file,:]=datafile['SCs']

# RGBA table with `ncolors` evenly spaced colormap samples (alpha = 1).
ncolors=6
colors = np.zeros([ncolors,4])
for i in np.arange(ncolors):
    # `cm.spectral` was removed from Matplotlib; `nipy_spectral` is the same
    # colormap under its current name.
    c = cm.nipy_spectral(i/float(ncolors),1)
    colors[i,:]=c
points = ['o','v','s','p','*','h','^','D','+','>','H','d','x','<']

# Global Matplotlib style for this figure.
plt.rc('axes',linewidth=2.0)
plt.rc('xtick.major',width=2.0)
plt.rc('ytick.major',width=2.0)
plt.rc('xtick.minor',width=2.0)
plt.rc('ytick.minor',width=2.0)
# A second plt.rc('lines', ...) call repeating markersize/markeredgewidth
# was redundant and has been folded into this one.
plt.rc('lines',markersize=2,markeredgewidth=0.0,linewidth=2.0)
#plt.rcParams['ps.fonttype'] = 42
#plt.rcParams['pdf.fonttype'] = 42

fig=plt.figure(num=1,figsize=(7,9),dpi=600,facecolor='w',edgecolor='k')
left = 0.16 # the left side of the subplots of the figure
SCs600 = datafile['SCs'] fileheader = 'Data_sine700period_ranphasestart_249_delays' datafile = loadnpzfile(datadir+fileheader+npy) PEs700 = datafile['PEs'] SCs700 = datafile['SCs'] #fileheader = 'PE_SC_sinewave_249_delays' #datafile = loadnpzfile(datadir+fileheader+npy) #PEsin = datafile['PEs'] #SCsin = datafile['SCs'] """ colors = np.zeros([5, 4]) for i in np.arange(5): c = cm.spectral(i / 5., 1) colors[i, :] = c points = ['o', 'v', 's', 'p', '*', 'h', '^', 'D', '+', '>', 'H', 'd', 'x', '<'] plt.rc('axes', linewidth=2.0) plt.rc('xtick.major', width=2.0) plt.rc('ytick.major', width=2.0) plt.rc('xtick.minor', width=2.0) plt.rc('ytick.minor', width=2.0) plt.rc('lines', markersize=2, markeredgewidth=0.0, linewidth=2.0) fig = plt.figure(num=1, figsize=(7, 6), dpi=600, facecolor='w', edgecolor='k') left = 0.15 # the left side of the subplots of the figure right = 0.94 # the right side of the subplots of the figure bottom = 0.1 # the bottom of the subplots of the figure top = 0.96 # the top of the subplots of the figure
SCs850old = datafile['SCs'] fileheader = 'PE_SC_DavidData_Class_3_249_delays_900_timesteps' datafile = loadnpzfile(datadir+fileheader+npy) PEs900old = datafile['PEs'] SCs900old = datafile['SCs'] fileheader = 'PE_SC_DavidData_Class_3_249_delays_950_timesteps' datafile = loadnpzfile(datadir+fileheader+npy) PEs950old = datafile['PEs'] SCs950old = datafile['SCs'] """ colors = np.zeros([19, 4]) for i in np.arange(19): c = cm.spectral(i / 19., 1) colors[i, :] = c points = ['o', 'v', 's', 'p', '*', 'h', '^', 'D', '+', '>', 'H', 'd', 'x', '<'] plt.rc('axes', linewidth=0.75) plt.rc('xtick.major', width=0.75) plt.rc('ytick.major', width=0.75) plt.rc('xtick.minor', width=0.75) plt.rc('ytick.minor', width=0.75) plt.rc('lines', markersize=2, markeredgewidth=0.0) plt.rc('lines', markersize=1.5, markeredgewidth=0.0) fig = plt.figure(num=1, figsize=(4, 3), dpi=300, facecolor='w', edgecolor='k') left = 0.16 # the left side of the subplots of the figure right = 0.94 # the right side of the subplots of the figure bottom = 0.2 # the bottom of the subplots of the figure
def plot_clusters_silhouette(X, cluster_labels, n_clusters, root='',
                             file_format='pdf'):
    """Plot the silhouette score for each cluster, given the distance matrix X.

    The figure is written to ``root`` as
    ``silhouette_analysis_<time>.<file_format>``; nothing is returned.

    Parameters
    ----------
    X : array_like, shape [n_samples_a, n_samples_a]
        Distance matrix.
    cluster_labels : array_like
        List of integers which represents the cluster of the corresponding
        point in X. Its length must match a dimension of X.
    n_clusters : int
        The number of clusters.
    root : str, optional
        The root path for the output creation
    file_format : ('pdf', 'png')
        Choose the extension for output images.
    """
    # The boolean masks below need an ndarray: comparing a plain list with an
    # int yields a single False instead of an element-wise mask.
    cluster_labels = np.asarray(cluster_labels)

    # A single axes holding the silhouette plot.
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(20, 15)

    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Per-sample silhouette values; their mean is the overall score.
    sample_silhouette_values = silhouette_samples(X, cluster_labels,
                                                  metric="precomputed")
    silhouette_avg = np.mean(sample_silhouette_values)
    logging.info("Average silhouette_score: %.4f", silhouette_avg)

    y_lower = 10
    for i in range(n_clusters):
        # Sorted silhouette values of the samples assigned to cluster i.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # `cm.spectral` was removed from Matplotlib; `nipy_spectral` is the
        # same colormap under its current name.
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Leave a gap of 10 rows before the next cluster.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("silhouette coefficient values")
    ax1.set_ylabel("cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # The title previously lacked its closing parenthesis.
    plt.suptitle(("Silhouette analysis (n_clusters {}, avg score {:.4f}, "
                  "tot Igs {})".format(n_clusters, silhouette_avg,
                                       X.shape[0])),
                 fontsize=14, fontweight='bold')

    filename = os.path.join(
        root,
        'silhouette_analysis_{}.{}'.format(extra.get_time(), file_format))
    fig.savefig(filename)
    logging.info('Figured saved %s', filename)
""" cluster_id = 1 n_points = m.shape[1] classifications = [UNCLASSIFIED] * n_points for point_id in range(0, n_points): point = m[:,point_id] if classifications[point_id] == UNCLASSIFIED: if _expand_cluster(m, classifications, point_id, cluster_id, eps, min_points): cluster_id = cluster_id + 1 return classifications a = np.array([np.array(x),np.array(y)]) import matplotlib.pyplot as pyplot o = dbscan(a,0.5,4) # print(o) unique_labels = np.array(o) n_clusters_ = np.max(o) # print(n_clusters_) # core_samples_mask = np.zeros_like(o, dtype=bool) k = n_clusters_ colors = cm.spectral(unique_labels.astype(float) / k) pyplot.scatter(X[:, 0], X[:, 1], marker='.', s=40, lw=0, alpha=0.7,c=colors) pyplot.title("The visualization of the clustered data.") pyplot.xlabel("X") pyplot.ylabel("Y") pyplot.suptitle(("DBSCAN clustering with n_clusters = %d" % k),fontsize=14, fontweight='bold') print("The number of clusters are ",k) pyplot.savefig("plot11.png")
# Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(X, cluster_labels) y_lower = 10 for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = \ sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.spectral(float(i) / n_clusters) axarr[k, t].fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) # Label the silhouette plots with their cluster numbers at the middle # axarr[k, t].text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) # Compute the new y_lower for next plot y_lower = y_upper + 10 # 10 for the 0 samples axarr[k, t].set_title("K = %d , AVGS = %f" % (n_clusters, silhouette_avg), font)
# plots of individual clusters, to demarcate them clearly. ax1.set_ylim([0, len(X) + (k + 1) * 10]) y_lower = 10 for i in range(k): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = sample_silhouette_values[ cluster_labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.spectral(float(i) / k) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) # Label the silhouette plots with their cluster numbers at the middle ax1.text(-0.1, y_lower + 0.5 * size_cluster_i, str(i)) # Compute the new y_lower for next plot y_lower = y_upper + 10 # 10 for the 0 samples ax1.set_title("Silhouette for k = " + str(k) + "\n(average silhouette coefficient dashed red line)")
np.savetxt(cfg['outName'] + '_Hz.csv', Hz) np.savetxt(cfg['outName'] + '_Sil.csv', Sils[base]) np.savetxt(cfg['outName'] + '_Ns.csv', Ns[base]) mis[base] = MI nmis[base] = MI / np.sqrt(Hz * Hgt) vMeasures[base] = 2. * MI / (Hz + Hgt) # print mis[base].shape, nmis[base].shape, vMeasures[base].shape # print mis[base] # print nmis[base] # print Ns[base] # print Sils[base] print "done with the runs" cl = cm.spectral(np.arange(255)) I = len(bases) + 1 if 'spkm' in bases and 'DpvMFmeans' in bases: indSpkm = np.ones(len(paramBase['spkm']), dtype=bool) indSpkm[Ns['spkm'].mean(axis=1) < Ns['DPvMFmeans'].min()] = False indSpkm[Ns['spkm'].mean(axis=1) > Ns['DPvMFmeans'].max()] = False paramBase['spkm'] = paramBase['spkm'][indSpkm] nmis['spkm'] = nmis['spkm'][indSpkm, :] mis['spkm'] = mis['spkm'][indSpkm, :] Ns['spkm'] = Ns['spkm'][indSpkm, :] Sils['spkm'] = Sils['spkm'][indSpkm, :] if 'DirvMF' in bases: print "DirvMF NMI: {} +- {}".format(nmis['DirvMF'].mean(),
def silhouette_test(X, kmeans, n_clusters, numsegs, segsize, summaryonly,
                    display=False):
    """Compute silhouette statistics segment by segment.

    ``X`` is labelled with ``kmeans.predict`` and then cut into ``numsegs``
    consecutive slices of ``segsize`` rows. A segment is only analysed when
    every one of the ``n_clusters`` labels occurs in it at least once;
    otherwise its entries in both outputs stay zero.

    Parameters
    ----------
    X : array
        Samples, sliced along the first axis.
    kmeans : object
        Fitted model exposing ``predict(X)``.
    n_clusters : int
        Number of cluster labels expected in every segment.
    numsegs : int
        Number of segments.
    segsize : int
        Number of rows per segment.
    summaryonly : bool
        If true, only the average score per segment is computed.
    display : bool
        If true (and ``summaryonly`` is false), show a silhouette plot for
        every analysed segment.

    Returns
    -------
    thesilavgs : ndarray, shape (numsegs,)
        Average silhouette score of each analysed segment.
    thesilclusterstats : ndarray, shape (numsegs, 4, n_clusters)
        Per cluster: mean, median, minimum and maximum silhouette value.
    """
    print('generating cluster labels')
    cluster_labels = kmeans.predict(X)
    thesilavgs = np.zeros(numsegs, dtype='float')
    thesilclusterstats = np.zeros((numsegs, 4, n_clusters), dtype='float')
    print('calculating silhouette stats')
    for segment in range(numsegs):
        window = slice(segment * segsize, (segment + 1) * segsize)
        seg_X = X[window]
        seg_cluster_labels = cluster_labels[window]

        # do a quick sanity check to see if all the labels are present
        clusternums = np.zeros(n_clusters, dtype='int')
        for label in seg_cluster_labels:
            clusternums[label] += 1
        if np.min(clusternums) <= 0:
            print('states are not fully populated - skipping stats')
            continue

        thesilavgs[segment] = metrics.silhouette_score(
            seg_X, seg_cluster_labels)
        print('average silhouette score for segment', segment, '=',
              thesilavgs[segment])
        if summaryonly:
            continue

        print('doing silhouette samples')
        sample_silhouette_values = metrics.silhouette_samples(
            seg_X, seg_cluster_labels)
        if display:
            fig, ax1 = plt.subplots(1, 1)
            fig.set_size_inches(8, 4.5)

            # The silhouette coefficient can range from -1, 1; the plot is
            # limited to [-0.3, 1].
            ax1.set_xlim([-0.3, 1])
            # The (n_clusters+1)*10 is for inserting blank space between
            # silhouette plots of individual clusters.
            ax1.set_ylim([0, len(seg_X) + (n_clusters + 1) * 10])

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette values of the samples assigned to cluster i.
            ith_cluster_silhouette_values = np.sort(
                sample_silhouette_values[seg_cluster_labels == i])

            thesilclusterstats[segment, 0, i] = np.mean(
                ith_cluster_silhouette_values)
            thesilclusterstats[segment, 1, i] = np.median(
                ith_cluster_silhouette_values)
            # Sorted ascending, so the ends are the minimum and maximum.
            thesilclusterstats[segment, 2, i] = \
                ith_cluster_silhouette_values[0]
            thesilclusterstats[segment, 3, i] = \
                ith_cluster_silhouette_values[-1]

            if display:
                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                # `cm.spectral` was removed from Matplotlib;
                # `nipy_spectral` is the same colormap.
                color = cm.nipy_spectral(float(i) / n_clusters)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0, ith_cluster_silhouette_values,
                                  facecolor=color, edgecolor=color,
                                  alpha=0.7)

                # Label the silhouette plots with their cluster numbers at
                # the middle
                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                # Leave a gap of 10 rows before the next cluster.
                y_lower = y_upper + 10

        if display:
            ax1.set_title("The silhouette plot for the various clusters.")
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            # The vertical line for average silhouette score of all the
            # values
            ax1.axvline(x=thesilavgs[segment], color="red", linestyle="--")

            ax1.set_yticks([])  # Clear the yaxis labels / ticks
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

            plt.suptitle((
                "Silhouette analysis for KMeans clustering on sample data "
                "with n_clusters = %d" % n_clusters),
                fontsize=14, fontweight='bold')

            plt.show()
    return thesilavgs, thesilclusterstats
def computeSilhouette(appMatrixFile):
    """Show silhouette plots of KMeans clusterings of a pickled matrix.

    The pickled object in ``appMatrixFile`` is converted to an array, its
    pairwise Manhattan distances are computed, and KMeans is run on the
    rows of that distance matrix for 2 to 5 clusters. For every cluster
    count a figure with the silhouette plot (left) and a scatter of the
    first two columns (right) is created; all figures are shown at the end.

    ``pairwise_distances`` also accepts other metrics, e.g. 'cityblock',
    'cosine', 'euclidean', 'l1', 'l2' from scikit-learn (these support
    sparse input) and the ``scipy.spatial.distance`` metrics (which do not).

    Parameters
    ----------
    appMatrixFile : str
        Path of the pickle file holding the input matrix.
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    # The context manager closes the handle, which was leaked before.
    with open(appMatrixFile, 'rb') as matrix_file:
        appMatrix = cPickle.load(matrix_file)
    newAppMatrix = np.array(appMatrix)

    X = pairwise_distances(newAppMatrix, metric='manhattan', n_jobs=4)

    startingNumberOfClusters = 2
    endingNumberOfClusters = 6

    for n_clusters in range(startingNumberOfClusters,
                            endingNumberOfClusters):
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot.
        # The silhouette coefficient can range from -1, 1; the plot is
        # limited to [-0.1, 1].
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between
        # silhouette plots of individual clusters.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Fixed seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # NOTE(review): X already holds distances, yet it is scored here as
        # a feature matrix with the euclidean metric -- confirm intent.
        silhouette_avg = silhouette_score(X, cluster_labels,
                                          metric='euclidean')
        # The message used to be built with `+` from str, int and float,
        # which raised TypeError; logging now formats the arguments lazily.
        logging.debug(
            'For n_clusters = %d The average silhouette_score is : %s',
            n_clusters, silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels,
                                                      metric='euclidean')

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette values of the samples assigned to cluster i.
            ith_cluster_silhouette_values = np.sort(
                sample_silhouette_values[cluster_labels == i])

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # `cm.spectral` was removed from Matplotlib; `nipy_spectral`
            # is the same colormap under its current name.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a gap of 10 rows before the next cluster.
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors)

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

    plt.show()
import matplotlib.cm as cm
import numpy as np

# Whitespace-separated table without a header row. `sep=r"\s+"` is the
# documented equivalent of the deprecated `delim_whitespace=True`.
dataset = pd.read_csv("dataset2.txt", header=None, sep=r"\s+")

# NOTE(review): with header=None the first column label is 0, so this slice
# is dataset[0:-1], i.e. every row except the last with ALL columns
# (including the last one that is also read into `y`). Confirm whether a
# column selection was intended.
X = np.array(dataset[0:dataset.columns[0]-1])
y = np.array(dataset[dataset.columns[dataset.shape[1]-1]])

k = 3
# Fixed seed of 10 for reproducibility.
clusterer = KMeans(n_clusters=k, random_state=10)
cluster_labels = clusterer.fit_predict(X)

# One color per sample, chosen by its cluster label. `cm.spectral` was
# removed from Matplotlib; `nipy_spectral` is the same colormap.
colors = cm.nipy_spectral(cluster_labels.astype(float) / k)
plt.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors)

# White discs at the cluster centers, each overprinted with its 1-based
# cluster number.
centers = clusterer.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1,
            s=200)
for i, c in enumerate(centers):
    plt.scatter(c[0], c[1], marker='$%d$' % (i+1), alpha=1, s=50)

plt.title("The visualization of the clustered data.")
plt.xlabel("X")
plt.ylabel("Y")
plt.suptitle(("KMeans clustering with n_clusters = %d" % k), fontsize=14,
             fontweight='bold')
plt.savefig("plot2.png")
def visualize(df, cluster_labels, n_clusters, n_iterations):
    """Show a silhouette plot and a 2-D embedding of clustered points.

    The left axes holds one silhouette band per cluster, computed on ``df``.
    The right axes holds the points after a 2-component MDS embedding of
    ``df``, colored by cluster label.

    Parameters
    ----------
    df : array-like
        Data handed to both ``silhouette_samples`` and ``MDS.fit_transform``.
    cluster_labels : sequence of int
        Cluster label of every sample; converted to an ndarray.
    n_clusters : int
        Number of clusters; labels ``0 .. n_clusters - 1`` are plotted.
    n_iterations : int
        Zero-based iteration counter; ``n_iterations + 1`` appears in the
        figure title.
    """
    # Dimension for visualization
    target_dimension = 2

    cluster_labels = np.array(cluster_labels)

    mds = manifold.MDS(target_dimension, max_iter=100, n_init=1)
    X = mds.fit_transform(df)

    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot; (n_clusters + 1) * 10 leaves
    # blank rows between the bands of individual clusters.
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(df, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Sorted silhouette values of the samples assigned to cluster i;
        # a boolean mask replaces the former per-sample Python loop.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # `cm.spectral` was removed from Matplotlib; `nipy_spectral` is the
        # same colormap under its current name.
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a gap of 10 rows before the next cluster.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = {} and n_iterations = {}").format(
                      n_clusters, n_iterations + 1),
                 fontsize=14, fontweight='bold')

    plt.show()
def Cluster(X):
    """Cluster ``X`` with 2-means, plot the result and return the labels.

    For the single cluster count in ``range_n_clusters`` a figure with the
    silhouette plot (left) and a scatter of the first two columns of ``X``
    with the cluster centers (right) is shown.

    Parameters
    ----------
    X : ndarray
        Samples; the axis labels suggest PCA scores (first two columns are
        plotted as the 1st and 2nd principal component).

    Returns
    -------
    ndarray of int
        Cluster label of every sample.
    """
    range_n_clusters = [2]

    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)

        # The 1st subplot is the silhouette plot.
        # The silhouette coefficient can range from -1, 1; the plot is
        # limited to [-0.1, 1].
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between
        # silhouette plots of individual clusters.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Fixed seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette values of the samples assigned to cluster i.
            ith_cluster_silhouette_values = np.sort(
                sample_silhouette_values[cluster_labels == i])

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # `cm.spectral` was removed from Matplotlib; `nipy_spectral`
            # is the same colormap under its current name.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a gap of 10 rows before the next cluster.
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors)

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("1st principal component")
        ax2.set_ylabel("2nd principal component")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on pca scores "
             "with n_clusters = %d" % n_clusters),
            fontsize=14, fontweight='bold')

    plt.show()
    return cluster_labels.astype(int)
# Drop the first entry of the loaded 'SCs' series.
vz_SC = datafile['SCs'][1:]

#datadir = 'C:\\Users\\dschaffner\\OneDrive - brynmawr.edu\\Corrsin Wind Tunnel Data\\NPZ_files\\m50_5mm\\streamwise\\'
#fileheader = 'PE_SC_m50_5mm_embed6'
#datafile = loadnpzfile(datadir+fileheader+npy)
#PEs2 = datafile['PEs']
#SCs2 = datafile['SCs']
#PEs2 = np.mean(PEs2,axis=1)
#SCs2 = np.mean(SCs2,axis=1)
#taus = datafile['taus']

# RGBA table with 7 evenly spaced colormap samples (alpha = 1).
colors = np.zeros([7,4])
for i in np.arange(7):
    # `cm.spectral` was removed from Matplotlib; `nipy_spectral` is the same
    # colormap under its current name.
    c = cm.nipy_spectral(i/7.,1)
    colors[i,:]=c
points = ['o','v','s','p','*','h','^','D','+','>','H','d','x','<']

# Global Matplotlib style for this figure.
plt.rc('axes',linewidth=0.75)
plt.rc('xtick.major',width=0.75)
plt.rc('ytick.major',width=0.75)
plt.rc('xtick.minor',width=0.75)
plt.rc('ytick.minor',width=0.75)
# An earlier plt.rc('lines', markersize=2, ...) call was immediately
# overridden by this one and has been dropped.
plt.rc('lines',markersize=1.5,markeredgewidth=0.0)

fig=plt.figure(num=1,figsize=(3.5,3.5),dpi=300,facecolor='w',edgecolor='k')
left = 0.2 # the left side of the subplots of the figure
right = 0.94 # the right side of the subplots of the figure
bottom = 0.2 # the bottom of the subplots of the figure
def cluster_crimes_k_means(_data, _grid, _k, _n, _plot=True):
    """Cluster points with K-Means seeded from the centroids of a grid.

    Parameters
    ----------
    _data : DataFrame
        Needs the columns ``X`` and ``Y`` and a ``geometry`` column. A
        ``cluster`` column holding the labels is added in place.
    _grid : DataFrame
        Needs the columns ``centroid_x`` and ``centroid_y``; must have
        exactly ``_k ** 2`` rows.
    _k : int
        Grid side length; ``_k ** 2`` clusters are formed.
    _n : int
        Number of parallel jobs, capped at 128. Only used when the installed
        scikit-learn still accepts ``n_jobs`` for ``KMeans``.
    _plot : bool
        If true, show the silhouette plot and the clustered points.

    Returns
    -------
    _grid_clustered : GeoDataFrame
        Convex hull of every cluster as its ``geometry``.
    _labels : ndarray
        Cluster label of every row of ``_data``.

    Raises
    ------
    ValueError
        If ``_k ** 2`` differs from the number of rows of ``_grid``.
    """
    if _n > 128:
        _n = 128
        print("n was too big. Set to 128.")

    _n_cells = _k**2
    if _n_cells != len(_grid.index):
        raise ValueError('parameter k and number of cells in grid does not match')

    # `DataFrame.as_matrix()` was removed from pandas; `.values` returns the
    # same ndarray on old and new versions.
    # initialize crime scenes as matrix
    _points = _data[['X', 'Y']].values
    # initialize cell centroids as matrix
    _init_center = _grid[["centroid_x", "centroid_y"]].values

    print("K-Means started...")
    _kmeans_args = dict(n_clusters=_n_cells, init=_init_center,
                        random_state=101)
    try:
        _clusterer = KMeans(n_jobs=_n, **_kmeans_args)
    except TypeError:
        # scikit-learn >= 1.0 no longer accepts `n_jobs` for KMeans.
        _clusterer = KMeans(**_kmeans_args)
    _labels = _clusterer.fit_predict(_points)
    _data['cluster'] = _labels

    # Convex hull of the member geometries of every cluster.
    _grid_clustered_list = [
        MultiPoint(list(_data.geometry[_data.cluster == _i])).convex_hull
        for _i in range(_n_cells)
    ]
    _grid_clustered = gpd.GeoDataFrame(
        _grid_clustered_list, columns=['geometry']).set_geometry('geometry')

    # The silhouette_score gives the average value for all the samples.
    _silhouette_avg = silhouette_score(_points, _labels)
    print("For k =", _k)
    print("The average silhouette_score is :", _silhouette_avg)

    if _plot:
        # Create a subplot with 1 row and 2 columns
        _fig, (_ax1, _ax2) = plt.subplots(1, 2)
        _fig.set_size_inches(36, 7)

        # The 1st subplot is the silhouette plot.
        # The silhouette coefficient can range from -1, 1; the plot is
        # limited to [-0.1, 1].
        _ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between
        # silhouette plots of individual clusters.
        _ax1.set_ylim([0, len(_points) + (_n_cells + 1) * 10])

        # Compute the silhouette scores for each sample
        _sample_silhouette_values = silhouette_samples(_points, _labels)

        _y_lower = 10
        for i in range(_n_cells):
            # Sorted silhouette values of the samples assigned to cluster i.
            _ith_cluster_silhouette_values = np.sort(
                _sample_silhouette_values[_labels == i])

            _size_cluster_i = _ith_cluster_silhouette_values.shape[0]
            _y_upper = _y_lower + _size_cluster_i

            # `cm.spectral` was removed from Matplotlib; `nipy_spectral`
            # is the same colormap under its current name.
            _color = cm.nipy_spectral(float(i) / _n_cells)
            _ax1.fill_betweenx(np.arange(_y_lower, _y_upper),
                               0, _ith_cluster_silhouette_values,
                               facecolor=_color, edgecolor=_color,
                               alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle
            _ax1.text(-0.05, _y_lower + 0.5 * _size_cluster_i, str(i))

            # Leave a gap of 10 rows before the next cluster.
            _y_lower = _y_upper + 10

        _ax1.set_title("The silhouette plot for {} clusters.".format(_n_cells))
        _ax1.set_xlabel("The silhouette coefficient values")
        _ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        _ax1.axvline(x=_silhouette_avg, color="red", linestyle="--")

        _ax1.set_yticks([])  # Clear the yaxis labels / ticks
        _ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        _colors = cm.nipy_spectral(_labels.astype(float) / _n_cells)
        _ax2.scatter(_points[:, 0], _points[:, 1], marker='.', s=30, lw=0,
                     alpha=0.7, c=_colors)

        # Labeling the clusters
        _centers = _clusterer.cluster_centers_
        # Draw white circles at cluster centers
        _ax2.scatter(_centers[:, 0], _centers[:, 1],
                     marker='o', c="white", alpha=1, s=200)

        for i, c in enumerate(_centers):
            _ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        _ax2.set_title("The visualization of the clustered data.")
        _ax2.set_xlabel("X")
        _ax2.set_ylabel("Y")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with k = %d" % _k),
                     fontsize=14, fontweight='bold')

        plt.show()

    return _grid_clustered, _labels
def silhouette_analyze(dataframe, cluster_type='KMeans', n_clusters=None):
    """Plot a silhouette analysis of ``dataframe`` for several cluster counts.

    For every entry of ``n_clusters`` a model is obtained from
    ``utils.get_model_obj``, fitted on the data, and a two-panel figure is
    shown: the sorted per-sample silhouette values of each cluster on the
    left and a scatter of the first two columns, coloured by cluster, on the
    right. After all figures, the average silhouette score per cluster count
    is handed to ``plotter.lineplot``.

    Args:
        dataframe: pandas DataFrame; its ``.values`` array is what gets
            clustered, and columns 0 and 1 are used for the scatter plot.
        cluster_type: model name forwarded to ``utils.get_model_obj``; also
            used in the figure title.
        n_clusters: iterable of cluster counts to try. Falsy values
            (``None``, empty) select ``range(2, 8, 2)``.

    Returns:
        None. Figures are displayed with ``plt.show()``.
    """
    # Use clustering algorithms from here
    # http://scikit-learn.org/stable/modules/clustering.html#clustering
    # and add a plot that visually shows the effectiveness of the
    # clusters/clustering rule (maybe coloured area plots?).
    from sklearn.metrics import silhouette_samples, silhouette_score
    import matplotlib.cm as cm
    import numpy as np
    try:
        # ``collections.Iterable`` was removed in Python 3.10.
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    if not n_clusters:
        n_clusters = range(2, 8, 2)
    assert isinstance(n_clusters, Iterable), \
        "n_clusters must be an iterable object"

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` is the
    # equivalent that exists in every pandas version.
    data = dataframe.values
    # ``cm.spectral`` was renamed to ``nipy_spectral`` and later removed.
    colormap = cm.nipy_spectral if hasattr(cm, 'nipy_spectral') else cm.spectral

    cluster_scores_df = pd.DataFrame(
        columns=['cluster_size', 'silhouette_score'])

    # Silhouette analysis --
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
    # TODO: Add more clustering methods/types like say dbscan and others
    for j, cluster in enumerate(n_clusters):
        clusterer = utils.get_model_obj(cluster_type, n_clusters=cluster)
        cluster_labels = clusterer.fit_predict(data)

        # The silhouette coefficient is only defined when there are at least
        # two distinct labels and fewer labels than samples. Skip degenerate
        # clusterings before creating a figure for them.
        n_labels = len(np.unique(cluster_labels))
        if not 1 < n_labels < len(cluster_labels):
            print("For clusters =", cluster,
                  "silhouette analysis skipped, distinct labels found :",
                  n_labels)
            continue

        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot.
        # The silhouette coefficient can range from -1, 1 but in this example
        # all lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the
        # formed clusters.
        silhouette_avg = silhouette_score(data, cluster_labels)
        cluster_scores_df.loc[j] = [cluster, silhouette_avg]
        print("For clusters =", cluster,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(data, cluster_labels)

        y_lower = 10
        for i in range(cluster):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # Normalise by the current cluster count so the bar colours match
            # the scatter colours below and stay inside the colormap range.
            color = colormap(float(i) / cluster)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = colormap(cluster_labels.astype(float) / cluster)
        ax2.scatter(data[:, 0], data[:, 1], marker='.', s=30, lw=0,
                    alpha=0.7, c=colors)

        # Not every model exposes centres, so only label them when present.
        if hasattr(clusterer, 'cluster_centers_'):
            centers = clusterer.cluster_centers_
            # Draw white circles at cluster centers
            ax2.scatter(centers[:, 0], centers[:, 1],
                        marker='o', c="white", alpha=1, s=200)

            for i, c in enumerate(centers):
                ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for %s clustering on sample data "
                      "with clusters = %d" % (cluster_type, cluster)),
                     fontsize=14, fontweight='bold')

        plt.show()

    plotter.lineplot(cluster_scores_df, xcol='cluster_size',
                     ycol='silhouette_score')
def plot_kmeans(X_data, X_2d, two_d_transformer):
    """Show a silhouette plot and a 2-D scatter for a 20-cluster partition.

    A KMeans model and a Gaussian mixture are both fitted on ``X_data``.

    NOTE: the cluster labels used for the silhouette values and for the
    scatter colours come from the Gaussian mixture, while the numbered
    centres drawn on the scatter come from KMeans and the figure title
    mentions KMeans. That mix is inherited from the original code and is
    kept as-is; confirm it is intended.

    Args:
        X_data: samples to cluster; passed to both models and to the
            silhouette functions.
        X_2d: array whose columns 0 and 1 give the scatter coordinates of
            the samples.
        two_d_transformer: object whose ``transform`` maps the KMeans
            centres to the coordinates used for the scatter.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    from sklearn import mixture

    # ``cm.spectral`` was renamed to ``nipy_spectral`` and later removed.
    colormap = cm.nipy_spectral if hasattr(cm, 'nipy_spectral') else cm.spectral

    range_n_clusters = [20]

    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot.
        # The silhouette coefficient can range from -1, 1 but in this example
        # all lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between
        # silhouette plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X_data) + (n_clusters + 1) * 10])

        # KMeans is seeded with 10 for reproducibility. Only its centres are
        # used below, so the predicted labels are not kept.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        clusterer.fit(X_data)

        # ``mixture.GMM`` was removed from scikit-learn; prefer its
        # replacement and keep the diagonal covariance that the legacy class
        # used by default. ``fit(...).predict(...)`` works on both classes.
        if hasattr(mixture, 'GaussianMixture'):
            g = mixture.GaussianMixture(n_components=n_clusters,
                                        covariance_type='diag')
        else:
            g = mixture.GMM(n_components=n_clusters)
        cluster_labels = g.fit(X_data).predict(X_data)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the
        # formed clusters.
        silhouette_avg = silhouette_score(X_data, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X_data, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = colormap(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = colormap(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X_2d[:, 0], X_2d[:, 1], marker='.', s=30, lw=0,
                    alpha=0.7, c=colors)

        # Labeling the clusters: project the KMeans centres with the supplied
        # transformer before drawing them on the 2-D scatter.
        centers = two_d_transformer.transform(clusterer.cluster_centers_)
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

        plt.show()
def silhouette(self, range_n_clusters, cluster_labelss):
    """Show silhouette and MDS scatter plots for precomputed clusterings.

    For every cluster count in ``range_n_clusters`` a two-panel figure is
    displayed: sorted per-sample silhouette values per cluster on the left,
    and a 2-component MDS embedding of ``self.ndf`` coloured by cluster on
    the right.

    Args:
        range_n_clusters: iterable of cluster counts to plot.
        cluster_labelss: sequence of label arrays, indexed as
            ``cluster_labelss[n_cluster - 2]`` (so position 0 is read for a
            cluster count of 2).

    Returns:
        None. Figures are displayed with ``plt.show()``.
    """
    X = self.ndf
    # ``cm.spectral`` was renamed to ``nipy_spectral`` and later removed.
    colormap = cm.nipy_spectral if hasattr(cm, 'nipy_spectral') else cm.spectral

    # The MDS layout depends only on X, not on the cluster count, so it is
    # computed once on first use and shared by every figure. This avoids
    # repeating an up-to-3000-iteration fit per figure.
    # NOTE(review): if ``random_state`` is not a fixed integer seed, the
    # original produced a different layout per figure; figures now share one.
    pos = None

    for n_cluster in range_n_clusters:
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(12, 6)
        ax1.set_xlim([-0.1, 1])
        # (n_cluster + 1) * 10 leaves blank space between cluster profiles.
        ax1.set_ylim([0, len(X) + (n_cluster + 1) * 10])

        cluster_labels = cluster_labelss[n_cluster - 2]

        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_cluster,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_cluster):
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = colormap(float(i) / n_cluster)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # mds
        if pos is None:
            similarities = euclidean_distances(X)
            mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                               random_state=random_state,
                               dissimilarity="precomputed", n_jobs=1)
            pos = mds.fit(similarities).embedding_

        df_pos = pd.DataFrame(pos, columns=["comp1", "comp2"])
        df_pos["pred"] = cluster_labels

        for i in range(n_cluster):
            members = df_pos[df_pos["pred"] == i]
            # Wrap the RGBA tuple in a list: a bare 4-tuple is read as four
            # colormap values when the cluster has exactly four points.
            ax2.scatter(members.iloc[:, 0], members.iloc[:, 1],
                        c=[colormap(float(i) / n_cluster)])

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st MDS feature")
        ax2.set_ylabel("Feature space for the 2nd MDS feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % n_cluster),
            fontsize=14, fontweight='bold')
        # end mds

        plt.show()
# Side-by-side scatter plots of the KMeans and MiniBatchKMeans partitions.
fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# ``cm.spectral`` was renamed to ``nipy_spectral`` and later removed, so pick
# whichever name the installed matplotlib provides.
_spectral = cm.nipy_spectral if hasattr(cm, 'nipy_spectral') else cm.spectral

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
distance = euclidean_distances(k_means_cluster_centers,
                               mbk_means_cluster_centers,
                               squared=True)
# order[k] is the MiniBatchKMeans centre closest to KMeans centre k.
order = distance.argmin(axis=1)

# KMeans
ax = fig.add_subplot(1, 3, 1)
for k in range(n_clusters):
    col = _spectral(float(k) / n_clusters, 1)
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o',
            markerfacecolor=col, markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
pl.text(-3.5, 2.7, 'train time: %.2fs' % t_batch)

# MiniBatchKMeans: cluster k is drawn from the matched centre order[k] so it
# gets the same colour as its KMeans counterpart.
ax = fig.add_subplot(1, 3, 2)
for k in range(n_clusters):
    col = _spectral(float(k) / n_clusters, 1)
    my_members = mbk_means_labels == order[k]
    cluster_center = mbk_means_cluster_centers[order[k]]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o',
            markerfacecolor=col, markeredgecolor='k', markersize=6)
def run_tsne(features_file, colors_file, output_prefix,
             filter_sample=None, filter_cluster=None, lst=None,
             draw_per=1.0, iter=1000, perplexity=50):
    """Embed feature vectors with Barnes-Hut t-SNE and save a scatter to PDF.

    The two input tables are joined on their first column (the item name),
    short items are dropped, the feature columns are normalised and
    log-transformed, and the 2-D embedding returned by
    ``bhtsne.run_bh_tsne`` is drawn coloured by cluster number.

    Args:
        features_file: header-less table read with ``pd.read_table``;
            column 0 is the item name, the remaining columns are features.
        colors_file: header-less table; column 0 is the item name and
            column 1 holds the cluster label, from which the first match of
            ``extract_num`` is taken as the integer cluster number.
        output_prefix: path handed to ``PdfPages`` (used as the output file
            name as-is).
        filter_sample: sample ids; when non-empty only items whose name
            starts with ``"sample<id>-"`` are kept.
        filter_cluster: cluster numbers to highlight. Their centres are
            labelled, and a legend is drawn when fewer than five are given.
            When empty, all clusters are used for the centres.
        lst: item names to mark with pentagon markers. When ``draw_per < 1``
            the entries are treated as name prefixes and expanded to full
            names, and those items are always kept.
        draw_per: when below 1, items are subsampled through
            ``take_first_per``. NOTE(review): the value itself is only
            compared against 1 here; confirm how the fraction is applied.
        iter: ``max_iter`` passed to ``bhtsne.run_bh_tsne``.
        perplexity: t-SNE perplexity.

    Returns:
        None. One page is written to the PDF.
    """
    # ``None`` defaults avoid sharing one mutable list between calls.
    filter_sample = [] if filter_sample is None else filter_sample
    filter_cluster = [] if filter_cluster is None else filter_cluster
    lst = [] if lst is None else lst

    # ``cm.spectral`` was renamed to ``nipy_spectral`` and later removed.
    colormap = cm.nipy_spectral if hasattr(cm, 'nipy_spectral') else cm.spectral

    # read data
    data_df = pd.read_table(features_file, header=None)
    cluster_colors = pd.read_table(colors_file, header=None)
    print(data_df.head())

    # make dataframe pretty
    cluster_colors = cluster_colors.rename(columns={1: 'color'})
    cluster_colors["color"] = [int(extract_num.findall(str(x))[0])
                               for x in cluster_colors["color"].tolist()]
    print(cluster_colors.head())

    # filter by samples
    if len(filter_sample) > 0:
        prefixes = tuple("sample" + it + "-" for it in filter_sample)
        filter1 = [x for x in cluster_colors[0].tolist()
                   if x.startswith(prefixes)]
        cluster_colors = cluster_colors[cluster_colors[0].isin(filter1)]

    # filter by percent
    if draw_per < 1:
        clusters = divide_by_cluster(cluster_colors[0].tolist(),
                                     cluster_colors["color"].tolist())
        filter2 = take_first_per(clusters, lst)
        s = set(filter2)
        names = cluster_colors[0].tolist()  # hoisted out of the loops below
        lst_new = []
        for n in lst:
            for x in names:
                if x.startswith(n):
                    print(x)
                    lst_new.append(x)
                    # Requested items survive the subsampling regardless.
                    if x not in s:
                        filter2.append(x)
        lst = lst_new
        cluster_colors = cluster_colors[cluster_colors[0].isin(filter2)]

    # merge data
    mapped = pd.merge(cluster_colors, data_df, on=0)

    # filter by length: the 4th underscore-separated field of the name
    mapped["length"] = [int(x.split("_")[3]) for x in mapped[0].tolist()]
    # ``.copy()`` so the x/y columns added below go to an independent frame.
    mapped = mapped[mapped["length"] > 2000].copy()
    print(mapped)

    # normalize like in CONCOCT: add a 1/length pseudo-count, scale every
    # column and then every row to sum to one, and take the logarithm.
    # Feature columns are everything between name/color and length.
    # (``DataFrame.as_matrix`` was removed in pandas 1.0.)
    data = mapped.iloc[:, 2:-1].values
    v = (1.0 / mapped["length"]).values[:, np.newaxis]
    data = data + v
    data = data / data.sum(axis=0)[None, :]
    data = data / data.sum(axis=1)[:, None]
    data = np.log(data)

    embedding_array = bhtsne.run_bh_tsne(data, initial_dims=data.shape[1],
                                         perplexity=perplexity, max_iter=iter)
    mapped["x"] = embedding_array[:, 0]
    mapped["y"] = embedding_array[:, 1]

    # filter clusters to show
    fc = filter_cluster
    if len(fc) > 0:
        filtered = mapped[mapped["color"].isin(fc)]
    else:
        filtered = mapped

    color = mapped["color"].tolist()
    # Guard the colour normalisation when the largest cluster number is 0.
    mx_color = max(color) or 1

    # draw result of TSNE on scatter plot; always close the PDF, even if
    # plotting fails part-way.
    pp = PdfPages(output_prefix)
    try:
        fig = pyplot.figure()

        # draw scatter plot
        pyplot.scatter(mapped["x"].tolist(), mapped["y"].tolist(),
                       c=[colormap(float(i) / mx_color) for i in color])

        # make a legend for specific clusters
        # find cluster centers
        x = filtered["x"].tolist()
        y = filtered["y"].tolist()
        mp = divide_by_color(x, y, filtered["color"].tolist())
        points, names = find_cluster_centers(mp)

        patches = []
        if len(fc) < 5:
            for c in sorted(set(color)):
                if c in fc:
                    patches.append(
                        mpatches.Patch(color=colormap(float(c) / mx_color),
                                       label='C-' + str(c)))
        pyplot.legend(handles=patches)

        draw_points(points, names, fig)

        # mark specific points
        marked = mapped[mapped[0].isin(lst)]
        pyplot.scatter(marked["x"].tolist(), marked["y"].tolist(),
                       marker="p", edgecolors='black',
                       c=[colormap(float(i) / mx_color)
                          for i in marked["color"].tolist()])

        pyplot.title('Perp = ' + str(perplexity) + ' Iter = ' + str(iter))
        pp.savefig()
    finally:
        pp.close()