def plot_best_Z(motifs, best_Z):
    """Plot the best Z for each motif in each sequence.

    Rows (motifs) are ordered by a centroid-linkage hierarchy whose
    dendrogram is drawn on the left; columns (sequences) are grouped by
    k-means.  The doubly permuted matrix is shown as a heat map with a
    colour bar on the right.  Uses the current pylab figure.
    """
    import scipy.cluster.hierarchy as hier
    import scipy.cluster.vq as vq

    fig = pylab.gcf()

    # Hierarchical clustering of the rows; the dendrogram occupies a
    # narrow axes on the left and supplies the row permutation.
    row_linkage = hier.centroid(best_Z)
    dendro_axes = fig.add_axes([0.01, 0.02, 0.18, 0.96])
    dendro_axes.set_frame_on(False)
    dendro_axes.set_xticks([])
    tree = hier.dendrogram(row_linkage, labels=motifs, orientation='right')
    permuted = best_Z[tree['leaves'], :]

    # K-means on the transpose groups the sequences; sorting by cluster
    # label gathers similar columns together.
    _, column_labels = vq.kmeans2(
        best_Z.T, k=num_seq_clusters(best_Z.shape[1]))
    permuted = permuted[:, numpy.argsort(column_labels)]

    # Heat map of the permuted matrix.
    matrix_axes = fig.add_axes([0.4, 0.02, 0.5, 0.96])
    image = matrix_axes.matshow(permuted, aspect='auto', origin='lower')
    matrix_axes.set_xticks([])
    matrix_axes.set_yticks([])

    # Colour bar in its own slim axes.
    colorbar_axes = fig.add_axes([0.91, 0.02, 0.02, 0.96])
    pylab.colorbar(image, cax=colorbar_axes)
def plot_collinearity(motifs, best_Z):
    """Plot the cooccurrences of motifs.

    Builds an asymmetric similarity matrix between motif Z-vectors,
    reorders it by hierarchical clustering so related motifs sit next to
    each other, and draws it with pcolor.
    """
    import scipy.cluster.hierarchy as hier

    M = len(motifs)
    # Similarity of motif m1 to m2: sqrt(<z1, z2>) scaled by |z2|.
    cooccurrences = numpy.ones((M, M))
    for m1 in xrange(M):
        for m2 in xrange(M):
            overlap = numpy.sqrt(sum(best_Z[m1] * best_Z[m2]))
            cooccurrences[m1, m2] = overlap / numpy.linalg.norm(best_Z[m2])

    # Reorder rows and columns via centroid linkage; t=-1 forces one
    # cluster per motif, so the labels (minus one) form a permutation.
    linkage = hier.centroid(cooccurrences)
    order = hier.fcluster(linkage, -1) - 1
    cooccurrences = cooccurrences[order, :]
    cooccurrences = cooccurrences[:, order]

    pylab.pcolor(cooccurrences)
    pylab.colorbar()
    axes = pylab.gca()
    axes.set_xticks([])
    axes.set_yticks(.5 + numpy.arange(M))
    axes.set_yticklabels(numpy.asarray(motifs)[order])
    axes.set_xlim((0, M))
    axes.set_ylim((0, M))
    # Hide the y tick marks; only the motif labels should show.
    for tickline in axes.yaxis.get_ticklines():
        tickline.set_markersize(0)
    pylab.gcf().subplots_adjust(left=.27, bottom=.02, top=.98, right=.99)
def getCentroids(self):
    """Return per-cluster-size centroids of the stored measurements.

    Iterates over every cluster size between utils.initial_cluster_size
    and utils.max_cluster_size (inclusive).  When exactly one
    (throughput, latency) measurement is stored for a size, that row is
    recorded verbatim as the centroid.

    Returns:
        dict mapping str(cluster_size) -> {'throughput': ..., 'latency': ...}

    NOTE(review): when more than one measurement exists, the clustering
    results (T, Z) are only logged and no centroid entry is stored for
    that size, so those sizes are silently missing from the returned
    dict -- confirm whether that is intended.
    """
    centroids = {}
    for i in range(int(self.utils.initial_cluster_size),
                   int(self.utils.max_cluster_size) + 1):
        # arrayMeas holds the recorded (throughput, latency) rows for size i.
        if self.memory[str(i)]['arrayMeas'] != None:
            self.my_logger.debug("GETCENTROIDS state " + str(i) +
                                 " measurements : " +
                                 str(self.memory[str(i)]['arrayMeas']))
            if len(self.memory[str(i)]['arrayMeas']) > 1:
                # Y = pdist(self.memory[str(i)]['arrayMeas'], 'seuclidean')
                Y = self.memory[str(i)]['arrayMeas']
                # Z = centroid(Y)
                # Z = linkage(Y, 'single') # single, complete, average, weighted, median centroid, ward
                # T = fcluster(Z, t=1.0, criterion='distance')
                # Flat single-linkage clusters of the raw measurement rows;
                # merging stops at euclidean distance 15.0.
                T = fclusterdata(self.memory[str(i)]['arrayMeas'], t=15.0,
                                 criterion='distance', metric='euclidean',
                                 method='single')
                # self.my_logger.debug("GETCENTROIDS state "+ str(i) +" centroids: "+ str(Z))
                self.my_logger.debug("GETCENTROIDS state " + str(i) +
                                     " clusters: " + str(T))
                # Centroid-linkage hierarchy over the same rows (logged only).
                Z = centroid(Y)
                self.my_logger.debug("GETCENTROIDS state " + str(i) +
                                     " centroid func: " + str(Z))
            else:
                # Single measurement: use it directly as the centroid.
                centroids[str(i)] = {}
                centroids[str(i)]['throughput'] = self.memory[str(
                    i)]['arrayMeas'][0][0]
                centroids[str(i)]['latency'] = self.memory[str(
                    i)]['arrayMeas'][0][1]
    self.my_logger.debug("GETCENTROIDS centroids: " + str(centroids))
    return centroids
def hier_cluster_and_permute(matrix):
    """Return the centroid-linkage hierarchy for *matrix*.

    NOTE: despite the name, no permutation is computed -- the function
    returns the linkage matrix only.  The original body contained an
    unreachable pdist/single-linkage path after the early return; that
    dead code has been removed without changing behaviour.

    Parameters
    ----------
    matrix : array-like
        Observation matrix (one row per observation), as accepted by
        ``scipy.cluster.hierarchy.centroid``.

    Returns
    -------
    numpy.ndarray
        The linkage matrix produced by ``hier.centroid``.
    """
    import scipy.cluster.hierarchy as hier
    return hier.centroid(matrix)
def test_clustering_is_same_as_scipy():
    ''' Basic clustering returns the same as scipy '''
    # Random feature matrix (helper defined elsewhere in this module).
    features = random_feat_matrix(600, 705)

    # Ward linkage: our implementation vs scipy's reference.
    scipy_ward = sci_hie.ward(features)
    our_ward = our_hie.clustering(features)

    # Centroid linkage: our implementation vs scipy's reference.
    scipy_centroid = sci_hie.centroid(features)
    our_centroid = our_hie.clustering(features, method='centroid')

    numpy.testing.assert_almost_equal(scipy_ward, our_ward)
    numpy.testing.assert_almost_equal(scipy_centroid, our_centroid)
    # Fix: dropped the trailing `return True` -- pytest flags tests that
    # return non-None (PytestReturnNotNoneWarning) and never uses the value.
def test_clustering_is_same_as_scipy_2():
    ''' Basic clustering returns the same as scipy 2 '''
    # Real connectivity matrix shipped with the test data.
    cifti = nibabel.load('./logpar/cli/tests/data/test.dconn.nii')
    features = cifti.get_data()[0, 0, 0, 0]

    scipy_ward = sci_hie.ward(features)
    our_ward = our_hie.clustering(features)

    scipy_centroid = sci_hie.centroid(features)
    our_centroid = our_hie.clustering(features, method='centroid')

    # NOTE(review): only the first two linkage columns (merged pair ids)
    # are compared -- presumably distances are allowed to differ; confirm.
    numpy.testing.assert_almost_equal(scipy_ward[:, :2], our_ward[:, :2])
    numpy.testing.assert_almost_equal(scipy_centroid[:, :2],
                                      our_centroid[:, :2])
    # Fix: dropped the trailing `return True` -- pytest flags tests that
    # return non-None (PytestReturnNotNoneWarning) and never uses the value.
def test_all_neighbors_is_same_as_scipy():
    ''' Clustering without constraints returns the same as scipy '''
    features = random_feat_matrix(200, 100)
    nsamples = features.shape[0]
    # Fully connected constraint matrix: every sample neighbours every
    # other sample but not itself, so constraints are effectively void.
    all_neighbors = numpy.ones((nsamples, nsamples)) - numpy.eye(nsamples)

    expected_ward = sci_hie.ward(features)
    obtained_ward = our_hie.clustering(features, method='ward',
                                       constraints=all_neighbors)

    expected_centroid = sci_hie.centroid(features)
    obtained_centroid = our_hie.clustering(features, method='centroid',
                                           constraints=all_neighbors)

    numpy.testing.assert_almost_equal(expected_ward, obtained_ward)
    numpy.testing.assert_almost_equal(expected_centroid, obtained_centroid)
def getCentroids(self):
    """Return per-cluster-size centroids of the stored measurements.

    For every cluster size between utils.initial_cluster_size and
    utils.max_cluster_size (inclusive): if exactly one (throughput,
    latency) measurement is stored, it is recorded verbatim as the
    centroid for that size.

    Returns:
        dict mapping str(cluster_size) -> {'throughput': ..., 'latency': ...}

    NOTE(review): when more than one measurement exists, the clustering
    results (T, Z) are only logged and no centroid entry is stored for
    that size -- confirm whether that is intended.
    """
    centroids = {}
    for i in range(int(self.utils.initial_cluster_size), int(self.utils.max_cluster_size)+1):
        # arrayMeas holds the recorded (throughput, latency) rows for size i.
        if self.memory[str(i)]['arrayMeas'] != None:
            self.my_logger.debug("GETCENTROIDS state "+ str(i) +" measurements : "+ str(self.memory[str(i)]['arrayMeas']))
            if len(self.memory[str(i)]['arrayMeas']) > 1:
                # Y = pdist(self.memory[str(i)]['arrayMeas'], 'seuclidean')
                Y = self.memory[str(i)]['arrayMeas']
                # Z = centroid(Y)
                # Z = linkage(Y, 'single') # single, complete, average, weighted, median centroid, ward
                # T = fcluster(Z, t=1.0, criterion='distance')
                # Flat single-linkage clusters of the raw measurement rows;
                # merging stops at euclidean distance 15.0.
                T= fclusterdata(self.memory[str(i)]['arrayMeas'], t=15.0, criterion='distance', metric='euclidean', method='single')
                # self.my_logger.debug("GETCENTROIDS state "+ str(i) +" centroids: "+ str(Z))
                self.my_logger.debug("GETCENTROIDS state "+ str(i) +" clusters: "+ str(T))
                # Centroid-linkage hierarchy over the same rows (logged only).
                Z = centroid(Y)
                self.my_logger.debug("GETCENTROIDS state "+ str(i) +" centroid func: "+ str(Z))
            else:
                # Single measurement: use it directly as the centroid.
                centroids[str(i)] = {}
                centroids[str(i)]['throughput'] = self.memory[str(i)]['arrayMeas'][0][0]
                centroids[str(i)]['latency'] = self.memory[str(i)]['arrayMeas'][0][1]
    self.my_logger.debug("GETCENTROIDS centroids: "+ str(centroids))
    return centroids
# Dendrogram of the linkage computed in an earlier cell (`result`).
plt.figure(figsize=(15, 10))
h.dendrogram(result)
plt.show()

# Cut the tree at distance 1394 and score the flat clustering against the
# ground-truth labels `y`.
# NOTE(review): the two score values below are discarded -- presumably this
# is a notebook export where each expression was displayed; verify.
flat_single = h.fcluster(result, 1394, criterion='distance')
adjusted_rand_score(y.flatten(), flat_single)
adjusted_mutual_info_score(y.flatten(), flat_single)

# (Notebook markdown cell, in Italian: centroid linkage merges the pair of
# clusters whose centroids are most similar, two at a time.)
"""### **Centroid**

Per ogni cluster viene calcolato un *centroide* che rappresenta la media. I cluster vengono uniti in base a i centroidi più simili tra loro. Tali cluster vengono uniti a due a due.
"""

# Centroid-linkage hierarchical clustering of the feature matrix X.
result = h.centroid(X)
plt.figure(figsize=(15, 10))
h.dendrogram(result)
plt.show()

# Same cut and scoring as above, now for the centroid linkage.
flat_single = h.fcluster(result, 1394, criterion='distance')
adjusted_rand_score(y.flatten(), flat_single)
adjusted_mutual_info_score(y.flatten(), flat_single)

# (Notebook markdown cell, in Italian: conclusions -- K-Means produced more
# refined clusters than the hierarchical algorithms on this dataset.)
"""## Conclusioni

In conclusione si è visto che per questo dataset K-Means ha prodotto dei cluster molto più raffinati rispetto agli algoritmi gerarchici. Vantaggio per l'algoritmo K-Means è che si conosceva a priori il numero di cluster che dovevano essere creati. L'implementazione dell'algoritmo K-Means non è prestante rispetto all'implementazione di *sklearn*, tuttavia producono dei risultati simili.
"""
# Each row of H is one merge (#samples - 1 rows in total): the pair of
# clusters joined, their distance, and the number of samples in the new
# cluster.
print H.shape
h.dendrogram(H)
pl.show()
# The dendrogram is elongated because of the chaining effect, the typical
# single-link problem.

# complete link
H = h.complete(X)
h.dendrogram(H)
pl.show()

# average link
H = h.average(X)
h.dendrogram(H)
pl.show()

# centroid link
H = h.centroid(X)
h.dendrogram(H)
pl.show()
# There are inversions here because the centroid distance is not monotonic.

# To obtain flat clusters we must choose a distance threshold.
H = h.average(X)
C = h.fcluster(H, 1.9, criterion='distance')
# NOTE(review): the original comment said the 3.5 threshold looked good on
# the plot, but the code cuts at 1.9 -- confirm which is intended.
# Number of clusters obtained:
print "n cluster:", len(np.unique(C))
print "adj randindex gerarc:", metrics.adjusted_rand_score(C, Y)
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, centroid
from scipy.spatial.distance import pdist


if __name__ == '__main__':
    gng_output = np.load('data/gng_output.npy')

    # Hierarchical clustering: centroid linkage over pairwise correlation
    # distances between the GNG output vectors.
    condensed = pdist(gng_output, 'correlation')
    linkage_matrix = centroid(condensed)
    dendrogram(linkage_matrix, color_threshold=0.0035)
    plt.show()

    # Cut the tree at the same threshold used for colouring and persist
    # the flat cluster labels.
    node_labels = fcluster(linkage_matrix, 0.0035, criterion='distance')
    np.save('data/node_labels', node_labels)
# Number of clouds and their size-bin indices from the netCDF variables
# (defined earlier, outside this chunk).
nclouds = int(nclouds_cusize[0])
cloud_bin = np.array(cloud_bin_netcdf[0, 0:nclouds:])
# Adjust data format for later use:
cloud_lon = cloudlon[0, 0:nclouds]
cloud_lat = cloudlat[0, 0:nclouds]
cloud_size = cloud_bin * size[0]
# Cloud centres as (lon, lat) pairs; the size column was deliberately dropped.
#cloudcentres = np.vstack((cloud_lon,cloud_lat,cloud_size)).T
cloudcentres = np.vstack((cloud_lon, cloud_lat)).T
labels = np.arange(nclouds)
# Compute distances for all pairs based on White et al 2018:
# (haversine is a custom metric defined elsewhere -- presumably it returns
# great-circle distances in metres, matching the y-axis label; verify.)
Y = distance.pdist(cloudcentres, haversine)
# Compute linkage matrix:
Z = centroid(Y)
# Distance cut-off used to colour the dendrogram branches.
max_d = 8000
# Plot dendrogram:
plt.figure(figsize=(25, 10))
plt.xlabel('Cloud label')
plt.ylabel('Euclidian distance [m]')
dendrogram(
    Z,
    leaf_rotation=90.,
    # Collapse to the last p=50 merged clusters to keep the plot readable.
    truncate_mode='lastp',
    p=50,
    #show_leaf_counts=False,
    show_contracted=True,
    color_threshold=max_d,
)
if __name__ == '__main__':
    # FITS image frame and its WCS header.
    fits = iofits.open('data/fits/fpC-001729-r3-0083.fit.gz')
    img = fits[0].data
    # Pre-detected galaxy pixel groups, pickled by an earlier step.
    with open('data/galaxies', 'rb') as f:
        galaxies = pickle.load(f)
    # Per-galaxy feature vectors; normalised below so each row sums to 1.
    matrix3 = np.load('data/matrix3.npy').astype(np.float64)
    wcs = WCS(fits[0].header)
    for i in range(len(matrix3)):
        matrix3[i] /= matrix3[i].sum()
    # Centroid-linkage clustering on pairwise correlation distances.
    Z = centroid(pdist(matrix3, 'correlation'))
    maxclust = 6
    # Colour threshold: the merge height at which the tree splits into
    # maxclust parts (rows of Z are ordered by merge distance).
    ct = Z[-(maxclust - 1), 2]
    cluster = fcluster(Z, maxclust, criterion='maxclust')
    dendrogram(Z, color_threshold=ct)
    plt.show()
    # Walk cluster ids 1..max (fcluster labels are 1-based).
    for cls in range(cluster.max()):
        cls += 1
        idxs = np.where(cluster == cls)[0]
        for idx in idxs:
            galaxy = galaxies[idx]
            # Skip galaxies with no member pixels.
            if len(galaxy) == 0:
                continue
            x, y = calc_coord_ave(galaxy)
# Fit k-means on the dataframe and show the centroids.
kmeans = KMeans(n_clusters=17).fit(df)
centroids = kmeans.cluster_centers_
print(centroids)

# Scatter of the points coloured by cluster label, with centroids in red.
plt.scatter(df['x'], df['y'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.show()

# =============================================================================
# But is that really enough groups?
# (NOTE(review): the original comment asked about 4 groups while the code
# above fits 17 clusters -- confirm which count is intended.)
# DENDROGRAM!
# =============================================================================
#https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
# Centroid-linkage hierarchical clustering directly on the dataframe rows.
centr = centroid(df)
# NOTE(review): `x` is defined elsewhere -- presumably len(x) == len(df);
# verify against the earlier cells.
labelList = range(len(x))
plt.figure(figsize=(10, 7))
dendrogram(centr, orientation='top', labels=labelList,
           distance_sort='descending', show_leaf_counts=True)
plt.title('Dendrograma using centroid method')
plt.show()

# =============================================================================
# WARNING! There are more clustering techniques.
# =============================================================================
# Shift the second feature of every sample by 0.1.
for i in range(0,len(numpyA)):
    numpyA[i][1]+=0.1
# Two passes: i == 0 uses cosine distances, i == 1 uses euclidean distances.
for i in range(0,2):
    t1 = time()
    if i ==0:
        y=pdist(numpyA,'cosine')
        a=max(y)
        b=min(y)
        y=squareform(y)
        # Rescale every entry of the square matrix by the min/max (a, b) of
        # the condensed distances, then shift by b.
        for k in range(0,len(y)):
            for j in range(0,len(y[0])):
                y[k][j]= ((y[k][j]-b)/(a-b))+b
    elif i ==1:
        y=pdist(numpyA,'euclidean')
        y=squareform(y)
    # NOTE(review): h.centroid() receives the full square matrix, which scipy
    # treats as raw observation vectors rather than as pairwise distances --
    # confirm this is intended (the condensed pdist output may have been meant).
    z=h.centroid(y)
    if i==0:
        print "cosine, hierarchy clustering time = "+str(time()-t1)
    elif i==1:
        print "euclidean, hierarchy clustering time = "+str(time()-t1)
    # Persist the full linkage matrix for this metric.
    if i ==0:
        np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/hierarchy_label/cosine_linkage.csv", z, '%5.2f',delimiter=",")
    elif i==1:
        np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/hierarchy_label/euclidean_linkage.csv", z, '%5.2f',delimiter=",")
    # Cut the tree at the merge distance j steps from the top
    # (j = 4, 8, 16, 32, 64, 128) and save each flat labelling.
    j = 4
    while(j<129):
        result = h.fcluster(z, z[len(z)-j][2],'distance')
        if i==0:
            np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/hierarchy_label/cosine_"+str(j)+".csv", result, '%i',delimiter=",")
        elif i==1:
            np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/hierarchy_label/euclidean_"+str(j)+".csv", result, '%i',delimiter=",")
        j*=2
# 层次聚类有许多算法。可以使用matplotlib绘制结果 import matplotlib.pyplot as plt wine_complete = hierarchy.complete(wine) fig = plt.figure() dn = hierarchy.dendrogram(wine_complete) plt.show() wine_single = hierarchy.single(wine) fig = plt.figure() dn = hierarchy.dendrogram(wine_single) plt.show() wine_averge = hierarchy.average(wine) fig = plt.figure() dn = hierarchy.dendrogram(wine_averge) plt.show() wine_centroid = hierarchy.centroid(wine) fig = plt.figure() dn = hierarchy.dendrogram(wine_centroid) plt.show() wine_complete = hierarchy.complete(wine) fig = plt.figure() dn = hierarchy.dendrogram( wine_complete, # 默认MATLAB阈值 color_threshold=0.7 * max(wine_complete[:, 2]), above_threshold_color='y') plt.show()