def _global_clustering(self, X=None):
    # Perform the global clustering step on the subclusters obtained
    # after fitting.
    clusterer = self.n_clusters
    centroids = self.subcluster_centers_
    compute_labels = (X is not None) and self.compute_labels

    # Preprocessing.
    not_enough_centroids = False
    if isinstance(clusterer, int):
        clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)
        if len(centroids) < self.n_clusters:
            not_enough_centroids = True
    elif (clusterer is not None and
            not hasattr(clusterer, 'fit_predict')):
        raise ValueError("n_clusters should be an instance of "
                         "ClusterMixin or an int")

    # Cache the norms here to avoid recomputing them in predict.
    self._subcluster_norms = row_norms(
        self.subcluster_centers_, squared=True)

    if clusterer is None or not_enough_centroids:
        self.subcluster_labels_ = np.arange(len(centroids))
        if not_enough_centroids:
            warnings.warn(
                "Number of subclusters found (%d) by Birch is less "
                "than (%d). Decrease the threshold." %
                (len(centroids), self.n_clusters))
    else:
        # Cluster the subclusters of all the leaf nodes: treat the
        # subcluster centroids as samples and find the final centroids.
        self.subcluster_labels_ = clusterer.fit_predict(
            self.subcluster_centers_)

    if compute_labels:
        self.labels_ = self.predict(X)
import operator
from os.path import basename

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity


def f1():
    # Relies on module-level globals: claster_number, matrix, dataset,
    # article_number (see the setup sketch after this function).
    model = AgglomerativeClustering(n_clusters=claster_number,
                                    affinity='euclidean',
                                    linkage='complete')
    preds = model.fit_predict(matrix.toarray())

    # Map each document's filename to its predicted cluster name.
    res = dict()
    for i, p in enumerate(preds):
        res[basename(dataset['filenames'][i])] = dataset['target_names'][p]

    # Print the documents grouped by cluster name.
    prev = None
    for k, v in sorted(res.items(), key=operator.itemgetter(1)):
        if prev != v:
            print(v, ':')
            prev = v
        print('\t\t', k)

    # Row-normalized cosine-distance matrix, visualized as a heatmap.
    dist = 1 - cosine_similarity(matrix.toarray())
    row_sums = dist.sum(axis=1)
    new_matrix = dist / row_sums[:, np.newaxis]

    plt.figure(figsize=(20, 20), dpi=300)
    sb.heatmap(new_matrix)
    lbls = [basename(fn)[:-4] for fn in dataset['filenames']]
    plt.xticks(np.arange(0, article_number), lbls, rotation='vertical')
    plt.yticks(np.arange(0, article_number), lbls, rotation='horizontal')
    plt.show()
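# A minimal, hypothetical setup for the globals f1 relies on (dataset,
# matrix, claster_number, article_number). The corpus path and vectorizer
# settings are assumptions for illustration only; the original code does
# not show how these were built.
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

dataset = load_files('corpus/', encoding='utf-8')  # assumed: corpus/<label>/<doc>.txt
matrix = TfidfVectorizer(stop_words='english').fit_transform(dataset['data'])
article_number = matrix.shape[0]
claster_number = len(dataset['target_names'])  # keeps the original global's spelling

f1()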
from sklearn.cluster import AgglomerativeClustering


def agglomerative_clustering(matrix):
    # Complete-linkage agglomerative clustering with cosine affinity.
    print("====== Agglomerative Clustering ===============")
    model = AgglomerativeClustering(n_clusters=3, affinity='cosine',
                                    linkage='complete')
    preds = model.fit_predict(matrix)
    clusters = model.labels_.tolist()  # same assignments as preds, as a list
    return (preds, clusters)
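# A hedged usage example for agglomerative_clustering; the toy corpus and
# TfidfVectorizer setup are illustrative assumptions, not part of the
# original snippet.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["apples and oranges", "oranges and bananas",
        "linux kernel drivers", "kernel scheduling patches",
        "stock market rally", "bond market yields"]

# AgglomerativeClustering needs a dense matrix, hence .toarray().
tfidf = TfidfVectorizer().fit_transform(docs)
preds, clusters = agglomerative_clustering(tfidf.toarray())
print(clusters)  # e.g. [0, 0, 1, 1, 2, 2] -- exact labels may vary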
def _global_clustering(self, X=None): """ Global clustering for the subclusters obtained after fitting """ clusterer = self.n_clusters centroids = self.subcluster_centers_ compute_labels = (X is not None) and self.compute_labels # Preprocessing for the global clustering. not_enough_centroids = False if isinstance(clusterer, int): clusterer = AgglomerativeClustering( n_clusters=self.n_clusters) # There is no need to perform the global clustering step. if len(centroids) < self.n_clusters: not_enough_centroids = True elif (clusterer is not None and not hasattr(clusterer, 'fit_predict')): raise ValueError("n_clusters should be an instance of " "ClusterMixin or an int") # To use in predict to avoid recalculation. self._subcluster_norms = row_norms( self.subcluster_centers_, squared=True) if clusterer is None or not_enough_centroids: self.subcluster_labels_ = np.arange(len(centroids)) if not_enough_centroids: warnings.warn( "Number of subclusters found (%d) by Birch is less " "than (%d). Decrease the threshold." % (len(centroids), self.n_clusters)) else: # The global clustering step that clusters the subclusters of # the leaves. It assumes the centroids of the subclusters as # samples and finds the final centroids. self.subcluster_labels_ = clusterer.fit_predict( self.subcluster_centers_) if compute_labels: self.labels_ = self.predict(X)
# plt and X are assumed to be defined earlier in the script; a possible
# setup for X is sketched after this block.
plt.show()

# k-means clustering
from sklearn.cluster import KMeans
km = KMeans(init='random', max_iter=150, n_clusters=2, random_state=0)
y_km = km.fit_predict(X)
plt.scatter(X[y_km == 0, 0], X[y_km == 0, 1], c='green')
plt.scatter(X[y_km == 1, 0], X[y_km == 1, 1], c='red')
plt.title("KMeans")
plt.show()

# Agglomerative clustering with complete linkage
# (sklearn.cluster.hierarchical is a private module; import from the
# public sklearn.cluster namespace instead)
from sklearn.cluster import AgglomerativeClustering
aggcl = AgglomerativeClustering(n_clusters=2, linkage='complete')
y_agcl = aggcl.fit_predict(X)
plt.scatter(X[y_agcl == 0, 0], X[y_agcl == 0, 1], c='green')
plt.scatter(X[y_agcl == 1, 0], X[y_agcl == 1, 1], c='red')
plt.title("Agglomerative Clustering")
plt.show()

# Demonstrating clustering using a density-based approach
from sklearn.cluster import DBSCAN
dbs = DBSCAN(eps=0.2, min_samples=5)
y_dbs = dbs.fit_predict(X)
plt.scatter(X[y_dbs == 0, 0], X[y_dbs == 0, 1], c='green')
plt.scatter(X[y_dbs == 1, 0], X[y_dbs == 1, 1], c='red')
plt.title("Density-based (DBSCAN) Clustering")
plt.show()
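# The comparison above assumes a 2-D array X already exists. A plausible
# setup (an assumption, not shown in the original) is the two-moons
# dataset, where density-based DBSCAN separates the shapes that KMeans
# and complete linkage tend to split poorly.
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons

X, _ = make_moons(n_samples=300, noise=0.05, random_state=0)
plt.scatter(X[:, 0], X[:, 1])
plt.title("Input data")
plt.show()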
# In[24]:

for cluster in dbscan_clusters:
    print(data.loc[cluster].mean())

# In[11]:

# sklearn.cluster.hierarchical is a private module; use the public import.
from sklearn.cluster import AgglomerativeClustering
model = AgglomerativeClustering(n_clusters=4, linkage='average',
                                affinity='manhattan')
aggl_preds = model.fit_predict(scaled_features)

# In[12]:

clusters_aggl = []
for lbl in np.unique(aggl_preds):
    indices = [i for i, x in enumerate(aggl_preds) if x == lbl]
    clusters_aggl.append(indices)

# In[25]:

for cluster in clusters_aggl:
    # Per-cluster feature means, mirroring the DBSCAN summary in In[24].
    print(data.loc[cluster].mean())
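# A hedged reconstruction of the earlier notebook cells that would have
# produced scaled_features and dbscan_clusters; 'data' is assumed to be a
# numeric pandas DataFrame, and the DBSCAN parameters are guesses.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

scaled_features = StandardScaler().fit_transform(data)
dbscan_preds = DBSCAN(eps=0.5, min_samples=5).fit_predict(scaled_features)

# Group row indices by DBSCAN label, as clusters_aggl does above.
dbscan_clusters = []
for lbl in np.unique(dbscan_preds):
    dbscan_clusters.append([i for i, x in enumerate(dbscan_preds) if x == lbl])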