def spectral_clustering(A, nb_clusters, laplacian_normalization=None, algo=None):
    """Compute cluster assignments via spectral clustering.

    Steps:
      * Compute the (optionally normalized) graph Laplacian of A
      * Compute the nb_clusters smallest eigenvalues and eigenvectors
      * Train a k-means on those eigenvectors
      * Assign each node by applying the k-means to the Laplacian rows

    Parameters
    ----------
    A : array-like or sparse matrix
        Adjacency matrix of the graph.
    nb_clusters : int
        Number of clusters to produce.
    laplacian_normalization : optional
        Normalization flag forwarded unchanged to ``get_laplacian``.
    algo : {'sph', None}
        ``'sph'`` uses spherical (cosine-distance) k-means from nltk;
        ``None`` uses plain Euclidean sklearn KMeans.

    Returns
    -------
    clusters : array or list of int
        Cluster index for each node.

    Raises
    ------
    Exception
        If ``algo`` is neither ``'sph'`` nor ``None``.
    """
    if algo not in ('sph', None):
        raise Exception('Algorithm {} unknown'.format(algo))
    L = get_laplacian(A, laplacian_normalization)
    L = scipy.sparse.csr_matrix(L, dtype=np.float64)
    # Smallest-magnitude eigenpairs of the (symmetric) Laplacian.
    v, w = eigsh(L, nb_clusters, which='SM')
    if algo is None:  # identity check is the idiomatic comparison with None
        km = KMeans(n_clusters=nb_clusters)
        km.fit(np.transpose(w))
        clusters = km.predict(L)
    else:  # algo == 'sph' (guaranteed by the guard above)
        clusterer = KMeansClusterer(nb_clusters,
                                    distance=nltk.cluster.util.cosine_distance,
                                    repeats=25)
        # cluster() trains the clusterer; its returned assignment of the
        # transposed eigenvectors is not used directly.
        clusterer.cluster(np.transpose(w), True)
        # Classify every node by its (densified) Laplacian row.
        vectors = [np.transpose(L[i, :].toarray()[0]) for i in range(L.shape[1])]
        clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
class ClusteringPairwise():
    """Cluster users/items with precomputed k-means centroids and
    dispatch question selection.

    Loads user vectors, centroids, and a per-user cluster assignment
    from disk, then reuses the stored centroids to classify new item
    vectors at query time so that classification stays consistent with
    the stored clustering.
    """

    def __init__(self, users_vecs_train_file, centroid_file, clustering_file,
                 num_clusters, n_iteration, mode):
        """
        Parameters
        ----------
        users_vecs_train_file : str
            Path to a text file of user vectors (one row per user).
        centroid_file : str
            Path to a text file with the ``num_clusters`` centroid vectors.
        clustering_file : str
            Path to a text file with the cluster index of each user.
        num_clusters : int
            Number of clusters.
        n_iteration : int
            Iteration budget forwarded to ``AllAlgorithm``.
        mode : object
            Mode flag forwarded to ``AllAlgorithm``.
        """
        self.mode = mode
        self.num_clusters = num_clusters
        self.users = np.genfromtxt(users_vecs_train_file)
        self.tree = LasyTree(np.arange(self.users.shape[0]))
        self.centroids = np.genfromtxt(centroid_file)
        clusters_ = np.genfromtxt(clustering_file).astype('int')
        # Invert the per-user assignment into cluster -> [user indices].
        self.clusters = {i: [] for i in range(num_clusters)}
        for user_idx, cluster_idx in enumerate(clusters_):
            self.clusters[cluster_idx].append(user_idx)
        self.n_iteration = n_iteration
        # Seeding with the stored centroids makes classify() agree with
        # the clustering loaded from disk.
        self.kclusterer = KMeansClusterer(num_clusters,
                                          distance=cosine_distance,
                                          initial_means=list(self.centroids))

    # NOTE(review): "Recieve" is a typo for "Receive"; kept unchanged for
    # backward compatibility with existing callers.
    def RecieveQuestions(self, item_vecs, user, user_estim, n_points,
                         item_bias, ratings):
        """Classify item vectors into the stored clusters and run the
        question-selection algorithm.

        ``user_estim`` and ``n_points`` are accepted for interface
        compatibility but are not used by this implementation.
        """
        clusters_ = [self.kclusterer.classify(item) for item in item_vecs]
        # Invert the per-item assignment into cluster -> [item indices].
        clusters = {i: [] for i in range(self.num_clusters)}
        for item_idx, cluster_idx in enumerate(clusters_):
            clusters[cluster_idx].append(item_idx)
        # (An unused `np.argsort(clusters_)` was removed here.)
        return AllAlgorithm(self.users, self.n_iteration, self.centroids,
                            item_vecs, item_bias, user, clusters, self.tree,
                            self.mode, ratings)
def spherical_clustering_from_adjency(A, nb_clusters):
    """Spectral clustering with spherical (cosine-distance) k-means.

    Trains a cosine-distance k-means on the transposed leading
    eigenvectors of the adjacency matrix, then assigns each node by
    classifying its densified adjacency row.
    """
    A = scipy.sparse.csr_matrix(A, dtype=np.float64)
    # Largest-magnitude eigenpairs of the (symmetric) adjacency matrix.
    eigenvalues, eigenvectors = eigsh(A, nb_clusters, which='LM')
    clusterer = KMeansClusterer(
        nb_clusters,
        distance=nltk.cluster.util.cosine_distance,
        repeats=25,
    )
    # Training side effect only; the returned assignment is not needed.
    clusterer.cluster(np.transpose(eigenvectors), True)
    # One cluster label per node, computed from its adjacency row.
    return [
        clusterer.classify(np.transpose(A[row, :].toarray()[0]))
        for row in range(A.shape[1])
    ]
class kmeans_cosine(object):
    """K-means clustering with cosine distance (nltk ``KMeansClusterer``)."""

    def __init__(self, k):
        """
        Parameters
        ----------
        k : int
            Number of clusters.
        """
        self.k = k
        self.model = KMeansClusterer(k,
                                     distance=nltk.cluster.util.cosine_distance,
                                     repeats=25)

    def build(self, X, p):
        """Cluster the rows of ``X`` and locate the cluster of point ``p``.

        Parameters
        ----------
        X : array-like or sparse matrix
            Data matrix, one sample per row.
        p : array-like
            Query point to classify.

        Returns
        -------
        cluster_id : numpy.ndarray of bool
            Mask over rows of ``X`` belonging to the same cluster as ``p``.
        prediction : int
            Index of ``p``'s cluster.
        """
        data = scipy.sparse.csr_matrix(X).toarray()
        kclusters = np.array(self.model.cluster(data, assign_clusters=True))
        prediction = self.model.classify(p)
        cluster_id = kclusters == prediction
        return cluster_id, prediction

    def save(self, filename="model2.pkl"):
        """Pickle the trained model to ``filename``.

        BUG FIX: pickle writes bytes, so the file must be opened in
        binary mode ('wb'); text mode ('w') raises TypeError on Python 3.
        """
        with open(filename, 'wb') as f:
            pickle.dump(self.model, f)
def get_cluster(tfidf_arr, k):
    """Cluster TF-IDF vectors with cosine-distance k-means and append
    each (index, cluster) pair to ``ClusterText.txt``.

    Parameters
    ----------
    tfidf_arr : array-like
        TF-IDF matrix, one document vector per row.
    k : int
        Number of clusters.
    """
    # Cosine-similarity k-means; avoid_empty_clusters keeps every
    # cluster populated.
    kmeans = KMeansClusterer(num_means=k, distance=cosine_distance,
                             avoid_empty_clusters=True)
    kmeans.cluster(tfidf_arr)

    # Cluster label for each document.
    kinds = pd.Series([kmeans.classify(i) for i in tfidf_arr])

    # Context manager guarantees the handle is closed even if a write
    # raises (the original manual open/close leaked it on error).
    with open('/you_filed_algos/prod_kudu_data/ClusterText.txt', 'a+',
              encoding='utf-8') as fw:
        for i, v in kinds.items():
            fw.write(str(i) + '\t' + str(v) + '\n')
## Build the corpus and compute the document-term TF-IDF matrix
vectorizer = CountVectorizer()  # NOTE(review): unused here — kept in case code outside this chunk relies on it
transformer = TfidfVectorizer()
tfidf = transformer.fit_transform(articals)
## tfidf is stored sparse; densify it into the document-term matrix
dtm = tfidf.toarray()
## K-means clustering with cosine distance
kmeans = KMeansClusterer(num_means=2,  # number of clusters
                         distance=nltk.cluster.util.cosine_distance,  # cosine distance
                         )
kmeans.cluster(dtm)
## Cluster label assigned to each chapter
labpre = [kmeans.classify(i) for i in dtm]
# .copy() makes an independent frame: writing a new column into a bare
# column slice triggers pandas' SettingWithCopyWarning and may not
# behave reliably.
kmeanlab = Red_df[["ChapName", "Chapter"]].copy()
kmeanlab["cosd_pre"] = labpre
kmeanlab
## Count how many chapters fall into each cluster
count = kmeanlab.groupby("cosd_pre").count()
## Visualize the cluster sizes
count.plot(kind="barh", figsize=(6, 5))
for xx, yy, s in zip(count.index, count.ChapName, count.ChapName):
    plt.text(y=xx - 0.1, x=yy + 0.5, s=s)
plt.ylabel("cluster label")
plt.xlabel("number")
plt.show()