示例#1
0
def spectral_clustering(A, nb_clusters, laplacian_normalization = None, algo = None):
    """
    Compute the cluster assignment from the spectral clustering algorithm.

    Steps:
    * Compute the Laplacian of ``A``
    * Compute the ``nb_clusters`` smallest eigenvalues and associated eigenvectors
    * Train a k-means model on these vectors
    * Apply that k-means model to the rows of the Laplacian

    :param A: adjacency matrix
    :param nb_clusters: number of clusters to produce
    :param laplacian_normalization: normalization mode forwarded to get_laplacian
    :param algo: None for plain k-means, 'sph' for spherical (cosine) k-means
    :raises ValueError: if ``algo`` is not recognised
    :return: cluster label for each node
    """
    if algo not in ('sph', None):
        # ValueError is more precise than the bare Exception originally raised;
        # callers catching Exception still catch it.
        raise ValueError('Algorithm {} unknown'.format(algo))

    L = get_laplacian(A, laplacian_normalization)
    L = scipy.sparse.csr_matrix(L, dtype=np.float64)
    # eigsh returns (eigenvalues, eigenvectors); w holds one eigenvector per column.
    v, w = eigsh(L, nb_clusters, which='SM')

    if algo is None:
        # NOTE(review): k-means is fit on the transposed eigenvector matrix
        # (k samples of dimension n) and then applied to Laplacian rows; the
        # textbook variant fits on the rows of w instead -- confirm intended.
        km = KMeans(n_clusters=nb_clusters)
        km.fit(np.transpose(w))
        clusters = km.predict(L)

    elif algo == 'sph':
        clusterer = KMeansClusterer(nb_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
        # Train the spherical k-means; the returned assignment is discarded,
        # only the fitted centroids are used by classify() below.
        clusterer.cluster(np.transpose(w), True)
        vectors = [np.transpose(L[i, :].toarray()[0]) for i in range(0, L.shape[1])]
        clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
class ClusteringPairwise():
    """Pairwise questionnaire driver built on a precomputed clustering.

    Loads user vectors, cluster centroids and a cluster assignment from text
    files, then answers item-selection queries by delegating to AllAlgorithm.
    """

    def __init__(self, users_vecs_train_file, centroid_file, clustering_file,
                 num_clusters, n_iteration, mode):
        """
        :param users_vecs_train_file: text file of user latent vectors
        :param centroid_file: text file of cluster centroids
        :param clustering_file: text file with one cluster id per user
        :param num_clusters: number of clusters used in the files
        :param n_iteration: iteration budget forwarded to AllAlgorithm
        :param mode: mode flag forwarded to AllAlgorithm
        """
        self.mode = mode
        self.num_clusters = num_clusters
        self.users = np.genfromtxt(users_vecs_train_file)
        self.tree = LasyTree(np.arange(self.users.shape[0]))
        self.centroids = np.genfromtxt(centroid_file)
        clusters_ = np.genfromtxt(clustering_file).astype('int')
        # Invert the assignment: cluster id -> list of member user indices.
        self.clusters = {}
        for i in range(num_clusters):
            self.clusters[i] = []
        for i in range(len(clusters_)):
            self.clusters[clusters_[i]].append(i)
        self.n_iteration = n_iteration
        # Seed the clusterer with the precomputed centroids so classify()
        # agrees with the loaded assignment.
        self.kclusterer = KMeansClusterer(num_clusters,
                                          distance=cosine_distance,
                                          initial_means=list(self.centroids))

    def RecieveQuestions(self, item_vecs, user, user_estim, n_points,
                         item_bias, ratings):
        """Group the candidate items by nearest centroid and delegate to
        AllAlgorithm.

        NOTE(review): ``user_estim`` and ``n_points`` are accepted but never
        read here -- kept for interface compatibility with callers.
        """
        clusters_ = [self.kclusterer.classify(item) for item in item_vecs]
        # Invert the per-item assignment: cluster id -> list of item indices.
        clusters = {}
        for i in range(self.num_clusters):
            clusters[i] = []
        for i in range(len(clusters_)):
            clusters[clusters_[i]].append(i)
        # (an unused np.argsort(clusters_) whose result was discarded has been
        # removed; np.argsort is side-effect free)
        return AllAlgorithm(self.users, self.n_iteration, self.centroids,
                            item_vecs, item_bias, user, clusters, self.tree,
                            self.mode, ratings)
示例#3
0
def spherical_clustering_from_adjency(A, nb_clusters):
    """
    Spectral clustering with spherical (cosine-distance) k-means.

    :param A: adjacency matrix (assumed square)
    :param nb_clusters: number of clusters
    :return: cluster label for each node
    """
    A = scipy.sparse.csr_matrix(A, dtype=np.float64)
    # eigsh returns (eigenvalues, eigenvectors) for the nb_clusters
    # largest-magnitude eigenpairs; w holds one eigenvector per column.
    v, w = eigsh(A, nb_clusters, which='LM')
    clusterer = KMeansClusterer(nb_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
    # Train the spherical k-means; the returned assignment is discarded
    # (the original bound it to an unused local), only the fitted centroids
    # are used by classify() below.
    clusterer.cluster(np.transpose(w), True)
    # Classify each row of A against the trained centroids. A is assumed
    # square, so shape[1] equals the number of rows.
    vectors = [np.transpose(A[i, :].toarray()[0]) for i in range(0, A.shape[1])]
    clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
示例#4
0
class kmeans_cosine(object):
    """K-means clustering with cosine distance, backed by NLTK's
    KMeansClusterer."""

    def __init__(self,k):
        """
        :param k: number of clusters
        """
        self.k = k
        self.model = KMeansClusterer(k, distance=nltk.cluster.util.cosine_distance, repeats=25)

    def build(self,X,p):
        """Cluster the rows of X and locate the cluster of point p.

        :param X: sample matrix (anything scipy.sparse.csr_matrix accepts)
        :param p: a single point to classify
        :return: (boolean mask of X's rows sharing p's cluster, p's cluster id)
        """
        data = scipy.sparse.csr_matrix(X).toarray()
        kclusters= np.array(self.model.cluster(data, assign_clusters=True))
        prediction = self.model.classify(p)
        cluster_id = kclusters == prediction
        return cluster_id, prediction

    def save(self, filename = "model2.pkl"):
        """Pickle the trained model to ``filename``.

        :param filename: destination path for the pickled model
        """
        # Bug fix: pickle writes bytes, so the file must be opened in binary
        # mode ('wb'); text mode raises TypeError on Python 3.
        with open(filename, 'wb') as f:
            pickle.dump(self.model, f)
示例#5
0
def get_cluster(tfidf_arr, k, output_file='/you_filed_algos/prod_kudu_data/ClusterText.txt'):
    """
    K-means clustering (cosine distance) of the TF-IDF rows, appending the
    resulting (row index, cluster id) pairs to a text file.

    :param tfidf_arr: TF-IDF matrix, one document per row
    :param k: number of clusters
    :param output_file: destination file (opened in append mode); defaults to
        the previously hard-coded path for backward compatibility
    """
    # Split into k clusters using cosine similarity; avoid_empty_clusters
    # keeps NLTK from raising when a cluster loses all of its members.
    kmeans = KMeansClusterer(num_means=k,
                             distance=cosine_distance,
                             avoid_empty_clusters=True)
    kmeans.cluster(tfidf_arr)

    # Assign every row to its nearest centroid.
    kinds = pd.Series([kmeans.classify(i) for i in tfidf_arr])
    # 'with' guarantees the handle is closed even if a write fails
    # (the original used a bare open/close pair).
    with open(output_file, 'a+', encoding='utf-8') as fw:
        for i, v in kinds.items():
            fw.write(str(i) + '\t' + str(v) + '\n')
示例#6
0
## Build the corpus and compute the document-term TF-IDF matrix
# NOTE(review): vectorizer is created but never used below -- possibly leftover.
vectorizer = CountVectorizer()
transformer = TfidfVectorizer()
tfidf = transformer.fit_transform(articals)

## tfidf is stored as a sparse matrix; convert it to a dense array
## (document-term matrix)
dtm = tfidf.toarray()

## K-means clustering using cosine distance
kmeans = KMeansClusterer(num_means=2,       # number of clusters
                         distance=nltk.cluster.util.cosine_distance,  # cosine distance
                         )
kmeans.cluster(dtm)

## Cluster label obtained for each document
labpre = [kmeans.classify(i) for i in dtm]
kmeanlab = Red_df[["ChapName","Chapter"]]
# NOTE(review): assigning into a column selection of Red_df may trigger
# pandas' SettingWithCopyWarning -- confirm an independent copy is intended.
kmeanlab["cosd_pre"] = labpre
kmeanlab


## Count how many chapters fall into each cluster
count = kmeanlab.groupby("cosd_pre").count()

## Visualise the cluster sizes
count.plot(kind="barh",figsize=(6,5))
for xx,yy,s in zip(count.index,count.ChapName,count.ChapName):
    plt.text(y =xx-0.1, x = yy+0.5,s=s)
plt.ylabel("cluster label")
plt.xlabel("number")
plt.show()