Example #1
    def fit(self, data):
        # Step 1: build a symmetric k-NN adjacency matrix for the points
        # (mode='distance' would weight edges by distance instead)
        weight = kneighbors_graph(data, n_neighbors=self.n_neighbors_,
                                  mode='connectivity', include_self=False)
        weight = 0.5 * (weight + weight.T)  # symmetrize the graph
        self.weight_ = weight.toarray()
        self.degree_ = np.diag(np.sum(self.weight_, axis=0).ravel())

        # Step 2: build the Laplacian and normalize it symmetrically:
        # L_sym = D^(-1/2) (D - W) D^(-1/2)
        # (np.linalg.inv assumes no isolated points, i.e. no zero degrees)
        self.laplacians_ = self.degree_ - self.weight_
        degree_nor = np.sqrt(np.linalg.inv(self.degree_))
        self.laplacians_ = np.dot(degree_nor, self.laplacians_)
        self.laplacians_ = np.dot(self.laplacians_, degree_nor)

        # Step 3: take the eigenvectors of the k smallest eigenvalues
        # and normalize each row to unit length
        eigen_values, eigen_vector = np.linalg.eigh(self.laplacians_)
        sort_index = eigen_values.argsort()
        eigen_vector = eigen_vector[:, sort_index]
        self.eigen_vector_ = eigen_vector[:, :self.n_clusters_]
        self.eigen_vector_ /= np.linalg.norm(self.eigen_vector_,
                                             axis=1, keepdims=True)

        # Step 4: run k-means on the rows of the eigenvector matrix
        spectral_kmeans = KMeans.K_Means(n_clusters=self.n_clusters_)
        spectral_kmeans.fit(self.eigen_vector_)
        self.label_ = spectral_kmeans.predict(self.eigen_vector_)
        self.fitted = True
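
A minimal usage sketch for this spectral clustering fit; the sc.SC constructor and its n_clusters/knn_k arguments are taken from the driver script at the bottom of this page, while the blob data and the module name are illustrative assumptions:

import numpy as np
from sklearn.datasets import make_blobs
import spectral_clustering as sc  # assumed module name for the class above

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
model = sc.SC(n_clusters=3, knn_k=5)  # knn_k presumably sets n_neighbors_
model.fit(X)
print(model.label_[:10])  # fit() stores one cluster id per point in label_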
Example #2
    def fit(self, data):
        # Assignment 3
        # Step 1: initialize the GMM parameters from k-means
        k_means = KMeans.K_Means(self.n_clusters_)
        k_means.fit(data)
        self.mu_ = np.asarray(k_means.centers_)
        self.prior_ = np.asarray([1 / self.n_clusters_] *
                                 self.n_clusters_).reshape(
                                     self.n_clusters_, 1)
        self.posteriori_ = np.zeros((self.n_clusters_, len(data)))
        self.cov_ = np.asarray([np.eye(data.shape[1])] * self.n_clusters_)

        # Step 2: EM iterations
        likelihood_value_before = -np.inf
        for i in range(self.max_iter_):
            # E-step: weighted density of every point under each
            # component, then normalization over the components
            for k in range(self.n_clusters_):
                self.posteriori_[k] = multivariate_normal.pdf(
                    x=data, mean=self.mu_[k], cov=self.cov_[k])
            self.posteriori_ = np.dot(np.diag(self.prior_.ravel()),
                                      self.posteriori_)
            # per-point likelihood, kept for the convergence test below
            likelihood_per_point = np.sum(self.posteriori_, axis=0)
            self.posteriori_ /= likelihood_per_point

            # M-step: re-estimate the parameters from the posteriors and
            # stop once the log-likelihood no longer improves
            self.Nk_ = np.sum(self.posteriori_, axis=1)
            self.mu_ = np.asarray([
                np.dot(self.posteriori_[k], data) / self.Nk_[k]
                for k in range(self.n_clusters_)
            ])
            self.cov_ = np.asarray([
                np.dot((data - self.mu_[k]).T,
                       np.dot(np.diag(self.posteriori_[k].ravel()),
                              data - self.mu_[k])) / self.Nk_[k]
                for k in range(self.n_clusters_)
            ])
            # pi_k = N_k / N (the original divided by n_clusters_,
            # which does not give a valid prior)
            self.prior_ = (self.Nk_ / len(data)).reshape(
                self.n_clusters_, 1)

            likelihood_value_after = np.sum(np.log(likelihood_per_point))
            if np.abs(likelihood_value_after - likelihood_value_before
                      ) < self.tolerance_ * self.n_clusters_:
                break
            likelihood_value_before = likelihood_value_after
        self.fitted = True
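
For context, a minimal driver for this fit, mirroring the __main__ block at the end of the page (the GMM class name and its predict method appear there; the blob data is an illustrative assumption):

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=400, centers=3, random_state=1)
gmm = GMM(n_clusters=3)  # the class this fit belongs to, per the driver
gmm.fit(X)
print(gmm.predict(X)[:10])  # predict() is exercised in the driver below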
Example #3
    def fit(self, data):
        # Assignment 3
        # Step 1: initialize mu, pi and cov
        # mu starts from the k-means centers, a k x D matrix
        k_means = KMeans.K_Means(n_clusters=self.k)
        k_means.fit(data)
        self.mu = np.asarray(k_means.centers_)
        # cov starts as k identity matrices, k x D x D
        self.cov = np.asarray([np.eye(data.shape[1])] * self.k)
        # pi starts uniform, a k x 1 matrix
        # (the original hard-coded reshape(3, 1))
        self.prior = np.asarray([1 / self.k] * self.k).reshape(self.k, 1)
        # posterior probabilities, a k x N matrix
        self.posteriori = np.zeros((self.k, len(data)))
        for _ in range(self.max_iter):
            # Step 2, E-step: posterior probability that a point
            # belongs to each component
            for k in range(self.k):
                # density of every point under component k
                self.posteriori[k] = multivariate_normal.pdf(
                    x=data, mean=self.mu[k], cov=self.cov[k])
            # np.diag puts the priors on a diagonal so one matrix product
            # weights each row: (k x k) @ (k x N) = k x N; ravel()
            # flattens the k x 1 priors into a plain vector
            self.posteriori = np.dot(
                np.diag(self.prior.ravel()), self.posteriori)
            # normalize over the components: k x N posterior matrix
            self.posteriori /= np.sum(self.posteriori, axis=0)
            # Step 3, M-step: maximum-likelihood updates of mu (means),
            # cov (covariances) and prior (mixing weights)
            self.Nk = np.sum(self.posteriori, axis=1)
            self.mu = np.asarray([
                np.dot(self.posteriori[k], data) / self.Nk[k]
                for k in range(self.k)
            ])  # k x D
            self.cov = np.asarray([
                np.dot((data - self.mu[k]).T,
                       np.dot(np.diag(self.posteriori[k].ravel()),
                              data - self.mu[k])) / self.Nk[k]
                for k in range(self.k)
            ])  # k x D x D
            # pi_k = N_k / N (the original used N_k / k)
            self.prior = (self.Nk / len(data)).reshape(self.k, 1)
        self.fitted = True
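
This variant runs a fixed number of EM iterations and only stores the fitted parameters; no predict is shown. A minimal sketch of one, assuming the attribute names above (this method is a hypothetical addition, not part of the original):

    def predict(self, data):
        # hypothetical companion to the fit above: weight each
        # component's density by its prior and take the argmax
        posteriori = np.asarray([
            self.prior[k] * multivariate_normal.pdf(
                x=data, mean=self.mu[k], cov=self.cov[k])
            for k in range(self.k)
        ])
        return np.argmax(posteriori, axis=0)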
Example #4
    def fit(self, data):
        # Assignment 3
        # Initialization
        n_clusters = self.n_clusters_
        n_points = len(data)
        # pick the initial centers with k-means rather than sampling
        # k random points from the data
        kmean = km.K_Means(n_clusters=n_clusters, max_iter=30)
        kmean.fit(data)
        Mu = kmean.cluster_center
        # covariances, K x D x D, all initialized to the sample covariance
        Var = np.asarray([np.cov(data, rowvar=False)] * n_clusters)
        # mixing weight of each cluster: pi = [1/K, ..., 1/K]
        pi = [1 / n_clusters] * n_clusters
        # per-point membership weights (posteriors), N x K
        w = np.ones((n_points, n_clusters)) / n_clusters

        # EM iterations
        log_p = 1
        old_log_p = 0
        time_w, time_pi, time_mu, time_var = 0, 0, 0, 0
        for i in range(self.max_iter_):
            old_log_p = log_p
            # E-step: update the posterior weights
            time_start = time.time()
            w = self.update_w(data, Mu, Var, pi)
            time_w += time.time() - time_start
            # M-step: update the mixing weights
            time_start = time.time()
            pi = self.update_pi(w)
            time_pi += time.time() - time_start
            # update the cluster centers
            time_start = time.time()
            Mu = self.update_mu(data, w)
            time_mu += time.time() - time_start
            # update the covariance matrices
            time_start = time.time()
            Var = self.update_var(data, Mu, w)
            time_var += time.time() - time_start

            # stop once the log-likelihood stabilizes
            log_p = self.get_log(data, pi, Mu, Var)
            if abs(log_p - old_log_p) < 0.001:
                break

        # store the fitted parameters
        self.w_ = w
        self.pi_ = pi
        self.Mu_ = Mu
        self.Var_ = Var
        print("timing:", time_w, time_pi, time_mu, time_var)
    k3 = np.array(C3)
    plt.scatter(k1[:, 0], k1[:, 1], s=5)
    plt.scatter(k2[:, 0], k2[:, 1], s=5)
    plt.scatter(k3[:, 0], k3[:, 1], s=5)
    plt.show()
    return X
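
Only the tail of generate_X survives above (scattering k1/k2/k3 and returning X). A hedged reconstruction of the whole generator, consistent with that tail and with the __main__ call below; the per-cluster sample count is invented:

import numpy as np
import matplotlib.pyplot as plt

def generate_X(true_Mu, true_Var, n_per_cluster=400):
    # hypothetical reconstruction: draw n_per_cluster points from each
    # axis-aligned 2-D Gaussian, plot them, and return the stacked data
    clusters = [
        np.random.multivariate_normal(mu, np.diag(var), n_per_cluster)
        for mu, var in zip(true_Mu, true_Var)
    ]
    for c in clusters:
        plt.scatter(c[:, 0], c[:, 1], s=5)
    plt.show()
    return np.vstack(clusters)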


if __name__ == '__main__':
    # generate the sample data
    true_Mu = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    true_Var = [[1, 3], [2, 2], [6, 2]]
    X = generate_X(true_Mu, true_Var)

    # K-means
    kmeans = km.K_Means(n_clusters=3)
    kmeans.fit(X)
    cat = kmeans.predict(X)
    print(cat)

    # show_cluster(cat, X)  # visualize the predicted clusters

    # GMM
    gmm = GMM(n_clusters=3)
    gmm.fit(X)
    cat = gmm.predict(X)
    print(cat)
    show_cluster(cat, X)

    spectral_clustering = sc.SC(n_clusters=3, knn_k=5)
    spectral_clustering.fit(X)