Пример #1
0
 def ClusterBalance(self, indexesToPick, stopCount, kmeansFlag=True):
     """Pick a cluster-balanced subset of ``indexesToPick``.

     The selected rows of ``self.observations`` are passed through
     ``normalize(..., axis=0)`` and clustered (KMeans when ``kmeansFlag`` is
     true, SpectralClustering otherwise).  Indexes are then taken in
     passes, at most one per cluster per pass, until ``max(2, stopCount)``
     have been picked or no unpicked candidate is left.

     Parameters:
         indexesToPick: indexes into ``self.observations`` to choose from.
         stopCount: requested number of picks (a minimum of 2 is enforced).
         kmeansFlag: use KMeans if true, SpectralClustering otherwise.

     Returns:
         list with the picked entries of ``indexesToPick``, in pick order.
     """
     print("ClusterBalancing...")
     indexesPicked = []
     # Guard first: indexing/normalising an empty selection is pointless
     # and there is nothing to cluster.
     if len(indexesToPick) == 0:
         return indexesPicked

     obs = normalize(self.observations[indexesToPick], axis=0)
     # Never ask for more clusters than there are observations.
     n_clusters = min(len(obs), self.numClusters)
     if kmeansFlag:
         cluster = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
     else:
         # The spectral_clustering *function* needs an affinity matrix and
         # returns labels; the estimator class is what supports .fit().
         from sklearn.cluster import SpectralClustering
         cluster = SpectralClustering(n_clusters=n_clusters, n_init=10)
     cluster.fit(obs)
     labels = cluster.labels_

     whenToStop = max(2, stopCount)
     count = 0
     while count != whenToStop:
         remaining = set(range(self.numClusters))
         picked_this_pass = False
         for index, label in enumerate(labels):
             if label not in remaining:
                 continue
             indexesPicked.append(indexesToPick[index])
             remaining.discard(label)
             labels[index] = -1  # mark as consumed for later passes
             picked_this_pass = True
             count += 1
             if count == whenToStop or not remaining:
                 break
         if not picked_this_pass:
             # Every candidate is already used; without this the loop
             # would spin forever when fewer than whenToStop exist.
             break
     return indexesPicked
def test_spectral_clustering_with_arpack_amg_solvers():
    """Check that the arpack and amg eigen-solvers give the same clustering.

    The input is the small two-coin image from plot_segmentation_toy.py.
    When amg is not available, requesting it must raise ValueError.
    """
    rows, cols = np.indices((40, 40))

    # (centre, radius) of the two discs that make up the image
    discs = [((14, 12), 8), ((20, 25), 7)]
    circles = np.zeros((40, 40), dtype=bool)
    for (c_row, c_col), radius in discs:
        circles |= (rows - c_row) ** 2 + (cols - c_col) ** 2 < radius ** 2

    mask = circles.copy()
    img = circles.astype(float)

    graph = img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())

    labels_arpack = spectral_clustering(
        graph, n_clusters=2, eigen_solver='arpack', random_state=0)
    assert len(np.unique(labels_arpack)) == 2

    amg_kwargs = dict(n_clusters=2, eigen_solver='amg', random_state=0)
    if not amg_loaded:
        assert_raises(ValueError, spectral_clustering, graph, **amg_kwargs)
        return

    labels_amg = spectral_clustering(graph, **amg_kwargs)
    assert adjusted_rand_score(labels_arpack, labels_amg) == 1
Пример #3
0
def image_features_labels(img,n_clusters,maxPixel):
    """Segment a resized image with spectral clustering and flatten the labels.

    The image is resized to ``maxPixel x maxPixel``; its non-zero pixels form
    the mask.  Pixels outside the mask get the label -1.

    Returns a float vector of length ``maxPixel * maxPixel`` holding the
    per-pixel labels.
    """
    n_pixels = maxPixel * maxPixel
    img = resize(img, (maxPixel, maxPixel))
    mask = img.astype(bool)

    # Graph whose edges carry the image gradient, then turned into an
    # affinity with a decreasing function of that gradient.
    graph = s_im.img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())

    # arpack is forced because amg is numerically unstable on this example.
    pixel_labels = spectral_clustering(graph, n_clusters, eigen_solver='arpack')
    label_im = -np.ones(mask.shape)
    label_im[mask] = pixel_labels

    X = np.zeros(n_pixels, dtype=float)
    X[0:n_pixels] = label_im.reshape((1, n_pixels))
    return X
Пример #4
0
def test_spectral_amg_mode():
    """Exercise the ``amg`` mode of spectral clustering on three blobs.

    With pyamg installed the clustering only has to reach a loose accuracy
    floor; without it, requesting ``mode="amg"`` must raise ValueError.
    """
    blob_centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=blob_centers,
                                cluster_std=1., random_state=42)
    distances = pairwise_distances(X)
    # Turn distances into similarities and store them sparsely.
    S = sparse.coo_matrix(np.max(distances) - distances)

    # The import is only a probe for pyamg's availability.
    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        amg_loaded = False
    else:
        amg_loaded = True

    if not amg_loaded:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(blob_centers), random_state=0,
                      mode="amg")
        return

    labels = spectral_clustering(S, n_clusters=len(blob_centers),
                                 random_state=0, mode="amg")
    # Quality is not the point here, only that it ran; still require a
    # lower bound on the agreement with the true labels.
    assert_greater(np.mean(labels == true_labels), .3)
Пример #5
0
def speclu(data_matrix, k):
	#use spectral clustering
	print 'using spectral clustering......'
	E_matrix = getEMatrix(data_matrix)
	result_total = spectral_clustering(E_matrix, n_clusters = k)
	result = result_total[ : len(data_matrix)]
	return result
Пример #6
0
def compute(n):
    """Split the graph built for ``n`` into circles and keep the dense ones.

    The graph's adjacency matrix is turned into a connectedness matrix,
    clustered spectrally (10 clusters for 350+ rows, otherwise 6), and each
    cluster whose ``compute_density`` value is at least 0.25 (within 1e-9)
    is kept.

    Returns ``(ego, final_circle)``.
    """
    G, nodes, ego = build_graph(n)
    A = nx.to_numpy_matrix(G)
    C = connectedness(A)
    n_rows, _ = A.shape
    clus = 10 if n_rows >= 350 else 6
    L = spectral_clustering(C, n_clusters=clus)

    # Bucket the nodes by the label at the same position.
    circles = [[] for _ in range(0, clus)]
    for position, node in enumerate(nodes):
        circles[L[position]].append(node)

    final_circle = []
    for circle in circles:
        # NOTE(review): this tests len(circles), which is always `clus`
        # (6 or 10), so the shortcut is never taken - possibly
        # len(circle) was intended; confirm before changing.
        keep = len(circles) == 1 or not (
            compute_density(circle, nodes, A) + 1e-9 < .250)
        if keep:
            final_circle.append(circle)

    return ego, final_circle
Пример #7
0
    def classifySpeCluLsa(self, class_num):
        """Group titles into ``class_num`` clusters using LSA similarities.

        Loads the LSI model from 'model.lsa' and the similarity index from
        'model_lsa.index', fetches data and tf-idf vectors through
        ``self.get_data`` / ``self.get_tfidf`` (both with num=3000), builds a
        dense square similarity matrix, and clusters it with
        ``spectral_clustering`` (arpack solver).

        Side effects: fills ``self.vector_table`` with one list of title ids
        per cluster, then calls ``self.printTitleTOfile`` and
        ``draw_topic`` on the result.
        """
        from draw_data import draw_data 
        draw_title = draw_data()
        lsa = models.LsiModel.load('model.lsa', mmap='r')
        logging.info("load lsa model!!")
        index = similarities.MatrixSimilarity.load('model_lsa.index')
        self.get_data(num=3000)
        (tfidf, dictionary) = self.get_tfidf(True, num=3000)

        hash_id2list = dict() # maps title id -> index, for use with the rows/columns of similar_matrix
        for i in range(len(self.title_id)):
            hash_id2list[self.title_id[i]] = i

        logging.info('开始创建相似矩阵...')
        similar_matrix = np.zeros((len(tfidf),len(tfidf))) # holds the pairwise similarities
        for i in range(len(tfidf)):
            sims = index[lsa[tfidf[i]]]
            # Both [i][j] and [j][i] are written, so a later row overwrites
            # the mirrored cells an earlier row stored; the result is symmetric.
            for j,v in enumerate(sims): 
                similar_matrix[i][j] = v
                similar_matrix[j][i] = v
        logging.info('done,相似矩阵建立完成,使用普聚类进行分类...')
        labels = spectral_clustering(similar_matrix, n_clusters=class_num, eigen_solver='arpack')
        # One bucket of title ids per cluster label.
        self.vector_table = [[] for i in range(class_num)]
        for i in range(len(labels)):
            self.vector_table[labels[i]].append(self.title_id[i])
        logging.info("print set... "+str(len(self.vector_table)))
        self.printTitleTOfile(hash_id2list)
        draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-12-25')
Пример #8
0
def community_clustering():
    """Merge per-year communities into 200 clusters and write them to disk.

    Every file under ``settings.COMMUNITY_PATH`` is named after a year and
    holds ``"<author> <community id>"`` lines.  One ``Community`` is built
    per (year, id); the pairwise ``intersect`` values form the affinity
    matrix for ``spectral_clustering``.  For each year a file is written
    under ``settings.DATA_PATH + "\\clusters\\"`` with the members of every
    cluster, each followed by a comma.
    """
    n_clusters = 200
    path = settings.COMMUNITY_PATH
    index = 0
    communities = []
    merged_communities = {}
    for root, dirs, files in os.walk(path):
        for year in files:
            merged_communities[int(year)] = [[] for _ in range(n_clusters)]
            comm_dict = {}
            # Join with the directory actually being walked so files in
            # sub-directories resolve; the handle is closed by the with.
            with open(os.path.join(root, year)) as community_file:
                for line in community_file:
                    fields = line.strip().split(' ')
                    if fields == ['']:
                        continue  # tolerate blank lines
                    author = int(fields[0])
                    comm_id = int(fields[1])
                    if comm_id not in comm_dict:
                        comm_dict[comm_id] = Community(int(year), comm_id, index)
                        index += 1
                    comm_dict[comm_id].append_member(author)
            communities.extend(comm_dict.values())
    verbose.debug("num of communities: "+str(len(communities)))

    # np.zeros, not np.ndarray: the diagonal is never assigned below and
    # would otherwise contain uninitialised memory.
    n_communities = len(communities)
    adjacency = np.zeros((n_communities, n_communities), dtype=int)
    for i in range(n_communities):
        for j in range(i + 1, n_communities):
            affinity = communities[i].intersect(communities[j])
            adjacency[i, j] = affinity
            adjacency[j, i] = affinity

    labels = spectral_clustering(adjacency, n_clusters=n_clusters)
    verbose.debug("clustering finished")
    for community, label in zip(communities, labels):
        merged_communities[community.year][label].extend(community.members)

    for year, clusters in merged_communities.items():
        with open(settings.DATA_PATH+"\\clusters\\"+str(year), 'w') as cluster_file:
            for members in clusters:
                cluster_file.write(''.join(str(member) + ',' for member in members))
def spectral(tweetfile,npmifile,dictfile,k,noc):
	"""Cluster dictionary words spectrally and write per-cluster scores to a file.

	Reads ``npmifile`` and ``dictfile`` through ``textscan``, builds a sparse
	square matrix from the three parsed columns, reduces it with
	``knnmatrix(PMI, k)``, keeps only the most frequent connected component,
	and runs ``spectral_clustering`` with ``noc`` clusters.  For every
	cluster the values returned by ``centralityn`` are written as
	``"<word> <cluster> <score>"`` lines to the file ``cl.<tweetfile>-<noc>``.

	NOTE(review): the output file is opened with the Python 2 ``file()``
	builtin and is never explicitly closed.
	"""
	Ptmp=textscan(npmifile,'([^ ]*) ([^ ]*) ([^ ]*)');
	PP=textscan(dictfile,'(.*) (.*)',(int,str));
	# The -1 here and on the Ptmp columns below shifts the indices down by
	# one (presumably 1-based ids in the input files - confirm).
	PP[0] -= 1
	PMI=ssp.coo_matrix(
		(Ptmp[2],(Ptmp[0]-1,Ptmp[1]-1)),
		(PP[0].shape[0],PP[0].shape[0])
	).tocsr();

	W=knnmatrix(PMI,k);
	# This is hideous and wrong and it must be fixed: the matrix is made
	# symmetric by densifying W and W.T and taking their element-wise minimum.
	W=ssp.csr_matrix(minimum(W.todense(),W.T.todense()))
	
	# Keep only the rows/columns of the most common component label.
	s,comp = ssp.csgraph.connected_components(W,directed=False)
	comp_mode = mstats.mode(comp)[0]
	inds = comp==comp_mode
	inds = [x for x in range(W.shape[0]) if inds[x]]
	WW = W[inds,:][:,inds]
	P=PP[1][inds];

	ids = P;
	X = WW;

	c = spectral_clustering(X,n_clusters=noc, eigen_solver='arpack')
	fid=file("".join(['cl.',tweetfile,'-',str(noc)]),'w');
	for i in range(max(c)+1):
		# positions of the members of cluster i
		cl=[x for x in range(len(c)) if c[x] == i]
		b,wordsix = centralityn(cl,X,ids);
		for j in range(len(b)):
			word=wordsix[j];
			fid.write('%s %d %.5f\n'%(word,i,b[j]));
Пример #10
0
	def __init__(self, laplacian,ncluster,classesnames):
		self.laplacian = laplacian
		self.ncluster = ncluster
		m,n=laplacian.shape
		print 'size Laplacian_matrix: ',m, n
		labels = spectral_clustering(laplacian, n_clusters=ncluster)

		x=range(n+1)
		wordsall=zip(x, classesnames)
		lc= zip(labels,x)
		print "labels", lc
		allwordsclustered=[]
		for m in range(ncluster):
			sort=[item[1] for item in lc if item[0] == m]
			wordsclustered=[]

			for y in sort:

				for item in wordsall:
				 if item[0] == y:
				  wordsclustered.append(item[1])
			if len(wordsclustered) >1:	
				allwordsclustered.append(wordsclustered)

		print'clusteredwords'
		print allwordsclustered
		
		self.cluster=  len(allwordsclustered),allwordsclustered
Пример #11
0
  def getPairwiseDistanceMatrix(self):
    """
      Build a sparse nearest-neighbour affinity matrix and cluster it.

      For every row of ``self.data_points`` the 5 smallest euclidean
      distances are stored symmetrically in ``self.PDistMat``; the stored
      values are then mapped through ``exp(-d / mean)``.  The matrix is
      pickled to 'pdist.bnbb', clustered into 20 groups with
      ``spectral_clustering``, and the labels are pickled to 'labs.bnbb'.

      It is slightly slower but memory efficient; the fast implementation is
      not tractable in terms of memory for such a scale.
    """
    self.clusters = []
    dataSize = self.data_points.shape
    self.PDistMat = sp.sparse.csr_matrix((dataSize[0],dataSize[0]))
    for k in range(dataSize[0]):
      CurrentPoint = self.data_points[k,:]
      Dist = sp.spatial.distance.cdist(np.reshape(CurrentPoint,(1,dataSize[1])),self.data_points,'euclidean')
      kMins = []
      kDists = []
      # Larger than any real distance: used to knock out entries that
      # have already been selected.
      maxD = np.max(Dist)+1
      while len(kMins)<5:
        cMins = np.argmin(Dist)
        kMins.append(cMins)
        kDists.append(Dist[0,cMins])
        Dist[0,cMins]=maxD
      for neighbour, distance in zip(kMins, kDists):
        self.PDistMat[k,neighbour]=distance
        self.PDistMat[neighbour,k]=distance

    # Go a bit low-level and apply e^(-x/mean) directly to the stored
    # entries of the sparse matrix.
    SM=self.PDistMat.data.mean()
    self.PDistMat.data[:] = np.exp(((-1)*self.PDistMat.data)/SM)

    # Context managers make sure both pickle files are flushed and closed.
    with open('pdist.bnbb','wb') as matrix_file:
      pickle.dump(self.PDistMat,matrix_file)
    labs = spectral_clustering(self.PDistMat,n_clusters=20)
    with open('labs.bnbb','wb') as labels_file:
      pickle.dump(labs,labels_file)
Пример #12
0
def cluster_nodes(dist_laplacian, clusters=3, show=False):
    """Spectrally cluster the nodes described by ``dist_laplacian``.

    The matrix returned by ``Lapl_normalize`` gets a zero diagonal and is
    negated before being used as the affinity.  With ``show`` set, that
    affinity is displayed as a heat map first.

    Returns the labels as a column vector of shape (n_nodes, 1).
    """
    affinity = Lapl_normalize(dist_laplacian)
    affinity.setdiag(0)
    affinity = -affinity

    if show:
        plt.imshow(affinity.toarray(), cmap='jet', interpolation="nearest")
        plt.colorbar()
        plt.show()

    labels = spectral_clustering(
        affinity, n_clusters=clusters, eigen_solver='arpack')
    n_nodes = dist_laplacian.shape[0]
    return np.reshape(labels, (n_nodes, 1))
def spectralClusteringTest01():
	"""Demo: segment four noisy discs in a 100 x 100 image with spectral clustering.

	Builds the image and its mask, converts the masked image to a graph,
	clusters it into 4 groups with the arpack solver, and shows both the
	noisy image and the label image with matplotlib.
	"""
	import numpy as np
	import matplotlib.pyplot as plt

	from sklearn.feature_extraction import image
	from sklearn.cluster import spectral_clustering

	l = 100
	x,y = np.indices((l, l)) # x and y are both 2-D arrays holding each point's x and y coordinate


	center1 = (28, 24)
	center2 = (40, 50)
	center3 = (67, 58)
	center4 = (24, 70)

	radius1, radius2, radius3, radius4 = 16, 14, 15, 14

	circle1 = (x - center1[0]) ** 2 + (y - center1[1]) ** 2 < radius1 ** 2
	circle2 = (x - center2[0]) ** 2 + (y - center2[1]) ** 2 < radius2 ** 2
	circle3 = (x - center3[0]) ** 2 + (y - center3[1]) ** 2 < radius3 ** 2
	circle4 = (x - center4[0]) ** 2 + (y - center4[1]) ** 2 < radius4 ** 2


	img = circle1 + circle2 + circle3 + circle4
	mask = img.astype(bool)
	img = img.astype(float)

	# add an offset of 1 plus gaussian noise to every pixel
	img += 1 + 0.2 * np.random.randn(*img.shape)

	# Convert the image into a graph with the value of the gradient on the edges

	# img is a 100 * 100 image
	# mask is a boolean 100 * 100 template
	# graph is a sparse matrix -- but why is it 2678 * 2678 ?
	# presumably the gradient is computed inside this step
	graph = image.img_to_graph(img, mask = mask)

	print graph.shape
	graph.data = np.exp(-graph.data / graph.data.std())

	# the number of clusters still has to be specified here
	# only the points inside the mask are clustered
	labels = spectral_clustering(graph, n_clusters = 4, eigen_solver = "arpack")


	print labels

	# pixels outside the mask keep the label -1
	label_im = -np.ones(mask.shape)
	label_im[mask] = labels

	plt.matshow(img)
	plt.matshow(label_im)

	plt.show()
def cluster_spatial_data(X, n_parcels, xyz=None, shape=None, mask=None,
                         method='ward', verbose=False):
    """Cluster spatially-structured data with Ward, spectral or k-means

    Parameters
    ==========
    X: array of shape(n_voxels, n_subjects)
       the functional data, across subjects
    n_parcels: int, the desired number of parcels
    xyz: array of shape (n_voxels, 3), optional
         positions of the voxels in grid coordinates
    shape: tuple: the domain shape (assuming a grid structure), optional
          alternative specification of positions
    mask: arbitrary array of arbitrary dimension,optional
          alternative specification of positions; when `shape` is not
          given the grid shape is taken from the mask itself
    method: string, one of ['ward', 'spectral', 'kmeans'], optional
            clustering method
    verbose: bool, optional
             currently unused

    Returns
    =======
    label: array of shape(n_voxels): the resulting cluster assignment

    Raises
    ======
    ValueError: if none of mask, shape, xyz is provided, or if `method`
                is not one of the supported names

    Note
    ====
    One of xyz, shape or mask needs to be provided
    """
    from sklearn.cluster import spectral_clustering, k_means
    if mask is not None:
        # A mask alone is a valid specification: fall back to its own
        # shape instead of failing on `*None`.
        grid_shape = np.shape(mask) if shape is None else shape
        connectivity = grid_to_graph(*grid_shape, mask=mask)
    elif shape is not None:
        connectivity = grid_to_graph(*shape)
    elif xyz is not None:
        from sklearn.neighbors import kneighbors_graph
        n_neighbors = 2 * xyz.shape[1]
        connectivity = kneighbors_graph(xyz, n_neighbors=n_neighbors)
    else:
        raise ValueError('One of mask, shape or xyz has to be provided')

    if n_parcels == 1:
        return np.zeros(X.shape[0])
    if method == 'ward':
        connectivity = connectivity.tocsr()
        ward = Ward(n_clusters=n_parcels, connectivity=connectivity).fit(X)
        label = ward.labels_
    elif method == 'spectral':
        # Gaussian weights on the squared feature distance between
        # connected voxels, scaled by the mean squared distance.
        i, j = connectivity.nonzero()
        sq_dist = np.sum((X[i] - X[j]) ** 2, 1)
        sigma = sq_dist.mean()
        connectivity.data = np.exp(- sq_dist / (2 * sigma))
        label = spectral_clustering(connectivity, n_clusters=n_parcels)
    elif method == 'kmeans':
        _, label, _ = k_means(X, n_parcels)
    else:
        raise ValueError('Unknown method for parcellation')
    return label
Пример #15
0
def cluster_and_rank_demos(sm, n_clusters, eigen_solver='arpack', assign_labels='discretize'):
    """
    Cluster demos from the similarity matrix ``sm`` and rank them per cluster.

    The demo indices are bucketed by their spectral-clustering label and the
    buckets are handed to ``rank_demos_in_cluster`` together with ``sm``.
    """
    labels = spectral_clustering(sm,
                                 n_clusters=n_clusters,
                                 eigen_solver=eigen_solver,
                                 assign_labels=assign_labels)

    # label -> indices of the demos carrying that label
    clusters = dict((cluster_id, []) for cluster_id in xrange(n_clusters))
    for demo_index, label in enumerate(labels):
        clusters[label].append(demo_index)

    # Maybe re-cluster large demos
    return rank_demos_in_cluster(clusters, sm)
Пример #16
0
    def test_spectral_clustering(self):
        """ModelFrame's spectral_clustering accessor must match sklearn's function.

        A random symmetric 50 x 50 matrix is clustered both ways with the
        same random state; the result must be a ModelSeries sharing the
        frame's index and holding the same labels.
        """
        N = 50
        # random_integers is deprecated; randint's upper bound is exclusive,
        # so 201 keeps the original inclusive 1..200 range.
        m = np.random.randint(1, 201, size=(N, N))
        m = (m + m.T) / 2

        df = pdml.ModelFrame(m)
        result = df.cluster.spectral_clustering(random_state=self.random_state)
        expected = cluster.spectral_clustering(m, random_state=self.random_state)

        self.assertIsInstance(result, pdml.ModelSeries)
        tm.assert_index_equal(result.index, df.index)
        tm.assert_numpy_array_equal(result.values, expected)
def test_spectral_lobpcg_mode():
    """Smoke-test spectral clustering with the ``lobpcg`` eigen-solver.

    A fairly big data matrix is used because lobpcg does not work with
    small ones.
    """
    blob_centers = np.array([[0.0, 0.0], [10.0, 10.0]])
    X, true_labels = make_blobs(
        n_samples=100, centers=blob_centers, cluster_std=0.1, random_state=42)

    distances = pairwise_distances(X)
    similarity = np.max(distances) - distances

    labels = spectral_clustering(
        similarity,
        n_clusters=len(blob_centers),
        random_state=0,
        eigen_solver="lobpcg",
    )

    # Quality is not the point, only that it ran; still require a lower
    # bound on the agreement with the true labels.
    agreement = np.mean(labels == true_labels)
    assert_greater(agreement, 0.3)
Пример #18
0
 def __speclu(self):
     #use spectral clustering
     print 'using spectral clustering......'
     data_matrix = self.data_matrix
     if len(data_matrix) == len(data_matrix[0]):
         print "Donot need to use E_matrix"
         E_matrix = data_matrix
     else:
         E_matrix = self.__getEMatrix()
     result_total = spectral_clustering(E_matrix, n_clusters=self.k)
     result = result_total[:len(data_matrix)]
     return result
Пример #19
0
def spectralcluster(correlations,n_clusters,names):
    labels=cluster.spectral_clustering(correlations,n_clusters=n_clusters, eigen_solver=None, random_state=0, n_init=10,  k=None, eigen_tol=0.0, 
    assign_labels='kmeans', mode=None)
    #print labels
    clusdict=[]
    print ""
    print "Spectral Clustering - shape: " + str(correlations.shape)
    for i in range(labels.max()+1):
        print 'Cluster %i: %s' % ((i+1),', '.join(names[labels==i]))
        clusdict.append(names[labels==i])
    #print clusdict                     
    return clusdict
	def __speclu(self):
		#use spectral clustering
		print 'using spectral clustering......'
		data_matrix = self.data_matrix
		if len(data_matrix) == len(data_matrix[0]):
			print "Donot need to use E_matrix"
			E_matrix = data_matrix
		else:
			E_matrix = self.__getEMatrix()
		result_total = spectral_clustering(E_matrix, n_clusters = self.k)
		result = result_total[ : len(data_matrix)]
		return result
Пример #21
0
def run_snf2(w1, w2, wall_label):
    """Fuse two views with SNF2, embed, cluster and score against ``wall_label``.

    Steps, as performed below: pairwise ``dist2`` distances per view ->
    ``snf.compute.affinity_matrix`` -> ``snf2`` fusion -> ``tsne_p_deep``
    integration -> affinity of the integrated embedding ->
    ``spectral_clustering`` with 10 clusters -> ``v_measure_score`` against
    ``wall_label["label"]``.

    Reads ``neighbor_size`` and ``mu`` from the free name ``args``
    (NOTE(review): ``args`` is not a parameter; presumably a module-level
    global - confirm).

    Returns the v-measure score, which is also printed.
    """
    Dist1 = dist2(w1.values, w1.values)
    Dist2 = dist2(w2.values, w2.values)

    S1 = snf.compute.affinity_matrix(Dist1, K=args.neighbor_size, mu=args.mu)
    S2 = snf.compute.affinity_matrix(Dist2, K=args.neighbor_size, mu=args.mu)

    # Do SNF2 diffusion
    (
        dicts_common,
        dicts_commonIndex,
        dict_sampleToIndexs,
        dicts_unique,
        original_order,
    ) = data_indexing([w1, w2])
    # Label each affinity matrix with the sample order reported for its view.
    S1_df = pd.DataFrame(data=S1, index=original_order[0], columns=original_order[0])
    S2_df = pd.DataFrame(data=S2, index=original_order[1], columns=original_order[1])

    fused_networks = snf2(
        args,
        [S1_df, S2_df],
        dicts_common=dicts_common,
        dicts_unique=dicts_unique,
        original_order=original_order,
    )

    S1_fused = fused_networks[0]
    S2_fused = fused_networks[1]

    # S2_fused = S2_fused.reindex(wall_label.index.tolist())
    # labels_final = spectral_clustering(S2_fused.values, n_clusters=10)
    # score = v_measure_score(wall_label["label"].tolist(), labels_final)
    # print("SNF2 for clustering union 832 samples NMI score:", score)

    S_final = tsne_p_deep(
        args,
        dicts_commonIndex,
        dict_sampleToIndexs,
        [S1_fused.values, S2_fused.values],
    )

    # Put the integrated rows in the same order as the ground-truth labels.
    S_final_df = pd.DataFrame(data=S_final, index=dict_sampleToIndexs.keys())
    S_final_df = S_final_df.reindex(wall_label.index.tolist())

    Dist_final = dist2(S_final_df.values, S_final_df.values)
    Wall_final = snf.compute.affinity_matrix(
        Dist_final, K=args.neighbor_size, mu=args.mu
    )

    labels_final = spectral_clustering(Wall_final, n_clusters=10)
    # The printed message says "NMI" but the value is the v-measure.
    score = v_measure_score(wall_label["label"].tolist(), labels_final)
    print("SNF2 for clustering union 832 samples NMI score:", score)
    return score
Пример #22
0
    def test_spectral_clustering(self):
        """ModelFrame's spectral_clustering accessor must match sklearn's function.

        A random symmetric 50 x 50 matrix is clustered both ways with the
        same random state; the result must be a ModelSeries sharing the
        frame's index and holding the same labels.
        """
        N = 50
        # random_integers is deprecated; randint's upper bound is exclusive,
        # so 201 keeps the original inclusive 1..200 range.
        m = np.random.randint(1, 201, size=(N, N))
        m = (m + m.T) / 2

        df = pdml.ModelFrame(m)
        result = df.cluster.spectral_clustering(random_state=self.random_state)
        expected = cluster.spectral_clustering(m, random_state=self.random_state)

        self.assertTrue(isinstance(result, pdml.ModelSeries))
        self.assert_index_equal(result.index, df.index)
        self.assert_numpy_array_equal(result.values, expected)
    def cluster_financial_indexs(self, k):
        """Cluster ``self.fetch_factors`` with the algorithm named by ``self.alg``.

        Supported values of ``self.alg``: 'kmean', 'agglomerative', 'DBSCAN',
        'spectral', 'birch', 'affinity'.  Any other value prints a message
        and calls ``exit(-1)``.  ``k`` is the cluster count for the kmean,
        agglomerative and spectral branches; the other branches use their
        own hard-coded settings.

        Side effects: adds a "Cluster" column to ``self.fetch_factors``,
        sorts the frame by it, and writes it to
        ``self.path_cluster.format(date)`` as CSV.
        """
        #fecth_indexs = FIR.fetch_selected_financial_indexs(indexs, self.dates)
        #date = '2017-12-31'
        # NOTE(review): `dates` is not defined in this method; presumably a
        # module-level global (or self.dates was intended) - confirm.
        date = dates[0]
        print('cluster is', date, 'alg is', self.alg)
        # k = 10
        X = self.fetch_factors.values

        if self.alg == 'kmean':
            km = KMeans(n_clusters=k, random_state=42)
            km.fit(X)
            labels = km.labels_
        elif self.alg == 'agglomerative':
            ward = AgglomerativeClustering(n_clusters=k, linkage='ward')
            ward.fit(X)
            labels = ward.labels_
        elif self.alg == 'DBSCAN':
            # Compute DBSCAN
            db = DBSCAN(eps=10, min_samples=10).fit(X)
            labels = db.labels_
        elif self.alg == 'spectral':
            # Spectral clustering; X is passed directly as the affinity
            # argument and the function returns the labels itself.
            labels = spectral_clustering(X,
                                         n_clusters=k,
                                         eigen_solver='arpack')
            #labels = db.labels_
        elif self.alg == 'birch':
            # Birch with a fixed n_clusters=300 (k is not used here)
            brc = Birch(threshold=50,
                        branching_factor=50,
                        n_clusters=300,
                        compute_labels=True)
            labels = brc.fit(X)
            labels = labels.labels_
        elif self.alg == 'affinity':

            #af = AffinityPropagation(affinity='precomputed').fit(X)
            af = AffinityPropagation(max_iter=500, affinity='euclidean').fit(X)
            labels = af.labels_
        else:
            print('not support this cluster')
            exit(-1)

        #labels = spectral_clustering(self.fetch_factors[date].values, n_clusters=k,
        # assign_labels='discretize', random_state=1)

        self.fetch_factors["Cluster"] = labels
        #self.fetch_factors[date]["Cluster"].sort(key='Cluster', reverse=False)
        self.fetch_factors = self.fetch_factors.sort_values("Cluster",
                                                            axis=0,
                                                            ascending=True)
        self.fetch_factors.to_csv(self.path_cluster.format(date))
        print('save folder is', self.path_cluster.format(date))
Пример #24
0
def affin_sclustering(X,n_clust, distance='euclid', gamma=0.1, std=1):
    """Spectral clustering of ``X`` on an affinity built from its rows.

    Parameters:
        X: data passed to ``cos`` or ``euclidean_distances``.
        n_clust: number of clusters.
        distance: 'cosine' (affinity is ``cos(X)``) or 'euclid' (affinity is
            ``exp(-gamma * d)`` of the euclidean distances).
        gamma: decay factor of the euclidean kernel.
        std: if truthy, the distances are divided by their standard
            deviation before the kernel is applied.

    Returns:
        the labels from ``cluster.spectral_clustering`` (arpack solver).

    Raises:
        ValueError: if ``distance`` is not 'cosine' or 'euclid'.
    """
    # Fail early and clearly; previously an unknown name fell through to
    # an unbound `similarity` further down.
    if distance not in ('cosine', 'euclid'):
        raise ValueError(
            "distance must be 'cosine' or 'euclid', got %r" % (distance,))

    print('Basic spectral clustering using affinity matrix')
    if distance == 'cosine':
        similarity = cos(X)
    else:
        dist = euclidean_distances(X)
        scaled = dist / dist.std() if std else dist
        similarity = np.exp(-gamma * scaled)
    labels = cluster.spectral_clustering(similarity, n_clusters=n_clust,
                                         eigen_solver='arpack')
    return labels
def test_spectral_clustering_with_arpack_amg_solvers():
    """Check that the arpack and amg eigen-solvers give the same clustering.

    The input is the small two-coin image from plot_segmentation_toy.py.
    When amg is not available, requesting it must raise ValueError.
    """
    x, y = np.indices((40, 40))

    def disc(center, radius):
        # boolean image of the open disc around `center`
        return (x - center[0])**2 + (y - center[1])**2 < radius**2

    circles = disc((14, 12), 8) | disc((20, 25), 7)
    mask = circles.copy()
    img = circles.astype(float)

    graph = img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())

    common = {'n_clusters': 2, 'random_state': 0}

    labels_arpack = spectral_clustering(graph, eigen_solver='arpack', **common)
    assert len(np.unique(labels_arpack)) == 2

    if amg_loaded:
        labels_amg = spectral_clustering(graph, eigen_solver='amg', **common)
        assert adjusted_rand_score(labels_arpack, labels_amg) == 1
    else:
        assert_raises(ValueError, spectral_clustering, graph,
                      eigen_solver='amg', **common)
def cluster_spectral(X):
    """Cluster ``X`` spectrally and export a labelled kNN graph.

    Returns ``(labels, graph_json)``: the labels as a list, and the
    node-link data of the graph from ``create_knn_graph(similarity, 8)``
    whose nodes carry their label (as a string) in the 'group' attribute.
    """
    similarity = compute_similarity_matrix(X)
    labels = spectral_clustering(similarity)

    # node index -> label rendered as text
    group_of = dict((node, str(label)) for node, label in enumerate(labels))

    knn_graph = create_knn_graph(similarity, 8)
    nx.set_node_attributes(knn_graph, group_of, 'group')

    # export clustered graph as json
    exported = json_graph.node_link_data(knn_graph)
    return list(labels), exported
Пример #27
0
def gen_codebook(graphs, W_matrix, group_num=16):
    """Build a codebook of graphs, one entry per centre returned for the groups.

    ``W_matrix`` is clustered into ``group_num`` groups; the grouped row
    indices go to ``processing_grouping``, and the graphs at the indices it
    returns form the codebook.
    """
    n_rows = len(W_matrix)
    assignments = spectral_clustering(W_matrix, n_clusters=group_num)

    groups = [[] for _ in range(group_num)]
    for row in range(n_rows):
        groups[assignments[row]].append(row)

    centers = processing_grouping(groups, W_matrix)
    return [graphs[center] for center in centers]
Пример #28
0
    def clustering_preGraph(self):
        """Print NMI scores of three clusterings for every graph in ``self.edgeDict``.

        For each key, with 12 clusters: (1) spectral clustering of
        ``self.initX(A)``, (2) spectral clustering of the rescaled cosine
        similarity of the 50-component PCA of ``self.refexFeature[key]``,
        (3) KMeans on the raw ``self.refexFeature[key]``.  Each score from
        ``self.NMI`` against the hard labels is printed.

        NOTE(review): ``nodeIndexDict`` is a free name here (not an
        attribute of self) - presumably a module-level global; confirm.
        """

        # only the hard labels are used below
        hardLabelDict, softLabelDict = self.getLabel()

        for key in self.edgeDict:
            groundTrues = hardLabelDict[key]
            clusterNum = 12

            A = self.edgeDict[key]
            nt = NetworkTool()
            nt.initNetwork(A, nodeIndexDict[key])
            X = self.initX(A)
            labels_ajen = spectral_clustering(X,
                                              n_clusters=clusterNum,
                                              eigen_solver='arpack')
            nmi_sc = self.NMI(labels_ajen.tolist(), groundTrues, clusterNum)
            print nmi_sc
            # counter=self.counter(labels,clusterNum)

            Y = self.refexFeature[key]
            pca = PCA(n_components=50, svd_solver='full')
            Y_50 = pca.fit_transform(Y)
            S = cosine_similarity(Y_50)
            # shift the cosine similarities from [-1, 1] into [0, 1]
            S = (S + 1.0) / 2.0
            labels = spectral_clustering(S,
                                         n_clusters=clusterNum,
                                         eigen_solver='arpack')
            counter = self.counter(labels, clusterNum)
            nmi_sc = self.NMI(labels.tolist(), groundTrues, clusterNum)
            print nmi_sc

            # self.draw(nodeIndexDict[key],nt,labels,str(key)+'_spectral_'+str(clusterNum)+'.png')
            # self.output(nodeIndexDict[key],labels,str(key)+'_spectral')

            kmeans = KMeans(n_clusters=clusterNum, random_state=0).fit(Y)
            labels_km = kmeans.labels_.tolist()
            counter = self.counter(labels_km, clusterNum)
            nmi_km = self.NMI(labels_km, groundTrues, clusterNum)
            print nmi_km
Пример #29
0
def sp_clustering(img):
    """Segment ``img`` into 64 regions with spectral clustering and plot it.

    Two matplotlib figures are created: the input image and the label
    image.  Nothing is returned.
    """
    graph = image.img_to_graph(img)

    # Take a decreasing function of the gradient: we take it weakly
    # dependent from the gradient the segmentation is close to a voronoi
    graph.data = np.exp(-graph.data / graph.data.std())

    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    labels = spectral_clustering(graph, n_clusters=64, eigen_solver='arpack')

    # No mask is used, so there is one label per pixel: fold the flat
    # label vector back into the image shape, which matshow requires.
    label_im = np.reshape(labels, np.shape(img))

    plt.matshow(img)
    plt.matshow(label_im)
Пример #30
0
def graph_cuts(fg_embed,
               edge_index,
               num_cg,
               bandwidth=1.0,
               kernel='rbf',
               device=torch.device(0)):
    """Assign nodes to ``num_cg`` groups by spectral clustering of their affinity.

    The affinity comes from ``compute_affinity``; it is moved to the CPU and
    converted to NumPy before clustering with ``assign_labels='discretize'``.

    Returns ``(pred_cg_idx, affinity)``: the cluster labels and the affinity
    as returned by ``compute_affinity``.
    """
    affinity = compute_affinity(
        fg_embed, edge_index, bandwidth, kernel, device)
    affinity_np = affinity.cpu().numpy()

    pred_cg_idx = spectral_clustering(
        affinity_np, n_clusters=num_cg, assign_labels='discretize')
    return pred_cg_idx, affinity
Пример #31
0
 def _cluster_model(self, model_name, c):
     if model_name == 'KMeans':
         model = KMeans(n_clusters=c, init='k-means++')
     elif model_name == 'HAC':
         model = AgglomerativeClustering(n_clusters=c,
                                         affinity='euclidean',
                                         linkage='ward')
     elif model_name == 'Spectral':
         model = spectral_clustering(n_clusters=c)
     else:
         print("Options for models are KMeans, HAC or Spectral.")
         exit(-1)
     return model
Пример #32
0
def sp(data, class_num, data_nm, label):
    """Plot spectral clusterings of ``data`` for six kernel widths and save the figure.

    For each ``s`` in ``np.logspace(-2, 0, 6)`` an affinity is built from the
    pairwise distances, clustered into ``class_num`` groups, and drawn in
    its own subplot (first two columns of ``data``).  The clustering of the
    last ``s`` is evaluated with ``evaluate.eva`` and the metrics printed.
    The figure is saved under ``.\\picture\\improved_spectral_clustering``.

    Parameters:
        data: samples; columns 0 and 1 are used as plot coordinates.
        class_num: number of clusters.
        data_nm: data-set name used in the title and file name.
        label: ground-truth labels passed to ``evaluate.eva``.
    """
    n_clusters = class_num
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    m = euclidean_distances(data, squared=True)
    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle(u'谱聚类', fontsize=20)
    clrs = [
        '#B03060', '#AEEEEE', '#68228B', 'y', 'c', 'm', '#2E2E2E', '#00008B',
        '#2E8B57', '#FAEBD7', '#8B5A00', '#EEEE00', '#0000FF', '#ABABAB',
        '#8B8B00'
    ]

    for i, s in enumerate(np.logspace(-2, 0, 6)):

        # NOTE(review): m already holds *squared* distances, so m**2 is the
        # fourth power of the distance - confirm this is intended.
        af = np.exp(-m**2 / (s**2)) + 1e-6
        y_hat = spectral_clustering(af,
                                    n_clusters=n_clusters,
                                    assign_labels='kmeans',
                                    random_state=1)
        plt.subplot(2, 3, i + 1)
        # one scatter call per colour/cluster id
        for k, clr in enumerate(clrs):
            cur = (y_hat == k)
            plt.scatter(data[cur, 0],
                        data[cur, 1],
                        s=40,
                        color=clr,
                        edgecolors='k')
        x1_min, x2_min = np.min(data, axis=0)
        x1_max, x2_max = np.max(data, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(True)
        plt.title(u'sigma = %.2f' % s, fontsize=16)
    # Only the clustering from the last kernel width is evaluated.
    print("标准化互信息      精度      纯度     轮廓系数    兰德系数")
    nmi, acc, purity, Sc, ARI = evaluate.eva(y_hat, label, data)
    print(nmi, acc, purity, Sc, ARI)
    plt.tight_layout()
    plt.title("SC1+" + data_nm)
    plt.subplots_adjust(top=0.9)
    # Raw string: same path as before, without invalid "\p", "\i", "\s"
    # escape sequences.
    plt.savefig(
        r'.\picture\improved_spectral_clustering\sc1_{0}.png'.format(data_nm))
    plt.close()
Пример #33
0
def affin_sclustering(X, n_clust, distance='euclid', gamma=0.1, std=1):
    """Spectral clustering on an affinity matrix derived from ``X``.

    Args:
        X: Samples, passed to the similarity/distance helper.
        n_clust: Number of clusters to extract.
        distance: ``'cosine'`` uses the module's ``cos`` helper directly as
            the similarity; ``'euclid'`` turns pairwise Euclidean distances
            ``d`` into ``exp(-gamma * d)``.
        gamma: Decay rate of the exponential kernel (``'euclid'`` only).
        std: When truthy, distances are divided by their standard deviation
            before the kernel is applied (``'euclid'`` only).

    Returns:
        The labels returned by ``cluster.spectral_clustering`` (run with the
        ``arpack`` eigen solver).

    Raises:
        ValueError: if ``distance`` is neither ``'cosine'`` nor ``'euclid'``.
    """
    # Single-argument call form prints identically on Python 2 and 3.
    print('Basic spectral clustering using affinity matrix')
    if distance == 'cosine':
        similarity = cos(X)
    elif distance == 'euclid':
        dist = euclidean_distances(X)
        if std:
            similarity = np.exp(-gamma * dist / dist.std())
        else:
            similarity = np.exp(-gamma * dist)
    else:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on ``similarity``.
        raise ValueError(
            "unsupported distance %r; expected 'cosine' or 'euclid'"
            % (distance,))
    labels = cluster.spectral_clustering(similarity,
                                         n_clusters=n_clust,
                                         eigen_solver='arpack')
    return labels
 def global_clustering_by_spectral(self):
     """Group the local clusters of every time slide into global clusters.

     The weighted Jaccard feature matrix is spectral-clustered into
     ``self.num_global_clusters`` groups.  Afterwards
     ``self.global_clusters[t][g]`` lists the local-cluster indexes of time
     slide ``t`` assigned to global cluster ``g``, and
     ``self.global_cluster_labels[t][i]`` holds the global label of local
     cluster ``i`` of slide ``t``.
     """
     n_global = self.num_global_clusters
     features = self.build_global_feature_vectors_by_jaccard_with_weight()
     logging.info("Global spectral clustering...")
     labels = spectral_clustering(features, n_clusters=n_global, eigen_solver='arpack')
     logging.info("Global spectral finished")
     self.global_clusters = [[[] for _ in range(n_global)]
                             for _ in range(self.num_time_slides)]
     self.global_cluster_labels = [[None] * self.num_local_clusters
                                   for _ in range(self.num_time_slides)]
     for slide in range(self.num_time_slides):
         for local_idx, _ in enumerate(self.local_clusters[slide]):
             # Map (slide, local cluster) to its row in the feature matrix.
             row = self.gloabl_feature_vectors_index[slide][local_idx]
             global_label = labels[row]
             self.global_clusters[slide][global_label].append(local_idx)
             self.global_cluster_labels[slide][local_idx] = global_label
Пример #35
0
def get_spectralClustering(similarity, cluster_num):
    """
    Run spectral clustering on a similarity matrix after zeroing its NaNs.

    :param similarity: similarity matrix (anything ``pd.DataFrame`` accepts)
    :param cluster_num: number of clusters passed to ``spectral_clustering``
    :return: the labels returned by ``cl.spectral_clustering``
    """
    affinity = pd.DataFrame(similarity).values
    # NaN entries are treated as "no similarity".
    # NOTE(review): ``.values`` may share memory with the input, in which
    # case the caller's array is modified too - confirm.
    nan_mask = np.isnan(affinity)
    affinity[nan_mask] = 0

    return cl.spectral_clustering(affinity=affinity,
                                  n_clusters=cluster_num)
Пример #36
0
def unify_communities_spectral_mean(params, GT):
    """Cluster the summed adjacency matrix of all graphs in ``GT``.

    This is the baseline compared against by Han, Xu and Airoldi (ICML 2015),
    who propose a variational profile MLE algorithm.

    Args:
        params: Mapping; ``params['k']`` is the number of communities.
        GT: Collection of graphs; ``GT[0]`` defines the node set.

    Returns:
        A pair ``(gfinal, {})`` where ``gfinal`` maps every node ``i`` of
        ``GT[0]`` to its 1-based community label, read from position
        ``i - 1`` of the label vector (presumably nodes are numbered from
        1 - verify against the caller).
    """
    n_nodes = len(GT[0].nodes)
    adj_matrix_summed = sp.sparse.csr_matrix(np.zeros((n_nodes, n_nodes)),
                                             dtype=int)
    for graph in GT:
        adj_matrix_summed += nx.adjacency_matrix(graph)

    # Shift the 0-based labels so that communities are numbered from 1.
    spout = spectral_clustering(adj_matrix_summed, n_clusters=params['k']) + 1
    gfinal = {node: spout[node - 1] for node in GT[0].nodes()}
    return gfinal, {}
Пример #37
0
def clustering(mat, k, names, size=2):
    """Spectral-cluster ``mat`` and return the clusters as lists of names.

    Args:
        mat: Affinity matrix handed to ``spectral_clustering``.
        k: Number of clusters.
        names: Sequence mapping a row index of ``mat`` to its name.
        size: Minimum number of members a cluster needs to be reported.

    Returns:
        A list of name lists, one per cluster with at least ``size`` members.
    """
    labels = spectral_clustering(mat, n_clusters=k)
    members = {}
    for position, cluster_id in enumerate(labels):
        members.setdefault(cluster_id, set()).add(position)
    # Translate every cluster to names first, then drop the small ones.
    named = ([names[pos] for pos in group] for group in members.values())
    return [group for group in named if len(group) >= size]
Пример #38
0
def clust(vectorfile, matrixfile, clusted):
    """Spectral-cluster items from a similarity-triple file and write the result.

    ``vectorfile`` holds one tab-separated record per line: an integer id
    followed by its fields.  ``matrixfile`` holds tab-separated
    ``id1, id2, similarity`` triples; lines with fewer than three fields are
    skipped.  Every id from ``vectorfile`` additionally gets a
    self-similarity entry of 1.0.  The ``N x N`` graph (``N`` = number of
    distinct ids) is split into 550 clusters and two files are written to
    the current directory: ``normal-data.txt`` (fields, cluster number) and
    ``spectal_easy-data-550.txt`` (fields, id, cluster number).

    Args:
        vectorfile: Path of the id -> fields file.
        matrixfile: Path of the similarity-triple file.
        clusted: Unused.
    """
    fid2fname = {}
    with open(vectorfile) as vectors:
        for line in vectors:
            fields = line.strip().split('\t')
            # The first record seen for an id wins.
            fid2fname.setdefault(int(fields[0]), fields[1:])

    N = len(fid2fname)
    rowlist = []
    collist = []
    datalist = []
    with open(matrixfile) as triples:
        for line in triples:
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            f1, f2, sim = fields[:3]
            rowlist.append(int(f1))
            collist.append(int(f2))
            datalist.append(float(sim))

    # Unit self-similarity on the diagonal.
    for fid in fid2fname:
        rowlist.append(fid)
        collist.append(fid)
        datalist.append(1.0)

    row = np.array(rowlist)
    col = np.array(collist)
    data = np.array(datalist)
    # NOTE(review): the (N, N) shape assumes the ids are exactly 0..N-1.
    graph = coo_matrix((data, (row, col)), shape=(N, N))

    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    labels = spectral_clustering(graph, n_clusters=550, eigen_solver='arpack')

    cluster2fid = {}
    for index, lab in enumerate(labels):
        cluster2fid.setdefault(lab, []).append(index)

    # ``with`` guarantees both outputs are flushed and closed.
    with open("normal-data.txt", 'w') as normal_data, \
            open("spectal_easy-data-550.txt", 'w') as easy_data:
        # The cluster number written out is the enumeration position, not
        # the raw label.
        for index, lab in enumerate(cluster2fid):
            for fid in cluster2fid[lab]:
                # Every field is followed by a tab, as before.
                strx = "".join(str(field) + "\t" for field in fid2fname[fid])
                normal_data.write(strx + '\t' + str(index) + '\n')
                easy_data.write(
                    strx + '\t' + str(fid) + '\t' + str(index) + '\n')
Пример #39
0
def clust(vectorfile, matrixfile, clusted):
    """Cluster the items of ``vectorfile`` with the similarities in ``matrixfile``.

    Input format: ``vectorfile`` lines are ``id<TAB>field...``; ``matrixfile``
    lines are ``id1<TAB>id2<TAB>similarity`` (shorter lines are ignored).
    A diagonal entry of 1.0 is added for every id, the sparse graph is
    spectral-clustered into 550 groups and the assignment is written to
    ``normal-data.txt`` and ``spectal_easy-data-550.txt`` in the current
    directory.

    Args:
        vectorfile: Path of the item file.
        matrixfile: Path of the similarity file.
        clusted: Unused.
    """
    fid2fname = {}
    with open(vectorfile) as item_file:
        for raw in item_file:
            parts = raw.strip().split('\t')
            # Keep the first record when an id occurs more than once.
            fid2fname.setdefault(int(parts[0]), parts[1:])

    N = len(fid2fname)
    rowlist, collist, datalist = [], [], []
    with open(matrixfile) as sim_file:
        for raw in sim_file:
            parts = raw.strip().split('\t')
            if len(parts) < 3:
                continue
            rowlist.append(int(parts[0]))
            collist.append(int(parts[1]))
            datalist.append(float(parts[2]))

    # Self-similarity of 1.0 for every known item.
    for fid in fid2fname:
        rowlist.append(fid)
        collist.append(fid)
        datalist.append(1.0)

    # NOTE(review): shape (N, N) only fits when the ids are 0..N-1.
    graph = coo_matrix(
        (np.array(datalist), (np.array(rowlist), np.array(collist))),
        shape=(N, N))

    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    labels = spectral_clustering(graph, n_clusters=550, eigen_solver='arpack')

    cluster2fid = {}
    for index, lab in enumerate(labels):
        cluster2fid.setdefault(lab, []).append(index)

    # Context managers close (and flush) the output files deterministically.
    with open("normal-data.txt", 'w') as normal_data:
        with open("spectal_easy-data-550.txt", 'w') as easy_data:
            for index, lab in enumerate(cluster2fid):
                for fid in cluster2fid[lab]:
                    strx = "".join(str(name) + "\t"
                                   for name in fid2fname[fid])
                    normal_data.write(strx + '\t' + str(index) + '\n')
                    easy_data.write(
                        strx + '\t' + str(fid) + '\t' + str(index) + '\n')
Пример #40
0
    def LSTClustering(self):
        """Segment ``self.LST`` with spectral clustering and show the region contours.

        The image is Gaussian-smoothed, rescaled by a factor of 0.2,
        converted to a pixel graph, clustered into ``N_REGIONS`` regions with
        the ``discretize`` label assignment and displayed with one contour
        per region.  Nothing is returned.
        """
        # Based on the "Segmenting the picture of greek coins in regions" example, Author: Gael Varoquaux <*****@*****.**>, Brian Cheung
        # License: BSD 3 clause
        orig_coins = self.LST
        # these were introduced in skimage-0.14
        if LooseVersion(skimage.__version__) >= '0.14':
            rescale_params = {'anti_aliasing': False, 'multichannel': False}
        else:
            rescale_params = {}
        # Smooth before downscaling to reduce aliasing artefacts.
        smoothened_coins = gaussian_filter(orig_coins, sigma=2)
        rescaled_coins = rescale(smoothened_coins,
                                 0.2,
                                 mode="reflect",
                                 **rescale_params)
        # Convert the image into a graph with the value of the gradient on the
        # edges.
        graph = image.img_to_graph(rescaled_coins)
        # Take a decreasing function of the gradient: an exponential
        # The smaller beta is, the more independent the segmentation is of the
        # actual image. For beta=1, the segmentation is close to a voronoi
        beta = 10
        eps = 1e-6
        graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps
        # Apply spectral clustering (this step goes much faster if you have pyamg
        # installed)
        N_REGIONS = 200
        # Only 'discretize' is run; the commented line below shows the
        # variant that also runs 'kmeans'.
        for assign_labels in ('discretize', ):
            #        for assign_labels in ('kmeans', 'discretize'):
            t0 = time.time()
            labels = spectral_clustering(graph,
                                         n_clusters=N_REGIONS,
                                         assign_labels=assign_labels,
                                         random_state=42)
            t1 = time.time()
            # Back from one label per pixel (flat) to the image layout.
            labels = labels.reshape(rescaled_coins.shape)

            plt.figure(figsize=(5 * 3, 5 * 3))
            plt.imshow(rescaled_coins, cmap=plt.cm.gray)
            # One contour per region, coloured by the region index.
            for l in range(N_REGIONS):
                plt.contour(
                    labels == l,
                    colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))])
            plt.xticks(())
            plt.yticks(())
            title = 'Spectral clustering: %s, %.2fs' % (assign_labels,
                                                        (t1 - t0))
            print(title)
            plt.title(title)
        plt.show()
Пример #41
0
def consensus(clusterings, nclus, weights=None, method='hier', refclus=None):
    """
    Derive a consensus clustering by clustering the pairings matrix.

    Parameters
    ----------
    clusterings : ndarray or object with a ``clusterings`` attribute
        ndata x nreal array of cluster realizations; when an object exposing
        ``.clusterings`` is passed, that attribute is used instead
    nclus : int
        the number of clusters to generate
    weights : ndarray
        nreal-long array of weights for each clustering
    method : str
        ``'hier'`` selects hierarchical (ward) clustering of the pairings
        matrix; any other value selects spectral clustering
    refclus : ndarray
        optional reference clustering; when given, the final clusters and,
        in place, the columns of ``clusterings`` are recoded against it

    Returns
    -------
    final_clusters : ndarray
        final cluster labels given the passed parameters
    clusterprobs : ndarray
        result of ``cluster_probability_bycount`` for the final clusters
    pairings : ndarray
        the pairings matrix that was clustered

    """
    from sklearn.cluster import spectral_clustering

    # Unwrap container objects; plain arrays pass through unchanged.
    clusterings = getattr(clusterings, 'clusterings', clusterings)
    pairings = pairings_matrix(clusterings, weights)

    # Cluster the nd x nd pairings matrix with the selected method.
    if method == 'hier':
        final_clusters = hierarchical_clustering(pairings,
                                                 nclus,
                                                 method='ward')
    else:
        final_clusters = spectral_clustering(pairings, n_clusters=nclus)

    if refclus is not None:
        final_clusters, _ = reclass_clusters(refclus, final_clusters)
        # The passed ensemble is recoded as well, column by column, in place.
        final_ensemble, _ = reclass_clusters(refclus, clusterings)
        for column in range(final_ensemble.shape[1]):
            clusterings[:, column] = final_ensemble[:, column]

    clusterprobs = cluster_probability_bycount(final_clusters, clusterings)
    return final_clusters, clusterprobs, pairings
Пример #42
0
    def _spectral_clustering(self,samples):
        """Segment the image array ``samples`` into four regions and plot them.

        ``spectral_clustering`` is run once with ``assign_labels='kmeans'``
        and once with ``'discretize'``.  Each run writes its flat labels to
        ``spectral_result.csv`` under ``OUTPUT_DIR``, prints them and draws
        the region contours on a new figure.  Nothing is returned.

        Raises:
            SkipTest: when ``sp_version`` is older than (0, 12).
        """
        if sp_version < (0, 12):
            raise SkipTest("Skipping because SciPy version earlier than 0.12.0 and "
                   "thus does not include the scipy.misc.face() image.")

        # Convert the image into a graph with the value of the gradient on the
        # edges.
        graph = image.img_to_graph(samples)


        # Take a decreasing function of the gradient: an exponential
        # The smaller beta is, the more independent the segmentation is of the
        # actual image. For beta=1, the segmentation is close to a voronoi
        beta = 5
        eps = 1e-6
        graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps

        # Apply spectral clustering (this step goes much faster if you have pyamg
        # installed)
        N_REGIONS = 4

        #############################################################################
        # Visualize the resulting regions

        for assign_labels in ('kmeans', 'discretize'):
            t0 = time.time()
            labels = spectral_clustering(graph, n_clusters=N_REGIONS,
                                         assign_labels=assign_labels, random_state=1)
            # NOTE(review): the same file name is used on both iterations, so
            # the 'discretize' result overwrites the 'kmeans' one.
            sample=pd.DataFrame(labels)
            sample.to_csv(os.path.join(OUTPUT_DIR, "spectral_result.csv"),sep=",")
            # NOTE(review): t1 is taken after the CSV write, so the reported
            # time includes file I/O, not only the clustering.
            t1 = time.time()
            #classif=labels.fit(samples)
            #print classif
            print labels
            print sample
            
            # Back from one label per pixel (flat) to the image layout.
            labels = labels.reshape(samples.shape)

            plt.figure(figsize=(5, 5))
            plt.imshow(samples, cmap=plt.cm.gray)
            # NOTE(review): ``contours=1`` and ``plt.cm.spectral`` come from an
            # old matplotlib example; confirm both are still accepted by the
            # matplotlib version in use.
            for l in range(N_REGIONS):
                plt.contour(labels == l, contours=1,
                            colors=[plt.cm.spectral(l / float(N_REGIONS))])
            plt.xticks(())
            plt.yticks(())
            title = 'Spectral clustering: %s, %.2fs' % (assign_labels, (t1 - t0))
            print(title)
            plt.title(title)
        plt.show() 
Пример #43
0
def perform_clustering(alpha=0.0, num_clusters=100):
    """
    Cluster the tags/terms and return the cluster id of each tag.

    The visual and textual similarity matrices are loaded from the
    ``Corel5k`` pickle files, blended, and spectral-clustered.  The cluster
    ids are saved to ``Corel5k/concepts_ids.pkl`` and the per-cluster word
    lists to ``Corel5k/cluster_contents.pkl``; every cluster is also printed.

    :param alpha: weight of the visual similarity matrix; the textual matrix
        gets weight ``1 - alpha``
    :param num_clusters: number of clusters/concepts obtained
    :return: cluster ids for each tag
    """
    vis_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_affinity_matrix_scaled.pkl")
    tex_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_textual_similarity_matrix.pkl")

    tex_sim_mat = adjust_and_norm_affinity(tex_sim_mat)
    vis_sim_mat = expit(vis_sim_mat)

    # introduce a parameter alpha to merge the two matrices
    joint_mat = alpha * vis_sim_mat + (1 - alpha) * tex_sim_mat

    # obtain cluster IDs for each word
    # eigen_solver: None, arpack, lobpcg, or amg
    cluster_ids = spectral_clustering(joint_mat,
                                      n_clusters=num_clusters,
                                      eigen_solver='arpack')
    print("Done...")
    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    utilites.saveVariableToFile(cluster_ids, "Corel5k/concepts_ids.pkl")

    cluster_contents = []
    # For every cluster
    for cluster in range(0, num_clusters):
        # print the cluster number
        print("\nCluster %d" % cluster)
        # Collect all words mapped to that cluster.  Iterating ``items()``
        # works on Python 2 and 3, whereas indexing ``values()``/``keys()``
        # fails on Python 3 and rebuilt both lists on every step.
        r_words = [word for word, cluster_id in word_centroid_map.items()
                   if cluster_id == cluster]

        print(r_words)
        cluster_contents.append(r_words)

    utilites.saveVariableToFile(cluster_contents,
                                "Corel5k/cluster_contents.pkl")

    return cluster_ids
Пример #44
0
def unify_communities_CM(ghats, k):
    """Merge several community assignments through their co-membership matrix.

    For each assignment a one-hot membership matrix ``Q`` is built and
    ``Q Q^T`` is accumulated; the sum is then spectral-clustered into ``k``
    communities.

    Args:
        ghats: Indexable collection (``ghats[0]`` .. ``ghats[len - 1]``) of
            mappings from node to 1-based community label.
        k: Number of communities.

    Returns:
        Dict mapping every node ``i`` of ``ghats[0]`` to its 1-based
        consensus label, read from position ``i - 1`` of the label vector
        (presumably nodes are numbered from 1 - verify against the caller).
    """
    n_nodes = len(ghats[0])
    QQtotal = np.zeros((n_nodes, n_nodes))
    for idx in range(len(ghats)):
        assignment = ghats[idx]
        membership = np.zeros((len(assignment), k))
        for row, node in enumerate(assignment):
            # Labels are 1-based, columns 0-based.
            membership[row, assignment[node] - 1] = 1
        QQtotal += membership.dot(membership.T)

    spout = spectral_clustering(QQtotal, n_clusters=k) + 1
    return {node: spout[node - 1] for node in ghats[0]}
Пример #45
0
def clustering():
    """Spectral-cluster the combined user matrix for 50 to 59 clusters.

    The friends matrix and the cosine matrix loaded from
    ``../data/result/cosMatrix.mat`` are added together; for every cluster
    count the score returned by ``clusterSimilarity`` is printed, and the
    list of all scores is printed at the end.
    """
    cos_part = sio.loadmat(
        '../data/result/cosMatrix.mat', struct_as_record=False,
        squeeze_me=True)['cosMatrix']
    affinity = getFriendsMatrix() + cos_part
    sims = []
    for n_clusters in range(50, 60):
        assignment = spectral_clustering(
            affinity, n_clusters=n_clusters, eigen_solver='arpack')
        score = clusterSimilarity(affinity, assignment, n_clusters)
        sims.append(score)
        # Single-argument call form: prints the same text as the statement.
        print("{} cluster: average simi={}".format(n_clusters, score))
    print(sims)
def main():
	"""Cluster the rows of the predicted-ratings table and draw the result.

	Reads ``preRatings.csv`` and ``ratings.csv`` from the current working
	directory, keeps a subset of the predicted-rating columns, discretises
	the values to 0/1/2, spectral-clusters the matrix returned by
	``cs.getEMatrix`` into 20 clusters and passes the original ratings of
	the same rows/columns plus the labels to ``dp.drawPicture``.
	"""
	src_path = os.path.join(os.getcwd(), 'ratings.csv')
	res_path = os.path.join(os.getcwd(), 'preRatings.csv')
	predicted_data = pd.read_csv(res_path, header = 0, index_col = 0)
	# Column labels are read from the CSV as strings; convert them to int.
	int_col = []
	for col in predicted_data.columns:
		icol = int(col)
		int_col.append(icol)
	predicted_data.columns = int_col
	# Number of non-missing values per column.
	movie_rated_num = pd.Series(index = predicted_data.columns)
	for i in predicted_data.columns.values:
		movie_rated_num[i] = predicted_data[i].dropna().count()
	# NOTE(review): relies on the in-place ``Series.sort`` of old pandas
	# releases - confirm the pandas version this runs against.
	movie_rated_num.sort()
	# Keep the columns from position 8500 onwards of the sorted counts.
	cuted_data = predicted_data.loc[ : , movie_rated_num[8500: ].index]
	print cuted_data.shape

	data_matrix = cuted_data.fillna(0).values

	# Discretise in place: > 3.5 -> 2, < 2.5 -> 0, everything else -> 1.
	for i in range(0, len(data_matrix)):
		for j in range(0, len(data_matrix[i])):
			if data_matrix[i][j]>3.5:
				data_matrix[i][j] = 2
			elif data_matrix[i][j]<2.5:
				data_matrix[i][j] = 0
			else:
				data_matrix[i][j] = 1
	print data_matrix

	E_matrix = cs.getEMatrix(data_matrix)

	labels = spectral_clustering(E_matrix, n_clusters = 20)
	print labels
	'''
	init_data = pd.read_csv(src_path, header = 0, index_col = 0)
	# Cause the type of columns that read in csv is str, we need to convert it into int
	int_col = [int(col) for col in init_data.columns]
	init_data.columns = int_col
	init_data = init_data.loc[predicted_data.index, predicted_data.columns]
	init_data_matrix = init_data.fillna(0).values
	'''
	columns = ['userID', 'movieID', 'rating', 'timestamp']
	# NOTE(review): ``header = 1`` together with ``names`` - confirm which
	# rows of the file this is meant to skip.
	ratings = pd.read_csv(src_path, header = 1, names = columns)
	# One row per user, one column per movie.
	data = ratings.pivot(index = 'userID', columns = 'movieID', values = 'rating')
	# Restrict the original ratings to the rows/columns that were clustered.
	init_data = data.loc[cuted_data.index, cuted_data.columns]
	init_data_matrix = init_data.fillna(0).values
	dp.drawPicture(init_data_matrix, labels)

	'''
Пример #47
0
def cluster_and_compare(n_clusters, data, labels_true):
    """Run four clustering algorithms on ``data``, report scores and plot the labels.

    KMeans, spectral clustering (on the cosine-similarity matrix), DBSCAN
    and agglomerative clustering are evaluated against ``labels_true``; the
    data is then projected to two PCA components and one scatter plot is
    drawn for the true labels and for each algorithm.

    Args:
        n_clusters: Cluster count for KMeans, spectral and agglomerative.
        data: Feature matrix.
        labels_true: Ground-truth labels.
    """
    rule = 75 * '-'
    print(rule)
    print('cluster\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI')

    kmeans_labels = evaluate_clustering(KMeans(n_clusters=n_clusters),
                                        "kmeans", data, labels_true)

    # Spectral clustering has no estimator object here, so it is timed and
    # scored by hand.
    start_time = time()
    spectral_labels = spectral_clustering(cosine_similarity(data),
                                          n_clusters=n_clusters)
    execution_time = time() - start_time
    evaluate_with_predited_labels(labels_true, spectral_labels, execution_time,
                                  "spectral")

    dbscan_labels = evaluate_clustering(
        DBSCAN(eps=0.0595, min_samples=10, metric='cosine'),
        "DBSCAN", data, labels_true)

    agg_labels = evaluate_clustering(
        AgglomerativeClustering(n_clusters=n_clusters),
        "Agglomerative", data, labels_true)
    print(rule)

    # Two principal components for plotting.
    projected = PCA(n_components=2).fit_transform(data)

    panels = (
        ('True labels', labels_true),
        ('Kmeans labels', kmeans_labels),
        ('Spectral labels', spectral_labels),
        ('DBSCAN labels', dbscan_labels),
        ('Agglomerative labels', agg_labels),
    )
    for heading, colouring in panels:
        plt.figure()
        plt.title(heading)
        plt.scatter(projected[:, 0], projected[:, 1], c=colouring)

    plt.show()
Пример #48
0
def clustering():
    """Try cluster counts 50..59 on the combined matrix and print each score.

    The combined matrix is the friends matrix plus the ``cosMatrix`` entry
    of ``../data/result/cosMatrix.mat``.  The score of every run comes from
    ``clusterSimilarity``; all scores are printed as a list at the end.
    """
    loaded = sio.loadmat('../data/result/cosMatrix.mat',
                         struct_as_record=False,
                         squeeze_me=True)
    cosine_matrix = loaded['cosMatrix']
    friends_matrix = getFriendsMatrix()
    combined = friends_matrix + cosine_matrix

    sims = []
    for count in range(50, 60):
        labelling = spectral_clustering(combined,
                                        n_clusters=count,
                                        eigen_solver='arpack')
        quality = clusterSimilarity(combined, labelling, count)
        sims.append(quality)
        # Single-argument call form: prints the same text as the statement.
        print("{} cluster: average simi={}".format(count, quality))
    print(sims)
Пример #49
0
def spectral_clustering(G,
                        n_clusters=8,
                        node_map=[],
                        no_conversion=False,
                        simple_conversion=False):
    """ Cluster the given weighted network using spectral clustering.

    Assumes the given network is connected.

    Args:
        G (ig.Graph)             - the input network; its ``weight`` edge
                                   attribute supplies the matrix entries
        n_clusters (int)         - number of clusters to look for
        node_map (list)          - optional lookup from matrix index to node;
                                   when non-empty, clusters are reported in
                                   terms of its entries (never mutated here)
        no_conversion (bool)     - use the weights directly as similarities
        simple_conversion (bool) - use ``1 / weight`` as the similarity
                                   (checked only if ``no_conversion`` is
                                   false); by default an exponential kernel
                                   is applied instead

    Returns:
        clusters (list) - a list of lists of nodes, each sublist represents
                          a cluster
    """
    # generate a numpy distance matrix from the given graph
    mat = G.get_adjacency(attribute='weight')
    dist_matrix = np.array(mat.data)

    if no_conversion:
        sim_matrix = dist_matrix
    elif simple_conversion:
        # take simple inverse to get similarity from distance; a zero entry
        # (no edge) stays zero.  The builtin ``float`` replaces the
        # ``np.float`` alias that newer NumPy releases no longer provide.
        sim_fn = np.vectorize(lambda x: 0 if x == 0 else 1 / float(x),
                              otypes=[float])
        sim_matrix = sim_fn(dist_matrix)
    else:
        # apply an exponential kernel to generate the similarity matrix from
        # the distance matrix (i.e. lower distance => higher similarity)
        std_dev = dist_matrix.std()
        sim_fn = np.vectorize(lambda x: 0
                              if x == 0 else np.exp(-(x) / (2 * (std_dev)**2)),
                              otypes=[float])
        sim_matrix = sim_fn(dist_matrix)

    # now do the clustering, scikit-learn implements this.  ``n_clusters``
    # is passed by keyword because recent scikit-learn versions reject it
    # as a positional argument.
    node_assignments = list(
        sc.spectral_clustering(sim_matrix, n_clusters=n_clusters))
    # return a list of lists representing the clusters
    clusters = []
    for n in range(n_clusters):
        clusters.append([i for i, m in enumerate(node_assignments) if m == n])
    if node_map:
        return [[node_map[n] for n in cl] for cl in clusters]
    else:
        return clusters
Пример #50
0
 def get_Finit(self, seed):
     """
     Build the initial factor matrices ``[A, B, C]``.

     ``A`` is the one-hot community membership obtained by spectral
     clustering of the aggregated network, ``B`` is an independent copy of
     ``A`` and ``C`` is drawn with ``np.random.rand``.

     :param seed: random state passed to ``spectral_clustering``
     :return: list ``[A_init, B_init, C_init]``
     """
     agg_network = self.aggregated_network_matrix()
     A_init = np.zeros((len(self.node_ids), self.num_of_coms))
     clusters = spectral_clustering(agg_network,
                                    n_clusters=self.num_of_coms,
                                    n_init=10,
                                    eigen_solver='arpack',
                                    random_state=seed)
     # One-hot encode: row i gets a 1 in the column of its cluster.
     A_init[np.arange(len(clusters)), clusters] = 1
     B_init = deepcopy(A_init)
     C_init = np.random.rand(self.tensor.shape[2], self.num_of_coms)
     return [A_init, B_init, C_init]
Пример #51
0
    def clustering_useFeature(self, f_list):
        """Compare three clusterings of each network's feature matrix by NMI.

        For every key of ``self.edgeDict`` the matching entry of ``f_list``
        is clustered into 12 groups in three ways - spectral clustering on
        the rescaled cosine similarity, KMeans on the raw features, and
        KMeans on the PCA-reduced features - and the NMI of each against the
        hard ground-truth labels is printed.  Nothing is returned.

        Args:
            f_list: Feature matrices, consumed in the iteration order of
                ``self.edgeDict``.
        """
        i = 0
        hardLabelDict, softLabelDict = self.getLabel()
        for key in self.edgeDict:
            groundTrues = hardLabelDict[key]
            clusterNum = 12
            A = self.edgeDict[key]
            # NOTE(review): ``nodeIndexDict`` is not defined in this method;
            # presumably a module-level global - confirm.
            nt = NetworkTool()
            nt.initNetwork(A, nodeIndexDict[key])

            F = f_list[i]

            # X=self.initX(A)
            # labels = spectral_clustering(A, n_clusters=clusterNum, eigen_solver='arpack')
            # counter=self.counter(labels,clusterNum)
            S = cosine_similarity(F)
            # Shift cosine similarities from [-1, 1] to [0, 1].
            S = (S + 1.0) / 2.0
            labels = spectral_clustering(S,
                                         n_clusters=clusterNum,
                                         eigen_solver='arpack')
            counter = self.counter(labels, clusterNum)
            nmi_sc = self.NMI(labels.tolist(), groundTrues, clusterNum)
            print nmi_sc

            # self.draw(nodeIndexDict[key],nt,labels,str(key)+'_spectral_'+str(clusterNum)+'.png')
            # self.output(nodeIndexDict[key],labels,str(key)+'_spectral')

            # KMeans directly on the features.
            kmeans = KMeans(n_clusters=clusterNum, random_state=0).fit(F)
            labels_km = kmeans.labels_.tolist()
            counter = self.counter(labels_km, clusterNum)
            nmi_km = self.NMI(labels_km, groundTrues, clusterNum)
            # nmi_km_sk=normalized_mutual_info_score(groundTrues,labels_km)
            print nmi_km

            # KMeans on the features reduced to ``clusterNum`` PCA components.
            pca = PCA(n_components=clusterNum, svd_solver='full')
            F_pca = pca.fit_transform(F)
            kmeans = KMeans(n_clusters=clusterNum, random_state=0).fit(F_pca)
            labels_km = kmeans.labels_.tolist()
            counter = self.counter(labels_km, clusterNum)
            nmi_km = self.NMI(labels_km, groundTrues, clusterNum)
            # nmi_km_sk=normalized_mutual_info_score(groundTrues,labels_km)
            print nmi_km

            # self.draw(nodeIndexDict[key],nt,labels_km,str(key)+'_kmean_'+str(clusterNum)+'.png')
            # self.output(nodeIndexDict[key],labels_km,str(key)+'_kmean')
            i += 1
Пример #52
0
def test_spectral_lobpcg_mode():
    """Smoke-test ``spectral_clustering`` with the ``lobpcg`` eigen solver."""
    # A fairly big data matrix is needed, as lobpcg does not work with
    # small ones.
    blob_centers = np.array([[0., 0.], [10., 10.]])
    X, true_labels = make_blobs(n_samples=100,
                                centers=blob_centers,
                                cluster_std=1.,
                                random_state=42)
    distances = pairwise_distances(X)
    # Turn distances into similarities: the farthest pair gets 0.
    similarity = np.max(distances) - distances
    predicted = spectral_clustering(similarity,
                                    n_clusters=len(blob_centers),
                                    random_state=0,
                                    eigen_solver="lobpcg")
    # Quality is secondary here; the point is that the solver *worked*,
    # subject to a loose lower bound on agreement.
    agreement = np.mean(predicted == true_labels)
    assert_greater(agreement, .3)
Пример #53
0
def Discretize_Clustering(twoDimg, N_REGIONS):
    """Segment an image into ``N_REGIONS`` regions with spectral clustering.

    Args:
        twoDimg: Image array; its graph is built with ``imp.img_to_graph``.
        N_REGIONS: Number of regions to extract.

    Returns:
        Tuple ``(labels, N_REGIONS)`` where ``labels`` has the shape of
        ``twoDimg``.  The elapsed clustering time is printed.
    """
    beta, eps = 1, 1e-1
    pixel_graph = imp.img_to_graph(twoDimg)
    # Exponential of the negated edge values, scaled by the image's standard
    # deviation; ``eps`` keeps every edge weight positive.
    pixel_graph.data = np.exp(-beta * pixel_graph.data / twoDimg.std()) + eps

    started = time.time()
    flat_labels = spectral_clustering(pixel_graph,
                                      n_clusters=N_REGIONS,
                                      assign_labels='discretize',
                                      random_state=1)
    finished = time.time()
    region_map = flat_labels.reshape(twoDimg.shape)
    print('time taken', finished - started)

    return region_map, N_REGIONS
Пример #54
0
def dist_matrix(axis, clusters):
    """Cluster a pre-computed distance matrix loaded from ``loc_dump.dmp``.

    The distance matrix is turned into an affinity matrix, shown together
    with a histogram of its eigenvalues, spectral-clustered into
    ``clusters`` groups, and the results are dumped to ``loc_dump2.dmp``.
    Nothing is returned.

    Args:
        axis: Only referenced by the commented-out code that originally
            produced ``loc_dump.dmp``.
        clusters: Number of clusters for ``spectral_clustering``.
    """
    # The commented-out block below computed the pairwise distance matrix
    # and dumped it to 'loc_dump.dmp'; the live code only reloads that file.
    # if axis:
    #     re_shards = shards[:, non_nilled]
    # else:
    #     re_shards = shards
    # print re_shards.shape
    # c_list = np.split(re_shards, re_shards.shape[axis], axis)
    # accumulator_matrix = np.zeros((re_shards.shape[axis], re_shards.shape[axis]))
    # est_len = re_shards.shape[axis]*(re_shards.shape[axis]-1)/2
    # for i, (i_a, i_b) in enumerate(combinations(range(0, re_shards.shape[axis]), 2)):
    #     if not i%100:
    #         pl = "{0:0.2f}".format(i/float(est_len)*100.0)
    #         print pl, '%'
    #     a = c_list[i_a]
    #     b = c_list[i_b]
    #     dist = distance(a, b)
    #     accumulator_matrix[i_a, i_b] = dist
    #     accumulator_matrix[i_b, i_a] = dist
    #
    # dump(accumulator_matrix,open('loc_dump.dmp','w'))

    ##########################################################################################################

    # NOTE(review): ``load``/``dump`` are presumably pickle's; only use on
    # trusted files - confirm.
    pre_accumulator_matrix = load(open('loc_dump.dmp','r'))

    # Squared distances, scaled by the standard deviation, through a
    # decreasing exponential.
    accumulator_matrix = np.exp( - pre_accumulator_matrix*pre_accumulator_matrix / pre_accumulator_matrix.std() )
    plt.imshow(accumulator_matrix, interpolation='nearest')
    plt.show()




    vals, vects =  eigh(accumulator_matrix)
    plt.hist(vals, 1000, log=True)
    # Small eigenvalues are zeroed, but only for the print below: the
    # reconstruction that would use them is commented out.
    vals[vals**2 < 0.3] = 0
    print vals
    # accumulator_matrix = np.dot(vects, np.dot(np.diag(vals), vects.T))
    plt.show()

    labels = spectral_clustering(accumulator_matrix, n_clusters=clusters, eigen_solver='arpack')
    print labels
    # NOTE(review): ``non_nilled`` is not defined in this function;
    # presumably a module-level global - confirm.
    stable_mappings = crible(10, labels, non_nilled)
    print 'stable mappings redundancy:', len(stable_mappings), len(set(stable_mappings))
    srt_idx = hierchical_clustering(accumulator_matrix, labels)

    dump((stable_mappings, accumulator_matrix, srt_idx, non_nilled), open('loc_dump2.dmp','w'))
Пример #55
0
def SpectralClusterImage(input_image, beta=5, eps=1e-6, n_regions=11, assign_labels='discretize', downsample_factor=np.nan, order=3):
    """ Spectral Cluster an image
        Inputs:
            input_image: ndarray of image
            beta: Take a decreasing function of the gradient: an exponential
                The smaller beta is, the more independent the segmentation is of
                the actual image. For beta=1, the segmentation is close to a
                voronoi. Default is 5.
            eps: error term. Default is 1E-6
            n_regions: number of regions to decompose into. Default is 11.
            assign_labels: ways of decomposition. Selecting from 'discretize' and
                'kmeans'. Default is 'discretize'.
            downsample_factor: zoom factor applied before spectral decomposition.
                Default (NaN) keeps the original sampling. Enter a single number
                to apply the factor to both dimensions of the image, or enter a
                sequence to apply a different factor to each dimension
            order: downsampling method, order of B-spline interpolation
        The segmentation is drawn on a new figure; nothing is returned.
    """
    # Downsample the image.  ``np.all`` makes the NaN test work for a
    # per-axis sequence as well as for a single number.
    if not np.all(np.isnan(downsample_factor)):
        # ``zoom`` returns a new array, so its result has to be kept.
        input_image = zoom(input_image, zoom=downsample_factor, order=order)
    # Convert the image into a graph with the value of the gradient on the edges
    graph = image.img_to_graph(input_image)
    # Take a decreasing function of the gradient: an exponential
    # The smaller beta is, the more independent the segmentation is of the
    # actual image. For beta=1, the segmentation is close to a voronoi
    graph.data = np.exp(-beta * graph.data / input_image.std()) + eps
    # Apply spectral clustering (this step goes much faster if you have pyamg
    # installed), honouring the caller's ``assign_labels`` choice
    labels = spectral_clustering(graph, n_clusters=n_regions,
                                 assign_labels=assign_labels)
    labels = labels.reshape(input_image.shape)
    # Visualizing the resulting regions
    pl.figure(figsize=(5, 5))
    pl.imshow(input_image, cmap=pl.cm.gray)
    # ``spectral`` was renamed ``nipy_spectral`` in newer matplotlib; use
    # whichever this installation provides.
    colormap = getattr(pl.cm, 'nipy_spectral', None) or pl.cm.spectral
    for lb in range(n_regions):
        # ``colors`` is the keyword ``contour`` understands for line colours.
        pl.contour(labels == lb,
                   colors=[colormap(lb / float(n_regions))])
    # Get rid of x, y tick marks
    pl.xticks(())
    pl.yticks(())



                            
def perform_clustering(alpha=0.0, num_clusters=100):
    """
    Cluster the tags/terms and return the cluster id of each tag.

    Loads the visual and textual similarity matrices from the ``Corel5k``
    pickle files, blends them, runs spectral clustering, saves the ids to
    ``Corel5k/concepts_ids.pkl`` and the word list of every cluster to
    ``Corel5k/cluster_contents.pkl``, printing each cluster on the way.

    :param alpha: weight of the visual similarity matrix; the textual matrix
        gets weight ``1 - alpha``
    :param num_clusters: number of clusters/concepts obtained
    :return: cluster ids for each tag
    """
    vis_sim_mat = utilites.loadVariableFromFile("Corel5k/tag_affinity_matrix_scaled.pkl")
    tex_sim_mat = utilites.loadVariableFromFile("Corel5k/tag_textual_similarity_matrix.pkl")

    tex_sim_mat = adjust_and_norm_affinity(tex_sim_mat)
    vis_sim_mat = expit(vis_sim_mat)

    # introduce a parameter alpha to merge the two matrices
    joint_mat = alpha * vis_sim_mat + (1 - alpha) * tex_sim_mat

    # obtain cluster IDs for each word
    # eigen_solver: None, arpack, lobpcg, or amg
    cluster_ids = spectral_clustering(joint_mat, n_clusters=num_clusters, eigen_solver='arpack')
    print("Done...")
    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    utilites.saveVariableToFile(cluster_ids, "Corel5k/concepts_ids.pkl")

    cluster_contents = []
    # For every cluster
    for cluster in range(0, num_clusters):
        # print the cluster number
        print("\nCluster %d" % cluster)
        # Gather the words of this cluster in one pass over the mapping.
        # ``values()[i]`` / ``keys()[i]`` are not indexable on Python 3 and
        # copied both lists on every step on Python 2.
        r_words = [term for term, assigned in word_centroid_map.items()
                   if assigned == cluster]

        print(r_words)
        cluster_contents.append(r_words)

    utilites.saveVariableToFile(cluster_contents, "Corel5k/cluster_contents.pkl")

    return cluster_ids
Пример #57
0
def spectral_cluster(G, node_list):
    """
    Choose a spectral clustering of ``G`` by scanning cluster counts 2..99.

    For each cluster count ``i`` the nodes are clustered with
    ``spectral_clustering`` and scored by the sum over clusters of
    ``cut_k / vol_k``, where ``cut_k`` adds up the "weight" of the edges from
    the cluster's nodes to every node outside it and ``vol_k`` adds up the
    adjacency-matrix column sums of the cluster's nodes.

    The scan stops as soon as the score exceeds the previous score (the
    comparison is skipped while the previous score is still 0) or when ``i``
    reaches 99. The clustering of the *previous* cluster count is returned.

    :param G: graph whose edge "weight" attributes are the similarities
    :param node_list: nodes of ``G`` in the order used for the matrix rows
    :return: dict with "result_by_node" (node -> label) and
        "result_by_cluster" (label -> set of nodes)
    """
    # G is a similarity matrix
    S = nx.to_scipy_sparse_matrix(G, nodelist=node_list)

    # Column position of each node; first occurrence wins, like list.index
    node_position = {}
    for position, node in enumerate(node_list):
        node_position.setdefault(node, position)
    all_nodes = set(node_list)

    previous_sum_cut = 0
    previous_cluster_node = {}
    previous_cluster_label = {}
    for i in range(2, 100):
        labels = spectral_clustering(S, n_clusters=i).tolist()
        result_cluster_node = dict(zip(node_list, labels))

        # Invert node -> label into label -> set of nodes
        result_cluster_label = {}
        for node, label in result_cluster_node.items():
            result_cluster_label.setdefault(label, set()).add(node)

        sum_cut = 0
        for members in result_cluster_label.values():
            # The complement depends only on the cluster, so build it once
            # per cluster rather than once per member node.
            outside = all_nodes.difference(members)
            cut_k = 0
            vol_k = 0
            for nk in members:
                vol_k += S.getcol(node_position[nk]).sum()
                for notk in outside:
                    cut_k += G.get_edge_data(nk, notk, default={"weight": 0})["weight"]
            sum_cut += (cut_k / vol_k)

        # Chained comparison: sum_cut > previous_sum_cut and previous_sum_cut != 0
        if sum_cut > previous_sum_cut != 0 or i == 99:
            print(i, sum_cut, result_cluster_label)
            return {"result_by_node": previous_cluster_node, "result_by_cluster": previous_cluster_label}

        previous_cluster_node = result_cluster_node
        previous_cluster_label = result_cluster_label
        previous_sum_cut = sum_cut
Пример #58
0
def defficient_spectral_clustring(name_dict, z_depth, shape_dict):
    # Build a 3D volume from a two-channel z-stack, spectrally cluster the
    # non-zero voxels of channel 2 into 4 groups and display each group.
    #
    # name_dict:  mapping whose values are z-stacks; only the first value
    #             yielded by itervalues() is used.
    # z_depth:    size of the third axis of the volumes that are allocated.
    # shape_dict: shape_dict[1] and shape_dict[2] give the first two
    #             dimensions of the channel-1 and channel-2 volumes.
    #
    # NOTE(review): Python 2 only (print statements, itervalues/iteritems).
    #
    # requires to re-implement the distance definition on sparse images.

    z_stack = next(name_dict.itervalues())

    _3D_chan1 = np.zeros((shape_dict[1][0], shape_dict[1][1], z_depth))
    _3D_chan2 = np.zeros((shape_dict[2][0], shape_dict[2][1], z_depth))

    # Each z_stack entry maps a depth to an object indexed by channel (1, 2).
    # Slices are stored at depth-1 — presumably depth keys are 1-based; TODO confirm.
    for depth, bi_image in z_stack.iteritems():
        img1 = bi_image[1]
        img2 = bi_image[2]

        _3D_chan1[:, :, depth-1] = img1
        _3D_chan2[:, :, depth-1] = img2

    # _3D_chan1 is filled above but only referenced by the disabled lines below.
    # mlab.pipeline.volume(mlab.pipeline.scalar_field(_3D_chan1), vmin=0.1)
    # mlab.show()

    # Zero out channel-2 voxels below 0.04 so they drop out of the mask below.
    _3D_chan2[_3D_chan2<0.04] = 0

    # Show the thresholded channel-2 volume.
    mlab.pipeline.volume(mlab.pipeline.scalar_field(_3D_chan2))
    mlab.show()

    # Only non-zero voxels take part in the clustering.
    mask = _3D_chan2.astype(bool)
    img = _3D_chan2.astype(float)

    graph = image.img_to_graph(img, mask=mask)
    # Rescale the edge values by their standard deviation and map them through
    # exp(-x), so small values become weights close to 1.
    graph.data = np.exp(-graph.data / graph.data.std())

    # Debug output: graph dimensions and number of non-zero entries.
    print graph.shape
    print len(graph.nonzero()[0])

    clusters = 4
    labels = spectral_clustering(graph, n_clusters = clusters, eigen_solver='arpack')
    # Voxels outside the mask keep the label -1.
    label_im = -np.ones(mask.shape)
    label_im[mask] = labels

    # Display each cluster on its own: copy the volume and blank every voxel
    # whose label differs from the current cluster.
    for i in range(0, clusters):
        re_img = copy(_3D_chan2)
        re_img[label_im!=i] = 0
        mlab.pipeline.volume(mlab.pipeline.scalar_field(re_img))
        mlab.show()