예제 #1
0
    def KMeansClustering_Iterative(self, step=False):
        """
        Cluster the vertices with k-means, re-splitting oversized clusters.

        The eigenvector data returned by
        ``self.build_orange_data_from_eig_vector()`` is clustered with
        Orange's k-means (5 centroids, ``EigDistance``).  Every resulting
        cluster whose size exceeds
        ``CommunityExtraction.CRITERION_CLUSTER_NODES_UPPER_BOUND`` is
        clustered again (5 centroids) until all clusters are within the
        bound or cannot be split any further.  The final membership is
        stored in ``self.vertex_clustering`` (an igraph ``VertexClustering``
        over ``self.g``) and pickled through ``FSDao.write_pickle``.

        Side effects: sets ``self.km`` and ``self.vertex_clustering``,
        prints progress messages, writes the pickle file.  Returns None.

        Notes kept from the original author:
        1. k-means is probably not the best way to decide which community
           a node (user) belongs to, but it is the most general one.
        2. TODO: choosing the number of clusters explicitly (instead of the
           fixed value used here) may be better; check the results first.

        :param step: not used by this method.
        """
        import collections

        def group_by_label(labels, members=None):
            # Map every cluster label to the list of members carrying it.
            # Without `members`, the position in `labels` is the member;
            # otherwise position i stands for members[i].
            groups = {}
            for pos, label in enumerate(labels):
                member = pos if members is None else members[pos]
                groups.setdefault(label, []).append(member)
            return groups

        eig_data = self.build_orange_data_from_eig_vector()
        # Initial clustering over all rows.
        self.km = Orange.clustering.kmeans.Clustering(data=eig_data,
                                                      centroids=5,
                                                      distance=EigDistance)

        # Work list of clusters (lists of row indices into eig_data) that
        # still have to be checked against the size criteria.
        pending = collections.deque(group_by_label(self.km.clusters).values())
        res_list = []

        import CommunityExtraction as ce
        while pending:
            v = pending.popleft()
            # Single-argument print(...) emits the same text on Python 2.
            print('qsize:%s cluster size: %s res list size: %s' % (
                len(pending), len(v), len(res_list)))

            too_small = len(v) < ce.CRITERION_CLUSTER_NODES_LOWER_BOUND
            too_large = len(v) > ce.CRITERION_CLUSTER_NODES_UPPER_BOUND
            if too_small or not too_large:
                # Small or acceptably sized clusters are final as they are.
                res_list.append(v)
                continue

            # Oversized cluster: run k-means again on just its rows and map
            # the sub-cluster positions back to the original row indices.
            sub_data = eig_data.get_items(v)
            sub_km = Orange.clustering.kmeans.Clustering(
                data=sub_data, centroids=5, distance=EigDistance)
            sub_groups = group_by_label(sub_km.clusters, v)

            if len(sub_groups) < 2:
                # No progress: re-queueing the very same cluster could loop
                # forever, so accept it even though it exceeds the bound.
                print('cluster of size %s could not be split, keeping it'
                      % len(v))
                res_list.append(v)
                continue

            pending.extend(sub_groups.values())

        # Flatten the final clusters into a per-row membership list, which
        # is the form igraph needs (e.g. to compute modularity later on).
        clusters = [0] * len(eig_data)
        for idx, res in enumerate(res_list):
            for r in res:
                clusters[r] = idx

        import igraph
        self.vertex_clustering = igraph.clustering.VertexClustering(
            self.g, clusters)
        print('writing vertex_clustering')
        FSDao.write_pickle(self.expr_dir,
                           SQLDao.ce.properties['vertex_clustering_file_name'],
                           self.vertex_clustering)
        print('finished writing vertex_clustering')