def KMeansClustering_Iterative(self, step=False):
    '''
    Cluster the eigen-vector data with k-means, re-splitting oversized
    clusters, then build and pickle an igraph VertexClustering.

    The table returned by self.build_orange_data_from_eig_vector() is
    clustered with Orange's k-means (5 centroids, EigDistance). Every
    resulting cluster whose size exceeds
    CommunityExtraction.CRITERION_CLUSTER_NODES_UPPER_BOUND is clustered
    again on its own rows, repeatedly, until each cluster is small enough
    or cannot be split any further. All other clusters are accepted
    unchanged.

    Notes kept from the original author:
      1. k-means is probably not the best way to decide which community
         a node (user) belongs to, but it is the most general one.
      2. TODO: choosing the number of clusters explicitly may work better
         than the fixed value used here; check the results first.

    Side effects:
      * self.km                -- the top-level k-means clustering object.
      * self.vertex_clustering -- igraph VertexClustering over self.g.
      * the vertex clustering is pickled through FSDao.write_pickle into
        self.expr_dir under the 'vertex_clustering_file_name' property.

    step -- unused by this method; kept so existing callers keep working.

    Returns None.
    '''
    # Function-scope imports, as in the original. deque replaces the
    # thread-safe Queue.Queue: the work list is only used from this thread.
    from collections import deque

    import igraph

    import CommunityExtraction as ce

    n_centroids = 5  # see TODO (2) in the docstring

    def group_by_label(labels, index_map=None):
        # Map each cluster label to the list of member indices.
        # With index_map, position i is translated to index_map[i], which
        # turns positions inside a sub-table back into indices of eig_data.
        groups = {}
        for pos, label in enumerate(labels):
            member = pos if index_map is None else index_map[pos]
            groups.setdefault(label, []).append(member)
        return groups

    eig_data = self.build_orange_data_from_eig_vector()

    # Top-level clustering. The result is turned into a VertexClustering
    # below (rather than kept as an Orange object) so modularity can be
    # computed on it.
    self.km = Orange.clustering.kmeans.Clustering(
        data=eig_data, centroids=n_centroids, distance=EigDistance)

    pending = deque(group_by_label(self.km.clusters).values())
    res_list = []

    while pending:
        members = pending.popleft()
        size = len(members)
        print('qsize:%s  cluster size: %s  res list size: %s' % (
            len(pending), size, len(res_list)))

        # Same decision as the original if/elif/else chain: a cluster below
        # the lower bound is always accepted (even if the bounds were
        # misconfigured so that it also exceeds the upper bound).
        too_big = (size >= ce.CRITERION_CLUSTER_NODES_LOWER_BOUND and
                   size > ce.CRITERION_CLUSTER_NODES_UPPER_BOUND)
        if not too_big:
            res_list.append(members)
            continue

        sub_data = eig_data.get_items(members)
        sub_km = Orange.clustering.kmeans.Clustering(
            data=sub_data, centroids=n_centroids, distance=EigDistance)
        sub_groups = group_by_label(sub_km.clusters, members)

        if len(sub_groups) < 2:
            # k-means put every member under one label, so re-queueing
            # would hand it the very same input again and the loop would
            # never finish. Accept the oversized cluster as it is.
            # NOTE(review): presumably this only happens for members that
            # are indistinguishable under EigDistance -- confirm.
            print('cluster of size %s could not be split further' % size)
            res_list.append(members)
            continue

        pending.extend(sub_groups.values())

    # Flatten the accepted clusters into one membership list indexed like
    # eig_data. NOTE(review): assumes row i of eig_data corresponds to
    # vertex i of self.g -- confirm against build_orange_data_from_eig_vector.
    clusters = [0] * len(eig_data)
    for cluster_id, members in enumerate(res_list):
        for member in members:
            clusters[member] = cluster_id

    self.vertex_clustering = igraph.clustering.VertexClustering(
        self.g, clusters)

    print('writing vertex_clustering')
    FSDao.write_pickle(self.expr_dir,
                       SQLDao.ce.properties['vertex_clustering_file_name'],
                       self.vertex_clustering)
    print('finished writing vertex_clustering')