def Guess_User_Group_by_KMeans(db_dir, db_file_name):
    """Guess which group(s) every user most likely belongs to and save them.

    Steps:
    1. Read the distinct user ids from the SQLite database.
    2. For each user id, load the user's group-info rows and convert them
       into an Orange data table (group count is the only feature; user id
       and group id are attached as metas).
       2.1 More than 3 rows: run k-means on the group counts and keep every
           instance of the cluster holding the largest group count. This is
           only done when exactly 3 clusters come back.
       2.2 Otherwise: keep the single row with the largest group count, but
           only if that count is greater than 20.
    3. Save the ``{user_id: [instances]}`` mapping through
       ``save_user_group_clustered``.

    :param db_dir: directory handed to ``SQLDao.SQLiteDao``.
    :param db_file_name: database file name handed to ``SQLDao.SQLiteDao``.
    """
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    # The first element of each DAO result is not needed here.
    _, uids = sqlite.get_distinct_user_id()

    count_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT
    userid_label = SQLDao.LABEL_USER_GROUP_INFO_USERID
    groupid_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPID

    user_group_dict = {}
    for uid in uids:
        user_id = uid[userid_label]
        # Retrieve the group info rows of this specific user.
        _, uid_group_info = sqlite.get_group_info_by_user_id(user_id)

        # Convert the rows into an Orange data table.
        domain = Orange.data.Domain(
            [Orange.feature.Continuous(count_label)], False)
        domain.add_meta(Orange.feature.Descriptor.new_meta_id(),
                        Orange.feature.Continuous(userid_label))
        domain.add_meta(Orange.feature.Descriptor.new_meta_id(),
                        Orange.feature.Continuous(groupid_label))
        datas = []
        for row in uid_group_info:
            data = Orange.data.Instance(domain, [row[count_label]])
            data[userid_label] = row[userid_label]
            data[groupid_label] = row[groupid_label]
            datas.append(data)
        table = Orange.data.Table(domain, datas)

        target_instances = []
        if len(table) > 3:
            km = Orange.clustering.kmeans.Clustering(
                data=table, distance=GroupCountDistance)
            # Group the instances by the cluster label k-means gave them.
            d = {}
            for idx, c_label in enumerate(km.clusters):
                d.setdefault(c_label, []).append(table[idx])
            if len(d) == 3:
                # Find the cluster that contains the largest group count.
                # Ties keep the first cluster seen, and starting from None
                # (not a numeric sentinel) guarantees a label is picked.
                max_label = None
                max_value = None
                for label, instances in d.items():
                    cluster_max = max(
                        inst[count_label].value for inst in instances)
                    if max_label is None or cluster_max > max_value:
                        max_value = cluster_max
                        max_label = label
                target_instances.extend(d[max_label])
        else:
            # NOTE(review): the original layout was lost; this branch is
            # taken to be the fallback for "too few rows to cluster" --
            # confirm against the intended nesting.
            # Just pick the group with the largest group_count, if it is
            # large enough. The emptiness check avoids an IndexError for a
            # user that has no group rows at all.
            table.sort([count_label])
            if len(table) > 0 and table[-1][count_label].value > 20:
                target_instances.append(table[-1])

        user_group_dict[user_id] = target_instances

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("finish cluster")
    sqlite.save_user_group_clustered(user_group_dict)
def KMeansClustering_Iterative(self, step=False, centroids=5):
    '''
    Cluster the graph vertices with k-means on the eigenvector data and keep
    splitting oversized clusters until every cluster is small enough.

    Notes kept from the original author:
    1. k-means is probably not a good way to decide which community a
       node (user) belongs to, but it is the most general one.
    2. TODO: choosing the number of clusters by hand (rather than
       automatically) may be better, but check the results first.

    The result is stored in ``self.vertex_clustering`` (an igraph
    ``VertexClustering`` over ``self.g``) and pickled into
    ``self.expr_dir``; the initial k-means object is kept in ``self.km``.

    :param step: unused; kept so existing callers keep working.
    :param centroids: number of centroids for the initial clustering and
        for every re-clustering of an oversized cluster (default 5, the
        value that used to be hard-coded).
    '''
    # Function-scope imports, following the original method.
    import collections
    import igraph
    import CommunityExtraction as ce

    eig_data = self.build_orange_data_from_eig_vector()

    # Initial clustering over all vertices.
    self.km = Orange.clustering.kmeans.Clustering(
        data=eig_data, centroids=centroids, distance=EigDistance)

    # Build plain index lists per cluster; they are turned into a
    # VertexClustering below so that modularity can be computed on it.
    groups = {}
    for idx, label in enumerate(self.km.clusters):
        groups.setdefault(label, []).append(idx)

    # FIFO work list of clusters (lists of row indices into eig_data).
    pending = collections.deque(groups.values())
    res_list = []
    while pending:
        v = pending.popleft()
        print('qsize:%s cluster size: %s res list size: %s' % (
            len(pending), len(v), len(res_list)))

        # Clusters below the lower bound, or not above the upper bound,
        # are final.
        if (len(v) < ce.CRITERION_CLUSTER_NODES_LOWER_BOUND
                or len(v) <= ce.CRITERION_CLUSTER_NODES_UPPER_BOUND):
            res_list.append(v)
            continue

        # Too large: cluster this subset again and queue the pieces.
        sub_data = eig_data.get_items(v)
        sub_km = Orange.clustering.kmeans.Clustering(
            data=sub_data, centroids=centroids, distance=EigDistance)
        sub_groups = {}
        for idx, label in enumerate(sub_km.clusters):
            # idx is a position inside sub_data; map it back to eig_data.
            sub_groups.setdefault(label, []).append(v[idx])

        if len(sub_groups) < 2:
            # k-means could not split this cluster. Re-queueing it would
            # loop forever, so accept it as it is.
            res_list.append(v)
            continue
        pending.extend(sub_groups.values())

    # Flatten into a membership list: membership[vertex] = cluster index.
    membership = [0] * len(eig_data)
    for cluster_idx, members in enumerate(res_list):
        for vertex in members:
            membership[vertex] = cluster_idx

    self.vertex_clustering = igraph.clustering.VertexClustering(
        self.g, membership)
    print('writing vertex_clustering')
    FSDao.write_pickle(self.expr_dir,
                       SQLDao.ce.properties['vertex_clustering_file_name'],
                       self.vertex_clustering)
    print('finished writing vertex_clustering')
def _build_group_count_table(uid_group_info):
    '''Convert one user's group-info rows into an Orange data table.

    The group count is the only feature; user id and group id are attached
    to every instance as metas.
    '''
    count_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT
    userid_label = SQLDao.LABEL_USER_GROUP_INFO_USERID
    groupid_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPID

    domain = Orange.data.Domain(
        [Orange.feature.Continuous(count_label)], False)
    for meta_label in (userid_label, groupid_label):
        domain.add_meta(Orange.feature.Descriptor.new_meta_id(),
                        Orange.feature.Continuous(meta_label))

    instances = []
    for row in uid_group_info:
        instance = Orange.data.Instance(domain, [row[count_label]])
        instance[userid_label] = row[userid_label]
        instance[groupid_label] = row[groupid_label]
        instances.append(instance)
    return Orange.data.Table(domain, instances)


def _pick_target_instances(table):
    '''Return the instances of ``table`` chosen as the user's group(s).'''
    count_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT

    if len(table) > 3:
        km = Orange.clustering.kmeans.Clustering(
            data=table, distance=GroupCountDistance)
        # Group the instances by the cluster label k-means gave them.
        by_cluster = {}
        for idx, c_label in enumerate(km.clusters):
            by_cluster.setdefault(c_label, []).append(table[idx])
        if len(by_cluster) != 3:
            return []

        def largest_count(label):
            return max(inst[count_label].value
                       for inst in by_cluster[label])

        # Keep the cluster that contains the largest group count; max()
        # returns the first such cluster on ties and always yields a label.
        return list(by_cluster[max(by_cluster, key=largest_count)])

    # NOTE(review): the original layout was lost; this fallback is taken to
    # belong to "too few rows to cluster" -- confirm the intended nesting.
    # Just pick the group with the largest group_count, if it is large
    # enough. The emptiness check avoids an IndexError for a user that has
    # no group rows at all.
    table.sort([count_label])
    if len(table) > 0 and table[-1][count_label].value > 20:
        return [table[-1]]
    return []


def Guess_User_Group_by_KMeans(db_dir, db_file_name):
    '''
    Guess which group(s) every user most likely belongs to and save them.

    1. get the distinct user ids
    2. for each user id, compute which group(s) it should be in
       2.1 convert the user's group info into an Orange data table
       2.2 more than 3 rows: k-means on the group counts, keeping the
           cluster with the largest group count (only when exactly 3
           clusters come back); otherwise keep the single largest row if
           its group count is greater than 20
    3. save the {user_id: [instances]} mapping into the database

    :param db_dir: directory handed to ``SQLDao.SQLiteDao``.
    :param db_file_name: database file name handed to ``SQLDao.SQLiteDao``.
    '''
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    # The first element of each DAO result is not needed here.
    _, uids = sqlite.get_distinct_user_id()

    user_group_dict = {}
    for uid in uids:
        user_id = uid[SQLDao.LABEL_USER_GROUP_INFO_USERID]
        # Retrieve the group info rows of this specific user.
        _, uid_group_info = sqlite.get_group_info_by_user_id(user_id)
        table = _build_group_count_table(uid_group_info)
        user_group_dict[user_id] = _pick_target_instances(table)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('finish cluster')
    sqlite.save_user_group_clustered(user_group_dict)