def Summarize_User_Group_Specify(groupinfo_tablename, db_dir, db_file_name):
    '''
    1. summarize the user group specification; note that some data-purging
       problems still have to be handled here
    2. write the user group specification to the database
    '''
    sql = SQLDao.SQLDao.getInstance()
    (headings, user_group_info) = d.trace(sql.getUserGroupSpecify)(groupinfo_tablename)
    # print debug message
    print 'user_group_info: %s' % len(user_group_info)
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    d.trace(sqlite.save_user_group_info)(user_group_info)
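

# A minimal usage sketch for Summarize_User_Group_Specify; the table name, directory
# and database file name below are assumptions for illustration only, not values
# used by the pipeline:
def _example_summarize_user_group_specify():
    '''Illustration only: mirror the group specification into a SQLite community db.'''
    Summarize_User_Group_Specify('user_group_info',   # hypothetical source table
                                 '/data/expr/',       # hypothetical SQLite directory
                                 'community.db')      # hypothetical SQLite file name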
def Dendrogram(directory, vertexDendrogramFile, write_type, filename='', comment=None):
    '''
    In this function, we
    1. calculate the group info iteratively, so that a group is neither too big nor too small
    2. write the word network to an Excel file or to community.db
    '''
    # read the dendrogram from the file system
    f = open(directory + vertexDendrogramFile, 'rb')
    vertexDendrogram = d.trace(pickle.load)(f)
    f.close()
    vertexClustering = d.trace(vertexDendrogram.as_clustering)()
    subgraphs = d.trace(vertexClustering.subgraphs)()
    subgraphs_accordance = []
    # keep only the subgraphs whose size lies between
    # CRITERION_CLUSTER_NODES_LOWER_BOUND and CRITERION_CLUSTER_NODES_UPPER_BOUND
    while len(subgraphs) > 0:
        print 'subgraphs size: %s' % len(subgraphs)
        g = subgraphs.pop()
        nodes = g.vs
        print 'nodes size: %s' % len(nodes)
        if len(nodes) > CRITERION_CLUSTER_NODES_UPPER_BOUND:
            # the community is too big: split it again and push the pieces back
            vd = d.trace(g.community_fastgreedy)()
            vc = d.trace(vd.as_clustering)()
            sgs = d.trace(vc.subgraphs)()
            print 'new subgraphs count (all of them will be pushed): %s' % len(sgs)
            for sg in sgs:
                subgraphs.append(sg)
        elif len(nodes) < CRITERION_CLUSTER_NODES_LOWER_BOUND:
            # the community is too small: omit it
            pass
        else:
            # the community size is acceptable: keep it
            subgraphs_accordance.append(g)
    # note: subgraphs containing fewer nodes than the lower bound have been omitted above
    groupinfo = []
    gid = 0
    for g in subgraphs_accordance:
        nodes = g.vs
        gid += 1
        for node in nodes:
            groupinfo.append({LABEL_VERTEX_ID: node['dbIndex'],
                              LABEL_NOUN: node['name'],
                              LABEL_GROUP_ID: gid})
    if write_type == 'excel':
        d.trace(FSDao.write_excel)(directory, filename, 'group',
                                   [LABEL_VERTEX_ID, LABEL_NOUN, LABEL_GROUP_ID],
                                   groupinfo, comment)
    elif write_type == 'db':
        # write the groups to the community db
        sqlite = SQLDao.SQLiteDao(directory, filename)
        sqlite.save_word_group_info(groupinfo)
    else:
        raise ValueError('write type error')
    return groupinfo
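

# A minimal usage sketch for Dendrogram above; the directory, dendrogram file name
# and database name are assumptions for illustration only:
def _example_dendrogram_to_db():
    '''Illustration only: write bounded-size word communities into a hypothetical community.db.'''
    return Dendrogram('/data/expr/',             # hypothetical experiment directory
                      'dendrogram.fastgreedy',   # hypothetical pickled dendrogram file
                      'db',                      # write the groups into SQLite
                      filename='community.db')   # hypothetical SQLite file name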
def SerializeDendrogram(directory, graph_filename, dendrogram_filename):
    '''
    In this function, we
    1. read the pickled graph from the file system
    2. coarsely compute the communities with the fast-greedy algorithm
    3. dump the resulting dendrogram as a pickled object
    '''
    graph = d.trace(FSDao.read_pickle_graph)(graph_filename)
    # keep only the vertices that occur often enough
    global CRITERION_VERTEX_OCCUR_COUNT
    graph = d.trace(graph.subgraph)(
        graph.vs.select(occur_count_ge=CRITERION_VERTEX_OCCUR_COUNT))
    vertexDendrogram = d.trace(graph.community_fastgreedy)()
    FSDao.write_pickle(directory, dendrogram_filename, vertexDendrogram)
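

# The vs.select(occur_count_ge=...) call above keeps only vertices whose
# 'occur_count' attribute is greater than or equal to the threshold. A minimal
# sketch of that filtering on a toy graph (illustration only, not part of the pipeline):
def _example_vertex_filtering():
    import igraph
    g = igraph.Graph.Full(3)
    g.vs['occur_count'] = [1, 5, 10]
    # subgraph() keeps the selected vertices and the edges among them
    return g.subgraph(g.vs.select(occur_count_ge=5))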
def CommunityDiscovery(expr_dir, pickle_filename, dendrogram_file_name):
    '''
    discover communities in the relation graph; it takes time
    1. read the pickled graph from the file system
    2. compute the vertex clustering (leading eigenvector)
    3. serialize the clustering
    '''
    print expr_dir + pickle_filename
    # g = FSDao.read_pickle_graph(expr_dir + pickle_filename)
    f = open(expr_dir + pickle_filename, 'rb')
    g = d.trace(pickle.load)(f)
    f.close()
    vertexClustering = d.trace(g.community_leading_eigenvector)()
    FSDao.write_pickle(expr_dir, 'dendrogram.eigen', vertexClustering)
    # edge betweenness
    # vertexDendrogram = d.trace(g.community_edge_betweenness)(directed=True)
    # FSDao.write_pickle(expr_dir, 'dendrogram.betweeness', vertexDendrogram)
    # walktrap
    # vertexDendrogram = d.trace(g.community_walktrap)()
    # FSDao.write_pickle(expr_dir, 'dendrogram.walkstrap', vertexDendrogram)
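

# Note on the alternatives commented out above: community_fastgreedy, walktrap and
# edge betweenness return a VertexDendrogram that still has to be cut with
# as_clustering(), while community_leading_eigenvector already returns a
# VertexClustering. A minimal sketch of the difference (illustration only, using
# igraph's built-in Zachary karate-club graph):
def _example_dendrogram_vs_clustering():
    import igraph
    g = igraph.Graph.Famous('Zachary')
    dendrogram = g.community_fastgreedy()       # VertexDendrogram
    clustering = dendrogram.as_clustering()     # cut into a VertexClustering
    direct = g.community_leading_eigenvector()  # already a VertexClustering
    return clustering, direct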
def SerializeBirelationGraph():
    '''
    construct the bi-relational (reciprocated) graph and write it to the
    file system as a pickled graph
    '''
    import igraph
    sql = SQLDao.SQLDao.getInstance()
    h1, uids = d.trace(sql.getAllOU)()
    # print len(uids)
    # add users to the graph and build a dict from user id to graph index
    g = igraph.Graph(n=0, directed=False)
    uid_to_gidx_dict = {}
    for idx, uid in enumerate(uids):
        # set the user_id attribute on the new vertex
        g.add_vertex(**{SQLDao.LABEL_USER_GROUP_INFO_USERID: uid[0]})
        uid_to_gidx_dict[uid[0]] = idx
    print 'Finish adding vertices %s' % len(uids)
    h, ur = d.trace(sql.getOURelations)(reciprocated=True)
    # build the list of tuples representing the relations between users;
    # a set of normalized pairs is used so that each reciprocated relation is added
    # only once (assumption: the originally undefined edge_set served this purpose)
    edge_set = set()
    for idx, rec in enumerate(ur):
        if idx % 1000 == 0:
            print 'edge %s' % idx
        sid = rec[SQLDao.LABEL_SRC_USERID]
        tid = rec[SQLDao.LABEL_TAR_USERID]
        src, tar = uid_to_gidx_dict[sid], uid_to_gidx_dict[tid]
        edge_set.add((min(src, tar), max(src, tar)))
    edge_list = list(edge_set)
    print 'Finish constructing edge list %s' % len(edge_list)
    # Note: it is *very very* slow to add edges one by one, so add them in one batch
    g.add_edges(edge_list)
    print 'finish building a graph based on social relation'
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],
                       SQLDao.ce.properties['relation_reciprocated_graph_file_name'], g)
def SerializeRelationshipGraph():
    '''
    construct the directed relation graph and write it to the file system
    as a pickled graph
    '''
    import igraph
    sql = SQLDao.SQLDao.getInstance()
    h1, uids = d.trace(sql.getAllOU)()
    sql = SQLDao.SQLDao.getInstance()
    h, ur = d.trace(sql.getOURelations)()
    g = igraph.Graph(n=0, directed=True)
    # add users to the graph and build a dict from user id to graph index
    uid_to_gidx_dict = {}
    assert SQLDao.LABEL_USER_GROUP_INFO_USERID == 'user_id'
    for idx, user_id in enumerate(uids):
        g.add_vertex(user_id=user_id[0])
        uid_to_gidx_dict[user_id[0]] = idx
    print 'Finish adding vertices'
    # build the list of tuples representing the relations between users
    edge_list = []
    for idx, rec in enumerate(ur):
        if idx % 1000 == 0:
            print 'edge %s' % idx
        sid = rec[SQLDao.LABEL_SRC_USERID]
        tid = rec[SQLDao.LABEL_TAR_USERID]
        edge_list.append((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid]))
    print 'Finish constructing edge list %s' % len(edge_list)
    # Note: it is *very very* slow to add edges one by one, so add them in one batch
    g.add_edges(edge_list)
    print 'finish building a graph based on social relation'
    # FSDao.write_graph(g, SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'] + 'relation.pickle')
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],
                       SQLDao.ce.properties['relation_graph_file_name'], g)
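

# Both serializers above add all edges with a single add_edges() call because adding
# edges one at a time forces igraph to rebuild its internal structures repeatedly.
# A minimal sketch of the two styles (illustration only, not part of the pipeline):
def _example_batch_edge_insertion(n=1000):
    import igraph
    pairs = [(i, (i + 1) % n) for i in range(n)]
    fast = igraph.Graph(n=n, directed=True)
    fast.add_edges(pairs)             # one batch call: preferred
    slow = igraph.Graph(n=n, directed=True)
    for s, t in pairs:
        slow.add_edge(s, t)           # repeated single inserts: noticeably slower
    return fast, slow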
def iGraph_CommunityDiscovery(self, step=False):
    '''
    discover communities in the relation graph; it takes time
    1. compute the vertex clustering
    2. serialize the vertex clustering
    step indicates whether this function is the starting point of the pipeline
    '''
    # why the vertexClustering is serialized:
    # even if the eigenvectors were computed by hand, a VertexClustering object
    # would still have to be constructed to obtain the modularity
    self.vertex_clustering = d.trace(self.g.community_leading_eigenvector)()
    print 'modularity is %s' % self.vertex_clustering.modularity
    # print self.vertex_clustering.membership
    print 'finish find community_leading_eigenvector'
    FSDao.write_pickle(self.expr_dir,
                       SQLDao.ce.properties['vertex_clustering_file_name'],
                       self.vertex_clustering)
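

# As noted in the comment above, a raw membership list alone does not give the
# modularity; it has to be wrapped in a VertexClustering first. A minimal sketch of
# that step (illustration only, using igraph's built-in Zachary karate-club graph):
def _example_membership_to_clustering():
    import igraph
    g = igraph.Graph.Famous('Zachary')
    membership = g.community_leading_eigenvector().membership
    vc = igraph.clustering.VertexClustering(g, membership)
    print 'modularity: %s' % vc.modularity
    return vc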
def KMeansClustering_Iterative(self, step=False):
    '''
    Integrate Orange here; it is actually a little subtle:
    1. k-means is arguably not the best way to decide which community a node (user)
       belongs to, but it is the most general one
    2. TODO: tuning the number of clusters (rather than deciding it automatically)
       might work better, but check the results first
    '''
    eig_data = self.build_orange_data_from_eig_vector()
    # clustering
    self.km = Orange.clustering.kmeans.Clustering(data=eig_data,
                                                  centroids=5,
                                                  distance=EigDistance)
    # wrap the result into a vertex_clustering so that the modularity can be computed;
    # it is no longer reasonable to use Orange to insert into the database
    clusters = self.km.clusters
    cluster_dict = {}
    for idx, c in enumerate(clusters):
        if c not in cluster_dict:
            cluster_dict[c] = [idx]
        else:
            cluster_dict[c].append(idx)
    import Queue
    q = Queue.Queue()
    for v in cluster_dict.values():
        q.put(v)
    res_list = []
    import CommunityExtraction as ce
    while q.qsize() > 0:
        v = q.get()
        print 'qsize: %s cluster size: %s res list size: %s' % (q.qsize(), len(v), len(res_list))
        if len(v) < ce.CRITERION_CLUSTER_NODES_LOWER_BOUND:
            # the cluster is small enough: keep it as it is
            res_list.append(v)
        elif len(v) > ce.CRITERION_CLUSTER_NODES_UPPER_BOUND:
            # the cluster is too big: cluster it again and push the pieces back
            sub_data = eig_data.get_items(v)
            sub_km = Orange.clustering.kmeans.Clustering(data=sub_data,
                                                         centroids=5,
                                                         distance=EigDistance)
            sub_clusters = sub_km.clusters
            temp_d = dict()
            for idx, c in enumerate(sub_clusters):
                if c not in temp_d:
                    temp_d[c] = [v[idx]]
                else:
                    temp_d[c].append(v[idx])
            for sub_v in temp_d.values():
                q.put(sub_v)
        else:
            res_list.append(v)
    # flatten the accepted clusters into a membership list
    clusters = [0] * len(eig_data)
    for idx, res in enumerate(res_list):
        for r in res:
            clusters[r] = idx
    import igraph
    self.vertex_clustering = igraph.clustering.VertexClustering(self.g, clusters)
    print 'writing vertex_clustering'
    FSDao.write_pickle(self.expr_dir,
                       SQLDao.ce.properties['vertex_clustering_file_name'],
                       self.vertex_clustering)
    print 'finished writing vertex_clustering'
def Guess_User_Group_by_KMeans(db_dir, db_file_name):
    '''
    1. get distinct user ids
    2. for each user id, compute which group it should belong to
       2.1 convert the data to an Orange data table
       2.2 k-means
    3. save the result into the database
    '''
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    h1, uids = sqlite.get_distinct_user_id()
    user_group_dict = {}
    for uid in uids:
        # retrieve the group info of a specific user
        h2, uid_group_info = sqlite.get_group_info_by_user_id(
            uid[SQLDao.LABEL_USER_GROUP_INFO_USERID])
        # convert the group info into an Orange data table
        features = []
        features.append(Orange.feature.Continuous(SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT))
        domain = Orange.data.Domain(features, False)
        domain.add_meta(Orange.feature.Descriptor.new_meta_id(),
                        Orange.feature.Continuous(SQLDao.LABEL_USER_GROUP_INFO_USERID))
        domain.add_meta(Orange.feature.Descriptor.new_meta_id(),
                        Orange.feature.Continuous(SQLDao.LABEL_USER_GROUP_INFO_GROUPID))
        datas = []
        for i in uid_group_info:
            data = Orange.data.Instance(domain, [i[SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT]])
            data[SQLDao.LABEL_USER_GROUP_INFO_USERID] = i[SQLDao.LABEL_USER_GROUP_INFO_USERID]
            data[SQLDao.LABEL_USER_GROUP_INFO_GROUPID] = i[SQLDao.LABEL_USER_GROUP_INFO_GROUPID]
            datas.append(data)
        table = Orange.data.Table(domain, datas)
        target_instances = []
        if len(table) > 3:
            km = Orange.clustering.kmeans.Clustering(data=table, distance=GroupCountDistance)
            clusters = km.clusters
            d = {}
            for idx, c_label in enumerate(clusters):
                if c_label in d:
                    d[c_label].append(table[idx])
                else:
                    d[c_label] = [table[idx]]
            if len(d) == 3:
                # figure out which cluster holds the largest group_count
                max_label = None
                max_value = -1
                for label, instances in d.items():
                    temp_list = [i[SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT].value
                                 for i in instances]
                    if max(temp_list) > max_value:
                        max_value = max(temp_list)
                        max_label = label
                for instance in d[max_label]:
                    target_instances.append(instance)
        else:
            # just pick the group with the largest group_count, if it is large enough
            table.sort([SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT])
            if table[-1][SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT].value > 20:
                target_instances.append(table[-1])
        # print 'processing %s' % uid[SQLDao.LABEL_USER_GROUP_INFO_USERID]
        user_group_dict[uid[SQLDao.LABEL_USER_GROUP_INFO_USERID]] = target_instances
    print 'finish cluster'
    sqlite.save_user_group_clustered(user_group_dict)