def Summarize_User_Group_Specify(groupinfo_tablename, db_dir, db_file_name): """ 1. Summarize the user group info specification, however, some data puring problem should be handled here 2. write the user group specification to database """ sql = SQLDao.SQLDao.getInstance() (headings, user_group_info) = d.trace(sql.getUserGroupSpecify)(groupinfo_tablename) # print debug message print "user_group_info:%s" % len(user_group_info) sqlite = SQLDao.SQLiteDao(db_dir, db_file_name) d.trace(sqlite.save_user_group_info)(user_group_info)
def Summarize_User_Group_Specify(groupinfo_tablename, db_dir, db_file_name): ''' 1. Summarize the user group info specification, however, some data puring problem should be handled here 2. write the user group specification to database ''' sql = SQLDao.SQLDao.getInstance() (headings, user_group_info) = d.trace(sql.getUserGroupSpecify)(groupinfo_tablename) #print debug message print 'user_group_info:%s' % len(user_group_info) sqlite = SQLDao.SQLiteDao(db_dir, db_file_name) d.trace(sqlite.save_user_group_info)(user_group_info)
def Dendrogram(directory, vertexDendrogramFile, write_type, filename="", comment=None): """ In this function, we will 1. calculate the group info iteratively, in order to make sure a group is not so big and also not so small 2. write the word network to a excel file or community.db """ # read dendrogram from file system f = open(directory + vertexDendrogramFile, "rb") vertexDendrogram = d.trace(pickle.load)(f) f.close() vertexClustering = d.trace(vertexDendrogram.as_clustering)() subgraphs = d.trace(vertexClustering.subgraphs)() subgraphs_accordance = [] # make all the subgraphs that # size(subgraphs)>CRITERION_CLUSTER_NODES_LOWER_BOUND and size(subgraphs)<CRITERION_CLUSTER_NODES_UPPER_BOUND while len(subgraphs) > 0: print "subgraphs size: %s" % len(subgraphs) g = subgraphs.pop() nodes = g.vs print "nodes size: %s" % len(nodes) if len(nodes) > CRITERION_CLUSTER_NODES_UPPER_BOUND: # iterate find community here vd = d.trace(g.community_fastgreedy)() vc = d.trace(vd.as_clustering)() sgs = d.trace(vc.subgraphs)() print "new subgraphs count(and all of them will be pushed) %s" % len(sgs) for sg in sgs: subgraphs.append(sg) elif len(nodes) < CRITERION_CLUSTER_NODES_LOWER_BOUND: # omit this community here pass else: # write this community to the file system here subgraphs_accordance.append(g) pass # there must be some subgraphs that contain less than 10 nodes groupinfo = [] gid = 0 for g in subgraphs_accordance: nodes = g.vs gid += 1 for node in nodes: # groupinfo.append([node['dbIndex'],node['name'],gid]) groupinfo.append({LABEL_VERTEX_ID: node["dbIndex"], LABEL_NOUN: node["name"], LABEL_GROUP_ID: gid}) if write_type == "excel": d.trace(FSDao.write_excel)( directory, filename, "group", [LABEL_VERTEX_ID, LABEL_NOUN, LABEL_GROUP_ID], groupinfo, comment ) elif write_type == "db": # write them to the community db sqlite = SQLDao.SQLiteDao(directory, filename) sqlite.save_word_group_info(groupinfo) pass else: raise ValueError("write type error") return groupinfo
def SerializeDendrogram(directory, graph_filename, dendrogram_filename):
    """Compute a community dendrogram from a pickled graph and serialize it.

    1. Read the pickled graph from the file system.
       (NOTE(review): *graph_filename* is passed as-is while the output is
       written relative to *directory* — confirm read_pickle_graph expects
       a full path.)
    2. Drop rarely-occurring vertices, then compute communities with the
       fast-greedy algorithm.
    3. Pickle the resulting dendrogram as *dendrogram_filename* under
       *directory*.

    Fix: removed the needless ``global CRITERION_VERTEX_OCCUR_COUNT``
    declaration — the module-level constant is only read here, never
    assigned, so ``global`` had no effect — and the redundant trailing
    ``pass``.
    """
    graph = d.trace(FSDao.read_pickle_graph)(graph_filename)
    # filter: keep only vertices that occur often enough
    graph = d.trace(graph.subgraph)(
        graph.vs.select(occur_count_ge=CRITERION_VERTEX_OCCUR_COUNT))
    vertexDendrogram = d.trace(graph.community_fastgreedy)()
    FSDao.write_pickle(directory, dendrogram_filename, vertexDendrogram)
def SerializeDendrogram(directory, graph_filename, dendrogram_filename):
    '''Compute and dump the community dendrogram of a pickled graph.

    NOTE(review): this is an identical redefinition of the function defined
    just above and shadows it; one copy should be removed.

    1. Read the pickled graph from the file system.
    2. Rudely compute the community by the fast-greedy algorithm.
    3. Dump the resulting pickle object.
    '''
    loaded_graph = d.trace(FSDao.read_pickle_graph)(graph_filename)
    # restrict to vertices whose occur_count meets the threshold
    global CRITERION_VERTEX_OCCUR_COUNT
    frequent = loaded_graph.vs.select(occur_count_ge=CRITERION_VERTEX_OCCUR_COUNT)
    loaded_graph = d.trace(loaded_graph.subgraph)(frequent)
    dendrogram = d.trace(loaded_graph.community_fastgreedy)()
    FSDao.write_pickle(directory, dendrogram_filename, dendrogram)
def CommunityDiscovery(expr_dir, pickle_filename, dendrogram_file_name): ''' discover community in the relation graph, it takes time 1. read pickle graph from the file system 2. compute the dendrogram 3. serialize the dendrogram ''' print expr_dir + pickle_filename #g=FSDao.read_pickle_graph(expr_dir+pickle_filename) f = open(expr_dir + pickle_filename, 'rb') g = d.trace(pickle.load)(f) f.close() vertexClustering = d.trace(g.community_leading_eigenvector)() FSDao.write_pickle(expr_dir, 'dendrogram.eigen', vertexClustering) # edge betweeness # vertexDendrogram=d.trace(g.community_edge_betweenness)(directed=True) # FSDao.write_pickle(expr_dir,'dendrogram.betweeness',vertexDendrogram) # walk strap # vertexDendrogram=d.trace(g.community_walktrap)() # FSDao.write_pickle(expr_dir,'dendrogram.walkstrap',vertexDendrogram) pass
def SerializeBirelationGraph(): ''' construct the bi-relational graph and write it to the file system as pickle graph ''' import igraph sql = SQLDao.SQLDao.getInstance() h1, uids = d.trace(sql.getAllOU)() # print len(uids) # add users to the graph and construct a dict for index g = igraph.Graph(n=0, directed=False) uid_to_gidx_dict = {} for idx, uid in enumerate(uids): # make sure the name is user_id g.add_vertex({SQLDao.LABEL_USER_GROUP_INFO_USERID:uid[0]}) uid_to_gidx_dict[uid[0]] = idx pass print 'Finish add vertices %s'%len(uids) h, ur = d.trace(sql.getOURelations)(reciprocated=True) #construct the list contain tuples represent the relations between users edge_list = [] for idx, rec in enumerate(ur): if idx % 1000 == 0: print 'edge %s' % idx sid = rec[SQLDao.LABEL_SRC_USERID] tid = rec[SQLDao.LABEL_TAR_USERID] edge_list.append((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid])) edge_list=list(edge_set) print 'Finish constructing edge list %s' % len(edge_list) # Note: It is <bold>very very</bold> slow to add edge iteratively g.add_edges(edge_list) print 'finish building a graph based on social relation' FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],SQLDao.ce.properties['relation_reciprocated_graph_file_name'], g) pass
def SerializeRelationshipGraph(): ''' construct the graph and write it to the file system as pickle graph ''' import igraph sql = SQLDao.SQLDao.getInstance() h1, uids = d.trace(sql.getAllOU)() sql = SQLDao.SQLDao.getInstance() h, ur = d.trace(sql.getOURelations)() g = igraph.Graph(n=0, directed=True) # add users to the graph and construct a dict for index uid_to_gidx_dict={} assert SQLDao.LABEL_USER_GROUP_INFO_USERID=='user_id' for idx, user_id in enumerate(uids): g.add_vertex(user_id=user_id[0]) uid_to_gidx_dict[user_id[0]] = idx pass print 'Finish add vertices' # construct the list contain tuples represent the relations between users edge_list = [] for idx, rec in enumerate(ur): if idx % 1000 == 0: print 'edge %s' % idx sid = rec[SQLDao.LABEL_SRC_USERID] tid = rec[SQLDao.LABEL_TAR_USERID] edge_list.append((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid])) print 'Finish constructing edge list %s' % len(edge_list) # Note: It is <bold>very very</bold> slow to add edge iteratively g.add_edges(edge_list) print 'finish building a graph based on social relation' # FSDao.write_graph(g, SQLDao.ce.properties['base_dir']+SQLDao.ce.properties['expr_dir']+'relation.pickle') FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],SQLDao.ce.properties['relation_graph_file_name'], g) pass
def iGraph_CommunityDiscovery(self, step=False): ''' discover community in the relation graph, it takes time 1. compute the vertexClustering 2. serialize the vertexClustering step to see whether this function start from itself ''' # why I serialize the vertexClustering: # even though I compute eigenvector myself, I am forced to construct vertexClustering myself self.vertex_clustering = d.trace( self.g.community_leading_eigenvector)() print 'modularity is %s' % self.vertex_clustering.modularity # print self.vertex_clustering.membership print 'finish find community_leading_eigenvector' FSDao.write_pickle(self.expr_dir, SQLDao.ce.properties['vertex_clustering_file_name'], self.vertex_clustering) pass
def Dendrogram(directory, vertexDendrogramFile, write_type, filename='', comment=None): ''' In this function, we will 1. calculate the group info iteratively, in order to make sure a group is not so big and also not so small 2. write the word network to a excel file or community.db ''' #read dendrogram from file system f = open(directory + vertexDendrogramFile, 'rb') vertexDendrogram = d.trace(pickle.load)(f) f.close() vertexClustering = d.trace(vertexDendrogram.as_clustering)() subgraphs = d.trace(vertexClustering.subgraphs)() subgraphs_accordance = [] # make all the subgraphs that # size(subgraphs)>CRITERION_CLUSTER_NODES_LOWER_BOUND and size(subgraphs)<CRITERION_CLUSTER_NODES_UPPER_BOUND while len(subgraphs) > 0: print 'subgraphs size: %s' % len(subgraphs) g = subgraphs.pop() nodes = g.vs print 'nodes size: %s' % len(nodes) if len(nodes) > CRITERION_CLUSTER_NODES_UPPER_BOUND: #iterate find community here vd = d.trace(g.community_fastgreedy)() vc = d.trace(vd.as_clustering)() sgs = d.trace(vc.subgraphs)() print 'new subgraphs count(and all of them will be pushed) %s' % len( sgs) for sg in sgs: subgraphs.append(sg) elif len(nodes) < CRITERION_CLUSTER_NODES_LOWER_BOUND: #omit this community here pass else: #write this community to the file system here subgraphs_accordance.append(g) pass # there must be some subgraphs that contain less than 10 nodes groupinfo = [] gid = 0 for g in subgraphs_accordance: nodes = g.vs gid += 1 for node in nodes: #groupinfo.append([node['dbIndex'],node['name'],gid]) groupinfo.append({ LABEL_VERTEX_ID: node['dbIndex'], LABEL_NOUN: node['name'], LABEL_GROUP_ID: gid }) if write_type == 'excel': d.trace(FSDao.write_excel)( directory, filename, 'group', [LABEL_VERTEX_ID, LABEL_NOUN, LABEL_GROUP_ID], groupinfo, comment) elif write_type == 'db': #write them to the community db sqlite = SQLDao.SQLiteDao(directory, filename) sqlite.save_word_group_info(groupinfo) pass else: raise ValueError('write type error') return groupinfo