def Summarize_User_Group_Specify(groupinfo_tablename, db_dir, db_file_name):
    """
    1.  Summarize the user group info specification; some data purging issues have to be handled here as well
    2.  Write the user group specification to the database
    """
    sql = SQLDao.SQLDao.getInstance()
    (headings, user_group_info) = d.trace(sql.getUserGroupSpecify)(groupinfo_tablename)

    # print debug message
    print "user_group_info:%s" % len(user_group_info)
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    d.trace(sqlite.save_user_group_info)(user_group_info)
def Dendrogram(directory, vertexDendrogramFile, write_type, filename="", comment=None):
    """
    In this function, we will
    1.  Calculate the group info iteratively, so that each group ends up neither too big nor too small
    2.  Write the word network to an Excel file or to community.db
    """
    # read dendrogram from file system
    f = open(directory + vertexDendrogramFile, "rb")
    vertexDendrogram = d.trace(pickle.load)(f)
    f.close()

    vertexClustering = d.trace(vertexDendrogram.as_clustering)()
    subgraphs = d.trace(vertexClustering.subgraphs)()
    subgraphs_accordance = []
    # split and filter until every kept subgraph satisfies
    # CRITERION_CLUSTER_NODES_LOWER_BOUND <= size(subgraph) <= CRITERION_CLUSTER_NODES_UPPER_BOUND
    while len(subgraphs) > 0:
        print "subgraphs size: %s" % len(subgraphs)
        g = subgraphs.pop()
        nodes = g.vs
        print "nodes size: %s" % len(nodes)
        if len(nodes) > CRITERION_CLUSTER_NODES_UPPER_BOUND:
            # too large: run community detection again and push the resulting pieces back
            vd = d.trace(g.community_fastgreedy)()
            vc = d.trace(vd.as_clustering)()
            sgs = d.trace(vc.subgraphs)()
            print "new subgraphs count(and all of them will be pushed) %s" % len(sgs)
            for sg in sgs:
                subgraphs.append(sg)

        elif len(nodes) < CRITERION_CLUSTER_NODES_LOWER_BOUND:
            # omit this community here
            pass
        else:
            # write this community to the file system here
            subgraphs_accordance.append(g)
            pass

    # the undersized subgraphs have been dropped above; assign a group id to each remaining subgraph
    groupinfo = []
    gid = 0
    for g in subgraphs_accordance:
        nodes = g.vs
        gid += 1
        for node in nodes:
            # groupinfo.append([node['dbIndex'],node['name'],gid])
            groupinfo.append({LABEL_VERTEX_ID: node["dbIndex"], LABEL_NOUN: node["name"], LABEL_GROUP_ID: gid})

    if write_type == "excel":
        d.trace(FSDao.write_excel)(
            directory, filename, "group", [LABEL_VERTEX_ID, LABEL_NOUN, LABEL_GROUP_ID], groupinfo, comment
        )
    elif write_type == "db":
        # write them to the community db
        sqlite = SQLDao.SQLiteDao(directory, filename)
        sqlite.save_word_group_info(groupinfo)
        pass
    else:
        raise ValueError("write type error")
    return groupinfo
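# A hedged, self-contained sketch of the size-bounded splitting idea used in Dendrogram
# above: keep popping communities, re-run fast greedy on the ones that are too large and
# drop the ones that are too small. `graph` is assumed to be a simple undirected
# igraph.Graph; the bounds are illustrative defaults, not the project's constants.
def _sketch_split_until_bounded(graph, lower=5, upper=50):
    stack = graph.community_fastgreedy().as_clustering().subgraphs()
    accepted = []
    while stack:
        sg = stack.pop()
        if len(sg.vs) > upper:
            pieces = sg.community_fastgreedy().as_clustering().subgraphs()
            if len(pieces) == 1:
                # cannot be split any further; keep it rather than loop forever
                accepted.append(sg)
            else:
                stack.extend(pieces)
        elif len(sg.vs) >= lower:
            accepted.append(sg)
        # communities below the lower bound are silently dropped
    return accepted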
def SerializeDendrogram(directory, graph_filename, dendrogram_filename):
    """
    In this function, we
    1.  read the pickled graph from the file system
    2.  roughly compute the communities with the fast greedy algorithm
    3.  then dump the pickled dendrogram object
    """
    graph = d.trace(FSDao.read_pickle_graph)(graph_filename)
    # we need some filtering here
    global CRITERION_VERTEX_OCCUR_COUNT
    graph = d.trace(graph.subgraph)(graph.vs.select(occur_count_ge=CRITERION_VERTEX_OCCUR_COUNT))
    vertexDendrogram = d.trace(graph.community_fastgreedy)()
    FSDao.write_pickle(directory, dendrogram_filename, vertexDendrogram)
    pass
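# A toy sketch (python-igraph only, synthetic data) of the two steps SerializeDendrogram
# performs: filter vertices by an 'occur_count' attribute with vs.select, then run the
# fast greedy community algorithm. The Zachary graph and the fake counts are placeholders.
def _sketch_filter_and_cluster():
    import igraph
    g = igraph.Graph.Famous('Zachary')       # small built-in toy graph
    g.vs['occur_count'] = range(len(g.vs))   # fake per-vertex occurrence counts
    kept = g.subgraph(g.vs.select(occur_count_ge=5))
    dendrogram = kept.community_fastgreedy()
    print 'kept %s vertices, %s merges in the dendrogram' % (len(kept.vs), len(dendrogram.merges))
    return dendrogram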
def CommunityDiscovery(expr_dir, pickle_filename, dendrogram_file_name):
    '''
     discover communities in the relation graph; this takes time
     1. read the pickled graph from the file system
     2. compute the community structure
     3. serialize the result
    '''
    print expr_dir + pickle_filename
    #g=FSDao.read_pickle_graph(expr_dir+pickle_filename)
    f = open(expr_dir + pickle_filename, 'rb')
    g = d.trace(pickle.load)(f)
    f.close()

    vertexClustering = d.trace(g.community_leading_eigenvector)()
    FSDao.write_pickle(expr_dir, 'dendrogram.eigen', vertexClustering)

    # edge betweenness
    # vertexDendrogram=d.trace(g.community_edge_betweenness)(directed=True)
    # FSDao.write_pickle(expr_dir,'dendrogram.betweeness',vertexDendrogram)

    # walktrap
    # vertexDendrogram=d.trace(g.community_walktrap)()
    # FSDao.write_pickle(expr_dir,'dendrogram.walkstrap',vertexDendrogram)
    pass
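# A hedged side note as code (synthetic graph, python-igraph only): unlike
# community_fastgreedy/community_walktrap, community_leading_eigenvector returns a
# VertexClustering directly, so the object pickled above as 'dendrogram.eigen' is
# already a clustering and needs no as_clustering() call.
def _sketch_eigenvector_vs_walktrap():
    import igraph
    g = igraph.Graph.Famous('Zachary')
    clustering = g.community_leading_eigenvector()   # VertexClustering
    dendrogram = g.community_walktrap()              # VertexDendrogram
    print 'eigenvector clusters: %s' % len(clustering)
    print 'walktrap clusters:    %s' % len(dendrogram.as_clustering())
    return clustering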
def SerializeBirelationGraph():
    '''
    construct the reciprocated (bi-directional) relation graph and write it to the file system as a pickled graph
    '''
    import igraph

    sql = SQLDao.SQLDao.getInstance()
    h1, uids = d.trace(sql.getAllOU)()
    # print len(uids)
    # add users to the graph and construct a dict for index
    g = igraph.Graph(n=0, directed=False)
    uid_to_gidx_dict = {}
    for idx, uid in enumerate(uids):
        # the vertex attribute key must be 'user_id' (SQLDao.LABEL_USER_GROUP_INFO_USERID)
        g.add_vertex(**{SQLDao.LABEL_USER_GROUP_INFO_USERID: uid[0]})
        uid_to_gidx_dict[uid[0]] = idx
        pass
    print 'Finished adding %s vertices' % len(uids)

    h, ur = d.trace(sql.getOURelations)(reciprocated=True)
    # construct a list of tuples representing the relations between users
    edge_list = []
    for idx, rec in enumerate(ur):
        if idx % 1000 == 0:
            print 'edge %s' % idx
        sid = rec[SQLDao.LABEL_SRC_USERID]
        tid = rec[SQLDao.LABEL_TAR_USERID]
        # sort each pair so both directions of a reciprocated relation map to the same edge
        edge_list.append(tuple(sorted((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid]))))

    # deduplicate: each reciprocated relation appears once per direction in the query result
    edge_list = list(set(edge_list))
    print 'Finish constructing edge list %s' % len(edge_list)
    # Note: it is very slow to add edges one at a time
    g.add_edges(edge_list)
    print 'finish building a graph based on social relation'
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],
                       SQLDao.ce.properties['relation_reciprocated_graph_file_name'], g)
    pass
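# A small sketch (synthetic ids, no SQLDao) of the de-duplication used above: a
# reciprocated relation shows up once per direction in the query result, so each pair
# is sorted before being collected into a set.
def _sketch_dedupe_reciprocal_pairs():
    raw_pairs = [(0, 1), (1, 0), (2, 3), (3, 2), (0, 2)]
    edge_set = set(tuple(sorted(p)) for p in raw_pairs)
    print 'raw pairs: %s, unique undirected edges: %s' % (len(raw_pairs), len(edge_set))
    return list(edge_set)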
def SerializeRelationshipGraph():
    '''
    construct the relation graph and write it to the file system as a pickled graph
    '''
    import igraph

    sql = SQLDao.SQLDao.getInstance()
    h1, uids = d.trace(sql.getAllOU)()
    sql = SQLDao.SQLDao.getInstance()
    h, ur = d.trace(sql.getOURelations)()
    g = igraph.Graph(n=0, directed=True)
    # add users to the graph and construct a dict for index
    uid_to_gidx_dict={}
    assert SQLDao.LABEL_USER_GROUP_INFO_USERID=='user_id'
    for idx, user_id in enumerate(uids):
        g.add_vertex(user_id=user_id[0])
        uid_to_gidx_dict[user_id[0]] = idx
        pass
    print 'Finished adding vertices'
    # construct a list of tuples representing the relations between users
    edge_list = []
    for idx, rec in enumerate(ur):
        if idx % 1000 == 0:
            print 'edge %s' % idx
        sid = rec[SQLDao.LABEL_SRC_USERID]
        tid = rec[SQLDao.LABEL_TAR_USERID]
        edge_list.append((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid]))

    print 'Finish constructing edge list %s' % len(edge_list)
    # Note: it is very slow to add edges one at a time
    g.add_edges(edge_list)
    print 'finish building a graph based on social relation'

    # FSDao.write_graph(g, SQLDao.ce.properties['base_dir']+SQLDao.ce.properties['expr_dir']+'relation.pickle')
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],
                       SQLDao.ce.properties['relation_graph_file_name'], g)
    pass
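# A minimal sketch (synthetic data, python-igraph only) of the point made in the comments
# above: build the complete edge list first and hand it to add_edges() in one call instead
# of adding edges inside the loop.
def _sketch_bulk_add_edges(n=1000):
    import igraph
    import random
    edges = [(random.randrange(n), random.randrange(n)) for _ in range(5 * n)]
    g = igraph.Graph(n=n, directed=True)
    g.add_edges(edges)   # one bulk call is far cheaper than 5*n single insertions
    print 'vertices: %s, edges: %s' % (g.vcount(), g.ecount())
    return g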
    def iGraph_CommunityDiscovery(self, step=False):
        '''
        discover communities in the relation graph; this takes time
        1. compute the vertexClustering
        2. serialize the vertexClustering

        the `step` flag indicates whether the pipeline starts from this function
        '''
        # why the vertexClustering is serialized: even if I computed the eigenvectors myself,
        # I would still have to construct the vertexClustering myself
        self.vertex_clustering = d.trace(
            self.g.community_leading_eigenvector)()
        print 'modularity is %s' % self.vertex_clustering.modularity
        # print self.vertex_clustering.membership
        print 'finish find community_leading_eigenvector'
        FSDao.write_pickle(self.expr_dir,
                           SQLDao.ce.properties['vertex_clustering_file_name'],
                           self.vertex_clustering)
        pass
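# A short sketch (python-igraph only, synthetic membership) of the point in the comment
# above: a VertexClustering can be constructed directly from a membership list, which is
# what one would have to do after computing the eigenvectors by hand.
def _sketch_clustering_from_membership():
    import igraph
    g = igraph.Graph.Famous('Zachary')
    membership = [0 if v < 17 else 1 for v in range(g.vcount())]  # fake two-way split
    vc = igraph.VertexClustering(g, membership)
    print 'modularity is %s' % vc.modularity
    return vc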