def build_friends_edgelist():
    """
    Build the friends graph from the full habrauser DB and store it
    as an edge list (a format that is easy to read from igraph).

    Every file in the users directory is treated as one user. A user's
    vertex id is the position of its file in the sorted directory
    listing; the login is the part of the file name before the first
    "." with any "@" removed.

    For every user, one line "<userid> <friend_id>" is written to
    <graph_dir>/friends.edgelist for each entry of ``user.friends``
    that matches a known login. Friends with no user file are skipped.
    """
    users_dir = ha.get_users_dir()
    user_files_list = sorted(os.listdir(users_dir))

    # First pass: assign an integer id to every login.
    login2id = {}
    for userid, filename in enumerate(user_files_list):
        login = filename.split(".")[0].replace("@", "")
        login2id[login] = userid

    edgefile_path = os.path.join(ha.get_graph_dir(), "friends.edgelist")

    # Second pass: emit the edges. The 'with' block guarantees the file is
    # closed even if loading one of the user files raises.
    with open(edgefile_path, "w") as edgefile:
        for userid, filename in enumerate(user_files_list):
            user = so.SmartObject(
                ut.pickle2data(os.path.join(users_dir, filename)))

            if not user.friends:
                continue

            for friend in user.friends:
                if friend in login2id:
                    edgefile.write("%d %d\n" % (userid, login2id[friend]))
# Example #2
0
def analyze_friends():
    """
    Load the friends graph and report its degree statistics.

    Reads <graph_dir>/friends.edgelist as an undirected graph, prints the
    graph itself followed by the median and the mean vertex degree, and
    plots a histogram of the degrees (bin width 2) to
    <reports_dir>/habrafriends-degrees-distribution.svg.
    """
    # pylint: disable=E1101
    edgelist_path = os.path.join(ha.get_graph_dir(), 'friends.edgelist')
    thegraph = Graph.Read_Edgelist(edgelist_path, directed=False)
    print("Graph loaded")
    print(thegraph)

    degrees = thegraph.vs.degree()
    print(numpy.median(degrees))
    print(numpy.mean(degrees))

    degree_hist = Histogram(bin_width=2)
    # '<<' feeds all degree values into the histogram.
    degree_hist << degrees

    report_path = os.path.join(ha.get_reports_dir(),
                               "habrafriends-degrees-distribution.svg")
    plot(degree_hist, report_path)
# Example #3
0
def cut_thread(postpath):
    """
    Split the participants of a comment thread into two "parties".

    Loads the pickled comment tree stored at
    <data_dir>/posts/<postpath>.dat, builds a symmetric matrix counting
    the reply links between each pair of logins, passes that matrix to
    greedy_max_cut (a MAX-CUT problem) and prints the two resulting
    groups as wiki-style links.

    The tree is a nested dict: each key is a login and each value is the
    dict of logins that replied to it.

    postpath -- path of the post relative to http://habrahabr.ru/; it is
                also the relative path of the pickled tree, without the
                '.dat' extension.
    """
    thepostsdir = os.path.join(ha.get_data_dir(), 'posts')
    filename = os.path.join(thepostsdir, postpath) + '.dat'
    thread_tree = ut.pickle2data(filename)

    login2id = {}

    def walk_tree_for_login2id(subtree):
        # Give every login a dense integer id in order of first visit.
        for login in subtree:
            if login not in login2id:
                login2id[login] = len(login2id)
            walk_tree_for_login2id(subtree[login])

    walk_tree_for_login2id(thread_tree)

    N = len(login2id)
    weights = np.zeros((N, N), dtype=np.int16)

    def walk_tree_for_weights(root_login, subtree):
        # Add one unit of weight for every parent -> reply link.
        u = login2id[root_login]
        for login in subtree:
            v = login2id[login]
            if u != v:
                weights[u, v] += 1
                weights[v, u] += 1
            # Always descend, even when somebody replied to themselves:
            # only the self-loop is skipped, not the replies below it.
            walk_tree_for_weights(login, subtree[login])

    for root_login in thread_tree:
        walk_tree_for_weights(root_login, thread_tree[root_login])

    id2login = dict((uid, login) for login, uid in login2id.items())

    # Presumably a per-vertex vector of signs; verify against
    # greedy_max_cut. Only entries > 0 and < 0 are reported below.
    y = greedy_max_cut(weights)

    def print_habrauser(uid):
        login = id2login[uid]
        print('* [http://' + login + '.habrahabr.ru ' + login + ']')

    print("Analysis of http://habrahabr.ru/" + postpath)
    print("----")
    print("Party 1")
    for i in xrange(N):
        if y[i] > 0:
            print_habrauser(i)

    print("----")

    print("Party 2")
    for i in xrange(N):
        if y[i] < 0:
            print_habrauser(i)

    print("----")
# Example #4
0
def get_comment_tree(postpath):
    """
    Download a habrahabr post and pickle its comment tree.

    The page 'http://habrahabr.ru/' + postpath is loaded with ut.doc4url
    and walked recursively. The result is a dict of nested dicts:

        post author
        |-- comment author
            |-- sub-comment author

    It is saved to <data_dir>/posts/<postpath>.dat; the target directory
    is created with ut.createdir first.

    When the same login comments several times under the same parent,
    the replies to all of those comments are merged into one subtree
    instead of the last comment replacing the earlier ones.

    Logins, node ids and the final tree are printed to stdout as the
    page is processed.

    Always returns None; if the page could not be loaded, nothing is
    written. An IndexError is raised when the author link or a comments
    container is not found in the markup.
    """
    url_to_parse = 'http://habrahabr.ru/' + postpath
    root = ut.doc4url(url_to_parse)
    if not root:
        return None

    author = root.xpath('//div[@class="author"]/a')[0].text
    print(author)

    comment_root_tree = {}

    def dfs_process(node, tree):
        """
        Recursively descend from node and fill the dict-tree `tree`
        with the authors of the comments found below it.
        """
        print(node.get('id'))
        comments = node.xpath(
            './/div[@id="comments" or @class="reply_comments"]')[0]
        for comment in comments.xpath('./div[@class="comment_item"]'):
            comment_author = comment.xpath('.//a[@class="username"]')[0].text
            print(comment_author)
            # setdefault reuses the subtree of an earlier comment by the
            # same login, so repeated commenters do not lose replies.
            dfs_process(comment, tree.setdefault(comment_author, {}))

    dfs_process(root, comment_root_tree)
    comment_tree = {author: comment_root_tree}
    print('tree: %s' % (comment_tree,))

    thepostsdir = os.path.join(ha.get_data_dir(), 'posts')
    filename = os.path.join(thepostsdir, postpath) + '.dat'
    ut.createdir(os.path.split(filename)[0])
    ut.data2pickle(comment_tree, filename)