Пример #1
0
def smoke_test():
    """
    Minimal end-to-end check of the user crawler.

    Crawls one hard-coded user with ``crawl_user``, prints the result and
    pickles it to ``../data/users/<name>.dat`` via ``ut.data2pickle``.
    """
    login = '******'
    crawled = crawl_user(login)
    print(crawled)
    # Same relative location as before: one ".dat" file per user name.
    target = ''.join(['../data/users/', login, '.dat'])
    ut.data2pickle(crawled, target)
Пример #2
0
 def flush_userbase():
     """
     Pickle the ``users`` collection and crawl users that have no file yet.

     ``users`` is taken from the surrounding scope (not defined here).
     The whole collection is first written to ``../data/allusers.dat``;
     then, for each name with newlines removed, the user is crawled and
     pickled only if its per-user file does not already exist.
     """
     ut.data2pickle(users, "../data/allusers.dat")
     for raw_name in users:
         name = raw_name.replace("\n", "")
         target = "../data/users/" + name + "@.dat"
         if os.path.exists(target):
             # Already on disk: do not crawl this user again.
             continue
         print("crawling user: <%s>" % name)
         ut.data2pickle(cu.crawl_user(name), target)
Пример #3
0
def reload_users():
    """
    Re-save every stored user wrapped in ``so.SmartObject``.

    Each regular file in ``USERS_DIR`` is loaded with ``ut.pickle2data``,
    wrapped in ``so.SmartObject`` and pickled back to the same path with
    ``ut.data2pickle``, overwriting the previous contents.  Entries that
    are not regular files (e.g. sub-directories) are skipped.

    NOTE(review): running this twice wraps already-converted objects again
    unless ``so.SmartObject`` handles that itself -- confirm.
    """
    for filename in os.listdir(USERS_DIR):
        path = os.path.join(USERS_DIR, filename)
        if not os.path.isfile(path):
            # os.listdir also returns directories; those are not pickles.
            continue
        user = ut.pickle2data(path)
        ut.data2pickle(so.SmartObject(user), path)
Пример #4
0
def get_comment_tree(postpath):
    """
    Download a post page and pickle its tree of comment authors.

    The tree is a dict of nested dicts keyed by author name::

        {post_author: {comment_author: {reply_author: {...}}}}

    It is printed and then written with ``ut.data2pickle`` to
    ``<ha.get_data_dir()>/posts/<postpath>.dat``; the target directory is
    created first with ``ut.createdir``.

    :param postpath: post path appended to ``http://habrahabr.ru/``; it is
        also used as the relative file name of the pickle.
    :return: always ``None`` -- the result is only stored on disk.  Nothing
        is written when ``ut.doc4url`` yields a falsy document.
    :raises IndexError: if the page has no ``div.author`` link, or a comment
        has no ``a.username`` link (``[0]`` on an empty result; assumes
        ``xpath`` returns a list, as lxml does).
    """
    url_to_parse = 'http://habrahabr.ru/' + postpath
    root = ut.doc4url(url_to_parse)
    # NOTE(review): if ``root`` is an lxml element, truth-testing reflects
    # whether it has children, not whether it is None -- confirm what
    # ``ut.doc4url`` returns on failure before tightening this check.
    if not root:
        return None

    author = root.xpath('//div[@class="author"]/a')[0].text
    print(author)

    # Dict of nested dicts:
    #   post author
    #   |-> comment author
    #       |-> sub-comment author
    comment_root_tree = {}

    def dfs_process(node, tree):
        """
        Recursively descend from ``node`` and fill the dict-tree ``tree``.
        """
        print(node.get('id'))
        containers = node.xpath(
            './/div[@id="comments" or @class="reply_comments"]')
        if not containers:
            # No comments container below this node: treat it as a leaf
            # instead of failing on ``[0]``.
            return
        for comment in containers[0].xpath('./div[@class="comment_item"]'):
            commenter = comment.xpath('.//a[@class="username"]')[0].text
            print(commenter)
            # NOTE: entries are keyed by author name, so several comments
            # by one author on the same level collapse into a single entry
            # (the last one wins).
            child_tree = {}
            dfs_process(comment, child_tree)
            # ``child_tree`` is freshly built and referenced nowhere else,
            # so it can be stored directly; copying it at every level only
            # repeated work for deep threads.
            tree[commenter] = child_tree

    dfs_process(root, comment_root_tree)
    comment_tree = {author: comment_root_tree}
    print('tree: %s' % (comment_tree,))

    thepostsdir = os.path.join(ha.get_data_dir(), 'posts')
    filename = os.path.join(thepostsdir, postpath) + '.dat'
    ut.createdir(os.path.split(filename)[0])
    ut.data2pickle(comment_tree, filename)