Example #1
import os

# ut (HTML-fetching / pickling helpers) and cu (the per-user crawler,
# see Example #3) are project-local modules, assumed to be importable here.

def crawl_users():
    """
    Crawl all habrausers, enumerating habrakarma pages.
    """
    page_num = 1
    users = set()

    def flush_userbase():
        # Persist the collected user names, then crawl every user whose
        # per-user data file does not exist yet.
        ut.data2pickle(users, "../data/allusers.dat")
        for user in users:
            user = user.replace("\n", "")
            filename = "../data/users/" + user + "@.dat"
            if not os.path.exists(filename):
                print "crawling user: <%s>" % user
                user_data = cu.crawl_user(user)
                ut.data2pickle(user_data, filename)

    # The enumeration pass below is switched off ("if 0"); the user list
    # is loaded from the previously pickled allusers.dat instead.
    if 0:
        while True:
            url_to_parse = "http://habrahabr.ru/people/page%d/" % page_num
            root = ut.doc4url(url_to_parse)
            if not root:
                break
            items = root.xpath('//div[@class="username"]//a')
            print "Page = ", page_num
            if len(items) > 0:
                new_users = set([ut.unicodeanyway(node.text) for node in items])
                users.update(new_users)
            page_num += 1
    users = ut.pickle2data("../data/allusers.dat")
    # if page_num % 1000 == 0:
    #    flush_userbase()

    flush_userbase()
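
For reference, the ut helpers used throughout these examples are project-local and not shown on this page. A minimal sketch of what doc4url, data2pickle and pickle2data presumably look like, assuming urllib2 for fetching, lxml for parsing and cPickle for serialization (the names come from the examples; the bodies are an assumption, not the project's actual code):

import cPickle
import urllib2
import lxml.html

def doc4url(url):
    # Fetch url and return the parsed lxml document root, or None on failure.
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError:
        return None
    return lxml.html.fromstring(html)

def data2pickle(data, filename):
    # Serialize data to filename.
    with open(filename, 'wb') as f:
        cPickle.dump(data, f)

def pickle2data(filename):
    # Load previously pickled data back.
    with open(filename, 'rb') as f:
        return cPickle.load(f)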
Example #2
import os
from copy import deepcopy

# ut (HTML-fetching / pickling helpers) and ha (data-directory lookup) are
# project-local modules, assumed to be importable here.

def get_comment_tree(postpath):
    url_to_parse = 'http://habrahabr.ru/' + postpath
    root = ut.doc4url(url_to_parse)
    if not root:
        return None

    author = root.xpath('//div[@class="author"]/a')[0].text
    print author
    
    comment_root_tree = {}
    ##  Dictionary of nested dictionaries:
    ##  author
    ##  |→ comment author
    ##      |→ sub-comment author
    def dfs_process(node, tree):
        """
        Recursively walks down from node
        and fills the tree dictionary.
        """
        print node.get('id')
        # A leaf comment has no reply container; guard against IndexError.
        containers = node.xpath('.//div[@id="comments" or @class="reply_comments"]')
        if not containers:
            return
        comments = containers[0]
        for comment in comments.xpath('./div[@class="comment_item"]'):
            author = comment.xpath('.//a[@class="username"]')[0].text
            print author
            child_tree = {}
            dfs_process(comment, child_tree)
            tree[author] = deepcopy(child_tree)
    
    dfs_process(root, comment_root_tree)
    comment_tree = {author: comment_root_tree}
    print 'tree:', comment_tree
    
    thepostsdir = os.path.join(ha.get_data_dir(), 'posts')
    filename = os.path.join(thepostsdir, postpath) + '.dat'
    ut.createdir(os.path.split(filename)[0])
    ut.data2pickle(comment_tree, filename)
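
The result is a plain nested dictionary keyed by usernames, so it can be walked with the same recursion pattern used to build it. A small sketch of consuming such a tree (the sample data below is made up):

def print_tree(tree, depth=0):
    # Print each author indented by reply depth, then recurse into replies.
    for author, replies in tree.items():
        print('  ' * depth + author)
        print_tree(replies, depth + 1)

sample = {'post_author': {'alice': {'bob': {}}, 'carol': {}}}
print_tree(sample)
# post_author
#   alice
#     bob
#   carol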
Example #3
# ut (HTML-fetching helpers) and so (the SmartObject wrapper) are
# project-local modules, assumed to be importable here.

def crawl_user(username):
    """
     Crawl habrauser info,
     return dictionary with these attributes.
    """
    url_to_parse = 'http://habrahabr.ru/users/' + username + '/'
    root = ut.doc4url(url_to_parse)

    def get_set(css_class_name, set_num=0):
        """
        Find in the page list of some hyperlinked properties
        (such as friends, interests, etc)
        and return a set of them.
        """
        if not root:
            return None
        item = root.xpath('//dl[@class="%s"]/dd' % css_class_name)
        if len(item) <= set_num:
            return None
        sets_node = item[set_num]
        item_set = set(ut.unicodeanyway(node.text).replace('\n', '')
                       for node in sets_node.xpath('.//a')
                       if node.text is not None)
        return item_set

    user = so.SmartObject({
        'interests': get_set('interests'),
        'companies': get_set('companies_list'),
        'friends': get_set('friends_list'),
        'hubs': get_set('hubs_list'),
        # 'invitees' reuses the friends_list markup: it is the second
        # such block on the profile page (set_num=1).
        'invitees': get_set('friends_list', 1)
    })
    return user
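
SmartObject comes from the project-local so module and is not shown here; judging by its use, it presumably wraps a dict so that keys become attributes. A minimal sketch under that assumption, with a hypothetical username in the usage line:

class SmartObject(object):
    # Expose the keys of a dict as object attributes.
    def __init__(self, attrs):
        self.__dict__.update(attrs)

user = crawl_user('some_user')          # hypothetical username
if user.interests:                      # get_set() may return None
    print(', '.join(sorted(user.interests)))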