# Example #1
# 0
def crawl_users(recrawl_list=False):
    """
    Crawl all habrausers, enumerating habrakarma pages.

    Loads the previously saved user set from ../data/allusers.dat and
    crawls per-user data for every user that has no cached file yet.

    :param recrawl_list: when True, re-enumerate users from the
        habrahabr.ru people pages and merge them into the cached set
        before crawling.  Defaults to False, which matches the original
        behaviour (the enumeration loop was disabled with ``if 0:``).
    """
    # Cached user set saved by a previous run.
    # NOTE(review): assumes ut.pickle2data returns the set object that
    # ut.data2pickle stored below — confirm against the ut module.
    users = ut.pickle2data("../data/allusers.dat")

    def flush_userbase():
        # Persist the current user set, then crawl data for every user
        # whose per-user cache file does not exist yet.
        ut.data2pickle(users, "../data/allusers.dat")
        for user in users:
            user = user.replace("\n", "")
            filename = "../data/users/" + user + "@.dat"
            if not os.path.exists(filename):
                print ("crawling user: <%s>" % user)
                user_data = cu.crawl_user(user)
                ut.data2pickle(user_data, filename)

    if recrawl_list:
        # Walk the paginated people listing until a page fails to load,
        # merging every user name found into the cached set (the original
        # unconditionally overwrote the enumerated set with the pickle,
        # discarding the enumeration results).
        page_num = 1
        while True:
            url_to_parse = "http://habrahabr.ru/people/page%d/" % page_num
            root = ut.doc4url(url_to_parse)
            if not root:
                break
            items = root.xpath('//div[@class="username"]//a')
            print ("Page = %d" % page_num)
            if items:
                users.update(ut.unicodeanyway(node.text) for node in items)
            page_num += 1

    flush_userbase()
# Example #2
# 0
 def get_set(css_class_name, set_num=0):
     """
     Find in the page list of some hyperlinked properties
     (such as friends, interests, etc)
     and return a set of them.

     :param css_class_name: CSS class of the <dl> element to inspect.
     :param set_num: index of the matching <dd> node to read when the
         page contains several (defaults to the first).
     :return: set of unicode item names with newlines stripped, or
         None when the page is missing or has no matching node.
     """
     # `root` is a free variable from the enclosing scope — presumably
     # the parsed document of the user page (falsy when the fetch
     # failed); verify against the enclosing function.
     if not root:
         return None
     nodes = root.xpath('//dl[@class="%s"]/dd' % css_class_name)
     if len(nodes) <= set_num:
         return None
     # Collect the hyperlinked item names, skipping empty anchors.
     return {ut.unicodeanyway(node.text).replace('\n', '')
             for node in nodes[set_num].xpath('.//a')
             if node.text is not None}