Пример #1
0
    def wrap_weibo_pages_urls(self, domain, page_id, num_page):
        """Build the set of weibo JS-query URLs for pages 1..num_page.

        Args:
            domain:   weibo domain identifier, forwarded to QueryFactory.
            page_id:  weibo page identifier, forwarded to QueryFactory.
            num_page: total number of pages to generate URLs for.

        Returns:
            set: union of all URLs produced by QueryFactory.weibo_js_query
            for each 1-based page number.
        """
        weibo_urls = set()
        # update() extends the set in place; the original used union(),
        # which allocates a brand-new set on every iteration.
        for page_num in range(1, num_page + 1):
            weibo_urls.update(
                QueryFactory.weibo_js_query(domain=domain,
                                            page_id=page_id,
                                            page_num=page_num))
        return weibo_urls
Пример #2
0
    def weibo_pages_num(self,response):
        """Scrapy callback: work out how many weibo pages the current user
        has, yield one Request per page, mark the user id as crawled in
        redis, and finally yield a request for the next user's main page.
        """
        #inspect_response(response,self)
        # NOTE(review): prefer `response is None`; also `yield` here emits the
        # generator returned by start_requests() (not its requests), and
        # exit(1) raises SystemExit inside this generator — verify intent.
        if response == None:
            yield self.start_requests()
            exit(1)

        # parse the current js response
        # NOTE(review): if weibo_parse is itself a generator callback, this
        # call only creates and discards the generator — confirm it runs.
        self.weibo_parse(response)

        login_user       =  response.meta['login_user']
        # load response in json form and get a soup over the html block
        html_block_soup  =  self.json_load_response(response)

        # get the tag containing the max num of page; the page count is the
        # first integer found inside its <a> text
        page_list_tag    =  html_block_soup.find('div',{'action-type':'feed_list_page_morelist'})
        # hard cap on how many pages are crawled per user
        MAX_PAGE_NUM     =  50
        if page_list_tag:
            total_num_pages  =  int(re.search(r'\d+',page_list_tag.a.string).group(0))
            if total_num_pages > MAX_PAGE_NUM:
                total_num_pages   =   MAX_PAGE_NUM
        else:
            # no pager tag found: assume a single page
            total_num_pages  =  1

        # wrap weibo page urls to crawl
        weibo_page_urls       =  self.wrap_weibo_pages_urls(domain=login_user['domain'], page_id=login_user['page_id'], num_page=total_num_pages )

        print '\n\n Number of user weibos pages: ',total_num_pages,'\n\n'

        # test part weibo parser
        # NOTE(review): unused — the request built from it below is commented out
        user_weibo_page_url   =  QueryFactory.weibo_js_query(domain = login_user['domain'], page_id = login_user['page_id'], page_num=2 )[0]

        # first request to get the total number of user weibos' pages
        #request = Request(url=user_weibo_page_url,callback=self.user_weibo_parse,meta={'login_user':login_user})
        #yield request


        # send requests contained in the weibo pages urls
        for page_url in weibo_page_urls:
            yield Request(url=page_url,callback=self.weibo_parse,meta={'login_user':login_user})

        # move the id from the "processing" list to the "crawled" list
        # (lrem removes all matching entries scanning from the tail)
        self.redis_server.lrem(self.ids_processing_name,login_user['toCrawl_user_id'],num=-1)
        self.redis_server.lpush(self.ids_crawled_name  ,login_user['toCrawl_user_id'])

        # TODO
        # pull the next user id to crawl from redis, if any
        next_uid  =  self.forward_crawling_redis()

        if next_uid:
            trypage_url = QueryFactory.mainpage_query(next_uid)
            mainpage_request = Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':next_uid})
            yield mainpage_request
        else:
            log.msg(' Queue is empty, task to terminate.',level=log.INFO)