def wrap_weibo_pages_urls(self, domain, page_id, num_page):
    """Gather the weibo page URLs for pages 1 through ``num_page``.

    For every page number, ``QueryFactory.weibo_js_query`` is asked for the
    URLs of that page and the results are merged into a single set, so
    duplicate URLs are collapsed.

    Args:
        domain: forwarded to ``QueryFactory.weibo_js_query`` as ``domain``.
        page_id: forwarded to ``QueryFactory.weibo_js_query`` as ``page_id``.
        num_page: number of pages to cover; page numbers are 1-based.

    Returns:
        A set with every URL returned for the requested pages (empty when
        ``num_page`` is 0).
    """
    collected_urls = set()
    for page_number in range(1, num_page + 1):
        page_urls = QueryFactory.weibo_js_query(
            domain=domain,
            page_id=page_id,
            page_num=page_number,
        )
        collected_urls.update(page_urls)
    return collected_urls
def weibo_pages_num(self,response): #inspect_response(response,self) if response == None: yield self.start_requests() exit(1) # pares the current js response self.weibo_parse(response) login_user = response.meta['login_user'] # load response in json form html_block_soup = self.json_load_response(response) # get the tag containing the max num of page page_list_tag = html_block_soup.find('div',{'action-type':'feed_list_page_morelist'}) MAX_PAGE_NUM = 50 if page_list_tag: total_num_pages = int(re.search(r'\d+',page_list_tag.a.string).group(0)) if total_num_pages > MAX_PAGE_NUM: total_num_pages = MAX_PAGE_NUM else: total_num_pages = 1 # warp weibo page urls to crawl weibo_page_urls = self.wrap_weibo_pages_urls(domain=login_user['domain'], page_id=login_user['page_id'], num_page=total_num_pages ) print '\n\n Number of user weibos pages: ',total_num_pages,'\n\n' # test part weibo parser user_weibo_page_url = QueryFactory.weibo_js_query(domain = login_user['domain'], page_id = login_user['page_id'], page_num=2 )[0] # first request to get the total number of user weibos' pages #request = Request(url=user_weibo_page_url,callback=self.user_weibo_parse,meta={'login_user':login_user}) #yield request # send requests contained in the weibo pages urls for page_url in weibo_page_urls: yield Request(url=page_url,callback=self.weibo_parse,meta={'login_user':login_user}) # insert id crawled into redis self.redis_server.lrem(self.ids_processing_name,login_user['toCrawl_user_id'],num=-1) self.redis_server.lpush(self.ids_crawled_name ,login_user['toCrawl_user_id']) # TODO next_uid = self.forward_crawling_redis() if next_uid: trypage_url = QueryFactory.mainpage_query(next_uid) mainpage_request = Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':next_uid}) yield mainpage_request else: log.msg(' Queue is empty, task to terminate.',level=log.INFO)