def crawl_praise_page(mid):
    """Crawl the first praise page of a weibo, then follow-up pages 2-4.

    The first page is fetched and parsed here; its praise records are stored
    with ``PraiseOper.add_all`` and the weibo is marked as praise-crawled.
    Follow-up pages are fetched one after another through
    ``crawl_praise_by_page``, feeding the ``ext_param`` produced by one page
    into the request for the next.

    Crawling stops (with an error log) as soon as no ``ext_param`` is
    available for the next page, instead of issuing further requests with an
    empty value.

    NOTE(review): this function name is defined more than once in this file;
    the later definition replaces the earlier one at import time.

    :param mid: id of the weibo whose praise list is crawled
    :return: None
    """
    # Call locally (rather than dispatching a task) so the result is
    # available immediately.
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    WbDataOper.set_weibo_praise_crawled(mid)

    # why no app.send_task and fall back to sequential execution:
    # because weibo praise now requires a parameter called max_id
    # and a request without it returns something different from a normal
    # browser.

    # should work after 5
    # TODO: consider retrying instead of returning when ext_param is missing
    for page in range(2, 5):
        if not ext_param:
            # For page 2 this produces exactly the original message.
            crawler.error(
                'fail to get praise page {page} ext_param, mid is {mid}'.format(
                    page=page, mid=mid))
            return
        # ext_param (mainly max_id) is updated each time and used for the
        # next request.
        # NOTE(review): expects crawl_praise_by_page to return a 3-item
        # result whose last item is the next ext_param - confirm against the
        # definition actually in use.
        _, _, ext_param = crawl_praise_by_page(mid, ext_param)
def crawl_praise_by_page(mid, page_num):
    """Fetch one praise page of a weibo, store its records and return them.

    The page URL is built from ``BASE_URL`` with ``mid``, ``page_num`` and the
    current time in milliseconds. The parsed result of
    ``praise.get_praise_list`` is stored through ``PraiseOper.add_all``; when
    ``page_num`` is 1 the weibo is also marked as praise-crawled.

    If ``SoftTimeLimitExceeded`` is raised while fetching or parsing, the
    error is logged and the same page is retried; the retried call's result
    is returned directly.

    NOTE(review): the retry is unbounded - repeated timeouts recurse again
    each time; confirm whether a retry cap is wanted.

    :param mid: id of the weibo whose praise list is crawled
    :param page_num: page to fetch
    :return: tuple ``(html, praise_datas)`` - the fetched page and whatever
        ``praise.get_praise_list`` returned for it
    """
    try:
        cur_time = int(time.time() * 1000)
        cur_url = BASE_URL.format(mid, page_num, cur_time)
        html = get_page(cur_url, auth_level=2, is_ajax=True)
        praise_datas = praise.get_praise_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "praise SoftTimeLimitExceeded mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        # Return the retry's result. Falling through would reach add_all()
        # with `praise_datas` (and possibly `html`) unbound, and the retried
        # call has already stored its data.
        return crawl_praise_by_page(mid, page_num)

    PraiseOper.add_all(praise_datas)
    if page_num == 1:
        WbDataOper.set_weibo_praise_crawled(mid)
    return html, praise_datas
def crawl_praise_page(mid):
    """Crawl the first praise page of a weibo, then follow-up pages 2-4.

    The first page is fetched and parsed here; its praise records are stored
    with ``PraiseOper.add_all`` and the weibo is marked as praise-crawled.
    Follow-up pages are fetched sequentially through
    ``crawl_praise_by_page``; each call receives the ``ext_param`` obtained
    from the previous page.

    Crawling stops (with an error log) as soon as no ``ext_param`` is
    available for the next page, instead of issuing further requests with an
    empty value.

    NOTE(review): this function name is defined more than once in this file;
    the later definition replaces the earlier one at import time.

    :param mid: id of the weibo whose praise list is crawled
    :return: None
    """
    # Call locally (rather than dispatching a task) so the result is
    # available immediately.
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    WbDataOper.set_weibo_praise_crawled(mid)

    # why no app.send_task and fall back to sequential execution:
    # because weibo praise now requires a parameter called max_id
    # and a request without it returns something different from a normal
    # browser.

    # should work after 5
    # TODO: consider retrying instead of returning when ext_param is missing
    for page in range(2, 5):
        if not ext_param:
            # For page 2 this produces exactly the original message.
            crawler.error(
                'fail to get praise page {page} ext_param, mid is {mid}'.format(
                    page=page, mid=mid))
            return
        # ext_param (mainly max_id) is updated each time and used for the
        # next request.
        # NOTE(review): expects crawl_praise_by_page to return a 3-item
        # result whose last item is the next ext_param - confirm against the
        # definition actually in use.
        _, _, ext_param = crawl_praise_by_page(mid, ext_param)