def crawl_comment_page(mid):
    limit = get_max_comment_page()
    cur_page = 1
    next_url = ''
    while cur_page <= limit:
        cur_time = int(time.time() * 1000)
        if cur_page == 1:
            url = start_url.format(mid, cur_time)
        else:
            url = base_url.format(next_url, cur_time)
        html = get_page(url, user_verify=False)
        comment_datas = comment.get_comment_list(html, mid)

        if not comment_datas and cur_page == 1:
            crawler.warning('Failed to crawl comments for weibo {}, please check why'.format(mid))
            return

        save_comments(comment_datas)
        # Each page is reached through the next_url parsed from the previous
        # page, so this loop can't be fanned out as task-queue (network) calls
        # (mainly because that would be too cumbersome)
        next_url = comment.get_next_url(html)

        if not next_url:
            crawler.info('Comment crawling for weibo {} is complete'.format(mid))
            return
        cur_page += 1
def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    # Call locally rather than through the task queue so we get the first
    # page's result immediately
    first_page = crawl_comment_by_page(mid, 1)
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num),
                      queue='comment_page_crawler', routing_key='comment_page_info')
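For the dispatch above to work, tasks.comment.crawl_comment_by_page must be registered as a Celery task on workers consuming the comment_page_crawler queue. That module isn't shown here, so the following is only a minimal sketch under that assumption: the broker URL and page_url are placeholders, and get_page, comment, and save_comments are the crawler's own helpers as used in the first version.

from celery import Celery

app = Celery('tasks', broker='redis://localhost:6379/0')  # broker URL is an assumption

@app.task(name='tasks.comment.crawl_comment_by_page')
def crawl_comment_by_page(mid, page_num):
    # Fetch and persist one page of comments; returning the html lets the
    # local caller parse the total page count from page 1.
    html = get_page(page_url.format(mid, page_num), user_verify=False)  # page_url is hypothetical
    save_comments(comment.get_comment_list(html, mid))
    return html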
def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    first_page = crawl_comment_by_page(mid, 1, db_session)[0]
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        # app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num),
        #               queue='comment_page_crawler', routing_key='comment_page_info')
        Tasks.push_task(1, mid, page_num)
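Here Tasks.push_task replaces the Celery dispatch with the project's own queue. Its implementation isn't shown, but conceptually it just serializes a task type and its arguments onto a shared queue for workers to pop. A hypothetical sketch backed by a Redis list; every name and the connection default here is an assumption, not the project's actual API:

import json
import redis

class Tasks:
    _redis = redis.Redis()  # connection parameters are assumptions

    @classmethod
    def push_task(cls, task_type, *args):
        # Serialize the task onto a per-type queue; workers would pop from
        # 'task_queue:1' and call crawl_comment_by_page with the arguments.
        cls._redis.rpush('task_queue:{}'.format(task_type), json.dumps(args))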
def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    # Call locally so we get the first page's result immediately
    first_page = crawl_comment_by_page(mid, 1)[0]
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        crawl_comment_by_page(mid, page_num)
        time.sleep(3)