def crawl_comment_page(mid):
    limit = get_max_comment_page()
    cur_page = 1
    next_url = ''
    while cur_page <= limit:
        cur_time = int(time.time() * 1000)
        if cur_page == 1:
            url = start_url.format(mid, cur_time)
        else:
            url = base_url.format(next_url, cur_time)
        html = get_page(url, user_verify=False)
        comment_datas = comment.get_comment_list(html, mid)
        if not comment_datas and cur_page == 1:
            crawler.warning('Failed to crawl comments for weibo id {}, please investigate'.format(mid))
            return
        save_comments(comment_datas)
        # Each iteration depends on the previous response (next_url is parsed
        # from it), so this loop is not a good fit for distributed task
        # dispatch over the network; it would mainly be a hassle.
        next_url = comment.get_next_url(html)
        if not next_url:
            crawler.info('Comment crawling for weibo {} is complete'.format(mid))
            return
        cur_page += 1
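# The loop above is inherently sequential: each next_url is parsed out of the
# previous response, so pages cannot be fetched independently. Below is a
# minimal, self-contained sketch of that link-chained pagination pattern;
# the weibo.cn host and the '下一页' ("next page") regex are illustrative
# assumptions, not the project's actual comment.get_next_url logic.
import re
import requests

def follow_comment_pages(first_url, max_pages=50):
    """Yield successive comment-page html, following next-page links."""
    url = first_url
    for _ in range(max_pages):
        html = requests.get(url, timeout=10).text
        yield html
        # Find an anchor whose text is '下一页' ("next page"); the actual
        # markup on weibo.cn may differ, so this pattern is an assumption.
        match = re.search(r'href="([^"]+)">下一页', html)
        if not match:
            break  # no next-page link: pagination is exhausted
        url = 'https://weibo.cn' + match.group(1)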
def crawl_comment_by_page(mid, page_num):
    cur_time = int(time.time() * 1000)
    cur_url = base_url.format(mid, page_num, cur_time)
    html = get_page(cur_url, user_verify=False)
    comment_datas = comment.get_comment_list(html, mid)
    save_comments(comment_datas)
    if page_num == 1:
        # Mark the weibo's comments as crawled only once, when the first
        # page is handled, instead of re-marking it on every page.
        wb_data.set_weibo_comment_crawled(mid)
    return html
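# Because crawl_comment_by_page returns the raw html, a caller can decide
# whether further pages remain. A hedged sketch of such a driver follows;
# crawl_all_comment_pages is a hypothetical helper, not part of the project.
def crawl_all_comment_pages(mid):
    page_num = 1
    while True:
        html = crawl_comment_by_page(mid, page_num)
        # Stop once no next-page link is found in the returned html.
        if not comment.get_next_url(html):
            break
        page_num += 1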