def crawl_repost_page(mid, uid):
    """Crawl the repost pages of weibo ``mid`` and save the collected reposts.

    The first page is always fetched. Its first element is handed to
    ``repost.get_total_page`` to learn how many pages exist, and its second
    element is the collection that accumulates the repost records. Further
    pages are fetched up to the smaller of the total page count and the
    value of ``get_max_repost_page()``.

    Every repost whose parent user name cannot be resolved to a uid by
    ``IdNames.fetch_uid_by_name`` is attributed to the root user looked up
    from ``uid``.

    :param mid: id of the weibo whose reposts are crawled
    :param uid: id of the root user, passed to ``user_get.get_profile``
    :return: None; returns early without saving when the first page
        yields no reposts
    """
    page_cap = get_max_repost_page() + 1
    first_page = crawl_repost_by_page(mid, 1)
    page_count = repost.get_total_page(first_page[0])
    reposts = first_page[1]
    if not reposts:
        return

    root_user, _ = user_get.get_profile(uid)

    # Exclusive upper bound for range(): never go past the last real page.
    stop = page_count + 1 if page_count < page_cap else page_cap

    for page_no in range(2, stop):
        page_reposts = crawl_repost_by_page(mid, page_no)[1]
        if not page_reposts:
            continue
        reposts.extend(page_reposts)

    for position, item in enumerate(reposts):
        parent_uid = IdNames.fetch_uid_by_name(item.parent_user_name)
        if parent_uid:
            item.parent_user_id = parent_uid
        else:
            # Lookup failed: fall back to the root user's uid and name.
            item.parent_user_id = root_user.uid
            item.parent_user_name = root_user.name
        reposts[position] = item

    weibo_repost.save_reposts(reposts)
def crawl_repost_page(mid, uid):
    """Crawl the repost pages of weibo ``mid`` and save the collected reposts.

    The first page is fetched, then ``wb_data.set_weibo_repost_crawled(mid)``
    is called. The first element of the first-page result is handed to
    ``repost.get_total_page`` to learn how many pages exist, and its second
    element is the collection that accumulates the repost records. Further
    pages are fetched up to the smaller of the total page count and the
    value of ``get_max_repost_page()``.

    Every repost whose parent user name cannot be resolved to a uid by
    ``IdNames.fetch_uid_by_name`` is attributed to the root user looked up
    from ``uid``.

    :param mid: id of the weibo whose reposts are crawled
    :param uid: id of the root user, passed to ``user_get.get_profile``
    :return: None; returns early without saving when the first page
        yields no reposts
    """
    page_cap = get_max_repost_page() + 1
    first_page = crawl_repost_by_page(mid, 1)
    wb_data.set_weibo_repost_crawled(mid)
    page_count = repost.get_total_page(first_page[0])
    reposts = first_page[1]
    if not reposts:
        return

    root_user = user_get.get_profile(uid)

    # Exclusive upper bound for range(): never go past the last real page.
    stop = page_count + 1 if page_count < page_cap else page_cap

    # TODO: weigh whether dispatching each page as a network (task) call is
    # actually necessary here.
    for page_no in range(2, stop):
        # app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
        #               routing_key='comment_page_info')
        page_reposts = crawl_repost_by_page(mid, page_no)[1]
        if not page_reposts:
            continue
        reposts.extend(page_reposts)

    # Fill in the parent user_id on each repost to make visualization easier.
    for position, item in enumerate(reposts):
        parent_uid = IdNames.fetch_uid_by_name(item.parent_user_name)
        if parent_uid:
            item.parent_user_id = parent_uid
        else:
            # Fall back to the root user's uid and user name.
            item.parent_user_id = root_user.uid
            item.parent_user_name = root_user.name
        reposts[position] = item

    weibo_repost.save_reposts(reposts)
def test_get_total_repost_page(self):
    """
    Test getting the total number of repost pages.

    Reads the saved page ``tests/repost.html`` and checks that
    ``repost.get_total_page`` reports 1580 pages for it.
    :return:
    """
    from page_parse import repost

    with open('tests/repost.html', encoding='utf-8') as fixture:
        page_source = fixture.read()

    self.assertEqual(repost.get_total_page(page_source), 1580)
def test_get_total_repost_page(self):
    """
    Test getting the total number of repost pages.

    Downloads ``repost.html`` from ``TEST_SERVER`` and checks that
    ``repost.get_total_page`` reports 1580 pages for it.
    :return:
    :raises requests.RequestException: if the test server cannot be
        reached within the timeout or answers with an HTTP error status
    """
    from page_parse import repost

    url = TEST_SERVER + 'repost.html'
    # Bound the wait so an unreachable test server fails the test instead
    # of hanging the whole suite.
    resp = requests.get(url, timeout=10)
    # Fail on the HTTP error itself rather than on a confusing page-count
    # mismatch produced by parsing an error page.
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    html = resp.text
    total_page = repost.get_total_page(html)
    self.assertEqual(total_page, 1580)