예제 #1
0
파일: repost.py 프로젝트: zjlx/WeiboSpider
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    wb_data.set_weibo_repost_crawled(mid)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user = user_get.get_profile(uid)

    if total_page < limit:
        limit = total_page + 1
    # todo 这里需要衡量是否有用网络调用的必要性
    for page_num in range(2, limit):
        # app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
        #               routing_key='comment_page_info')
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    # 补上user_id,方便可视化
    for index, repost_obj in enumerate(repost_datas):
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # 设置成根用户的uid和用户名
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id
        repost_datas[index] = repost_obj

    weibo_repost.save_reposts(repost_datas)
예제 #2
0
def crawl_repost_by_page(mid, page_num):
    cur_url = base_url.format(mid, page_num)
    html = get_page(cur_url, user_verify=False)
    repost_datas = repost.get_repost_list(html, mid)
    if page_num == 1:
        wb_data.set_weibo_repost_crawled(mid)
    return html, repost_datas
예제 #3
0
def crawl_repost_by_page(mid, page_num):
    cur_url = base_url.format(mid, page_num)
    html = get_page(cur_url, user_verify=False)
    repost_datas = repost.get_repost_list(html, mid)
    if page_num == 1:
        wb_data.set_weibo_repost_crawled(mid)
    return html, repost_datas