Example #1
import time

def crawl_comment_page(mid):
    limit = get_max_comment_page()
    cur_page = 1
    next_url = ''
    while cur_page <= limit:
        cur_time = int(time.time()*1000)
        if cur_page == 1:
            url = start_url.format(mid, cur_time)
        else:
            url = base_url.format(next_url, cur_time)
        html = get_page(url, user_verify=False)
        comment_datas = comment.get_comment_list(html, mid)

        if not comment_datas and cur_page == 1:
            crawler.warning('Comments for weibo id {} were not crawled successfully, please investigate'.format(mid))
            return

        save_comments(comment_datas)
        # Each step here depends on the result of the previous one, so a network
        # (task-queue) call is a poor fit (mainly because it would be cumbersome)
        next_url = comment.get_next_url(html)

        if not next_url:
            crawler.info('Finished crawling comments for weibo {}'.format(mid))
            return
        cur_page += 1
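This version walks the comment pages sequentially, because each request needs the next-page URL extracted from the previous response. As a plausible sketch of what comment.get_next_url might do, assuming the weibo.cn mobile markup exposes a 下页 ('next page') anchor (the regex and anchor text are assumptions, not the project's actual parser):

import re

def get_next_url(html):
    # Assumed markup: weibo.cn paginates with <a href="...">下页</a> ('next page').
    # Return that anchor's href, or '' on the last page so the caller stops.
    match = re.search(r'<a href="([^"]+)">下页</a>', html)
    return match.group(1) if match else ''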
Example #2
def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    # Call locally here so the return value is available immediately
    first_page = crawl_comment_by_page(mid, 1)
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
                      routing_key='comment_page_info')
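Example #2 crawls page 1 locally to learn the total page count, then fans the remaining pages out over Celery with app.send_task. For that dispatch to work, a worker must have a task registered under the name 'tasks.comment.crawl_comment_by_page'; a minimal sketch of that wiring, with an assumed Redis broker URL (the real project configures Celery elsewhere), might look like:

from celery import Celery

# Assumed broker URL; swap in the project's actual configuration.
app = Celery('weibo_tasks', broker='redis://localhost:6379/0')

@app.task(name='tasks.comment.crawl_comment_by_page')
def crawl_comment_by_page(mid, page_num):
    # Fetch and persist one page of comments for the given weibo id.
    ...

Because send_task dispatches by task name, the sender does not need to import the task function itself, only to agree on the registered name, queue, and routing key.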
Example #3
def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    first_page = crawl_comment_by_page(mid, 1, db_session)[0]
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        # app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
        #               routing_key='comment_page_info')
        Tasks.push_task(1, mid, page_num)
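Example #3 swaps the Celery call for a custom Tasks.push_task abstraction (the old app.send_task call is left commented out). The real Tasks class is not shown; a hypothetical Redis-backed version, where 1 is a task-type tag and the key name, host, port, and db are all assumptions, could look like:

import json
import redis

class Tasks:
    # Hypothetical queue backend; connection details are assumptions.
    _conn = redis.StrictRedis(host='localhost', port=6379, db=0)

    @classmethod
    def push_task(cls, task_type, mid, page_num):
        # LPUSH a JSON payload; a worker loop would BRPOP from the same
        # list and dispatch on the 'type' field.
        cls._conn.lpush('comment_page_tasks',
                        json.dumps({'type': task_type, 'mid': mid, 'page': page_num}))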
Example #4
import time

def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    # Call locally here so the return value is available immediately
    first_page = crawl_comment_by_page(mid, 1)[0]
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        crawl_comment_by_page(mid, page_num)
        time.sleep(3)
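Example #4 stays single-process and throttles itself with a fixed time.sleep(3) between pages. A fixed interval is easy for rate limiters to fingerprint; a common refinement (not in the original) is to add random jitter:

import random
import time

def polite_sleep(base=3.0, jitter=2.0):
    # Sleep between base and base + jitter seconds so the
    # request timing is less uniform and harder to fingerprint.
    time.sleep(base + random.uniform(0, jitter))

Replacing the time.sleep(3) call with polite_sleep() keeps the average delay similar while varying the individual gaps.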