Example #1
def crawl_praise_page(mid):
    # call the crawler locally here so that the result is returned immediately
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)

    WbDataOper.set_weibo_praise_crawled(mid)

    if not ext_param:
        crawler.error(
            'fail to get praise page 2 ext_param, mid is {mid}'.format(
                mid=mid))
        return

    # Why we fall back to sequential execution instead of app.send_task:
    # weibo's praise endpoint now requires a parameter called max_id, and a
    # request without it returns something different from a normal browser.

    # should work after page 5
    # TODO: retry or return depending on ext_param
    for __ in range(2, 5):
        # ext_param (mainly max_id) is updated on each call and reused for the next request
        html, praise_data, ext_param = crawl_praise_by_page(mid, ext_param)
    return
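The loop above unpacks three values, which assumes a variant of crawl_praise_by_page that takes the previous ext_param instead of a page number (the variant in Example #7 below takes a page number and returns two values). The sketch below is only an illustration of how max_id could be threaded through successive requests; the BASE_URL template, get_page, praise and PraiseOper names are borrowed from this example's module, and the signature itself is an assumption.

def crawl_praise_by_page(mid, ext_param):
    # hypothetical sketch: pass the previous call's ext_param (carrying max_id)
    # so weibo returns the next slice of praise data
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, ext_param, cur_time)  # assumed URL template
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    # hand the updated ext_param back so the caller can request the next page
    return html, praise_data, ext_param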
Example #2
def get_redirect(name, data, post_url, session, proxy):
    logining_page = session.post(post_url,
                                 data=data,
                                 headers=headers,
                                 proxies=proxy)
    login_loop = logining_page.content.decode("GBK")

    # if the name or password is wrong, freeze the account with status 2
    if 'retcode=101' in login_loop:
        crawler.error(
            'invalid password for {}, please check your account and password'.
            format(name))
        LoginInfoOper.freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} needs verification for login'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
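For context, here is a hedged sketch of how a caller might consume the value returned by get_redirect; the complete_login name and the overall flow are assumptions, not code from this project, and headers comes from the same module as get_redirect.

def complete_login(name, data, post_url, session, proxy):
    # illustrative only: follow the URL extracted by get_redirect to finish the login
    redirect_url = get_redirect(name, data, post_url, session, proxy)
    if redirect_url in ('', 'pinerror', 'login_need_pincode'):
        # an empty string or a sentinel value means login has not succeeded yet
        return False
    # requesting the location.replace(...) target sets the login cookies on the session
    session.get(redirect_url, headers=headers, proxies=proxy)
    return True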
Example #3
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers
    For the limit of weibo's backend, we can only crawl 5 pages of the fans and followers.
    We also have no permissions to view enterprise's followers and fans info
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                          routing_key='for_fans_followers')

    # Starting celery with '--soft-time-limit secs' raises SoftTimeLimitExceeded when the
    # limit is hit, and the handler below resends the task to the broker, e.g.
    # celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10
    except SoftTimeLimitExceeded:
        crawler.error("user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        app.send_task('tasks.user.crawl_person_infos', args=(uid, ), queue='user_crawler',
                      routing_key='for_user_info')
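The comment above relies on Celery's soft time limit being set on the command line. As a hedged alternative sketch (the broker URL and the task body are placeholders, not this project's configuration), the limit can also be declared on the task itself and the exception handled the same way:

from celery import Celery
from celery.exceptions import SoftTimeLimitExceeded

app = Celery('tasks.workers', broker='redis://localhost:6379/0')  # placeholder broker URL

@app.task(soft_time_limit=10)  # analogous to passing --soft-time-limit 10 to the worker
def crawl_person_infos_limited(uid):
    try:
        ...  # placeholder for the actual crawling work
    except SoftTimeLimitExceeded:
        # re-enqueue the uid instead of losing it when the soft limit fires
        crawl_person_infos_limited.apply_async(args=(uid,), queue='user_crawler',
                                               routing_key='for_user_info')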
Example #4
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers
    For the limit of weibo's backend, we can only crawl 5 pages of the fans and followers.
    We also have no permissions to view enterprise's followers and fans info
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            app.send_task('tasks.user.crawl_follower_fans',
                          args=(uid, ),
                          queue='fans_followers',
                          routing_key='for_fans_followers')

    # Starting celery with '--soft-time-limit secs' raises SoftTimeLimitExceeded when the
    # limit is hit, and the handler below resends the task to the broker, e.g.
    # celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10
    except SoftTimeLimitExceeded:
        crawler.error(
            "user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        app.send_task('tasks.user.crawl_person_infos',
                      args=(uid, ),
                      queue='user_crawler',
                      routing_key='for_user_info')
Example #5
 def time_limit(*args, **kargs):
     try:
         return func(*args, **kargs)
     except Exception as e:
         crawler.error(
             'failed to crawl {url}, here are details: {e}, stack is {stack}'.
             format(url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
         return ''
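Examples #5 and #6 show only the inner wrapper of an error-handling decorator; func and crawler come from the enclosing scope, which is not included on this page. A minimal sketch of what that enclosing decorator could look like (the timeout_decorator name and the logger setup are assumptions):

import logging
from functools import wraps
from traceback import format_tb

crawler = logging.getLogger('crawler')  # stand-in for the project's crawler logger

def timeout_decorator(func):
    # hypothetical outer function: the snippets above would be its inner wrapper
    @wraps(func)
    def time_limit(*args, **kargs):
        try:
            return func(*args, **kargs)
        except Exception as e:
            crawler.error('failed to crawl {url}, here are details: {e}, stack is {stack}'.format(
                url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
            return ''
    return time_limit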
Example #6
 def time_limit(*args, **kargs):
     try:
         return func(*args, **kargs)
     except Exception as e:
         crawler.error('failed to crawl {url}, here are details: {e}, stack is {stack}'.format(
             url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
         return ''
Example #7
def crawl_praise_by_page(mid, page_num):
    try:
        cur_time = int(time.time() * 1000)
        cur_url = BASE_URL.format(mid, page_num, cur_time)
        html = get_page(cur_url, auth_level=2, is_ajax=True)
        praise_datas = praise.get_praise_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "praise SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        # retry and return its result so the undefined html/praise_datas below are never touched
        return crawl_praise_by_page(mid, page_num)
    PraiseOper.add_all(praise_datas)
    if page_num == 1:
        WbDataOper.set_weibo_praise_crawled(mid)
    return html, praise_datas
Example #8
def crawl_comment_by_page(mid, page_num, session):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas, seed_ids = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        # retry with the session included and return its result, so the names below are always defined
        return crawl_comment_by_page(mid, page_num, session)
    CommentOper.add_all(comment_datas, session)
    SeedidsOper.insert_seeds(seed_ids, session)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid, session)
    return html, comment_datas
Example #9
def download_img_task(mid, urls):
    count = 0
    for img_url in urls:
        if IMG_TYPE == 'large':
            img_url = img_url.replace('thumbnail', 'large').replace('square', 'large')
        suffix = img_url[img_url.rfind('.') + 1:]
        # skip gif images, which are used to show the loading progress
        if suffix != 'gif':
            count += 1
            try:
                image_response = requests.get(img_url, stream=True)
            except Exception as e:
                crawler.error('failed to download image {}, {} was raised'.format(img_url, e))
            else:
                with open(os.path.join(IMG_PATH, '{}-{}.{}'.format(mid, count, suffix)), 'wb') as out_file:
                    shutil.copyfileobj(image_response.raw, out_file)
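A minimal local invocation sketch; IMG_PATH and IMG_TYPE come from the project's configuration, and the mid and image URLs below are made up purely for illustration:

# hypothetical call: the second URL keeps its gif suffix and is skipped,
# the first is rewritten to its 'large' variant before downloading
download_img_task('4251997312017968',
                  ['https://wx1.sinaimg.cn/thumbnail/abc.jpg',
                   'https://wx1.sinaimg.cn/square/def.gif'])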
Example #10
def crawl_comment_by_page(mid, page_num):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        app.send_task('tasks.comment.crawl_comment_by_page',
                      args=(mid, page_num),
                      queue='comment_page_crawler',
                      routing_key='comment_page_info')
        # the page has been re-enqueued, so return placeholders instead of hitting
        # the undefined html/comment_datas below
        return '', []
    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas
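The queue and routing_key passed to send_task only work if the worker side declares a matching binding. The project's actual Celery configuration is not shown on this page, so the snippet below is just a sketch of how such a binding is typically declared with kombu:

from kombu import Exchange, Queue

# illustrative only: bind the queue/routing_key pair used above; the real broker
# settings and the project's other queues are not shown here
app.conf.task_queues = (
    Queue('comment_page_crawler',
          exchange=Exchange('comment_page_crawler', type='direct'),
          routing_key='comment_page_info'),
)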
Example #11
def crawl_comment_by_page(mid, page_num):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        app.send_task(
            'tasks.comment.crawl_comment_by_page',
            args=(mid, page_num),
            queue='comment_page_crawler',
            routing_key='comment_page_info')
        # the page has been re-enqueued, so return placeholders instead of hitting
        # the undefined html/comment_datas below
        return '', []
    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas
Example #12
def crawl_person_infos(uid):

    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            crawl_follower_fans(uid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        crawl_person_infos(uid)
Example #13
def get_redirect(name, data, post_url, session, proxy):
    logining_page = session.post(post_url, data=data, headers=headers, proxies=proxy)
    login_loop = logining_page.content.decode("GBK")

    # if the name or password is wrong, freeze the account with status 2
    if 'retcode=101' in login_loop:
        crawler.error('invalid password for {}, please check your account and password'.format(name))
        LoginInfoOper.freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} needs verification for login'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
Example #14
def crawl_praise_page(mid):
    # call the crawler locally here so that the result is returned immediately
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    
    WbDataOper.set_weibo_praise_crawled(mid)

    if not ext_param:
        crawler.error('fail to get praise page 2 ext_param, mid is {mid}'.format(mid=mid))
        return

    # Why we fall back to sequential execution instead of app.send_task:
    # weibo's praise endpoint now requires a parameter called max_id, and a
    # request without it returns something different from a normal browser.

    # should work after page 5
    # TODO: retry or return depending on ext_param
    for __ in range(2, 5):
        # ext_param (mainly max_id) is updated on each call and reused for the next request
        html, praise_data, ext_param = crawl_praise_by_page(mid, ext_param)
    return