def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data of user {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')

def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Because of limits in weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view enterprise users' followers and fans info.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    url = home_url.format(uid)
    html = get_page(url)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    user, is_crawled = user_get.get_profile(uid, domain)
    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid, domain),
                      queue='fans_followers', routing_key='for_fans_followers')

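# The is_404 helper called above is not shown in this section. A minimal
# sketch, assuming Weibo's error page can be recognized by a "404" marker in
# the returned html; the real implementation may check more carefully.
def is_404(html):
    """Return True when the fetched page looks like Weibo's 404 page."""
    return html is not None and '404' in html
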
def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = home_url.format(uid, cur_page) html = get_page(url) weibo_datas = get_wbdata_fromweb(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return insert_weibo_datas(weibo_datas) domain = public.get_userdomain(html) cur_time = int(time.time()*1000) ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time) ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1)) if total_page < limit: limit = total_page cur_page += 1 app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler', routing_key='ajax_home_info') app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler', routing_key='ajax_home_info') set_seed_home_crawled(uid)
def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = HOME_URL.format(uid, cur_page) if cur_page == 1: html = get_page(url, auth_level=1) else: html = get_page(url, auth_level=2) weibo_datas = get_data(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return # Check whether weibo created after time in spider.yaml timeafter = time.mktime( time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S')) length_weibo_datas = len(weibo_datas) for i in range(0, len(weibo_datas)): weibo_time = time.mktime( time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M')) if weibo_time < timeafter: weibo_datas = weibo_datas[0:i] break WbDataOper.add_all(weibo_datas) # If the weibo isn't created after the given time, jump out the loop if i != length_weibo_datas - 1: break domain = public.get_userdomain(html) cur_time = int(time.time()*1000) ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time) ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2)) auth_level = 1 else: auth_level = 2 if total_page < limit: limit = total_page app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler', routing_key='ajax_home_info') app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler', routing_key='ajax_home_info') cur_page += 1 SeedidsOper.set_seed_home_crawled(uid)
def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If the user domain is 100505, the url is just 100505 + userid;
    if it is 103505 or 100306, we need one more request to get the info;
    if the user is an enterprise or service account, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = BASE_URL.format('100505', user_id)
    html = get_page(url, auth_level=1)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = BASE_URL.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
            samefollow_uid = get_samefollow_uid()
            if samefollow_uid.strip() != '':
                samefollow_uid = samefollow_uid.split(',')
                url = SAMEFOLLOW_URL.format(user_id)
                isFanHtml = get_page(url, auth_level=2)
                person.get_isFan(isFanHtml, samefollow_uid, user_id)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            UserOper.add_one(user)
            storage.info('Has stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None
    else:
        return None

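# get_samefollow_uid above is not defined in this section. A hedged sketch of
# how it might read the comma-separated uid list out of spider.yaml; the file
# name and the 'samefollow_uid' key are assumptions based on the call site.
import yaml

def get_samefollow_uid(config_path='spider.yaml'):
    with open(config_path, encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}
    # Return '' when unset so that callers can safely use samefollow_uid.strip()
    return str(config.get('samefollow_uid', ''))
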
def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If the user domain is 100505, the url is just 100505 + userid;
    if it is 103505 or 100306, we need one more request to get the info;
    if the user is an enterprise or service account, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            save_user(user)
            storage.info('has stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None
    else:
        return None

def get_url_from_web(user_id):
    """
    Get user info according to user id. If the user's domain is 100505, the
    detailed profile is returned directly; if it is 103505 or 100306, one more
    request is needed, because base_url only lands on the user's home page
    rather than the detail page. Enterprise and service accounts are likewise
    redirected to their home page when visited through base_url, and since
    their detail pages are of little value, we don't request them.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise accounts by default
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        # Save the user info to the database
        save_user(user)
        storage.info('Successfully stored info of user {id}'.format(id=user_id))
        return user
    else:
        return None

def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If the user domain is 100505, the url is just 100505 + userid;
    if it is 103505 or 100306, we need one more request to get the info;
    if the user is an enterprise or service account, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        save_user(user)
        storage.info('Successfully stored info of user {id}'.format(id=user_id))
        return user
    else:
        return None

def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = home_url.format(uid, cur_page) html = get_page(url) weibo_datas = get_wbdata_fromweb(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return insert_weibo_datas(weibo_datas) domain = public.get_userdomain(html) page_id = public.get_pageid(html) cur_time = int(time.time() * 1000) ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page, cur_page, cur_time) ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page, cur_page, cur_time + 100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1)) if total_page < limit: limit = total_page cur_page += 1 app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, ), queue='ajax_home_crawler', routing_key='ajax_home_info') app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, ), queue='ajax_home_crawler', routing_key='ajax_home_info') set_seed_home_crawled(uid)
def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = HOME_URL.format(uid, cur_page) if cur_page == 1: html = get_page(url, auth_level=1) else: html = get_page(url, auth_level=2) weibo_datas = get_data(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return WbDataOper.add_all(weibo_datas) domain = public.get_userdomain(html) cur_time = int(time.time()*1000) ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time) ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2)) auth_level = 1 else: auth_level = 2 if total_page < limit: limit = total_page app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler', routing_key='ajax_home_info') app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler', routing_key='ajax_home_info') cur_page += 1 SeedidsOper.set_seed_home_crawled(uid)
def crawl_weibo_pics(uid):
    limit = get_max_home_page()
    cur_page = 1
    # Custom cap on the number of pages to crawl
    max_page = 20
    # end

    url = home_url.format(uid, cur_page)
    html = get_page(url)
    domain = public.get_userdomain(html)

    # Only crawl photos of personal weibo accounts; non-personal accounts
    # (government, organizations, etc.) are not crawled.
    if domain not in ['103505', '100306', '100505', '']:
        set_seed_home_crawled(uid, 2)
        return
    # end

    domain_uid = domain + uid
    page_domain = 'page_' + domain
    url = pic_url.format(domain_uid, page_domain)
    html = get_page(url)
    weibo_pics, next_ajax_url = get_wbdata_fromweb(html)

    if weibo_pics is None or next_ajax_url is None:
        crawler.warning('Failed to crawl the album of user {}, please check the reason'.format(uid))
        set_seed_home_crawled(uid, 3)
        return

    if not weibo_pics:
        crawler.warning('Failed to crawl the album of user {}, possibly because they have never posted picture weibos'.format(uid))
        set_seed_home_crawled(uid, 5)
        return

    insert_weibo_pics(weibo_pics)

    if not next_ajax_url:
        crawler.warning('Finished crawling the album of user {}'.format(uid))
        set_seed_home_crawled(uid, 4)
        return

    cur_page += 1

    while cur_page <= limit:
        # Some weibo accounts have as many as two or three thousand photos;
        # crawling them all wastes time, so for now we simply cap the crawl by
        # the current page number. Investigation suggests about 10 pages is a
        # reasonable figure.
        if cur_page > max_page:
            break
        # end

        cur_time = int(time.time() * 1000)
        ajax_call = 1
        page_id = domain_uid
        url = ajax_url.format(page_id, cur_page, ajax_call, cur_time) + '&' + next_ajax_url
        html = get_page(url, user_verify=False)

        weibo_pics, next_ajax_url = get_pic_data_byajax(html)

        if weibo_pics is None or next_ajax_url is None:
            crawler.warning('Failed to crawl the album of user {}, please check the reason'.format(uid))
            set_seed_home_crawled(uid, 3)
            return

        if not weibo_pics:
            crawler.warning('Failed to crawl the album of user {}, please check the reason'.format(uid))
            set_seed_home_crawled(uid, 3)
            return

        insert_weibo_pics(weibo_pics)

        if not next_ajax_url:
            crawler.warning('Finished crawling the album of user {}'.format(uid))
            set_seed_home_crawled(uid, 4)
            return

        cur_page += 1

    # After finishing the prescribed maximum number of pages, exit proactively
    # and set the flag to 4
    set_seed_home_crawled(uid, 4)
    return

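# The integer flags passed to set_seed_home_crawled in crawl_weibo_pics are
# not documented in this section. Based purely on the call sites above, a
# plausible reading is the following; the exact semantics are an assumption.
SEED_HOME_STATUS = {
    2: 'skipped: not a personal account',
    3: 'album crawl failed, needs investigation',
    4: 'album crawl finished (or page cap reached)',
    5: 'no picture weibos found',
}
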
def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = HOME_URL.format(uid, cur_page) if cur_page == 1: html = get_page(url, auth_level=1) else: html = get_page(url, auth_level=2) weibo_datas = get_data(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return # Check whether weibo created after time in spider.yaml length_weibo_datas = len(weibo_datas) timeafter = time.mktime( time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S')) weibo_datas = [ weibo_datum for weibo_datum in weibo_datas if determine(weibo_datum, timeafter) ] WbDataOper.add_all(weibo_datas) # If the weibo isn't created after the given time, jump out the loop if len(weibo_datas) != length_weibo_datas: break domain = public.get_userdomain(html) cur_time = int(time.time() * 1000) ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time) ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2)) auth_level = 1 if total_page < limit: limit = total_page # Since the second ajax of page 1 has already been crawled # in the code above and has been stored in databse, # we only have to crawl the first ajax of page 1 crawl_ajax_page(ajax_url_0, auth_level) else: auth_level = 2 # Still the same as before # if total_page != limit: # limit = total_page # crawler.warning("total pagenum is {}".format(total_page)) crawl_ajax_page(ajax_url_0, auth_level) crawl_ajax_page(ajax_url_1, auth_level) cur_page += 1 SeedidsOper.set_seed_home_crawled(uid)
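# The determine() predicate used in the list comprehension above is not shown
# in this section. A minimal sketch consistent with its call site, assuming
# create_time uses the '%Y-%m-%d %H:%M' layout seen in the earlier loop-based
# version of the same filter:
import time

def determine(weibo_datum, timeafter):
    """Return True when the weibo was created at or after the given timestamp."""
    weibo_time = time.mktime(
        time.strptime(weibo_datum.create_time, '%Y-%m-%d %H:%M'))
    return weibo_time >= timeafter
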
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 36
    retry_count = 1
    while cur_page <= 36:
        crawler.warning("current page {}".format(cur_page))

        url = HOME_URL.format(uid, cur_page)
        # if cur_page == 1:
        #     html = get_page(url, auth_level=1)
        # else:
        html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            if retry_count < 10:
                crawler.warning("user {} has no weibo, retry".format(uid))
                retry_count += 1
                # time.sleep(240)
                continue
            else:
                crawler.warning("user {} has no weibo, return".format(uid))
                return

        # Check whether weibo was created after the time in spider.yaml
        # timeafter = time.mktime(
        #     time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        # length_weibo_datas = len(weibo_datas)
        # for i in range(0, len(weibo_datas)):
        #     weibo_time = time.mktime(
        #         time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
        #     if weibo_time < timeafter:
        #         weibo_datas = weibo_datas[0:i]
        #         break

        WbDataOper.add_all(weibo_datas)

        # # If the weibo wasn't created after the given time, jump out of the loop
        # if i != length_weibo_datas - 1:
        #     break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        # if cur_page == 1:
        #     # here we use a local call to get the total page number
        #     total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
        #     auth_level = 1
        # else:
        auth_level = 2

        # if total_page < limit:
        #     limit = total_page

        crawler.warning("append tasks.home.crawl_ajax_page {}".format(uid))
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)

def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    # Custom cap on the number of pages to crawl
    max_page = 10
    # end
    while cur_page <= limit:
        # Some weibo accounts have as many as two or three thousand photos;
        # crawling them all wastes time, so for now we simply cap the crawl by
        # the current page number. Investigation suggests about 10 pages is a
        # reasonable figure; the desktop site shows 45 weibos per page, which
        # makes 450 weibos per account.
        if cur_page > max_page:
            break
        # end

        url = home_url.format(uid, cur_page)
        html = get_page(url)
        domain = public.get_userdomain(html)

        # Only crawl photos of personal weibo accounts; non-personal accounts
        # (government, organizations, etc.) are not crawled.
        if domain not in ['103505', '100306', '100505', '']:
            set_seed_home_crawled(uid, 2)
            return
        # end

        weibo_datas, weibo_pics = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data of user {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        # If weibo_pics is not empty, insert it into the database as well
        if weibo_pics:
            insert_weibo_pics(weibo_pics)
        # end

        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')

    # Set the flag after iterating over all pages. Placed here, it means every
    # page was visited, without guaranteeing that each visit succeeded; this
    # may need further optimization later, e.g. setting it inside a callback.
    set_seed_home_crawled(uid)

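# crawl_ajax_page is both called locally (to read the total page number) and
# dispatched as a Celery task throughout this section, but its body is not
# shown. A hedged sketch consistent with both uses: it must return the fetched
# html so get_total_page can parse it. The decorator arguments, the auth_level
# default, and the get_ajax_data helper name are assumptions, not the
# project's confirmed API.
@app.task(ignore_result=True)
def crawl_ajax_page(url, auth_level=2):
    ajax_html = get_page(url, auth_level)
    ajax_wbdatas = get_ajax_data(ajax_html)  # hypothetical parser for the ajax payload
    if not ajax_wbdatas:
        return ''
    insert_weibo_datas(ajax_wbdatas)
    return ajax_html
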