Example #1
def _get_current_source(url, wb_mid):
    """
    :param url: url of the current weibo status
    :param wb_mid: mid of the current weibo status
    :return: repost count, user id and user name of the status
    """
    html = get_page(url)
    if not html or basic.is_404(html):
        return None

    reposts = parse_status.get_repostcounts(html)
    comments = parse_status.get_commentcounts(html)

    # update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=wb_mid,
                                          reposts=reposts,
                                          comments=comments)

    root_url = url
    user_id = parse_status.get_userid(html)
    user_name = parse_status.get_username(html)
    post_time = parse_status.get_statustime(html)
    device = parse_status.get_statussource(html)
    comments_count = comments  # reuse the counts parsed above
    reposts_count = reposts
    root_user = user.get_profile(user_id)
    # store the related info of the root status
    spread_original_dao.save(root_user, wb_mid, post_time, device,
                             reposts_count, comments_count, root_url)

    crawler.info('this status has {counts} reposts'.format(counts=reposts_count))
    return reposts_count, user_id, user_name
Example #2
def crawl_person_infos(uid):
    """
    Crawl user info along with their fans and followers.
    Due to limits in weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise user's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    url = home_url.format(uid)
    html = get_page(url)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    user, is_crawled = user_get.get_profile(uid, domain)
    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans',
                      args=(uid, domain),
                      queue='fans_followers',
                      routing_key='for_fans_followers')
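
The dispatch above assumes some worker consumes the fans_followers queue. Below is a minimal sketch of the consumer side; the module path tasks.workers, the broker URL, and the task body are assumptions, while the task name and queue come from the send_task call above.

from celery import Celery

# Assumed app module and broker; the real project wires these up elsewhere.
app = Celery('tasks.workers', broker='redis://127.0.0.1:6379/0')

@app.task(name='tasks.user.crawl_follower_fans')
def crawl_follower_fans(uid, domain):
    # Placeholder body: the real task crawls up to 5 pages of fans/followers.
    pass

# A worker bound to the queue could then be started with, for example:
#   celery -A tasks.workers worker -Q fans_followers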
Example #3
def get_status_info(url, user_id, name, mid=''):
    soc = SpreadOtherCache()
    print('current repost url: ' + url)
    repost_cont = get_page(url)

    if not is_404(repost_cont):
        repost_user_id = parse_status.get_userid(repost_cont)
        if repost_user_id == '':
            return None

        repost_user_name = parse_status.get_username(repost_cont)
        soc.set_id(repost_user_id)
        soc.set_name(repost_user_name)

        so = SpreadOther()
        so.id = repost_user_id
        so.screen_name = repost_user_name
        so.upper_user_name = parse_status.get_upperusername(repost_cont, name)
        cur_user = user.get_profile(repost_user_id)
        try:
            so.province = cur_user.province
            so.city = cur_user.city
            so.location = cur_user.location
            so.description = cur_user.description
            so.domain_name = cur_user.domain_name
            so.blog_url = cur_user.blog_url
            so.gender = cur_user.gender
            so.headimg_url = cur_user.headimg_url
            so.followers_count = cur_user.followers_count
            so.friends_count = cur_user.friends_count
            so.status_count = cur_user.status_count
            so.verify_type = cur_user.verify_type
            so.verify_info = cur_user.verify_info
            so.register_time = cur_user.register_time

            if so.screen_name == name:
                so.id = user_id

            so.mid = parse_status.get_mid(repost_cont)
            so.status_post_time = parse_status.get_statustime(repost_cont)
            so.device = parse_status.get_statussource(repost_cont)
            if mid:
                so.original_status_id = mid
            else:
                so.original_status_id = parse_status.get_orignalmid(repost_cont)
            so.comments_count = parse_status.get_commentcounts(repost_cont)
            so.reposts_count = parse_status.get_repostcounts(repost_cont)
            so.like_count = parse_status.get_likecounts(repost_cont)
            so.status_url = url
        except AttributeError as e:
            # TODO: find out the root cause of this failure
            logging.info('failed to parse {user_id}, the error is {e}'.format(user_id=user_id, e=e))
            logging.info('source of the repost page:\n{repost_cont}'.format(repost_cont=repost_cont))
            return None
        else:
            return SpreadOtherAndCache(so, soc)
    else:
        return None
Example #4
def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If the user domain is 100505, the url is just 100505 + userid;
    if it is 103505 or 100306, we need one more request to get the detail info;
    if the user type is enterprise or service, we only crawl their home page info.
    :param user_id: user id
    :return: user entity, or None on failure
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            save_user(user)
            storage.info('stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None

    else:
        return None
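
A small sketch of the domain dispatch described in the docstring. The two-slot template below is an assumption inferred from the hard-coded URL in example #13; the real base_url constant lives in the project's configuration.

def build_info_url(domain, user_id):
    # Assumed template, modeled on example #13's
    # 'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more'
    base_url = 'http://weibo.com/p/{}{}/info?mod=pedit_more'
    return base_url.format(domain, user_id)

# '100505' (normal users) resolves to the detail page in one request;
# '103505'/'100306' (writers) need a second request with the real domain.
print(build_info_url('100505', '1234567890'))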
Example #5
def get_url_from_web(user_id, domain):
    """
    Get user info according to user id and domain.
    Since the domain is passed in, the detail page is requested directly for
    normal users (100505) and writers (103505/100306); for enterprise or
    service accounts we only crawl their home page info.
    :param user_id: user id
    :param domain: user domain
    :return: user entity, or None on failure
    """
    if not user_id:
        return None

    url = base_url.format(domain, user_id)
    html = get_page(url)

    if not is_404(html):
        # writers (special users)
        if domain == '103505' or domain == '100306':
            # url = base_url.format(domain, user_id)
            # html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            save_user(user)
            storage.info(
                'stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None

    else:
        return None
Example #6
def get_url_from_web(user_id):
    """
    Get the user profile by user id. If the user's domain is 100505, the
    detail info is returned directly; if it is 103505 or 100306, one more
    request is needed, because the base_url form only reaches the user's home
    page rather than the detail page; enterprise and service accounts are
    likewise redirected to their home page via base_url, and since their
    detail pages are of little value, we do not request them.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise by default
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        # save the user info to the database
        save_user(user)
        storage.info('stored user {id} info successfully'.format(id=user_id))

        return user
    else:
        return None
Example #7
def get_page(url, session, headers, user_verify=True):
    """
    :param user_verify: whether the page may show a verification code (the
    403 on search pages is not parsed yet); False means an ajax url for reposts
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    try:
        page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
            encode('utf-8', 'ignore').decode('utf-8')
        time.sleep(interal)

        if user_verify:
            if is_403(page):
                crawler.warning('this account has been frozen')
                crawler.info('this crawl ended at {curtime}'.format(curtime=time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime())))
                exit(-1)
            if is_404(page):
                crawler.warning('the link {url} does not exist'.format(url=url))
                return ''
            if not is_complete(page):
                time.sleep(excp_interal)
                try:
                    page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
                        encode('utf-8', 'ignore').decode('utf-8')
                except Exception as why:
                    crawler.error(why)
                    return ''
    except requests.exceptions.ReadTimeout:
        crawler.warning('connection to the target server timed out when crawling {url}'.format(url=url))
        time.sleep(excp_interal)
        return ''
    except requests.exceptions.ConnectionError as e:
        crawler.warning('the target server refused the connection, sleeping for a while, the exception is {e}'.format(e=e))
        time.sleep(excp_interal)
        return ''
    else:
        return page
Example #8
def get_page(url, user_verify=True):
    """
    :param url: url to be crawled
    :param user_verify: whether the page may show a verification code (ajax
    urls never do; requests for statuses or user info may); False means an
    ajax url for reposts
    :return: the response data; an empty string on 404, 403 or any other
    exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        # switch to a cookie different from the last attempt on every retry
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.error('no cookies in the cookie pool, please check the accounts and the login task. The crawler is exiting.')
            os._exit(0)

        if name_cookies == latest_name_cookies:
            continue

        latest_name_cookies = name_cookies

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)
            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after each crawl to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception happened when crawling {}, the detail is {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached the max retries for {}, check the url in the redis failure queue to find out why'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #9
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: False if it's an ajax url, else True
    :param need_login: True if the url requires login, else False
    :return: '' if an exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text

            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # slow down to avoid being banned
            time.sleep(interal)

            if user_verify:
                if is_banned(resp.url) or is_403(page):
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked, you should use your phone to unlock it'.
                                    format(name_cookies[0]))

                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happened when crawling {}, the detail is {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached max retries for {}, check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
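
The loop above, like its siblings in examples #8 and #11-#14, follows one skeleton: rotate credentials, count only failed attempts, and record the final outcome. A stripped-down sketch of that skeleton, where fetch is a stand-in for the requests call and the empty-string fallback mirrors the functions above:

import time

def fetch_with_retries(fetch, max_retries=3, backoff=1):
    """Retry skeleton: failed attempts increment the counter and back off;
    success returns immediately; exhaustion returns ''."""
    count = 0
    while count < max_retries:
        try:
            page = fetch()          # stand-in for requests.get(...).text
        except IOError:
            count += 1
            time.sleep(backoff)     # cool down before the next attempt
            continue
        if page:
            return page             # success: stop retrying
        count += 1                  # an empty page counts as a failure too
    return ''                       # give up, as the functions above do

print(fetch_with_retries(lambda: 'ok'))  # -> ok on the first attempt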
Example #10
def _get_current_reposts(url, session, weibo_mid):
    """
    Revised main crawling routine. Since weibo's rate limits are strict, we
    currently only crawl the current status and its child reposts, not the
    root status.
    """
    spread_other_caches = list()
    spread_others = list()
    spread_other_and_caches = list()

    html = get_page(url, session, headers)
    if not basic.is_404(html):
        reposts = status_parse.get_repostcounts(html)
        comments = status_parse.get_commentcounts(html)

        # update the repost and comment counts in the weibo_search_data table
        weibosearch_dao.update_repost_comment(mid=weibo_mid,
                                              reposts=reposts,
                                              comments=comments)

        root_url = url
        mid = status_parse.get_mid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = comments  # reuse the counts parsed above
        reposts_count = reposts
        root_user = user.get_profile(user_id, session, headers)

        spread_original_dao.save(root_user, mid, post_time, device,
                                 reposts_count, comments_count, root_url)

        crawler.info('this status has {counts} reposts'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)
            page = 1
            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)

            crawler.info('the repost info url is: ' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                parser.error('json parsing of repost info from {url} failed, the detail is {why}'.format(
                    url=ajax_url, why=why))
            else:
                page = total_page
                page_counter = 0

                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)
                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        parser.error(
                            'json parsing of repost info from {url} failed, the detail is {why}'.format(
                                url=ajax_url, why=why))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)

                        # ordering logic for repost nodes
                        for repost_url in repost_urls:
                            repost_cont = status.get_status_info(
                                repost_url, session, user_id, user_name,
                                headers, mid)

                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)

                        for soac in spread_other_and_caches:
                            if soac.get_so().id != '':
                                spread_others.append(soac.get_so())
                                spread_other_caches.append(soac.get_soc())
                    finally:
                        print('now on page {currpage}'.format(currpage=page))
                        page -= 1
                        page_counter += 1

                for so in spread_others:
                    if so.verify_type == '':
                        so.verify_type = 0

                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                        else:
                            so.upper_user_id = user_id

                spread_others = list(set(spread_others))

                spread_other_dao.save(spread_others)
                crawler.info('fetched {num} reposts in total; repost crawling for this status is done'.format(
                    num=len(spread_others)))
    else:
        crawler.info('{url} is a 404 page'.format(url=url))
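
The ajax pagination above walks from totalpage down to page 1, capped by page_max. A self-contained sketch of that walk; the JSON shape is taken from the parsing code above, while page_max is an assumed configuration value:

import json

def iter_repost_pages(first_page_json, page_max=5):
    """Yield page numbers in the same order as the loop above:
    highest page first, at most page_max pages."""
    data = json.loads(first_page_json)
    total_page = int(data['data']['page']['totalpage'])
    page, visited = total_page, 0
    while page > 0 and visited < page_max:
        yield page
        page -= 1
        visited += 1

print(list(iter_repost_pages('{"data": {"page": {"totalpage": 8}}}')))
# -> [8, 7, 6, 5, 4]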
Example #11
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: whether the page may show a verification code (ajax
    urls never do; requests for statuses or user info may); False means an
    ajax url for reposts
    :param need_login: whether the page requires login; skipping login where
    possible reduces the pressure on accounts
    :return: the response data; an empty string on 404, 403 or any other
    exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:

        if need_login:
            # switch to a cookie different from the last attempt on every retry
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool, checking whether any account is still usable')
                rs = get_login_info()

                if len(rs) == 0:
                    crawler.error('no account is usable, please check the health of the accounts')
                    # kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    # If a usable account exists, log in with it. A local call
                    # is used here, which seems questionable: if the login
                    # queue is not on this machine, the call has no effect;
                    # but with a network call it is unclear how to keep the
                    # task off nodes in unfamiliar login locations, unless a
                    # better way around the remote-login restriction is found.
                    # TODO weigh a network call against calling
                    # login.get_session() directly; the current approach seems
                    # unreasonable. Captchas during node login are ignored for
                    # now; for large-scale logins, the login_queue nodes should
                    # sit in the accounts' usual login locations.
                    crawler.info('fetching new cookies...')
                    login.excute_login_task()
                    time.sleep(10)

            if name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after each crawl to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(
                        page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception happened when crawling {}, the detail is {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached the max retries for {}, check the url in the redis failure queue to find out why'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #12
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: whether the page may show a verification code (ajax
    urls never do; requests for statuses or user info may); False means an
    ajax url for reposts
    :param need_login: whether the page requires login; skipping login where
    possible reduces the pressure on accounts
    :return: the response data; an empty string on 404, 403 or any other
    exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool, please check whether the accounts are normal')
                other.warning('shutting down the crawler...')
                if 'win32' in sys.platform:
                    os.popen('taskkill /F /IM "celery*"')
                else:
                    os.popen('pkill -f "celery"')
        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after each crawl to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or is_403(
                        page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked, use your phone to unlock it'.format(name_cookies[0]))

                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception happened when crawling {}, the detail is {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached the max retries for {}, check the url in the redis failure queue to find out why'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #13
def get_profile(user_id, session, headers):
    """
    Assume a personal user by default. Writers need one more crawl, while
    enterprise users are redirected to the enterprise home page, which can be
    parsed directly. After login, http://weibo.com/u/userId can locate a
    user's home page, though its stability is unknown; TODO test this path.
    It seems 'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more'
    covers most cases, except that non-normal users are redirected to their
    home page and some, e.g. domain=100106, are not covered at all.
    """
    if user_id == '':
        return User()

    user = User()
    info = get_user(user_id)

    if info:
        user.id = user_id
        user.screen_name = info.get('name')
        user.province = info.get('province')
        user.city = info.get('city')
        user.location = info.get('location')
        user.description = info.get('description')
        user.headimg_url = info.get('headimg_url')
        user.blog_url = info.get('blog_url')
        user.domain_name = info.get('domain_name')
        user.gender = info.get('gender')
        user.followers_count = info.get('followers_count')
        user.friends_count = info.get('friends_count')
        user.status_count = info.get('status_count')
        user.birthday = info.get('birthday')
        user.verify_type = info.get('verify_type')
        user.verify_info = info.get('verify_info')
        user.register_time = info.get('register_time')

        # avoid encode() issues when inserting into the database
        for key in user.__dict__:
            if user.__dict__[key] is None:
                setattr(user, key, '')

        storage.info('user {id} info already exists in the database'.format(id=user_id))

    else:
        url = 'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more'
        html = get_page(url, session, headers)

        if not is_404(html):
            domain = get_publicinfo.get_userdomain(html)

            if domain == '100505' or domain == '103505' or domain == '100306':
                user = get_personalinfo.get_detail(html)
                if user is not None:
                    user.followers_count = get_personalinfo.get_fans(html)
                    user.friends_count = get_personalinfo.get_friends(html)
                    user.status_count = get_personalinfo.get_status(html)
                else:
                    user = User()
            else:
                # to crawl as few urls as possible, not all service accounts are handled here
                if domain == '100106':
                    url = 'http://weibo.com/p/'+domain+user_id+'/home'
                    html = get_page(url, session, headers)
                    if html == '':
                        return user

                user.followers_count = get_enterpriseinfo.get_fans(html)
                user.friends_count = get_enterpriseinfo.get_friends(html)
                user.status_count = get_enterpriseinfo.get_status(html)
                user.description = get_enterpriseinfo.get_description(html).encode('gbk', 'ignore').decode('gbk')

            user.id = user_id
            user.screen_name = get_publicinfo.get_username(html)
            user.headimg_url = get_publicinfo.get_headimg(html)
            user.verify_type = get_publicinfo.get_verifytype(html)
            user.verify_info = get_publicinfo.get_verifyreason(html, user.verify_type)

            save_user(user)
            storage.info('stored user {id} info successfully'.format(id=user_id))

    return user
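
Example #13 is essentially a cache-aside lookup: read the database first and fall back to the web, persisting on a miss. A distilled sketch of that shape; db_get, db_save, and fetch_profile are stand-ins for get_user, save_user, and the page-parsing branch above:

def get_profile_cached(user_id, db_get, db_save, fetch_profile):
    """Cache-aside: return the stored profile if present, otherwise
    crawl it from the web and store it before returning."""
    cached = db_get(user_id)
    if cached is not None:
        return cached
    profile = fetch_profile(user_id)   # network fetch + parse
    if profile is not None:
        db_save(profile)               # populate the store for next time
    return profile

# Toy usage: a dict as the 'database', a constant as the 'web'.
store = {}
profile = get_profile_cached('42', store.get,
                             lambda p: store.update({'42': p}),
                             lambda uid: {'id': uid, 'name': 'demo'})
print(profile, store)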
Example #14
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: whether the page may show a verification code (ajax
    urls never do; requests for statuses or user info may); False means an
    ajax url for reposts
    :param need_login: whether the page requires login; skipping login where
    possible reduces the pressure on accounts
    :return: the response data; an empty string on 404, 403 or any other
    exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:

        if need_login:
            # switch to a cookie different from the last attempt on every retry; if there is only one account, reuse is allowed
            name_cookies, cookies_count = Cookies.fetch_cookies()
            
            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool, checking whether any account is still usable')
                rs = get_login_info()

                # log in with a healthy account; if none is usable, stop the celery workers
                if len(rs) == 0:
                    crawler.error('no account is usable, please check the health of the accounts')
                    # kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    crawler.info('fetching new cookies...')
                    login.excute_login_task()
                    time.sleep(10)

            # only switch to a different cookie per retry when more than one cookie exists
            if cookies_count > 1 and name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after each crawl to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception happened when crawling {}, the detail is {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached the max retries for {}, check the url in the redis failure queue to find out why'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
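
The cookies_count guard above fixes a livelock latent in examples #8 and #11: with a single cookie in the pool, name_cookies always equals latest_name_cookies, and the bare continue would spin forever without incrementing count. A distilled sketch of the guard, with fetch_cookies standing in for Cookies.fetch_cookies:

def pick_cookie(fetch_cookies, latest):
    """Return a cookie to use, or None to signal 'fetch again'.
    Rotation is only enforced when more than one cookie exists."""
    name_cookies, cookies_count = fetch_cookies()
    if name_cookies is None:
        return None                      # pool empty: caller must re-login
    if cookies_count > 1 and name_cookies == latest:
        return None                      # duplicate of the last attempt
    return name_cookies

# With a single cookie in the pool, reuse is allowed instead of looping.
only = ('account_a', {'SUB': 'xxx'})
print(pick_cookie(lambda: (only, 1), latest=only))  # -> the same cookie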