Example #1
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
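    # NOTE: `limit` (max search pages) and `url` (the search URL template) are
    # presumably module-level settings; they are not defined in this snippet.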
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in last turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
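
The early exit above relies on the search results being sorted newest-first: the first hit that is already in MySQL means everything after it is older and already stored. A minimal, self-contained sketch of that pattern (the names here are illustrative, not from the project):

def take_new(items, seen):
    # items arrive newest-first; stop at the first one already stored
    for item in items:
        if item in seen:
            break
        yield item

print(list(take_new([5, 4, 3, 2, 1], {1, 2, 3})))  # [5, 4]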
Example #2
def get_redirect(name, data, post_url, session):
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if name or password is wrong, set the value to 2
    if 'retcode=101' in login_loop:
        crawler.error(
            'invalid password for {}, please check your account and password'.
            format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} need verification for login'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
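
On success, the login response embeds the redirect target in a JavaScript location.replace call, which the regex above extracts. A tiny illustration with a made-up sample page:

import re

sample = "<script>location.replace('http://weibo.com/sso/login.php?retcode=0');</script>"
pa = r'location\.replace\([\'"](.*?)[\'"]\)'
print(re.findall(pa, sample)[0])  # http://weibo.com/sso/login.php?retcode=0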
Example #3
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
    set_seed_home_crawled(uid)
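
One detail worth noting: limit starts at a configured ceiling (get_max_home_page()) and is shrunk once the real page count is known from the first ajax page, so later iterations never request pages that do not exist. In outline:

limit = 30        # configured ceiling, e.g. from get_max_home_page()
total_page = 7    # discovered from the first ajax response
if total_page < limit:
    limit = total_page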
Example #4
def crawl_comment_page(mid):
    limit = get_max_comment_page()
    cur_page = 1
    next_url = ''
    while cur_page <= limit:
        cur_time = int(time.time()*1000)
        if cur_page == 1:
            url = start_url.format(mid, cur_time)
        else:
            url = base_url.format(next_url, cur_time)
        html = get_page(url, user_verify=False)
        comment_datas = comment.get_comment_list(html, mid)

        if not comment_datas and cur_page == 1:
            crawler.warning('failed to collect comments of weibo (mid {}), please check why'.format(mid))
            return

        save_comments(comment_datas)
        # Each page here is derived from the previous one, so a network (task) call is a poor fit (mainly just cumbersome)
        next_url = comment.get_next_url(html)

        if not next_url:
            crawler.info('finished collecting comments of weibo {}'.format(mid))
            return
        cur_page += 1
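
As the comment notes, each request depends on the next_url extracted from the previous page, so the pages must be fetched sequentially in one process rather than fanned out as distributed tasks. A generic sketch of this cursor-style pagination (fetch here is a stand-in for the get_page/get_next_url pair):

def fetch(cursor):
    # pretend page fetch: returns (items, next cursor or None)
    nxt = cursor + 1 if cursor < 3 else None
    return ['item-{}-{}'.format(cursor, i) for i in range(2)], nxt

cursor, items = 0, []
while cursor is not None:
    page, cursor = fetch(cursor)
    items.extend(page)
print(items)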
Example #5
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('no weibo fetched for keyword {} this time; the page source is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # First check whether the weibo is already in the database; if so, it was crawled before (results are sorted by time by default), so exit the loop
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('all new weibo for keyword {} in this search have been fetched'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # For now a network (task) call is used instead of a local call; the trade-offs of the two approaches are still being weighed
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')

        # check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('search for keyword {} finished'.format(keyword))
            return
Example #6
def get_session(name, password, need_verify):
    url, yundama_obj, cid, session = do_login(name, password, need_verify)
    # handle captcha-solving errors
    while url == 'pinerror' and yundama_obj is not None:
        yundama_obj.report_error(cid)
        url, yundama_obj, cid, session = do_login(name, password, need_verify)

    if url != '':
        rs_cont = session.get(url, headers=headers)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # Verify the account by fetching an arbitrary page. This check depends on
            # external state, but no better way was found (a problematic account can
            # sometimes still reach its own home page, so the account's own page is not a reliable probe)
            check_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
            resp = session.get(check_url, headers=headers)
            # Experience so far: accounts without phone verification cannot be rescued...
            if is_403(resp.text):
                other.error('account {} has been frozen'.format(name))
                crawler.warning('account {} has been frozen'.format(name))
                freeze_account(name, 0)
                return None
            other.info('logged in with account {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict())
            return session

    other.error('login failed for account {}'.format(name))
    return None
Example #7
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in last turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
Example #8
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('failed to collect home-page weibo data of user {}, please check why'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
Example #9
def get_redirect(name, data, post_url, session):
    """
    :param name: login user name
    :param data: data to post; the invariant parts can be determined by packet capture
    :param post_url: post address
    :param session:
    :return: the next url the server asks us to request; on a captcha-solving error,
             a special marker string is returned for easier handling
    """
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if the account or password is wrong, set the status field to 2
    if 'retcode=101' in login_loop:
        crawler.error('wrong password for account {}'.format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('the verification code entered is wrong')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} needs a verification code to log in'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
Example #10
def check_cookies_timeout(cls, cookies):
    if cookies is None:
        return True
    if isinstance(cookies, bytes):
        cookies = cookies.decode('utf-8')
    cookies = json.loads(cookies)
    login_time = datetime.datetime.fromtimestamp(cookies['loginTime'])
    if datetime.datetime.now() - login_time > datetime.timedelta(hours=cookie_expire_time):
        crawler.warning('the account has expired')
        return True
    return False
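
A quick way to exercise this check, assuming cookies are stored as a JSON blob containing a loginTime epoch timestamp and cookie_expire_time is measured in hours (a standalone sketch; the surrounding Cookies class is not shown in the snippet):

import datetime
import json

cookie_expire_time = 48  # hours; stand-in for the project's config value

def is_expired(blob):
    login_time = datetime.datetime.fromtimestamp(json.loads(blob)['loginTime'])
    return datetime.datetime.now() - login_time > datetime.timedelta(hours=cookie_expire_time)

fresh = json.dumps({'loginTime': datetime.datetime.now().timestamp()})
stale = json.dumps({'loginTime': 0})  # epoch 1970, long expired
print(is_expired(fresh), is_expired(stale))  # False True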
Example #11
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        page_id = public.get_pageid(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_0, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_1, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
    set_seed_home_crawled(uid)
Example #12
def search_keyword(row):
    cur_page = 1
    keyword = row.keyword
    if row.startTime and row.endTime:
        startTime = row.startTime.strftime('%Y-%m-%d')
        endTime = row.endTime.strftime('%Y-%m-%d')
        url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}&timescope=custom:{}'
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        if row.startTime and row.endTime:
            finalTime = startTime + ':' + endTime
            cur_url = url.format(encode_keyword, cur_page, finalTime)
        else:
            cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning(
                'no weibo fetched for keyword {} this time; the page source is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # First check whether the weibo is already in the database; if so, it was crawled before (results are sorted by time by default), so exit the loop
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('all new weibo for keyword {} in this search have been fetched'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                # For now a network (task) call is used instead of a local call; the trade-offs of the two approaches are still being weighed
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        # check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('search for keyword {} finished'.format(keyword))
            return
Example #13
def get_redirect(name, data, post_url, session):
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if name or password is wrong, set the value to 2
    if 'retcode=101' in login_loop:
        crawler.error('invalid password for {}, please check your account and password'.format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} need verification for login'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
Example #14
def get_page(url, need_login=True):
    """
    :param url: url to be crawled
    :param need_login: if the url requires login, the value is True, else False
    :return: return '' if exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning(
                    'no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text

            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1
                continue
            # slow down to avoid being banned
            time.sleep(interal)

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning(
                'exceptions happen when crawling {}; details: {}'.
                format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
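
The control flow above is a retry loop in which only network errors consume a retry and a successful fetch returns from the else branch of try/except. Stripped of the crawler specifics, the skeleton looks like this (do_request stands in for the requests.get call):

import time

def fetch_with_retries(do_request, max_retries=3, pause=1):
    count = 0
    while count < max_retries:
        try:
            page = do_request()
        except OSError:          # network-level failures consume a retry
            count += 1
            time.sleep(pause)
        else:                    # success path: no exception raised
            return page
    return ''                    # all retries exhausted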
Example #15
def get_page(url, session, headers, user_verify=True):
    """
    :param user_verify: whether the page may show a captcha (the 403 on search
        pages is not parsed yet); False when fetching repost ajax links
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    try:
        page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
            encode('utf-8', 'ignore').decode('utf-8')
        time.sleep(interal)

        if user_verify:
            if is_403(page):
                crawler.warning('this account has been frozen')
                crawler.info('crawling stopped at {curtime}'.format(curtime=time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime())))
                exit(-1)
            if is_404(page):
                crawler.warning('the link {url} does not exist'.format(url=url))
                return ''
            if not is_complete(page):
                time.sleep(excp_interal)
                try:
                    page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
                        encode('utf-8', 'ignore').decode('utf-8')
                except Exception as why:
                    crawler.error(why)
                    return ''
    except requests.exceptions.ReadTimeout:
        crawler.warning('connection to the target server timed out while crawling {url}'.format(url=url))
        time.sleep(excp_interal)
        return ''
    except requests.exceptions.ConnectionError as e:
        crawler.warning('the target server refused the connection; the program sleeps for one minute; details: {e}'.format(e=e))
        time.sleep(excp_interal)
        return ''
    else:
        return page
Example #16
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: if it's an ajax url, the value is False, else True
    :param need_login: if the url requires login, the value is True, else False
    :return: return '' if exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text

            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1
                continue

            # slow down to avoid being banned
            time.sleep(interal)

            if user_verify:
                if is_banned(resp.url) or is_403(page):
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked; you should use your phone to unlock it'.
                                    format(name_cookies[0]))

                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when crawling {}; details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #17
def send_personal_message(target_uid, adver_message, user_verify=True, need_login=True):
    """
    :param target_uid: uid of the user who receives the message
    :param adver_message: message text to send
    :param user_verify: if it's an ajax url, the value is False, else True
    :param need_login: if the url requires login, the value is True, else False
    :return: return '' if exception happens or status_code != 200
    """
    crawler.info('the send_personal_message uid is {uid}'.format(uid=str(target_uid)))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # check adver_timers
            if int(name_cookies[3]) >= int(adver_timers):
                continue
        try:
            if need_login:
                resp = requests.post('http://api.weibo.com/webim/2/direct_messages/new.json?source='+str(name_cookies[2]),
                              data={'text': adver_message, 'uid':str(target_uid)},
                              cookies=name_cookies[1], headers=personal_message_headers)

                if "error" in resp.text:
                    crawler.warning('account {} has been banned, resp.text is: {}'.format(name_cookies[0], resp.text))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
                else:
                    # update adver_times
                    Cookies.store_cookies(name_cookies[0], name_cookies[1], name_cookies[2], 1)
                    return None


        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when send_personal_message {}; details: {}'.format(target_uid, e))
            count += 1
            time.sleep(excp_interal)

        else:
            return None

    crawler.warning('max retries reached for {}; check the target_uid in redis db2'.format(target_uid))
    return ''
Example #18
def send_jd_seckill_task(jd_user_string, address_string, task_id, skuId, netproxy):
    """
    Submit a JD flash-sale (seckill) order for the given user, address and sku.
    """
    s = requests.session()
    s.timeout = session_timeout
    s.proxies = netproxy

    jd_user_json = json.loads(jd_user_string)
    address_json = json.loads(address_string)
    cookies_encode = jd_user_json['cookies'].encode()
    cookies_decode = base64.b64decode(cookies_encode).decode()

    cookie = SimpleCookie()
    cookie.load(cookies_decode)

    # Even though SimpleCookie is dictionary-like, it internally uses a Morsel object
    # which is incompatible with requests. Manually construct a dictionary instead.
    cookies = {}
    for key, morsel in cookie.items():
        cookies[key] = morsel.value
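    # (equivalently: cookies = {k: m.value for k, m in cookie.items()})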

    crawler.info('the send_jd_seckill_task jd_user is {uid}'.format(uid=str(jd_user_string)))
    celery_stask_status = 7
    try:
        # first request: fetch the usual-address list
        resp = s.get('https://marathon.jd.com/async/getUsualAddressList.action?skuId='+str(skuId), headers=headers,
                            cookies=cookies, timeout=time_out, verify=False)

        # [{
        #     "name": "冷月",
        #     "id": 138356479,
        #     "addressDetail": "广州外国语学校-凤凰大道 丰巢快递柜",
        #     "provinceId": 19,
        #     "cityId": 1601,
        #     "countyId": 50259,
        #     "townId": 51886,
        #     "mobile": "",
        #     "provinceName": "广东",
        #     "cityName": "广州市",
        #     "countyName": "南沙区",
        #     "mobileKey": "5fe7bdd8ce0aa7af84b7d1380d8141a3",
        #     "email": "",
        #     "townName": "城区",
        #     "mobileWithXing": "131****5409"
        # }, {
        #     "name": "冷月",
        #     "id": 138359040,
        #     "addressDetail": "中信香樟墅1街12号",
        #     "provinceId": 19,
        #     "cityId": 1601,
        #     "countyId": 50284,
        #     "townId": 50451,
        #     "mobile": "",
        #     "provinceName": "广东",
        #     "cityName": "广州市",
        #     "countyName": "增城区",
        #     "mobileKey": "5fe7bdd8ce0aa7af84b7d1380d8141a3",
        #     "email": "",
        #     "townName": "中新镇",
        #     "mobileWithXing": "131****5409"
        # }]
        #
        # TODO: validate the response of the first request
        if not resp.text:
            save_task_monitor(task_id, celery_stask_status, "do not contain address")
            return None
        if '登录' in resp.text:
            save_task_monitor(task_id, celery_stask_status, "cookies failed")
            return None

        address_list = json.loads(resp.text)
        if not address_list:
            save_task_monitor(task_id, celery_stask_status, "empty address list")
            return None
        address_dict = address_list[0]
        if 'addressDetail' not in address_dict:
            crawler.warning('task_id {} has been banned, resp.text is: {}'.format(task_id, resp.text))
            save_task_monitor(task_id, celery_stask_status, resp.text)
            return None

        # TODO: flash-sale submission; the parameters still need to be confirmed
        resp = s.post('https://marathon.jd.com/seckill/submitOrder.action?skuId='+str(skuId)+'&vid=',
                      data={'orderParam.name':address_dict['name'],
                            'orderParam.addressDetail':address_dict['addressDetail'],
                            'orderParam.mobile':address_dict['mobileWithXing'],
                            'orderParam.email':address_dict['email'],
                            'orderParam.provinceId':address_dict['provinceId'],
                            'orderParam.cityId':address_dict['cityId'],
                            'orderParam.countyId':address_dict['countyId'],
                            'orderParam.townId':address_dict['townId'],
                            'orderParam.paymentType':4,
                            'orderParam.password':'',
                            'orderParam.invoiceTitle':4,
                            'orderParam.invoiceContent':1,
                            'orderParam.invoiceCompanyName':'',
                            'orderParam.invoiceTaxpayerNO':'',
                            'orderParam.usualAddressId':address_dict['id'],
                            'skuId':skuId,
                            'num':1,
                            'orderParam.provinceName':address_dict['provinceName'],
                            'orderParam.cityName':address_dict['cityName'],
                            'orderParam.countyName':address_dict['countyName'],
                            'orderParam.townName':address_dict['townName'],
                            'orderParam.codTimeType':3,
                            'orderParam.mobileKey':address_dict['mobileKey'],
                            'eid':jd_user_json['eid'],
                            'fp':jd_user_json['fp']
                            },
                             cookies=cookies, headers=personal_message_headers)

        # validate the flash-sale response
        if "//marathon.jd.com/koFail.html?reason=" in resp.text:
            crawler.warning('task_id {} has been banned, resp.text is: {}'.format(task_id, resp.text))
        else:
            celery_stask_status = 8

    except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
        print(traceback.format_exc())  # assumes `import traceback` at module level
        crawler.warning('exceptions happen for task_id {}; details: {}'.format(task_id, e))
        time.sleep(excp_interal)

    dbc = class_MongoDB.MongoClient(uri, class_logger.getLogger('MongoDB_Users'), 'JD')
    dbc.setUnique('Users', 'username')
    dbc.update('Users', {'username': jd_user_json['username']}, {'status': 2})

    save_task_monitor(task_id, celery_stask_status, resp.text)
    return ''
Example #19
def crawl_weibo_pics(uid):
    limit = get_max_home_page()
    cur_page = 1

    # custom cap on the number of pages to crawl
    max_page = 20
    # end

    url = home_url.format(uid, cur_page)
    html = get_page(url)
    domain = public.get_userdomain(html)
    
    # Only crawl photos of personal weibo users; skip non-personal accounts (government, organizations, etc.).
    if domain not in ['103505', '100306', '100505', '']:
        set_seed_home_crawled(uid, 2)
        return
    # end

    domain_uid = domain + uid
    page_domain = 'page_' + domain
    url = pic_url.format(domain_uid, page_domain)

    html = get_page(url)

    weibo_pics, next_ajax_url = get_wbdata_fromweb(html)

    if weibo_pics is None or next_ajax_url is None:
        crawler.warning('failed to collect the photo album of user {}, please check why'.format(uid))
        set_seed_home_crawled(uid, 3)
        return

    if not weibo_pics:
        crawler.warning('photo album of user {} not collected; possibly they have never posted weibo with pictures'.format(uid))
        set_seed_home_crawled(uid, 5)
        return

    insert_weibo_pics(weibo_pics)

    if not next_ajax_url:
        crawler.warning('finished collecting the photo album of user {}'.format(uid))
        set_seed_home_crawled(uid, 4)
        return
    
    cur_page += 1

    while cur_page <= limit:

        # Some accounts have two or three thousand photos; crawling them all wastes
        # time, so crudely cap by the current page number for now. Investigation
        # suggests around 10 pages is a reasonable figure.
        if cur_page > max_page:
            break
        # end

        cur_time = int(time.time()*1000)
        ajax_call = 1
        page_id = domain_uid
        url = ajax_url.format(page_id, cur_page, ajax_call, cur_time) + '&' + next_ajax_url
        html = get_page(url, user_verify=False)

        weibo_pics, next_ajax_url = get_pic_data_byajax(html)
        
        if weibo_pics is None or next_ajax_url is None:
            crawler.warning('failed to collect the photo album of user {}, please check why'.format(uid))
            set_seed_home_crawled(uid, 3)
            return

        if not weibo_pics:
            crawler.warning('failed to collect the photo album of user {}, please check why'.format(uid))
            set_seed_home_crawled(uid, 3)
            return
        
        insert_weibo_pics(weibo_pics)
        
        if not next_ajax_url:
            crawler.warning('finished collecting the photo album of user {}'.format(uid))
            set_seed_home_crawled(uid, 4)
            return

        cur_page += 1
        
    # after reaching the configured maximum page count, exit proactively and set the flag to 4
    set_seed_home_crawled(uid, 4)
    return
Example #20
def exception_uid_handler(uid, err_code, proxy={}, html=''):
    crawler.warning('error collecting the photo album of user {}; the response content was {}, status code {}'.format(uid, html, err_code))
    if proxy:
        proxy_handler(proxy, -1)
Example #21
def finish_uid_handler(uid, proxy):
    crawler.warning('finished collecting the photo album of user {}'.format(uid))
    set_seed_home_crawled(uid, 4)
    proxy_handler(proxy, 1)
Example #22
def get_session(name, password):
    session = requests.Session()
    su = get_encodename(name)

    server_data = get_server_data(su, session)
    servertime = server_data["servertime"]
    nonce = server_data['nonce']
    rsakv = server_data["rsakv"]
    pubkey = server_data["pubkey"]

    sp = get_password(password, servertime, nonce, pubkey)

    # the data to post can be determined by packet capture
    data = {
        'encoding': 'UTF-8',
        'entry': 'weibo',
        'from': '',
        'gateway': '1',
        'nonce': nonce,
        'pagerefer': "",
        'prelt': 67,
        'pwencode': 'rsa2',
        "returntype": "META",
        'rsakv': rsakv,
        'savestate': '7',
        'servertime': servertime,
        'service': 'miniblog',
        'sp': sp,
        'sr': '1920*1080',
        'su': su,
        'useticket': '1',
        'vsnf': '1',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack'
    }
    post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    url = get_redirect(data, post_url, session)

    if url != '':
        rs_cont = session.get(url, headers=headers)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # Verify the account by fetching an arbitrary page. This check depends on
            # external state, but no better way was found (a problematic account can
            # sometimes still reach its own home page, so the account's own page is
            # not a reliable probe).
            check_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
            resp = session.get(check_url, headers=headers)

            if is_403(resp.text):
                other.error('account {} has been frozen'.format(name))
                crawler.warning('account {} has been frozen'.format(name))
                freeze_account(name)
                return None
            other.info('logged in with account {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict())
            return session

    other.error('login failed for account {}'.format(name))
    return None
Example #23
def crawl_weibo(uid):
    debug_mode = 1

    limit = get_max_home_page()
    cur_page = 1

    pic_count = 0
    max_pic_count = 150

    max_retry_cnt = 2
    cur_retry_cnt = 0

    direct_get_sleep_time = 30

    containerid = '230413' + uid
    luicode = '10000011'
    lfid = '230283' + uid
    featurecode = '20000180'
    value = uid
    page_type = '03'
    page = cur_page

    # If the db holds no proxy at all, assume we are temporarily proxy-less and must connect directly, so the sleep time should be stretched accordingly
    proxy = get_a_random_proxy()
    if proxy == {}:
        direct_get_sleep_time = 60
    elif random_event_occur():
        proxy = {}
    # end

    if debug_mode == 1:
        direct_get_sleep_time = 1
    # test for getting empty proxy
    if proxy == {}:
        time.sleep(randint(0, direct_get_sleep_time))
    # end

    url = ori_wb_temp_url.format(containerid, luicode, lfid, featurecode, value, page_type, page)
    html = get_page(url, user_verify=False, need_login=False, proxys=proxy)

    # An empty html may have other causes, but a proxy problem is the most likely one, so dock the proxy's score.
    # If the retry also returns an empty html, record the uid as abnormal and return; a non-empty but invalid html is scored later in the flow
    if html == '':
        if cur_retry_cnt < max_retry_cnt:
            cur_retry_cnt = cur_retry_cnt + 1
            proxy_handler(proxy, -1)
            proxy = get_a_random_proxy()

            if proxy == {}:
                time.sleep(randint(0, direct_get_sleep_time))

            html = get_page(url, user_verify=False, need_login=False, proxys=proxy)
            if html == '':
                proxy_handler(proxy, -1)
                return
        else:
            proxy_handler(proxy, -1)
            return
    # end

    weibo_pics = get_weibo_list(html)

    if weibo_pics == '':
        crawler.warning('requests too frequent')
        if proxy == {}:
            time.sleep(randint(0, direct_get_sleep_time))
        proxy_handler(proxy, -1)
        return

    if weibo_pics is None:
        proxy_handler(proxy, -1)
        return
    elif weibo_pics is False:
        finish_uid_handler(uid, proxy)
        return
    elif weibo_pics:
        insert_weibo_pics(weibo_pics)

    pic_count = pic_count + len(weibo_pics)

    cur_page += 1

    while cur_page <= limit and pic_count < max_pic_count:
        
        page = cur_page
        url = ori_wb_temp_url.format(containerid, luicode, lfid, featurecode, value, page_type, page)
        html = get_page(url, user_verify=False, need_login=False, proxys=proxy)

        # An empty html may have other causes, but a proxy problem is the most likely one, so dock the proxy's score.
        if html == '':
            if cur_retry_cnt < max_retry_cnt:
                cur_retry_cnt = cur_retry_cnt + 1
                proxy_handler(proxy, -1)
                proxy = get_a_random_proxy()

                if proxy == {}:
                    time.sleep(randint(0, direct_get_sleep_time))
                
                html = get_page(url, user_verify=False, need_login=False, proxys=proxy)
                if html == '':
                    exception_uid_handler(uid, 6, proxy)
                    return
            else:
                exception_uid_handler(uid, 3, proxy)
                return
        # end

        weibo_pics = get_weibo_list(html)

        # If the page fetched through the current proxy is a blocked page, dock the proxy's score and return
        if weibo_pics == '':
            crawler.warning('requests too frequent')
            if proxy == {}:
                time.sleep(randint(0, direct_get_sleep_time))
            proxy_handler(proxy, -1)
            return

        if weibo_pics is None:
            exception_uid_handler(uid, 4, proxy, html)
            return
        elif weibo_pics is False:
            finish_uid_handler(uid, proxy)
            return
        elif weibo_pics:
            insert_weibo_pics(weibo_pics)
        
        pic_count = pic_count + len(weibo_pics)

        cur_page += 1

    finish_uid_handler(uid, proxy)
    return
Example #24
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a captcha (ajax links never do;
        requests for weibo or user info may); False when fetching repost ajax links
    :param need_login: whether the page requires login; skipping login where possible
        reduces the pressure on accounts
    :return: the requested page; on 404, 403 or any other exception, an empty string
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:

        if need_login:
            # switch cookies on every retry, different from the previous ones
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool; checking for usable accounts')
                rs = get_login_info()

                if len(rs) == 0:
                    crawler.error('no account is usable; please check account health')
                    # kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    # If a usable account exists, log in with it. A local call is used
                    # here, which seems questionable: if the login queue is not on this
                    # machine the call has no effect, but with a network call it is
                    # unclear how to avoid nodes in unusual login locations, or how to
                    # otherwise get around the remote-login restriction.
                    # TODO weigh a network call against calling login.get_session() directly
                    # Captcha during node login is not handled for now; for large-scale
                    # account login, login_queue nodes should sit in the accounts' usual locations
                    crawler.info('fetching cookies again...')
                    login.excute_login_task()
                    time.sleep(10)

            if name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1
                continue

            # sleep after each fetch to reduce the risk of account bans
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(
                        page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}; details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; see the url in the redis failure queue and investigate'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #25
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a captcha (ajax links never do;
        requests for weibo or user info may); False when fetching repost ajax links
    :param need_login: whether the page requires login; skipping login where possible
        reduces the pressure on accounts
    :return: the requested page; on 404, 403 or any other exception, an empty string
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool; please check whether the accounts are ok')
                other.warning('shutting down the crawler...')
                if 'win32' in sys.platform:
                    os.popen('taskkill /F /IM "celery*"')
                else:
                    os.popen('pkill -f "celery"')
        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1
                continue

            # sleep after each fetch to reduce the risk of account bans
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or is_403(
                        page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} is locked; a phone is needed to unlock it'.format(name_cookies[0]))

                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}; details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; see the url in the redis failure queue and investigate'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #26
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a captcha (ajax links never do;
        requests for weibo or user info may); False when fetching repost ajax links
    :param need_login: whether the page requires login; skipping login where possible
        reduces the pressure on accounts
    :return: the requested page; on 404, 403 or any other exception, an empty string
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:

        if need_login:
            # switch cookies on every retry, different from the previous ones; with only one account, reuse is allowed
            name_cookies, cookies_count = Cookies.fetch_cookies()
            
            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool; checking for usable accounts')
                rs = get_login_info()

                # log in with an account in normal state; if none is usable, stop the celery workers
                if len(rs) == 0:
                    crawler.error('no account is usable; please check account health')
                    # kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    crawler.info('fetching cookies again...')
                    login.excute_login_task()
                    time.sleep(10)

            # only switch to a different cookie set on retry when the pool holds more than one
            if cookies_count > 1 and name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1
                continue

            # sleep after each fetch to reduce the risk of account bans
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}; details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; see the url in the redis failure queue and investigate'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #27
def get_page(url, user_verify=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a captcha (ajax links never do;
        requests for weibo or user info may); False when fetching repost ajax links
    :return: the requested page; on 404, 403 or any other exception, an empty string
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        # switch cookies on every retry, different from the previous ones
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.error('no cookies in the cookie pool; please check whether accounts and login tasks are ok. The crawler is exiting.')
            os._exit(0)

        if name_cookies == latest_name_cookies:
            continue

        latest_name_cookies = name_cookies

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)
            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1
                continue

            # sleep after each fetch to reduce the risk of account bans
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}; details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; see the url in the redis failure queue and investigate'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
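
One caveat in this variant: with a single account in the pool, name_cookies always equals latest_name_cookies and the continue makes the loop spin without ever reaching the request (Example #26 guards against this with a cookies_count check). A bounded sketch of the rotation, assuming fetch_cookies() returns a (name, cookies) tuple or None:

def fetch_different_cookies(fetch_cookies, last, attempts=5):
    # try a few times to get cookies different from `last`
    for _ in range(attempts):
        nc = fetch_cookies()
        if nc is None or nc != last:
            return nc
    return last  # single-account pool: fall back instead of busy-looping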
Example #28
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1

    # custom cap on the number of pages to crawl
    max_page = 10
    # end

    while cur_page <= limit:

        # Some accounts have two or three thousand photos; crawling them all wastes
        # time, so crudely cap by the current page number for now. Around 10 pages
        # looks like a reasonable figure: the desktop site shows 45 weibo per page,
        # i.e. about 450 weibo per account.
        if cur_page > max_page:
            break
        # end

        url = home_url.format(uid, cur_page)
        html = get_page(url)

        domain = public.get_userdomain(html)
        # Only crawl photos of personal weibo users; skip non-personal accounts (government, organizations, etc.).
        if domain not in ['103505', '100306', '100505', '']:
            set_seed_home_crawled(uid, 2)
            return
        # end

        weibo_datas, weibo_pics = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('failed to collect home-page weibo data of user {}, please check why'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        # if non-empty, insert weibo_pics into the database
        if weibo_pics:
            insert_weibo_pics(weibo_pics)
        # end

        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_0, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_1, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # After walking all pages, set the flag. Its placement here means every page was
    # visited, not that every visit succeeded; this may need refinement later, e.g.
    # setting it from a callback.
    set_seed_home_crawled(uid)