Example #1
def get_session(name, password, need_verify):
    url, yundama_obj, cid, session = do_login(name, password, need_verify)
    # Handle captcha-solving errors: report the bad result and retry the login
    while url == 'pinerror' and yundama_obj is not None:
        yundama_obj.report_error(cid)
        url, yundama_obj, cid, session = do_login(name, password, need_verify)

    if url != '':
        rs_cont = session.get(url, headers=headers)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # Verify by fetching an arbitrary page. This check depends on external state,
            # but no better way was found (a problematic account can sometimes still access
            # its own homepage, so the account's own homepage is not a reliable health check).
            check_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
            resp = session.get(check_url, headers=headers)
            # Experiments so far show that accounts which never passed phone verification cannot be recovered...
            if is_403(resp.text):
                other.error('Account {} has been frozen'.format(name))
                crawler.warning('Account {} has been frozen'.format(name))
                freeze_account(name, 0)
                return None
            other.info('Current login account: {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict())
            return session

    other.error('Login failed for account {}'.format(name))
    return None
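
A minimal usage sketch for the function above, assuming do_login, headers, Cookies, and the logger objects come from the surrounding project; the driver itself is hypothetical:

def login_all(accounts):
    # accounts: iterable of (name, password, need_verify) tuples (illustrative)
    for name, password, need_verify in accounts:
        session = get_session(name, password, need_verify)
        if session is not None:
            return session  # cookies were already persisted via Cookies.store_cookies
    return None
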
Example #2
def get_session(name, password):
    proxy = getip.getIP("")

    url, yundama_obj, cid, session = do_login(name, password, proxy)

    if url != '':
        rs_cont = session.get(url, headers=headers, proxies=proxy)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # check if account is valid
            check_url = 'http://weibo.com/2671109275/about'
            resp = session.get(check_url, headers=headers, proxies=proxy)

            if is_403(resp.text):
                other.error('account {} has been forbidden'.format(name))
                LoginInfoOper.freeze_account(name, 0)
                return None
            other.info('Login successful! The login account is {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict(), proxy['http'])
            return session
        
    other.error('login failed for {}'.format(name))
    return None
Example #3
def get_session(name, password):
    proxy = getip.getIP("")

    url, yundama_obj, cid, session = do_login(name, password, proxy)

    if url != '':
        rs_cont = session.get(url, headers=headers, proxies=proxy)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # check if account is valid
            check_url = 'http://weibo.com/2671109275/about'
            resp = session.get(check_url, headers=headers, proxies=proxy)

            if is_403(resp.text):
                other.error('account {} has been forbidden'.format(name))
                LoginInfoOper.freeze_account(name, 0)
                return None
            other.info(
                'Login successful! The login account is {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict(),
                                  proxy['http'])
            return session

    other.error('login failed for {}'.format(name))
    return None
Example #4
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all stacked login tasks before each time for login
    Cookies.check_login_task()
    log.crawler.info('The login task is starting...')
    for info in infos:
        app.send_task('tasks.login.login_task', args=(info.name, info.password), queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
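
For context, a hedged sketch of the Celery wiring these send_task calls assume. The queue name, routing key, and task name come from the examples; the broker URL and decorator placement are illustrative assumptions:

from celery import Celery
from kombu import Queue

app = Celery('weibo_spider', broker='redis://localhost:6379/5')  # assumed broker
app.conf.task_queues = (
    Queue('login_queue', routing_key='for_login'),
)

@app.task(name='tasks.login.login_task')
def login_task(name, password):
    # A worker consuming login_queue performs the actual login here,
    # e.g. by calling get_session(name, password).
    ...
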
Example #5
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all stacked login celery_tasks before each time for login
    Cookies.check_login_task()
    log.crawler.info('The excute_login_task is starting...')
    for info in infos:
        celery.send_task('celery_tasks.weibo.login.login_task',
                         args=(info.name, info.password, info.source),
                         queue='login_task',
                         routing_key='login_task')
        time.sleep(10)
Example #6
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all queued login tasks before each login round
    Cookies.check_login_task()
    log.crawler.info('This round of simulated login is starting')
    for info in infos:
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
Example #7
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all stacked login tasks before each time for login
    Cookies.check_login_task()
    log.crawler.info('The login task is starting...')
    for info in infos:
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
Example #8
File: login.py Project: cptBTptpwbct/weibo
def execute_login_task():
    infos = LoginInfoOper.get_login_info()
    # Clear all stacked login tasks before each time for login
    Cookies.check_login_task()
    crawler.info('The login task is starting...')
    if len(infos) <= 0:
        crawler.info('No accounts available for login')
    for info in infos:
        # app.send_task('tasks.login.login_task', args=(info.name, info.password), queue='login_queue',
        #           routing_key='for_login')
        login_task(info.name, info.password)
        time.sleep(10)
Example #9
def execute_login_task():
    # Fetch the info of all weibo accounts that need to log in
    infos = LoginInfoOper.get_login_info()
    # Clear all stacked login tasks before each time for login
    Cookies.check_login_task()
    crawler.info('The login task is starting...')
    for info in infos:
        # Send the named task its args to start it.
        # queue: the queue used to route the task message.
        # routing_key: the routing key for that queue.
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
Example #10
 def test_delete_cookies(self):
     """
     delete according to key
     """
     from db.redis_db import Cookies
     r = Cookies.delete_cookies('18708103033')
     self.assertEqual(r, True)
Example #11
 def test_delete_cookies(self):
     """
     test deleting a cookie by key
     """
     from db.redis_db import Cookies
     r = Cookies.delete_cookies('18708103033')
     self.assertEqual(r, True)
Example #12
 def test_delete_cookies(self):
     """
     delete according to key
     """
     from db.redis_db import Cookies
     r = Cookies.delete_cookies('18708103033')
     self.assertEqual(r, True)
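
The Cookies helper exercised by these tests wraps a Redis hash. A minimal sketch consistent with the store_cookies/fetch_cookies/delete_cookies calls seen here; the key name and db index are assumptions, and later examples store extra fields (proxy address, message count) that this two-field version omits:

import json
import random

import redis

r = redis.StrictRedis(db=1, decode_responses=True)  # assumed db index
HASH_KEY = 'account_cookies'  # hypothetical hash name

class Cookies:
    @classmethod
    def store_cookies(cls, name, cookies):
        r.hset(HASH_KEY, name, json.dumps(cookies))

    @classmethod
    def fetch_cookies(cls):
        # Return a random (account_name, cookies) pair, or None when the pool is empty.
        items = r.hgetall(HASH_KEY)
        if not items:
            return None
        name = random.choice(list(items))
        return name, json.loads(items[name])

    @classmethod
    def delete_cookies(cls, name):
        r.hdel(HASH_KEY, name)
        return True
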
Example #13
def get_page(url, need_login=True):
    """
    :param url: url to be crawled
    :param need_login: True if the url requires login, else False
    :return: '' if an exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning(
                    'no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text

            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1  # count empty responses so retries stay bounded
                continue
            # slow down to avoid being banned
            time.sleep(interal)

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning(
                'exceptions happened when crawling {}, details: {}'.format(
                    url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {}, check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
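
A hedged usage sketch, assuming the module-level configuration (headers, max_retries, time_out, interal, excp_interal) and the Cookies/Urls helpers are set up as in the project; both URLs are illustrative:

# Authenticated page: a cookie is drawn from the pool before the request.
html = get_page('http://weibo.com/p/1005051764222885/info', need_login=True)

# Public endpoint: no cookie attached.
fragment = get_page('http://weibo.com/some/public/page', need_login=False)
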
Example #14
File: login.py Project: Glossy/WeiboSpider
def get_session(name, password):
    url, yundama_obj, cid, session = do_login(name, password)

    if url != '':
        rs_cont = session.get(url, headers=headers)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # Visit an official Weibo account's page to check that the session is healthy
            check_url = 'http://weibo.com/2671109275/about'
            resp = session.get(check_url, headers=headers)
            # Experiments so far show that accounts which never passed phone verification cannot be recovered...
            if is_403(resp.text):
                other.error('Account {} has been frozen'.format(name))
                freeze_account(name, 0)
                return None
            other.info('Current login account: {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict())
            return session

    other.error('Login failed for account {}'.format(name))
    return None
Example #15
    def login(self):
        user_name = input('Enter login name: ')
        password = input('Enter password: ')

        # Determine the login method
        if re.match(r'\d{4}-\d{7}|1[34578]\d{9}', user_name):
            print('Login method: phone')
            post_url = 'https://www.zhihu.com/login/phone_num'
            post_data = {
                'phone_num': user_name,
                'password': password,
                '_xsrf': self.getXsrf()
            }
        else:
            print('Login method: email')
            post_url = 'https://www.zhihu.com/login/email'
            post_data = {
                'email': user_name,
                'password': password,
                '_xsrf': self.getXsrf()
            }
        try:
            login_page = self.session.post(post_url,
                                           headers=headers,
                                           data=post_data)
            print(login_page.status_code)
            print('Direct login succeeded!')
        except Exception:
            print('Direct login failed!')
            post_data['captcha'] = self.getCaptcha()
            login_page = self.session.post(post_url,
                                           headers=headers,
                                           data=post_data)
            Cookies.store_cookies(user_name, self.session.cookies.get_dict())
            login_code = json.loads(login_page.text)  # needs "import json"; safer than eval()
            print(login_code['msg'])
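
The getXsrf helper used above typically scrapes Zhihu's hidden _xsrf form token from the homepage. A hedged sketch of one common approach; the markup, and therefore the regex, is an assumption:

    def getXsrf(self):
        # Fetch the homepage and pull the hidden _xsrf token out of the login form.
        page = self.session.get('https://www.zhihu.com', headers=headers).text
        m = re.search(r'name="_xsrf" value="([^"]+)"', page)
        return m.group(1) if m else ''
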
Example #16
def excute_login_task():
    infos = login_info.get_login_info()
    log.crawler.info('This round of simulated login is starting')
    for info in infos:
        try:
            rs = Cookies.check_login_task(info.name)
        except KeyError:
            log.crawler.warning('Check that a worker has been started and that login_queue is configured')
        else:
            if not rs:
                app.send_task('tasks.login.login_task',
                              args=(info.name, info.password,
                                    info.need_verify),
                              queue='login_queue',
                              routing_key='for_login')
Example #17
def get_page(url, user_verify=True):
    """
    :param url: url to be crawled
    :param user_verify: True for pages that may show a captcha (weibo or user-info requests); False for the ajax urls used to crawl reposts, which never show one
    :return: the response text; returns '' on 404, 403, or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        # Switch cookies on every retry, never reusing the previous ones
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.error('No cookie in the cookie pool; check the accounts and the login task. Crawler exiting.')
            os._exit(0)

        if name_cookies == latest_name_cookies:
            continue

        latest_name_cookies = name_cookies

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)
            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1  # count empty responses so retries stay bounded
                continue

            # Sleep after each crawl to lower the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or is_403(page):
                    crawler.warning('Account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('The link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('Exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('Max retries reached for {}; check this url in the redis failure queue for the cause'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #18
 def test_store_and_fetch_cookies(self):
     assert Cookies.fetch_cookies() is None
     Cookies.store_cookies(FAKE_STR, FAKE_STR)
     assert Cookies.fetch_cookies() is not None
Example #19
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: False for ajax urls (which never show a captcha), True otherwise
    :param need_login: True if the url requires login, else False
    :return: '' if an exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text

            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1  # count empty responses so retries stay bounded
                continue

            # slow down to avoid being banned
            time.sleep(interal)

            if user_verify:
                if is_banned(resp.url) or is_403(page):
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked, use your phone to unlock it'.
                                    format(name_cookies[0]))

                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happened when crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {}, check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
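
Across these variants freeze_account takes a status argument: 0 when the account is banned or frozen, -1 when it is locked pending phone verification. A minimal stand-in that records the convention; the project's real helper presumably updates a login-info table rather than a dict:

ACCOUNT_STATUS = {}  # hypothetical in-memory stand-in for the login-info table

def freeze_account(name, status):
    # status: 0 = banned/frozen, -1 = locked, phone verification required
    ACCOUNT_STATUS[name] = status
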
Example #20
def send_personal_message(target_uid, adver_message, user_verify=True, need_login=True):
    """
    :param target_uid: uid of the user to receive the message
    :param adver_message: text of the message to send
    :param user_verify: if it's ajax url, the value is False, else True
    :param need_login: True if the request requires login, else False
    :return: None on success, '' after max retries
    """
    crawler.info('the send_personal_message uid is {uid}'.format(uid=str(target_uid)))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # check adver_timers: skip cookies that already reached the message quota
            if int(name_cookies[3]) >= int(adver_timers):
                continue
        try:
            if need_login:
                resp = requests.post('http://api.weibo.com/webim/2/direct_messages/new.json?source='+str(name_cookies[2]),
                              data={'text': adver_message, 'uid':str(target_uid)},
                              cookies=name_cookies[1], headers=personal_message_headers)

                if "error" in resp.text:
                    crawler.warning('account {} has been banned, resp.text is: {}'.format(name_cookies[0], resp.text))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
                else:
                    # update adver_times
                    Cookies.store_cookies(name_cookies[0], name_cookies[1], name_cookies[2], 1)
                    return None

            #     if "$CONFIG['islogin'] = '******'" in resp.text:
            #         crawler.warning('account {} has been banned'.format(name_cookies[0]))
            #         freeze_account(name_cookies[0], 0)
            #         Cookies.delete_cookies(name_cookies[0])
            #         continue
            # # else:
            # #     resp = requests.get(url, headers=headers, timeout=time_out, verify=False)
            #
            # page = resp.text
            #
            # if page:
            #     page = page.encode('utf-8', 'ignore').decode('utf-8')
            # else:
            #     continue
            #
            # # slow down to aviod being banned
            # time.sleep(interal)
            #
            # if user_verify:
            #     if is_banned(resp.url) or is_403(page):
            #         crawler.warning('account {} has been banned'.format(name_cookies[0]))
            #         freeze_account(name_cookies[0], 0)
            #         Cookies.delete_cookies(name_cookies[0])
            #         count += 1
            #         continue
            #
            #     if 'verifybmobile' in resp.url:
            #         crawler.warning('account {} has been locked,you should use your phone to unlock it'.
            #                         format(name_cookies[0]))
            #
            #         freeze_account(name_cookies[0], -1)
            #         Cookies.delete_cookies(name_cookies[0])
            #         continue
            #
            #     if not is_complete(page):
            #         count += 1
            #         continue
            #
            #     if is_404(page):
            #         crawler.warning('send_personal_message{uid} seems to be 404'.format(uid=str(target_uid)))
            #         return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happened when sending message to {}, details: {}'.format(target_uid, e))
            count += 1
            time.sleep(excp_interal)

        else:
            return None

    crawler.warning('max tries for {}, check the target_uid in redis db2'.format(target_uid))
    return ''
Example #21
 def test_del_cookies(self):
     Cookies.delete_cookies(FAKE_STR)
     assert Cookies.fetch_cookies() is None
Example #22
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """
    :param url: url to crawl
    :param auth_level: 0: nothing needed; 1: no login but cookies needed; 2: login needed
    :param is_ajax: whether the request is ajax
    :param need_proxy: whether the request need a http/https proxy
    :return: response text; '' when an exception is raised
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < MAX_RETRIES:
        if auth_level == 2:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning(
                    'No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired'
                )
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # There is no difference between http and https address.
            proxy = {
                'http': name_cookies[2],
                'https': name_cookies[2],
            }
        else:
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            #     crawler.warning('No available ip in ip pools. Using local ip instead.')

        try:
            if auth_level == 2:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
            elif auth_level == 1:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=COOKIES,
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning(
                'Exceptions were raised when crawling {}. Details: {}'.format(
                    url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue

        if resp.status_code == 414:
            crawler.warning('This ip has been blocked by weibo system')
            if not need_proxy:
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        if resp.text:
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            count += 1
            continue
        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)
            if is_banned(resp.url) or is_403(page):
                crawler.warning('Account {} has been banned'.format(
                    name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue

            if not is_ajax and not is_complete(page):
                count += 1
                continue

        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''
        Urls.store_crawl_url(url, 1)
        return page

    Urls.store_crawl_url(url, 0)
    return ''
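
Examples #22 and #26 thread a per-account proxy through every request: for logged-in requests the proxy address travels with the cookies (name_cookies[2]), otherwise one comes from the IP pool. A hedged sketch of the shape getip.getIPWithoutLogin is assumed to return; the address is a placeholder:

def getIPWithoutLogin(_):
    # Would draw an address from the project's IP pool; placeholder shown here.
    ip_port = '127.0.0.1:8888'
    return {'http': 'http://' + ip_port, 'https': 'http://' + ip_port}
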
Example #23
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: True for pages that may show a captcha (weibo or user-info requests); False for the ajax urls used to crawl reposts, which never show one
    :param need_login: whether the page requires login; skipping login where possible eases the pressure on the accounts
    :return: the response text; returns '' on 404, 403, or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:

        if need_login:
            # Switch cookies on every retry, never reusing the previous ones; with a single account, reuse is allowed
            name_cookies, cookies_count = Cookies.fetch_cookies()
            
            if name_cookies is None:
                crawler.warning('No cookie in the cookie pool; checking for usable accounts')
                rs = get_login_info()

                # Pick healthy accounts to log in with; if none are usable, stop the celery worker
                if len(rs) == 0:
                    crawler.error('No usable accounts; check account health')
                    # Kill every celery-related process
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    crawler.info('Re-fetching cookies...')
                    login.excute_login_task()
                    time.sleep(10)

            # Only switch to a different cookie per retry when the pool holds more than one
            if cookies_count > 1 and name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('Account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1  # count empty responses so retries stay bounded
                continue

            # Sleep after each crawl to lower the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('Account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('The link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('Exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('Max retries reached for {}; check this url in the redis failure queue for the cause'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #24
 def test_store_and_fetch_cookies(self):
     assert Cookies.fetch_cookies() is None
     Cookies.store_cookies(FAKE_STR, FAKE_STR)
     assert Cookies.fetch_cookies() is not None
Example #25
def get_session(name, password):
    session = requests.Session()
    su = get_encodename(name)

    server_data = get_server_data(su, session)
    servertime = server_data["servertime"]
    nonce = server_data['nonce']
    rsakv = server_data["rsakv"]
    pubkey = server_data["pubkey"]

    sp = get_password(password, servertime, nonce, pubkey)

    # The POST payload can be obtained by capturing the browser's login traffic
    data = {
        'encoding': 'UTF-8',
        'entry': 'weibo',
        'from': '',
        'gateway': '1',
        'nonce': nonce,
        'pagerefer': "",
        'prelt': 67,
        'pwencode': 'rsa2',
        "returntype": "META",
        'rsakv': rsakv,
        'savestate': '7',
        'servertime': servertime,
        'service': 'miniblog',
        'sp': sp,
        'sr': '1920*1080',
        'su': su,
        'useticket': '1',
        'vsnf': '1',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack'
    }
    post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    url = get_redirect(data, post_url, session)

    if url != '':
        rs_cont = session.get(url, headers=headers)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m:
            if m.group(1):
                # Verify by fetching an arbitrary page. This check depends on external state,
                # but no better way was found (a problematic account can sometimes still access
                # its own homepage, so the account's own homepage is not a reliable health check).
                check_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
                resp = session.get(check_url, headers=headers)

                if is_403(resp.text):
                    other.error('Account {} has been frozen'.format(name))
                    crawler.warning('Account {} has been frozen'.format(name))
                    freeze_account(name)
                    return None
                other.info('Current login account: {}'.format(name))
                Cookies.store_cookies(name, session.cookies.get_dict())
                return session
            else:
                other.error('Login failed for account {}'.format(name))
                return None
        else:
            other.error('Login failed for account {}'.format(name))
            return None
    else:
        other.error('Login failed for account {}'.format(name))
        return None
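
The get_password step above implements Weibo's SSO scheme: servertime, nonce, and the plaintext password are concatenated and RSA-encrypted with the server-supplied public key. A hedged sketch of the commonly documented algorithm using the third-party rsa package; treat it as an illustration, not the project's exact code:

import binascii

import rsa  # pip install rsa

def get_password(password, servertime, nonce, pubkey):
    # ssologin.js builds "servertime\tnonce\npassword" and encrypts it with the
    # hex-encoded modulus from get_server_data (public exponent 65537).
    key = rsa.PublicKey(int(pubkey, 16), 65537)
    message = '{}\t{}\n{}'.format(servertime, nonce, password).encode('utf-8')
    return binascii.b2a_hex(rsa.encrypt(message, key)).decode('utf-8')
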
Example #26
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """
    :param url: url to crawl
    :param auth_level: 0: nothing needed; 1: no login but cookies needed; 2: login needed
    :param is_ajax: whether the request is ajax
    :param need_proxy: whether the request need a http/https proxy
    :return: response text; '' when an exception is raised
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < MAX_RETRIES:
        if auth_level == 2:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # There is no difference between http and https address.
            proxy = {'http': name_cookies[2], 'https': name_cookies[2], }
        else:
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            #     crawler.warning('No available ip in ip pools. Using local ip instead.')
        
        try:
            if auth_level == 2:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=TIME_OUT, verify=False, proxies=proxy)
            elif auth_level == 1:
                resp = requests.get(url, headers=headers, cookies=COOKIES, timeout=TIME_OUT, verify=False, proxies=proxy)
            else:
                resp = requests.get(url, headers=headers, timeout=TIME_OUT, verify=False, proxies=proxy)
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('Exceptions were raised when crawling {}. Details: {}'.format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue

        if resp.status_code == 414:
            crawler.warning('This ip has been blocked by weibo system')
            if not need_proxy:
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        if resp.text:
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            count += 1
            continue
        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)
            if is_banned(resp.url) or is_403(page):
                crawler.warning('Account {} has been banned'.format(name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue

            if not is_ajax and not is_complete(page):
                count += 1
                continue

        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''
        Urls.store_crawl_url(url, 1)
        return page

    Urls.store_crawl_url(url, 0)
    return ''
Example #27
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: True for pages that may show a captcha (weibo or user-info requests); False for the ajax urls used to crawl reposts, which never show one
    :param need_login: whether the page requires login; skipping login where possible eases the pressure on the accounts
    :return: the response text; returns '' on 404, 403, or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:

        if need_login:
            # Switch cookies on every retry, never reusing the previous ones
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('No cookie in the cookie pool; checking for usable accounts')
                rs = get_login_info()

                if len(rs) == 0:
                    crawler.error('No usable accounts; check account health')
                    # Kill every celery-related process
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    # If usable accounts exist, log in with them. A local call is used here,
                    # which is questionable: if the login queue is not on this machine the
                    # call has no effect, yet with a network call it is unclear how to keep
                    # logins off nodes far from the accounts' usual login locations; a better
                    # way around the remote-login restriction is still to be found.
                    # TODO weigh a network call against calling login.get_session() directly
                    # Captchas on login nodes are ignored for now; for large-scale logins the
                    # login_queue nodes should sit in the accounts' usual locations.
                    crawler.info('Re-fetching cookies...')
                    login.excute_login_task()
                    time.sleep(10)

            if name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('Account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1  # count empty responses so retries stay bounded
                continue

            # Sleep after each crawl to lower the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(
                        page):
                    crawler.warning('Account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('The link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('Exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('Max retries reached for {}; check this url in the redis failure queue for the cause'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #28
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: True for pages that may show a captcha (weibo or user-info requests);
    False for the ajax urls used to crawl reposts, which never show one
    :param need_login: whether the page requires login; skipping login where possible eases the pressure on the accounts
    :return: the response text; returns '' on 404, 403, or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('No cookie in the cookie pool; check whether the accounts are healthy')
                other.warning('Shutting down the crawler...')
                if 'win32' in sys.platform:
                    os.popen('taskkill /F /IM "celery*"')
                else:
                    os.popen('pkill -f "celery"')
        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('Account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                count += 1  # count empty responses so retries stay bounded
                continue

            # Sleep after each crawl to lower the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or is_403(
                        page):
                    crawler.warning('Account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('Account {} is locked; phone verification is needed to unlock it'.format(name_cookies[0]))

                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('The link {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('Exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('Max retries reached for {}; check this url in the redis failure queue for the cause'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #29
 def test_del_cookies(self):
     Cookies.delete_cookies(FAKE_STR)
     assert Cookies.fetch_cookies() is None