Example #1
def get_redirect(name, data, post_url, session, proxy):
    logining_page = session.post(post_url,
                                 data=data,
                                 headers=headers,
                                 proxies=proxy)
    login_loop = logining_page.content.decode("GBK")

    # if the name or password is wrong, freeze the account with status 2
    if 'retcode=101' in login_loop:
        crawler.error(
            'invalid password for {}, please check your account and password'.
            format(name))
        LoginInfoOper.freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} needs verification to log in'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
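
A minimal, self-contained sketch of the redirect-extraction step above: the same location.replace regex applied to a made-up login response fragment (the sample HTML is illustrative, not a real weibo response).

import re

# Illustrative fragment of a login response; real weibo pages differ.
sample = '<script>location.replace("http://passport.example.com/next?ticket=abc");</script>'

pa = r'location\.replace\([\'"](.*?)[\'"]\)'
matches = re.findall(pa, sample)
redirect_url = matches[0] if matches else ''
print(redirect_url)  # -> http://passport.example.com/next?ticket=abc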
Example #2
def get_session(name, password):
    proxy = getip.getIP("")

    url, yundama_obj, cid, session = do_login(name, password, proxy)

    if url != '':
        rs_cont = session.get(url, headers=headers, proxies=proxy)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # check if account is valid
            check_url = 'http://weibo.com/2671109275/about'
            resp = session.get(check_url, headers=headers, proxies=proxy)

            if is_403(resp.text):
                other.error('account {} has been forbidden'.format(name))
                LoginInfoOper.freeze_account(name, 0)
                return None
            other.info('Login successful! The login account is {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict(), proxy['http'])
            return session
        
    other.error('login failed for {}'.format(name))
    return None
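
A self-contained sketch of the uniqueid extraction used above, run against a made-up login_info sample. A non-greedy (.*?) is used here because with several quoted fields on one line the greedy (.*) from the snippet would over-match.

import re

# Made-up fragment resembling the login response body.
login_info = '{"retcode":"0","uniqueid":"2671109275","nick":"demo"}'

u_pattern = r'"uniqueid":"(.*?)",'
m = re.search(u_pattern, login_info)
if m and m.group(1):
    print(m.group(1))  # -> 2671109275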
Example #3
def get_session(name, password):
    proxy = getip.getIP("")

    url, yundama_obj, cid, session = do_login(name, password, proxy)

    if url != '':
        rs_cont = session.get(url, headers=headers, proxies=proxy)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # check if account is valid
            check_url = 'http://weibo.com/2671109275/about'
            resp = session.get(check_url, headers=headers, proxies=proxy)

            if is_403(resp.text):
                other.error('account {} has been forbidden'.format(name))
                LoginInfoOper.freeze_account(name, 0)
                return None
            other.info(
                'Login successful! The login account is {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict(),
                                  proxy['http'])
            return session

    other.error('login failed for {}'.format(name))
    return None
Example #4
 def test_login_oper(self):
     infos = LoginInfoOper.get_login_info()
     assert len(infos) == 0
     db_session.execute("insert into {} ({}.name) values (".format(
         login_info.name, login_info.name) + FAKE_ID + ")")
     infos = LoginInfoOper.get_login_info()
     assert len(infos) == 1
     LoginInfoOper.freeze_account(FAKE_ID, 0)
     infos = LoginInfoOper.get_login_info()
     assert len(infos) == 0
Example #5
 def test_login_oper(self):
     infos = LoginInfoOper.get_login_info()
     assert len(infos) == 0
     db_session.execute("insert into {} ({}.name) values (".format(login_info.name, login_info.name)
                        + FAKE_ID + ")")
     infos = LoginInfoOper.get_login_info()
     assert len(infos) == 1
     LoginInfoOper.freeze_account(FAKE_ID, 0)
     infos = LoginInfoOper.get_login_info()
     assert len(infos) == 0
Example #6
def execute_login_task():
    infos = LoginInfoOper.get_login_info()
    # Clear all stacked login tasks before each login run
    Cookies.check_login_task()
    crawler.info('The login task is starting...')
    for info in infos:
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
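
For context, a hedged sketch of what the consuming side of app.send_task might look like: a Celery worker registering a task under the name 'tasks.login.login_task'. The broker URL and the task body here are placeholders; the real implementation lives elsewhere in the project.

from celery import Celery

# Placeholder broker URL; the real project configures its own broker.
app = Celery('weibo_task', broker='redis://localhost:6379/1')

@app.task(name='tasks.login.login_task')
def login_task(name, password):
    # Placeholder body: the real task would perform the login flow,
    # e.g. obtain a session and store its cookies for the crawler.
    print('logging in as {}'.format(name))

The producer in execute_login_task only needs the task name, queue, and routing_key; it never imports login_task directly, which allows the login workers to run as separate processes or on separate machines.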
Example #7
def get_redirect(name, data, post_url, session, proxy):
    logining_page = session.post(post_url, data=data, headers=headers, proxies=proxy)
    login_loop = logining_page.content.decode("GBK")

    # if the name or password is wrong, freeze the account with status 2
    if 'retcode=101' in login_loop:
        crawler.error('invalid password for {}, please check your account and password'.format(name))
        LoginInfoOper.freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} needs verification to log in'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
Example #8
def execute_login_task():
    # Get the info of all weibo accounts that need to log in
    infos = LoginInfoOper.get_login_info()
    # Clear all stacked login tasks before each login run
    Cookies.check_login_task()
    crawler.info('The login task is starting...')
    for info in infos:
        # Send args to the named task so that it starts running.
        # The queue argument routes the task through this queue;
        # the routing key is given by the routing_key argument.
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
Example #9
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """
    :param url: url to crawl
    :param auth_level: 0 means no auth is needed, 1 means no login but cookies are needed, 2 means login is required
    :param is_ajax: whether the request is ajax
    :param need_proxy: whether the request needs an http/https proxy
    :return: response text; when an exception is raised, return ''
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < MAX_RETRIES:
        if auth_level == 2:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning(
                    'No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired'
                )
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # The same proxy address is used for both http and https.
            proxy = {
                'http': name_cookies[2],
                'https': name_cookies[2],
            }
        else:
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            #     crawler.warning('No available ip in ip pools. Using local ip instead.')

        try:
            if auth_level == 2:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
            elif auth_level == 1:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=COOKIES,
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning(
                'Exceptions are raised when crawling {}. Here are details: {}'.
                format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue

        if resp.status_code == 414:
            crawler.warning('This IP has been blocked by the weibo system')
            if not need_proxy:
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        if resp.text:
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            count += 1
            continue
        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)
            if is_banned(resp.url) or is_403(page):
                crawler.warning('Account {} has been banned'.format(
                    name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue

            if not is_ajax and not is_complete(page):
                count += 1
                continue

        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''
        Urls.store_crawl_url(url, 1)
        return page

    Urls.store_crawl_url(url, 0)
    return ''
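
A stripped-down, self-contained version of the retry loop above, assuming only the requests library. The constants are re-declared locally with illustrative values, and the cookie, proxy, and ban handling from get_page is omitted.

import time
import requests

MAX_RETRIES = 3      # illustrative values, not the project's settings
TIME_OUT = 10
EXCP_INTERVAL = 5

def fetch_with_retries(url, headers=None, proxies=None):
    count = 0
    while count < MAX_RETRIES:
        try:
            resp = requests.get(url, headers=headers, timeout=TIME_OUT,
                                verify=False, proxies=proxies)
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError):
            # network error: wait and retry
            count += 1
            time.sleep(EXCP_INTERVAL)
            continue
        if resp.text:
            return resp.text
        count += 1
    return ''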
Example #10
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """
    :param url: url to crawl
    :param auth_level: 0 means no auth is needed, 1 means no login but cookies are needed, 2 means login is required
    :param is_ajax: whether the request is ajax
    :param need_proxy: whether the request needs an http/https proxy
    :return: response text; when an exception is raised, return ''
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < MAX_RETRIES:
        if auth_level == 2:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # The same proxy address is used for both http and https.
            proxy = {'http': name_cookies[2], 'https': name_cookies[2], }
        else:
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            #     crawler.warning('No available ip in ip pools. Using local ip instead.')
        
        try:
            if auth_level == 2:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=TIME_OUT, verify=False, proxies=proxy)
            elif auth_level == 1:
                resp = requests.get(url, headers=headers, cookies=COOKIES, timeout=TIME_OUT, verify=False, proxies=proxy)
            else:
                resp = requests.get(url, headers=headers, timeout=TIME_OUT, verify=False, proxies=proxy)
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('Exceptions are raised when crawling {}. Here are details: {}'.format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue

        if resp.status_code == 414:
            crawler.warning('This IP has been blocked by the weibo system')
            if not need_proxy:
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        if resp.text:
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            count += 1
            continue
        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)
            if is_banned(resp.url) or is_403(page):
                crawler.warning('Account {} has been banned'.format(name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue

            if not is_ajax and not is_complete(page):
                count += 1
                continue

        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''
        Urls.store_crawl_url(url, 1)
        return page

    Urls.store_crawl_url(url, 0)
    return ''