示例#1
0
    def solve_cf_challenge(self, resp, **original_kwargs):
        """
        Solve a Cloudflare IUAM (javascript) challenge page and return the follow-up response.

        The challenge form target, its hidden inputs and the optional
        `__cf_chl_jschl_tk__` token are scraped from the page body, the javascript
        challenge is evaluated with js2py and the answer is submitted after `self.wait()`.

        :param resp: response holding the challenge page; its text, url and request are used
        :param original_kwargs: keyword arguments of the request that triggered the challenge.
            A copy (without `hooks`) is used for the submission; the originals are not
            modified and are reused to replay the request after a redirect
        :return: the submit response if its status code is 200, otherwise the response of
            replaying the original request at the redirect location,
            or None if the submit response is falsy
        :raises CloudflareError: if the page presents a captcha
        :raises ValueError: if the challenge js cannot be evaluated, or if no usable
            redirect location is returned
        """
        body = resp.text
        parsed_url = urlparse(resp.url)
        domain = parsed_url.netloc

        if '/cdn-cgi/l/chk_captcha' in body or 'cf_chl_captcha' in body:
            raise CloudflareError(
                'Cloudflare captcha presented for %s, please notify SickGear for an update, ua: %s'
                % (domain, self.cf_ua),
                response=resp)

        # form action and method, falling back to 'cdn-cgi/l/chk_jschl' and the
        # method of the challenged request when no challenge form is found
        try:
            action, method = re.findall(
                r'(?sim)<form.*?id="challenge.*?action="/?([^?"]+).*?method="([^"]+)',
                body)[0]
        except IndexError:
            action, method = 'cdn-cgi/l/chk_jschl', resp.request.method
        submit_url = '%s://%s/%s' % (parsed_url.scheme, domain, action)

        cloudflare_kwargs = {
            k: v
            for k, v in original_kwargs.items() if k not in ['hooks']
        }

        # The kwargs copy above is shallow, so work on copies of the payload and
        # header mappings; otherwise the challenge fields and Referer would leak
        # into `original_kwargs`, which is reused for the replay further down.
        is_get = 'GET' == method.upper()
        payload_key = ('data', 'params')[is_get]
        payload = cloudflare_kwargs.get(payload_key)
        if isinstance(payload, dict):
            params = payload.copy()
        elif not payload:
            params = {}
        else:
            # non-mapping payload is passed through as supplied
            params = payload
        cloudflare_kwargs[payload_key] = params

        headers = cloudflare_kwargs.get('headers')
        headers = headers.copy() if headers else {}
        headers['Referer'] = resp.url
        cloudflare_kwargs['headers'] = headers

        token = re.findall(r'(?sim)__cf_chl_jschl_tk__=([^"]+)', body)
        if token:
            if is_get and isinstance(params, dict):
                # the form fields are themselves sent as query params, so the token
                # must be merged with them instead of replacing them
                params['__cf_chl_jschl_tk__'] = token[0]
            else:
                cloudflare_kwargs['params'] = dict(__cf_chl_jschl_tk__=token[0])

        if self.delay == self.default_delay:
            # no instantiated delay, therefore check js for hard coded CF delay
            hard_coded = re.search(r'submit\(\);[^0-9]*?([0-9]+)', body)
            if hard_coded:
                self.delay = float(hard_coded.group(1)) / float(1000)

        # collect hidden form inputs, ignoring inputs that are commented out in the html
        for i in re.findall(r'(<input[^>]+?hidden[^>]+?>)',
                            re.sub(r'(?sim)<!--\s+<input.*?(?=<)', '', body)):
            value = re.findall(r'value="([^"\']+?)["\']', i)
            name = re.findall(r'name="([^"\']+?)["\']', i)
            if all([name, value]):
                params[name[0]] = value[0]

        js = self.extract_js(body, domain)
        atob = (lambda s: b64decodestring('%s' % s))
        error = None
        # Eval the challenge algorithm, a failed evaluation is retried once before giving up
        for _ in range(2):
            try:
                params['jschl_answer'] = str(js2py.EvalJs({'atob': atob}).eval(js))
                break
            except Exception as e:
                error = e
        else:
            # Something is wrong with the page. This may indicate Cloudflare has changed their anti-bot technique.
            raise ValueError(
                'Unable to parse Cloudflare anti-bot IUAM page: %r' % error)

        # Requests transforms any request into a GET after a redirect,
        # so the redirect has to be handled manually here to allow for
        # performing other types of requests even as the first request.
        cloudflare_kwargs['allow_redirects'] = False

        self.wait()
        response = self.request(method, submit_url, **cloudflare_kwargs)
        if response:
            if 200 == response.status_code:
                return response

            # legacy redirection handler (pre 2019.11.xx)
            location = response.headers.get('Location')
            no_location = 'Unable to find a new location from Cloudflare anti-bot IUAM page'
            if not location:
                # an absent header must be checked explicitly because
                # urlparse does not reliably raise for a None argument
                raise ValueError(no_location)
            try:
                r = urlparse(location)
            except Exception:
                # Something is wrong with the page, perhaps CF changed their anti-bot technique
                raise ValueError(no_location)

            if not r.netloc or location.startswith('/'):
                # relative location, rebuild it against the challenged host
                location = urlunparse((parsed_url.scheme, domain, r.path,
                                       r.params, r.query, r.fragment))
            return self.request(resp.request.method, location,
                                **original_kwargs)
示例#2
0
def get_url(
        url,  # type: AnyStr
        post_data=None,  # type: Optional
        params=None,  # type: Optional
        headers=None,  # type: Optional[Dict]
        timeout=30,  # type: int
        session=None,  # type: Optional[requests.Session]
        parse_json=False,  # type: bool
        raise_status_code=False,  # type: bool
        raise_exceptions=False,  # type: bool
        as_binary=False,  # type: bool
        encoding=None,  # type: Optional[AnyStr]
        **kwargs):
    # type: (...) -> Optional[Union[AnyStr, bool, bytes, Dict, Tuple[Union[Dict, List], requests.Session]]]
    """
    Fetch a url with GET (or POST when post data is given) and return the result.

    Either
    1) Returns the response text (or bytes if as_binary=True) retrieved from the url.
    2) Returns True after the response is written to the file pathname given in kwargs 'savename'.
    3) Returns Tuple (result, session) if kwargs 'resp_sess' is True.
    4) Returns JSON Dict/List if parse_json=True ({} if the decoded JSON is neither).

    None is returned when the request is aborted or fails and no exception is raised.

    :param url: url
    :param post_data: post data; True forces a POST without a body
    :param params: query parameters set on the session
    :param headers: headers to add to the session headers
    :param timeout: timeout
    :param session: optional session object, a new scraper session is created if omitted
    :param parse_json: return JSON Dict
    :param raise_status_code: raise exception for status codes
    :param raise_exceptions: raise exceptions
    :param as_binary: return bytes instead of text
    :param encoding: overwrite encoding return header if as_binary is False
    :param kwargs: the keys 'mute_connect_err', 'mute_read_timeout', 'mute_connect_timeout',
        'mute_http_error', 'resp_sess', 'savename', 'nocache', 'provider', 'json' (legacy alias
        of parse_json) and 'post_json' are consumed here, anything else is passed on to the
        session get/post call
    :return: see above
    """

    response_attr = ('text', 'content')[as_binary]

    # selectively mute some errors
    mute = filter_list(lambda x: kwargs.pop(x, False), [
        'mute_connect_err', 'mute_read_timeout', 'mute_connect_timeout',
        'mute_http_error'
    ])

    # reuse or instantiate request session
    resp_sess = kwargs.pop('resp_sess', None)
    if None is session:
        session = CloudflareScraper.create_scraper()
        session.headers.update({'User-Agent': USER_AGENT})

    # download and save file or simply fetch url
    savename = kwargs.pop('savename', None)
    if savename:
        # session streaming
        session.stream = True

    if not kwargs.pop('nocache', False):
        cache_dir = CACHE_DIR or get_system_temp_dir()
        session = CacheControl(sess=session,
                               cache=caches.FileCache(
                                   ek.ek(os.path.join, cache_dir, 'sessions')))

    provider = kwargs.pop('provider', None)

    # handle legacy uses of `json` param
    if kwargs.get('json'):
        parse_json = kwargs.pop('json')

    # session master headers
    req_headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate'
    }
    if headers:
        req_headers.update(headers)
    if hasattr(session, 'reserved') and 'headers' in session.reserved:
        req_headers.update(session.reserved['headers'] or {})
    session.headers.update(req_headers)

    # session parameters
    session.params = params

    # session ssl verify
    # NOTE(review): certificate verification is disabled for every request made here
    session.verify = False

    # don't trust os environments (auth, proxies, ...)
    session.trust_env = False

    response = None
    try:
        # sanitise url
        parsed = list(urlparse(url))
        parsed[2] = re.sub('/{2,}', '/',
                           parsed[2])  # replace two or more / with one
        url = urlunparse(parsed)

        # session proxies
        if PROXY_SETTING:
            (proxy_address, pac_found) = proxy_setting(PROXY_SETTING, url)
            msg = '%sproxy for url: %s' % (('', 'PAC parsed ')[pac_found], url)
            if None is proxy_address:
                logger.debug('Proxy error, aborted the request using %s' % msg)
                return
            elif proxy_address:
                logger.debug('Using %s' % msg)
                session.proxies = {
                    'http': proxy_address,
                    'https': proxy_address
                }

        # decide if we get or post data to server
        if post_data or 'post_json' in kwargs:
            if True is post_data:
                post_data = None

            if post_data:
                kwargs.setdefault('data', post_data)

            if 'post_json' in kwargs:
                kwargs.setdefault('json', kwargs.pop('post_json'))

            response = session.post(url, timeout=timeout, **kwargs)
        else:
            response = session.get(url, timeout=timeout, **kwargs)
            refresh = response.headers.get('Refresh', '')
            if response.ok and not response.content and 'url=' in refresh.lower():
                # follow an empty page that only carries a Refresh header; the marker is
                # matched case-insensitively but the target keeps its case as sent
                target = re.split(r'(?i)url=', refresh, 1)[1].strip('/')
                if target.lower().startswith('http'):
                    url = target
                else:
                    parsed[2] = '/%s' % target
                    url = urlunparse(parsed)
                response = session.get(url, timeout=timeout, **kwargs)

        # if encoding is not in header try to use best guess
        # ignore downloads with savename
        if not savename and not as_binary:
            if encoding:
                response.encoding = encoding
            elif not response.encoding or 'charset' not in response.headers.get(
                    'Content-Type', ''):
                response.encoding = response.apparent_encoding

        # a provider recognised page is returned as is, before any status checks
        # noinspection PyProtectedMember
        if provider and provider._has_signature(response.text):
            return getattr(response, response_attr)

        if raise_status_code:
            response.raise_for_status()

        if not response.ok:
            http_err_text = 'CloudFlare Ray ID' in response.text and \
                            'CloudFlare reports, "Website is offline"; ' or ''
            if response.status_code in http_error_code:
                http_err_text += http_error_code[response.status_code]
            elif response.status_code in range(520, 527):
                http_err_text += 'Origin server connection failure'
            else:
                http_err_text = 'Custom HTTP error code'
            # log for every failure kind, so that the text built above is actually reported
            if 'mute_http_error' not in mute:
                logger.debug(
                    u'Response not ok. %s: %s from requested url %s' %
                    (response.status_code, http_err_text, url))
            return

    except requests.exceptions.HTTPError as e:
        if raise_status_code:
            # re-raise the caught error, `response` is not guaranteed to be set or to raise again
            raise
        # prefer the http status of the attached response, errno is usually unset for these errors
        status = getattr(getattr(e, 'response', None), 'status_code', None)
        logger.warning(u'HTTP error %s while loading URL%s' %
                       (e.errno if None is status else status, _maybe_request_url(e)))
        return
    except requests.exceptions.ConnectionError as e:
        if 'mute_connect_err' not in mute:
            logger.warning(u'Connection error msg:%s while loading URL%s' %
                           (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise
        return
    except requests.exceptions.ReadTimeout as e:
        if 'mute_read_timeout' not in mute:
            logger.warning(u'Read timed out msg:%s while loading URL%s' %
                           (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise
        return
    except (requests.exceptions.Timeout, socket.timeout) as e:
        if 'mute_connect_timeout' not in mute:
            logger.warning(
                u'Connection timed out msg:%s while loading URL %s' %
                (ex(e), _maybe_request_url(e, url)))
        if raise_exceptions:
            raise
        return
    except Exception as e:
        # KeyboardInterrupt and SystemExit are deliberately not handled here
        if ex(e):
            logger.warning(
                u'Exception caught while loading URL %s\r\nDetail... %s\r\n%s'
                % (url, ex(e), traceback.format_exc()))
        else:
            logger.warning(
                u'Unknown exception while loading URL %s\r\nDetail... %s' %
                (url, traceback.format_exc()))
        if raise_exceptions:
            raise
        return

    if parse_json:
        try:
            data_json = response.json()
            # anything other than a JSON object or array is reported as an empty dict
            result = ({}, data_json)[isinstance(data_json, (dict, list))]
            if resp_sess:
                return result, session
            return result
        except Exception as e:
            logger.warning(u'JSON data issue from URL %s\r\nDetail... %s' %
                           (url, ex(e)))
            if raise_exceptions:
                raise
            return None

    if savename:
        try:
            write_file(savename,
                       response,
                       raw=True,
                       raise_exceptions=raise_exceptions)
        except Exception:
            if raise_exceptions:
                raise
            return
        return True

    if resp_sess:
        return getattr(response, response_attr), session

    return getattr(response, response_attr)