def solve_cf_challenge(self, resp, **original_kwargs):
    """Solve a Cloudflare IUAM javascript challenge and replay the request.

    Parses the challenge page in `resp`, evaluates the javascript answer with
    js2py, submits it to the challenge endpoint, then either returns the 200
    response or follows the legacy `Location` redirect manually.

    :param resp: `requests.Response` containing the Cloudflare challenge page
    :param original_kwargs: kwargs of the request that triggered the challenge
    :return: response obtained after the challenge is answered
    :raises CloudflareError: when a captcha challenge (unsolvable here) is shown
    :raises ValueError: when the challenge page cannot be parsed/evaluated
    """
    body = resp.text
    parsed_url = urlparse(resp.url)
    domain = parsed_url.netloc

    # a captcha challenge cannot be solved automatically; bail out early
    if '/cdn-cgi/l/chk_captcha' in body or 'cf_chl_captcha' in body:
        raise CloudflareError(
            'Cloudflare captcha presented for %s, please notify SickGear for an update, ua: %s'
            % (domain, self.cf_ua), response=resp)

    # locate the challenge form action/method; fall back to the legacy
    # endpoint and the original request method when the form is absent
    try:
        action, method = re.findall(
            r'(?sim)<form.*?id="challenge.*?action="/?([^?"]+).*?method="([^"]+)',
            body)[0]
    except (Exception, BaseException):
        action, method = 'cdn-cgi/l/chk_jschl', resp.request.method
    submit_url = '%s://%s/%s' % (parsed_url.scheme, domain, action)

    cloudflare_kwargs = {
        k: v for k, v in original_kwargs.items() if k not in ['hooks']
    }
    # GET answers travel as query params, POST answers as form data
    params = cloudflare_kwargs.setdefault(
        ('data', 'params')['GET' == method.upper()], {})
    headers = cloudflare_kwargs.setdefault('headers', {})
    headers['Referer'] = resp.url

    # newer (2019.11+) challenge pages carry a token that must be echoed
    # back as a query parameter; older pages simply lack it
    try:
        token = re.findall(r'(?sim)__cf_chl_jschl_tk__=([^"]+)', body)[0]
        cloudflare_kwargs['params'] = dict(__cf_chl_jschl_tk__=token)
    except (Exception, BaseException):
        pass

    if self.delay == self.default_delay:
        try:
            # no instantiated delay, therefore check js for hard coded CF delay
            self.delay = float(re.search(
                r'submit\(\);[^0-9]*?([0-9]+)', body).group(1)) / float(1000)
        except (BaseException, Exception):
            pass

    # copy every hidden form field into the submission payload
    # (commented-out inputs are stripped first so they are not picked up)
    for hidden in re.findall(r'(<input[^>]+?hidden[^>]+?>)',
                             re.sub(r'(?sim)<!--\s+<input.*?(?=<)', '', body)):
        value = re.findall(r'value="([^"\']+?)["\']', hidden)
        name = re.findall(r'name="([^"\']+?)["\']', hidden)
        if all([name, value]):
            params[name[0]] = value[0]

    js = self.extract_js(body, domain)
    atob = (lambda s: b64decodestring('%s' % s))
    # Eval the challenge algorithm. NOTE: the previous version retried the
    # identical js2py eval in its except branch; the eval is deterministic
    # (same js, same context), so the retry could never succeed differently
    # and has been removed.
    try:
        params['jschl_answer'] = str(js2py.EvalJs({'atob': atob}).eval(js))
    except (BaseException, Exception) as e:
        # Something is wrong with the page.
        # This may indicate Cloudflare has changed their anti-bot technique.
        raise ValueError(
            'Unable to parse Cloudflare anti-bot IUAM page: %r' % e)

    # Requests transforms any request into a GET after a redirect,
    # so the redirect has to be handled manually here to allow for
    # performing other types of requests even as the first request.
    cloudflare_kwargs['allow_redirects'] = False
    self.wait()
    response = self.request(method, submit_url, **cloudflare_kwargs)
    if response:
        if 200 == getattr(response, 'status_code'):
            return response

        # legacy redirection handler (pre 2019.11.xx)
        location = response.headers.get('Location')
        try:
            r = urlparse(location)
        except (Exception, BaseException):
            # Something is wrong with the page, perhaps CF changed their anti-bot technique
            raise ValueError(
                'Unable to find a new location from Cloudflare anti-bot IUAM page'
            )
        # relative redirect: rebuild an absolute url on the original host
        if not r.netloc or location.startswith('/'):
            location = urlunparse((parsed_url.scheme, domain, r.path,
                                   r.params, r.query, r.fragment))
        return self.request(resp.request.method, location, **original_kwargs)
def get_url(
        url,  # type: AnyStr
        post_data=None,  # type: Optional
        params=None,  # type: Optional
        headers=None,  # type: Optional[Dict]
        timeout=30,  # type: int
        session=None,  # type: Optional[requests.Session]
        parse_json=False,  # type: bool
        raise_status_code=False,  # type: bool
        raise_exceptions=False,  # type: bool
        as_binary=False,  # type: bool
        encoding=None,  # type: Optional[AnyStr]
        **kwargs):
    # type: (...) -> Optional[Union[AnyStr, bool, bytes, Dict, Tuple[Union[Dict, List], requests.Session]]]
    """
    Either
    1) Returns a byte-string retrieved from the url provider.
    2) Return True/False if success after using kwargs 'savefile' set to file pathname.
    3) Returns Tuple response, session if success after setting kwargs 'resp_sess' True.
    4) JSON Dict if parse_json=True.

    :param url: url
    :param post_data: post data; `True` means "POST with no body"
    :param params: query parameters applied to the whole session
    :param headers: headers to add
    :param timeout: timeout
    :param session: optional session object; created (Cloudflare-capable) when None
    :param parse_json: return JSON Dict
    :param raise_status_code: raise exception for status codes
    :param raise_exceptions: raise exceptions
    :param as_binary: return bytes instead of text
    :param encoding: overwrite encoding return header if as_binary is False
    :param kwargs: also recognised: savename, nocache, provider, resp_sess,
                   post_json, json, mute_* flags (see below)
    :return: see the numbered cases above; None on any handled failure
    """
    # pick which Response attribute to hand back: text (str) or content (bytes)
    response_attr = ('text', 'content')[as_binary]

    # selectively mute some errors: each flag popped from kwargs suppresses
    # the corresponding log message in the except/error handlers below
    mute = filter_list(lambda x: kwargs.pop(x, False), [
        'mute_connect_err', 'mute_read_timeout', 'mute_connect_timeout', 'mute_http_error'
    ])

    # reuse or instantiate request session
    resp_sess = kwargs.pop('resp_sess', None)
    if None is session:
        session = CloudflareScraper.create_scraper()
        session.headers.update({'User-Agent': USER_AGENT})

    # download and save file or simply fetch url
    savename = kwargs.pop('savename', None)
    if savename:
        # session streaming
        session.stream = True

    # unless disabled, wrap the session in a file-backed HTTP cache
    if not kwargs.pop('nocache', False):
        cache_dir = CACHE_DIR or get_system_temp_dir()
        session = CacheControl(sess=session, cache=caches.FileCache(
            ek.ek(os.path.join, cache_dir, 'sessions')))

    provider = kwargs.pop('provider', None)

    # handle legacy uses of `json` param
    if kwargs.get('json'):
        parse_json = kwargs.pop('json')

    # session master headers
    req_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate'
    }
    if headers:
        req_headers.update(headers)
    # allow a session to carry provider-reserved headers that win over defaults
    if hasattr(session, 'reserved') and 'headers' in session.reserved:
        req_headers.update(session.reserved['headers'] or {})
    session.headers.update(req_headers)

    # session parameters
    session.params = params

    # session ssl verify
    session.verify = False

    # don't trust os environments (auth, proxies, ...)
    session.trust_env = False

    response = None
    try:
        # sanitise url
        parsed = list(urlparse(url))
        parsed[2] = re.sub('/{2,}', '/', parsed[2])  # replace two or more / with one
        url = urlunparse(parsed)

        # session proxies
        if PROXY_SETTING:
            (proxy_address, pac_found) = proxy_setting(PROXY_SETTING, url)
            msg = '%sproxy for url: %s' % (('', 'PAC parsed ')[pac_found], url)
            if None is proxy_address:
                # proxy was configured but could not be resolved: abort quietly
                logger.debug('Proxy error, aborted the request using %s' % msg)
                return
            elif proxy_address:
                logger.debug('Using %s' % msg)
                session.proxies = {
                    'http': proxy_address,
                    'https': proxy_address
                }

        # decide if we get or post data to server
        if post_data or 'post_json' in kwargs:
            if True is post_data:
                # `True` is the sentinel for "POST without a body"
                post_data = None
            if post_data:
                kwargs.setdefault('data', post_data)
            if 'post_json' in kwargs:
                kwargs.setdefault('json', kwargs.pop('post_json'))
            response = session.post(url, timeout=timeout, **kwargs)
        else:
            response = session.get(url, timeout=timeout, **kwargs)
            # follow a meta-Refresh style redirect when the body is empty
            # NOTE(review): the whole Refresh value is lowercased before the
            # url is extracted, so case-sensitive paths would be mangled here
            # -- confirm intended
            if response.ok and not response.content and 'url=' in response.headers.get(
                    'Refresh', '').lower():
                url = response.headers.get('Refresh').lower().split(
                    'url=')[1].strip('/')
                if not url.startswith('http'):
                    parsed[2] = '/%s' % url
                    url = urlunparse(parsed)
                response = session.get(url, timeout=timeout, **kwargs)

        # if encoding is not in header try to use best guess
        # ignore downloads with savename
        if not savename and not as_binary:
            if encoding:
                response.encoding = encoding
            elif not response.encoding or 'charset' not in response.headers.get(
                    'Content-Type', ''):
                response.encoding = response.apparent_encoding

        # short-circuit: a provider-recognised page is returned as-is
        # noinspection PyProtectedMember
        if provider and provider._has_signature(response.text):
            return getattr(response, response_attr)

        if raise_status_code:
            response.raise_for_status()

        if not response.ok:
            # build a human readable error description for the log
            http_err_text = 'CloudFlare Ray ID' in response.text and \
                            'CloudFlare reports, "Website is offline"; ' or ''
            if response.status_code in http_error_code:
                http_err_text += http_error_code[response.status_code]
            elif response.status_code in range(520, 527):
                http_err_text += 'Origin server connection failure'
            else:
                http_err_text = 'Custom HTTP error code'
            if 'mute_http_error' not in mute:
                logger.debug(
                    u'Response not ok. %s: %s from requested url %s'
                    % (response.status_code, http_err_text, url))
            return

    except requests.exceptions.HTTPError as e:
        if raise_status_code:
            # re-raise the status error for the caller to handle
            response.raise_for_status()
        # NOTE(review): HTTPError.errno is typically None for requests
        # exceptions -- presumably e.response.status_code was intended; verify
        logger.warning(u'HTTP error %s while loading URL%s' %
                       (e.errno, _maybe_request_url(e)))
        return
    except requests.exceptions.ConnectionError as e:
        if 'mute_connect_err' not in mute:
            logger.warning(u'Connection error msg:%s while loading URL%s' %
                           (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except requests.exceptions.ReadTimeout as e:
        if 'mute_read_timeout' not in mute:
            logger.warning(u'Read timed out msg:%s while loading URL%s' %
                           (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except (requests.exceptions.Timeout, socket.timeout) as e:
        if 'mute_connect_timeout' not in mute:
            logger.warning(
                u'Connection timed out msg:%s while loading URL %s' %
                (ex(e), _maybe_request_url(e, url)))
        if raise_exceptions:
            raise e
        return
    except (BaseException, Exception) as e:
        # catch-all: log with traceback, optionally re-raise
        if ex(e):
            logger.warning(
                u'Exception caught while loading URL %s\r\nDetail... %s\r\n%s'
                % (url, ex(e), traceback.format_exc()))
        else:
            logger.warning(
                u'Unknown exception while loading URL %s\r\nDetail... %s'
                % (url, traceback.format_exc()))
        if raise_exceptions:
            raise e
        return

    # case 4: decode the body as JSON ({} when the payload isn't a dict/list)
    if parse_json:
        try:
            data_json = response.json()
            if resp_sess:
                return ({}, data_json)[isinstance(data_json, (dict, list))], session
            return ({}, data_json)[isinstance(data_json, (dict, list))]
        except (TypeError, Exception) as e:
            logger.warning(u'JSON data issue from URL %s\r\nDetail... %s' %
                           (url, ex(e)))
            if raise_exceptions:
                raise e
            return None

    # case 2: persist the raw response to disk, True on success
    if savename:
        try:
            write_file(savename, response, raw=True,
                       raise_exceptions=raise_exceptions)
        except (BaseException, Exception) as e:
            if raise_exceptions:
                raise e
            return
        return True

    # case 3: hand back body plus the (possibly newly created) session
    if resp_sess:
        return getattr(response, response_attr), session

    # case 1: plain body (text or bytes per as_binary)
    return getattr(response, response_attr)