def download_file(self, target, referer='', post=None, dest_name=None):
    if not self.download_dir:
        return None
    Url.headers['Referer'] = referer
    if self.use_auth:
        if not self.auth_try():
            return None
    try:
        if not dest_name:
            dest_name = os.path.basename(target)
        url = self.url_opener.open(
            urllib2.Request(url=target, data=post, headers=Url.headers))
        fl = open(os.path.join(self.download_dir, dest_name), "wb")
        fl.write(url.read())
        fl.close()
        return os.path.join(self.download_dir, dest_name)
    except urllib2.HTTPError as e:
        if int(e.getcode()) == 503:
            # Cloudflare's IUAM challenge answers with 503; retry through cfscrape
            try:
                from cfscrape import CloudflareScraper
                scraper = CloudflareScraper()
                self.log('Loading CF protected image %s > %s' % (target, dest_name))
                fl = open(os.path.join(self.download_dir, dest_name), "wb")
                c = scraper.get(target).content
                fl.write(c)
                fl.close()
                return os.path.join(self.download_dir, dest_name)
            except Exception:
                pass
        if self.show_errors:
            xbmc.executebuiltin(
                'XBMC.Notification("HTTP_ERROR", "%s", 3000, "")' % e)
        self.log(target + ' ' + str(e))
        return None
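The 503 fallback above is the key pattern: fetch plainly first, and only pull in cfscrape once Cloudflare blocks the request. A minimal standalone sketch of the same idea in Python 3 (`urllib.request` in place of `urllib2`; the target URL and destination path are whatever the caller supplies):

import urllib.request
from urllib.error import HTTPError


def fetch_with_cf_fallback(target, dest_path):
    """Plain fetch first; fall back to cfscrape on a Cloudflare 503."""
    try:
        with urllib.request.urlopen(target) as resp:
            data = resp.read()
    except HTTPError as e:
        if e.code != 503:
            raise
        # 503 from Cloudflare's IUAM page: retry through cfscrape,
        # which solves the JS challenge before re-requesting.
        import cfscrape
        scraper = cfscrape.create_scraper()
        data = scraper.get(target).content
    with open(dest_path, 'wb') as fl:
        fl.write(data)
    return dest_path

Importing cfscrape lazily keeps the dependency optional for sites that never hit the challenge.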
from typing import Optional

import requests
from cfscrape import CloudflareScraper
from fake_useragent import UserAgent


class Scraper:
    scraper = CloudflareScraper()
    ua = UserAgent()
    personality = ua.random

    @staticmethod
    def get_html(url):
        # `scraper.get` returns a requests Response, not a CloudflareScraper
        request: Optional[requests.Response] = None
        for _ in range(120):
            try:
                request = Scraper.scraper.get(
                    url,
                    headers={'User-Agent': Scraper.personality},
                    timeout=0.7)
                if request.status_code == 200:
                    return request.content
                # rotate the user-agent and retry on any non-200 status
                Scraper.personality = Scraper.ua.random
            except requests.RequestException:
                pass
        print("Scraper can't do request")
        # fall back to the body of the last response, if there was one
        try:
            return request.content
        except AttributeError:
            return ''
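Usage is a single call on the class; a small sketch, assuming a placeholder URL:

html = Scraper.get_html('https://example.com/')  # placeholder URL
if html:
    print('fetched %d bytes' % len(html))

Rotating `personality` to a fresh random user-agent on every non-200 response is the retry lever here; the short 0.7 s timeout keeps the 120 attempts from stalling.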
def cloudflare(session, resp, **kwargs):
    """
    Bypass Cloudflare's anti-bot protection.

    A request handler that retries a request after bypassing
    Cloudflare anti-bot protection.
    """
    if CloudflareScraper.is_cloudflare_iuam_challenge(resp):
        log.debug('Cloudflare protection detected, trying to bypass it.')

        # Get the original request
        original_request = resp.request

        # Get the Cloudflare tokens and original user-agent
        tokens, user_agent = CloudflareScraper.get_tokens(original_request.url)

        # Add Cloudflare tokens to the session cookies
        session.cookies.update(tokens)
        # Add Cloudflare tokens to the original request
        original_cookies = dict_from_cookiejar(original_request._cookies)
        original_cookies.update(tokens)
        original_request.prepare_cookies(original_cookies)

        # The same User-Agent must be used for the retry.
        # Update the session with the Cloudflare User-Agent
        session.headers['User-Agent'] = user_agent
        # Update the original request with the Cloudflare User-Agent
        original_request.headers['User-Agent'] = user_agent

        # Resend the request
        kwargs = filtered_kwargs(kwargs)
        kwargs['allow_redirects'] = True
        cf_resp = session.send(
            original_request,
            **kwargs
        )
        cf_resp.raise_for_status()

        if cf_resp.ok:
            log.debug('Cloudflare successfully bypassed.')
        return cf_resp
    else:
        if CloudflareScraper.is_cloudflare_captcha_challenge(resp):
            log.warning("Cloudflare captcha challenge detected, it can't be bypassed.")
        return resp
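One way to wire this handler in is as a requests response hook, so every response on the session passes through it; a sketch, assuming `cloudflare` and its helpers are importable from the surrounding module:

from functools import partial

import requests

session = requests.Session()
# requests invokes response hooks as hook(response, **send_kwargs);
# bind the session as the first argument so the handler can resend
# the original request when it sees a challenge.
session.hooks['response'].append(partial(cloudflare, session))

resp = session.get('https://example.com/')  # placeholder protected URL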
import time

from bs4 import BeautifulSoup
from cfscrape import CloudflareScraper


def _handle_submission_results(submission_id: str, session: CloudflareScraper):
    submission_update_url = get_submission_update_url(submission_id)
    finished_strings = [
        "Final score:",
        "An internal error occurred while grading.",
        "Submission Aborted!",
        "Compilation Error",
    ]
    failed_tries = 0
    max_fail_attempts = 5
    last_attempt_time = 0
    delay = 0.5

    batch_index = 0
    testcase_index = 0
    while True:
        # throttle: wait until `delay` seconds have passed since the last poll
        remaining = delay - (time.time() - last_attempt_time)
        if remaining > 0:
            time.sleep(remaining)
        last_attempt_time = time.time()

        # Tudor plz give better interface. Maybe JSON string?
        data = session.get(submission_update_url)
        if not data or data.status_code != 200:
            failed_tries += 1
            if failed_tries > max_fail_attempts:
                print("Something went wrong... Breaking!")
                break
            print("Failed attempt: re-attempt {} of {}".format(
                failed_tries, max_fail_attempts))
            continue  # don't try to parse a failed response

        soup = BeautifulSoup(data.text, "html.parser")
        batches = soup.find_all("table", "submissions-status-table")

        while batch_index < len(batches):
            testcases = list(batches[batch_index].find_all(
                "tr", {"class": "case-row"}))
            if testcase_index == len(testcases):
                if batch_index + 1 < len(batches):
                    batch_index += 1
                    testcase_index = 0  # restart the count for the new batch
                    print("Batch #{}".format(batch_index))
                else:
                    break
            else:
                print("\t{}".format(_format_case_row(
                    testcases[testcase_index])))
                testcase_index += 1

        if any(s in data.text for s in finished_strings):
            print("Finished")
            break
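`get_submission_update_url` and `_format_case_row` live elsewhere in the original module; a hypothetical driver, assuming a cfscrape session that has already authenticated against the judge:

import cfscrape

session = cfscrape.create_scraper()
# ... perform the site login with `session` here ...
_handle_submission_results("1234567", session)  # hypothetical submission id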
from http.client import IncompleteRead
from time import sleep
from typing import Optional, Union

from cfscrape import CloudflareScraper
from requests import Response


def get_binary_raw(session: CloudflareScraper, url: str,
                   speed: Union[int, float] = 100) -> Optional[bytes]:
    assert isinstance(speed, (int, float))
    file_stream: Response = session.get(url, stream=True)
    file_stream.raise_for_status()
    file_binary: bytes = bytes()
    for chunk in file_stream.iter_content(chunk_size=1024):
        file_binary += chunk
        if speed > 0:
            sleep(1 / speed)  # throttle: higher `speed` means a shorter pause
    if (length := int(file_stream.headers.get(
            "Content-Length", 0))) > 0 and length != len(file_binary):
        raise IncompleteRead(l := len(file_binary), length - l)
    return file_binary
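With the default `speed=100`, the 10 ms pause after each 1 KiB chunk caps the transfer at roughly 100 KiB/s. A hypothetical call (placeholder URL and filename):

import cfscrape

session = cfscrape.create_scraper()
data = get_binary_raw(session, 'https://example.com/file.bin', speed=100)
with open('file.bin', 'wb') as fh:
    fh.write(data)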
def get_url(url,  # type: AnyStr
            post_data=None,  # type: Optional
            params=None,  # type: Optional
            headers=None,  # type: Optional[Dict]
            timeout=30,  # type: int
            session=None,  # type: Optional[requests.Session]
            parse_json=False,  # type: bool
            raise_status_code=False,  # type: bool
            raise_exceptions=False,  # type: bool
            as_binary=False,  # type: bool
            encoding=None,  # type: Optional[AnyStr]
            **kwargs):
    # type: (...) -> Optional[Union[AnyStr, bool, bytes, Dict, Tuple[Union[Dict, List], requests.Session]]]
    """
    Either
    1) Returns a byte-string retrieved from the url provider.
    2) Return True/False if success after using kwargs 'savename' set to file pathname.
    3) Returns Tuple response, session if success after setting kwargs 'resp_sess' True.
    4) JSON Dict if parse_json=True.

    :param url: url
    :param post_data: post data
    :param params:
    :param headers: headers to add
    :param timeout: timeout
    :param session: optional session object
    :param parse_json: return JSON Dict
    :param raise_status_code: raise exception for status codes
    :param raise_exceptions: raise exceptions
    :param as_binary: return bytes instead of text
    :param encoding: overwrite encoding return header if as_binary is False
    :param kwargs:
    :return:
    """
    response_attr = ('text', 'content')[as_binary]

    # selectively mute some errors
    mute = filter_list(lambda x: kwargs.pop(x, False), [
        'mute_connect_err', 'mute_read_timeout', 'mute_connect_timeout', 'mute_http_error'])

    # reuse or instantiate request session
    resp_sess = kwargs.pop('resp_sess', None)
    if None is session:
        session = CloudflareScraper.create_scraper()
        session.headers.update({'User-Agent': USER_AGENT})

    # download and save file or simply fetch url
    savename = kwargs.pop('savename', None)
    if savename:
        # session streaming
        session.stream = True

    if not kwargs.pop('nocache', False):
        cache_dir = CACHE_DIR or get_system_temp_dir()
        session = CacheControl(sess=session,
                               cache=caches.FileCache(ek.ek(os.path.join, cache_dir, 'sessions')))

    provider = kwargs.pop('provider', None)

    # handle legacy uses of `json` param
    if kwargs.get('json'):
        parse_json = kwargs.pop('json')

    # session master headers
    req_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Encoding': 'gzip,deflate'}
    if headers:
        req_headers.update(headers)
    if hasattr(session, 'reserved') and 'headers' in session.reserved:
        req_headers.update(session.reserved['headers'] or {})
    session.headers.update(req_headers)

    # session parameters
    session.params = params

    # session ssl verify
    session.verify = False

    # don't trust os environments (auth, proxies, ...)
    session.trust_env = False

    response = None
    try:
        # sanitise url
        parsed = list(urlparse(url))
        parsed[2] = re.sub('/{2,}', '/', parsed[2])  # replace two or more / with one
        url = urlunparse(parsed)

        # session proxies
        if PROXY_SETTING:
            (proxy_address, pac_found) = proxy_setting(PROXY_SETTING, url)
            msg = '%sproxy for url: %s' % (('', 'PAC parsed ')[pac_found], url)
            if None is proxy_address:
                logger.debug('Proxy error, aborted the request using %s' % msg)
                return
            elif proxy_address:
                logger.debug('Using %s' % msg)
                session.proxies = {'http': proxy_address, 'https': proxy_address}

        # decide if we get or post data to server
        if post_data or 'post_json' in kwargs:
            if True is post_data:
                post_data = None
            if post_data:
                kwargs.setdefault('data', post_data)
            if 'post_json' in kwargs:
                kwargs.setdefault('json', kwargs.pop('post_json'))
            response = session.post(url, timeout=timeout, **kwargs)
        else:
            response = session.get(url, timeout=timeout, **kwargs)
            if response.ok and not response.content and 'url=' in response.headers.get('Refresh', '').lower():
                url = response.headers.get('Refresh').lower().split('url=')[1].strip('/')
                if not url.startswith('http'):
                    parsed[2] = '/%s' % url
                    url = urlunparse(parsed)
                response = session.get(url, timeout=timeout, **kwargs)

        # if encoding is not in header try to use best guess
        # ignore downloads with savename
        if not savename and not as_binary:
            if encoding:
                response.encoding = encoding
            elif not response.encoding or 'charset' not in response.headers.get('Content-Type', ''):
                response.encoding = response.apparent_encoding

        # noinspection PyProtectedMember
        if provider and provider._has_signature(response.text):
            return getattr(response, response_attr)

        if raise_status_code:
            response.raise_for_status()

        if not response.ok:
            http_err_text = 'CloudFlare Ray ID' in response.text and \
                            'CloudFlare reports, "Website is offline"; ' or ''
            if response.status_code in http_error_code:
                http_err_text += http_error_code[response.status_code]
            elif response.status_code in range(520, 527):
                http_err_text += 'Origin server connection failure'
            else:
                http_err_text = 'Custom HTTP error code'
            if 'mute_http_error' not in mute:
                logger.debug(u'Response not ok. %s: %s from requested url %s'
                             % (response.status_code, http_err_text, url))
            return

    except requests.exceptions.HTTPError as e:
        if raise_status_code:
            response.raise_for_status()
        logger.warning(u'HTTP error %s while loading URL%s'
                       % (e.errno, _maybe_request_url(e)))
        return
    except requests.exceptions.ConnectionError as e:
        if 'mute_connect_err' not in mute:
            logger.warning(u'Connection error msg:%s while loading URL%s'
                           % (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except requests.exceptions.ReadTimeout as e:
        if 'mute_read_timeout' not in mute:
            logger.warning(u'Read timed out msg:%s while loading URL%s'
                           % (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except (requests.exceptions.Timeout, socket.timeout) as e:
        if 'mute_connect_timeout' not in mute:
            logger.warning(u'Connection timed out msg:%s while loading URL %s'
                           % (ex(e), _maybe_request_url(e, url)))
        if raise_exceptions:
            raise e
        return
    except (BaseException, Exception) as e:
        if ex(e):
            logger.warning(u'Exception caught while loading URL %s\r\nDetail... %s\r\n%s'
                           % (url, ex(e), traceback.format_exc()))
        else:
            logger.warning(u'Unknown exception while loading URL %s\r\nDetail... %s'
                           % (url, traceback.format_exc()))
        if raise_exceptions:
            raise e
        return

    if parse_json:
        try:
            data_json = response.json()
            if resp_sess:
                return ({}, data_json)[isinstance(data_json, (dict, list))], session
            return ({}, data_json)[isinstance(data_json, (dict, list))]
        except (TypeError, Exception) as e:
            logger.warning(u'JSON data issue from URL %s\r\nDetail... %s' % (url, ex(e)))
            if raise_exceptions:
                raise e
            return None

    if savename:
        try:
            write_file(savename, response, raw=True, raise_exceptions=raise_exceptions)
        except (BaseException, Exception) as e:
            if raise_exceptions:
                raise e
            return
        return True

    if resp_sess:
        return getattr(response, response_attr), session

    return getattr(response, response_attr)
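The four return shapes from the docstring map onto calls like these; a sketch with placeholder URLs:

# 1) plain text fetch
html = get_url('https://example.com/')

# 2) download and save to a file; returns True on success
saved = get_url('https://example.com/file.zip', savename='/tmp/file.zip')

# 3) parsed JSON (a dict/list, or {} for a non-JSON body)
data = get_url('https://example.com/api', parse_json=True)

# 4) also hand back the session for reuse on later calls
data, session = get_url('https://example.com/api', parse_json=True, resp_sess=True)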
        request.url = request.url.replace(
            'https://' + result.hostname,
            'https://' + resolvedIP,
        )
        connection_pool_kwargs['server_hostname'] = result.hostname
        connection_pool_kwargs['assert_hostname'] = result.hostname
        request.headers['Host'] = result.hostname
    else:
        connection_pool_kwargs.pop('server_hostname', None)
        connection_pool_kwargs.pop('assert_hostname', None)

    return super(HostHeaderSSLAdapter, self).send(request, **kwargs)


cfs = CloudflareScraper()
cfs.mount('https://', HostHeaderSSLAdapter())

hParser = 'html.parser'
infoBanner = "[Marumaru-Downloader]"
header = {
    'User-agent': 'Mozilla/5.0',
    'Referer': baseURL,
}


def PrintBanner():
    print('''
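With the adapter mounted, every https request on `cfs` connects to the pre-resolved IP while TLS verification and the Host header still use the original hostname; a hypothetical fetch using the module's own `header` and `hParser`:

from bs4 import BeautifulSoup

resp = cfs.get(baseURL, headers=header)  # baseURL is defined elsewhere in the module
soup = BeautifulSoup(resp.content, hParser)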
def get(session: CloudflareScraper, path: str, **params) -> Response:
    return session.get(join_url(root, path), params=params)
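`join_url` and `root` come from the surrounding module; a hypothetical call that resolves a relative path against `root` and passes keyword arguments through as query parameters:

import cfscrape

session = cfscrape.create_scraper()
resp = get(session, 'search', q='term', page=2)  # hypothetical path and params
resp.raise_for_status()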