def get_timeout(url):
    # Per-host timeouts: slower or flakier hosts get more generous limits.
    if "wiki.ros.org" in url or "abyz.me.uk" in url:
        return Timeout(connect=15, read=20, total=45)
    if "svn.eby-sarna.com" in url or "gitorious.org" in url:  # pragma: no cover
        return Timeout(connect=30, read=30, total=60)
    if "w3.org" in url or "pygal.org" in url:
        return Timeout(connect=30, read=30, total=45)
    if not _CI and "galaxyproject.org" in url:
        return Timeout(connect=100, read=100, total=200)
    return Timeout(connect=15, read=11, total=40)
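
# A short usage sketch (illustrative, not part of the original module): since
# get_timeout() returns urllib3 Timeout objects, the result can be passed
# directly as the timeout argument of a requests call, just as the tests
# below pass Timeout(...) instances. The fetch() helper and the idea of
# calling it this way are assumptions.
def fetch(url):
    import requests

    return requests.get(url, timeout=get_timeout(url))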
def _test_head_302_bypasses_cache(self):
    # HEAD 302 is completed before the caching
    from pypidb._cache import get_file_cache_session

    url = "http://gpodder.org/"
    s = get_file_cache_session("web")
    r = s.head(url, timeout=Timeout(15))
    self.assertEqual(r.status_code, 302)
    self.assertEqual(r.url, url)
    self.assertEqual(r.headers["location"], "https://gpodder.github.io")
    self.assertFalse(r.content)
    self.assertFalse(hasattr(r, "from_cache"))

    r = s.head(url, timeout=Timeout(15))
    self.assertFalse(hasattr(r, "from_cache"))
def _test_fetch_ply_requests_session(self):
    url = "https://www.dabeaz.com/ply/"
    s = requests.Session()
    r = s.get(url, timeout=Timeout(5))
    self.assertEqual(r.status_code, 200)
    self.assertEqual(r.url, url)
def test_head_requests_get(self):
    url = "http://gpodder.org/"
    r = requests.head(url, timeout=Timeout(15))
    self.assertEqual(r.status_code, 302)
    self.assertEqual(r.url, url)
    self.assertEqual(r.headers["location"], "https://gpodder.github.io")
    self.assertFalse(r.content)
def _test_timeout_moksha(self):
    # mokshaproject.net is a parked domain
    from pypidb._cache import get_file_cache_session

    url = "https://mokshaproject.net"
    s = get_file_cache_session("web")
    with self.assertRaises(requests.exceptions.ConnectTimeout):
        s.get(url, timeout=Timeout(5))
def test_fetch_ply_https(self):
    from pypidb._cache import get_file_cache_session

    url = "https://www.dabeaz.com/ply/"
    s = get_file_cache_session("web")
    r = s.get(url, timeout=Timeout(5))
    self.assertEqual(r.status_code, 200)
    self.assertNotEqual(r.url, url)
    self.assertEqual(r.url, url.replace("https://", "http://"))
def test_vlc(self):
    from pypidb._cache import get_file_cache_session

    url = "https://wiki.videolan.org/PythonBinding"
    s = get_file_cache_session("web")
    r = s.get(url, timeout=Timeout(15))
    self.assertEqual(r.status_code, 200)
    self.assertEqual(r.url, url)
    self.assertTrue(r.content)
    self.assertIn(b'<meta name="generator" content="MediaWiki', r.content)
    self.assertIn(b"https://git.videolan.org/", r.content)
def get_with_retries(url, params=None):
    s = requests.Session()
    timeout = Timeout(total=100, connect=100, read=100)
    retries = Retry(
        total=10, read=10, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]
    )
    # Mount the retry adapter for both schemes so https requests retry too.
    adapter = HTTPAdapter(max_retries=retries)
    s.mount("http://", adapter)
    s.mount("https://", adapter)
    # Use the Timeout object built above rather than a bare float.
    content = s.get(url, params=params, timeout=timeout).content
    return content.strip(b"\"\n").replace(b"\\n", b"\n")
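
# Hypothetical usage of get_with_retries (the URL is made up for
# illustration): the helper retries transient 5xx responses, then returns the
# body with surrounding quote/newline bytes stripped and literal "\n"
# sequences expanded into real newlines.
if __name__ == "__main__":
    body = get_with_retries("http://example.com/data.json")
    print(body.decode("utf-8", "replace"))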
def _test_head_cachecontrol(self):
    # cache polluted by HEAD
    url = "http://gpodder.org/"

    class MoreCodesCacheController(CacheController):
        def __init__(self, *args, **kwargs):
            kwargs["status_codes"] = (200, 203, 300, 301, 302, 404)
            super(MoreCodesCacheController, self).__init__(*args, **kwargs)

    cache = DictCache()
    s = CacheControl(
        requests.Session(),
        cache=cache,
        heuristic=OneDayCache(),
        cacheable_methods=["GET", "HEAD"],
        controller_class=MoreCodesCacheController,
    )

    r = s.head(url, timeout=Timeout(15))
    self.assertEqual(r.status_code, 302)
    self.assertEqual(r.url, url)
    self.assertEqual(r.headers["location"], "https://gpodder.github.io")
    self.assertFalse(r.content)
    self.assertFalse(r.from_cache)

    r = s.get(url, timeout=Timeout(15), allow_redirects=False)
    self.assertEqual(r.status_code, 302)
    self.assertEqual(r.url, url)
    self.assertEqual(r.headers["location"], "https://gpodder.github.io")
    self.assertTrue(r.content)
    self.assertIn(b"302 Found", r.content)
    self.assertFalse(r.from_cache)

    # Re-fetch the redirect from the cache
    r = s.get(url, timeout=Timeout(15), allow_redirects=False)
    self.assertEqual(r.status_code, 302)
    self.assertEqual(r.url, url)
    self.assertEqual(r.headers["location"], "https://gpodder.github.io")
    self.assertTrue(r.content)
    self.assertIn(b"302 Found", r.content)
    self.assertTrue(r.from_cache)

    # Now allow redirects
    r = s.get(url, timeout=Timeout(15))
    self.assertEqual(r.status_code, 200)
    self.assertNotEqual(r.url, url)
    self.assertTrue(r.content)
    self.assertFalse(r.from_cache)

    # Re-fetch the redirect from the cache
    r = s.get(url, timeout=Timeout(15), allow_redirects=False)
    self.assertEqual(r.status_code, 302)
    self.assertEqual(r.url, url)
    self.assertEqual(r.headers["location"], "https://gpodder.github.io")
    self.assertTrue(r.content)
    self.assertIn(b"302 Found", r.content)
    self.assertTrue(r.from_cache)

    # Re-fetch the head from the cache
    r = s.head(url, timeout=Timeout(15))
    self.assertEqual(r.status_code, 302)
    self.assertEqual(r.url, url)
    self.assertEqual(r.headers["location"], "https://gpodder.github.io")
    self.assertTrue(r.content)
    self.assertIn(b"302 Found", r.content)
    self.assertTrue(r.from_cache)
def _test_timeout_moksha_requests_get(self):
    url = "https://mokshaproject.net"
    r = requests.get(url, timeout=Timeout(5))
    self.assertEqual(r.status_code, 200)
    self.assertEqual(r.url, url)
def _test_timeout_moksha_requests_session(self):
    url = "https://mokshaproject.net"
    session = requests.Session()
    r = session.get(url, timeout=Timeout(5))
    self.assertEqual(r.status_code, 200)
    self.assertEqual(r.url, url)
class HTTPSAdapter(RedirectAdapter):

    _head_timeout = Timeout(connect=10, read=5)

    def __init__(self, *args, **kwargs):
        https_exceptions = kwargs.pop("https_exceptions", [])
        super(HTTPSAdapter, self).__init__(*args, **kwargs)
        self._https_exceptions = https_exceptions

    def ignore_handle_error(self, exc, request=None):
        if not request:
            raise exc
        if request.url.startswith("https"):
            tail = request.url[8:]
            if self.prevent_https(tail):
                logger.info("downgrading %s to http: %r", request.url, exc)
                new_request = request.copy()
                new_request.url = "http://" + tail
                # Retry the downgraded copy, not the original request.
                return self.send(new_request, timeout=self._head_timeout)
        raise exc

    def block_redirect(self, from_url, to_url):
        pass

    def prevent_https(self, tail):
        if tail[0].isdigit():  # TODO: detect IP
            return True
        for rule in self._https_exceptions:
            if tail.startswith(rule):
                return True
        return False

    def send(self, request, *args, **kwargs):
        if request.url.startswith("https://"):
            tail = request.url[8:]
            if self.prevent_https(tail):
                logger.info("downgraded {} to http".format(request.url))
                request.url = "http://" + tail
        try:
            resp = super(HTTPSAdapter, self).send(request, *args, **kwargs)
        except Exception as e:
            resp = self.handle_error(e, request)
        return resp

    def get_redirect(self, url):
        if not url.startswith("http://"):
            return super(HTTPSAdapter, self).get_redirect(url)

        tail = url[7:]
        if self.prevent_https(tail):
            logger.info("https blocked for {}".format(url))
            return super(HTTPSAdapter, self).get_redirect(url)

        # At most one redirect hop is examined before attempting the upgrade.
        while True:
            current_url = url
            try:
                if current_url.startswith("http://code.google.com"):
                    resp = requests.get(
                        current_url, allow_redirects=False, timeout=self._head_timeout
                    )
                else:
                    resp = requests.head(
                        current_url, allow_redirects=False, timeout=self._head_timeout
                    )
                resp.raise_for_status()
            except Exception as e:
                logger.info("head failed for {}: {!r}".format(current_url, e))
                break
            else:
                logger.debug(
                    "head {} {} {} {} {}".format(
                        current_url, resp.url, resp, resp.headers, resp.content
                    )
                )
                location = resp.headers.get("location")
                if location and location != current_url:
                    code = self.block_redirect(current_url, location)
                    if code is True:
                        return super(HTTPSAdapter, self).get_redirect(url)
                    elif code:
                        return _generate_response(code)
                    code = self.block_redirect(url, location)
                    if code is True:
                        return super(HTTPSAdapter, self).get_redirect(url)
                    elif code:
                        return _generate_response(code)
                    if location.startswith("https:"):
                        tail = location[8:]
                        if self.prevent_https(tail):
                            return super(HTTPSAdapter, self).get_redirect(url)
                        return resp
                    url = location
                break

        tail = url[7:]
        if self.prevent_https(tail):
            logger.info("https blocked for {}".format(url))
            return super(HTTPSAdapter, self).get_redirect(url)

        return "https" + url[4:]
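
# A minimal wiring sketch (assumptions: RedirectAdapter ultimately subclasses
# requests' HTTPAdapter so it can be mounted on a Session; the
# https_exceptions entry reuses www.dabeaz.com, which the tests above show
# being served over http):
def make_session():
    import requests

    s = requests.Session()
    adapter = HTTPSAdapter(https_exceptions=["www.dabeaz.com"])
    s.mount("http://", adapter)
    s.mount("https://", adapter)
    return s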
def _make_request(
    self, conn, method, url, timeout=Timeout.from_float(2), chunked=False,
    **httplib_request_kw
):
    self.num_requests += 1

    timeout_obj = self._get_timeout(timeout)
    timeout_obj.start_connect()
    conn.timeout = timeout_obj.connect_timeout

    # Trigger any extra validation we need to do.
    try:
        self._validate_conn(conn)
    except (SocketTimeout, BaseSSLError) as e:
        # Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.
        self._raise_timeout(err=e, url=url, timeout_value=conn.timeout)
        raise

    # conn.request() calls httplib.*.request, not the method in
    # urllib3.request. It also calls makefile (recv) on the socket.
    if chunked:
        conn.request_chunked(method, url, **httplib_request_kw)
    else:
        conn.request(method, url, **httplib_request_kw)

    # Reset the timeout for the recv() on the socket
    read_timeout = timeout_obj.read_timeout

    # App Engine doesn't have a sock attr
    if getattr(conn, "sock", None):
        # In Python 3 socket.py will catch EAGAIN and return None when you
        # try and read into the file pointer created by http.client, which
        # instead raises a BadStatusLine exception. Instead of catching
        # the exception and assuming all BadStatusLine exceptions are read
        # timeouts, check for a zero timeout before making the request.
        if read_timeout == 0:
            raise ReadTimeoutError(
                self, url, "Read timed out. (read timeout=%s)" % read_timeout
            )
        if read_timeout is Timeout.DEFAULT_TIMEOUT:
            conn.sock.settimeout(socket.getdefaulttimeout())
        else:  # None or a value
            conn.sock.settimeout(read_timeout)

    # Receive the response from the server
    try:
        try:
            # Python 2.7, use buffering of HTTP responses
            httplib_response = conn.getresponse(buffering=True)
        except TypeError:
            # Python 2.6 and older, Python 3
            try:
                httplib_response = conn.getresponse()
            except Exception as e:
                # Remove the TypeError from the exception chain in Python 3;
                # otherwise it looks like a programming error was the cause.
                six.raise_from(e, None)
    except (SocketTimeout, BaseSSLError, SocketError) as e:
        self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
        raise

    # AppEngine doesn't have a version attr.
    http_version = getattr(conn, "_http_vsn_str", "HTTP/?")
    log.debug(
        '%s://%s:%s "%s %s %s" %s %s',
        self.scheme, self.host, self.port, method, url, http_version,
        httplib_response.status, httplib_response.length,
    )

    try:
        assert_header_parsing(httplib_response.msg)
    except HeaderParsingError as hpe:  # Platform-specific: Python 3
        log.warning(
            "Failed to parse headers (url=%s): %s",
            self._absolute_url(url), hpe, exc_info=True,
        )

    # Record the remote peer address on the response; this is the local
    # addition to the upstream urllib3 method.
    sock = getattr(conn, "sock", False)
    if sock:
        setattr(httplib_response, "peer", sock.getpeername())
    else:
        setattr(httplib_response, "peer", None)

    return httplib_response
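
# Sketch of reading back the "peer" attribute set above, assuming this
# _make_request override is installed on the connection pool in use. The
# attribute path (requests Response -> .raw -> ._original_response) is an
# assumption about how the patched httplib response surfaces through
# requests/urllib3; each lookup is guarded so the helper degrades to None.
def get_peer_address(response):
    raw = getattr(response, "raw", None)
    orig = getattr(raw, "_original_response", None)
    return getattr(orig, "peer", None)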