def open(self, url, data=None):
    """Open *url*, refusing unsafe or SURBL-spam-listed addresses.

    Safety checking is delegated to the W3C ``checkremote`` library
    (cf https://github.com/w3c/py-http-handler/blob/master/checkremote.py);
    URLs that SURBL marks as spam are rejected as well.

    :raises IOError: code 403 when access to *url* is not allowed.
    """
    from checkremote import check_url_safety, UnsupportedResourceError
    try:
        check_url_safety(url)
    except UnsupportedResourceError:
        raise IOError(403, "Access to url '%s' is not allowed" % url)

    # The safety check passed; still refuse anything SURBL flags as spam.
    if self.surblchecker.isMarkedAsSpam(url):
        message = "Access to url '%s' is not allowed as it is marked as spam in SURBL" % url
        raise IOError(403, message)

    # Delegate the actual fetch to the standard opener.
    return urllib.FancyURLopener.open(self, url, data)
def brett_test(uri):
    """Check, via the W3C ``checkremote`` library, that *uri* is safe.

    The check is skipped entirely on macOS (``darwin``), where the
    library is not available. Any failure raised by the check is
    reported to the caller through :func:`err_message`.

    :param str uri: the URI to be checked
    """
    if sys.platform == "darwin":
        # Not running on the W3C infrastructure; nothing to check.
        return
    from checkremote import check_url_safety, UnsupportedResourceError
    from urllib2 import HTTPError, URLError
    try:
        check_url_safety(uri)
    except HTTPError as e:
        # BUG FIX: the '%' operator was missing, so the format string was
        # *called* with the tuple, raising TypeError instead of reporting.
        err_message(
            'HTTP Error with the error code: %s and the error message: "%s"' % (
                e.code, e.reason))
    except URLError as e:
        err_message('URL Error with the error message: "%s"' % e.reason)
    except UnsupportedResourceError as e:
        msg = e.args[0] + ": " + e.args[1]
        err_message(
            'Unsupported Resource Error with the error message "%s"' % msg)
    except Exception as e:
        # Generic fallback: render whatever args the exception carries.
        nargs = len(e.args)
        msg = "" if nargs == 0 else (e.args[0] if nargs == 1 else e.args)
        err_message('Exception raised: "%s"' % msg)
def brett_test(uri):
    """
    Test, when running on W3C, the safety of the URL.

    :param str uri: The URI used to start up the script
    :return: result of the check
    :rtype: Boolean

    If the test does not pass, ie an exception is raised somewhere down the
    line, an error message is sent back (via HTTP) to the caller.

    Contributed by Brett Smith, W3C, and relying on an external library
    (``check_url_safety``) running at the W3C. *This method runs only on the
    W3C site and its invocation must be preceded by an appropriate check*.
    """
    from checkremote import check_url_safety, UnsupportedResourceError
    if PY3:
        from urllib.error import HTTPError, URLError
    else:
        from urllib2 import HTTPError, URLError
    try:
        check_url_safety(uri)
        # If we got here, there have been no issues; Brett's script simply raises exceptions
        return True
    except HTTPError as e:
        # BUG FIX: the '%' operator was missing, so the format string was
        # *called* with the tuple, raising TypeError instead of reporting.
        err_message(uri, 'HTTP Error with the error code: %s and the error message: "%s"' % (e.code, e.reason))
    except URLError as e:
        err_message(uri, 'URL Error with the error message: "%s"' % e.reason)
    except UnsupportedResourceError as e:
        msg = e.args[0] + ": " + e.args[1]
        err_message(uri, 'Unsupported Resource Error with the error message "%s"' % msg)
    except Exception as e:
        # Generic fallback: render whatever args the exception carries.
        args = len(e.args)
        msg = "" if args == 0 else (e.args[0] if args == 1 else repr(e.args))
        err_message(uri, 'Exception raised: "%s"' % msg)
    # If we got here one of the exceptions were handled, ie, the result of the check
    # is False...
    return False
def __init__(self, name, additional_headers=None):
    """Fetch *name* over HTTP(S) and record content and header metadata.

    @param name: URL to be opened
    @keyword additional_headers: additional HTTP request headers to be added to the call
    @raise HTTPError: (package-local) when the server returns an HTTP error code
    @raise RDFaError: (package-local) for any other failure during retrieval
    """
    # BUG FIX: the default used to be a mutable {} that the body mutates
    # (the 'Accept' key below), so the injected header leaked into every
    # subsequent call sharing the default. Use the None-sentinel idiom.
    if additional_headers is None:
        additional_headers = {}
    try:
        # Note the removal of the fragment ID. This is necessary, per the HTTP spec
        url = name.split('#')[0]
        # On the W3C site, run the local URL safety check first.
        if socket.getfqdn().endswith('.w3.org'):
            import checkremote
            checkremote.check_url_safety(url)
        if 'Accept' not in additional_headers:
            additional_headers['Accept'] = 'text/html, application/xhtml+xml'

        import requests
        # Switching off the verification is not cool. But, at least for now, too many
        # sites still go wrong because the cerficates are not o.k. with request...
        r = requests.get(url, headers=additional_headers, verify=False)
        self.data = r.content
        self.headers = r.headers

        if URIOpener.CONTENT_TYPE in self.headers:
            # The call below will remove the possible media type parameters, like charset settings
            ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
            self.content_type = ct.media_type
            if 'charset' in ct.parmdict:
                self.charset = ct.parmdict['charset']
            else:
                self.charset = None
        else:
            # check if the suffix can be used for the content type; this may be important
            # for file:// type URI or if the server is not properly set up to return the right
            # mime type
            self.charset = None
            self.content_type = ""
            for suffix in preferred_suffixes.keys():
                if name.endswith(suffix):
                    self.content_type = preferred_suffixes[suffix]
                    break

        if URIOpener.CONTENT_LOCATION in self.headers:
            self.location = urljoin(
                r.url, self.headers[URIOpener.CONTENT_LOCATION])
        else:
            self.location = name

        # Default expiry is one day from now, unless the server says otherwise.
        self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
        if URIOpener.EXPIRES in self.headers:
            try:
                # Thanks to Deron Meranda for the HTTP date conversion method...
                self.expiration_date = parse_http_datetime(
                    self.headers[URIOpener.EXPIRES])
            except Exception:
                # The Expires date format was wrong, sorry, forget it...
                pass

        self.last_modified_date = None
        if URIOpener.LAST_MODIFIED in self.headers:
            try:
                # Thanks to Deron Meranda for the HTTP date conversion method...
                self.last_modified_date = parse_http_datetime(
                    self.headers[URIOpener.LAST_MODIFIED])
            except Exception:
                # The last modified date format was wrong, sorry, forget it...
                pass
    except urllib_HTTPError:
        e = sys.exc_info()[1]
        from . import HTTPError
        msg = BaseHTTPRequestHandler.responses[e.code]
        raise HTTPError('%s' % msg[1], e.code)
    except Exception:
        e = sys.exc_info()[1]
        from . import RDFaError
        raise RDFaError('%s' % e)
def __init__(self, name, additional_headers=None):
    """Fetch *name* over HTTP(S) and record content and header metadata.

    @param name: URL to be opened
    @keyword additional_headers: additional HTTP request headers to be added to the call
    @raise HTTPError: (package-local) when the server returns an HTTP error code
    @raise RDFaError: (package-local) for any other failure during retrieval
    """
    # BUG FIX: the default used to be a mutable {} that the body mutates
    # (the 'Accept' key below), so the injected header leaked into every
    # subsequent call sharing the default. Use the None-sentinel idiom.
    if additional_headers is None:
        additional_headers = {}
    try:
        # Note the removal of the fragment ID. This is necessary, per the HTTP spec
        url = name.split('#')[0]
        # On the W3C site, run the local URL safety check first.
        if socket.getfqdn().endswith('.w3.org'):
            import checkremote
            checkremote.check_url_safety(url)
        if 'Accept' not in additional_headers:
            additional_headers['Accept'] = 'text/html, application/xhtml+xml'

        import requests
        r = requests.get(url, headers=additional_headers)
        self.data = r.content
        self.headers = r.headers

        if URIOpener.CONTENT_TYPE in self.headers:
            # The call below will remove the possible media type parameters, like charset settings
            ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
            self.content_type = ct.media_type
            if 'charset' in ct.parmdict:
                self.charset = ct.parmdict['charset']
            else:
                self.charset = None
        else:
            # check if the suffix can be used for the content type; this may be important
            # for file:// type URI or if the server is not properly set up to return the right
            # mime type
            self.charset = None
            self.content_type = ""
            for suffix in preferred_suffixes.keys():
                if name.endswith(suffix):
                    self.content_type = preferred_suffixes[suffix]
                    break

        if URIOpener.CONTENT_LOCATION in self.headers:
            self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
        else:
            self.location = name

        # Default expiry is one day from now, unless the server says otherwise.
        self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
        if URIOpener.EXPIRES in self.headers:
            try:
                # Thanks to Deron Meranda for the HTTP date conversion method...
                self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
            except Exception:
                # The Expires date format was wrong, sorry, forget it...
                pass

        self.last_modified_date = None
        if URIOpener.LAST_MODIFIED in self.headers:
            try:
                # Thanks to Deron Meranda for the HTTP date conversion method...
                self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
            except Exception:
                # The last modified date format was wrong, sorry, forget it...
                pass
    except urllib_HTTPError:
        e = sys.exc_info()[1]
        from . import HTTPError
        msg = BaseHTTPRequestHandler.responses[e.code]
        raise HTTPError('%s' % msg[1], e.code)
    except Exception:
        e = sys.exc_info()[1]
        from . import RDFaError
        raise RDFaError('%s' % e)