def execute(self, obj):
    """Parse and process *obj*, then return the re-parsed composed result.

    The string is handed to ``self._parse``; the resulting mapping is
    expanded as keyword arguments into ``self._process``; the processed
    mapping is composed with ``rfc3987.compose`` and parsed once more with
    ``rfc3987.parse``.

    Args:
        obj: The string to process.

    Returns:
        Whatever ``rfc3987.parse`` returns for the recomposed string.

    Raises:
        TypeError: If *obj* is not a ``str``.
    """
    if isinstance(obj, str):
        components = self._process(**self._parse(obj))
        # Compose the processed parts into one string and parse that string
        # again, so the caller receives the parser's own output.
        recomposed = rfc3987.compose(**components)
        return rfc3987.parse(recomposed)
    raise TypeError('\'{}\' is not of type str.'.format(obj))
def is_internal_url(url):
    """Report whether a parsed URL points at software.esciencecenter.nl.

    Args:
        url: Mapping of URL components. The keys ``'authority'``,
            ``'path'`` and ``'scheme'`` are read.

    Returns:
        ``True`` when the authority is ``None`` or
        ``'software.esciencecenter.nl'``, ``False`` otherwise.

    Raises:
        ValueError: If the URL is internal but its path does not start
            with ``/``, or if it is internal and uses the ``https`` scheme.
    """
    internal_authorities = (None, 'software.esciencecenter.nl')
    if url['authority'] not in internal_authorities:
        # External URLs are not validated any further.
        return False

    if not url['path'].startswith('/'):
        raise ValueError('Path {} must start with /'.format(
            rfc3987.compose(**url)))

    if url['scheme'] == 'https':
        raise ValueError('For the time being, use http instead of https '
                         'prefixes for http://software.esciencecenter.nl')

    return True
def parse_url(url):
    """Validate an http(s) URI and return it without its fragment.

    Args:
        url: The URI string to validate.

    Returns:
        The URI recomposed by ``rfc3987.compose`` with the fragment removed
        and an empty path replaced by ``/``.

    Raises:
        HTTPBadRequest: If *url* does not parse under the ``URI`` rule or
            its scheme is neither ``http`` nor ``https``.
    """
    try:
        parts = rfc3987.parse(url, rule='URI')
    except ValueError:
        raise HTTPBadRequest(detail=Messages.invalid_uri)

    allowed_schemes = ['http', 'https']
    if parts['scheme'] not in allowed_schemes:
        raise HTTPBadRequest(detail=Messages.invalid_uri)

    # An empty path is replaced by the root path.
    if not parts['path']:
        parts['path'] = '/'
    # The fragment is always dropped from the result.
    parts['fragment'] = None

    return rfc3987.compose(**parts)
def _sanitizewebiri(iri):
    """Validate a Web (http/https) IRI and return it as a quoted URI.

    Args:
        iri: The IRI to sanitize.

    Returns:
        A ``unicode`` string: the IRI with its authority replaced by the
        sanitized host name, UTF-8 encoded and percent-quoted.

    Raises:
        ValueError: If the scheme is missing or is not http/https, or if no
            authority (host) is present. ``rfc3987.parse`` also raises
            ``ValueError`` when *iri* is not a valid IRI.
    """
    res = rfc3987.parse(iri, b"IRI")

    scheme = res[b"scheme"]
    if scheme is None or scheme.lower() not in ("http", "https"):
        raise ValueError("Not a Web address")

    authority = res[b"authority"]
    if not authority:
        raise ValueError("No host specified")

    # The final character of the sanitized name is dropped -- presumably the
    # trailing root dot of the FQDN; confirm against _sanitizefqdn.
    res[b"authority"] = DataProvider._sanitizefqdn(authority)[:-1]
    iri = rfc3987.compose(**res)

    # Derived from Django: quote everything except characters that are
    # allowed to appear literally in a URI.
    uri = urllib.quote(iri.encode("utf-8"), safe=b"/#%[]=:;$&()+,!?*@'~")
    return unicode(uri)
def remap(url):
    """Rewrite *url* from the source location to the destination location.

    Args:
        url: The URI string to remap.

    Returns:
        A ``(remapped, url)`` tuple. When *url* has the same scheme and
        authority as ``src_parts`` and its resolved path is ``src_path`` or
        lies below it, the tuple is ``(True, <rewritten URI>)``. Otherwise
        it is ``(False, url)`` with *url* unchanged.
    """
    url_parts = parse(url, 'URI')

    same_origin = (url_parts['scheme'] == src_parts['scheme']
                   and url_parts['authority'] == src_parts['authority'])
    if not same_origin:
        return False, url

    url_path = Path(unquote(url_parts['path'])).resolve()
    if not (src_path == url_path or src_path in url_path.parents):
        return False, url

    result_path = dest_path / url_path.relative_to(src_path)

    # Use a trailing slash if the incoming path had one. This facilitates
    # further URI resolution operations.
    final_path = str(result_path)
    if url_parts['path'].endswith('/'):
        final_path += '/'

    remapped = compose(scheme=dest_parts['scheme'],
                       authority=dest_parts['authority'],
                       path=quote(final_path),
                       query=url_parts['query'],
                       fragment=url_parts['fragment'])
    return True, remapped
def to_iri(iri):
    """
    Safely quote an IRI in a way that is resilient to unicode and incorrect
    arguments (checks for RFC 3987 compliance and falls back to percent
    encoding).

    Args:
        iri: The IRI to check. Values that are not ``str`` are decoded as
            UTF-8 first.

    Returns:
        The IRI unchanged when it already parses under the ``IRI`` rule.
        Otherwise a repaired IRI in which invalid path and fragment
        characters are replaced by underscores and the query is
        percent-quoted. If the repaired IRI still does not parse, the
        percent-quoted original is returned.

    Raises:
        Exception: If the IRI is invalid and has no scheme or no net
            location, so it cannot be repaired.
    """
    # NOTE(review): the ``str`` check reads as Python 3, while ``urlparse``
    # and ``urllib.quote`` are Python 2 spellings -- confirm that this
    # module's imports provide both names.

    # First decode the IRI if needed
    if not isinstance(iri, str):
        logger.debug("Converting IRI to unicode")
        iri = iri.decode('utf-8')

    try:
        # If we can safely parse the IRI, then we don't
        # need to do anything special here
        rfc3987.parse(iri, rule='IRI')
    except ValueError:
        # The IRI is not valid, so we'll have to fix it.
        logger.debug("The IRI is not valid, proceeding to quote...")
    else:
        logger.debug("This is already a valid IRI, doing nothing...")
        return iri

    # First see whether we can actually parse it *as if* it is a URI
    parts = urlparse.urlsplit(iri)
    if not parts.scheme or not parts.netloc:
        # Without a scheme (e.g. http) or a net location (e.g. example.com)
        # there is nothing we can do
        message = ("The argument you provided does not comply with "
                   "RFC 3987 and is not parseable as a IRI "
                   "(there is no scheme or no net location part)")
        logger.error(message)
        logger.error(iri)
        raise Exception(message)

    logger.debug(
        "The IRI contains all necessary parts (scheme + net location)")
    quoted_parts = {}

    # We'll now convert the path, query and fragment parts of the URI

    # Get the 'anti-pattern' for the valid characters (see rfc3987 package)
    # This is roughly the ipchar pattern plus the '/' as we don't need to
    # match the entire path, but merely the individual characters
    no_invalid_characters = rfc3987.get_compiled_pattern(
        "(?!%(iunreserved)s|%(pct_encoded)s|%(sub_delims)s|:|@|/)(.)")

    # Replace the invalid characters with an underscore (no need to roundtrip)
    quoted_parts['path'] = no_invalid_characters.sub(u'_', parts.path)
    if parts.fragment:
        quoted_parts['fragment'] = no_invalid_characters.sub(
            u'_', parts.fragment)
    if parts.query:
        quoted_parts['query'] = urllib.quote(parts.query.encode('utf-8'),
                                             safe="&=")

    # Leave these untouched
    quoted_parts['scheme'] = parts.scheme
    quoted_parts['authority'] = parts.netloc

    # Extra check to make sure we now have a valid IRI
    quoted_iri = rfc3987.compose(**quoted_parts)
    try:
        rfc3987.parse(quoted_iri)
    except ValueError:
        # Unable to generate a valid quoted iri, using the straightforward
        # urllib percent quoting (but this is ugly!)
        logger.warning('Could not safely quote as IRI, falling back to '
                       'percent encoding')
        quoted_iri = urllib.quote(iri.encode('utf-8'))

    return quoted_iri
def absolute_url(url):
    """Return *url* as an absolute URL, defaulting to the eScience host.

    The URL is parsed with ``parse_url``. When the parsed result has no
    authority, the scheme is set to ``http`` and the authority to
    ``software.esciencecenter.nl`` before recomposing.

    NOTE(review): this indexes the result of ``parse_url`` like a mapping,
    whereas the ``parse_url`` visible in this file returns a composed
    string -- confirm which ``parse_url`` is in scope here.

    Args:
        url: The URL to make absolute.

    Returns:
        The URL recomposed by ``rfc3987.compose``.
    """
    components = parse_url(url)
    has_authority = components['authority'] is not None
    if not has_authority:
        components['scheme'] = 'http'
        components['authority'] = 'software.esciencecenter.nl'
    return rfc3987.compose(**components)
def check_internal_url(url):
    """Raise unless ``is_internal_url`` accepts *url*.

    Args:
        url: Mapping of URL components, passed to ``is_internal_url`` and,
            on failure, to ``rfc3987.compose`` for the error message.

    Raises:
        ValueError: If ``is_internal_url(url)`` is falsy. Any error raised
            by ``is_internal_url`` itself propagates unchanged.
    """
    if is_internal_url(url):
        return
    composed = rfc3987.compose(**url)
    raise ValueError('Url {} is not internal'.format(composed))
def to_iri(iri):
    """
    Safely quote an IRI in a way that is resilient to unicode and incorrect
    arguments (checks for RFC 3987 compliance and falls back to percent
    encoding).

    Args:
        iri: The IRI to check. Values that are not ``unicode`` are decoded
            as UTF-8 first.

    Returns:
        The IRI unchanged when it already parses under the ``IRI`` rule.
        Otherwise a repaired IRI in which invalid path and fragment
        characters are replaced by underscores and the query is
        percent-quoted. If the repaired IRI still does not parse, the
        percent-quoted original is returned.

    Raises:
        Exception: If the IRI is invalid and has no scheme or no net
            location, so it cannot be repaired.
    """
    # First decode the IRI if needed
    if not isinstance(iri, unicode):
        logger.debug("Converting IRI to unicode")
        iri = iri.decode('utf-8')

    try:
        # If we can safely parse the IRI, then we don't
        # need to do anything special here
        rfc3987.parse(iri, rule='IRI')
    except ValueError:
        # The IRI is not valid, so we'll have to fix it.
        logger.debug("The IRI is not valid, proceeding to quote...")
    else:
        logger.debug("This is already a valid IRI, doing nothing...")
        return iri

    # First see whether we can actually parse it *as if* it is a URI
    parts = urlparse.urlsplit(iri)
    if not parts.scheme or not parts.netloc:
        # Without a scheme (e.g. http) or a net location (e.g. example.com)
        # there is nothing we can do
        message = ("The argument you provided does not comply with "
                   "RFC 3987 and is not parseable as a IRI "
                   "(there is no scheme or no net location part)")
        logger.error(message)
        logger.error(iri)
        raise Exception(message)

    logger.debug("The IRI contains all necessary parts (scheme + net location)")
    quoted_parts = {}

    # We'll now convert the path, query and fragment parts of the URI

    # Get the 'anti-pattern' for the valid characters (see rfc3987 package)
    # This is roughly the ipchar pattern plus the '/' as we don't need to
    # match the entire path, but merely the individual characters
    no_invalid_characters = rfc3987.get_compiled_pattern("(?!%(iunreserved)s|%(pct_encoded)s|%(sub_delims)s|:|@|/)(.)")

    # Replace the invalid characters with an underscore (no need to roundtrip)
    quoted_parts['path'] = no_invalid_characters.sub(u'_', parts.path)
    if parts.fragment:
        quoted_parts['fragment'] = no_invalid_characters.sub(u'_', parts.fragment)
    if parts.query:
        quoted_parts['query'] = urllib.quote(parts.query.encode('utf-8'), safe="&=")

    # Leave these untouched
    quoted_parts['scheme'] = parts.scheme
    quoted_parts['authority'] = parts.netloc

    # Extra check to make sure we now have a valid IRI
    quoted_iri = rfc3987.compose(**quoted_parts)
    try:
        rfc3987.parse(quoted_iri)
    except ValueError:
        # Unable to generate a valid quoted iri, using the straightforward
        # urllib percent quoting (but this is ugly!)
        logger.warning('Could not safely quote as IRI, falling back to '
                       'percent encoding')
        quoted_iri = urllib.quote(iri.encode('utf-8'))

    return quoted_iri
def __str__(self):
    """Return the result of ``compose`` applied to ``self.components``."""
    parts = self.components
    return compose(**parts)