def test_iri_support(self):
    # Exercises both conversion directions between ASCII-only URIs and
    # unicode IRIs: punycode hosts, percent-encoded credentials and paths,
    # and plain paths without a host.

    # A punycode host is rendered as its unicode form.
    self.assertEqual(
        urls.uri_to_iri('http://xn--n3h.net/'),
        'http://\u2603.net/'
    )
    # Percent-encoded UTF-8 in the userinfo and path is decoded as well.
    # NOTE: this literal had been mangled by an e-mail obfuscation filter
    # ("[email protected]"); it is restored from the expected IRI below.
    self.assertEqual(
        urls.uri_to_iri(
            'http://%C3%BCser:p%C3%A4ssword@xn--n3h.net/p%C3%A5th'
        ),
        'http://\xfcser:p\xe4ssword@\u2603.net/p\xe5th'
    )

    # The reverse direction produces the punycode / percent-encoded form.
    self.assertEqual(
        urls.iri_to_uri('http://☃.net/'),
        'http://xn--n3h.net/'
    )
    self.assertEqual(
        urls.iri_to_uri('http://üser:pässword@☃.net/påth'),
        'http://%C3%BCser:p%C3%A4ssword@xn--n3h.net/p%C3%A5th'
    )

    # Percent-encoded reserved characters ("?", "&", "/") must stay
    # encoded, otherwise the structure of the URL would change.
    self.assertEqual(
        urls.uri_to_iri('http://test.com/%3Fmeh?foo=%26%2F'),
        'http://test.com/%3Fmeh?foo=%26%2F'
    )

    # A bare path passes through untouched; an explicit port is kept.
    self.assertEqual(urls.iri_to_uri('/foo'), '/foo')
    self.assertEqual(
        urls.iri_to_uri('http://föö.com:8080/bam/baz'),
        'http://xn--f-1gaa.com:8080/bam/baz'
    )
def get_current_url(
    environ,
    root_only=False,
    strip_querystring=False,
    host_only=False,
    trusted_hosts=None,
):
    """Recreate the URL of the current request, or a part of it, as an IRI.

    Here is an example:

    >>> from verktyg.test import create_environ
    >>> env = create_environ("/?param=foo", "http://localhost/script")
    >>> get_current_url(env)
    'http://localhost/script/?param=foo'
    >>> get_current_url(env, root_only=True)
    'http://localhost/script/'
    >>> get_current_url(env, host_only=True)
    'http://localhost/'
    >>> get_current_url(env, strip_querystring=True)
    'http://localhost/script/'

    Optionally it also verifies that the host is in a list of trusted
    hosts.  If the host is not in there it will raise a
    :exc:`~verktyg.exceptions.SecurityError`.

    Note that the string returned might contain unicode characters as the
    representation is an IRI not an URI.  If you need an ASCII only
    representation you can use the :func:`~verktyg.urls.iri_to_uri`
    function:

    >>> from verktyg.urls import iri_to_uri
    >>> iri_to_uri(get_current_url(env))
    'http://localhost/script/?param=foo'

    :param environ:
        The WSGI environment to get the current URL from.
    :param root_only:
        Set `True` if you only want the root URL.
    :param strip_querystring:
        Set to `True` if you don't want the querystring.
    :param host_only:
        Set to `True` if the host URL should be returned.
    :param trusted_hosts:
        A list of trusted hosts, see :func:`host_is_trusted` for more
        information.
    """
    # scheme://host -- the host lookup also receives the trusted host list.
    origin = ''.join(
        [environ['wsgi.url_scheme'], '://', get_host(environ, trusted_hosts)]
    )
    if host_only:
        return uri_to_iri(origin + '/')

    # The script root always ends in exactly one slash.
    script_name = urlquote(wsgi_get_bytes(environ.get('SCRIPT_NAME', '')))
    url = origin + script_name.rstrip('/') + '/'
    if root_only:
        return uri_to_iri(url)

    # Leading slashes of the path info are dropped because the root
    # already contributed the separator.
    path_info = wsgi_get_bytes(environ.get('PATH_INFO', ''))
    url += urlquote(path_info.lstrip(b'/'))

    if not strip_querystring:
        query = get_query_string(environ)
        if query:
            url += '?' + query
    return uri_to_iri(url)
def get_current_url(environ, root_only=False, strip_querystring=False,
                    host_only=False, trusted_hosts=None):
    """Rebuild the full URL of the current request (or parts of it) as an
    IRI.  For example:

    >>> from verktyg.test import create_environ
    >>> env = create_environ("/?param=foo", "http://localhost/script")
    >>> get_current_url(env)
    'http://localhost/script/?param=foo'
    >>> get_current_url(env, root_only=True)
    'http://localhost/script/'
    >>> get_current_url(env, host_only=True)
    'http://localhost/'
    >>> get_current_url(env, strip_querystring=True)
    'http://localhost/script/'

    The host can optionally be checked against a list of trusted hosts.
    If it is not in that list a :exc:`~verktyg.exceptions.SecurityError`
    is raised.

    The returned string is an IRI, not a URI, so it might contain unicode
    characters.  Use :func:`~verktyg.urls.iri_to_uri` if an ASCII only
    representation is needed:

    >>> from verktyg.urls import iri_to_uri
    >>> iri_to_uri(get_current_url(env))
    'http://localhost/script/?param=foo'

    :param environ: the WSGI environment to get the current URL from.
    :param root_only: set `True` if you only want the root URL.
    :param strip_querystring: set to `True` if you don't want the
                              querystring.
    :param host_only: set to `True` if the host URL should be returned.
    :param trusted_hosts: a list of trusted hosts, see
                          :func:`host_is_trusted` for more information.
    """
    parts = [
        environ['wsgi.url_scheme'],
        '://',
        get_host(environ, trusted_hosts),
    ]
    if host_only:
        parts.append('/')
        return uri_to_iri(''.join(parts))

    # Script root, normalized to end in exactly one slash.
    script_root = urlquote(wsgi_get_bytes(environ.get('SCRIPT_NAME', '')))
    parts += [script_root.rstrip('/'), '/']

    if not root_only:
        # The path info is appended without its leading slashes; the
        # query string is only ever added together with the path.
        raw_path = wsgi_get_bytes(environ.get('PATH_INFO', ''))
        parts.append(urlquote(raw_path.lstrip(b'/')))
        query_string = (
            None if strip_querystring else get_query_string(environ)
        )
        if query_string:
            parts.append('?' + query_string)

    return uri_to_iri(''.join(parts))
def test_uri_iri_normalization(self):
    # Every spelling below denotes the same resource, so each one has to
    # normalize to the same URI and the same IRI, whichever conversions
    # are chained and in whichever order.
    expected_uri = 'http://xn--f-rgao.com/%E2%98%90/fred?utf8=%E2%9C%93'
    expected_iri = 'http://föñ.com/\N{BALLOT BOX}/fred?utf8=\u2713'

    spellings = (
        'http://föñ.com/\N{BALLOT BOX}/fred?utf8=\u2713',
        'http://xn--f-rgao.com/\u2610/fred?utf8=\N{CHECK MARK}',
        'http://xn--f-rgao.com/%E2%98%90/fred?utf8=%E2%9C%93',
        'http://xn--f-rgao.com/%E2%98%90/fred?utf8=%E2%9C%93',
        'http://föñ.com/\u2610/fred?utf8=%E2%9C%93',
    )

    for spelling in spellings:
        # single conversions
        self.assertEqual(urls.uri_to_iri(spelling), expected_iri)
        self.assertEqual(urls.iri_to_uri(spelling), expected_uri)
        # mixed round trips
        self.assertEqual(
            urls.uri_to_iri(urls.iri_to_uri(spelling)), expected_iri
        )
        self.assertEqual(
            urls.iri_to_uri(urls.uri_to_iri(spelling)), expected_uri
        )
        # applying the same conversion twice
        self.assertEqual(
            urls.uri_to_iri(urls.uri_to_iri(spelling)), expected_iri
        )
        self.assertEqual(
            urls.iri_to_uri(urls.iri_to_uri(spelling)), expected_uri
        )
def test_uri_to_iri_to_uri(self):
    # URI -> IRI -> URI must give back the URI we started with.
    original_uri = 'http://xn--f-rgao.com/%C3%9E'
    round_tripped = urls.iri_to_uri(urls.uri_to_iri(original_uri))
    self.assertEqual(round_tripped, original_uri)
def test_iri_to_uri_to_iri(self):
    # IRI -> URI -> IRI must give back the IRI we started with.
    original_iri = 'http://föö.com/'
    round_tripped = urls.uri_to_iri(urls.iri_to_uri(original_iri))
    self.assertEqual(round_tripped, original_iri)
def test_uri_to_iri_idempotence_non_ascii(self):
    # Converting an already converted IRI (punycode host and percent
    # encoded path in the input) a second time must not change it.
    converted_once = urls.uri_to_iri('http://xn--n3h/%E2%98%83')
    converted_twice = urls.uri_to_iri(converted_once)
    self.assertEqual(converted_twice, converted_once)
def test_uri_to_iri_idempotence_ascii_only(self):
    # Same idempotence check as for the non-ASCII case, but with a URL
    # that consists of ASCII characters only.
    converted_once = urls.uri_to_iri('http://www.idempoten.ce')
    converted_twice = urls.uri_to_iri(converted_once)
    self.assertEqual(converted_twice, converted_once)
def extract_path_info(environ_or_baseurl, path_or_url, errors='replace',
                      collapse_http_schemes=True):
    """Extracts the path info from the given URL (or WSGI environment) and
    path.  The path info returned is a unicode string, not a bytestring
    suitable for a WSGI environment.  The URLs might also be IRIs.

    If the path info could not be determined, `None` is returned.  That is
    the case when either URL is not on an http/https scheme, when the
    schemes or network locations are not compatible, or when the path is
    not located below the application root.  The application root only
    matches on whole path segments: with a base of ``/app`` the path
    ``/app/hello`` is inside the application, ``/application`` is not.

    Some examples:

    >>> extract_path_info('http://example.com/app', '/app/hello')
    u'/hello'
    >>> extract_path_info('http://example.com/app',
    ...                   'https://example.com/app/hello')
    u'/hello'
    >>> extract_path_info('http://example.com/app',
    ...                   'https://example.com/app/hello',
    ...                   collapse_http_schemes=False) is None
    True

    Instead of providing a base URL you can also pass a WSGI environment.

    :param environ_or_baseurl:
        A WSGI environment dict, a base URL or base IRI.  This is the root
        of the application.
    :param path_or_url:
        An absolute path from the server root, a relative path (in which
        case it's the path info) or a full URL.  Also accepts IRIs and
        unicode parameters.
    :param errors:
        The error handling on decode.
    :param collapse_http_schemes:
        If set to `False` the algorithm does not assume that http and
        https on the same server point to the same resource.
    :return:
        The path info with a single leading slash, or `None`.
    """
    def _normalize_netloc(scheme, netloc):
        # Strip the userinfo and the default port of the scheme so that
        # equivalent network locations compare equal.
        parts = netloc.split(u'@', 1)[-1].split(u':', 1)
        if len(parts) == 2:
            netloc, port = parts
            if ((scheme == u'http' and port == u'80') or
                    (scheme == u'https' and port == u'443')):
                port = None
        else:
            netloc = parts[0]
            port = None
        if port is not None:
            netloc += u':' + port
        return netloc

    # make sure whatever we are working on is a IRI and parse it
    path = uri_to_iri(path_or_url, errors=errors)
    if isinstance(environ_or_baseurl, dict):
        environ_or_baseurl = get_current_url(environ_or_baseurl,
                                             root_only=True)
    base_iri = uri_to_iri(environ_or_baseurl, errors=errors)
    base_scheme, base_netloc, base_path = urlsplit(base_iri)[:3]
    # resolving against the base turns relative paths into full URLs
    cur_scheme, cur_netloc, cur_path = urlsplit(urljoin(base_iri, path))[:3]

    # normalize the network location
    base_netloc = _normalize_netloc(base_scheme, base_netloc)
    cur_netloc = _normalize_netloc(cur_scheme, cur_netloc)

    # is that IRI even on a known HTTP scheme?
    if collapse_http_schemes:
        for scheme in base_scheme, cur_scheme:
            if scheme not in (u'http', u'https'):
                return None
    else:
        if not (base_scheme in (u'http', u'https') and
                base_scheme == cur_scheme):
            return None

    # are the netlocs compatible?
    if base_netloc != cur_netloc:
        return None

    # are we below the application path?
    base_path = base_path.rstrip(u'/')
    if not cur_path.startswith(base_path):
        return None

    # A plain prefix match is not enough: "/application" starts with
    # "/app" but is a different resource.  With a non-empty base path the
    # match has to end at the end of the path or on a segment boundary.
    remainder = cur_path[len(base_path):]
    if base_path and remainder and not remainder.startswith(u'/'):
        return None

    return u'/' + remainder.lstrip(u'/')
def test_iri_safe_quoting(self):
    # Percent-encoded reserved characters ("/", "%", "=") have to stay
    # encoded in the IRI, while the encoded "ö" is decoded.  Converting
    # back must reproduce the original URI.
    source_uri = 'http://xn--f-1gaa.com/%2F%25?q=%C3%B6&x=%3D%25#%25'
    expected_iri = 'http://föö.com/%2F%25?q=ö&x=%3D%25#%25'

    self.assertEqual(urls.uri_to_iri(source_uri), expected_iri)
    self.assertEqual(
        urls.iri_to_uri(urls.uri_to_iri(source_uri)), source_uri
    )