def uri_to_iri(uri, charset='utf-8', errors='replace'):
    r"""Convert a URI in a given charset to an IRI.

    Examples for URI versus IRI

    >>> uri_to_iri('http://xn--n3h.net/')
    u'http://\u2603.net/'
    >>> uri_to_iri('http://%C3%BCser:p%C3%A4ssword@xn--n3h.net/p%C3%A5th')
    u'http://\xfcser:p\xe4ssword@\u2603.net/p\xe5th'

    Query strings are left unchanged:

    >>> uri_to_iri('/?foo=24&x=%26%2f')
    u'/?foo=24&x=%26%2f'

    .. versionadded:: 0.6

    :param uri: the URI to convert
    :param charset: the charset of the URI
    :param errors: the error handling on decode
    """
    fixed = url_fix(str(uri), charset)
    scheme, auth, hostname, port, path, query, fragment = _uri_split(fixed)

    scheme = _decode_unicode(scheme, 'ascii', errors)

    # The idna codec accepts no error-handling argument, so lenient modes
    # are emulated by falling back to a plain ASCII decode.
    try:
        netloc = hostname.decode('idna')
    except UnicodeError:
        if errors in ('ignore', 'replace'):
            netloc = hostname.decode('ascii', errors)
        else:
            raise

    # A colon inside the host is treated as an IPv6 literal and wrapped in
    # brackets (presumably _uri_split hands the host back without them).
    if ':' in netloc:
        netloc = '[' + netloc + ']'

    if auth:
        # An empty password behaves exactly like a missing one.
        user, _, password = auth.partition(':')
        credentials = _decode_unicode(_unquote(user), charset, errors)
        if password:
            credentials += u':' + _decode_unicode(_unquote(password),
                                                  charset, errors)
        netloc = credentials + u'@' + netloc

    if port:
        # The port should be numeric, but it is decoded like any other text.
        netloc = netloc + u':' + port.decode(charset, errors)

    # The second argument to _unquote is the set of reserved characters for
    # that component (presumably left quoted -- see _unquote).
    path = _decode_unicode(_unquote(path, '/;?'), charset, errors)
    query = _decode_unicode(_unquote(query, ';/?:@&=+,$'), charset, errors)

    return urlparse.urlunsplit([scheme, netloc, path, query, fragment])
def uri_to_iri(uri, charset='utf-8', errors='replace'):
    r"""Converts a URI in a given charset to an IRI.

    Examples for URI versus IRI

    >>> uri_to_iri('http://xn--n3h.net/')
    u'http://\u2603.net/'
    >>> uri_to_iri('http://%C3%BCser:p%C3%A4ssword@xn--n3h.net/p%C3%A5th')
    u'http://\xfcser:p\xe4ssword@\u2603.net/p\xe5th'

    Query strings are left unchanged:

    >>> uri_to_iri('/?foo=24&x=%26%2f')
    u'/?foo=24&x=%26%2f'

    .. versionadded:: 0.6

    :param uri: the URI to convert
    :param charset: the charset of the URI
    :param errors: the error handling on decode
    """
    uri = url_fix(str(uri), charset)
    scheme, auth, hostname, port, path, query, fragment = _uri_split(uri)

    scheme = _decode_unicode(scheme, 'ascii', errors)

    try:
        hostname = hostname.decode('idna')
    except UnicodeError:
        # The idna codec does not support any error handling, so for the
        # lenient modes we fake it with a plain ASCII decode.
        if errors not in ('ignore', 'replace'):
            raise
        hostname = hostname.decode('ascii', errors)

    # A colon inside the host marks an IPv6 literal.  Without brackets the
    # address would be indistinguishable from a ``host:port`` pair once the
    # port is appended below, so the brackets are restored here.
    if ':' in hostname:
        hostname = '[' + hostname + ']'

    if auth:
        if ':' in auth:
            auth, password = auth.split(':', 1)
        else:
            password = None
        auth = _decode_unicode(_unquote(auth), charset, errors)
        if password:
            auth += u':' + _decode_unicode(_unquote(password),
                                           charset, errors)
        hostname = auth + u'@' + hostname
    if port:
        # port should be numeric, but you never know...
        hostname += u':' + port.decode(charset, errors)

    path = _decode_unicode(_unquote(path, '/;?'), charset, errors)
    query = _decode_unicode(_unquote(query, ';/?:@&=+,$'),
                            charset, errors)

    return urlparse.urlunsplit([scheme, hostname, path, query, fragment])
def url_unquote(s, charset='utf-8', errors='replace'):
    """URL decode a single string with a given decoding.

    Per default undecodable bytes are replaced (``errors='replace'``).
    Pass ``'ignore'`` or ``'strict'`` as `errors` for a different
    behavior; what happens on failure in strict mode is decided by
    `_decode_unicode` (historically documented as raising
    `HTTPUnicodeError`).

    :param s: the string to unquote.  Unicode input is first encoded
              with `charset`.
    :param charset: the charset to be used.
    :param errors: the error handling for the charset decoding.
    """
    # Unquoting operates on byte strings, so unicode is encoded up front.
    raw = s.encode(charset) if isinstance(s, unicode) else s
    unquoted = _unquote(raw)
    return _decode_unicode(unquoted, charset, errors)
def _url_decode_impl(pair_iter, charset, decode_keys, include_empty, errors): for pair in pair_iter: if not pair: continue if '=' in pair: key, value = pair.split('=', 1) else: if not include_empty: continue key = pair value = '' key = _unquote_plus(key) if decode_keys: key = _decode_unicode(key, charset, errors) yield key, url_unquote_plus(value, charset, errors)