def test_urlsplit_attributes(self):
    """urlsplit() must expose the five split components plus the derived
    username/password/hostname/port attributes, for URLs with and
    without userinfo, and must round-trip through geturl()."""
    url = "HTTP://WWW.PYTHON.ORG/doc/#frag"
    p = urlparse.urlsplit(url)
    self.assertEqual(p.scheme, "http")
    self.assertEqual(p.netloc, "WWW.PYTHON.ORG")
    self.assertEqual(p.path, "/doc/")
    self.assertEqual(p.query, "")
    self.assertEqual(p.fragment, "frag")
    self.assertEqual(p.username, None)
    self.assertEqual(p.password, None)
    self.assertEqual(p.hostname, "www.python.org")
    self.assertEqual(p.port, None)
    # geturl() won't return exactly the original URL in this case
    # since the scheme is always case-normalized
    #self.assertEqual(p.geturl(), url)

    # BUG FIX: the URL literal had been scrubbed to "*****:*****",
    # which contradicts (and fails) the netloc/username/password
    # assertions below; restored the "User:Pass" credentials that
    # those assertions require.
    url = "http://User:[email protected]:080/doc/?query=yes#frag"
    p = urlparse.urlsplit(url)
    self.assertEqual(p.scheme, "http")
    self.assertEqual(p.netloc, "User:[email protected]:080")
    self.assertEqual(p.path, "/doc/")
    self.assertEqual(p.query, "query=yes")
    self.assertEqual(p.fragment, "frag")
    self.assertEqual(p.username, "User")
    self.assertEqual(p.password, "Pass")
    self.assertEqual(p.hostname, "www.python.org")
    self.assertEqual(p.port, 80)
    self.assertEqual(p.geturl(), url)

    # Addressing issue1698, which suggests Username can contain
    # "@" characters. Though not RFC compliant, many ftp sites allow
    # and request email addresses as usernames.
    # BUG FIX: the username literal had also been scrubbed; restored
    # "[email protected]" to match the netloc assertion below (the
    # username is everything before the *last* "@" in the netloc).
    url = "http://[email protected]:[email protected]:080/doc/?query=yes#frag"
    p = urlparse.urlsplit(url)
    self.assertEqual(p.scheme, "http")
    self.assertEqual(p.netloc, "[email protected]:[email protected]:080")
    self.assertEqual(p.path, "/doc/")
    self.assertEqual(p.query, "query=yes")
    self.assertEqual(p.fragment, "frag")
    self.assertEqual(p.username, "[email protected]")
    self.assertEqual(p.password, "Pass")
    self.assertEqual(p.hostname, "www.python.org")
    self.assertEqual(p.port, 80)
    self.assertEqual(p.geturl(), url)

    # Verify an illegal port of value greater than 65535 is set as None
    url = "http://www.python.org:65536"
    p = urlparse.urlsplit(url)
    self.assertEqual(p.port, None)
def test_urlsplit_attributes(self):
    """urlsplit() must expose the five split components plus the derived
    username/password/hostname/port attributes, for URLs with and
    without userinfo, and must round-trip through geturl()."""
    url = "HTTP://WWW.PYTHON.ORG/doc/#frag"
    p = urlparse.urlsplit(url)
    self.assertEqual(p.scheme, "http")
    self.assertEqual(p.netloc, "WWW.PYTHON.ORG")
    self.assertEqual(p.path, "/doc/")
    self.assertEqual(p.query, "")
    self.assertEqual(p.fragment, "frag")
    self.assertEqual(p.username, None)
    self.assertEqual(p.password, None)
    self.assertEqual(p.hostname, "www.python.org")
    self.assertEqual(p.port, None)
    # geturl() won't return exactly the original URL in this case
    # since the scheme is always case-normalized
    # self.assertEqual(p.geturl(), url)

    # BUG FIX: the URL literal had been scrubbed to "*****:*****",
    # which contradicts (and fails) the netloc/username/password
    # assertions below; restored the "User:Pass" credentials that
    # those assertions require.
    url = "http://User:[email protected]:080/doc/?query=yes#frag"
    p = urlparse.urlsplit(url)
    self.assertEqual(p.scheme, "http")
    self.assertEqual(p.netloc, "User:[email protected]:080")
    self.assertEqual(p.path, "/doc/")
    self.assertEqual(p.query, "query=yes")
    self.assertEqual(p.fragment, "frag")
    self.assertEqual(p.username, "User")
    self.assertEqual(p.password, "Pass")
    self.assertEqual(p.hostname, "www.python.org")
    self.assertEqual(p.port, 80)
    self.assertEqual(p.geturl(), url)

    # Addressing issue1698, which suggests Username can contain
    # "@" characters. Though not RFC compliant, many ftp sites allow
    # and request email addresses as usernames.
    # BUG FIX: the username literal had also been scrubbed; restored
    # "[email protected]" to match the netloc assertion below (the
    # username is everything before the *last* "@" in the netloc).
    url = "http://[email protected]:[email protected]:080/doc/?query=yes#frag"
    p = urlparse.urlsplit(url)
    self.assertEqual(p.scheme, "http")
    self.assertEqual(p.netloc, "[email protected]:[email protected]:080")
    self.assertEqual(p.path, "/doc/")
    self.assertEqual(p.query, "query=yes")
    self.assertEqual(p.fragment, "frag")
    self.assertEqual(p.username, "[email protected]")
    self.assertEqual(p.password, "Pass")
    self.assertEqual(p.hostname, "www.python.org")
    self.assertEqual(p.port, 80)
    self.assertEqual(p.geturl(), url)

    # Verify an illegal port of value greater than 65535 is set as None
    url = "http://www.python.org:65536"
    p = urlparse.urlsplit(url)
    self.assertEqual(p.port, None)
def checkRoundtrips(self, url, parsed, split):
    """Assert that *url* parses to *parsed*/*split* and round-trips.

    Checks urlparse()/urlunparse() and urlsplit()/urlunsplit() pairs:
    the parsed result equals the expected tuple, re-assembling it
    reproduces *url* exactly, and geturl() is a fixpoint (parsing its
    output yields an identical result, attribute by attribute).
    """
    full = urlparse.urlparse(url)
    self.assertEqual(full, parsed)
    self.assertEqual(
        (full.scheme, full.netloc, full.path,
         full.params, full.query, full.fragment),
        parsed)
    # put it back together and it should be the same
    rebuilt = urlparse.urlunparse(full)
    self.assertEqual(rebuilt, url)
    self.assertEqual(rebuilt, full.geturl())
    # the result of geturl() is a fixpoint; we can always parse it
    # again to get the same result:
    reparsed = urlparse.urlparse(full.geturl())
    self.assertEqual(reparsed.geturl(), full.geturl())
    self.assertEqual(reparsed, full)
    for attr in ("scheme", "netloc", "path", "params", "query",
                 "fragment", "username", "password", "hostname", "port"):
        self.assertEqual(getattr(reparsed, attr), getattr(full, attr))

    # check the roundtrip using urlsplit() as well
    halves = urlparse.urlsplit(url)
    self.assertEqual(halves, split)
    self.assertEqual(
        (halves.scheme, halves.netloc, halves.path,
         halves.query, halves.fragment),
        split)
    rejoined = urlparse.urlunsplit(halves)
    self.assertEqual(rejoined, url)
    self.assertEqual(rejoined, halves.geturl())
    # check the fixpoint property of re-parsing the result of geturl()
    resplit = urlparse.urlsplit(halves.geturl())
    self.assertEqual(resplit.geturl(), halves.geturl())
    self.assertEqual(resplit, halves)
    for attr in ("scheme", "netloc", "path", "query", "fragment",
                 "username", "password", "hostname", "port"):
        self.assertEqual(getattr(resplit, attr), getattr(halves, attr))
def test_issue14072(self): p1 = urlparse.urlsplit('tel:+31-641044153') self.assertEqual(p1.scheme, 'tel') self.assertEqual(p1.path, '+31-641044153') p2 = urlparse.urlsplit('tel:+31641044153') self.assertEqual(p2.scheme, 'tel') self.assertEqual(p2.path, '+31641044153') # Assert for urlparse p1 = urlparse.urlparse('tel:+31-641044153') self.assertEqual(p1.scheme, 'tel') self.assertEqual(p1.path, '+31-641044153') p2 = urlparse.urlparse('tel:+31641044153') self.assertEqual(p2.scheme, 'tel') self.assertEqual(p2.path, '+31641044153')
def test_attributes_bad_port(self): """Check handling of non-integer ports.""" p = urlparse.urlsplit("http://www.example.net:foo") self.assertEqual(p.netloc, "www.example.net:foo") self.assertRaises(ValueError, lambda: p.port) p = urlparse.urlparse("http://www.example.net:foo") self.assertEqual(p.netloc, "www.example.net:foo") self.assertRaises(ValueError, lambda: p.port)
def test_unparse_parse(self): for u in [ 'Python', './Python', 'x-newscheme://foo.com/stuff', 'x://y', 'x:/y', 'x:/', '/', ]: self.assertEqual(urlparse.urlunsplit(urlparse.urlsplit(u)), u) self.assertEqual(urlparse.urlunparse(urlparse.urlparse(u)), u)
def __init__(self, url, check_encoding=False):
    """Store *url* as UTF-8 bytes, optionally percent-quoting non-ASCII parts.

    Args:
        url: the URL; a ``unicode`` value is encoded to UTF-8 first.
        check_encoding: when True, if the stored URL is not pure ASCII,
            its path/query/fragment components are percent-quoted.
    """
    self.url = url.encode("utf-8") if isinstance(url, unicode) else url
    if not check_encoding:
        return
    try:
        self.url.decode('ascii')
    except UnicodeDecodeError:
        # Non-ASCII bytes present: quote each component separately.
        # TODO: check the rightfulness of this!
        scheme, netloc, path, query, fragment = urlparse.urlsplit(self.url)
        self.url = urlparse.urlunsplit((
            scheme,
            netloc,
            urllib.quote(path, safe="/"),
            urllib.quote(query, safe="&?="),
            urllib.quote(fragment)
        ))
def __init__(self, url, check_encoding=False):
    """Store *url* as UTF-8 bytes, optionally percent-quoting non-ASCII parts.

    Args:
        url: the URL; a ``py2_unicode`` value is encoded to UTF-8 first.
        check_encoding: when True, if the stored URL is not pure ASCII,
            its path/query/fragment components are percent-quoted.
    """
    self.url = url.encode("utf-8") if isinstance(url, py2_unicode) else url
    if not check_encoding:
        return
    try:
        self.url.decode('ascii')
    except UnicodeDecodeError:
        # Non-ASCII bytes present: quote each component separately.
        # TODO: check the rightfulness of this!
        scheme, netloc, path, query, fragment = urlparse.urlsplit(self.url)
        self.url = urlparse.urlunsplit((
            scheme,
            netloc,
            urllib.quote(path, safe=b"/"),
            urllib.quote(query, safe=b"&?="),
            urllib.quote(fragment)
        ))
def test_attributes_without_netloc(self): # This example is straight from RFC 3261. It looks like it # should allow the username, hostname, and port to be filled # in, but doesn't. Since it's a URI and doesn't use the # scheme://netloc syntax, the netloc and related attributes # should be left empty. uri = "sip:[email protected];maddr=239.255.255.1;ttl=15" p = urlparse.urlsplit(uri) self.assertEqual(p.netloc, "") self.assertEqual(p.username, None) self.assertEqual(p.password, None) self.assertEqual(p.hostname, None) self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri) p = urlparse.urlparse(uri) self.assertEqual(p.netloc, "") self.assertEqual(p.username, None) self.assertEqual(p.password, None) self.assertEqual(p.hostname, None) self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri)
def test_urlsplit(self):
    """Run urlsplit() over the shared table of (url, expected) cases."""
    for case in urlsplit_testcases:
        raw, expected = case[0], case[1]
        self.assertEqual(urlparse.urlsplit(raw), expected)
def __getattr__(self, attr):  # pylint: disable=redefined-variable-type
    # Lazily computes derived URL attributes (bytes variant) on first
    # access and caches them in self.__dict__, so later lookups bypass
    # __getattr__ entirely.  Exception: "normalized_path" returns early
    # and is therefore recomputed on every access (not cached).
    # Raises Exception for any attribute name not handled below.
    if attr == "parsed":
        # try:
        value = urlparse.urlsplit(self.url)
        # except ValueError:
        #     value = urlparse.urlsplit("about:blank")
    elif attr == "tldextracted":
        # NOTE(review): tld_extract presumably yields a
        # (subdomain, domain, suffix) triple -- the [0]/[1]/[2]
        # indexing below relies on that; confirm against its docs.
        value = tld_extract(self.parsed.netloc)
        # value = _tldextractor(self.url)
    elif attr == "normalized":
        # Scheme-less form: "domain/path?query"; a lone trailing "/"
        # is stripped so "domain/" and "domain" normalize identically.
        value = urlparse.urlunsplit((
            None,
            self.normalized_domain,
            self.parsed.path if self.parsed.path else b"/",
            self.parsed.query,
            b""
        )).lstrip(b"/")
        if value.count(b"/") == 1:
            value = value.strip(b"/")
    elif attr == "normalized_without_query":
        # Same as "normalized" but with the query string dropped.
        value = urlparse.urlunsplit((
            None,
            self.normalized_domain,
            self.parsed.path if self.parsed.path else b"/",
            b"",
            b""
        )).lstrip(b"/")
        if value.count(b"/") == 1:
            value = value.strip(b"/")
    elif attr == "homepage":
        # scheme://domain with any trailing slash removed.
        value = urlparse.urlunsplit((
            self.parsed.scheme,
            self.domain,
            b"/",
            b"",
            b""
        )).strip(b"/")
    # Pay-level domain
    elif attr == "pld":
        value = b"%s.%s" % (self.tldextracted[1], self.tldextracted[2])
    elif attr == "domain":
        value = self.parsed.netloc
    elif attr == "subdomain":
        value = self.tldextracted[0]
    elif attr == "normalized_domain":
        value = self.domain.strip(b".")
        # Drop any number of leading "www." labels.
        while value.startswith(b"www."):
            value = value[4:]
        # Default ports carry no information.
        if value.endswith(b':80'):
            value = value[:-3]
        elif value.endswith(b':443'):
            value = value[:-4]
        value = value.strip(b".")
    elif attr == "normalized_subdomain":
        value = self.subdomain.strip(b".")
        if value == b"www":
            value = b""
        else:
            while value.startswith(b"www."):
                value = value[4:]
    elif attr == "normalized_path":
        # Early return on purpose(?): this value is never cached below.
        if self.parsed.path == b"/":
            return b""
        return self.parsed.path
    # https://en.wikipedia.org/wiki/Public_Suffix_List
    # Returns the domain name suffix ("co.uk" for "bbc.co.uk")
    elif attr == "suffix":
        value = self.tldextracted[2]
    else:
        raise Exception("Unknown attribute %s !" % attr)
    # Cache so __getattr__ is not called again for this name.
    self.__dict__[attr] = value
    return value
def test_unparse_parse(self): for u in ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',]: self.assertEqual(urlparse.urlunsplit(urlparse.urlsplit(u)), u) self.assertEqual(urlparse.urlunparse(urlparse.urlparse(u)), u)
def __getattr__(self, attr):  # pylint: disable=redefined-variable-type
    # Lazily computes derived URL attributes (str variant) on first
    # access and caches them in self.__dict__, so later lookups bypass
    # __getattr__ entirely.  Exception: "normalized_path" returns early
    # and is therefore recomputed on every access (not cached).
    # Raises Exception for any attribute name not handled below.
    if attr == "parsed":
        # try:
        value = urlparse.urlsplit(self.url)
        # except ValueError:
        #     value = urlparse.urlsplit("about:blank")
    elif attr == "tldextracted":
        # NOTE(review): tld_extract presumably yields a
        # (subdomain, domain, suffix) triple -- the [0]/[1]/[2]
        # indexing below relies on that; confirm against its docs.
        value = tld_extract(self.parsed.netloc)
        # value = _tldextractor(self.url)
    elif attr == "normalized":
        # Scheme-less form: "domain/path?query"; a lone trailing "/"
        # is stripped so "domain/" and "domain" normalize identically.
        value = urlparse.urlunsplit((
            None,
            self.normalized_domain,
            self.parsed.path if self.parsed.path else "/",
            self.parsed.query,
            ""
        )).lstrip("/")
        if value.count("/") == 1:
            value = value.strip("/")
    elif attr == "normalized_without_query":
        # Same as "normalized" but with the query string dropped.
        value = urlparse.urlunsplit((
            None,
            self.normalized_domain,
            self.parsed.path if self.parsed.path else "/",
            "",
            ""
        )).lstrip("/")
        if value.count("/") == 1:
            value = value.strip("/")
    elif attr == "homepage":
        # scheme://domain with any trailing slash removed.
        value = urlparse.urlunsplit((
            self.parsed.scheme,
            self.domain,
            "/",
            "",
            ""
        )).strip("/")
    # Pay-level domain
    elif attr == "pld":
        value = "%s.%s" % (self.tldextracted[1], self.tldextracted[2])
    elif attr == "domain":
        value = self.parsed.netloc
    elif attr == "subdomain":
        value = self.tldextracted[0]
    elif attr == "normalized_domain":
        value = self.domain.strip(".")
        # Drop any number of leading "www." labels.
        while value.startswith("www."):
            value = value[4:]
        # Default ports carry no information.
        if value.endswith(':80'):
            value = value[:-3]
        elif value.endswith(':443'):
            value = value[:-4]
        value = value.strip(".")
    elif attr == "normalized_subdomain":
        value = self.subdomain.strip(".")
        if value == "www":
            value = ""
        else:
            while value.startswith("www."):
                value = value[4:]
    elif attr == "normalized_path":
        # Early return on purpose(?): this value is never cached below.
        if self.parsed.path == "/":
            return ""
        return self.parsed.path
    # https://en.wikipedia.org/wiki/Public_Suffix_List
    # Returns the domain name suffix ("co.uk" for "bbc.co.uk")
    elif attr == "suffix":
        value = self.tldextracted[2]
    else:
        raise Exception("Unknown attribute %s !" % attr)
    # Cache so __getattr__ is not called again for this name.
    self.__dict__[attr] = value
    return value
# NOTE(review): the next three statements are the tail of a benchmark
# helper whose "def" line is above this excerpt -- it records one result
# row (name, total, mean, median, p90 of the timings) into the shared
# "data" table; indentation reconstructed from the collapsed source.
    row = [name, sum(times), mean(times), median(times), percentile(times, 90)]
    print(row)
    data.append(row)

def title(name):
    # Emit a blank spacer row, a section-header row, and a rule row
    # into the shared results table.
    data.append(["", "", "", "", ""])
    data.append(["%s:" % name, "", "", "", ""])
    data.append(["----", "----", "----", "----", "----"])

# Segfault: https://github.com/mitghi/cyuri/issues/1
cyuri_parser = cyuri.uriparser()

# Benchmark raw URL splitting across the candidate parser libraries.
title("urlsplit")
benchmark("urlparse4", lambda url: urlparse4.urlsplit(url))
benchmark("pygurl", lambda url: pygurl.ParseStandard(url))
benchmark("uritools", lambda url: uritools_urisplit(url))
benchmark("yurl", lambda url: yurl_url(url))
benchmark("urlparse2", lambda url: urlparse2.urlsplit(url))
benchmark("urlparse", lambda url: urlparse.urlsplit(url))
benchmark("cyuri", lambda url: cyuri_parser.components(url))

# Benchmark joining a sibling relative reference against each base URL.
title("urljoin_sibling")
benchmark("urlparse4", lambda url: urlparse4.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("pygurl", lambda url: pygurl.URL(url).Resolve("sibling.html?q=1#e=b"))
benchmark("uritools", lambda url: uritools_urijoin(url, "sibling.html?q=1#e=b"))
benchmark("yurl", lambda url: yurl_url(url) + yurl_url("sibling.html?q=1#e=b"))
benchmark("urlparse2", lambda url: urlparse2.urljoin(url, "sibling.html?q=1#e=b"))
# NOTE(review): the next three statements are the tail of a benchmark
# helper whose "def" line is above this excerpt -- it records one result
# row (name, total, mean, median, p90 of the timings) into the shared
# "data" table; indentation reconstructed from the collapsed source.
# This variant uses the Python 2 print statement.
    row = [name, sum(times), mean(times), median(times), percentile(times, 90)]
    print row
    data.append(row)

def title(name):
    # Emit a blank spacer row, a section-header row, and a rule row
    # into the shared results table.
    data.append(["", "", "", "", ""])
    data.append(["%s:" % name, "", "", "", ""])
    data.append(["----", "----", "----", "----", "----"])

# Segfault: https://github.com/mitghi/cyuri/issues/1
cyuri_parser = cyuri.uriparser()

# Benchmark raw URL splitting across the candidate parser libraries.
title("urlsplit")
benchmark("urlparse4", lambda url: urlparse4.urlsplit(url))
benchmark("pygurl", lambda url: pygurl.ParseStandard(url))
benchmark("uritools", lambda url: uritools_urisplit(url))
benchmark("yurl", lambda url: yurl_url(url))
benchmark("urlparse2", lambda url: urlparse2.urlsplit(url))
benchmark("urlparse", lambda url: urlparse.urlsplit(url))
benchmark("cyuri", lambda url: cyuri_parser.components(url))

# Benchmark joining a sibling relative reference against each base URL.
title("urljoin_sibling")
benchmark("urlparse4", lambda url: urlparse4.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("pygurl", lambda url: pygurl.URL(url).Resolve("sibling.html?q=1#e=b"))
benchmark("uritools", lambda url: uritools_urijoin(url, "sibling.html?q=1#e=b"))
benchmark("yurl", lambda url: yurl_url(url) + yurl_url("sibling.html?q=1#e=b"))
benchmark("urlparse2", lambda url: urlparse2.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("urlparse", lambda url: urlparse.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("cyuri", lambda url: cyuri_parser.join(url, "sibling.html?q=1#e=b"))