Пример #1
0
    def test_urlsplit_attributes(self):
        """Check the named attributes of ``urlparse.urlsplit()`` results.

        Exercises a plain URL with an upper-case scheme and host, a URL with
        credentials and a zero-padded port, a user name that itself contains
        "@" (issue1698), and a port above 65535.
        """
        url = "HTTP://WWW.PYTHON.ORG/doc/#frag"
        p = urlparse.urlsplit(url)
        self.assertEqual(p.scheme, "http")
        self.assertEqual(p.netloc, "WWW.PYTHON.ORG")
        self.assertEqual(p.path, "/doc/")
        self.assertEqual(p.query, "")
        self.assertEqual(p.fragment, "frag")
        self.assertIsNone(p.username)
        self.assertIsNone(p.password)
        self.assertEqual(p.hostname, "www.python.org")
        self.assertIsNone(p.port)
        # geturl() won't return exactly the original URL in this case
        # since the scheme is always case-normalized, so it is not compared.

        # The credentials in the URL must agree with the netloc, username
        # and password expectations checked below.
        url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"
        p = urlparse.urlsplit(url)
        self.assertEqual(p.scheme, "http")
        self.assertEqual(p.netloc, "User:Pass@www.python.org:080")
        self.assertEqual(p.path, "/doc/")
        self.assertEqual(p.query, "query=yes")
        self.assertEqual(p.fragment, "frag")
        self.assertEqual(p.username, "User")
        self.assertEqual(p.password, "Pass")
        self.assertEqual(p.hostname, "www.python.org")
        self.assertEqual(p.port, 80)
        self.assertEqual(p.geturl(), url)

        # Addressing issue1698, which suggests Username can contain
        # "@" characters.  Though not RFC compliant, many ftp sites allow
        # and request email addresses as usernames.
        url = ("http://User@example.com:Pass@www.python.org:080"
               "/doc/?query=yes#frag")
        p = urlparse.urlsplit(url)
        self.assertEqual(p.scheme, "http")
        self.assertEqual(p.netloc, "User@example.com:Pass@www.python.org:080")
        self.assertEqual(p.path, "/doc/")
        self.assertEqual(p.query, "query=yes")
        self.assertEqual(p.fragment, "frag")
        self.assertEqual(p.username, "User@example.com")
        self.assertEqual(p.password, "Pass")
        self.assertEqual(p.hostname, "www.python.org")
        self.assertEqual(p.port, 80)
        self.assertEqual(p.geturl(), url)

        # Verify an illegal port of value greater than 65535 is set as None
        url = "http://www.python.org:65536"
        p = urlparse.urlsplit(url)
        self.assertIsNone(p.port)
Пример #2
0
    def test_urlsplit_attributes(self):
        """Verify scheme, netloc, credentials, host and port of split URLs.

        Four inputs are checked in turn: a URL without credentials, one with
        ``User:Pass`` credentials and port ``080``, one whose user name is an
        e-mail address (issue1698), and one with the out-of-range port 65536.
        """
        # 1. No credentials, no port; scheme and host given in upper case.
        url = "HTTP://WWW.PYTHON.ORG/doc/#frag"
        p = urlparse.urlsplit(url)
        self.assertEqual(p.scheme, "http")
        self.assertEqual(p.netloc, "WWW.PYTHON.ORG")
        self.assertEqual(p.path, "/doc/")
        self.assertEqual(p.query, "")
        self.assertEqual(p.fragment, "frag")
        self.assertIsNone(p.username)
        self.assertIsNone(p.password)
        self.assertEqual(p.hostname, "www.python.org")
        self.assertIsNone(p.port)
        # geturl() is not compared with the input here: the scheme is always
        # case-normalized, so the result differs from the original URL.

        # 2. Credentials and a zero-padded port.  The literals must match the
        #    username/password/netloc values asserted below.
        url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"
        p = urlparse.urlsplit(url)
        self.assertEqual(p.scheme, "http")
        self.assertEqual(p.netloc, "User:Pass@www.python.org:080")
        self.assertEqual(p.path, "/doc/")
        self.assertEqual(p.query, "query=yes")
        self.assertEqual(p.fragment, "frag")
        self.assertEqual(p.username, "User")
        self.assertEqual(p.password, "Pass")
        self.assertEqual(p.hostname, "www.python.org")
        self.assertEqual(p.port, 80)
        self.assertEqual(p.geturl(), url)

        # 3. Addressing issue1698, which suggests Username can contain
        #    "@" characters.  Though not RFC compliant, many ftp sites allow
        #    and request email addresses as usernames.
        url = ("http://User@example.com:Pass@www.python.org:080"
               "/doc/?query=yes#frag")
        p = urlparse.urlsplit(url)
        self.assertEqual(p.scheme, "http")
        self.assertEqual(p.netloc, "User@example.com:Pass@www.python.org:080")
        self.assertEqual(p.path, "/doc/")
        self.assertEqual(p.query, "query=yes")
        self.assertEqual(p.fragment, "frag")
        self.assertEqual(p.username, "User@example.com")
        self.assertEqual(p.password, "Pass")
        self.assertEqual(p.hostname, "www.python.org")
        self.assertEqual(p.port, 80)
        self.assertEqual(p.geturl(), url)

        # 4. Verify an illegal port of value greater than 65535 is set as None
        url = "http://www.python.org:65536"
        p = urlparse.urlsplit(url)
        self.assertIsNone(p.port)
Пример #3
0
    def checkRoundtrips(self, url, parsed, split):
        """Assert that *url* survives parse/unparse and split/unsplit cycles.

        *parsed* is the expected 6-item result of ``urlparse.urlparse(url)``
        and *split* the expected 5-item result of ``urlparse.urlsplit(url)``.
        """
        parse_fields = ("scheme", "netloc", "path", "params", "query",
                        "fragment")
        split_fields = ("scheme", "netloc", "path", "query", "fragment")
        derived_fields = ("username", "password", "hostname", "port")

        # --- urlparse() / urlunparse() ---------------------------------
        first = urlparse.urlparse(url)
        self.assertEqual(first, parsed)
        by_name = tuple(getattr(first, field) for field in parse_fields)
        self.assertEqual(by_name, parsed)
        # Reassembling the pieces must give back the original text.
        rebuilt = urlparse.urlunparse(first)
        self.assertEqual(rebuilt, url)
        self.assertEqual(rebuilt, first.geturl())

        # geturl() is a fixpoint: parsing its output changes nothing.
        again = urlparse.urlparse(first.geturl())
        self.assertEqual(again.geturl(), first.geturl())
        self.assertEqual(again, first)
        for field in parse_fields + derived_fields:
            self.assertEqual(getattr(again, field), getattr(first, field))

        # --- urlsplit() / urlunsplit() ---------------------------------
        first = urlparse.urlsplit(url)
        self.assertEqual(first, split)
        by_name = tuple(getattr(first, field) for field in split_fields)
        self.assertEqual(by_name, split)
        rebuilt = urlparse.urlunsplit(first)
        self.assertEqual(rebuilt, url)
        self.assertEqual(rebuilt, first.geturl())

        # Same fixpoint property for the split form.
        again = urlparse.urlsplit(first.geturl())
        self.assertEqual(again.geturl(), first.geturl())
        self.assertEqual(again, first)
        for field in split_fields + derived_fields:
            self.assertEqual(getattr(again, field), getattr(first, field))
Пример #4
0
    def checkRoundtrips(self, url, parsed, split):
        """Run the round-trip checks for *url* with both parser flavours.

        The same sequence of assertions is applied first with
        ``urlparse()``/``urlunparse()`` against *parsed*, then with
        ``urlsplit()``/``urlunsplit()`` against *split*.
        """
        netloc_attrs = ("username", "password", "hostname", "port")
        flavours = (
            (urlparse.urlparse, urlparse.urlunparse, parsed,
             ("scheme", "netloc", "path", "params", "query", "fragment")),
            (urlparse.urlsplit, urlparse.urlunsplit, split,
             ("scheme", "netloc", "path", "query", "fragment")),
        )

        for take_apart, put_together, expected, names in flavours:
            result = take_apart(url)
            self.assertEqual(result, expected)
            # Attribute access must agree with positional access.
            values = tuple(getattr(result, name) for name in names)
            self.assertEqual(values, expected)

            # put it back together and it should be the same
            text = put_together(result)
            self.assertEqual(text, url)
            self.assertEqual(text, result.geturl())

            # the result of geturl() is a fixpoint; we can always parse it
            # again to get the same result:
            reparsed = take_apart(result.geturl())
            self.assertEqual(reparsed.geturl(), result.geturl())
            self.assertEqual(reparsed, result)
            for name in names + netloc_attrs:
                self.assertEqual(getattr(reparsed, name),
                                 getattr(result, name))
Пример #5
0
    def test_issue14072(self):
        p1 = urlparse.urlsplit('tel:+31-641044153')
        self.assertEqual(p1.scheme, 'tel')
        self.assertEqual(p1.path, '+31-641044153')

        p2 = urlparse.urlsplit('tel:+31641044153')
        self.assertEqual(p2.scheme, 'tel')
        self.assertEqual(p2.path, '+31641044153')

        # Assert for urlparse
        p1 = urlparse.urlparse('tel:+31-641044153')
        self.assertEqual(p1.scheme, 'tel')
        self.assertEqual(p1.path, '+31-641044153')

        p2 = urlparse.urlparse('tel:+31641044153')
        self.assertEqual(p2.scheme, 'tel')
        self.assertEqual(p2.path, '+31641044153')
Пример #6
0
    def test_issue14072(self):
        p1 = urlparse.urlsplit('tel:+31-641044153')
        self.assertEqual(p1.scheme, 'tel')
        self.assertEqual(p1.path, '+31-641044153')

        p2 = urlparse.urlsplit('tel:+31641044153')
        self.assertEqual(p2.scheme, 'tel')
        self.assertEqual(p2.path, '+31641044153')

        # Assert for urlparse
        p1 = urlparse.urlparse('tel:+31-641044153')
        self.assertEqual(p1.scheme, 'tel')
        self.assertEqual(p1.path, '+31-641044153')

        p2 = urlparse.urlparse('tel:+31641044153')
        self.assertEqual(p2.scheme, 'tel')
        self.assertEqual(p2.path, '+31641044153')
Пример #7
0
    def test_attributes_bad_port(self):
        """Check handling of non-integer ports."""
        p = urlparse.urlsplit("http://www.example.net:foo")
        self.assertEqual(p.netloc, "www.example.net:foo")
        self.assertRaises(ValueError, lambda: p.port)

        p = urlparse.urlparse("http://www.example.net:foo")
        self.assertEqual(p.netloc, "www.example.net:foo")
        self.assertRaises(ValueError, lambda: p.port)
Пример #8
0
    def test_attributes_bad_port(self):
        """Check handling of non-integer ports."""
        p = urlparse.urlsplit("http://www.example.net:foo")
        self.assertEqual(p.netloc, "www.example.net:foo")
        self.assertRaises(ValueError, lambda: p.port)

        p = urlparse.urlparse("http://www.example.net:foo")
        self.assertEqual(p.netloc, "www.example.net:foo")
        self.assertRaises(ValueError, lambda: p.port)
Пример #9
0
 def test_unparse_parse(self):
     for u in [
             'Python',
             './Python',
             'x-newscheme://foo.com/stuff',
             'x://y',
             'x:/y',
             'x:/',
             '/',
     ]:
         self.assertEqual(urlparse.urlunsplit(urlparse.urlsplit(u)), u)
         self.assertEqual(urlparse.urlunparse(urlparse.urlparse(u)), u)
Пример #10
0
    def __init__(self, url, check_encoding=False):
        """Store *url*, UTF-8 encoding it first when it is ``unicode``.

        When *check_encoding* is true and the stored value cannot be decoded
        as ASCII, its path, query and fragment are percent-quoted and the
        URL is reassembled.
        """
        self.url = url.encode("utf-8") if isinstance(url, unicode) else url

        if not check_encoding:
            return

        try:
            self.url.decode('ascii')
        except UnicodeDecodeError:
            scheme, netloc, path, query, fragment = urlparse.urlsplit(self.url)

            # TODO: check the rightfulness of this!
            # Scheme and netloc are passed through untouched; only the
            # remaining three components are quoted.
            requoted = (
                scheme,
                netloc,
                urllib.quote(path, safe="/"),
                urllib.quote(query, safe="&?="),
                urllib.quote(fragment),
            )
            self.url = urlparse.urlunsplit(requoted)
Пример #11
0
    def __init__(self, url, check_encoding=False):
        """Keep *url* on the instance as ``self.url``.

        A ``py2_unicode`` value is encoded to UTF-8 before being stored.
        With *check_encoding* set, a stored value that is not ASCII-decodable
        gets its path, query and fragment percent-quoted.
        """
        if isinstance(url, py2_unicode):
            url = url.encode("utf-8")
        self.url = url

        if check_encoding:
            try:
                self.url.decode('ascii')
            except UnicodeDecodeError:
                parts = list(urlparse.urlsplit(self.url))

                # TODO: check the rightfulness of this!
                # Indexes 2, 3 and 4 are path, query and fragment.
                parts[2] = urllib.quote(parts[2], safe=b"/")
                parts[3] = urllib.quote(parts[3], safe=b"&?=")
                parts[4] = urllib.quote(parts[4])
                self.url = urlparse.urlunsplit(tuple(parts))
Пример #12
0
    def test_attributes_without_netloc(self):
        # This example is straight from RFC 3261.  It looks like it
        # should allow the username, hostname, and port to be filled
        # in, but doesn't.  Since it's a URI and doesn't use the
        # scheme://netloc syntax, the netloc and related attributes
        # should be left empty.
        uri = "sip:[email protected];maddr=239.255.255.1;ttl=15"
        p = urlparse.urlsplit(uri)
        self.assertEqual(p.netloc, "")
        self.assertEqual(p.username, None)
        self.assertEqual(p.password, None)
        self.assertEqual(p.hostname, None)
        self.assertEqual(p.port, None)
        self.assertEqual(p.geturl(), uri)

        p = urlparse.urlparse(uri)
        self.assertEqual(p.netloc, "")
        self.assertEqual(p.username, None)
        self.assertEqual(p.password, None)
        self.assertEqual(p.hostname, None)
        self.assertEqual(p.port, None)
        self.assertEqual(p.geturl(), uri)
Пример #13
0
    def test_attributes_without_netloc(self):
        # This example is straight from RFC 3261.  It looks like it
        # should allow the username, hostname, and port to be filled
        # in, but doesn't.  Since it's a URI and doesn't use the
        # scheme://netloc syntax, the netloc and related attributes
        # should be left empty.
        uri = "sip:[email protected];maddr=239.255.255.1;ttl=15"
        p = urlparse.urlsplit(uri)
        self.assertEqual(p.netloc, "")
        self.assertEqual(p.username, None)
        self.assertEqual(p.password, None)
        self.assertEqual(p.hostname, None)
        self.assertEqual(p.port, None)
        self.assertEqual(p.geturl(), uri)

        p = urlparse.urlparse(uri)
        self.assertEqual(p.netloc, "")
        self.assertEqual(p.username, None)
        self.assertEqual(p.password, None)
        self.assertEqual(p.hostname, None)
        self.assertEqual(p.port, None)
        self.assertEqual(p.geturl(), uri)
Пример #14
0
 def test_urlsplit(self):
     """Compare ``urlparse.urlsplit()`` with each ``urlsplit_testcases`` entry.

     Item 0 of an entry is passed to ``urlsplit()`` and the result must
     equal item 1.
     """
     for case in urlsplit_testcases:
         actual = urlparse.urlsplit(case[0])
         self.assertEqual(actual, case[1])
Пример #15
0
    def __getattr__(self, attr):
        """Lazily compute, cache and return a derived attribute of the URL.

        Python calls this only when normal attribute lookup fails, so each
        supported name is computed on first access and then stored in
        ``self.__dict__``; later reads never reach this method.

        Supported names: ``parsed``, ``tldextracted``, ``normalized``,
        ``normalized_without_query``, ``homepage``, ``pld``, ``domain``,
        ``subdomain``, ``normalized_domain``, ``normalized_subdomain``,
        ``normalized_path`` and ``suffix``.

        Raises:
            AttributeError: if *attr* is not one of the supported names.
                Raising AttributeError (rather than a bare Exception) keeps
                ``getattr(obj, name, default)``, ``hasattr`` and the ``copy``
                module working for names this class does not provide.
        """
        # pylint: disable=redefined-variable-type

        if attr == "parsed":
            # NOTE: a ValueError from urlsplit() propagates to the caller; an
            # earlier fallback to "about:blank" is disabled.
            value = urlparse.urlsplit(self.url)

        elif attr == "tldextracted":
            # Indexed below as [0] subdomain, [1] and [2] suffix.
            value = tld_extract(self.parsed.netloc)

        elif attr == "normalized":
            value = urlparse.urlunsplit((
                None,
                self.normalized_domain,
                self.parsed.path or b"/",
                self.parsed.query,
                b""
            )).lstrip(b"/")

            # With exactly one "/", strip it from the ends so that a bare
            # "domain/" loses its trailing slash.
            if value.count(b"/") == 1:
                value = value.strip(b"/")

        elif attr == "normalized_without_query":
            value = urlparse.urlunsplit((
                None,
                self.normalized_domain,
                self.parsed.path or b"/",
                b"",
                b""
            )).lstrip(b"/")

            if value.count(b"/") == 1:
                value = value.strip(b"/")

        elif attr == "homepage":
            value = urlparse.urlunsplit((
                self.parsed.scheme,
                self.domain,
                b"/",
                b"",
                b""
            )).strip(b"/")

        # Pay-level domain
        elif attr == "pld":
            value = b"%s.%s" % (self.tldextracted[1], self.tldextracted[2])

        elif attr == "domain":
            value = self.parsed.netloc

        elif attr == "subdomain":
            value = self.tldextracted[0]

        elif attr == "normalized_domain":
            value = self.domain.strip(b".")

            # Drop every leading "www." label.
            while value.startswith(b"www."):
                value = value[4:]

            # Drop an explicit ":80" or ":443" port suffix.
            if value.endswith(b':80'):
                value = value[:-3]
            elif value.endswith(b':443'):
                value = value[:-4]

            value = value.strip(b".")

        elif attr == "normalized_subdomain":
            value = self.subdomain.strip(b".")

            if value == b"www":
                value = b""
            else:
                while value.startswith(b"www."):
                    value = value[4:]

        elif attr == "normalized_path":
            # A path of just "/" normalizes to the empty string.  The value
            # is cached like every other attribute instead of returning early.
            value = b"" if self.parsed.path == b"/" else self.parsed.path

        # https://en.wikipedia.org/wiki/Public_Suffix_List
        # Returns the domain name suffix ("co.uk" for "bbc.co.uk")
        elif attr == "suffix":
            value = self.tldextracted[2]

        else:
            raise AttributeError("Unknown attribute %s !" % attr)

        # Cache the result so this method is not called again for this name.
        self.__dict__[attr] = value
        return value
Пример #16
0
 def test_unparse_parse(self):
     for u in ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',]:
         self.assertEqual(urlparse.urlunsplit(urlparse.urlsplit(u)), u)
         self.assertEqual(urlparse.urlunparse(urlparse.urlparse(u)), u)
Пример #17
0
    def __getattr__(self, attr):
        """Lazily compute, cache and return a derived attribute of the URL.

        Python calls this only when normal attribute lookup fails, so each
        supported name is computed on first access and then stored in
        ``self.__dict__``; later reads never reach this method.

        Supported names: ``parsed``, ``tldextracted``, ``normalized``,
        ``normalized_without_query``, ``homepage``, ``pld``, ``domain``,
        ``subdomain``, ``normalized_domain``, ``normalized_subdomain``,
        ``normalized_path`` and ``suffix``.

        Raises:
            AttributeError: if *attr* is not one of the supported names.
                Raising AttributeError (rather than a bare Exception) keeps
                ``getattr(obj, name, default)``, ``hasattr`` and the ``copy``
                module working for names this class does not provide.
        """
        # pylint: disable=redefined-variable-type

        if attr == "parsed":
            # NOTE: a ValueError from urlsplit() propagates to the caller; an
            # earlier fallback to "about:blank" is disabled.
            value = urlparse.urlsplit(self.url)

        elif attr == "tldextracted":
            # Indexed below as [0] subdomain, [1] and [2] suffix.
            value = tld_extract(self.parsed.netloc)

        elif attr == "normalized":
            value = urlparse.urlunsplit((
                None,
                self.normalized_domain,
                self.parsed.path or "/",
                self.parsed.query,
                ""
            )).lstrip("/")

            # With exactly one "/", strip it from the ends so that a bare
            # "domain/" loses its trailing slash.
            if value.count("/") == 1:
                value = value.strip("/")

        elif attr == "normalized_without_query":
            value = urlparse.urlunsplit((
                None,
                self.normalized_domain,
                self.parsed.path or "/",
                "",
                ""
            )).lstrip("/")

            if value.count("/") == 1:
                value = value.strip("/")

        elif attr == "homepage":
            value = urlparse.urlunsplit((
                self.parsed.scheme,
                self.domain,
                "/",
                "",
                ""
            )).strip("/")

        # Pay-level domain
        elif attr == "pld":
            value = "%s.%s" % (self.tldextracted[1], self.tldextracted[2])

        elif attr == "domain":
            value = self.parsed.netloc

        elif attr == "subdomain":
            value = self.tldextracted[0]

        elif attr == "normalized_domain":
            value = self.domain.strip(".")

            # Drop every leading "www." label.
            while value.startswith("www."):
                value = value[4:]

            # Drop an explicit ":80" or ":443" port suffix.
            if value.endswith(':80'):
                value = value[:-3]
            elif value.endswith(':443'):
                value = value[:-4]

            value = value.strip(".")

        elif attr == "normalized_subdomain":
            value = self.subdomain.strip(".")

            if value == "www":
                value = ""
            else:
                while value.startswith("www."):
                    value = value[4:]

        elif attr == "normalized_path":
            # A path of just "/" normalizes to the empty string.  The value
            # is cached like every other attribute instead of returning early.
            value = "" if self.parsed.path == "/" else self.parsed.path

        # https://en.wikipedia.org/wiki/Public_Suffix_List
        # Returns the domain name suffix ("co.uk" for "bbc.co.uk")
        elif attr == "suffix":
            value = self.tldextracted[2]

        else:
            raise AttributeError("Unknown attribute %s !" % attr)

        # Cache the result so this method is not called again for this name.
        self.__dict__[attr] = value
        return value
Пример #18
0
    row = [name, sum(times), mean(times), median(times), percentile(times, 90)]
    print(row)
    data.append(row)


def title(name):
    """Append a section header to the module-level ``data`` list.

    Three five-column rows are added: an empty row, a row whose first cell
    is ``"<name>:"``, and a row of ``"----"`` separators.
    """
    header_rows = (
        [""] * 5,
        ["%s:" % name] + [""] * 4,
        ["----"] * 5,
    )
    for header_row in header_rows:
        data.append(header_row)


# Segfault: https://github.com/mitghi/cyuri/issues/1
# NOTE(review): one cyuri parser object is created here and shared by the
# cyuri benchmark below; whether that relates to the segfault linked above
# cannot be told from this file - confirm against the issue.
cyuri_parser = cyuri.uriparser()

# Section "urlsplit": each call passes a label and a one-argument callable
# that splits the given URL with one library.
title("urlsplit")
benchmark("urlparse4", lambda url: urlparse4.urlsplit(url))
benchmark("pygurl", lambda url: pygurl.ParseStandard(url))
benchmark("uritools", lambda url: uritools_urisplit(url))
benchmark("yurl", lambda url: yurl_url(url))
benchmark("urlparse2", lambda url: urlparse2.urlsplit(url))
benchmark("urlparse", lambda url: urlparse.urlsplit(url))
benchmark("cyuri", lambda url: cyuri_parser.components(url))

# Section "urljoin_sibling": each callable resolves the relative reference
# "sibling.html?q=1#e=b" against the given URL.
title("urljoin_sibling")
benchmark("urlparse4",
          lambda url: urlparse4.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("pygurl", lambda url: pygurl.URL(url).Resolve("sibling.html?q=1#e=b"))
benchmark("uritools", lambda url: uritools_urijoin(url, "sibling.html?q=1#e=b"))
benchmark("yurl", lambda url: yurl_url(url) + yurl_url("sibling.html?q=1#e=b"))
benchmark("urlparse2",
          lambda url: urlparse2.urljoin(url, "sibling.html?q=1#e=b"))
Пример #19
0
    row = [name, sum(times), mean(times), median(times), percentile(times, 90)]
    print row
    data.append(row)


def title(name):
    """Add a blank row, a ``"<name>:"`` heading and a dashed rule to ``data``."""
    blank = ["", "", "", "", ""]
    heading = ["%s:" % name, "", "", "", ""]
    rule = ["----", "----", "----", "----", "----"]

    data.append(blank)
    data.append(heading)
    data.append(rule)

# Segfault: https://github.com/mitghi/cyuri/issues/1
# NOTE(review): one cyuri parser object is created here and shared by both
# cyuri benchmarks below; whether that relates to the segfault linked above
# cannot be told from this file - confirm against the issue.
cyuri_parser = cyuri.uriparser()

# Section "urlsplit": each call passes a label and a one-argument callable
# that splits the given URL with one library.
title("urlsplit")
benchmark("urlparse4", lambda url: urlparse4.urlsplit(url))
benchmark("pygurl", lambda url: pygurl.ParseStandard(url))
benchmark("uritools", lambda url: uritools_urisplit(url))
benchmark("yurl", lambda url: yurl_url(url))
benchmark("urlparse2", lambda url: urlparse2.urlsplit(url))
benchmark("urlparse", lambda url: urlparse.urlsplit(url))
benchmark("cyuri", lambda url: cyuri_parser.components(url))

# Section "urljoin_sibling": each callable resolves the relative reference
# "sibling.html?q=1#e=b" against the given URL.
title("urljoin_sibling")
benchmark("urlparse4", lambda url: urlparse4.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("pygurl", lambda url: pygurl.URL(url).Resolve("sibling.html?q=1#e=b"))
benchmark("uritools", lambda url: uritools_urijoin(url, "sibling.html?q=1#e=b"))
benchmark("yurl", lambda url: yurl_url(url) + yurl_url("sibling.html?q=1#e=b"))
benchmark("urlparse2", lambda url: urlparse2.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("urlparse", lambda url: urlparse.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("cyuri", lambda url: cyuri_parser.join(url, "sibling.html?q=1#e=b"))