def test_typical_usage(self):
    """Query arguments come back ordered by key; an empty one is dropped."""
    cases = (
        # Already ordered: expected back unchanged.
        ("http://www.example.com/do?a=1&b=2&c=3",
         "http://www.example.com/do?a=1&b=2&c=3"),
        # Reverse ordered: expected back sorted by argument name.
        ("http://www.example.com/do?c=1&b=2&a=3",
         "http://www.example.com/do?a=3&b=2&c=1"),
        # Leading empty argument ("?&a=1"): expected to disappear.
        ("http://www.example.com/do?&a=1",
         "http://www.example.com/do?a=1"),
    )
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url), expected)
def test_spaces(self):
    """A raw space, '+' and '%20' in a query value all come back as '+'."""
    expected = "http://www.example.com/do?a=1&q=a+space"
    spellings = (
        "http://www.example.com/do?q=a space&a=1",
        "http://www.example.com/do?q=a+space&a=1",
        "http://www.example.com/do?q=a%20space&a=1",
    )
    for url in spellings:
        self.assertEqual(canonicalize_url(url), expected)
def test_non_ascii_percent_encoding_in_path(self):
    """Spaces and non-ASCII bytes in the path are percent-encoded.

    Each assertion is a plain statement: the trailing commas that used to
    follow the first two calls only built throwaway tuples.
    """
    # A raw space in the path is expected to become %20.
    self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
                     "http://www.example.com/a%20do?a=1")
    # An existing %20 is expected to be kept next to the newly encoded one.
    self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
                     "http://www.example.com/a%20%20do?a=1")
    # The bytes \xc2\xa3 are expected to come out as %C2%A3.
    self.assertEqual(canonicalize_url("http://www.example.com/a do\xc2\xa3.html?a=1"),
                     "http://www.example.com/a%20do%C2%A3.html?a=1")
def test_non_ascii_percent_encoding_in_query_argument(self):
    """Non-ASCII characters in query keys and values are percent-encoded."""
    cases = (
        # Unicode input holding U+00A3 in a value.
        (u"http://www.example.com/do?price=\xa3500&a=5&z=3",
         u"http://www.example.com/do?a=5&price=%C2%A3500&z=3"),
        # Plain string input holding the \xc2\xa3 sequence in a value.
        ("http://www.example.com/do?price=\xc2\xa3500&a=5&z=3",
         "http://www.example.com/do?a=5&price=%C2%A3500&z=3"),
        # Same sequence inside an argument name, next to parentheses.
        ("http://www.example.com/do?price(\xc2\xa3)=500&a=1",
         "http://www.example.com/do?a=1&price%28%C2%A3%29=500"),
    )
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url), expected)
def test_remove_fragments(self):
    """The fragment is dropped unless keep_fragments=True is passed."""
    with_fragment = u"http://*****:*****@www.example.com/do?a=1#frag"

    # Default call: the "#frag" part is expected to be removed.
    stripped = canonicalize_url(with_fragment)
    self.assertEqual(stripped, u"http://*****:*****@www.example.com/do?a=1")

    # Explicit opt-in: the fragment is expected to survive.
    kept = canonicalize_url(with_fragment, keep_fragments=True)
    self.assertEqual(kept, u"http://*****:*****@www.example.com/do?a=1#frag")
def test_non_ascii_percent_encoding_in_path(self):
    """Spaces and non-ASCII bytes in the path are percent-encoded.

    The cases are checked one after another as ordinary statements; the
    stray trailing commas that wrapped the first two assertions in
    discarded tuples are gone.
    """
    cases = (
        # Raw space -> %20.
        ("http://www.example.com/a do?a=1",
         "http://www.example.com/a%20do?a=1"),
        # Raw space followed by an existing %20 -> both present as %20.
        ("http://www.example.com/a %20do?a=1",
         "http://www.example.com/a%20%20do?a=1"),
        # \xc2\xa3 in the path -> %C2%A3.
        ("http://www.example.com/a do\xc2\xa3.html?a=1",
         "http://www.example.com/a%20do%C2%A3.html?a=1"),
    )
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url), expected)
def test_typical_usage(self):
    """Canonical form sorts query arguments and drops an empty one."""
    # Arguments already in order are expected back untouched.
    in_order = canonicalize_url("http://www.example.com/do?a=1&b=2&c=3")
    self.assertEqual(in_order, "http://www.example.com/do?a=1&b=2&c=3")

    # Arguments given in reverse are expected back sorted by name.
    reordered = canonicalize_url("http://www.example.com/do?c=1&b=2&a=3")
    self.assertEqual(reordered, "http://www.example.com/do?a=3&b=2&c=1")

    # A leading empty argument is expected to be removed.
    without_empty = canonicalize_url("http://www.example.com/do?&a=1")
    self.assertEqual(without_empty, "http://www.example.com/do?a=1")
def test_spaces(self):
    """Every spelling of a space in a query value canonicalizes to '+'."""
    canonical = "http://www.example.com/do?a=1&q=a+space"

    literal_space = canonicalize_url("http://www.example.com/do?q=a space&a=1")
    self.assertEqual(literal_space, canonical)

    plus_sign = canonicalize_url("http://www.example.com/do?q=a+space&a=1")
    self.assertEqual(plus_sign, canonical)

    percent_twenty = canonicalize_url("http://www.example.com/do?q=a%20space&a=1")
    self.assertEqual(percent_twenty, canonical)
def test_keep_blank_values(self):
    """Blank query values are kept by default and dropped on request."""
    one_blank = "http://www.example.com/do?b=&a=2"
    two_blanks = "http://www.example.com/do?b=&c&a=2"
    only_a = "http://www.example.com/do?a=2"

    # "b=" is dropped with keep_blank_values=False, kept otherwise.
    self.assertEqual(canonicalize_url(one_blank, keep_blank_values=False), only_a)
    self.assertEqual(canonicalize_url(one_blank),
                     "http://www.example.com/do?a=2&b=")

    # A bare key ("c") is treated like a blank value and gains "=" when kept.
    self.assertEqual(canonicalize_url(two_blanks, keep_blank_values=False), only_a)
    self.assertEqual(canonicalize_url(two_blanks),
                     "http://www.example.com/do?a=2&b=&c=")

    # A query that is a single bare key containing a comma.
    self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
                     'http://www.example.com/do?1750%2C4=')
def test_non_ascii_percent_encoding_in_query_argument(self):
    """Non-ASCII data in the query string is expected back percent-encoded."""
    # Unicode URL with U+00A3 in a value.
    from_unicode = canonicalize_url(
        u"http://www.example.com/do?price=\xa3500&a=5&z=3")
    self.assertEqual(from_unicode,
                     u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")

    # Plain string URL with the \xc2\xa3 sequence in a value.
    from_str = canonicalize_url(
        "http://www.example.com/do?price=\xc2\xa3500&a=5&z=3")
    self.assertEqual(from_str,
                     "http://www.example.com/do?a=5&price=%C2%A3500&z=3")

    # The same sequence, plus parentheses, inside an argument name.
    in_key = canonicalize_url(
        "http://www.example.com/do?price(\xc2\xa3)=500&a=1")
    self.assertEqual(in_key,
                     "http://www.example.com/do?a=1&price%28%C2%A3%29=500")
def test_dont_convert_safe_chars(self):
    """A path containing ':' and '-' is expected back unchanged."""
    url = ("http://www.simplybedrooms.com/White-Bedroom-Furniture/"
           "Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
    # Input and expected output are the same text.
    self.assertEqual(canonicalize_url(url), url)
def test_keep_blank_values(self):
    """Check blank-value handling with and without keep_blank_values."""
    drop_blanks = {"keep_blank_values": False}
    # (url, extra keyword arguments, expected canonical form)
    cases = (
        ("http://www.example.com/do?b=&a=2", drop_blanks,
         "http://www.example.com/do?a=2"),
        ("http://www.example.com/do?b=&a=2", {},
         "http://www.example.com/do?a=2&b="),
        ("http://www.example.com/do?b=&c&a=2", drop_blanks,
         "http://www.example.com/do?a=2"),
        ("http://www.example.com/do?b=&c&a=2", {},
         "http://www.example.com/do?a=2&b=&c="),
        (u'http://www.example.com/do?1750,4', {},
         'http://www.example.com/do?1750%2C4='),
    )
    for url, options, expected in cases:
        self.assertEqual(canonicalize_url(url, **options), expected)
def test_safe_characters_unicode(self):
    """A unicode URL carrying a %E9 escape is expected back unchanged."""
    # urllib.quote keeps a mapping cache of encoded characters. Parsing an
    # already percent-encoded URL fails when that URL was not
    # percent-encoded as UTF-8, which is why canonicalize_url must always
    # convert URLs to string. This test asserts that behaviour.
    url = u'http://www.example.com/caf%E9-con-leche.htm'
    self.assertEqual(canonicalize_url(url),
                     'http://www.example.com/caf%E9-con-leche.htm')
def test_safe_characters_unicode(self):
    """An existing percent escape in a unicode URL must be left alone."""
    # Background: urllib.quote uses a mapping cache of encoded characters,
    # and it fails on an already percent-encoded URL that was not
    # percent-encoded as UTF-8. canonicalize_url therefore has to convert
    # URLs to string every time; the assertion below covers that.
    result = canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm')
    expected = 'http://www.example.com/caf%E9-con-leche.htm'
    self.assertEqual(result, expected)
def test_auth_and_ports(self):
    """Userinfo and an explicit port are expected to be preserved."""
    canonical = canonicalize_url(
        u"http://*****:*****@www.example.com:81/do?now=1")
    self.assertEqual(canonical,
                     u"http://*****:*****@www.example.com:81/do?now=1")
def test_remove_fragments(self):
    """Fragments are stripped by default and kept with keep_fragments=True."""
    url = u"http://*****:*****@www.example.com/do?a=1#frag"
    # (extra keyword arguments, expected canonical form)
    cases = (
        ({}, u"http://*****:*****@www.example.com/do?a=1"),
        ({"keep_fragments": True},
         u"http://*****:*****@www.example.com/do?a=1#frag"),
    )
    for options, expected in cases:
        self.assertEqual(canonicalize_url(url, **options), expected)
def test_sorting(self):
    """Arguments are ordered by key, and repeated keys by value."""
    shuffled = "http://www.example.com/do?c=3&b=5&b=2&a=50"
    # "b=5&b=2" is expected to come back as "b=2&b=5".
    self.assertEqual(canonicalize_url(shuffled),
                     "http://www.example.com/do?a=50&b=2&b=5&c=3")
def test_append_missing_path(self):
    """A URL without a path is expected to gain a trailing '/'."""
    bare_host = "http://www.example.com"
    self.assertEqual(canonicalize_url(bare_host), "http://www.example.com/")
def test_normalize_percent_encoding_in_path(self):
    """Lower-case hex digits of a path percent escape are upper-cased.

    The assertion is a plain statement; the trailing comma that used to
    follow it only wrapped the call in a discarded one-element tuple.
    """
    # "%a3" in the path is expected to come back as "%A3".
    self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
                     "http://www.example.com/a%A3do")
def test_normalize_percent_encoding_in_query_arguments(self):
    """Lower-case hex digits of a query percent escape are upper-cased."""
    # "%a3" in a query value is expected to come back as "%A3".
    normalized = canonicalize_url("http://www.example.com/do?k=b%a3")
    self.assertEqual(normalized, "http://www.example.com/do?k=b%A3")
def test_auth_and_ports(self):
    """A URL with credentials and a port is expected back unchanged."""
    url = u"http://*****:*****@www.example.com:81/do?now=1"
    # Input and expected output are the same text.
    self.assertEqual(canonicalize_url(url), url)
def test_sorting(self):
    """Query arguments are expected back sorted by key, then by value."""
    result = canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50")
    expected = "http://www.example.com/do?a=50&b=2&b=5&c=3"
    self.assertEqual(result, expected)
def test_returns_str(self):
    """canonicalize_url returns a ``str`` even for unicode input.

    Uses ``assertIsInstance`` instead of a bare ``assert`` so the check
    still runs under ``python -O`` and reports the offending type on
    failure.
    """
    # NOTE(review): assumes the enclosing class is a unittest.TestCase,
    # as the assertEqual-based sibling tests suggest - confirm.
    self.assertIsInstance(canonicalize_url(u"http://www.example.com"), str)
def _get_fingerprint(self, url):
    """Return ``self.fingerprint_function`` applied to the canonical *url*."""
    compute = self.fingerprint_function
    # Canonicalize first, then hand the result to the fingerprint function.
    canonical = canonicalize_url(url)
    return compute(canonical)
def test_simple_case(self):
    """A plain root URL is expected back unchanged."""
    url = "http://www.example.com/"
    self.assertEqual(canonicalize_url(url), url)
def test_domains_are_case_insensitive(self):
    """An upper-case host name is expected back in lower case."""
    mixed_case = "http://www.EXAMPLE.com/"
    self.assertEqual(canonicalize_url(mixed_case), "http://www.example.com/")
def test_dont_convert_safe_chars(self):
    """Safe characters such as ':' in the path must not be escaped."""
    original = "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"
    canonical = canonicalize_url(original)
    # The canonical form is expected to match the input exactly.
    self.assertEqual(
        canonical,
        "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html",
    )
def test_quoted_slash_and_question_sign(self):
    """Escaped '/' and '?' in the path stay escaped."""
    cases = (
        # %2F is kept as is; %3f is only expected to change case (%3F).
        ("http://foo.com/AC%2FDC+rocks%3f/?yeah=1",
         "http://foo.com/AC%2FDC+rocks%3F/?yeah=1"),
        # %2F alone: expected back unchanged.
        ("http://foo.com/AC%2FDC/",
         "http://foo.com/AC%2FDC/"),
    )
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url), expected)
def test_quoted_slash_and_question_sign(self):
    """Percent-encoded '/' and '?' in the path must not be decoded."""
    # The escaped '?' (%3f) is expected back upper-cased, not decoded.
    with_query = canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1")
    self.assertEqual(with_query, "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")

    # The escaped '/' (%2F) is expected back exactly as given.
    slash_only = canonicalize_url("http://foo.com/AC%2FDC/")
    self.assertEqual(slash_only, "http://foo.com/AC%2FDC/")
def _add_fingerprint(self, obj):
    """Store the fingerprint of ``obj.url`` in ``obj.meta`` and return *obj*.

    The fingerprint is ``self.fingerprint_function`` applied to the
    canonicalized URL; it is written under the ``'fingerprint'`` key.
    """
    compute = self.fingerprint_function
    fingerprint = compute(canonicalize_url(obj.url))
    obj.meta['fingerprint'] = fingerprint
    return obj