def test_host_normalization(self):
    """
    Check the lower-casing that ``get_host`` applies to its result.

    Every key is an input URL and every value is the tuple ``get_host``
    must return for it.  The scheme is expected in lower case throughout;
    the host is expected in lower case for the ``http``/``https`` style
    inputs, while the ``about`` and ``http+unix`` cases expect the host to
    come back exactly as written.
    """
    cases = {
        # Host names, IPv4 and IPv6 literals in mixed case
        'HTTP://GOOGLE.COM/mail/': ('http', 'google.com', None),
        'GOogle.COM/mail': ('http', 'google.com', None),
        'HTTP://GoOgLe.CoM:8000/mail/': ('http', 'google.com', 8000),
        'HTTP://*****:*****@EXAMPLE.COM:1234': ('http', 'example.com', 1234),
        '173.194.35.7': ('http', '173.194.35.7', None),
        'HTTP://173.194.35.7': ('http', '173.194.35.7', None),
        'HTTP://[2a00:1450:4001:c01::67]:80/test': (
            'http', '[2a00:1450:4001:c01::67]', 80),
        'HTTP://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html': (
            'http', '[fedc:ba98:7654:3210:fedc:ba98:7654:3210]', 8000),
        'HTTPS://[1080:0:0:0:8:800:200c:417A]/index.html': (
            'https', '[1080:0:0:0:8:800:200c:417a]', None),
        'abOut://eXamPlE.com?info=1': ('about', 'eXamPlE.com', None),
        'http+UNIX://%2fvar%2frun%2fSOCKET/path': (
            'http+unix', '%2fvar%2frun%2fSOCKET', None),
    }

    for url, expected in cases.items():
        self.assertEqual(get_host(url), expected)
def test_host_normalization(self):
    """
    Verify case normalization in the tuple returned by ``get_host``.

    The table maps each input URL to the expected result.  Schemes are
    always expected lower-cased; hosts are expected lower-cased except in
    the ``about`` and ``http+unix`` rows, where the host is expected
    unchanged.
    """
    expectations = {
        # Host names, IPv4 and IPv6 literals in mixed case
        'HTTP://GOOGLE.COM/mail/': ('http', 'google.com', None),
        'GOogle.COM/mail': ('http', 'google.com', None),
        'HTTP://GoOgLe.CoM:8000/mail/': ('http', 'google.com', 8000),
        'HTTP://*****:*****@EXAMPLE.COM:1234': ('http', 'example.com', 1234),
        '173.194.35.7': ('http', '173.194.35.7', None),
        'HTTP://173.194.35.7': ('http', '173.194.35.7', None),
        'HTTP://[2a00:1450:4001:c01::67]:80/test': (
            'http', '[2a00:1450:4001:c01::67]', 80),
        'HTTP://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html': (
            'http', '[fedc:ba98:7654:3210:fedc:ba98:7654:3210]', 8000),
        'HTTPS://[1080:0:0:0:8:800:200c:417A]/index.html': (
            'https', '[1080:0:0:0:8:800:200c:417a]', None),
        'abOut://eXamPlE.com?info=1': ('about', 'eXamPlE.com', None),
        'http+UNIX://%2fvar%2frun%2fSOCKET/path': (
            'http+unix', '%2fvar%2frun%2fSOCKET', None),
    }

    for raw_url, wanted in expectations.items():
        self.assertEqual(get_host(raw_url), wanted)
def work(self, site):
    """
    Request ``site``, record a summary of the response, and follow a redirect.

    A dict describing the response (title, status code, headers, server
    banner, body length, favicon, ...) is appended to
    ``self.site_info_list``.  If the status code is 301 or 302 and the
    resolved ``Location`` is a different URL that starts with ``site``,
    the method calls itself on that URL.

    :param site: URL to request.
    """
    _scheme, hostname, _port = get_host(site)
    resp = utils.http_req(site)

    record = {
        "site": site,
        "hostname": hostname,
        "ip": "",
        "title": utils.get_title(resp.content),
        "status": resp.status_code,
        "headers": utils.get_headers(resp),
        "http_server": resp.headers.get("Server", ""),
        "body_length": len(resp.content),
        "finger": [],
        "favicon": fetch_favicon(site),
    }

    parsed = utils.domain_parsed(hostname)
    if not parsed:
        # NOTE(review): hostname is presumably already an IP literal when
        # domain parsing yields nothing -- confirm against utils.domain_parsed.
        record["ip"] = hostname
    else:
        record["fld"] = parsed["fld"]
        resolved = utils.get_ip(hostname)
        if resolved:
            record["ip"] = resolved[0]

    self.site_info_list.append(record)

    if resp.status_code not in (301, 302):
        return

    # Only follow redirects that stay underneath the original URL.
    target = urljoin(site, resp.headers.get("Location", ""))
    if target != site and target.startswith(site):
        self.work(target)
def test_get_host(self):
    """
    Check the tuple ``get_host`` returns for a table of URLs.

    Keys are input URLs; values are the expected results.  The IPv6 rows
    written with upper-case hex digits expect the host back in the same
    case.
    """
    cases = {
        # Host names
        'http://google.com/mail': ('http', 'google.com', None),
        'http://google.com/mail/': ('http', 'google.com', None),
        'google.com/mail': ('http', 'google.com', None),
        'http://google.com/': ('http', 'google.com', None),
        'http://google.com': ('http', 'google.com', None),
        'http://www.google.com': ('http', 'www.google.com', None),
        'http://mail.google.com': ('http', 'mail.google.com', None),
        'http://google.com:8000/mail/': ('http', 'google.com', 8000),
        'http://google.com:8000': ('http', 'google.com', 8000),
        'https://google.com': ('https', 'google.com', None),
        'https://google.com:8000': ('https', 'google.com', 8000),
        'http://*****:*****@127.0.0.1:1234': ('http', '127.0.0.1', 1234),
        'http://google.com/foo=http://bar:42/baz': ('http', 'google.com', None),
        'http://google.com?foo=http://bar:42/baz': ('http', 'google.com', None),
        'http://google.com#foo=http://bar:42/baz': ('http', 'google.com', None),

        # IPv4 literals
        '173.194.35.7': ('http', '173.194.35.7', None),
        'http://173.194.35.7': ('http', '173.194.35.7', None),
        'http://173.194.35.7/test': ('http', '173.194.35.7', None),
        'http://173.194.35.7:80': ('http', '173.194.35.7', 80),
        'http://173.194.35.7:80/test': ('http', '173.194.35.7', 80),

        # IPv6 literals
        '[2a00:1450:4001:c01::67]': ('http', '[2a00:1450:4001:c01::67]', None),
        'http://[2a00:1450:4001:c01::67]': (
            'http', '[2a00:1450:4001:c01::67]', None),
        'http://[2a00:1450:4001:c01::67]/test': (
            'http', '[2a00:1450:4001:c01::67]', None),
        'http://[2a00:1450:4001:c01::67]:80': (
            'http', '[2a00:1450:4001:c01::67]', 80),
        'http://[2a00:1450:4001:c01::67]:80/test': (
            'http', '[2a00:1450:4001:c01::67]', 80),

        # Further IPv6 examples from http://www.ietf.org/rfc/rfc2732.txt
        'http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html': (
            'http', '[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]', 8000),
        'http://[1080:0:0:0:8:800:200C:417A]/index.html': (
            'http', '[1080:0:0:0:8:800:200C:417A]', None),
        'http://[3ffe:2a00:100:7031::1]': (
            'http', '[3ffe:2a00:100:7031::1]', None),
        'http://[1080::8:800:200C:417A]/foo': (
            'http', '[1080::8:800:200C:417A]', None),
        'http://[::192.9.5.5]/ipng': ('http', '[::192.9.5.5]', None),
        'http://[::FFFF:129.144.52.38]:42/index.html': (
            'http', '[::FFFF:129.144.52.38]', 42),
        'http://[2010:836B:4179::836B:4179]': (
            'http', '[2010:836B:4179::836B:4179]', None),
    }

    for url, expected in cases.items():
        self.assertEqual(get_host(url), expected)
def test_get_host(self):
    """
    Check the tuple ``get_host`` returns for a table of URLs.

    Keys are input URLs (host names, IPv4 literals and bracketed IPv6
    literals); values are the results ``get_host`` must produce.
    """
    cases = {
        # Host names
        'http://google.com/mail': ('http', 'google.com', None),
        'http://google.com/mail/': ('http', 'google.com', None),
        'google.com/mail': ('http', 'google.com', None),
        'http://google.com/': ('http', 'google.com', None),
        'http://google.com': ('http', 'google.com', None),
        'http://www.google.com': ('http', 'www.google.com', None),
        'http://mail.google.com': ('http', 'mail.google.com', None),
        'http://google.com:8000/mail/': ('http', 'google.com', 8000),
        'http://google.com:8000': ('http', 'google.com', 8000),
        'https://google.com': ('https', 'google.com', None),
        'https://google.com:8000': ('https', 'google.com', 8000),
        'http://*****:*****@127.0.0.1:1234': ('http', '127.0.0.1', 1234),
        'http://google.com/foo=http://bar:42/baz': ('http', 'google.com', None),
        'http://google.com?foo=http://bar:42/baz': ('http', 'google.com', None),
        'http://google.com#foo=http://bar:42/baz': ('http', 'google.com', None),

        # IPv4 literals
        '173.194.35.7': ('http', '173.194.35.7', None),
        'http://173.194.35.7': ('http', '173.194.35.7', None),
        'http://173.194.35.7/test': ('http', '173.194.35.7', None),
        'http://173.194.35.7:80': ('http', '173.194.35.7', 80),
        'http://173.194.35.7:80/test': ('http', '173.194.35.7', 80),

        # IPv6 literals
        '[2a00:1450:4001:c01::67]': ('http', '[2a00:1450:4001:c01::67]', None),
        'http://[2a00:1450:4001:c01::67]': (
            'http', '[2a00:1450:4001:c01::67]', None),
        'http://[2a00:1450:4001:c01::67]/test': (
            'http', '[2a00:1450:4001:c01::67]', None),
        'http://[2a00:1450:4001:c01::67]:80': (
            'http', '[2a00:1450:4001:c01::67]', 80),
        'http://[2a00:1450:4001:c01::67]:80/test': (
            'http', '[2a00:1450:4001:c01::67]', 80),

        # Further IPv6 examples from http://www.ietf.org/rfc/rfc2732.txt
        'http://[fedc:ba98:7654:3210:fedc:ba98:7654:3210]:8000/index.html': (
            'http', '[fedc:ba98:7654:3210:fedc:ba98:7654:3210]', 8000),
        'http://[1080:0:0:0:8:800:200c:417a]/index.html': (
            'http', '[1080:0:0:0:8:800:200c:417a]', None),
        'http://[3ffe:2a00:100:7031::1]': (
            'http', '[3ffe:2a00:100:7031::1]', None),
        'http://[1080::8:800:200c:417a]/foo': (
            'http', '[1080::8:800:200c:417a]', None),
        'http://[::192.9.5.5]/ipng': ('http', '[::192.9.5.5]', None),
        'http://[::ffff:129.144.52.38]:42/index.html': (
            'http', '[::ffff:129.144.52.38]', 42),
        'http://[2010:836b:4179::836b:4179]': (
            'http', '[2010:836b:4179::836b:4179]', None),
    }

    for url, expected in cases.items():
        self.assertEqual(get_host(url), expected)
def test_host_normalization(self):
    """Check that ``get_host`` returns the scheme and host in lower case."""
    cases = {
        # Host names, IPv4 and IPv6 literals in mixed case
        'HTTP://GOOGLE.COM/mail/': ('http', 'google.com', None),
        'GOogle.COM/mail': ('http', 'google.com', None),
        'HTTP://GoOgLe.CoM:8000/mail/': ('http', 'google.com', 8000),
        'HTTP://*****:*****@EXAMPLE.COM:1234': ('http', 'example.com', 1234),
        '173.194.35.7': ('http', '173.194.35.7', None),
        'HTTP://173.194.35.7': ('http', '173.194.35.7', None),
        'HTTP://[2a00:1450:4001:c01::67]:80/test': (
            'http', '[2a00:1450:4001:c01::67]', 80),
        'HTTP://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html': (
            'http', '[fedc:ba98:7654:3210:fedc:ba98:7654:3210]', 8000),
        'HTTPS://[1080:0:0:0:8:800:200c:417A]/index.html': (
            'https', '[1080:0:0:0:8:800:200c:417a]', None),
    }

    for url, expected in cases.items():
        self.assertEqual(get_host(url), expected)
def test_get_host(self):
    """
    Check the tuple ``get_host`` returns for a table of URLs.

    Keys are input URLs; values are the expected results.  The IPv6 rows
    written with upper-case hex digits expect the host back in the same
    case.
    """
    cases = {
        # Host names
        "http://google.com/mail": ("http", "google.com", None),
        "http://google.com/mail/": ("http", "google.com", None),
        "google.com/mail": ("http", "google.com", None),
        "http://google.com/": ("http", "google.com", None),
        "http://google.com": ("http", "google.com", None),
        "http://www.google.com": ("http", "www.google.com", None),
        "http://mail.google.com": ("http", "mail.google.com", None),
        "http://google.com:8000/mail/": ("http", "google.com", 8000),
        "http://google.com:8000": ("http", "google.com", 8000),
        "https://google.com": ("https", "google.com", None),
        "https://google.com:8000": ("https", "google.com", 8000),
        "http://*****:*****@127.0.0.1:1234": ("http", "127.0.0.1", 1234),
        "http://google.com/foo=http://bar:42/baz": ("http", "google.com", None),
        "http://google.com?foo=http://bar:42/baz": ("http", "google.com", None),
        "http://google.com#foo=http://bar:42/baz": ("http", "google.com", None),

        # IPv4 literals
        "173.194.35.7": ("http", "173.194.35.7", None),
        "http://173.194.35.7": ("http", "173.194.35.7", None),
        "http://173.194.35.7/test": ("http", "173.194.35.7", None),
        "http://173.194.35.7:80": ("http", "173.194.35.7", 80),
        "http://173.194.35.7:80/test": ("http", "173.194.35.7", 80),

        # IPv6 literals
        "[2a00:1450:4001:c01::67]": ("http", "[2a00:1450:4001:c01::67]", None),
        "http://[2a00:1450:4001:c01::67]": (
            "http", "[2a00:1450:4001:c01::67]", None),
        "http://[2a00:1450:4001:c01::67]/test": (
            "http", "[2a00:1450:4001:c01::67]", None),
        "http://[2a00:1450:4001:c01::67]:80": (
            "http", "[2a00:1450:4001:c01::67]", 80),
        "http://[2a00:1450:4001:c01::67]:80/test": (
            "http", "[2a00:1450:4001:c01::67]", 80),

        # Further IPv6 examples from http://www.ietf.org/rfc/rfc2732.txt
        "http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html": (
            "http", "[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]", 8000),
        "http://[1080:0:0:0:8:800:200C:417A]/index.html": (
            "http", "[1080:0:0:0:8:800:200C:417A]", None),
        "http://[3ffe:2a00:100:7031::1]": (
            "http", "[3ffe:2a00:100:7031::1]", None),
        "http://[1080::8:800:200C:417A]/foo": (
            "http", "[1080::8:800:200C:417A]", None),
        "http://[::192.9.5.5]/ipng": ("http", "[::192.9.5.5]", None),
        "http://[::FFFF:129.144.52.38]:42/index.html": (
            "http", "[::FFFF:129.144.52.38]", 42),
        "http://[2010:836B:4179::836B:4179]": (
            "http", "[2010:836B:4179::836B:4179]", None),
    }

    for url, expected in cases.items():
        self.assertEqual(get_host(url), expected)
def test_invalid_host(self, location):
    """Check that ``get_host`` raises ``LocationParseError`` for ``location``."""
    with pytest.raises(LocationParseError):
        get_host(location)
def test_get_host(self, url, expected_host):
    """Check that ``get_host(url)`` equals the supplied ``expected_host``."""
    assert get_host(url) == expected_host