def test_backward_filename_filter(self):
    """BackwardFilenameFilter: accept/reject URLs by filename pattern."""
    record = MockURLTableRecord()
    record.url = 'http://example.com/'
    url_filter = BackwardFilenameFilter(
        accepted=['html', 'image.*.png'],
        rejected=['bmp', 'jp[eg]', 'image.123.png']
    )

    # Filenames matching an accepted pattern (and no rejected one) pass.
    for url in ('http://example/index.html',
                'http://example/myimage.1003.png'):
        self.assertTrue(url_filter.test(URLInfo.parse(url), record))

    # Explicitly rejected or unmatched filenames are refused.
    for url in ('http://example/myimage.123.png',
                'http://example/blah.png',
                'http://example/image.1003.png.bmp'):
        self.assertFalse(url_filter.test(URLInfo.parse(url), record))
def test_url_info_invalids(self):
    """URLInfo.parse must raise ValueError for malformed URLs."""
    bad_urls = (
        '',
        '#',
        'http://',
        'example....com',
        'http://example....com',
        'http://example…com',
        'http://[34.4kf]::4',
        'http://[34.4kf::4',
        'http://dmn3]:3a:45',
        ':38/3',
        'http://][a:@1]',
        'http://[[aa]]:4:]6',
    )
    for bad_url in bad_urls:
        self.assertRaises(ValueError, URLInfo.parse, bad_url)

    # A bracketed host parses, but the brackets are not part of hostname.
    self.assertNotIn('[', URLInfo.parse('http://[a]').hostname)
    self.assertNotIn(']', URLInfo.parse('http://[a]').hostname)

    more_bad_urls = (
        'http://[[a]',
        'http://[[a]]a]',
        'http://[[a:a]]',
        'http:///',
        'http:///horse',
        'http://?what?',
        'http://#egg=wpull',
        'http://:@example.com:?@/',
        'http://\x00/',
        'http:/a',
        'http://@@example.com/@',
        'http://fat32defragmenter.internets::80',
        'http://fat32defragmenter.internets:80/',
        'http:// /spaaaace',
        # Hostname label longer than permitted by DNS.
        'http://a-long-long-time-ago-the-earth-was-ruled-by-dinosaurs-'
        'they-were-big-so-not-a-lot-of-people-went-around-hassling-them-'
        'actually-no-people-went-around-hassling-them-'
        'because-there-weren-t-any-people-yet-'
        'just-the-first-tiny-mammals-'
        'basically-life-was-good-'
        'lou-it-just-dont-get-no-better-than-this-'
        'yeah-'
        'then-something-happened-'
        'a-giant-meteorite-struck-the-earth-'
        'goodbye-dinosaurs-'
        'but-what-if-the-dinosaurs-werent-all-destroyed-'
        'what-if-the-impact-of-that-meteorite-created-a-parallel-dimension-'
        'where-the-dinosaurs-continue-to-thrive-'
        'and-evolved-into-intelligent-vicious-aggressive-beings-'
        'just-like-us-'
        'and-hey-what-if-they-found-their-way-back.movie',
        'http://[...]/python.xml%22',
        'http://[…]/python.xml%22',
        'http://[.]/python.xml%22',
    )
    for bad_url in more_bad_urls:
        self.assertRaises(ValueError, URLInfo.parse, bad_url)
def test_regex_filter(self):
    """RegexFilter: accept/reject URLs by regular expression."""
    record = MockURLTableRecord()
    record.url = 'http://example.com/blog/'

    # Without patterns, everything passes.
    url_filter = RegexFilter()
    self.assertTrue(
        url_filter.test(URLInfo.parse('http://example.net'), record))

    # Accepted pattern must match the URL.
    url_filter = RegexFilter(accepted=r'blo[a-z]/$')
    self.assertTrue(url_filter.test(
        URLInfo.parse('http://example.net/blob/'), record))
    self.assertFalse(url_filter.test(
        URLInfo.parse('http://example.net/blob/123'), record))

    # Rejected pattern removes matching URLs.
    url_filter = RegexFilter(rejected=r'\.gif$')
    self.assertTrue(url_filter.test(
        URLInfo.parse('http://example.net/blob/'), record))
    self.assertFalse(url_filter.test(
        URLInfo.parse('http://example.net/blob/123.gif'), record))
def test_directory_filter(self):
    """DirectoryFilter: accept/reject URLs by leading path directory."""
    record = MockURLTableRecord()
    record.url = 'http://example.com/blog/'

    # Without directories configured, everything passes.
    url_filter = DirectoryFilter()
    self.assertTrue(
        url_filter.test(URLInfo.parse('http://example.com'), record))

    url_filter = DirectoryFilter(accepted=['/blog'])
    self.assertFalse(
        url_filter.test(URLInfo.parse('http://example.com'), record))
    self.assertTrue(
        url_filter.test(URLInfo.parse('http://example.com/blog/'), record))

    url_filter = DirectoryFilter(rejected=['/cgi-bin/'])
    self.assertTrue(
        url_filter.test(URLInfo.parse('http://example.com/blog/'), record))
    # NOTE: '/cgi-bin' without the trailing slash is still rejected here.
    self.assertFalse(
        url_filter.test(URLInfo.parse('http://example.com/cgi-bin'), record))
def test_parent_filter(self):
    """ParentFilter: only URLs at or below top_url pass; inlines always pass."""
    record = MockURLTableRecord()
    record.inline = False
    url_filter = ParentFilter()

    record.top_url = 'http://example.com/blog/topic2/'
    self.assertTrue(url_filter.test(
        URLInfo.parse('http://example.com/blog/topic2/'), record))

    record.top_url = 'http://example.com/blog/topic1/'
    self.assertTrue(url_filter.test(
        URLInfo.parse('http://example.com/blog/topic1/blah.html'), record))
    # A URL above the top URL is refused.
    self.assertFalse(url_filter.test(
        URLInfo.parse('http://example.com/blog/'), record))

    # Inline resources (e.g. stylesheets) bypass the parent restriction.
    record.inline = True
    self.assertTrue(url_filter.test(
        URLInfo.parse('http://example.com/styles.css'), record))
def test_url_info_ipv6(self):
    """IPv6 hosts keep their brackets in .url and hostname_with_port."""
    with_port = 'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6'
    self.assertEqual(with_port, URLInfo.parse(with_port).url)
    self.assertEqual(
        '[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080',
        URLInfo.parse(
            'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6'
        ).hostname_with_port)

    without_port = 'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]/ipv6'
    self.assertEqual(without_port, URLInfo.parse(without_port).url)
    # With no explicit port, hostname_with_port is just the bracketed host.
    self.assertEqual(
        '[2001:db8:85a3:8d3:1319:8a2e:370:7348]',
        URLInfo.parse(without_port).hostname_with_port)
def test_css_detect(self):
    """CSSReader sniffing by content, URL, request, and response."""
    self.assertTrue(CSSReader.is_file(
        io.BytesIO('body { color: white }'.encode('utf-16le'))))
    self.assertFalse(CSSReader.is_file(
        io.BytesIO('hello world!'.encode('utf-16le'))))
    self.assertFalse(CSSReader.is_file(io.BytesIO(b'<html><body>hello')))
    # is_file signals "definitely not CSS" with the VeryFalse sentinel,
    # which is falsy yet distinguishable from a plain False.
    self.assertTrue(
        CSSReader.is_file(io.BytesIO(b'<html><body>hello')) is VeryFalse)
    self.assertTrue(
        CSSReader.is_file(io.BytesIO(b'h1 { background-color: red }')))
    self.assertTrue(CSSReader.is_file(io.BytesIO(b'@import url.css;')))

    self.assertTrue(
        CSSReader.is_url(URLInfo.parse('example.com/index.css')))
    self.assertFalse(
        CSSReader.is_url(URLInfo.parse('example.com/image.jpg')))
    self.assertTrue(CSSReader.is_request(Request('example.com/index.css')))
    self.assertFalse(CSSReader.is_request(Request('example.com/image.jpg')))

    # Only a text/css Content-Type marks a response as CSS.
    for content_type, expected in (('text/css', True), ('image/png', False)):
        response = Response(200, 'OK')
        response.fields['Content-Type'] = content_type
        self.assertEqual(expected, bool(CSSReader.is_response(response)))
def test_xml_detect(self):
    """XMLDetector sniffing by content, URL, request, and response."""
    self.assertTrue(XMLDetector.is_file(
        io.BytesIO('<?xml version='.encode('utf-16le'))))
    self.assertFalse(XMLDetector.is_file(
        io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))))
    self.assertFalse(XMLDetector.is_file(io.BytesIO(b'<html><body>hello')))
    self.assertTrue(XMLDetector.is_file(io.BytesIO(b'<?xml version')))

    self.assertTrue(
        XMLDetector.is_url(URLInfo.parse('example.com/index.xml')))
    self.assertFalse(
        XMLDetector.is_url(URLInfo.parse('example.com/image.jpg')))
    self.assertTrue(
        XMLDetector.is_request(Request('example.com/index.xml')))
    self.assertFalse(
        XMLDetector.is_request(Request('example.com/image.jpg')))

    # Both text/xml and application/xml count as XML responses.
    for content_type, expected in (
            ('text/xml', True),
            ('application/xml', True),
            ('image/png', False)):
        response = Response(200, 'OK')
        response.fields['Content-Type'] = content_type
        self.assertEqual(expected, bool(XMLDetector.is_response(response)))
def test_url_info_query(self):
    """Query strings survive normalization; a root path is inserted."""
    for query in ('a=', 'a=1', 'a=1&b', 'a=1&b='):
        self.assertEqual(
            'http://example.com/?' + query,
            URLInfo.parse('http://example.com?' + query).url)
def test_to_dir_path_url(self):
    """to_dir_path_url strips the final path segment of a URL."""
    cases = (
        ('ftp://putfile.com/', 'ftp://putfile.com/'),
        ('ftp://putfile.com/', 'ftp://putfile.com/asdf'),
        ('ftp://putfile.com/asdf/', 'ftp://putfile.com/asdf/qwer'),
    )
    for expected, url in cases:
        self.assertEqual(expected, to_dir_path_url(URLInfo.parse(url)))
def test_url_info_trailing_dot(self):
    """A trailing dot in the hostname is preserved verbatim."""
    for url in ('http://example.com./', 'http://example.com.:81/'):
        self.assertEqual(url, URLInfo.parse(url).url)
def test_url_info_naked(self):
    """Scheme-less URLs default to http; host lowercased, path case kept."""
    self.assertEqual('http://example.com/',
                     URLInfo.parse('Example.Com').url)
    self.assertEqual('http://example.com/',
                     URLInfo.parse('//example.com').url)
    self.assertEqual('http://example.com/Blah',
                     URLInfo.parse('//example.com/Blah').url)

    info = URLInfo.parse('example.com:8080')
    self.assertEqual('http://example.com:8080/', info.url)
    self.assertEqual('example.com:8080', info.hostname_with_port)
    self.assertEqual(8080, info.port)

    # A colon in the path must not be mistaken for a port.
    info = URLInfo.parse('localhost:8080/A/b/C:')
    self.assertEqual('http://localhost:8080/A/b/C:', info.url)
    self.assertEqual('localhost:8080', info.hostname_with_port)
    self.assertEqual(8080, info.port)

    # Fragments are dropped from the normalized URL.
    self.assertEqual('http://example.com/Asdf',
                     URLInfo.parse('example.com/Asdf#Blah').url)
    self.assertEqual('http://example.com/asdf/Ghjk',
                     URLInfo.parse('example.com/asdf/Ghjk#blah').url)
    self.assertEqual('http://example.com/',
                     URLInfo.parse('example.com/').url)
    self.assertEqual('https://example.com/',
                     URLInfo.parse('https://example.com').url)
def test_url_info_default_port(self):
    """Default ports are inferred and omitted from normalized forms."""
    self.assertEqual(80, URLInfo.parse('http://example.com').port)
    self.assertEqual(443, URLInfo.parse('https://example.com').port)

    # hostname_with_port omits the scheme's default port…
    for url in ('http://example.com', 'https://example.com'):
        self.assertEqual(
            'example.com', URLInfo.parse(url).hostname_with_port)

    # …and an explicit default port is dropped from the URL.
    self.assertEqual('http://example.com/',
                     URLInfo.parse('http://example.com:80').url)
def test_url_info_reserved_char_is_ok(self):
    """Reserved characters (@ $ ; %) stay literal; non-ASCII is encoded."""
    cases = (
        ('http://example.com/@49IMG.DLL/$SESSION$/image.png;large',
         'http://example.com/@49IMG.DLL/$SESSION$/image.png;large'),
        ('http://example.com/@49IMG.DLL/$SESSION$/imag%C3%A9.png;large',
         'http://example.com/@49IMG.DLL/$SESSION$/imagé.png;large'),
        ('http://example.com/$c/%system.exe/',
         'http://example.com/$c/%system.exe/'),
    )
    for expected, url in cases:
        self.assertEqual(expected, URLInfo.parse(url).url)
def test_http_filter(self):
    """HTTPFilter rejects non-HTTP schemes."""
    record = MockURLTableRecord()
    url_filter = HTTPFilter()

    self.assertTrue(
        url_filter.test(URLInfo.parse('http://example.net'), record))
    for url in ('mailto:[email protected]', "javascript:alert('hello!')"):
        self.assertFalse(url_filter.test(URLInfo.parse(url), record))
def test_to_dir_path_url(self):
    """The directory URL of a file URL is its containing directory."""
    for source, directory in (
            ('ftp://putfile.com/', 'ftp://putfile.com/'),
            ('ftp://putfile.com/asdf', 'ftp://putfile.com/'),
            ('ftp://putfile.com/asdf/qwer', 'ftp://putfile.com/asdf/')):
        self.assertEqual(directory, to_dir_path_url(URLInfo.parse(source)))
def test_url_info_round_trip(self):
    """Re-parsing a normalized URL must not raise (idempotent unsplit)."""
    urls = (
        'http://example.com/blah%20blah/',
        'example.com:81?blah=%c3%B0',
        'http://example.com/a/../../b/style.css',
        'http://example.com/'
        '?blah=http%3A%2F%2Fexample.com%2F%3Ffail%3Dtrue',
        'http://example.com/??blah=blah[0:]=bl%61h?blah"&d%26_',
        'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]/ipv6',
    )
    for url in urls:
        normalized = URLInfo.parse(url).url
        URLInfo.parse(normalized)
def test_url_info_path_folding(self):
    """Dot segments ('.', '..') fold away; '..' never escapes the root."""
    cases = (
        ('http://example.com/', 'http://example.com/.'),
        ('http://example.com/', 'http://example.com/../'),
        ('http://example.com/index.html',
         'http://example.com/../index.html'),
        ('http://example.com/b/style.css',
         'http://example.com/a/../../b/style.css'),
        ('http://example.com/a/style.css',
         'http://example.com/a/b/../style.css'),
    )
    for expected, url in cases:
        self.assertEqual(expected, URLInfo.parse(url).url)
def test_ip_address_normalization(self):
    """Alternate IPv4 notations normalize to dotted quad; IPv6 is
    lowercased and compressed."""
    ipv4_notations = (
        'http://0xC0.0x00.0x02.0xEB',  # dotted hex
        'http://0300.0000.0002.0353',  # dotted octal
        'http://0xC00002EB/',          # hex, single number
        'http://3221226219/',          # decimal, single number
        'http://030000001353/',        # octal, single number
    )
    for url in ipv4_notations:
        self.assertEqual('http://192.0.2.235/', URLInfo.parse(url).url)

    self.assertEqual(
        'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
        URLInfo.parse(
            'http://[2001:Db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6'
        ).url
    )
    self.assertEqual(
        'http://[::1]/',
        URLInfo.parse('http://[0:0:0:0:0:0:0:1]').url
    )
    self.assertEqual(
        'http://[::ffff:c000:280]/',
        URLInfo.parse('http://[::ffff:192.0.2.128]/').url
    )
def test_parent_filter(self):
    """ParentFilter: pass at/below top_url or on foreign hosts; inlines
    always pass; scheme differences do not matter."""
    record = MockURLTableRecord()
    record.inline = False
    url_filter = ParentFilter()

    record.top_url = 'http://example.com/blog/topic2/'
    self.assertTrue(url_filter.test(
        URLInfo.parse('http://example.com/blog/topic2/'), record))

    record.top_url = 'http://example.com/blog/topic1/'
    # At or below the top URL: pass, regardless of scheme.
    for url in ('http://example.com/blog/topic1/blah.html',
                'https://example.com/blog/topic1/blah2.html'):
        self.assertTrue(url_filter.test(URLInfo.parse(url), record))
    # Above the top URL on the same host: fail.
    for url in ('http://example.com/blog/', 'https://example.com/blog/'):
        self.assertFalse(url_filter.test(URLInfo.parse(url), record))
    # A different host entirely: pass.
    for url in ('http://somewhere.com/', 'https://somewhere.com/'):
        self.assertTrue(url_filter.test(URLInfo.parse(url), record))

    # Inline resources bypass the parent restriction.
    record.inline = True
    self.assertTrue(url_filter.test(
        URLInfo.parse('http://example.com/styles.css'), record))
def test_url_info_misleading_parts(self):
    """Odd query/fragment/userinfo layouts normalize predictably."""
    cases = (
        ('http://example.com/?a', 'http://example.com?a'),
        ('http://example.com/?a?', 'http://example.com?a?'),
        ('http://example.com/', 'http://example.com#a'),
        ('http://example.com/', 'http://example.com#a?'),
        ('http://example.com/?a', 'http://example.com?a#'),
        ('http://example.com/:10', 'http://example.com/:10'),
        ('http://example.com/?@/', 'http://:@example.com?@/'),
        ('http://example.com/http:/example.com',
         'http://:@example.com/http://example.com'),
    )
    for expected, url in cases:
        self.assertEqual(expected, URLInfo.parse(url).url)
def test_ip_address_normalization(self):
    """HTTPS variants of alternate IP notations normalize correctly.

    Fixes over the previous version:

    * The expected URLs keep the ``https`` scheme of the parsed input;
      normalization must not rewrite the scheme to ``http``.
    * The dotted-octal notation uses valid octal digits:
      0300.0000.0002.0353 == 192.0.2.235.  The old value
      '0301.1680.0002.0353' contained the digit 8 (not octal) and
      0301 octal is 193, so it never encoded 192.0.2.235.
    * The IPv4-mapped IPv6 address is expected in its compressed
      hexadecimal form ('::ffff:c000:280'), matching the normalization
      direction asserted by the plain-http variant of this test.
    """
    ipv4_notations = (
        'https://0xC0.0x00.0x02.0xEB',  # dotted hex
        'https://0300.0000.0002.0353',  # dotted octal
        'https://0xC00002EB/',          # hex, single number
        'https://3221226219/',          # decimal, single number
        'https://030000001353/',        # octal, single number
    )
    for url in ipv4_notations:
        self.assertEqual('https://192.0.2.235/', URLInfo.parse(url).url)

    self.assertEqual(
        'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
        URLInfo.parse(
            'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6'
        ).url
    )
    self.assertEqual(
        'https://[::1]/',
        URLInfo.parse('https://[0:0:0:0:0:0:0:1]').url
    )
    self.assertEqual(
        'https://[::ffff:c000:280]/',
        URLInfo.parse('https://[::ffff:c000:0280]').url
    )
def test_url_info_reserved_char_is_ok(self):
    """@ $ ; % remain literal in paths; only non-ASCII gets encoded."""
    for original, normalized in (
            ('http://example.com/@49IMG.DLL/$SESSION$/image.png;large',
             'http://example.com/@49IMG.DLL/$SESSION$/image.png;large'),
            ('http://example.com/@49IMG.DLL/$SESSION$/imagé.png;large',
             'http://example.com/@49IMG.DLL/$SESSION$/imag%C3%A9.png;large'),
            ('http://example.com/$c/%system.exe/',
             'http://example.com/$c/%system.exe/')):
        self.assertEqual(normalized, URLInfo.parse(original).url)
def parse_url(cls, url, encoding):
    '''Parse and return a URLInfo.

    This function logs a warning if the URL cannot be parsed and returns
    None.
    '''
    try:
        url_info = URLInfo.parse(url, encoding=encoding)
        # FIXME: workaround detection of bad URL unsplit. See issue #132.
        # Re-parsing the normalized URL raises ValueError if the unsplit
        # form is itself malformed, so such URLs are discarded here too.
        URLInfo.parse(url_info.url, encoding=encoding)
    except ValueError as error:
        _logger.warning(__(
            _('Discarding malformed URL ‘{url}’: {error}.'),
            url=url, error=error))
    else:
        # Falls through (returning None implicitly) when parsing failed.
        return url_info
def test_url_info_parts(self):
    """Every URLInfo accessor exposes the matching URL component."""
    # NOTE(review): the '*****' literals look like redacted credentials;
    # they disagree with the 'userName'/'pass:word' assertions below —
    # confirm against the original fixture.
    info = URLInfo.parse(
        'HTTP://*****:*****@[A::1]:81/ásdF\u200C/ghjK?a=b=c&D#/?')

    self.assertEqual(
        'http://*****:*****@[a::1]:81/'
        '%C3%A1sdF%E2%80%8C/ghjK?a=b=c&D',
        info.url
    )
    self.assertEqual('http', info.scheme)
    self.assertEqual('userName:pass%3Aword@[A::1]:81', info.authority)
    self.assertEqual('/ásdF\u200C/ghjK?a=b=c&D#/?', info.resource)
    self.assertEqual('userName', info.username)
    self.assertEqual('pass:word', info.password)
    self.assertEqual('[A::1]:81', info.host)
    self.assertEqual('[a::1]:81', info.hostname_with_port)
    self.assertEqual('a::1', info.hostname)
    self.assertEqual(81, info.port)
    self.assertEqual('/%C3%A1sdF%E2%80%8C/ghjK', info.path)
    self.assertEqual('a=b=c&D', info.query)
    self.assertEqual('/?', info.fragment)
    self.assertEqual('utf-8', info.encoding)
    self.assertEqual(
        'HTTP://*****:*****@[A::1]:81/ásdF\u200C/ghjK?a=b=c&D#/?',
        info.raw)
    self.assertEqual(
        ('/%C3%A1sdF%E2%80%8C', 'ghjK'), info.split_path())

    info = URLInfo.parse(
        'Ftp://*****:*****@LocalHost.Example/mydocs/')

    self.assertEqual('ftp', info.scheme)
    self.assertEqual('N00B:[email protected]', info.authority)
    self.assertEqual('/mydocs/', info.resource)
    self.assertEqual('N00B', info.username)
    self.assertEqual('hunter2', info.password)
    self.assertEqual('LocalHost.Example', info.host)
    self.assertEqual('localhost.example', info.hostname_with_port)
    self.assertEqual('localhost.example', info.hostname)
    self.assertEqual(21, info.port)
    self.assertEqual('/mydocs/', info.path)
    self.assertFalse(info.query)
    self.assertFalse(info.fragment)
    self.assertEqual('utf-8', info.encoding)
    self.assertEqual(
        'Ftp://*****:*****@LocalHost.Example/mydocs/', info.raw)
    self.assertEqual(('/mydocs', ''), info.split_path())
def test_append_slash_to_path_url(self):
    """append_slash_to_path_url adds a trailing slash to the path."""
    url_info = URLInfo.parse('ftp://putfile.com/example')
    self.assertEqual(
        'ftp://putfile.com/example/', append_slash_to_path_url(url_info))
def test_http_filter(self):
    """Only http(s) URLs pass the HTTPFilter."""
    mock_record = MockURLTableRecord()
    http_only = HTTPFilter()

    self.assertTrue(
        http_only.test(URLInfo.parse('http://example.net'), mock_record))
    self.assertFalse(
        http_only.test(URLInfo.parse('mailto:[email protected]'),
                       mock_record))
    self.assertFalse(
        http_only.test(URLInfo.parse("javascript:alert('hello!')"),
                       mock_record))
def fetch_robots_txt(self, request, file=None):
    '''Fetch the robots.txt file for the request.

    Coroutine.

    Args:
        request: the request whose host determines which robots.txt
            to fetch.
        file: optional file object to receive the body; a temporary
            file is created when omitted.
    '''
    url_info = request.url_info
    # robots.txt lives at the root of the host, keeping the request's
    # scheme and port.
    url = URLInfo.parse('{0}://{1}/robots.txt'.format(
        url_info.scheme, url_info.hostname_with_port)).url

    if not file:
        file = wpull.body.new_temp_file(os.getcwd(), hint='robots')

    with contextlib.closing(file):
        # NOTE: rebinds 'request' — the original argument is not used
        # past this point.
        request = self._web_client.request_factory(url)
        session = self._web_client.session(request)

        # The session loops until redirects are resolved; each hop
        # rewrites the file from the start.
        while not session.done():
            wpull.util.truncate_file(file.name)

            try:
                response = yield From(session.fetch(file=file))
            except ProtocolError:
                # Unreachable robots.txt is treated as "allow all".
                self._accept_as_blank(url_info)

                return

        status_code = response.status_code

        if 500 <= status_code <= 599:
            raise ServerError('Server returned error for robots.txt.')

        if status_code == 200:
            self._read_content(response, url_info)
        else:
            # Any non-200, non-5xx status also means "allow all".
            self._accept_as_blank(url_info)
def test_url_info_query(self):
    """Queries with empty and bare values are preserved as-is."""
    cases = (
        ('http://example.com?a=', 'http://example.com/?a='),
        ('http://example.com?a=1', 'http://example.com/?a=1'),
        ('http://example.com?a=1&b', 'http://example.com/?a=1&b'),
        ('http://example.com?a=1&b=', 'http://example.com/?a=1&b='),
    )
    for url, expected in cases:
        self.assertEqual(expected, URLInfo.parse(url).url)
def test_url_info_to_dict(self):
    """to_dict() exposes the parsed components as plain values."""
    info_dict = URLInfo.parse('https://example.com/file.jpg').to_dict()

    expected = (
        ('path', '/file.jpg'),
        ('hostname', 'example.com'),
        ('scheme', 'https'),
        ('port', 443),
        ('encoding', 'utf-8'),
    )
    for key, value in expected:
        self.assertEqual(value, info_dict[key])
def _get_cookie_referrer_host(self):
    '''Return the referrer hostname, or None without a Referer header.'''
    referer = self._original_request.fields.get('Referer')
    return URLInfo.parse(referer).hostname if referer else None
def _get_cookie_referrer_host(self):
    """Return the referrer hostname, or None without a Referer header."""
    referer = self._original_request.fields.get("Referer")

    if not referer:
        return None

    return URLInfo.parse(referer).hostname
def _build_input_urls(self, default_scheme='http'):
    '''Read the URLs provided by the user.

    Generator: yields each parsed input URLInfo as it is read, then the
    deduplicated sitemap URLs (robots.txt and sitemap.xml per host)
    when --sitemaps is enabled.
    '''
    url_string_iter = self._args.urls or ()

    # URLs from --input-file are appended after the command-line URLs.
    if self._args.input_file:
        if self._args.force_html:
            urls = self._read_input_file_as_html()
        else:
            urls = self._read_input_file_as_lines()

        url_string_iter = itertools.chain(url_string_iter, urls)

    # A set so each host's sitemap URLs are emitted only once.
    sitemap_url_infos = set()
    base_url = self._args.base

    for url_string in url_string_iter:
        _logger.debug(__('Parsing URL {0}', url_string))

        if base_url:
            url_string = wpull.url.urljoin(base_url, url_string)

        # Uses the factory's URLInfo class so plugins can override it.
        url_info = self._factory.class_map['URLInfo'].parse(
            url_string, default_scheme=default_scheme)

        _logger.debug(__('Parsed URL {0}', url_info))
        yield url_info

        if self._args.sitemaps:
            sitemap_url_infos.update((
                URLInfo.parse(
                    '{0}://{1}/robots.txt'.format(
                        url_info.scheme, url_info.hostname_with_port)
                ),
                URLInfo.parse(
                    '{0}://{1}/sitemap.xml'.format(
                        url_info.scheme, url_info.hostname_with_port)
                )
            ))

    # Sitemap URLs come last so the user's URLs are queued first.
    for url_info in sitemap_url_infos:
        yield url_info
def test_consult_filters(self):
    """An unfiltered URL passes consult_filters with reason 'filters'."""
    fetch_rule = self.get_fetch_rule()
    url_info = URLInfo.parse('http://example.com')
    url_record = new_mock_url_record()

    verdict, reason, test_info = fetch_rule.consult_filters(
        url_info, url_record)

    self.assertTrue(verdict)
    self.assertEqual('filters', reason)
def _convert_plain(self, link_info, root, encoding):
    '''Rewrite a plain link attribute to point at the local copy.'''
    base_url = wpull.util.to_str(root.base_url)

    # An in-document base link overrides the document's own base URL.
    if link_info.base_link:
        base_url = wpull.url.urljoin(base_url, link_info.base_link)

    absolute_url = wpull.url.urljoin(base_url, link_info.link)
    parsed_url = URLInfo.parse(absolute_url, encoding=encoding)

    link_info.element.set(link_info.attrib, self._get_new_url(parsed_url))
def test_https_filter(self):
    """HTTPSOnlyFilter accepts only https URLs."""
    record = URLRecord()
    url_filter = HTTPSOnlyFilter()

    self.assertTrue(
        url_filter.test(URLInfo.parse('https://example.net'), record))
    for url in ('http://example.net',
                'mailto:[email protected]',
                "javascript:alert('hello!')"):
        self.assertFalse(url_filter.test(URLInfo.parse(url), record))
def test_url_info_default_port(self):
    """Scheme-default ports are filled in but hidden in string forms."""
    http_info = URLInfo.parse('http://example.com')
    https_info = URLInfo.parse('https://example.com')

    self.assertEqual(80, http_info.port)
    self.assertEqual(443, https_info.port)
    # Default ports are omitted from hostname_with_port…
    self.assertEqual('example.com', http_info.hostname_with_port)
    self.assertEqual('example.com', https_info.hostname_with_port)
    # …and an explicit :80 is dropped from the normalized URL.
    self.assertEqual(
        'http://example.com/', URLInfo.parse('http://example.com:80').url)
def test_url_info_path_folding(self):
    """'.' and '..' segments fold; '..' at the root is clamped."""
    for source, folded in (
            ('http://example.com/.', 'http://example.com/'),
            ('http://example.com/../', 'http://example.com/'),
            ('http://example.com/../index.html',
             'http://example.com/index.html'),
            ('http://example.com/a/../../b/style.css',
             'http://example.com/b/style.css'),
            ('http://example.com/a/b/../style.css',
             'http://example.com/a/style.css')):
        self.assertEqual(folded, URLInfo.parse(source).url)
def test_span_hosts_filter(self):
    """SpanHostsFilter restricts to the starting hosts unless enabled."""
    record = MockURLTableRecord()
    record.url = 'http://example.com'
    same_host = 'http://example.com/blog/topic1/blah.html'
    other_host = 'http://hotdog.example/blog/topic1/blah.html'

    # Disabled: only URLs on the starting hosts pass.
    url_filter = SpanHostsFilter(
        [URLInfo.parse('http://example.com/blog/')], enabled=False)
    self.assertTrue(url_filter.test(URLInfo.parse(same_host), record))
    self.assertFalse(url_filter.test(URLInfo.parse(other_host), record))

    # Enabled: foreign hosts pass, too.
    url_filter = SpanHostsFilter(
        [URLInfo.parse('http://example.com/blog/')], enabled=True)
    self.assertTrue(url_filter.test(URLInfo.parse(same_host), record))
    self.assertTrue(url_filter.test(URLInfo.parse(other_host), record))
def test_url_info_usrename_password(self):
    # NOTE(review): the method name's 'usrename' typo is kept on purpose —
    # renaming it would change the test IDs seen by runners that select
    # tests by name.
    """Username/password are percent-normalized in the resulting URL."""
    cases = (
        ('http://[email protected]/',
         'http://[email protected]/'),
        ('http://*****:*****@example.com/',
         'http://*****:*****@example.com/'),
        ('http://:[email protected]/',
         'http://:[email protected]/'),
        ('http://*****:*****@example.com/',
         'http://*****:*****@example.com/'),
        ('http://User%40Name:Pass:[email protected]/',
         'http://User%40Name:Pass%[email protected]/'),
        ('http://User%20Name%[email protected]/',
         'http://User Name%3A:@example.com/'),
    )
    for expected, url in cases:
        self.assertEqual(expected, URLInfo.parse(url).url)
def test_span_hosts_filter(self):
    """Host spanning is off by default and opt-in via 'enabled'."""
    mock_record = MockURLTableRecord()
    mock_record.url = 'http://example.com'
    start_urls = [URLInfo.parse('http://example.com/blog/')]

    for enabled, foreign_ok in ((False, False), (True, True)):
        span_filter = SpanHostsFilter(start_urls, enabled=enabled)

        # Starting host always passes.
        self.assertTrue(span_filter.test(
            URLInfo.parse('http://example.com/blog/topic1/blah.html'),
            mock_record))

        # Foreign host passes only when spanning is enabled.
        foreign = span_filter.test(
            URLInfo.parse('http://hotdog.example/blog/topic1/blah.html'),
            mock_record)
        if foreign_ok:
            self.assertTrue(foreign)
        else:
            self.assertFalse(foreign)
def parse_url(cls, url, encoding):
    '''Parse and return a URLInfo.

    This function logs a warning if the URL cannot be parsed and returns
    None.
    '''
    try:
        return URLInfo.parse(url, encoding=encoding)
    except ValueError as error:
        _logger.warning(
            _('Discarding malformed URL ‘{url}’: {error}.').format(
                url=url, error=error))
        return None
def test_sitemap_detect(self):
    """SitemapReader sniffing: raw content, gzip content, URL, request."""
    # It should detect without BOM
    self.assertTrue(SitemapReader.is_file(
        io.BytesIO('<?xml > <urlset >'.encode('utf-16le'))))
    self.assertFalse(SitemapReader.is_file(
        io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))))
    self.assertFalse(SitemapReader.is_file(
        io.BytesIO(b'<html><body>hello<urlset>')))
    self.assertTrue(SitemapReader.is_file(
        io.BytesIO(b'<?xml version> <urlset>')))

    # A gzip-compressed sitemap must also be detected.
    data_file = io.BytesIO()
    with gzip.GzipFile(fileobj=data_file, mode='wb') as g_file:
        g_file.write('<?xml version> <urlset>'.encode('utf-16le'))
    data_file.seek(0)
    self.assertTrue(SitemapReader.is_file(data_file))

    self.assertTrue(
        SitemapReader.is_url(URLInfo.parse('example.com/sitemaps1.xml')))
    self.assertTrue(
        SitemapReader.is_url(URLInfo.parse('example.com/robots.txt')))
    self.assertFalse(
        SitemapReader.is_url(URLInfo.parse('example.com/image.jpg')))
    self.assertTrue(
        SitemapReader.is_request(Request('example.com/sitemaps34.xml')))
    self.assertFalse(
        SitemapReader.is_request(Request('example.com/image.jpg')))
def repl(match):
    """Substitute a matched URL with its local filename when downloaded."""
    url = match.group(1) or match.group(2)

    if base_url:
        url = wpull.url.urljoin(base_url, url)

    downloaded = (url in self._url_table
                  and self._url_table[url].status == Status.done)

    if downloaded:
        new_url = self._path_namer.get_filename(URLInfo.parse(url))
    else:
        # Leave URLs we did not fetch untouched.
        new_url = url

    return match.group().replace(url, new_url)