def response_callback(request):
    request.prepare_for_send()
    self.assertTrue(request.url_info.url.endswith('robots.txt'))

    response = Response(500, 'Oops')
    response.request = request
    checker.web_client.session_obj.done_value = True
    return response
def test_javascript_heavy_inline_monstrosity(self):
    scraper = JavaScriptScraper()
    request = Request('http://example.com/test.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'twitchplayspokemonfirered.html')
        with open(html_file_path, 'rb') as in_file:
            in_file.seek(0x147)
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertIn(
        'http://cdn.bulbagarden.net/upload/archive/a/a4/'
        '20090718115357%21195Quagsire.png',
        inline_urls
    )
    self.assertIn(
        'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
        'user%2FGoldenSandslash15&sa=D&sntz=1&'
        'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A',
        linked_urls
    )

    print('\n'.join(inline_urls))
    print('\n'.join(linked_urls))
def test_sitemap_scraper_xml_index(self):
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
               <sitemap>
                  <loc>http://www.example.com/sitemap1.xml.gz</loc>
                  <lastmod>2004-10-01T18:23:17+00:00</lastmod>
               </sitemap>
            </sitemapindex>
            ''')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/sitemap1.xml.gz',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_response(self):
    response = Response(200, 'OK')
    response.fields['Cake'] = 'dolphin'

    self.assertEqual(
        (b'HTTP/1.1 200 OK\r\n'
         b'Cake: dolphin\r\n'
         b'\r\n'),
        response.to_bytes()
    )
def test_adjust_extension(self):
    writer = AntiClobberFileWriter(self.get_path_namer(),
                                   adjust_extension=True)

    test_data = [
        ('text/html', '/mordor', 'mordor.html'),
        ('text/html', '/mordor?ring.asp', 'mordor?ring.asp.html'),
        ('text/html', '/mordor?ring.htm', 'mordor?ring.htm'),
        ('text/plain', '/static/my_file.txt', 'static/my_file.txt'),
        ('text/css', '/static/style.css', 'static/style.css'),
        ('text/css', '/static/style.css?hamster.exe',
         'static/style.css?hamster.exe.css'),
        ('text/html', '/static/mojibake.html', 'static/mojibake.html'),
        ('text/html', '/static/mojibake.html?dolphin.png',
         'static/mojibake.html?dolphin.png.html'),
    ]

    for mime_type, path, filename in test_data:
        session = writer.session()

        request = HTTPRequest('http://example.com' + path)
        response = HTTPResponse(status_code=200, reason='OK',
                                request=request)
        response.fields['Content-Type'] = mime_type

        session.process_request(request)
        session.process_response(response)
        session.save_document(response)

        print(filename, list(os.walk('.')))
        self.assertTrue(os.path.exists(filename))
def test_html_scraper_links_base_href(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'basehref.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('utf-8', scrape_result.encoding)

    self.assertEqual(
        {
            'http://cdn.example.com/stylesheet1.css',
            'http://www.example.com/stylesheet2.css',
            'http://example.com/a/stylesheet3.css',
            'http://example.com/a/dir/image1.png',
            'http://example.com/dir/image2.png',
            'http://example.net/image3.png',
            'http://example.com/dir/image4.png',
        },
        inline_urls)
    self.assertEqual({'http://example.com/a/'}, linked_urls)
def test_content_disposition(self):
    writer = AntiClobberFileWriter(self.get_path_namer(),
                                   content_disposition=True)

    test_data = [
        ('hello1.txt', 'hello1.txt'),
        ('hello2.txt;', 'hello2.txt'),
        ('"hello3.txt"', 'hello3.txt'),
        ("'hello4.txt'", 'hello4.txt'),
    ]

    for raw_filename, filename in test_data:
        session = writer.session()

        request = HTTPRequest('http://example.com')
        response = HTTPResponse(status_code=200, reason='OK',
                                request=request)
        response.fields['Content-Disposition'] = \
            'attachment; filename={}'.format(raw_filename)

        session.process_request(request)
        session.process_response(response)
        session.save_document(response)

        print(list(os.walk('.')))
        self.assertTrue(os.path.exists(filename))
def test_html_wrong_charset(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'kcna.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('utf-16-le', scrape_result.encoding)

    self.assertEqual(
        {
            'http://example.com/utm/__utm.js',
            'http://example.com/Knewskage.gif',
            'http://example.com/Lline.gif',
            'http://example.com/Sline.gif',
            'http://example.com/korean01.gif',
            'http://example.com/korean02.gif',
            'http://example.com/english01.gif',
            'http://example.com/english02.gif',
            'http://example.com/Tongsinkage.gif',
            'http://example.com/Knewskage.gif',
        },
        inline_urls)
    self.assertEqual(
        {
            'http://example.com/index-k.htm',
            'http://example.com/index-e.htm',
        },
        linked_urls)
def test_html_soup(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['Refresh'] = 'yes'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'soup.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({'http://example.com/ABOUTM~1.JPG'}, inline_urls)
    self.assertEqual(
        {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        },
        linked_urls)
def test_sitemap_scraper_xml(self):
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
               <url>
                  <loc>http://www.example.com/</loc>
                  <lastmod>2005-01-01</lastmod>
                  <changefreq>monthly</changefreq>
                  <priority>0.8</priority>
               </url>
            </urlset>
            ''')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_rss_as_html(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'application/rss+xml'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'rss.xml')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)

    self.assertTrue(scrape_result)

    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertFalse(inline_urls)
    self.assertEqual(
        {
            'http://www.someexamplerssdomain.com/main.html',
            'http://www.wikipedia.org/',
        },
        linked_urls)
def get_http_header(self) -> Response:
    '''Return the HTTP header.

    It only attempts to read the first 4 KiB of the payload.

    Returns:
        Response, None: An instance of
        :class:`.http.request.Response`, or None if no header
        could be parsed.
    '''
    with wpull.util.reset_file_offset(self.block_file):
        data = self.block_file.read(4096)

    match = re.match(br'(.*?\r?\n\r?\n)', data)

    if not match:
        return

    status_line, dummy, field_str = match.group(1).partition(b'\n')

    try:
        version, code, reason = Response.parse_status_line(status_line)
    except ValueError:
        return

    response = Response(status_code=code, reason=reason, version=version)

    try:
        response.fields.parse(field_str, strict=False)
    except ValueError:
        return

    return response
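# A minimal usage sketch for get_http_header() above. The ``record`` name is
# hypothetical; it stands in for whatever object owns the ``block_file``
# payload:
#
#     response = record.get_http_header()
#
#     if response is None:
#         pass  # Payload did not start with a parseable HTTP header.
#     else:
#         print(response.status_code, response.reason)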
def test_http_response(self):
    response = Response(200, 'OK', version='HTTP/1.0')
    response.fields['hello'] = 'world'

    new_response = HTTPResponseInfoWrapper(response)
    info = new_response.info()

    self.assertEqual('world', info.get('hello'))
def response_callback(request):
    request.prepare_for_send()
    self.assertTrue(request.url_info.url.endswith('robots.txt'))

    response = Response(200, 'OK')
    response.request = request
    response.body = io.StringIO('User-agent:*\nDisallow: /\n')
    checker.web_client.session_obj.done_value = True
    return response
def test_xml_detect(self):
    self.assertTrue(XMLDetector.is_file(
        io.BytesIO('<?xml version='.encode('utf-16le'))
    ))
    self.assertFalse(XMLDetector.is_file(
        io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))
    ))
    self.assertFalse(XMLDetector.is_file(
        io.BytesIO(b'<html><body>hello')
    ))
    self.assertTrue(XMLDetector.is_file(
        io.BytesIO(b'<?xml version')
    ))
    self.assertTrue(
        XMLDetector.is_url(URLInfo.parse('example.com/index.xml'))
    )
    self.assertFalse(
        XMLDetector.is_url(URLInfo.parse('example.com/image.jpg'))
    )
    self.assertTrue(
        XMLDetector.is_request(Request('example.com/index.xml'))
    )
    self.assertFalse(
        XMLDetector.is_request(Request('example.com/image.jpg'))
    )

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'text/xml'
    self.assertTrue(XMLDetector.is_response(response))

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'application/xml'
    self.assertTrue(XMLDetector.is_response(response))

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'image/png'
    self.assertFalse(XMLDetector.is_response(response))
def response_callback_3(request):
    request.prepare_for_send()
    self.assertEqual('http://www.example.net/robots.txt',
                     request.url_info.url)

    response = Response(200, 'OK')
    response.request = request
    response.body = io.StringIO('User-agent:*\nAllow: /\n')
    checker.web_client.session_obj.done_value = True
    return response
def response_callback(request):
    request.prepare_for_send()
    self.assertTrue(request.url_info.url.endswith('robots.txt'))

    response = Response(302, 'See else')
    response.request = request
    response.fields['Location'] = '/robots.txt'

    nonlocal_dict['counter'] += 1

    if nonlocal_dict['counter'] > 20:
        raise ProtocolError('Mock redirect loop error.')

    return response
def test_html_scraper_reject_type(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'many_urls.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response,
                                   link_type=LinkType.css)
    self.assertFalse(scrape_result)
def test_html_detect(self):
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))
    ))
    self.assertFalse(HTMLReader.is_file(
        io.BytesIO('hello world!'.encode('utf-16le'))
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<title>hello</title>hi')
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<html><body>hello')
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(
            b'The document has moved <a href="somewhere.html">here</a>'
        )
    ))
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.htm'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.html'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.dhtm'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.xhtml'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.xht'))
    )
    self.assertFalse(
        HTMLReader.is_url(URLInfo.parse('example.com/image.jpg'))
    )
    self.assertTrue(
        HTMLReader.is_request(Request('example.com/index.html'))
    )
    self.assertFalse(
        HTMLReader.is_request(Request('example.com/image.jpg'))
    )

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'text/html'
    self.assertTrue(HTMLReader.is_response(response))

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'image/png'
    self.assertFalse(HTMLReader.is_response(response))
def test_js_detect(self):
    self.assertTrue(JavaScriptReader.is_file(
        io.BytesIO('var a = 1;'.encode('utf-16le'))
    ))
    self.assertTrue(JavaScriptReader.is_file(
        io.BytesIO('setTimeout('.encode('utf-16le'))
    ))
    self.assertFalse(JavaScriptReader.is_file(
        io.BytesIO('hello world!'.encode('utf-16le'))
    ))
    self.assertFalse(JavaScriptReader.is_file(
        io.BytesIO(b'<html><body>hello')
    ))
    # For HTML-looking content, is_file returns the falsy sentinel
    # VeryFalse (hence the assertFalse above also passes).
    self.assertTrue(
        JavaScriptReader.is_file(io.BytesIO(b'<html><body>hello'))
        is VeryFalse
    )

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'application/javascript'
    self.assertTrue(JavaScriptReader.is_response(response))

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'image/png'
    self.assertFalse(JavaScriptReader.is_response(response))
def response_callback_2(request):
    request.prepare_for_send()
    self.assertEqual('http://www.example.com/robots.txt',
                     request.url_info.url)

    response = Response(301, 'Moved')
    response.fields['location'] = 'http://www.example.net/robots.txt'
    response.request = request
    checker.web_client.mock_response_callback = response_callback_3
    checker.web_client.request = Request(
        'http://www.example.net/robots.txt')
    return response
def test_to_dict_body(self):
    request = Request()
    request.body = Body()
    request_dict = request.to_dict()

    self.assertTrue(request_dict['body'])
    request.body.close()

    request = Request()
    request.body = NotImplemented
    request_dict = request.to_dict()

    self.assertFalse(request_dict['body'])

    response = Response()
    response.body = Body()
    response_dict = response.to_dict()

    self.assertTrue(response_dict['body'])
    response.body.close()

    response = Response()
    response.body = NotImplemented
    response_dict = response.to_dict()

    self.assertFalse(response_dict['body'])
def test_warc_max_size_and_append(self):
    file_prefix = 'asdf'

    with open('asdf-00000.warc', 'w'):
        pass

    with open('asdf-00001.warc', 'w'):
        pass

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            max_size=1,
            appending=True
        ),
    )

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'BLAH')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    warc_recorder.close()

    self.assertTrue(os.path.exists('asdf-00000.warc'))
    self.assertTrue(os.path.exists('asdf-00001.warc'))
    self.assertTrue(os.path.exists('asdf-00002.warc'))
    self.assertTrue(os.path.exists('asdf-00003.warc'))
    self.assertTrue(os.path.exists('asdf-meta.warc'))

    self.assertEqual(0, os.path.getsize('asdf-00000.warc'))
    self.assertEqual(0, os.path.getsize('asdf-00001.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-00002.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-00003.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-meta.warc'))
def test_progress_http(self):
    progress = ProgressPrinter(stream=sys.stdout)

    request = HTTPRequest('http://example.com')
    response = HTTPResponse(206, 'OK')
    response.fields['Content-Size'] = '1024'
    response.fields['Content-Range'] = 'bytes 10-/2048'

    progress.update_from_begin_request(request)
    progress.update_from_begin_response(response)

    for dummy in range(100):
        progress.update_with_data(b'abc')

    progress.update_from_end_response(response)
def test_sitemap_scraper_invalid_robots(self):
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/robots.txt')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'dsfju3wrji kjasSItemapsdmjfkl wekie;er :Ads fkj3m /Dk')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertFalse(linked_urls)
    self.assertFalse(inline_urls)
def test_css_scraper_reject_type(self):
    scraper = CSSScraper()
    request = Request('http://example.com/styles.css')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'styles.css')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response,
                                   link_type=LinkType.html)
    self.assertFalse(scrape_result)
def test_sitemap_scraper_robots(self):
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/robots.txt')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'Sitemap: http://example.com/sitemap00.xml')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://example.com/sitemap00.xml',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_warc_recorder_rollback(self):
    warc_filename = 'asdf.warc'
    warc_prefix = 'asdf'

    with open(warc_filename, 'wb') as warc_file:
        warc_file.write(b'a' * 10)

    warc_recorder = WARCRecorder(
        warc_prefix,
        params=WARCRecorderParams(
            compress=False,
        )
    )

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())

    class BadRecord(WARCRecord):
        def __init__(self, original_record):
            super().__init__()
            self.block_file = original_record.block_file
            self.fields = original_record.fields

        def __iter__(self):
            for dummy in range(1000):
                yield b"where's my elephant?"
            raise OSError('Oops')

    session._request_record = BadRecord(session._request_record)

    original_offset = os.path.getsize(warc_filename)

    with self.assertRaises((OSError, IOError)):
        session.end_request(request)

    new_offset = os.path.getsize(warc_filename)
    self.assertEqual(new_offset, original_offset)
    self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

    _logger.debug('original offset {0}'.format(original_offset))
def test_javascript_reject_type(self):
    scraper = JavaScriptScraper()
    request = Request('http://example.com/script.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'script.js')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response,
                                   link_type=LinkType.css)
    self.assertFalse(scrape_result)
def test_html_serious_bad_encoding(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker,
                          encoding_override='utf8')
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html; charset=utf8'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'xkcd_1_evil.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)
def test_html_encoding_lxml_name_mismatch(self):
    '''It should accept encoding names with underscore.'''
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html; charset=EUC_KR'

    with wpull.util.reset_file_offset(response.body):
        response.body.write('힖'.encode('euc_kr'))

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)
    self.assertEqual('euc_kr', scrape_info['encoding'])
def test_bad_xml(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'foxstripcomics_bad_xml.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    # No crash
    scraper.scrape(request, response, link_type=LinkType.html)
def test_response_empty_reason_line(self):
    response = Response()
    response.parse(b'HTTP/1.0 200\r\n')
    response.parse(b'Cake: dolphin\r\n')
    response.parse(b'\r\n')

    self.assertEqual(200, response.status_code)
    self.assertEqual('', response.reason)
    self.assertEqual('dolphin', response.fields['Cake'])
def test_html_not_quite_charset(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'videogame_top.htm')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertIn(
        'http://example.com/copyright_2001_2006_rtype.gif',
        inline_urls)
    self.assertIn(
        'http://www.geocities.jp/gamehouse_grindcrusher/',
        linked_urls)
def test_html_garbage(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html'

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'\x01\x00\x01\x00l~Z\xff\x0f`y\x80\x00p<\x7f'
            b'\xffndo\xff\xff-\x83{d\xec</\xfe\x80\x00\xb4Bo'
            b'\x7f\xff\xff\xffV\xc1\xff\x7f\xff7')

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)
def test_xhtml_invalid(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'xhtml_invalid.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual(
        {
            'http://example.com/image.png',
            'http://example.com/script.js',
        },
        inline_urls)
    self.assertEqual(
        {'http://example.com/link'},
        linked_urls)
def test_to_dict(self):
    request = Request('https://foofle.com')
    request_dict = request.to_dict()

    self.assertEqual('https://foofle.com', request_dict['url'])
    self.assertEqual('https', request_dict['url_info']['scheme'])
    self.assertEqual('GET', request_dict['method'])
    self.assertEqual('http', request_dict['protocol'])

    response = Response(status_code=200, reason='OK', request=request)
    response_dict = response.to_dict()

    self.assertEqual('https://foofle.com', response_dict['request']['url'])
    self.assertEqual('http', response_dict['protocol'])
    self.assertEqual(200, response_dict['status_code'])
    self.assertEqual(200, response_dict['response_code'])
    self.assertEqual('OK', response_dict['reason'])
    self.assertEqual('OK', response_dict['response_message'])
def test_warc_recorder_journal(self):
    warc_filename = 'asdf.warc'
    warc_prefix = 'asdf'

    warc_recorder = WARCRecorder(
        warc_prefix,
        params=WARCRecorderParams(
            compress=False,
        )
    )

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())

    test_instance = self

    class MockRecord(WARCRecord):
        def __init__(self, original_record):
            super().__init__()
            self.block_file = original_record.block_file
            self.fields = original_record.fields

        def __iter__(self):
            print(list(os.walk('.')))
            # The journal file must exist while the record is
            # being written.
            test_instance.assertTrue(
                os.path.exists(warc_filename + '-wpullinc')
            )

            for dummy in range(1000):
                yield b"where's my elephant?"

    session._request_record = MockRecord(session._request_record)

    session.end_request(request)

    self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))
def test_redirect_tracker(self):
    tracker = RedirectTracker(5)

    self.assertFalse(tracker.is_redirect())
    self.assertFalse(tracker.is_repeat())
    self.assertFalse(tracker.exceeded())
    self.assertFalse(tracker.next_location(raw=True))
    self.assertEqual(0, tracker.count())

    response = Response(200, 'OK')
    tracker.load(response)

    self.assertFalse(tracker.is_redirect())
    self.assertFalse(tracker.is_repeat())
    self.assertFalse(tracker.exceeded())
    self.assertFalse(tracker.next_location())
    self.assertEqual(0, tracker.count())

    response = Response(303, 'See other')
    response.fields['location'] = '/test'
    tracker.load(response)

    self.assertTrue(tracker.is_redirect())
    self.assertFalse(tracker.is_repeat())
    self.assertFalse(tracker.exceeded())
    self.assertEqual('/test', tracker.next_location(raw=True))
    self.assertEqual(1, tracker.count())

    response = Response(307, 'Temporary redirect')
    response.fields['location'] = '/test'
    tracker.load(response)
    tracker.load(response)
    tracker.load(response)
    tracker.load(response)
    tracker.load(response)

    self.assertTrue(tracker.is_redirect())
    self.assertTrue(tracker.is_repeat())
    self.assertTrue(tracker.exceeded())
    self.assertEqual('/test', tracker.next_location(raw=True))
    self.assertEqual(6, tracker.count())
def test_javascript_scraper(self):
    scraper = JavaScriptScraper()
    request = Request('http://example.com/script.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'script.js')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual(
        {
            'http://example.com/script_variable.png',
            'http://example.com/dragonquery.js',
        },
        inline_urls
    )
    self.assertEqual(
        {
            'http://example.com/document_write.html',
            'http://example.com/http_document_write.html',
            'http://example.com/http_document_write2.html',
            'http://example.com/http document write.html',
            'http://example.com/script_variable.html',
            'http://example.com/http_script_variable.html',
            'https://example.com/https_script_variable.html',
            'ftp://example.com/ftp_script_variable.html',
            'http://example.com/end_dir_script_variable/',
            'http://example.com/start_dir_script_variable',
            'http://example.com/../relative_dir_script_variable'
            if sys.version_info < (3, 5) else
            'http://example.com/relative_dir_script_variable',
            'http://example.com/script_json.html',
            'http://example.com/http_script_json.html?a=b',
        },
        linked_urls
    )
def read_response(self, response=None):
    '''Read the response's HTTP status line and header fields.

    Coroutine.
    '''
    _logger.debug('Reading header.')

    if response is None:
        response = Response()

    header_lines = []
    bytes_read = 0

    while True:
        try:
            data = yield from self._connection.readline()
        except ValueError as error:
            raise ProtocolError(
                'Invalid header: {0}'.format(error)) from error

        self._data_event_dispatcher.notify_read(data)

        if not data.endswith(b'\n'):
            raise NetworkError('Connection closed.')
        elif data in (b'\r\n', b'\n'):
            break

        header_lines.append(data)
        assert data.endswith(b'\n')

        bytes_read += len(data)

        if bytes_read > 32768:
            raise ProtocolError('Header too big.')

    if not header_lines:
        raise ProtocolError('No header received.')

    response.parse(b''.join(header_lines))

    return response
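# A minimal driver sketch for read_response() above. It assumes ``stream``
# is an instance of the surrounding class (the name is illustrative) and
# uses the same pre-async/await ``yield from`` coroutine style:
#
#     @asyncio.coroutine
#     def fetch_header(stream):
#         response = yield from stream.read_response()
#         print(response.status_code, response.reason)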