def test_basic_requests(self):
    """Fetch through an HTTP proxy repeatedly, covering both the success
    page and the server-error page.
    """
    proxy_http_client = Client(recorder=DebugPrintRecorder())
    proxy_server = HTTPProxyServer(proxy_http_client)
    proxy_socket, proxy_port = tornado.testing.bind_unused_port()

    yield From(trollius.start_server(proxy_server, sock=proxy_socket))

    connection_pool = HTTPProxyConnectionPool(('127.0.0.1', proxy_port))
    http_client = Client(connection_pool=connection_pool,
                         recorder=DebugPrintRecorder())

    # Repeat to exercise connection reuse through the proxy.
    for dummy in range(3):
        with http_client.session() as session:
            response = yield From(session.fetch(Request(self.get_url('/'))))
            self.assertEqual(200, response.status_code)

            content_file = io.BytesIO()
            yield From(session.read_content(file=content_file))
            data = content_file.getvalue().decode('ascii', 'replace')
            self.assertTrue(data.endswith('</html>'))

        with http_client.session() as session:
            response = yield From(session.fetch(Request(
                self.get_url('/always_error'))))
            self.assertEqual(500, response.status_code)
            self.assertEqual('Dragon In Data Center', response.reason)

            content_file = io.BytesIO()
            yield From(session.read_content(file=content_file))
            data = content_file.getvalue().decode('ascii', 'replace')
            self.assertEqual('Error', data)
def test_client_did_not_complete(self):
    """An abandoned session must emit a warning; an exception inside the
    session context must propagate out of it.
    """
    client = Client()

    with warnings.catch_warnings(record=True) as warn_list:
        warnings.simplefilter("always")

        with client.session() as session:
            request = Request(self.get_url('/'))
            yield From(session.fetch(request))
            self.assertFalse(session.done())

        for warn_obj in warn_list:
            print(warn_obj)

        # Unrelated warnings may occur in PyPy
        # https://travis-ci.org/chfoo/wpull/jobs/51420202
        self.assertGreaterEqual(len(warn_list), 1)

        for warn_obj in warn_list:
            if str(warn_obj.message) == 'HTTP session did not complete.':
                break
        else:
            self.fail('Warning did not occur.')

    client = Client()

    with self.assertRaises(MyException):
        with client.session() as session:
            request = Request(self.get_url('/'))
            yield From(session.fetch(request))
            raise MyException('Oops')
def test_xml_detect(self):
    """XMLDetector must classify by file content, URL, request, and the
    response Content-Type header.
    """
    # Content sniffing: UTF-16LE and plain byte prologs.
    self.assertTrue(
        XMLDetector.is_file(io.BytesIO(
            '<?xml version='.encode('utf-16le'))))
    self.assertFalse(
        XMLDetector.is_file(
            io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))))
    self.assertFalse(XMLDetector.is_file(io.BytesIO(b'<html><body>hello')))
    self.assertTrue(XMLDetector.is_file(io.BytesIO(b'<?xml version')))

    # URL and request classification by file extension.
    self.assertTrue(
        XMLDetector.is_url(URLInfo.parse('example.com/index.xml')))
    self.assertFalse(
        XMLDetector.is_url(URLInfo.parse('example.com/image.jpg')))
    self.assertTrue(
        XMLDetector.is_request(Request('example.com/index.xml')))
    self.assertFalse(
        XMLDetector.is_request(Request('example.com/image.jpg')))

    # Response classification by Content-Type.
    for content_type, expected in (
            ('text/xml', True),
            ('application/xml', True),
            ('image/png', False)):
        response = Response(200, 'OK')
        response.fields['Content-Type'] = content_type
        self.assertEqual(expected, XMLDetector.is_response(response))
def test_to_dict_body(self):
    """to_dict() must report a truthy 'body' for a real Body and a falsy
    one when the body is NotImplemented, for both Request and Response.
    """
    for klass in (Request, Response):
        instance = klass()
        instance.body = Body()
        self.assertTrue(instance.to_dict()['body'])
        instance.body.close()

        instance = klass()
        instance.body = NotImplemented
        self.assertFalse(instance.to_dict()['body'])
def test_overrun(self):
    """A server sending more data than declared must not poison the
    stream for subsequent requests.
    """
    stream = self.new_stream()
    request = Request(self.get_url('/overrun'))

    for dummy in range(3):
        response, content = yield From(self.fetch(stream, request))
        self.assertEqual(b'a' * 100, content)

    # The stream must still be usable afterwards.
    yield From(self.fetch(stream, Request(self.get_url('/'))))
def test_header_early_close(self):
    """A connection closed mid-header must raise NetworkError, and the
    stream must recover for the next request.
    """
    stream = self.new_stream()
    request = Request(self.get_url('/header_early_close'))

    try:
        yield From(self.fetch(stream, request))
    except NetworkError:
        pass
    else:
        self.fail()  # pragma: no cover

    # The stream must still be usable afterwards.
    yield From(self.fetch(stream, Request(self.get_url('/'))))
def test_html_krokozyabry(self):
    """A KOI8-R page must be decoded with its declared charset so that
    Cyrillic link text survives scraping.
    """
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html; charset=KOI8-R'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'krokozyabry.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('koi8-r', scrape_result.encoding)
    self.assertEqual(set(), inline_urls)
    self.assertEqual({'http://example.com/Кракозябры'}, linked_urls)
def test_html_soup(self):
    """Malformed 'tag soup' HTML must still yield the expected inline and
    linked URLs.
    """
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['Refresh'] = 'yes'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'soup.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)

    self.assertEqual(
        {'http://example.com/ABOUTM~1.JPG'},
        scrape_result.inline_links
    )
    self.assertEqual(
        {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        },
        scrape_result.linked_links
    )
def test_client_duration_timeout(self):
    """Reading content past the duration limit must raise DurationTimeout."""
    client = Client()

    with self.assertRaises(DurationTimeout), client.session() as session:
        request = Request(self.get_url('/sleep_long'))
        yield From(session.fetch(request))
        # 0.1 s is far shorter than the server's sleep.
        yield From(session.read_content(duration_timeout=0.1))
def test_xhtml_invalid(self):
    """Invalid XHTML must still be scraped for inline and linked URLs."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'xhtml_invalid.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)

    self.assertEqual(
        {
            'http://example.com/image.png',
            'http://example.com/script.js',
        },
        scrape_result.inline_links
    )
    self.assertEqual(
        {
            'http://example.com/link'
        },
        scrape_result.linked_links
    )
def test_http_request(self):
    """convert_http_request must carry over the host and header fields."""
    request = Request('http://example.com')
    request.fields['hello'] = 'world'

    new_request = convert_http_request(request)

    self.assertEqual('example.com', new_request.host)
    # Header lookup is case-insensitive on the converted request.
    self.assertEqual('world', new_request.get_header('Hello'))
def test_html_not_quite_charset(self):
    """A page with a dubious charset declaration must still be scraped."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'videogame_top.htm')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)

    self.assertIn(
        'http://example.com/copyright_2001_2006_rtype.gif',
        scrape_result.inline_links
    )
    self.assertIn(
        'http://www.geocities.jp/gamehouse_grindcrusher/',
        scrape_result.linked_links
    )
def test_sitemap_scraper_xml(self):
    """A sitemap urlset document must yield its <loc> URLs as linked
    links and nothing as inline links.
    """
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
               <url>
                  <loc>http://www.example.com/</loc>
                  <lastmod>2005-01-01</lastmod>
                  <changefreq>monthly</changefreq>
                  <priority>0.8</priority>
               </url>
            </urlset>
            ''')

    scrape_result = scraper.scrape(request, response)

    self.assertEqual({
        'http://www.example.com/',
    },
        scrape_result.linked_links
    )
    self.assertFalse(scrape_result.inline_links)
def test_content_length_and_chunked(self):
    """When both Content-Length and chunked encoding are present, chunked
    must win.
    """
    stream = self.new_stream()
    request = Request(self.get_url('/content_length_and_chunked'))
    response, content = yield From(self.fetch(stream, request))

    self.assertEqual(200, response.status_code)
    self.assertEqual('chunked', response.fields['Transfer-Encoding'])
    self.assertEqual(b'hello world!', content)
def test_false_gzip(self):
    """Content falsely labelled gzip must be passed through undecoded."""
    stream = self.new_stream('127.0.0.1', self._port)
    request = Request(self.get_url('/false_gzip'))
    response, content = yield From(self.fetch(stream, request))

    self.assertEqual('gzip', response.fields['Content-Encoding'])
    self.assertEqual(b'a' * 100, content)
def test_utf8_header(self):
    """Non-ASCII header bytes must be exposed as latin-1-decoded text."""
    stream = self.new_stream()
    request = Request(self.get_url('/utf8_header'))
    response, dummy = yield From(self.fetch(stream, request))

    self.assertEqual(200, response.status_code)
    # Raw UTF-8 bytes on the wire surface as latin-1 code points.
    self.assertEqual('🐱'.encode('utf-8').decode('latin-1'),
                     response.fields['whoa'])
def test_javascript_heavy_inline_monstrosity(self):
    """The JavaScript scraper must extract URLs buried inside a large
    real-world script blob.
    """
    scraper = JavaScriptScraper()
    request = Request('http://example.com/test.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'twitchplayspokemonfirered.html')
        with open(html_file_path, 'rb') as in_file:
            # Skip the HTML preamble; the script starts at this offset.
            in_file.seek(0x147)
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertIn(
        'http://cdn.bulbagarden.net/upload/archive/a/a4/'
        '20090718115357%21195Quagsire.png',
        inline_urls)
    self.assertIn(
        'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
        'user%2FGoldenSandslash15&sa=D&sntz=1&'
        'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A',
        linked_urls)

    print('\n'.join(inline_urls))
    print('\n'.join(linked_urls))
def test_status_line_only(self):
    """A response with only a status line must still deliver its body."""
    stream = self.new_stream('127.0.0.1', self._port)
    request = Request(self.get_url('/status_line_only'))
    response, content = yield From(self.fetch(stream, request))

    self.assertEqual(200, response.status_code)
    self.assertEqual(b'Hey', content)
def test_sitemap_scraper_xml_index(self):
    """A sitemap index document must yield the child sitemap URLs as
    linked links and nothing as inline links.
    """
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <sitemapindex
            xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
               <sitemap>
                  <loc>http://www.example.com/sitemap1.xml.gz</loc>
                  <lastmod>2004-10-01T18:23:17+00:00</lastmod>
               </sitemap>
            </sitemapindex>
            ''')

    scrape_result = scraper.scrape(request, response)

    self.assertEqual({
        'http://www.example.com/sitemap1.xml.gz',
    },
        scrape_result.linked_links
    )
    self.assertFalse(scrape_result.inline_links)
def test_rss_as_html(self):
    """An RSS feed fed to the HTML scraper must still produce its item
    links as linked URLs.
    """
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'application/rss+xml'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'rss.xml')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    self.assertTrue(scrape_result)

    self.assertFalse(scrape_result.inline_links)
    self.assertEqual(
        {
            'http://www.someexamplerssdomain.com/main.html',
            'http://www.wikipedia.org/'
        },
        scrape_result.linked_links
    )
def test_client_exception_throw(self):
    """Fetching an unresolvable host must raise NetworkError."""
    client = Client()

    with client.session() as session:
        request = Request('http://wpull-no-exist.invalid')
        with self.assertRaises(NetworkError):
            yield From(session.fetch(request))
def test_basic_chunked_trailer(self):
    """Chunked responses must surface trailer fields as header fields."""
    stream = self.new_stream()
    request = Request(self.get_url('/chunked_trailer'))
    response, content = yield From(self.fetch(stream, request))

    self.assertEqual(200, response.status_code)
    self.assertEqual('chunked', response.fields['Transfer-Encoding'])
    # 'Animal' arrives in the trailer, after the last chunk.
    self.assertEqual('dolphin', response.fields['Animal'])
    self.assertEqual(b'hello world!', content)
def test_basic_content_length(self):
    """A Content-Length response must deliver exactly that many bytes."""
    stream = self.new_stream()
    request = Request(self.get_url('/content_length'))
    response, content = yield From(self.fetch(stream, request))

    self.assertEqual(200, response.status_code)
    self.assertEqual('100', response.fields['Content-Length'])
    self.assertEqual(b'a' * 100, content)
    self.assertEqual(100, len(content))
def test_ignore_length(self):
    """With ignore_length set, the stream must read until close instead
    of trusting the (short) Content-Length.
    """
    stream = self.new_stream('127.0.0.1', self._port,
                             keep_alive=False, ignore_length=True)
    request = Request(self.get_url('/underrun'))
    response, content = yield From(self.fetch(stream, request))

    self.assertEqual(50, len(content))
def test_connection_reuse(self):
    """The same stream must serve two consecutive HTTP/1.0 requests."""
    stream = self.new_stream()
    request = Request(self.get_url('/'))
    request.version = 'HTTP/1.0'

    response, dummy = yield From(self.fetch(stream, request))
    self.assertEqual(200, response.status_code)

    response, dummy = yield From(self.fetch(stream, request))
    self.assertEqual(200, response.status_code)
def test_connection_refused(self):
    """Connecting to a closed port must raise ConnectionRefused."""
    stream = self.new_stream('127.0.0.1', 1)

    try:
        yield From(self.fetch(stream, Request('http://localhost:1/')))
    except ConnectionRefused:
        pass
    else:
        self.fail()  # pragma: no cover
def test_basic(self):
    """A plain WebClient fetch must complete the session with a 200."""
    client = WebClient()
    session = client.session(Request(self.get_url('/')))

    self.assertFalse(session.done())

    response = yield From(session.fetch())

    self.assertEqual(200, response.status_code)
    self.assertTrue(session.done())
def test_bad_chunk_size(self):
    """A malformed chunk-size line must raise ProtocolError."""
    stream = self.new_stream()
    request = Request(self.get_url('/bad_chunk_size'))

    try:
        yield From(self.fetch(stream, request))
    except ProtocolError:
        pass
    else:
        self.fail()  # pragma: no cover
def test_no_such_host(self):
    """A DNS failure must surface as NetworkError."""
    stream = self.new_stream('wpull-no-exist.invalid', 80)

    try:
        yield From(
            self.fetch(stream, Request('http://wpull-no-exist.invalid')))
    except NetworkError:
        pass
    else:
        self.fail()  # pragma: no cover
def test_gzip_corrupt_footer(self):
    """A gzip body with a corrupt footer must raise ProtocolError."""
    stream = self.new_stream()
    request = Request(self.get_url('/gzip_corrupt_footer'))

    try:
        yield From(self.fetch(stream, request))
    except ProtocolError:
        pass
    else:
        self.fail()  # pragma: no cover