def test_html_detect(self):
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))
    ))
    self.assertFalse(HTMLReader.is_file(
        io.BytesIO('hello world!'.encode('utf-16le'))
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<title>hello</title>hi')
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<html><body>hello')
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(
            b'The document has moved <a href="somewhere.html">here</a>'
        )
    ))
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.htm'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.html'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.dhtm'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.xhtml'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.xht'))
    )
    self.assertFalse(
        HTMLReader.is_url(URLInfo.parse('example.com/image.jpg'))
    )
    self.assertTrue(
        HTMLReader.is_request(Request.new('example.com/index.html'))
    )
    self.assertFalse(
        HTMLReader.is_request(Request.new('example.com/image.jpg'))
    )

    response = Response('HTTP/1.0', '200', 'OK')
    response.fields['Content-Type'] = 'text/html'
    self.assertTrue(HTMLReader.is_response(response))

    response = Response('HTTP/1.0', '200', 'OK')
    response.fields['Content-Type'] = 'image/png'
    self.assertFalse(HTMLReader.is_response(response))

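# Sketch (assumption): the kind of lightweight sniffing the is_file() cases
# above exercise -- peek at a prefix of the stream, decode leniently, and
# look for HTML markers. looks_like_html() is a hypothetical helper, not
# the wpull implementation.
def looks_like_html(file_obj, peek_size=4096):
    data = file_obj.read(peek_size)
    file_obj.seek(0)
    # Crude UTF-16LE heuristic: ASCII '<' followed by a NUL byte.
    encoding = 'utf-16le' if b'<\x00' in data else 'latin-1'
    text = data.decode(encoding, errors='replace').lower()
    return any(marker in text
               for marker in ('<html', '<title', '<body', 'href='))
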
def test_fetch_disallow(self):
    http_client = MockHTTPClient()
    pool = RobotsTxtPool()
    client = RichClient(http_client, pool)
    session = MockRobotsTxtRichClientSession(
        client, Request.new('http://example.com'))

    self.assertEqual(RobotsState.unknown, session._robots_state)

    request = session.next_request
    self.assertTrue(request.url_info.url.endswith('robots.txt'))

    response = Response('HTTP/1.0', 200, 'OK')
    response.body.content_file = io.StringIO('User-agent:*\nDisallow: /\n')

    http_client.response = response
    yield session.fetch()

    self.assertEqual(RobotsState.denied, session._robots_state)

    request = session.next_request
    self.assertIsNone(request)

    try:
        yield session.fetch()
    except RobotsDenied:
        pass
    else:
        self.fail()

    self.assertTrue(session.done)

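# Sketch (assumption): MockHTTPClient is referenced by these robots.txt
# tests but not defined in this excerpt. A minimal stand-in consistent with
# its usage -- the test stages a canned response and fetch() returns it:
import tornado.gen

class MockHTTPClient(object):
    def __init__(self):
        self.response = None

    @tornado.gen.coroutine
    def fetch(self, request, **kwargs):
        # Hand back whatever response the test case staged.
        raise tornado.gen.Return(self.response)
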
def test_html_soup(self):
    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')
    response.fields['Refresh'] = 'yes'

    with wpull.util.reset_file_offset(response.body.content_file):
        html_file_path = os.path.join(os.path.dirname(__file__),
                                      'testing', 'samples', 'soup.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual(
        {'http://example.com/ABOUTM~1.JPG'},
        inline_urls
    )
    self.assertEqual(
        {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        },
        linked_urls
    )

def test_html_krokozyabry(self):
    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')
    response.fields['content-type'] = 'text/html; charset=KOI8-R'

    with wpull.util.reset_file_offset(response.body.content_file):
        html_file_path = os.path.join(os.path.dirname(__file__),
                                      'testing', 'samples',
                                      'krokozyabry.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual('koi8-r', scrape_info['encoding'])

    self.assertEqual(
        set(),
        inline_urls
    )
    self.assertEqual(
        {'http://example.com/Кракозябры'},
        linked_urls
    )

def test_html_scraper_links_base_href(self):
    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, 'OK')

    with wpull.util.reset_file_offset(response.body.content_file):
        html_file_path = os.path.join(os.path.dirname(__file__),
                                      'testing', 'samples',
                                      'basehref.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual('utf-8', scrape_info['encoding'])

    self.assertEqual(
        {
            'http://cdn.example.com/stylesheet1.css',
            'http://www.example.com/stylesheet2.css',
            'http://example.com/a/stylesheet3.css',
            'http://example.com/a/dir/image1.png',
            'http://example.com/dir/image2.png',
            'http://example.net/image3.png',
            'http://example.com/dir/image4.png',
        },
        inline_urls
    )
    self.assertEqual(
        {'http://example.com/a/'},
        linked_urls
    )

def test_server_error(self):
    http_client = MockHTTPClient()
    pool = RobotsTxtPool()
    client = RichClient(http_client, pool)
    session = MockRobotsTxtRichClientSession(
        client, Request.new('http://example.com'))

    self.assertEqual(RobotsState.unknown, session._robots_state)

    for dummy in range(21):
        request = session.next_request
        self.assertTrue(request.url_info.url.endswith('robots.txt'))

        response = Response('HTTP/1.0', 500, 'Oops')
        http_client.response = response
        yield session.fetch()

    request = session.next_request
    self.assertIsNone(request)

    try:
        yield session.fetch()
    except RobotsDenied:
        pass
    else:
        self.fail()

    self.assertTrue(session.done)

def test_redirect_loop(self):
    http_client = MockHTTPClient()
    pool = RobotsTxtPool()
    client = RichClient(http_client, pool)
    session = MockRobotsTxtRichClientSession(
        client, Request.new('http://example.com'))

    self.assertEqual(RobotsState.unknown, session._robots_state)

    for dummy in range(21):
        request = session.next_request
        self.assertTrue(request.url_info.url.endswith('robots.txt'))

        response = Response('HTTP/1.0', 302, 'See else')
        response.url_info = request.url_info
        response.fields['location'] = '/robots.txt'
        http_client.response = response
        yield session.fetch()

    request = session.next_request
    self.assertTrue(request)

    response = Response('HTTP/1.0', 200, 'OK')
    http_client.response = response
    yield session.fetch()

    self.assertEqual(RobotsState.ok, session._robots_state)
    self.assertTrue(session.done)

def test_rss_as_html(self):
    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')
    response.fields['content-type'] = 'application/rss+xml'

    with wpull.util.reset_file_offset(response.body.content_file):
        html_file_path = os.path.join(os.path.dirname(__file__),
                                      'testing', 'samples', 'rss.xml')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)

    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertFalse(inline_urls)
    self.assertEqual(
        {
            'http://www.someexamplerssdomain.com/main.html',
            'http://www.wikipedia.org/',
        },
        linked_urls
    )

def test_xhtml_invalid(self):
    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')

    with wpull.util.reset_file_offset(response.body.content_file):
        html_file_path = os.path.join(os.path.dirname(__file__),
                                      'testing', 'samples',
                                      'xhtml_invalid.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual(
        {
            'http://example.com/image.png',
            'http://example.com/script.js',
        },
        inline_urls
    )
    self.assertEqual(
        {'http://example.com/link'},
        linked_urls
    )

def test_sitemap_scraper_xml(self):
    scraper = SitemapScraper()
    request = Request.new('http://example.com/sitemap.xml')
    response = Response('HTTP/1.0', 200, 'OK')

    with wpull.util.reset_file_offset(response.body.content_file):
        response.body.content_file.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
               <url>
                  <loc>http://www.example.com/</loc>
                  <lastmod>2005-01-01</lastmod>
                  <changefreq>monthly</changefreq>
                  <priority>0.8</priority>
               </url>
            </urlset>
            '''
        )

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual(
        {'http://www.example.com/'},
        linked_urls
    )
    self.assertFalse(inline_urls)

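# Sketch (assumption): the kind of <loc> extraction SitemapScraper performs,
# shown here with lxml's iterparse and a namespace wildcard.
# extract_sitemap_urls() is a hypothetical helper, not the wpull
# implementation.
from lxml import etree

def extract_sitemap_urls(file_obj):
    urls = []
    for _event, element in etree.iterparse(file_obj, tag='{*}loc'):
        # Works for both <urlset> entries and <sitemapindex> entries.
        urls.append(element.text.strip())
        element.clear()
    return urls
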
def test_javascript_heavy_inline_monstrosity(self):
    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, 'OK')

    with wpull.util.reset_file_offset(response.body.content_file):
        html_file_path = os.path.join(
            os.path.dirname(__file__),
            'testing', 'samples', 'twitchplayspokemonfirered.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertIn(
        'http://cdn.bulbagarden.net/upload/archive/a/a4/'
        '20090718115357%21195Quagsire.png',
        inline_urls
    )
    self.assertIn(
        'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
        'user%2FGoldenSandslash15&sa=D&sntz=1&'
        'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A',
        linked_urls
    )

def test_sitemap_scraper_xml_index(self):
    scraper = SitemapScraper()
    request = Request.new('http://example.com/sitemap.xml')
    response = Response('HTTP/1.0', 200, 'OK')

    with wpull.util.reset_file_offset(response.body.content_file):
        response.body.content_file.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <sitemapindex
            xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
               <sitemap>
                  <loc>http://www.example.com/sitemap1.xml.gz</loc>
                  <lastmod>2004-10-01T18:23:17+00:00</lastmod>
               </sitemap>
            </sitemapindex>
            '''
        )

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual(
        {'http://www.example.com/sitemap1.xml.gz'},
        linked_urls
    )
    self.assertFalse(inline_urls)

def test_request(self):
    request = Request.new('http://example.com/robots.txt')
    self.assertEqual(
        (b'GET /robots.txt HTTP/1.1\r\n'
         b'Host: example.com\r\n'
         b'\r\n'),
        request.header()
    )

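# Sketch (assumption): the serialization header() is expected to produce
# above -- request line, then one line per field, then a blank line, all
# CRLF-delimited and byte-encoded. serialize_request_header() is a
# hypothetical illustration, not the actual wpull implementation.
def serialize_request_header(method, resource, version, fields):
    lines = ['{0} {1} {2}'.format(method, resource, version)]
    lines.extend('{0}: {1}'.format(name, value) for name, value in fields)
    lines.extend(['', ''])
    return '\r\n'.join(lines).encode('ascii')

# serialize_request_header('GET', '/robots.txt', 'HTTP/1.1',
#                          [('Host', 'example.com')]) reproduces the
# expected bytes checked in test_request().
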
def test_connection_reuse(self):
    connection = Connection('localhost', self._port)
    request = Request.new(self.get_url('/'))
    request.version = 'HTTP/1.0'

    response = yield connection.fetch(request)
    self.assertEqual(200, response.status_code)

    response = yield connection.fetch(request)
    self.assertEqual(200, response.status_code)

def test_ssl_fail(self):
    connection = Connection('localhost', self.get_http_port())

    try:
        yield connection.fetch(Request.new(self.get_url('/')))
    except SSLVerficationError:
        pass
    else:
        self.fail()

def test_connection_refused(self):
    connection = Connection(('localhost', 1))

    try:
        yield connection.fetch(Request.new('http://localhost:1/'))
    except ConnectionRefused:
        pass
    else:
        self.fail()

def test_read_timeout(self):
    connection = Connection('localhost', self._port, read_timeout=0.1)
    request = Request.new(self.get_url('/sleep_long'))

    try:
        yield connection.fetch(request)
    except NetworkError:
        pass
    else:
        self.fail()

def test_no_such_host(self):
    connection = Connection('wpull-no-exist.invalid', 80)

    try:
        yield connection.fetch(
            Request.new('http://wpull-no-exist.invalid'))
    except NetworkError:
        pass
    else:
        self.fail()

def test_connection_timeout(self):
    connection = Connection('1.0.0.0', 1, connect_timeout=0.1)

    try:
        yield connection.fetch(
            Request.new('http://1.0.0.0:1/'))
    except NetworkError:
        pass
    else:
        self.fail()

def test_connection_refused(self):
    connection = Connection('localhost', 1)

    try:
        yield connection.fetch(
            Request.new('http://localhost:1/'))
    except ConnectionRefused:
        pass
    else:
        self.fail()

def test_client_exception_throw(self):
    client = Client()

    try:
        yield client.fetch(Request.new('http://wpull-no-exist.invalid'))
    except NetworkError:
        pass
    else:
        self.fail()

def test_basic(self):
    http_client = Client()
    client = RichClient(http_client)
    session = client.session(Request.new(self.get_url('/')))

    self.assertFalse(session.done)

    response = yield session.fetch()

    self.assertEqual(200, response.status_code)
    self.assertTrue(session.done)

def test_buffer_overflow(self):
    connection = Connection(
        'localhost', self._port,
        connect_timeout=2.0, read_timeout=5.0, buffer_size=1000)
    request = Request.new(self.get_url('/buffer_overflow'))

    try:
        yield connection.fetch(request)
    except (ProtocolError, NetworkError):
        pass
    else:
        self.fail()

def test_ignore_length(self):
    self.connection = Connection(
        ('localhost', self._port),
        params=ConnectionParams(keep_alive=False, ignore_length=True)
    )
    response = yield self.connection.fetch(
        Request.new(self.get_url('/underrun')),
        recorder=DebugPrintRecorder()
    )

    self.assertEqual(50, response.body.content_size)

def test_http_request(self):
    request = Request.new('http://example.com')
    request.fields['hello'] = 'world'
    new_request = convert_http_request(request)

    if sys.version_info[0] == 2:
        self.assertEqual('example.com', new_request.get_host())
    else:
        self.assertEqual('example.com', new_request.host)

    self.assertEqual('world', new_request.get_header('Hello'))

def test_bad_redirect(self):
    http_client = Client()
    client = RichClient(http_client)
    session = client.session(Request.new(self.get_url('/bad_redirect')))

    while not session.done:
        try:
            yield session.fetch()
        except ProtocolError:
            return
        else:
            self.fail()

def test_client_exception_recovery(self):
    connection_factory = functools.partial(Connection, read_timeout=0.2)
    host_connection_pool_factory = functools.partial(
        HostConnectionPool, connection_factory=connection_factory)
    connection_pool = ConnectionPool(host_connection_pool_factory)
    client = Client(connection_pool)

    for dummy in range(7):
        try:
            yield client.fetch(
                Request.new(self.get_url('/header_early_close')),
                recorder=DebugPrintRecorder()
            )
        except NetworkError:
            pass
        else:
            self.fail()

    for dummy in range(7):
        response = yield client.fetch(Request.new(self.get_url('/')))
        self.assertEqual(200, response.status_code)

def test_ssl_fail(self):
    connection = Connection(
        ('localhost', self.get_http_port()),
        ssl_enable=True,
        params=ConnectionParams(
            ssl_options=dict(
                cert_reqs=ssl.CERT_REQUIRED,
                ca_certs=self.get_ssl_options()['certfile']
            )
        )
    )

    try:
        yield connection.fetch(Request.new(self.get_url('/')))
    except SSLVerficationError:
        pass
    else:
        self.fail()

def test_connection_pool_clean(self):
    connection_pool = ConnectionPool()
    client = Client(connection_pool)
    requests = [
        client.fetch(Request.new(self.get_url('/')))
        for dummy in range(12)
    ]
    responses = yield requests

    for response in responses:
        self.assertEqual(200, response.status_code)

    connection_pool.clean()

    self.assertEqual(0, len(connection_pool))

def test_sitemap_detect(self):
    self.assertTrue(SitemapReader.is_file(
        io.BytesIO('<?xml > <urlset >'.encode('utf-16le'))
    ))
    self.assertFalse(SitemapReader.is_file(
        io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))
    ))
    self.assertFalse(SitemapReader.is_file(
        io.BytesIO(b'<html><body>hello<urlset>')
    ))
    self.assertTrue(SitemapReader.is_file(
        io.BytesIO(b'<?xml version> <urlset>')
    ))

    data_file = io.BytesIO()
    g_file = gzip.GzipFile(fileobj=data_file, mode='wb')
    g_file.write('<?xml version> <urlset>'.encode('utf-16le'))
    g_file.close()
    data_file.seek(0)
    self.assertTrue(SitemapReader.is_file(
        data_file
    ))

    self.assertTrue(
        SitemapReader.is_url(URLInfo.parse('example.com/sitemaps1.xml'))
    )
    self.assertTrue(
        SitemapReader.is_url(URLInfo.parse('example.com/robots.txt'))
    )
    self.assertFalse(
        SitemapReader.is_url(URLInfo.parse('example.com/image.jpg'))
    )
    self.assertTrue(
        SitemapReader.is_request(Request.new('example.com/sitemaps34.xml'))
    )
    self.assertFalse(
        SitemapReader.is_request(Request.new('example.com/image.jpg'))
    )

def test_connection_pool_min(self):
    connection_pool = ConnectionPool()
    client = Client(connection_pool)

    for dummy in range(2):
        response = yield client.fetch(
            Request.new(self.get_url('/sleep_short')))
        self.assertEqual(200, response.status_code)
        self.assertEqual(b'12', response.body.content)

    self.assertEqual(1, len(connection_pool))
    connection_pool_entry = list(connection_pool.values())[0]
    self.assertIsInstance(connection_pool_entry, HostConnectionPool)
    self.assertEqual(1, len(connection_pool_entry))

def test_underrun(self):
    self.connection = Connection(
        ('localhost', self._port),
        params=ConnectionParams(connect_timeout=2.0, read_timeout=1.0))

    for counter in range(3):
        try:
            yield self.connection.fetch(
                Request.new(self.get_url('/underrun')))
        except NetworkTimedOut:
            pass
        else:
            self.fail()

def test_xml_detect(self):
    self.assertTrue(XMLDetector.is_file(
        io.BytesIO('<?xml version='.encode('utf-16le'))
    ))
    self.assertFalse(XMLDetector.is_file(
        io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))
    ))
    self.assertFalse(XMLDetector.is_file(
        io.BytesIO(b'<html><body>hello')
    ))
    self.assertTrue(XMLDetector.is_file(
        io.BytesIO(b'<?xml version')
    ))
    self.assertTrue(
        XMLDetector.is_url(URLInfo.parse('example.com/index.xml'))
    )
    self.assertFalse(
        XMLDetector.is_url(URLInfo.parse('example.com/image.jpg'))
    )
    self.assertTrue(
        XMLDetector.is_request(Request.new('example.com/index.xml'))
    )
    self.assertFalse(
        XMLDetector.is_request(Request.new('example.com/image.jpg'))
    )

    response = Response('HTTP/1.0', '200', 'OK')
    response.fields['Content-Type'] = 'text/xml'
    self.assertTrue(XMLDetector.is_response(response))

    response = Response('HTTP/1.0', '200', 'OK')
    response.fields['Content-Type'] = 'application/xml'
    self.assertTrue(XMLDetector.is_response(response))

    response = Response('HTTP/1.0', '200', 'OK')
    response.fields['Content-Type'] = 'image/png'
    self.assertFalse(XMLDetector.is_response(response))

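# Sketch (assumption): the detector classmethods exercised above share one
# shape -- is_response() checks the Content-Type field and is_url() the
# path suffix. ContentTypeDetector is a hypothetical illustration, not the
# wpull API.
class ContentTypeDetector(object):
    MIME_TYPES = frozenset(['text/xml', 'application/xml'])
    SUFFIXES = ('.xml',)

    @classmethod
    def is_response(cls, response):
        # Ignore any charset parameter after the semicolon.
        content_type = response.fields.get('Content-Type', '')
        return content_type.split(';', 1)[0].strip() in cls.MIME_TYPES

    @classmethod
    def is_url(cls, url_info):
        return url_info.path.lower().endswith(cls.SUFFIXES)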