def test_client_did_not_complete(self):
    client = Client()

    with warnings.catch_warnings(record=True) as warn_list:
        warnings.simplefilter("always")

        with client.session() as session:
            request = Request(self.get_url('/'))
            yield from session.start(request)
            self.assertFalse(session.done())

        for warn_obj in warn_list:
            print(warn_obj)

        # Unrelated warnings may occur in PyPy
        # https://travis-ci.org/chfoo/wpull/jobs/51420202
        self.assertGreaterEqual(len(warn_list), 1)

        for warn_obj in warn_list:
            if str(warn_obj.message) == 'HTTP session did not complete.':
                break
        else:
            self.fail('Warning did not occur.')

    client = Client()

    with self.assertRaises(MyException):
        with client.session() as session:
            request = Request(self.get_url('/'))
            yield from session.start(request)
            raise MyException('Oops')
def test_css_detect(self):
    self.assertTrue(CSSReader.is_file(
        io.BytesIO('body { color: white }'.encode('utf-16le'))
    ))
    self.assertFalse(CSSReader.is_file(
        io.BytesIO('hello world!'.encode('utf-16le'))
    ))
    self.assertFalse(CSSReader.is_file(
        io.BytesIO(b'<html><body>hello')
    ))
    self.assertTrue(
        CSSReader.is_file(io.BytesIO(b'<html><body>hello')) is VeryFalse
    )
    self.assertTrue(CSSReader.is_file(
        io.BytesIO(b'h1 { background-color: red }')
    ))
    self.assertTrue(CSSReader.is_file(io.BytesIO(b'@import url.css;')))
    self.assertTrue(
        CSSReader.is_url(URLInfo.parse('example.com/index.css'))
    )
    self.assertFalse(
        CSSReader.is_url(URLInfo.parse('example.com/image.jpg'))
    )
    self.assertTrue(
        CSSReader.is_request(Request('example.com/index.css'))
    )
    self.assertFalse(
        CSSReader.is_request(Request('example.com/image.jpg'))
    )

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'text/css'
    self.assertTrue(CSSReader.is_response(response))

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'image/png'
    self.assertFalse(CSSReader.is_response(response))
def test_basic_requests(self):
    proxy_http_client = Client()
    proxy_server = HTTPProxyServer(proxy_http_client)
    proxy_socket, proxy_port = tornado.testing.bind_unused_port()

    yield from asyncio.start_server(proxy_server, sock=proxy_socket)

    connection_pool = HTTPProxyConnectionPool(('127.0.0.1', proxy_port))
    http_client = Client(connection_pool=connection_pool)

    for dummy in range(3):
        with http_client.session() as session:
            response = yield from session.start(Request(self.get_url('/')))
            self.assertEqual(200, response.status_code)

            file = io.BytesIO()
            yield from session.download(file=file)
            data = file.getvalue().decode('ascii', 'replace')
            self.assertTrue(data.endswith('</html>'))

        with http_client.session() as session:
            response = yield from session.start(
                Request(self.get_url('/always_error')))
            self.assertEqual(500, response.status_code)
            self.assertEqual('Dragon In Data Center', response.reason)

            file = io.BytesIO()
            yield from session.download(file=file)
            data = file.getvalue().decode('ascii', 'replace')
            self.assertEqual('Error', data)
def test_http_request(self):
    request = Request('http://example.com')
    request.fields['hello'] = 'world'
    new_request = convert_http_request(request)

    self.assertEqual('example.com', new_request.host)
    self.assertEqual('world', new_request.get_header('Hello'))
def test_xml_detect(self):
    self.assertTrue(XMLDetector.is_file(
        io.BytesIO('<?xml version='.encode('utf-16le'))
    ))
    self.assertFalse(XMLDetector.is_file(
        io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))
    ))
    self.assertFalse(XMLDetector.is_file(
        io.BytesIO(b'<html><body>hello')
    ))
    self.assertTrue(XMLDetector.is_file(
        io.BytesIO(b'<?xml version')
    ))
    self.assertTrue(
        XMLDetector.is_url(URLInfo.parse('example.com/index.xml'))
    )
    self.assertFalse(
        XMLDetector.is_url(URLInfo.parse('example.com/image.jpg'))
    )
    self.assertTrue(
        XMLDetector.is_request(Request('example.com/index.xml'))
    )
    self.assertFalse(
        XMLDetector.is_request(Request('example.com/image.jpg'))
    )

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'text/xml'
    self.assertTrue(XMLDetector.is_response(response))

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'application/xml'
    self.assertTrue(XMLDetector.is_response(response))

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'image/png'
    self.assertFalse(XMLDetector.is_response(response))
def test_redirect_loop(self):
    checker = RobotsTxtChecker(web_client=MockWebClient())
    request = Request('http://example.com')
    request.prepare_for_send()
    nonlocal_dict = {'counter': 0}

    def response_callback(request):
        request.prepare_for_send()
        self.assertTrue(request.url_info.url.endswith('robots.txt'))
        response = Response(302, 'See else')
        response.request = request
        response.fields['Location'] = '/robots.txt'

        nonlocal_dict['counter'] += 1

        if nonlocal_dict['counter'] > 20:
            raise ProtocolError('Mock redirect loop error.')

        return response

    checker.web_client.mock_response_callback = response_callback

    self.assertTrue((yield from checker.can_fetch(request)))
    self.assertTrue(checker.can_fetch_pool(request))
def test_connection_reuse(self):
    stream = self.new_stream()
    request = Request(self.get_url('/'))
    request.version = 'HTTP/1.0'

    response, dummy = yield from self.fetch(stream, request)
    self.assertEqual(200, response.status_code)

    response, dummy = yield from self.fetch(stream, request)
    self.assertEqual(200, response.status_code)
def test_request(self):
    request = Request('http://example.com/robots.txt')
    request.prepare_for_send()
    self.assertEqual(
        (b'GET /robots.txt HTTP/1.1\r\n'
         b'Host: example.com\r\n'
         b'\r\n'),
        request.to_bytes()
    )
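# A minimal companion check, sketched under the assumption that the Request
# API behaves as exercised above (this test name and the 'Accept' value are
# illustrative additions, not from the original suite): fields set before
# prepare_for_send() are serialized alongside the generated Host field.
def test_request_with_extra_field(self):
    request = Request('http://example.com/robots.txt')
    request.fields['Accept'] = '*/*'
    request.prepare_for_send()

    self.assertIn(b'Host: example.com\r\n', request.to_bytes())
    self.assertIn(b'Accept: */*\r\n', request.to_bytes())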
def test_fetch_allow_redirects(self):
    checker = RobotsTxtChecker(web_client=MockWebClient())
    request = Request('http://example.com')
    request.prepare_for_send()

    # Try fetch example.com/ (need robots.txt)
    def response_callback_1(request):
        request.prepare_for_send()
        self.assertEqual('http://example.com/robots.txt',
                         request.url_info.url)
        response = Response(301, 'Moved')
        response.fields['location'] = 'http://www.example.com/robots.txt'
        response.request = request
        checker.web_client.mock_response_callback = response_callback_2
        checker.web_client.request = Request(
            'http://www.example.com/robots.txt')
        return response

    # Try fetch www.example.com/robots.txt
    def response_callback_2(request):
        request.prepare_for_send()
        self.assertEqual('http://www.example.com/robots.txt',
                         request.url_info.url)
        response = Response(301, 'Moved')
        response.fields['location'] = 'http://www.example.net/robots.txt'
        response.request = request
        checker.web_client.mock_response_callback = response_callback_3
        checker.web_client.request = Request(
            'http://www.example.net/robots.txt')
        return response

    # Try fetch www.example.net/robots.txt
    def response_callback_3(request):
        request.prepare_for_send()
        self.assertEqual('http://www.example.net/robots.txt',
                         request.url_info.url)
        response = Response(200, 'OK')
        response.request = request
        response.body = io.StringIO('User-agent:*\nAllow: /\n')
        checker.web_client.session_obj.done_value = True
        return response

    checker.web_client.mock_response_callback = response_callback_1

    self.assertTrue((yield from checker.can_fetch(request)))
    self.assertTrue(checker.can_fetch_pool(request))
def test_overrun(self):
    stream = self.new_stream()
    request = Request(self.get_url('/overrun'))

    for dummy in range(3):
        response, content = yield from self.fetch(stream, request)
        self.assertEqual(b'a' * 100, content)

    request = Request(self.get_url('/'))
    yield from self.fetch(stream, request)
def test_header_early_close(self):
    stream = self.new_stream()
    request = Request(self.get_url('/header_early_close'))

    try:
        yield from self.fetch(stream, request)
    except NetworkError:
        pass
    else:
        self.fail()  # pragma: no cover

    request = Request(self.get_url('/'))
    yield from self.fetch(stream, request)
def test_html_detect(self):
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))
    ))
    self.assertFalse(HTMLReader.is_file(
        io.BytesIO('hello world!'.encode('utf-16le'))
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<title>hello</title>hi')
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<html><body>hello')
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(
            b'The document has moved <a href="somewhere.html">here</a>'
        )
    ))
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.htm'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.html'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.dhtm'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.xhtml'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.xht'))
    )
    self.assertFalse(
        HTMLReader.is_url(URLInfo.parse('example.com/image.jpg'))
    )
    self.assertTrue(
        HTMLReader.is_request(Request('example.com/index.html'))
    )
    self.assertFalse(
        HTMLReader.is_request(Request('example.com/image.jpg'))
    )

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'text/html'
    self.assertTrue(HTMLReader.is_response(response))

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'image/png'
    self.assertFalse(HTMLReader.is_response(response))
def test_to_dict_body(self):
    request = Request()
    request.body = Body()
    request_dict = request.to_dict()

    self.assertTrue(request_dict['body'])
    request.body.close()

    request = Request()
    request.body = NotImplemented
    request_dict = request.to_dict()

    self.assertFalse(request_dict['body'])

    response = Response()
    response.body = Body()
    response_dict = response.to_dict()

    self.assertTrue(response_dict['body'])
    response.body.close()

    response = Response()
    response.body = NotImplemented
    response_dict = response.to_dict()

    self.assertFalse(response_dict['body'])
def test_warc_max_size_and_append(self):
    file_prefix = 'asdf'

    with open('asdf-00000.warc', 'w'):
        pass

    with open('asdf-00001.warc', 'w'):
        pass

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            max_size=1,
            appending=True
        ),
    )

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'BLAH')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    warc_recorder.close()

    self.assertTrue(os.path.exists('asdf-00000.warc'))
    self.assertTrue(os.path.exists('asdf-00001.warc'))
    self.assertTrue(os.path.exists('asdf-00002.warc'))
    self.assertTrue(os.path.exists('asdf-00003.warc'))
    self.assertTrue(os.path.exists('asdf-meta.warc'))

    self.assertEqual(0, os.path.getsize('asdf-00000.warc'))
    self.assertEqual(0, os.path.getsize('asdf-00001.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-00002.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-00003.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-meta.warc'))
def test_warc_recorder_rollback(self):
    warc_filename = 'asdf.warc'
    warc_prefix = 'asdf'

    with open(warc_filename, 'wb') as warc_file:
        warc_file.write(b'a' * 10)

    warc_recorder = WARCRecorder(
        warc_prefix,
        params=WARCRecorderParams(
            compress=False,
        )
    )

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())

    class BadRecord(WARCRecord):
        def __init__(self, original_record):
            super().__init__()
            self.block_file = original_record.block_file
            self.fields = original_record.fields

        def __iter__(self):
            for dummy in range(1000):
                yield b"where's my elephant?"

            raise OSError('Oops')

    session._request_record = BadRecord(session._request_record)

    original_offset = os.path.getsize(warc_filename)

    with self.assertRaises((OSError, IOError)):
        session.end_request(request)

    new_offset = os.path.getsize(warc_filename)
    self.assertEqual(new_offset, original_offset)
    self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

    _logger.debug('original offset {0}'.format(original_offset))
def test_javascript_heavy_inline_monstrosity(self):
    scraper = JavaScriptScraper()
    request = Request('http://example.com/test.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'twitchplayspokemonfirered.html')
        with open(html_file_path, 'rb') as in_file:
            in_file.seek(0x147)
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertIn(
        'http://cdn.bulbagarden.net/upload/archive/a/a4/'
        '20090718115357%21195Quagsire.png',
        inline_urls
    )
    self.assertIn(
        'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
        'user%2FGoldenSandslash15&sa=D&sntz=1&'
        'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A',
        linked_urls
    )

    print('\n'.join(inline_urls))
    print('\n'.join(linked_urls))
def test_client_duration_timeout(self):
    client = Client()

    with self.assertRaises(DurationTimeout), client.session() as session:
        request = Request(self.get_url('/sleep_long'))
        yield from session.start(request)
        yield from session.download(duration_timeout=0.1)
def test_html_scraper_links_base_href(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'basehref.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('utf-8', scrape_result.encoding)

    self.assertEqual(
        {
            'http://cdn.example.com/stylesheet1.css',
            'http://www.example.com/stylesheet2.css',
            'http://example.com/a/stylesheet3.css',
            'http://example.com/a/dir/image1.png',
            'http://example.com/dir/image2.png',
            'http://example.net/image3.png',
            'http://example.com/dir/image4.png',
        },
        inline_urls
    )
    self.assertEqual({'http://example.com/a/'}, linked_urls)
def test_rss_as_html(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'application/rss+xml'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'rss.xml')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)

    self.assertTrue(scrape_result)

    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertFalse(inline_urls)
    self.assertEqual(
        {
            'http://www.someexamplerssdomain.com/main.html',
            'http://www.wikipedia.org/'
        },
        linked_urls
    )
def test_sitemap_scraper_xml(self):
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
               <url>
                  <loc>http://www.example.com/</loc>
                  <lastmod>2005-01-01</lastmod>
                  <changefreq>monthly</changefreq>
                  <priority>0.8</priority>
               </url>
            </urlset>
            '''
        )

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_html_soup(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['Refresh'] = 'yes'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'soup.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({'http://example.com/ABOUTM~1.JPG'}, inline_urls)
    self.assertEqual(
        {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        },
        linked_urls
    )
def test_duration_timeout(self):
    client = WebClient()
    session = client.session(Request(self.get_url('/sleep_long')))

    with self.assertRaises(DurationTimeout):
        yield from session.start()
        yield from session.download(duration_timeout=0.1)
def test_content_length_and_chunked(self):
    stream = self.new_stream()
    request = Request(self.get_url('/content_length_and_chunked'))
    response, content = yield from self.fetch(stream, request)

    self.assertEqual(200, response.status_code)
    self.assertEqual('chunked', response.fields['Transfer-Encoding'])
    self.assertEqual(b'hello world!', content)
def test_utf8_header(self):
    stream = self.new_stream()
    request = Request(self.get_url('/utf8_header'))
    response, dummy = yield from self.fetch(stream, request)

    self.assertEqual(200, response.status_code)
    self.assertEqual('🐱'.encode('utf-8').decode('latin-1'),
                     response.fields['whoa'])
def test_false_gzip(self):
    stream = self.new_stream('127.0.0.1', self._port)
    request = Request(self.get_url('/false_gzip'))
    response, content = yield from self.fetch(stream, request)

    self.assertEqual('gzip', response.fields['Content-Encoding'])
    self.assertEqual(b'a' * 100, content)
def test_status_line_only(self):
    stream = self.new_stream('127.0.0.1', self._port)
    request = Request(self.get_url('/status_line_only'))
    response, content = yield from self.fetch(stream, request)

    self.assertEqual(200, response.status_code)
    self.assertEqual(b'Hey', content)
def test_sitemap_scraper_xml_index(self):
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <sitemapindex
            xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
               <sitemap>
                  <loc>http://www.example.com/sitemap1.xml.gz</loc>
                  <lastmod>2004-10-01T18:23:17+00:00</lastmod>
               </sitemap>
            </sitemapindex>
            '''
        )

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/sitemap1.xml.gz',
    }, linked_urls)
    self.assertFalse(inline_urls)
def end_request(self, request: HTTPRequest):
    payload_offset = len(request.to_bytes())

    self._request_record.block_file.seek(0)
    self._recorder.set_length_and_maybe_checksums(
        self._request_record, payload_offset=payload_offset)
    self._recorder.write_record(self._request_record)
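# Hedged note on payload_offset above: the request record's block is assumed
# to be the serialized header bytes followed by the body, so the payload
# begins exactly at len(request.to_bytes()). A worked example with plain byte
# strings (no wpull API involved; the helper name is illustrative):
def _payload_offset_example():
    header = b'GET / HTTP/1.1\r\nHost: example.com\r\n\r\n'
    body = b'name=value'
    block = header + body
    # payload_offset would be len(header) == 37 for this request
    assert block[len(header):] == body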
def test_client_exception_recovery(self):
    connection_factory = functools.partial(Connection, timeout=2.0)
    connection_pool = ConnectionPool(connection_factory=connection_factory)
    client = Client(connection_pool=connection_pool)

    for dummy in range(7):
        with self.assertRaises(NetworkError), client.session() as session:
            request = Request(self.get_url('/header_early_close'))
            yield from session.start(request)

    for dummy in range(7):
        with client.session() as session:
            request = Request(self.get_url('/'))
            response = yield from session.start(request)
            self.assertEqual(200, response.status_code)
            yield from session.download()
            self.assertTrue(session.done())
def test_basic_chunked_trailer(self):
    stream = self.new_stream()
    request = Request(self.get_url('/chunked_trailer'))
    response, content = yield from self.fetch(stream, request)

    self.assertEqual(200, response.status_code)
    self.assertEqual('chunked', response.fields['Transfer-Encoding'])
    self.assertEqual('dolphin', response.fields['Animal'])
    self.assertEqual(b'hello world!', content)
def test_connection_refused(self):
    stream = self.new_stream('127.0.0.1', 1)

    try:
        yield from self.fetch(stream, Request('http://localhost:1/'))
    except ConnectionRefused:
        pass
    else:
        self.fail()  # pragma: no cover
def test_basic_content_length(self):
    stream = self.new_stream()
    request = Request(self.get_url('/content_length'))
    response, content = yield from self.fetch(stream, request)

    self.assertEqual(200, response.status_code)
    self.assertEqual('100', response.fields['Content-Length'])
    self.assertEqual(b'a' * 100, content)
    self.assertEqual(100, len(content))
def _add_referrer(cls, request: Request, url_record: URLRecord):
    '''Add referrer URL to request.'''
    # Prohibit leak of referrer from HTTPS to HTTP
    # rfc7231 section 5.5.2.
    if url_record.parent_url.startswith('https://') and \
            url_record.url_info.scheme == 'http':
        return

    request.fields['Referer'] = url_record.parent_url
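# A self-contained sketch of the rule above, using plain strings rather than
# URLRecord (the helper name is hypothetical, not part of wpull): an https://
# parent URL must not be sent as a Referer with a plain-http child request,
# per RFC 7231 section 5.5.2.
def _referrer_allowed_example(parent_url: str, child_scheme: str) -> bool:
    return not (parent_url.startswith('https://') and child_scheme == 'http')

assert _referrer_allowed_example('http://example.com/a', 'http')
assert _referrer_allowed_example('https://example.com/a', 'https')
assert not _referrer_allowed_example('https://example.com/a', 'http')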
def test_warc_recorder_journal(self):
    warc_filename = 'asdf.warc'
    warc_prefix = 'asdf'

    warc_recorder = WARCRecorder(
        warc_prefix,
        params=WARCRecorderParams(
            compress=False,
        )
    )

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())

    test_instance = self

    class MockRecord(WARCRecord):
        def __init__(self, original_record):
            super().__init__()
            self.block_file = original_record.block_file
            self.fields = original_record.fields

        def __iter__(self):
            print(list(os.walk('.')))
            test_instance.assertTrue(
                os.path.exists(warc_filename + '-wpullinc')
            )

            for dummy in range(1000):
                yield b"where's my elephant?"

    session._request_record = MockRecord(session._request_record)

    session.end_request(request)

    self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))
def _read_request_header(self) -> Request:
    request = Request()

    for dummy in range(100):
        line = yield from self._reader.readline()

        _logger.debug(__('Got line {0}', line))

        if line[-1:] != b'\n':
            return

        if not line.strip():
            break

        request.parse(line)
    else:
        raise ProtocolError('Request has too many headers.')

    return request
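# Hedged illustration of the loop contract above, driving Request.parse()
# directly the way test_request_parse does elsewhere in this file (the
# helper name is illustrative): the request line and each header line go
# through the same parse() call, and a bare CRLF ends the header block.
def _parse_example():
    request = Request()

    for line in (b'GET / HTTP/1.1\r\n', b'Host: example.com\r\n', b'\r\n'):
        if not line.strip():
            break  # blank line terminates the header block
        request.parse(line)

    return request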
def _add_post_data(self, request: Request):
    '''Add data to the payload.'''
    if self._item_session.url_record.post_data:
        data = wpull.string.to_bytes(
            self._item_session.url_record.post_data)
    else:
        data = wpull.string.to_bytes(
            self._processor.fetch_params.post_data
        )

    request.method = 'POST'
    request.fields['Content-Type'] = 'application/x-www-form-urlencoded'
    request.fields['Content-Length'] = str(len(data))

    _logger.debug(__('Posting with data {0}.', data))

    if not request.body:
        request.body = Body(io.BytesIO())

    with wpull.util.reset_file_offset(request.body):
        request.body.write(data)
def start(self, request: Request) -> Response:
    '''Begin an HTTP request.

    Args:
        request: Request information.

    Returns:
        A response populated with the HTTP headers.

    Once the headers are received, call :meth:`download`.

    Coroutine.
    '''
    if self._session_state != SessionState.ready:
        raise RuntimeError('Session already started')

    assert not self._request
    self._request = request
    _logger.debug(__('Client fetch request {0}.', request))

    connection = yield from self._acquire_request_connection(request)
    full_url = connection.proxied and not connection.tunneled

    self._stream = stream = self._stream_factory(connection)

    yield from self._stream.reconnect()

    request.address = connection.address

    self.event_dispatcher.notify(self.Event.begin_request, request)
    write_callback = functools.partial(
        self.event_dispatcher.notify, self.Event.request_data)
    stream.data_event_dispatcher.add_write_listener(write_callback)

    yield from stream.write_request(request, full_url=full_url)

    if request.body:
        assert 'Content-Length' in request.fields
        length = int(request.fields['Content-Length'])
        yield from stream.write_body(request.body, length=length)

    stream.data_event_dispatcher.remove_write_listener(write_callback)
    self.event_dispatcher.notify(self.Event.end_request, request)

    read_callback = functools.partial(
        self.event_dispatcher.notify, self.Event.response_data)
    stream.data_event_dispatcher.add_read_listener(read_callback)

    self._response = response = yield from stream.read_response()
    response.request = request

    self.event_dispatcher.notify(self.Event.begin_response, response)

    self._session_state = SessionState.request_sent

    return response
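# Hedged usage sketch for the coroutine above, mirroring the pattern already
# used by the client tests in this file (URL and session wiring assumed):
#
#     with client.session() as session:
#         response = yield from session.start(Request('http://example.com/'))
#         yield from session.download(file=io.BytesIO())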
def _add_basic_auth_header(self, request: Request):
    username = request.url_info.username or request.username
    password = request.url_info.password or request.password

    if username and password:
        _logger.debug('Add basic auth header')

        auth_string = '{}:{}'.format(username, password)
        auth_string = base64.b64encode(
            auth_string.encode('utf-8', 'replace')).decode('utf-8')
        request.fields['Authorization'] = 'Basic {}'.format(auth_string)
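# Quick arithmetic check of the header value built above, using only the
# standard base64 module the method already relies on (no wpull API
# involved): 'user:pass' encodes to 'dXNlcjpwYXNz', so the resulting field
# would be 'Authorization: Basic dXNlcjpwYXNz'.
assert base64.b64encode('user:pass'.encode('utf-8')).decode('utf-8') == \
    'dXNlcjpwYXNz'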
def test_to_dict(self):
    request = Request('https://foofle.com')
    request_dict = request.to_dict()

    self.assertEqual('https://foofle.com', request_dict['url'])
    self.assertEqual('https', request_dict['url_info']['scheme'])
    self.assertEqual('GET', request_dict['method'])
    self.assertEqual('http', request_dict['protocol'])

    response = Response(status_code=200, reason='OK', request=request)
    response_dict = response.to_dict()

    self.assertEqual(
        'https://foofle.com',
        response_dict['request']['url']
    )
    self.assertEqual('http', response_dict['protocol'])
    self.assertEqual(200, response_dict['status_code'])
    self.assertEqual(200, response_dict['response_code'])
    self.assertEqual('OK', response_dict['reason'])
    self.assertEqual('OK', response_dict['response_message'])
def test_warc_move_max_size(self):
    file_prefix = 'asdf'
    cdx_filename = 'asdf.cdx'

    os.mkdir('./blah/')

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            cdx=True,
            move_to='./blah/',
            max_size=1,
        ),
    )

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'BLAH')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    warc_recorder.close()

    self.assertTrue(os.path.exists('./blah/asdf-00000.warc'))
    self.assertTrue(os.path.exists('./blah/asdf-00001.warc'))
    self.assertTrue(os.path.exists('./blah/asdf-meta.warc'))
    self.assertTrue(os.path.exists('./blah/' + cdx_filename))
def test_cdx_dedup(self):
    url_table = URLTable()
    warc_recorder = WARCRecorder(
        'asdf',
        params=WARCRecorderParams(
            compress=False, cdx=True, url_table=url_table
        )
    )

    url_table.add_visits([
        (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ'
        )
    ])

    request = HTTPRequest('http://example.com/fennec')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()
    revisit_response_header_size = len(response.to_bytes())

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'kitbit')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    request = HTTPRequest('http://example.com/horse')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'kitbit')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    _logger.info('FINISHED')

    warc_recorder.close()

    with open('asdf.warc', 'rb') as in_file:
        warc_file_content = in_file.read()

    with open('asdf.cdx', 'rb') as in_file:
        cdx_file_content = in_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: revisit\r\n', warc_file_content)
    self.assertIn(
        b'WARC-Refers-To: '
        b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
        warc_file_content
    )
    self.assertIn(b'WARC-Truncated: length\r\n', warc_file_content)
    self.assertIn(
        b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
        b'identical-payload-digest\r\n',
        warc_file_content
    )
    self.assertIn(
        b'Content-Length: ' +
        str(revisit_response_header_size).encode('ascii') + b'\r\n',
        warc_file_content
    )
    self.assertIn(
        b'WARC-Target-URI: http://example.com/fennec\r\n',
        warc_file_content
    )
    self.assertIn(
        b'WARC-Target-URI: http://example.com/horse\r\n',
        warc_file_content
    )
    self.assertEqual(
        1,
        warc_file_content.count(b'kitbit')
    )

    self.assertIn(b'http://example.com/horse ', cdx_file_content)
def _process_request(self, request: Request):
    _logger.debug(__('Got request {0}', request))

    if request.method == 'CONNECT':
        self._reject_request('CONNECT is intentionally not supported')
        return

    if self._is_ssl_tunnel and request.url.startswith('http://'):
        # Since we are spying under an SSL tunnel, assume processed
        # requests are SSL
        request.url = request.url.replace('http://', 'https://', 1)

    if 'Upgrade' in request.fields.get('Connection', ''):
        _logger.warning(__(
            _('Connection Upgrade not supported for {}'),
            request.url
        ))
        self._reject_request('Upgrade not supported')
        return

    _logger.debug('Begin response.')

    try:
        action = self.hook_dispatcher.call(
            self.Event.client_request, request)
    except HookDisconnected:
        pass
    else:
        if not action:
            _logger.debug('Proxy force reject request')
            self._reject_request()
            return

    with self._http_client.session() as session:
        if 'Content-Length' in request.fields:
            request.body = self._reader

        try:
            response = yield from session.start(request)
        except NetworkError as error:
            _logger.debug('Upstream error', exc_info=True)
            self._write_error_response()
            self.event_dispatcher.notify(
                self.Event.server_response_error, error)
            return

        response.body = Body()

        try:
            action = self.hook_dispatcher.call(
                self.Event.server_begin_response, response)
        except HookDisconnected:
            pass
        else:
            if not action:
                _logger.debug('Proxy force reject request via response')
                self._reject_request()
                return

        try:
            self._writer.write(response.to_bytes())
            yield from self._writer.drain()

            session.event_dispatcher.add_listener(
                Session.Event.response_data,
                self._writer.write
            )

            yield from session.download(file=response.body, raw=True)
            yield from self._writer.drain()
        except NetworkError as error:
            _logger.debug('Upstream error', exc_info=True)
            self.event_dispatcher.notify(
                self.Event.server_response_error, error)
            raise

    self.event_dispatcher.notify(self.Event.server_end_response, response)
    _logger.debug('Response done.')
def test_copy(self):
    request = Request('http://twitcharchivestheinternet.invalid/')

    # Check for no crash
    request.copy()
def test_request_parse(self):
    request = Request()
    request.parse(b'GET /robots.txt HTTP/1.1\r\n')
    request.parse(b'Host: example.com\r\n')
    request.parse('Accept: éxample\r\n'.encode('utf_8'))
    request.parse(b'\r\n')

    self.assertEqual('http://example.com/robots.txt', request.url)
    self.assertEqual('example.com', request.fields['host'])
    self.assertEqual('éxample'.encode('utf-8').decode('latin-1'),
                     request.fields['accept'])

    request = Request()
    request.parse(b'GET https://example.com/robots.txt HTTP/1.1\r\n')
    request.parse(b'Host: example.com\r\n')
    request.parse(b'Accept: \xffexample\r\n')
    request.parse(b'\r\n')

    self.assertEqual('https://example.com/robots.txt', request.url)
    self.assertEqual('example.com', request.fields['host'])
    self.assertEqual('\xffexample', request.fields['accept'])
def test_warc_recorder(self):
    file_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    cdx_filename = 'asdf.cdx'

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
        ),
    )

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    request.prepare_for_send()
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    _logger.info('FINISHED')

    warc_recorder.close()

    with open(warc_filename, 'rb') as in_file:
        warc_file_content = in_file.read()

    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo\r\n', warc_file_content)
    self.assertIn(b'Content-Type: application/warc-fields',
                  warc_file_content)
    self.assertIn(b'WARC-Date: ', warc_file_content)
    self.assertIn(b'WARC-Record-ID: <urn:uuid:', warc_file_content)
    self.assertIn(b'WARC-Block-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Payload-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Type: request\r\n', warc_file_content)
    self.assertIn(b'WARC-Target-URI: http://', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=request',
                  warc_file_content)
    self.assertIn(b'WARC-Type: response', warc_file_content)
    self.assertIn(b'WARC-Concurrent-To: <urn:uuid:', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=response',
                  warc_file_content)
    self.assertIn(
        'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
        warc_file_content
    )
    self.assertIn(
        'Python/{0}'.format(
            wpull.util.python_version()).encode('utf-8'),
        warc_file_content
    )
    self.assertIn(b'Extra-Field: my_extra_field', warc_file_content)
    self.assertIn(b'GET / HTTP', warc_file_content)
    self.assertIn(b'KITTEH DOGE', warc_file_content)
    self.assertIn(b'FINISHED', warc_file_content)
    self.assertIn(b'WARC-Target-URI: urn:X-wpull:log', warc_file_content)
    self.assertIn(b'Content-Length:', warc_file_content)
    self.assertNotIn(b'Content-Length: 0', warc_file_content)

    cdx_lines = cdx_file_content.split(b'\n')
    cdx_labels = cdx_lines[0].strip().split(b' ')
    cdx_fields = cdx_lines[1].split(b' ')

    print(cdx_lines)

    self.assertEqual(3, len(cdx_lines))
    self.assertEqual(10, len(cdx_labels))
    self.assertEqual(9, len(cdx_fields))
    self.assertTrue(cdx_lines[0].startswith(b' CDX'))

    self.assertEqual(b'http://example.com/', cdx_fields[0])
    self.assertEqual(b'-', cdx_fields[2])
    self.assertEqual(b'200', cdx_fields[3])
    self.assertNotEqual(b'-', cdx_fields[4])
    self.assertNotEqual(b'0', cdx_fields[5])
    self.assertNotEqual(b'0', cdx_fields[6])
    self.assertEqual(
        os.path.basename(warc_filename),
        cdx_fields[7].decode('ascii')
    )

    length = int(cdx_fields[5])
    offset = int(cdx_fields[6])

    with open(warc_filename, 'rb') as in_file:
        in_file.seek(offset)
        data = in_file.read(length)

        assert len(data) == length

    self.assertEqual(b'WARC/1.0', data[:8])

    self.assertIn(b'KITTEH DOGE', data)

    self.validate_warc(warc_filename)
def test_warc_recorder_max_size(self):
    file_prefix = 'asdf'
    cdx_filename = 'asdf.cdx'

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
            max_size=1,
        )
    )

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    request = HTTPRequest('http://example.com/2')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'DOGE KITTEH')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    _logger.info('FINISHED')

    warc_recorder.close()

    with open('asdf-00000.warc', 'rb') as in_file:
        warc_file_content = in_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
    self.assertIn(b'KITTEH DOGE', warc_file_content)

    with open('asdf-00001.warc', 'rb') as in_file:
        warc_file_content = in_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
    self.assertIn(b'DOGE KITTEH', warc_file_content)

    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()

    cdx_lines = cdx_file_content.split(b'\n')
    cdx_labels = cdx_lines[0].strip().split(b' ')

    print(cdx_lines)

    self.assertEqual(4, len(cdx_lines))
    self.assertEqual(10, len(cdx_labels))

    self.assertIn(b'http://example.com/1', cdx_file_content)
    self.assertIn(b'http://example.com/2', cdx_file_content)

    with open('asdf-meta.warc', 'rb') as in_file:
        meta_file_content = in_file.read()

    self.assertIn(b'FINISHED', meta_file_content)

    self.validate_warc('asdf-00000.warc')
    self.validate_warc('asdf-00001.warc')
    self.validate_warc('asdf-meta.warc')