def test_redirect_loop(self):
    """Check that an endless robots.txt redirect still permits the fetch.

    Every mocked reply is a 302 pointing back at ``/robots.txt``. Once the
    callback has been invoked more than 20 times it raises
    ``ProtocolError``. The test expects both ``can_fetch`` and
    ``can_fetch_pool`` to report the request as allowed afterwards.
    """
    robots_checker = RobotsTxtChecker(web_client=MockWebClient())
    page_request = Request('http://example.com')
    page_request.prepare_for_send()

    # Mutable cell so the nested callback can update the invocation count.
    hits = {'counter': 0}

    def response_callback(request):
        request.prepare_for_send()
        self.assertTrue(request.url_info.url.endswith('robots.txt'))

        redirect = Response(302, 'See else')
        redirect.request = request
        redirect.fields['Location'] = '/robots.txt'

        hits['counter'] += 1

        if hits['counter'] > 20:
            # Break the loop the same way a real client would give up.
            raise ProtocolError('Mock redirect loop error.')

        return redirect

    robots_checker.web_client.mock_response_callback = response_callback

    allowed = yield from robots_checker.can_fetch(page_request)
    self.assertTrue(allowed)
    self.assertTrue(robots_checker.can_fetch_pool(page_request))
def test_request(self):
    """A prepared robots.txt request serialises to a GET line plus Host."""
    expected_bytes = (
        b'GET /robots.txt HTTP/1.1\r\n'
        b'Host: example.com\r\n'
        b'\r\n'
    )

    robots_request = Request('http://example.com/robots.txt')
    robots_request.prepare_for_send()

    self.assertEqual(expected_bytes, robots_request.to_bytes())
def test_fetch_allow_redirects(self):
    """Follow a chain of robots.txt redirects ending in an allow-all file.

    Mocked replies, in order:

    1. ``http://example.com/robots.txt`` -> 301 to www.example.com
    2. ``http://www.example.com/robots.txt`` -> 301 to www.example.net
    3. ``http://www.example.net/robots.txt`` -> 200 with ``Allow: /``

    Each callback installs the next one on the mock client before replying.
    """
    robots_checker = RobotsTxtChecker(web_client=MockWebClient())
    page_request = Request('http://example.com')
    page_request.prepare_for_send()

    def redirect_step(expected_url, next_url, next_callback):
        """Build a callback that checks the URL, then 301s to *next_url*."""
        def callback(request):
            request.prepare_for_send()
            self.assertEqual(expected_url, request.url_info.url)

            moved = Response(301, 'Moved')
            moved.fields['location'] = next_url
            moved.request = request

            # Arm the mock client for the next hop before replying.
            robots_checker.web_client.mock_response_callback = next_callback
            robots_checker.web_client.request = Request(next_url)

            return moved

        return callback

    def final_step(request):
        # Last hop: www.example.net serves the actual robots.txt body.
        request.prepare_for_send()
        self.assertEqual(
            'http://www.example.net/robots.txt', request.url_info.url)

        found = Response(200, 'OK')
        found.request = request
        found.body = io.StringIO('User-agent:*\nAllow: /\n')
        robots_checker.web_client.session_obj.done_value = True

        return found

    # Built back to front so each step can reference its successor.
    second_step = redirect_step(
        'http://www.example.com/robots.txt',
        'http://www.example.net/robots.txt',
        final_step,
    )
    first_step = redirect_step(
        'http://example.com/robots.txt',
        'http://www.example.com/robots.txt',
        second_step,
    )

    robots_checker.web_client.mock_response_callback = first_step

    allowed = yield from robots_checker.can_fetch(page_request)
    self.assertTrue(allowed)
    self.assertTrue(robots_checker.can_fetch_pool(page_request))
def test_server_error(self):
    """A 500 reply for robots.txt must make ``can_fetch`` raise ServerError.

    The mocked client answers the robots.txt request with ``500 Oops`` and
    marks the session as done.
    """
    checker = RobotsTxtChecker(web_client=MockWebClient())
    request = Request('http://example.com')
    request.prepare_for_send()

    def response_callback(request):
        request.prepare_for_send()
        self.assertTrue(request.url_info.url.endswith('robots.txt'))
        response = Response(500, 'Oops')
        response.request = request
        checker.web_client.session_obj.done_value = True
        return response

    checker.web_client.mock_response_callback = response_callback

    # assertRaises fails the test by itself when nothing is raised, so no
    # manual try/except/else with self.fail() is needed.
    with self.assertRaises(ServerError):
        yield from checker.can_fetch(request)
def test_fetch_disallow(self):
    """A ``Disallow: /`` robots.txt must block the request.

    Before anything is fetched, ``can_fetch_pool`` is expected to raise
    ``NotInPoolError``. After ``fetch_robots_txt`` has run against the mock
    reply, both ``can_fetch_pool`` and ``can_fetch`` must return a false
    value.
    """
    robots_checker = RobotsTxtChecker(web_client=MockWebClient())
    page_request = Request('http://example.com')
    page_request.prepare_for_send()

    # Nothing has been fetched yet, so the pool lookup must fail.
    with self.assertRaises(NotInPoolError):
        robots_checker.can_fetch_pool(page_request)

    def response_callback(request):
        request.prepare_for_send()
        self.assertTrue(request.url_info.url.endswith('robots.txt'))

        reply = Response(200, 'OK')
        reply.request = request
        reply.body = io.StringIO('User-agent:*\nDisallow: /\n')
        robots_checker.web_client.session_obj.done_value = True

        return reply

    robots_checker.web_client.mock_response_callback = response_callback

    yield from robots_checker.fetch_robots_txt(page_request)

    self.assertFalse(robots_checker.can_fetch_pool(page_request))

    allowed = yield from robots_checker.can_fetch(page_request)
    self.assertFalse(allowed)
def test_request_port(self):
    """An explicit port in the URL must be carried into the Host header."""
    expected_bytes = (
        b'GET /robots.txt HTTP/1.1\r\n'
        b'Host: example.com:4567\r\n'
        b'\r\n'
    )

    port_request = Request('https://example.com:4567/robots.txt')
    port_request.prepare_for_send()

    self.assertEqual(expected_bytes, port_request.to_bytes())
def test_warc_recorder(self):
    """Record one HTTP exchange and verify the WARC and CDX output.

    An uncompressed WARC with a CDX index is written under the prefix
    ``asdf``. The test then checks that the expected WARC headers, the
    request line, the response body and a logged message appear in the
    WARC file, and that the CDX entry's length/offset locate a record
    containing the response body.
    """
    file_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    cdx_filename = 'asdf.cdx'

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
        ),
    )

    request = HTTPRequest('http://example.com/')
    request.prepare_for_send()
    request.address = ('0.0.0.0', 80)
    request.prepare_for_send()

    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    # This message is expected to show up in the WARC (asserted below).
    _logger.info('FINISHED')

    warc_recorder.close()

    with open(warc_filename, 'rb') as in_file:
        warc_file_content = in_file.read()

    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

    # Fragments that must be present, checked in this order.
    expected_fragments = (
        b'WARC-Type: warcinfo\r\n',
        b'Content-Type: application/warc-fields',
        b'WARC-Date: ',
        b'WARC-Record-ID: <urn:uuid:',
        b'WARC-Block-Digest: sha1:',
        b'WARC-Payload-Digest: sha1:',
        b'WARC-Type: request\r\n',
        b'WARC-Target-URI: http://',
        b'Content-Type: application/http;msgtype=request',
        b'WARC-Type: response',
        b'WARC-Concurrent-To: <urn:uuid:',
        b'Content-Type: application/http;msgtype=response',
        'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
        'Python/{0}'.format(wpull.util.python_version()).encode('utf-8'),
        b'Extra-Field: my_extra_field',
        b'GET / HTTP',
        b'KITTEH DOGE',
        b'FINISHED',
        b'WARC-Target-URI: urn:X-wpull:log',
        b'Content-Length:',
    )

    for fragment in expected_fragments:
        self.assertIn(fragment, warc_file_content)

    self.assertNotIn(b'Content-Length: 0', warc_file_content)

    cdx_lines = cdx_file_content.split(b'\n')
    cdx_labels = cdx_lines[0].strip().split(b' ')
    cdx_fields = cdx_lines[1].split(b' ')

    # Debug aid: shows the raw CDX lines if an assertion below fails.
    print(cdx_lines)

    self.assertEqual(3, len(cdx_lines))
    self.assertEqual(10, len(cdx_labels))
    self.assertEqual(9, len(cdx_fields))
    self.assertTrue(cdx_lines[0].startswith(b' CDX'))

    self.assertEqual(b'http://example.com/', cdx_fields[0])
    self.assertEqual(b'-', cdx_fields[2])
    self.assertEqual(b'200', cdx_fields[3])
    self.assertNotEqual(b'-', cdx_fields[4])
    self.assertNotEqual(b'0', cdx_fields[5])
    self.assertNotEqual(b'0', cdx_fields[6])
    self.assertEqual(
        os.path.basename(warc_filename), cdx_fields[7].decode('ascii'))

    # Fields 5 and 6 are used as the record's byte length and file offset.
    length = int(cdx_fields[5])
    offset = int(cdx_fields[6])

    with open(warc_filename, 'rb') as in_file:
        in_file.seek(offset)
        data = in_file.read(length)

    # unittest assertion instead of a bare ``assert``: it is not stripped
    # under ``python -O`` and reports both values on failure.
    self.assertEqual(length, len(data))

    self.assertEqual(b'WARC/1.0', data[:8])
    self.assertIn(b'KITTEH DOGE', data)

    self.validate_warc(warc_filename)
def test_warc_recorder(self):
    """Record one HTTP exchange and verify the WARC and CDX output.

    Writes an uncompressed WARC plus CDX index under the prefix ``asdf``,
    then asserts on the WARC headers and payload and on the CDX entry that
    points back into the WARC file.
    """
    # NOTE(review): this source contains another method with the same name;
    # if both live in the same class the later definition shadows the
    # earlier one -- confirm and rename or remove as appropriate.
    file_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    cdx_filename = 'asdf.cdx'

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
        ),
    )

    request = HTTPRequest('http://example.com/')
    request.prepare_for_send()
    request.address = ('0.0.0.0', 80)
    request.prepare_for_send()

    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    # Drive one complete request/response cycle through the recorder.
    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    # Logged before closing the recorder; the WARC is asserted to contain it.
    _logger.info('FINISHED')

    warc_recorder.close()

    with open(warc_filename, 'rb') as in_file:
        warc_file_content = in_file.read()

    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

    wpull_agent = 'Wpull/{0}'.format(
        wpull.version.__version__).encode('utf-8')
    python_agent = 'Python/{0}'.format(
        wpull.util.python_version()).encode('utf-8')

    # WARC record headers, in the order they were originally asserted.
    for header in (
            b'WARC-Type: warcinfo\r\n',
            b'Content-Type: application/warc-fields',
            b'WARC-Date: ',
            b'WARC-Record-ID: <urn:uuid:',
            b'WARC-Block-Digest: sha1:',
            b'WARC-Payload-Digest: sha1:',
            b'WARC-Type: request\r\n',
            b'WARC-Target-URI: http://',
            b'Content-Type: application/http;msgtype=request',
            b'WARC-Type: response',
            b'WARC-Concurrent-To: <urn:uuid:',
            b'Content-Type: application/http;msgtype=response',
            wpull_agent,
            python_agent,
    ):
        self.assertIn(header, warc_file_content)

    # Extra field, request line, response body and log record.
    for payload in (
            b'Extra-Field: my_extra_field',
            b'GET / HTTP',
            b'KITTEH DOGE',
            b'FINISHED',
            b'WARC-Target-URI: urn:X-wpull:log',
            b'Content-Length:',
    ):
        self.assertIn(payload, warc_file_content)

    self.assertNotIn(b'Content-Length: 0', warc_file_content)

    cdx_lines = cdx_file_content.split(b'\n')
    cdx_labels = cdx_lines[0].strip().split(b' ')
    cdx_fields = cdx_lines[1].split(b' ')

    # Debug aid: shows the raw CDX lines if an assertion below fails.
    print(cdx_lines)

    self.assertEqual(3, len(cdx_lines))
    self.assertEqual(10, len(cdx_labels))
    self.assertEqual(9, len(cdx_fields))
    self.assertTrue(cdx_lines[0].startswith(b' CDX'))

    self.assertEqual(b'http://example.com/', cdx_fields[0])
    self.assertEqual(b'-', cdx_fields[2])
    self.assertEqual(b'200', cdx_fields[3])
    self.assertNotEqual(b'-', cdx_fields[4])
    self.assertNotEqual(b'0', cdx_fields[5])
    self.assertNotEqual(b'0', cdx_fields[6])
    self.assertEqual(
        os.path.basename(warc_filename), cdx_fields[7].decode('ascii'))

    # Fields 5 and 6 are used as the record's byte length and file offset.
    length = int(cdx_fields[5])
    offset = int(cdx_fields[6])

    with open(warc_filename, 'rb') as in_file:
        in_file.seek(offset)
        data = in_file.read(length)

    # unittest assertion rather than a bare ``assert`` so the check still
    # runs under ``python -O`` and matches the rest of the test.
    self.assertEqual(length, len(data))

    self.assertEqual(b'WARC/1.0', data[:8])
    self.assertIn(b'KITTEH DOGE', data)

    self.validate_warc(warc_filename)