def test_new_file_and_clobber(self):
    # Save the same URL twice through one AntiClobberFileWriter, using a
    # fresh session each time, and check the target file after each save.
    writer = AntiClobberFileWriter(self.get_path_namer())
    url = 'http://example.com/my_file.txt'

    # NOTE(review): both rounds assert the very same path; presumably the
    # second save is stored under a different name -- confirm whether that
    # name should be asserted as well.
    for _ in range(2):
        session = writer.session()
        request = HTTPRequest(url)
        response = HTTPResponse(
            status_code=200, reason='OK', request=request
        )

        session.process_request(request)
        session.process_response(response)
        session.save_document(response)

        self.assertTrue(os.path.exists('my_file.txt'))
def test_trust_server_names(self):
    # Feed one session a 302 for the site root followed by a 200 for
    # /my_file.html, with trust_server_names=True, and check that
    # 'my_file.html' exists after saving.
    writer = AntiClobberFileWriter(
        self.get_path_namer(), trust_server_names=True
    )
    session = writer.session()

    redirect_request = HTTPRequest('http://example.com')
    redirect_response = HTTPResponse(
        status_code=302, reason='Moved', request=redirect_request
    )
    session.process_request(redirect_request)
    session.process_response(redirect_response)

    final_request = HTTPRequest('http://example.com/my_file.html')
    final_response = HTTPResponse(
        status_code=200, reason='OK', request=final_request
    )
    session.process_request(final_request)
    session.process_response(final_response)
    session.save_document(final_response)

    # Dump the working directory tree before asserting (debug output).
    print(list(os.walk('.')))

    self.assertTrue(os.path.exists('my_file.html'))
def test_null_writer(self):
    # Call every NullWriter session hook with default-constructed messages
    # and check that extra_resource_path() returns None.
    session = NullWriter().session()

    session.process_request(HTTPRequest())
    session.process_response(HTTPResponse())
    session.discard_document(HTTPResponse())
    session.save_document(HTTPResponse())

    extra_path = session.extra_resource_path('blah')
    self.assertIsNone(extra_path)
def test_file_continue(self):
    # With file_continuing=True and an existing 4-byte file, the request
    # must gain a 'Range' field and a 206 body must end up after the
    # existing bytes.
    writer = OverwriteFileWriter(
        self.get_path_namer(), file_continuing=True
    )
    session = writer.session()

    # Seed the partial download that is to be continued.
    with open('my_file.txt', 'wb') as seed_file:
        seed_file.write(b'TEST')

    request = HTTPRequest('http://example.com/my_file.txt')
    session.process_request(request)

    self.assertIn('Range', request.fields)

    response = HTTPResponse(
        status_code=206, reason='Partial content', request=request
    )
    session.process_response(response)

    body = response.body
    body.write(b'END')
    body.flush()

    session.save_document(response)

    with open('my_file.txt', 'rb') as saved_file:
        saved_bytes = saved_file.read()

    self.assertEqual(b'TESTEND', saved_bytes)
def test_content_disposition(self):
    # With content_disposition=True, save a document for each raw
    # Content-Disposition filename below and check that the expected
    # file name exists on disk.
    writer = AntiClobberFileWriter(
        self.get_path_namer(), content_disposition=True
    )

    # (raw filename as sent in the header, expected file name on disk)
    test_data = [
        ('hello1.txt', 'hello1.txt'),
        ('hello2.txt;', 'hello2.txt'),
        ('"hello3.txt"', 'hello3.txt'),
        ('\'hello4.txt\'', 'hello4.txt'),
    ]

    for raw_filename, filename in test_data:
        session = writer.session()
        request = HTTPRequest('http://example.com')
        response = HTTPResponse(
            status_code=200, reason='OK', request=request
        )
        response.fields['Content-Disposition'] = (
            'attachment; filename={}'.format(raw_filename)
        )

        session.process_request(request)
        session.process_response(response)
        session.save_document(response)

        # Dump the working directory tree before asserting (debug output).
        print(list(os.walk('.')))

        # Name the failing case; a bare assertTrue would only report
        # "False is not true" for whichever row of the table failed.
        self.assertTrue(
            os.path.exists(filename),
            'expected file {!r} for Content-Disposition filename {!r}'
            .format(filename, raw_filename)
        )
def test_adjust_extension(self):
    # With adjust_extension=True, save a document for each
    # (Content-Type, URL path) pair below and check that the expected
    # file name exists on disk.
    writer = AntiClobberFileWriter(
        self.get_path_namer(), adjust_extension=True
    )

    # (Content-Type value, URL path, expected path on disk)
    test_data = [
        ('text/html', '/mordor', 'mordor.html'),
        ('text/html', '/mordor?ring.asp', 'mordor?ring.asp.html'),
        ('text/html', '/mordor?ring.htm', 'mordor?ring.htm'),
        ('text/plain', '/static/my_file.txt', 'static/my_file.txt'),
        ('text/css', '/static/style.css', 'static/style.css'),
        ('text/css', '/static/style.css?hamster.exe',
            'static/style.css?hamster.exe.css'),
        ('text/html', '/static/mojibake.html', 'static/mojibake.html'),
        ('text/html', '/static/mojibake.html?dolphin.png',
            'static/mojibake.html?dolphin.png.html'),
    ]

    for mime_type, path, filename in test_data:
        session = writer.session()
        request = HTTPRequest('http://example.com' + path)
        response = HTTPResponse(
            status_code=200, reason='OK', request=request
        )
        response.fields['Content-Type'] = mime_type

        session.process_request(request)
        session.process_response(response)
        session.save_document(response)

        # Dump the expected name and directory tree (debug output).
        print(filename, list(os.walk('.')))

        # Name the failing case; a bare assertTrue would only report
        # "False is not true" for whichever row of the table failed.
        self.assertTrue(
            os.path.exists(filename),
            'expected file {!r} for {!r} served as {!r}'
            .format(filename, path, mime_type)
        )
def test_single_document_writer(self):
    # Send two responses through a SingleDocumentWriter created with
    # headers_included=True over an in-memory stream, then check the
    # stream for b'HTTP' and both body payloads.
    stream = io.BytesIO()
    writer = SingleDocumentWriter(stream, headers_included=True)

    session = writer.session()
    first_request = HTTPRequest('http://example.com/my_file1.txt')
    first_response = HTTPResponse(
        status_code=200, reason='OK', request=first_request
    )
    session.process_request(first_request)
    session.process_response(first_response)
    first_response.body.write(b'The content')
    session.save_document(first_response)

    session = writer.session()
    second_request = HTTPRequest('http://example.com/my_file2.txt')
    second_response = HTTPResponse(
        status_code=200, reason='OK', request=second_request
    )
    session.process_request(second_request)
    session.process_response(second_response)
    # NOTE(review): the second payload is written through the FIRST
    # response's body, exactly as before -- looks like it may have been
    # meant for the second response; confirm before changing.
    first_response.body.write(b'Another thing')
    session.save_document(second_response)

    written = stream.getvalue()

    for expected_fragment in (b'HTTP', b'The content', b'Another thing'):
        self.assertIn(expected_fragment, written)
def test_progress_http(self):
    # Drive a ProgressPrinter writing to stdout through one full
    # request/response cycle with 100 data updates. There are no
    # assertions; the test passes if nothing raises.
    progress = ProgressPrinter(stream=sys.stdout)

    request = HTTPRequest('http://example.com')
    response = HTTPResponse(206, 'OK')
    for field_name, field_value in (
            ('Content-Size', '1024'),
            ('Content-Range', 'bytes 10-/2048'),
    ):
        response.fields[field_name] = field_value

    progress.update_from_begin_request(request)
    progress.update_from_begin_response(response)

    for _ in range(100):
        progress.update_with_data(b'abc')

    progress.update_from_end_response(response)
def test_dir_or_file_dir_got_first(self):
    # A directory named 'dir_or_file' already exists when a document for
    # the URL path /dir_or_file is saved; the directory must survive and
    # a regular file 'dir_or_file.f' must exist afterwards.
    name = 'dir_or_file'

    writer = OverwriteFileWriter(self.get_path_namer())
    session = writer.session()

    os.mkdir(name)

    request = HTTPRequest('http://example.com/dir_or_file')
    response = HTTPResponse(status_code=200, reason='OK', request=request)

    session.process_request(request)
    session.process_response(response)
    session.save_document(response)

    # Dump the working directory tree before asserting (debug output).
    print(list(os.walk('.')))

    self.assertTrue(os.path.isdir(name))
    self.assertTrue(os.path.isfile('dir_or_file.f'))
def test_warc_max_size_and_append(self):
    # Two empty WARC files (sequence 00000 and 00001) exist up front. A
    # recorder with max_size=1 and appending=True then records one
    # exchange; the pre-existing files must stay empty while 00002,
    # 00003 and the meta WARC must exist and be non-empty.
    file_prefix = 'asdf'
    preexisting = ('asdf-00000.warc', 'asdf-00001.warc')
    created = ('asdf-00002.warc', 'asdf-00003.warc', 'asdf-meta.warc')

    for filename in preexisting:
        with open(filename, 'w'):
            pass

    recorder_params = WARCRecorderParams(
        compress=False, max_size=1, appending=True
    )
    warc_recorder = WARCRecorder(file_prefix, params=recorder_params)

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'BLAH')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    warc_recorder.close()

    for filename in preexisting + created:
        self.assertTrue(os.path.exists(filename))

    for filename in preexisting:
        self.assertEqual(0, os.path.getsize(filename))

    for filename in created:
        self.assertNotEqual(0, os.path.getsize(filename))
def test_warc_recorder_rollback(self):
    # Swap the session's request record for one whose iteration raises
    # OSError after yielding data, then check that end_request()
    # propagates the error, the WARC file size is unchanged, and no
    # '-wpullinc' file is left behind.
    warc_filename = 'asdf.warc'
    warc_prefix = 'asdf'
    journal_filename = warc_filename + '-wpullinc'

    with open(warc_filename, 'wb') as warc_file:
        warc_file.write(b'a' * 10)

    recorder_params = WARCRecorderParams(compress=False)
    warc_recorder = WARCRecorder(warc_prefix, params=recorder_params)

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())

    class BadRecord(WARCRecord):
        # Copies the block file and fields of the wrapped record but
        # fails part-way through iteration.
        def __init__(self, original_record):
            super().__init__()
            self.block_file = original_record.block_file
            self.fields = original_record.fields

        def __iter__(self):
            for _ in range(1000):
                yield b"where's my elephant?"

            raise OSError('Oops')

    session._request_record = BadRecord(session._request_record)

    original_offset = os.path.getsize(warc_filename)

    with self.assertRaises((OSError, IOError)):
        session.end_request(request)

    new_offset = os.path.getsize(warc_filename)

    self.assertEqual(new_offset, original_offset)
    self.assertFalse(os.path.exists(journal_filename))

    _logger.debug('original offset {0}'.format(original_offset))
def test_timestamping(self):
    # An existing local file with a fixed mtime must make the writer add
    # an 'If-Modified-Since' request field; a 304 response is then
    # processed with no further assertions.
    local_timestamp = 634521600
    local_path = 'my_file.txt'

    writer = TimestampingFileWriter(self.get_path_namer())
    session = writer.session()

    with open(local_path, 'wb') as local_file:
        local_file.write(b'')

    # Set both access and modification time to the fixed timestamp.
    os.utime(local_path, (local_timestamp, local_timestamp))

    request = HTTPRequest('http://example.com/my_file.txt')
    session.process_request(request)

    self.assertIn('If-Modified-Since', request.fields)

    not_modified = HTTPResponse(
        status_code=304, reason='Not modified', request=request
    )
    session.process_response(not_modified)
def test_warc_recorder_journal(self):
    # While the request record is being iterated by end_request(), a
    # '<warc>-wpullinc' file must exist; once end_request() returns it
    # must be gone again.
    warc_filename = 'asdf.warc'
    warc_prefix = 'asdf'

    recorder_params = WARCRecorderParams(compress=False)
    warc_recorder = WARCRecorder(warc_prefix, params=recorder_params)

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())

    # Inside MockRecord, `self` is the record, so keep a handle on the
    # test case for the assertion made during iteration.
    test_instance = self

    class MockRecord(WARCRecord):
        # Copies the block file and fields of the wrapped record and
        # checks for the '-wpullinc' file when iteration starts.
        def __init__(self, original_record):
            super().__init__()
            self.block_file = original_record.block_file
            self.fields = original_record.fields

        def __iter__(self):
            print(list(os.walk('.')))
            journal_exists = os.path.exists(warc_filename + '-wpullinc')
            test_instance.assertTrue(journal_exists)

            for _ in range(1000):
                yield b"where's my elephant?"

    session._request_record = MockRecord(session._request_record)
    session.end_request(request)

    self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))
def test_warc_move_max_size(self):
    # Record one exchange with max_size=1, cdx=True and move_to='./blah/',
    # then check that two numbered WARCs, the meta WARC and the CDX file
    # all exist under './blah/'.
    file_prefix = 'asdf'
    cdx_filename = 'asdf.cdx'

    os.mkdir('./blah/')

    recorder_params = WARCRecorderParams(
        compress=False,
        cdx=True,
        move_to='./blah/',
        max_size=1,
    )
    warc_recorder = WARCRecorder(file_prefix, params=recorder_params)

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'BLAH')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    warc_recorder.close()

    expected_paths = (
        './blah/asdf-00000.warc',
        './blah/asdf-00001.warc',
        './blah/asdf-meta.warc',
        './blah/' + cdx_filename,
    )
    for expected_path in expected_paths:
        self.assertTrue(os.path.exists(expected_path))
def test_warc_recorder_max_size(self):
    # Record two exchanges with max_size=1 and cdx=True, then check that
    # each payload sits in its own numbered WARC, that the CDX file
    # mentions both URLs, and that the meta WARC holds the log line.
    file_prefix = 'asdf'
    cdx_filename = 'asdf.cdx'

    recorder_params = WARCRecorderParams(
        compress=False,
        extra_fields=[('Extra-field', 'my_extra_field')],
        cdx=True,
        max_size=1,
    )
    warc_recorder = WARCRecorder(file_prefix, params=recorder_params)

    exchanges = (
        ('http://example.com/1', b'KITTEH DOGE'),
        ('http://example.com/2', b'DOGE KITTEH'),
    )
    for url, payload in exchanges:
        request = HTTPRequest(url)
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(payload)

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

    _logger.info('FINISHED')

    warc_recorder.close()

    expected_payloads = (
        ('asdf-00000.warc', b'KITTEH DOGE'),
        ('asdf-00001.warc', b'DOGE KITTEH'),
    )
    for numbered_warc, payload in expected_payloads:
        with open(numbered_warc, 'rb') as in_file:
            warc_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
        self.assertIn(payload, warc_file_content)

    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()

    cdx_lines = cdx_file_content.split(b'\n')
    cdx_labels = cdx_lines[0].strip().split(b' ')

    print(cdx_lines)

    self.assertEqual(4, len(cdx_lines))
    self.assertEqual(10, len(cdx_labels))
    for url in (b'http://example.com/1', b'http://example.com/2'):
        self.assertIn(url, cdx_file_content)

    with open('asdf-meta.warc', 'rb') as in_file:
        meta_file_content = in_file.read()

    self.assertIn(b'FINISHED', meta_file_content)

    for warc_name in (
            'asdf-00000.warc', 'asdf-00001.warc', 'asdf-meta.warc'):
        self.validate_warc(warc_name)
def test_cdx_dedup(self):
    # Pre-load the URL table with a visit for /fennec, record /fennec and
    # /horse with the same b'kitbit' body, and check the revisit headers
    # in the WARC and that the payload appears only once.
    url_table = URLTable()
    recorder_params = WARCRecorderParams(
        compress=False, cdx=True, url_table=url_table
    )
    warc_recorder = WARCRecorder('asdf', params=recorder_params)

    url_table.add_visits([
        (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
        )
    ])

    def record_exchange(request, response):
        # Run one complete request/response pair through a new session.
        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

    request = HTTPRequest('http://example.com/fennec')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()
    # Measured before the body is written; compared against the record's
    # Content-Length further down.
    revisit_response_header_size = len(response.to_bytes())

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'kitbit')

    record_exchange(request, response)

    request = HTTPRequest('http://example.com/horse')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'kitbit')

    record_exchange(request, response)

    _logger.info('FINISHED')

    warc_recorder.close()

    with open('asdf.warc', 'rb') as in_file:
        warc_file_content = in_file.read()

    with open('asdf.cdx', 'rb') as in_file:
        cdx_file_content = in_file.read()

    expected_content_length = (
        b'Content-Length: '
        + str(revisit_response_header_size).encode('ascii')
        + b'\r\n'
    )

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: revisit\r\n', warc_file_content)
    self.assertIn(
        b'WARC-Refers-To: '
        b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
        warc_file_content
    )
    self.assertIn(b'WARC-Truncated: length\r\n', warc_file_content)
    self.assertIn(
        b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
        b'identical-payload-digest\r\n',
        warc_file_content
    )
    self.assertIn(expected_content_length, warc_file_content)
    self.assertIn(
        b'WARC-Target-URI: http://example.com/fennec\r\n',
        warc_file_content
    )
    self.assertIn(
        b'WARC-Target-URI: http://example.com/horse\r\n',
        warc_file_content
    )
    self.assertEqual(1, warc_file_content.count(b'kitbit'))

    self.assertIn(b'http://example.com/horse ', cdx_file_content)
def test_warc_recorder(self):
    # Record one request/response pair with cdx=True and an extra
    # warcinfo field, then check the WARC headers, the CDX line, and
    # that the CDX length/offset values locate the response record
    # inside the WARC file.
    file_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    cdx_filename = 'asdf.cdx'

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
        ),
    )

    request = HTTPRequest('http://example.com/')
    # NOTE(review): prepare_for_send() is called both before and after
    # the address is assigned -- confirm whether the first call is needed.
    request.prepare_for_send()
    request.address = ('0.0.0.0', 80)
    request.prepare_for_send()
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    session = warc_recorder.new_http_recorder_session()
    session.begin_request(request)
    session.request_data(request.to_bytes())
    session.end_request(request)
    session.begin_response(response)
    session.response_data(response.to_bytes())
    session.response_data(response.body.content())
    session.end_response(response)
    session.close()

    _logger.info('FINISHED')

    warc_recorder.close()

    with open(warc_filename, 'rb') as in_file:
        warc_file_content = in_file.read()

    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo\r\n', warc_file_content)
    self.assertIn(b'Content-Type: application/warc-fields',
                  warc_file_content)
    self.assertIn(b'WARC-Date: ', warc_file_content)
    self.assertIn(b'WARC-Record-ID: <urn:uuid:', warc_file_content)
    self.assertIn(b'WARC-Block-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Payload-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Type: request\r\n', warc_file_content)
    self.assertIn(b'WARC-Target-URI: http://', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=request',
                  warc_file_content)
    self.assertIn(b'WARC-Type: response', warc_file_content)
    self.assertIn(b'WARC-Concurrent-To: <urn:uuid:', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=response',
                  warc_file_content)
    self.assertIn(
        'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
        warc_file_content)
    self.assertIn(
        'Python/{0}'.format(wpull.util.python_version()).encode('utf-8'),
        warc_file_content)
    self.assertIn(b'Extra-Field: my_extra_field', warc_file_content)
    self.assertIn(b'GET / HTTP', warc_file_content)
    self.assertIn(b'KITTEH DOGE', warc_file_content)
    self.assertIn(b'FINISHED', warc_file_content)
    self.assertIn(b'WARC-Target-URI: urn:X-wpull:log', warc_file_content)
    self.assertIn(b'Content-Length:', warc_file_content)
    self.assertNotIn(b'Content-Length: 0', warc_file_content)

    cdx_lines = cdx_file_content.split(b'\n')
    cdx_labels = cdx_lines[0].strip().split(b' ')
    cdx_fields = cdx_lines[1].split(b' ')

    print(cdx_lines)

    self.assertEqual(3, len(cdx_lines))
    self.assertEqual(10, len(cdx_labels))
    self.assertEqual(9, len(cdx_fields))
    self.assertTrue(cdx_lines[0].startswith(b' CDX'))

    self.assertEqual(b'http://example.com/', cdx_fields[0])
    self.assertEqual(b'-', cdx_fields[2])
    self.assertEqual(b'200', cdx_fields[3])
    self.assertNotEqual(b'-', cdx_fields[4])
    self.assertNotEqual(b'0', cdx_fields[5])
    self.assertNotEqual(b'0', cdx_fields[6])
    self.assertEqual(
        os.path.basename(warc_filename), cdx_fields[7].decode('ascii'))

    length = int(cdx_fields[5])
    offset = int(cdx_fields[6])

    with open(warc_filename, 'rb') as in_file:
        in_file.seek(offset)
        data = in_file.read(length)

    # Use a unittest assertion rather than a bare `assert`: it is not
    # stripped under `python -O` and reports both values on failure.
    self.assertEqual(length, len(data))

    self.assertEqual(b'WARC/1.0', data[:8])
    self.assertIn(b'KITTEH DOGE', data)

    self.validate_warc(warc_filename)