示例#1
0
    def test_warc_recorder_rollback(self):
        warc_filename = 'asdf.warc'

        with open(warc_filename, 'wb') as warc_file:
            warc_file.write(b'a' * 10)

        warc_recorder = WARCRecorder(
            warc_filename,
            params=WARCRecorderParams(
                compress=False,
            )
        )

        request = Request.new('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'KITTEH DOGE')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())

            class BadRecord(WARCRecord):
                def __init__(self, original_record):
                    super().__init__()
                    self.block_file = original_record.block_file
                    self.fields = original_record.fields

                def __iter__(self):
                    for dummy in range(1000):
                        yield b"where's my elephant?"
                    raise OSError('Oops')

            session._request_record = BadRecord(session._request_record)
            original_offset = os.path.getsize(warc_filename)

            try:
                session.request(request)
            except (OSError, IOError):
                new_offset = os.path.getsize(warc_filename)
                self.assertEqual(new_offset, original_offset)
            else:
                self.fail()

            _logger.debug('original offset {0}'.format(original_offset))
示例#2
0
    def test_warc_recorder_rollback(self):
        warc_filename = 'asdf.warc'

        with open(warc_filename, 'wb') as warc_file:
            warc_file.write(b'a' * 10)

        warc_recorder = WARCRecorder(warc_filename,
                                     params=WARCRecorderParams(
                                         compress=False, ))

        request = Request.new('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'KITTEH DOGE')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())

            class BadRecord(WARCRecord):
                def __init__(self, original_record):
                    super().__init__()
                    self.block_file = original_record.block_file
                    self.fields = original_record.fields

                def __iter__(self):
                    for dummy in range(1000):
                        yield b"where's my elephant?"
                    raise OSError('Oops')

            session._request_record = BadRecord(session._request_record)
            original_offset = os.path.getsize(warc_filename)

            try:
                session.request(request)
            except (OSError, IOError):
                new_offset = os.path.getsize(warc_filename)
                self.assertEqual(new_offset, original_offset)
            else:
                self.fail()

            _logger.debug('original offset {0}'.format(original_offset))
示例#3
0
    def test_warc_move(self):
        file_prefix = 'asdf'
        warc_filename = 'asdf.warc'
        cdx_filename = 'asdf.cdx'

        os.mkdir('./blah/')

        warc_recorder = WARCRecorder(
            file_prefix,
            params=WARCRecorderParams(
                compress=False,
                cdx=True,
                move_to='./blah/'
            ),
        )

        warc_recorder.close()

        self.assertTrue(os.path.exists('./blah/' + warc_filename))
        self.assertTrue(os.path.exists('./blah/' + cdx_filename))
示例#4
0
    def test_warc_move_max_size(self):
        file_prefix = 'asdf'
        cdx_filename = 'asdf.cdx'

        os.mkdir('./blah/')

        warc_recorder = WARCRecorder(
            file_prefix,
            params=WARCRecorderParams(
                compress=False,
                cdx=True,
                move_to='./blah/',
                max_size=1,
            ),
        )

        request = Request.new('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'BLAH')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        warc_recorder.close()

        self.assertTrue(os.path.exists('./blah/asdf-00000.warc'))
        self.assertTrue(os.path.exists('./blah/asdf-00001.warc'))
        self.assertTrue(os.path.exists('./blah/asdf-meta.warc'))
        self.assertTrue(os.path.exists('./blah/' + cdx_filename))
示例#5
0
    def test_warc_recorder(self):
        temp_file = tempfile.NamedTemporaryFile()
        warc_recorder = WARCRecorder(
            temp_file.name, compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx_filename=temp_file.name + '.cdx',
        )

        request = Request.new('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'KITTEH DOGE')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open(temp_file.name, 'rb') as in_file:
            warc_file_content = in_file.read()

        with open(temp_file.name + '.cdx', 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
        self.assertIn(b'Content-Type: application/warc-fields',
            warc_file_content)
        self.assertIn(b'WARC-Date: ', warc_file_content)
        self.assertIn(b'WARC-Record-ID: <urn:uuid:', warc_file_content)
        self.assertIn(b'WARC-Block-Digest: sha1:', warc_file_content)
        self.assertIn(b'WARC-Payload-Digest: sha1:', warc_file_content)
        self.assertIn(b'WARC-Type: request', warc_file_content)
        self.assertIn(b'WARC-Target-URI: http://', warc_file_content)
        self.assertIn(b'Content-Type: application/http;msgtype=request',
            warc_file_content)
        self.assertIn(b'WARC-Type: response', warc_file_content)
        self.assertIn(b'WARC-Concurrent-To: <urn:uuid:', warc_file_content)
        self.assertIn(b'Content-Type: application/http;msgtype=response',
            warc_file_content)
        self.assertIn(
            'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
            warc_file_content
        )
        self.assertIn(
            'Python/{0}'.format(wpull.util.python_version()).encode('utf-8'),
            warc_file_content
        )
        self.assertIn(b'Extra-Field: my_extra_field', warc_file_content)
        self.assertIn(b'GET / HTTP', warc_file_content)
        self.assertIn(b'KITTEH DOGE', warc_file_content)
        self.assertIn(b'FINISHED', warc_file_content)
        self.assertIn(b'WARC-Target-URI: urn:X-wpull:log', warc_file_content)
        self.assertIn(b'Content-Length:', warc_file_content)
        self.assertNotIn(b'Content-Length: 0', warc_file_content)

        cdx_lines = cdx_file_content.split(b'\n')
        cdx_labels = cdx_lines[0].strip().split(b' ')
        cdx_fields = cdx_lines[1].split(b' ')

        print(cdx_lines)

        self.assertEqual(3, len(cdx_lines))
        self.assertEqual(10, len(cdx_labels))
        self.assertEqual(9, len(cdx_fields))
        self.assertTrue(cdx_lines[0].startswith(b' CDX'))

        self.assertEqual(b'http://example.com/', cdx_fields[0])
        self.assertEqual(b'-', cdx_fields[2])
        self.assertEqual(b'200', cdx_fields[3])
        self.assertNotEqual(b'-', cdx_fields[4])
        self.assertNotEqual(b'0', cdx_fields[5])
        self.assertNotEqual(b'0', cdx_fields[6])
        self.assertEqual(
            os.path.basename(temp_file.name), cdx_fields[7].decode('ascii'))

        length = int(cdx_fields[5])
        offset = int(cdx_fields[6])

        with open(temp_file.name, 'rb') as in_file:
            in_file.seek(offset)
            data = in_file.read(length)

            assert len(data) == length

        self.assertEqual(b'WARC/1.0', data[:8])

        self.assertIn(b'KITTEH DOGE', data)
示例#6
0
    def test_cdx_dedup(self):
        url_table = URLTable()
        warc_recorder = WARCRecorder(
            'asdf',
            params=WARCRecorderParams(
                compress=False, cdx=True, url_table=url_table
            )
        )

        url_table.add_visits([
            (
                'http://example.com/fennec',
                '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
                'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ'
            )
        ])

        request = Request.new('http://example.com/fennec')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')
        revisit_response_header_size = len(response.header())

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'kitbit')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        request = Request.new('http://example.com/horse')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'kitbit')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf.warc', 'rb') as in_file:
            warc_file_content = in_file.read()

        with open('asdf.cdx', 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: revisit\r\n', warc_file_content)
        self.assertIn(
            b'WARC-Refers-To: '
                b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            warc_file_content
        )
        self.assertIn(b'WARC-Truncated: length\r\n', warc_file_content)
        self.assertIn(
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
                b'identical-payload-digest\r\n',
            warc_file_content
        )
        self.assertIn(
            b'Content-Length: ' +
                str(revisit_response_header_size).encode('ascii') + b'\r\n',
            warc_file_content
        )
        self.assertIn(
            b'WARC-Target-URI: http://example.com/fennec\r\n',
            warc_file_content
        )
        self.assertIn(
            b'WARC-Target-URI: http://example.com/horse\r\n', warc_file_content
        )
        self.assertEqual(
            1,
            warc_file_content.count(b'kitbit')
        )

        self.assertIn(b'http://example.com/horse ', cdx_file_content)
示例#7
0
    def test_warc_recorder_max_size(self):
        file_prefix = 'asdf'
        cdx_filename = 'asdf.cdx'

        warc_recorder = WARCRecorder(
            file_prefix,
            params=WARCRecorderParams(
                compress=False,
                extra_fields=[('Extra-field', 'my_extra_field')],
                cdx=True, max_size=1,
            )
        )

        request = Request.new('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'KITTEH DOGE')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        request = Request.new('http://example.com/2')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'DOGE KITTEH')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf-00000.warc', 'rb') as in_file:
            warc_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
        self.assertIn(b'KITTEH DOGE', warc_file_content)

        with open('asdf-00001.warc', 'rb') as in_file:
            warc_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
        self.assertIn(b'DOGE KITTEH', warc_file_content)

        with open(cdx_filename, 'rb') as in_file:
            cdx_file_content = in_file.read()

        cdx_lines = cdx_file_content.split(b'\n')
        cdx_labels = cdx_lines[0].strip().split(b' ')

        print(cdx_lines)

        self.assertEqual(4, len(cdx_lines))
        self.assertEqual(10, len(cdx_labels))

        self.assertIn(b'http://example.com/1', cdx_file_content)
        self.assertIn(b'http://example.com/2', cdx_file_content)

        with open('asdf-meta.warc', 'rb') as in_file:
            meta_file_content = in_file.read()

        self.assertIn(b'FINISHED', meta_file_content)
示例#8
0
    def test_warc_recorder(self):
        file_prefix = 'asdf'
        warc_filename = 'asdf.warc'
        cdx_filename = 'asdf.cdx'

        warc_recorder = WARCRecorder(
            file_prefix,
            params=WARCRecorderParams(
                compress=False,
                extra_fields=[('Extra-field', 'my_extra_field')],
                cdx=True,
            ),
        )

        request = Request.new('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'KITTEH DOGE')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open(warc_filename, 'rb') as in_file:
            warc_file_content = in_file.read()

        with open(cdx_filename, 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: warcinfo\r\n', warc_file_content)
        self.assertIn(b'Content-Type: application/warc-fields',
                      warc_file_content)
        self.assertIn(b'WARC-Date: ', warc_file_content)
        self.assertIn(b'WARC-Record-ID: <urn:uuid:', warc_file_content)
        self.assertIn(b'WARC-Block-Digest: sha1:', warc_file_content)
        self.assertIn(b'WARC-Payload-Digest: sha1:', warc_file_content)
        self.assertIn(b'WARC-Type: request\r\n', warc_file_content)
        self.assertIn(b'WARC-Target-URI: http://', warc_file_content)
        self.assertIn(b'Content-Type: application/http;msgtype=request',
                      warc_file_content)
        self.assertIn(b'WARC-Type: response', warc_file_content)
        self.assertIn(b'WARC-Concurrent-To: <urn:uuid:', warc_file_content)
        self.assertIn(b'Content-Type: application/http;msgtype=response',
                      warc_file_content)
        self.assertIn(
            'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
            warc_file_content)
        self.assertIn(
            'Python/{0}'.format(wpull.util.python_version()).encode('utf-8'),
            warc_file_content)
        self.assertIn(b'Extra-Field: my_extra_field', warc_file_content)
        self.assertIn(b'GET / HTTP', warc_file_content)
        self.assertIn(b'KITTEH DOGE', warc_file_content)
        self.assertIn(b'FINISHED', warc_file_content)
        self.assertIn(b'WARC-Target-URI: urn:X-wpull:log', warc_file_content)
        self.assertIn(b'Content-Length:', warc_file_content)
        self.assertNotIn(b'Content-Length: 0', warc_file_content)

        cdx_lines = cdx_file_content.split(b'\n')
        cdx_labels = cdx_lines[0].strip().split(b' ')
        cdx_fields = cdx_lines[1].split(b' ')

        print(cdx_lines)

        self.assertEqual(3, len(cdx_lines))
        self.assertEqual(10, len(cdx_labels))
        self.assertEqual(9, len(cdx_fields))
        self.assertTrue(cdx_lines[0].startswith(b' CDX'))

        self.assertEqual(b'http://example.com/', cdx_fields[0])
        self.assertEqual(b'-', cdx_fields[2])
        self.assertEqual(b'200', cdx_fields[3])
        self.assertNotEqual(b'-', cdx_fields[4])
        self.assertNotEqual(b'0', cdx_fields[5])
        self.assertNotEqual(b'0', cdx_fields[6])
        self.assertEqual(os.path.basename(warc_filename),
                         cdx_fields[7].decode('ascii'))

        length = int(cdx_fields[5])
        offset = int(cdx_fields[6])

        with open(warc_filename, 'rb') as in_file:
            in_file.seek(offset)
            data = in_file.read(length)

            assert len(data) == length

        self.assertEqual(b'WARC/1.0', data[:8])

        self.assertIn(b'KITTEH DOGE', data)
示例#9
0
    def test_cdx_dedup(self):
        url_table = URLTable()
        warc_recorder = WARCRecorder('asdf',
                                     params=WARCRecorderParams(
                                         compress=False,
                                         cdx=True,
                                         url_table=url_table))

        url_table.add_visits([
            ('http://example.com/fennec',
             '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
             'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ')
        ])

        request = Request.new('http://example.com/fennec')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')
        revisit_response_header_size = len(response.header())

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'kitbit')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        request = Request.new('http://example.com/horse')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'kitbit')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf.warc', 'rb') as in_file:
            warc_file_content = in_file.read()

        with open('asdf.cdx', 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: revisit\r\n', warc_file_content)
        self.assertIn(
            b'WARC-Refers-To: '
            b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            warc_file_content)
        self.assertIn(b'WARC-Truncated: length\r\n', warc_file_content)
        self.assertIn(
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
            b'identical-payload-digest\r\n', warc_file_content)
        self.assertIn(
            b'Content-Length: ' +
            str(revisit_response_header_size).encode('ascii') + b'\r\n',
            warc_file_content)
        self.assertIn(b'WARC-Target-URI: http://example.com/fennec\r\n',
                      warc_file_content)
        self.assertIn(b'WARC-Target-URI: http://example.com/horse\r\n',
                      warc_file_content)
        self.assertEqual(1, warc_file_content.count(b'kitbit'))

        self.assertIn(b'http://example.com/horse ', cdx_file_content)
示例#10
0
    def test_warc_recorder_max_size(self):
        file_prefix = 'asdf'
        cdx_filename = 'asdf.cdx'

        warc_recorder = WARCRecorder(file_prefix,
                                     params=WARCRecorderParams(
                                         compress=False,
                                         extra_fields=[('Extra-field',
                                                        'my_extra_field')],
                                         cdx=True,
                                         max_size=1,
                                     ))

        request = Request.new('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'KITTEH DOGE')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        request = Request.new('http://example.com/2')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(b'DOGE KITTEH')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf-00000.warc', 'rb') as in_file:
            warc_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
        self.assertIn(b'KITTEH DOGE', warc_file_content)

        with open('asdf-00001.warc', 'rb') as in_file:
            warc_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
        self.assertIn(b'DOGE KITTEH', warc_file_content)

        with open(cdx_filename, 'rb') as in_file:
            cdx_file_content = in_file.read()

        cdx_lines = cdx_file_content.split(b'\n')
        cdx_labels = cdx_lines[0].strip().split(b' ')

        print(cdx_lines)

        self.assertEqual(4, len(cdx_lines))
        self.assertEqual(10, len(cdx_labels))

        self.assertIn(b'http://example.com/1', cdx_file_content)
        self.assertIn(b'http://example.com/2', cdx_file_content)

        with open('asdf-meta.warc', 'rb') as in_file:
            meta_file_content = in_file.read()

        self.assertIn(b'FINISHED', meta_file_content)