Exemplo n.º 1
0
    def test_record_custom_record(self):
        dedup_index = self._get_dedup_index(user=False)

        warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

        buff = b'Some Data'

        testapp = webtest.TestApp(recorder_app)
        headers = {'content-type': 'text/plain', 'WARC-Custom': 'foo'}

        resp = testapp.put(req_url, headers=headers, params=buff)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('META:warc')
        assert len(warcs) == 1

        warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

        with open(warcs[warc_key], 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(
                decomp, ensure_http_headers=True)

        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header(
            'WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header(
            'WARC-Block-Digest') == status_headers.get_header(
                'WARC-Payload-Digest')
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == buff

        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2

        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

        writer.close()
        assert len(writer.fh_cache) == 0
Exemplo n.º 2
0
    def test_record_custom_record(self):
        dedup_index = self._get_dedup_index(user=False)

        warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

        buff = b'Some Data'

        testapp = webtest.TestApp(recorder_app)
        headers = {'content-type': 'text/plain',
                   'WARC-Custom': 'foo'
                  }

        resp = testapp.put(req_url, headers=headers, params=buff)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('META:warc')
        assert len(warcs) == 1

        warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

        with open(warcs[warc_key], 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp, ensure_http_headers=True)

        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == buff

        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2

        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

        writer.close()
        assert len(writer.fh_cache) == 0
Exemplo n.º 3
0
    def test_record_file_warc_keep_open(self):
        path = to_path(self.root_dir + '/warcs/A.warc.gz')
        writer = MultiFileWARCWriter(path)
        recorder_app = RecorderApp(self.upstream_url, writer)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        assert os.path.isfile(path)
        assert len(writer.fh_cache) == 1

        writer.close()
        assert len(writer.fh_cache) == 0
Exemplo n.º 4
0
    def test_record_file_warc_keep_open(self):
        path = to_path(self.root_dir + '/warcs/A.warc.gz')
        writer = MultiFileWARCWriter(path)
        recorder_app = RecorderApp(self.upstream_url, writer)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        assert os.path.isfile(path)
        assert len(writer.fh_cache) == 1

        writer.close()
        assert len(writer.fh_cache) == 0
Exemplo n.º 5
0
    def test_record_multiple_writes_rollover_idle(self):
        warc_path = to_path(self.root_dir +
                            '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = to_path(self.root_dir + '/warcs/')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path,
                                     dedup_index=dedup_index,
                                     max_idle_secs=0.9)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar',
                                     '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?boo=far',
                                     '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        self._test_all_warcs('/warcs/GOO/', 1)

        time.sleep(1.0)
        writer.close_idle_files()

        # Third Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?goo=bar',
                                     '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"goo": "bar"' in resp.body

        self._test_all_warcs('/warcs/GOO/', 2)

        writer.close()
        assert len(writer.fh_cache) == 0
Exemplo n.º 6
0
    def test_record_multiple_writes_rollover_idle(self):
        warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = to_path(self.root_dir + '/warcs/')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index, max_idle_secs=0.9)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?foo=bar', '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?boo=far', '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        self._test_all_warcs('/warcs/GOO/', 1)

        time.sleep(1.0)
        writer.close_idle_files()

        # Third Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?goo=bar', '&param.recorder.coll=GOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"goo": "bar"' in resp.body

        self._test_all_warcs('/warcs/GOO/', 2)

        writer.close()
        assert len(writer.fh_cache) == 0
Exemplo n.º 7
0
    def test_record_multiple_writes_keep_open(self):
        warc_path = to_path(self.root_dir +
                            '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = to_path(self.root_dir + '/warcs/')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?foo=bar',
                                     '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?boo=far',
                                     '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        r = FakeStrictRedis.from_url('redis://localhost/2')
        res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        fullname = coll_dir + files[0]

        cdxout = BytesIO()
        with open(fullname, 'rb') as fh:
            filename = os.path.relpath(fullname, rel_path)
            write_cdx_index(cdxout,
                            fh,
                            filename,
                            cdxj=True,
                            append_post=True,
                            sort=True)

        res = [CDXObject(x) for x in res]

        cdxres = cdxout.getvalue().strip()
        cdxres = cdxres.split(b'\n')
        cdxres = [CDXObject(x) for x in cdxres]

        assert cdxres == res

        assert len(writer.fh_cache) == 1

        writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

        assert len(writer.fh_cache) == 0

        writer.close()

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                     '/get?boo=far',
                                     '&param.recorder.coll=FOO')

        self._test_all_warcs('/warcs/FOO/', 2)

        warcs = r.hgetall('FOO:warc')
        assert len(warcs) == 2

        writer.close()
        assert len(writer.fh_cache) == 0
Exemplo n.º 8
0
    def test_record_multiple_writes_keep_open(self):
        warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = to_path(self.root_dir + '/warcs/')

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?foo=bar', '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body


        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?boo=far', '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        r = FakeStrictRedis.from_url('redis://localhost/2')
        res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        fullname = coll_dir + files[0]

        cdxout = BytesIO()
        with open(fullname, 'rb') as fh:
            filename = os.path.relpath(fullname, rel_path)
            write_cdx_index(cdxout, fh, filename,
                            cdxj=True, append_post=True, sort=True)

        res = [CDXObject(x) for x in res]

        cdxres = cdxout.getvalue().strip()
        cdxres = cdxres.split(b'\n')
        cdxres = [CDXObject(x) for x in cdxres]

        assert cdxres == res

        assert len(writer.fh_cache) == 1

        writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

        assert len(writer.fh_cache) == 0

        writer.close()

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                '/get?boo=far', '&param.recorder.coll=FOO')

        self._test_all_warcs('/warcs/FOO/', 2)

        warcs = r.hgetall('FOO:warc')
        assert len(warcs) == 2

        writer.close()
        assert len(writer.fh_cache) == 0