def test_record_custom_record(self):
    """PUT a custom ``resource`` record and verify the WARC record written.

    Sends a ``text/plain`` body with an extra ``WARC-Custom`` header to the
    recorder app, then re-reads the record from the WARC file registered
    under the ``META:warc`` redis hash and checks its WARC and HTTP headers
    and its payload.
    """
    # NOTE(review): another definition named test_record_custom_record
    # follows this one in the file; if both live in the same class, the
    # later definition replaces this one.
    dedup_index = self._get_dedup_index(user=False)
    warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')
    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The query parameters must be separated by '&'. The collection name was
    # previously fused into the ``url`` value by a mis-encoded '&param'
    # (rendered as a pilcrow + 'm'), which contradicts the target URI and
    # the 'META:warc' key asserted below.
    req_url = ('/live/resource/postreq?url=custom://httpbin.org'
               '&param.recorder.coll=META&put_record=resource')
    buff = b'Some Data'

    testapp = webtest.TestApp(recorder_app)
    headers = {'content-type': 'text/plain', 'WARC-Custom': 'foo'}
    resp = testapp.put(req_url, headers=headers, params=buff)

    assert resp.json['success'] == 'true'
    assert resp.json['WARC-Date'] != ''

    self._test_all_warcs('/warcs/meta', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')
    warcs = r.hgetall('META:warc')
    assert len(warcs) == 1

    warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

    # Keep the file open for as long as the record (including its payload
    # stream) is being read.
    with open(warcs[warc_key], 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(
            decomp, ensure_http_headers=True)

        # WARC-level headers.
        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert (status_headers.get_header('WARC-Block-Digest') ==
                status_headers.get_header('WARC-Payload-Digest'))
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        # Payload is stored verbatim.
        assert record.raw_stream.read() == buff

        # HTTP headers requested via ensure_http_headers=True.
        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_custom_record(self):
    """Record a custom ``resource`` record via PUT and validate it on disk.

    A ``text/plain`` payload with a ``WARC-Custom`` header is PUT to the
    recorder app for collection ``META``. The WARC file path is then looked
    up in the ``META:warc`` redis hash and the single record is parsed back
    to check WARC headers, HTTP headers and payload.
    """
    dedup_index = self._get_dedup_index(user=False)
    warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')
    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # Fixed: the separator before 'param.recorder.coll' was a mis-encoded
    # '&param' (pilcrow + 'm'), which glued the collection parameter onto
    # the ``url`` value instead of passing it as its own query parameter.
    req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'
    buff = b'Some Data'

    testapp = webtest.TestApp(recorder_app)
    headers = {
        'content-type': 'text/plain',
        'WARC-Custom': 'foo',
    }
    resp = testapp.put(req_url, headers=headers, params=buff)

    assert resp.json['success'] == 'true'
    assert resp.json['WARC-Date'] != ''

    self._test_all_warcs('/warcs/meta', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')
    warcs = r.hgetall('META:warc')
    assert len(warcs) == 1

    warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')
    expected_length = str(len(buff))

    # All reads from the record happen while the underlying file is open.
    with open(warcs[warc_key], 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(decomp, ensure_http_headers=True)

        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9

        # Headers with a known, fixed value.
        for name, value in (
            ('WARC-Type', 'resource'),
            ('WARC-Target-URI', 'custom://httpbin.org'),
        ):
            assert status_headers.get_header(name) == value

        # Generated headers: only checked for being non-empty.
        for name in ('WARC-Record-ID', 'WARC-Date', 'WARC-Block-Digest'):
            assert status_headers.get_header(name) != ''

        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')

        for name, value in (
            ('Content-Type', 'text/plain'),
            ('Content-Length', expected_length),
            ('WARC-Custom', 'foo'),
        ):
            assert status_headers.get_header(name) == value

        assert record.raw_stream.read() == buff

        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == expected_length

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_file_warc_keep_open(self):
    """Write one record to a fixed WARC path and check handle caching.

    After the write the WARC file must exist and the writer must hold
    exactly one cached file handle; after ``close()`` it must hold none.
    """
    warc_file = to_path(self.root_dir + '/warcs/A.warc.gz')
    warc_writer = MultiFileWARCWriter(warc_file)
    app = RecorderApp(self.upstream_url, warc_writer)

    response = self._test_warc_write(app, 'httpbin.org', '/get?foo=bar')
    for expected in (b'HTTP/1.1 200 OK', b'"foo": "bar"'):
        assert expected in response.body

    assert os.path.isfile(warc_file)

    # The handle stays cached until the writer is explicitly closed.
    assert len(warc_writer.fh_cache) == 1
    warc_writer.close()
    assert len(warc_writer.fh_cache) == 0
def test_record_multiple_writes_rollover_idle(self):
    """Check that closing idle files makes the next write use a new WARC.

    Two records are written to collection ``GOO`` and one WARC file is
    expected. After sleeping longer than ``max_idle_secs`` and calling
    ``close_idle_files()``, a third write is expected to bring the number
    of WARC files to two.
    """
    warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')
    dedup_index = self._get_dedup_index(user=False)
    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index, max_idle_secs=0.9)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # Extra query parameter selecting the collection. Fixed: the leading
    # '&param' had been mis-encoded as a pilcrow followed by 'm'.
    coll_param = '&param.recorder.coll=GOO'

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar', coll_param)
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?boo=far', coll_param)
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/GOO/', 1)

    # Sleep past max_idle_secs (0.9) so the open file counts as idle.
    time.sleep(1.0)
    writer.close_idle_files()

    # Third Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?goo=bar', coll_param)
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"goo": "bar"' in resp.body

    self._test_all_warcs('/warcs/GOO/', 2)

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_multiple_writes_keep_open(self):
    """Check multiple writes to one open WARC, then a reopen after close.

    Two records are written to collection ``FOO`` into a single WARC file.
    The CDXJ entries stored in redis are compared against an index built
    directly from that file. The file is then closed via ``close_key()``,
    and a further write is expected to produce a second WARC file.
    """
    warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
    rel_path = to_path(self.root_dir + '/warcs/')
    dedup_index = self._get_dedup_index(user=False)
    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # Extra query parameter selecting the collection. Fixed: the leading
    # '&param' had been mis-encoded as a pilcrow followed by 'm'.
    coll_param = '&param.recorder.coll=FOO'

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar', coll_param)
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?boo=far', coll_param)
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/FOO/', 1)

    # Check two records in WARC
    r = FakeStrictRedis.from_url('redis://localhost/2')
    res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
    fullname = coll_dir + files[0]

    # Build a CDXJ index straight from the WARC file, using the file name
    # relative to the warcs root.
    cdxout = BytesIO()
    with open(fullname, 'rb') as fh:
        filename = os.path.relpath(fullname, rel_path)
        write_cdx_index(cdxout, fh, filename,
                        cdxj=True, append_post=True, sort=True)

    res = [CDXObject(x) for x in res]
    cdxres = [CDXObject(x) for x in cdxout.getvalue().strip().split(b'\n')]
    assert cdxres == res

    # Closing by key drops the one cached file handle.
    assert len(writer.fh_cache) == 1
    writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))
    assert len(writer.fh_cache) == 0

    writer.close()

    # Writing again after the close is expected to create a second WARC.
    self._test_warc_write(recorder_app, 'httpbin.org', '/get?boo=far', coll_param)

    self._test_all_warcs('/warcs/FOO/', 2)

    warcs = r.hgetall('FOO:warc')
    assert len(warcs) == 2

    writer.close()
    assert len(writer.fh_cache) == 0