def test_record_custom_record(self):
    """PUT a custom payload and check it is stored as a WARC ``resource`` record.

    The request targets ``custom://httpbin.org`` and passes ``META`` as the
    recorder collection.  The WARC file registered under the ``META:warc``
    redis hash is then parsed and its WARC headers, HTTP headers (requested
    via ``ensure_http_headers=True``) and payload are verified.
    """
    dedup_index = self._get_dedup_index(user=False)

    warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')
    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # Query arguments must be '&'-separated: the collection is its own
    # 'param.recorder.coll' argument, otherwise it would become part of the
    # 'url' value and the WARC-Target-URI assertion below could not hold.
    req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

    buff = b'Some Data'

    testapp = webtest.TestApp(recorder_app)
    headers = {'content-type': 'text/plain', 'WARC-Custom': 'foo'}
    resp = testapp.put(req_url, headers=headers, params=buff)

    assert resp.json['success'] == 'true'
    assert resp.json['WARC-Date'] != ''

    self._test_all_warcs('/warcs/meta', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')
    warcs = r.hgetall('META:warc')
    assert len(warcs) == 1

    warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

    # All reads from the record happen while the underlying file is open.
    with open(warcs[warc_key], 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(
            decomp, ensure_http_headers=True)

        # WARC-level headers
        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert (status_headers.get_header('WARC-Block-Digest') ==
                status_headers.get_header('WARC-Payload-Digest'))
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        # Payload is stored unchanged
        assert record.raw_stream.read() == buff

        # HTTP-level headers
        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_custom_record(self):
    """Record a custom PUT body as a ``resource`` record and inspect the WARC.

    Sends ``b'Some Data'`` for ``custom://httpbin.org`` with the recorder
    collection set to ``META``, then loads the single WARC listed in the
    ``META:warc`` redis hash and checks its WARC headers, its HTTP headers
    (requested via ``ensure_http_headers=True``) and the raw payload.
    """
    dedup_index = self._get_dedup_index(user=False)

    warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')
    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # 'param.recorder.coll' has to be a separate '&'-delimited argument;
    # if it were glued onto 'url', the target URI asserted below would not
    # be the bare 'custom://httpbin.org'.
    req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

    buff = b'Some Data'

    testapp = webtest.TestApp(recorder_app)
    headers = {
        'content-type': 'text/plain',
        'WARC-Custom': 'foo',
    }
    resp = testapp.put(req_url, headers=headers, params=buff)

    assert resp.json['success'] == 'true'
    assert resp.json['WARC-Date'] != ''

    self._test_all_warcs('/warcs/meta', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')
    warcs = r.hgetall('META:warc')
    assert len(warcs) == 1

    warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

    # Keep the file open for every read performed on the parsed record.
    with open(warcs[warc_key], 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(
            decomp, ensure_http_headers=True)

        # WARC-level headers
        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''

        block_digest = status_headers.get_header('WARC-Block-Digest')
        assert block_digest != ''
        assert block_digest == status_headers.get_header('WARC-Payload-Digest')

        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        # Payload round-trips unchanged
        assert record.raw_stream.read() == buff

        # HTTP-level headers
        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_file_warc_keep_open(self):
    """Write one record to a fixed WARC path and check the handle stays cached.

    After the write the WARC file must exist and ``writer.fh_cache`` must
    hold exactly one entry; ``writer.close()`` must empty the cache.
    """
    warc_file = to_path(self.root_dir + '/warcs/A.warc.gz')
    warc_writer = MultiFileWARCWriter(warc_file)
    app = RecorderApp(self.upstream_url, warc_writer)

    response = self._test_warc_write(app, 'httpbin.org', '/get?foo=bar')

    # Both the status line and the echoed query argument must be present.
    for expected in (b'HTTP/1.1 200 OK', b'"foo": "bar"'):
        assert expected in response.body

    assert os.path.isfile(warc_file)

    # One open handle while recording, none after closing the writer.
    assert len(warc_writer.fh_cache) == 1
    warc_writer.close()
    assert len(warc_writer.fh_cache) == 0
def main():
    """Build a ``RecorderApp`` that records into a fresh temporary directory.

    The directory is created with ``tempfile.mkdtemp`` and removed again by
    an ``atexit`` hook.  The ``rec:cdxj`` and ``rec:warc`` keys are deleted
    from the redis database at ``redis://localhost/2`` before a
    ``WritableRedisIndexer`` using ``SkipDupePolicy`` is created.

    :return: the recorder app, accepting the ``live`` collection
    """
    upstream_url = 'http://localhost:8080'

    target = tempfile.mkdtemp(prefix='tmprec') + '/'
    print('Recording to ' + target)

    # atexit.register returns the function it is given, so it can be applied
    # as a decorator to register the cleanup hook.
    @atexit.register
    def rm_target():
        print('Removing ' + target)
        shutil.rmtree(target)

    # Start from a clean index: drop any previous cdxj / warc entries.
    local_r = redis.StrictRedis.from_url('redis://localhost/2')
    for stale_key in ('rec:cdxj', 'rec:warc'):
        local_r.delete(stale_key)

    dedup_index = WritableRedisIndexer(
        redis_url='redis://localhost/2/rec:cdxj',
        file_key_template='rec:warc',
        rel_path_template=target,
        dupe_policy=SkipDupePolicy(),
    )

    warc_writer = MultiFileWARCWriter(target, dedup_index=dedup_index)
    return RecorderApp(upstream_url, warc_writer, accept_colls='live')
def init_recorder(self, recorder_config):
    """Initialize the recording functionality of pywb.

    If ``recorder_config`` is falsy, ``self.recorder`` and
    ``self.recorder_path`` are set to None and nothing else happens.
    A string config is used as the collection name with default settings;
    otherwise the collection name is read from the ``source_coll`` key.
    """
    if not recorder_config:
        self.recorder = None
        self.recorder_path = None
        return

    if isinstance(recorder_config, str):
        # Bare collection name: every other option falls back to its default.
        recorder_coll, recorder_config = recorder_config, {}
    else:
        recorder_coll = recorder_config['source_coll']

    # TODO: support dedup
    dedup_index = None

    archive_paths = self.warcserver.archive_paths
    rollover_size = int(recorder_config.get('rollover_size', 1000000000))
    rollover_idle_secs = int(recorder_config.get('rollover_idle_secs', 600))
    filename_template = recorder_config.get('filename_template')

    warc_writer = MultiFileWARCWriter(archive_paths,
                                      max_size=rollover_size,
                                      max_idle_secs=rollover_idle_secs,
                                      filename_template=filename_template,
                                      dedup_index=dedup_index)

    upstream = self.RECORD_SERVER % str(self.warcserver_server.port)
    self.recorder = RecorderApp(upstream, warc_writer,
                                accept_colls=recorder_config.get('source_filter'))

    # The recorder is served by its own GeventServer (created with port=0);
    # the port it reports is baked into the recorder path.
    recorder_server = GeventServer(self.recorder, port=0)
    self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
def init_recorder(self, recorder_config):
    """Create the recorder app and its server from ``recorder_config``.

    A falsy config disables recording (``self.recorder`` and
    ``self.recorder_path`` become None).  A string config names the
    collection directly; a mapping supplies it under ``source_coll`` and
    may also set ``rollover_size``, ``rollover_idle_secs`` and
    ``filename_template``.
    """
    if not recorder_config:
        self.recorder = None
        self.recorder_path = None
        return

    config_is_name = isinstance(recorder_config, str)
    if config_is_name:
        recorder_coll = recorder_config
        recorder_config = {}
    else:
        recorder_coll = recorder_config['source_coll']

    # TODO: support dedup
    dedup_index = None

    warc_writer = MultiFileWARCWriter(
        self.warcserver.archive_paths,
        max_size=int(recorder_config.get('rollover_size', 1000000000)),
        max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
        filename_template=recorder_config.get('filename_template'),
        dedup_index=dedup_index,
    )

    record_server_url = self.RECORD_SERVER % str(self.warcserver_server.port)
    self.recorder = RecorderApp(record_server_url, warc_writer)

    # Serve the recorder through a GeventServer created with port=0 and
    # build the recorder path from the port that server reports.
    server = GeventServer(self.recorder, port=0)
    self.recorder_path = self.RECORD_API % (server.port, recorder_coll)
def test_record_multiple_writes_rollover_idle(self):
    """Check that closing idle files makes the next write start a new WARC.

    Two records are written back to back and must share one WARC file.
    After sleeping past ``max_idle_secs`` and calling
    ``writer.close_idle_files()``, a third record must produce a second
    WARC file in the ``GOO`` directory.
    """
    warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

    dedup_index = self._get_dedup_index(user=False)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index, max_idle_secs=0.9)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The extra argument is passed as '&param.recorder.coll=GOO', i.e. with
    # the '&' that makes it a separate query argument.
    # NOTE(review): presumably _test_warc_write appends it to the request
    # URL — confirm against the helper.
    coll_param = '&param.recorder.coll=GOO'

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar', coll_param)
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?boo=far', coll_param)
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/GOO/', 1)

    # Sleep longer than max_idle_secs (0.9) before closing idle files.
    time.sleep(1.0)
    writer.close_idle_files()

    # Third Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?goo=bar', coll_param)
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"goo": "bar"' in resp.body

    self._test_all_warcs('/warcs/GOO/', 2)

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_multiple_writes_keep_open(self):
    """Check that consecutive writes reuse one open WARC until it is closed.

    Two records are written and must land in a single WARC whose CDXJ index
    (regenerated with ``write_cdx_index``) equals the two entries stored
    under ``FOO:cdxj`` in redis.  After ``close_key`` empties the handle
    cache, a further write must create a second WARC, and ``FOO:warc`` must
    then list two files.
    """
    warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
    rel_path = to_path(self.root_dir + '/warcs/')

    dedup_index = self._get_dedup_index(user=False)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The extra argument is passed as '&param.recorder.coll=FOO', i.e. with
    # the '&' that makes it a separate query argument; the FOO:cdxj and
    # FOO:warc keys checked below are named after that collection.
    coll_param = '&param.recorder.coll=FOO'

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar', coll_param)
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?boo=far', coll_param)
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/FOO/', 1)

    # Check two records in WARC
    r = FakeStrictRedis.from_url('redis://localhost/2')
    res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
    fullname = coll_dir + files[0]

    # Re-index the WARC from disk and compare with what redis holds.
    cdxout = BytesIO()
    with open(fullname, 'rb') as fh:
        filename = os.path.relpath(fullname, rel_path)
        write_cdx_index(cdxout, fh, filename,
                        cdxj=True, append_post=True, sort=True)

    res = [CDXObject(x) for x in res]
    cdxres = [CDXObject(x) for x in cdxout.getvalue().strip().split(b'\n')]
    assert cdxres == res

    # The single WARC is still held open until its key is closed.
    assert len(writer.fh_cache) == 1

    writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

    assert len(writer.fh_cache) == 0

    writer.close()

    # A write after closing must open a new WARC file.
    self._test_warc_write(recorder_app, 'httpbin.org', '/get?boo=far', coll_param)

    self._test_all_warcs('/warcs/FOO/', 2)

    warcs = r.hgetall('FOO:warc')
    assert len(warcs) == 2

    writer.close()
    assert len(writer.fh_cache) == 0
def init_recorder(self, recorder_config):
    """Set up recording for pywb from ``recorder_config``.

    A falsy config turns recording off (``self.recorder`` and
    ``self.recorder_path`` are set to None).  A string config is the
    collection name; a dict must provide it under ``source_coll`` and may
    set ``cache``, ``dedup_policy`` (``none``, ``keep``, ``revisit`` or
    ``skip``), ``rollover_size``, ``rollover_idle_secs``,
    ``filename_template``, ``pending_timeout``, ``source_filter`` and
    ``enable_put_custom_record``.

    :param str|dict|None recorder_config: The configuration for the recorder app
    :raises Exception: if ``dedup_policy`` is set to an unrecognized value
    :rtype: None
    """
    if not recorder_config:
        self.recorder = None
        self.recorder_path = None
        return

    if isinstance(recorder_config, str):
        recorder_coll, recorder_config = recorder_config, {}
    else:
        recorder_coll = recorder_config['source_coll']

    # cache mode
    self.rec_cache_mode = recorder_config.get('cache', 'default')

    # Map the configured policy name onto a policy object; 'none' is
    # treated the same as an unset policy.
    policy = recorder_config.get('dedup_policy')
    dedup_by_url = False

    if policy == 'none':
        policy = ''

    if policy == 'keep':
        policy = WriteDupePolicy()
    elif policy == 'revisit':
        policy = WriteRevisitDupePolicy()
    elif policy == 'skip':
        policy = SkipDupePolicy()
        dedup_by_url = True
    elif policy:
        raise Exception('Invalid option for dedup_policy: {0}'.format(policy))

    dedup_index = None
    if policy:
        dedup_index = WritableRedisIndexer(
            redis_url=self.warcserver.dedup_index_url,
            dupe_policy=policy,
            rel_path_template=self.warcserver.root_dir + '/{coll}/archive',
        )

    archive_paths = self.warcserver.archive_paths
    max_size = int(recorder_config.get('rollover_size', 1000000000))
    max_idle_secs = int(recorder_config.get('rollover_idle_secs', 600))
    warc_writer = MultiFileWARCWriter(
        archive_paths,
        max_size=max_size,
        max_idle_secs=max_idle_secs,
        filename_template=recorder_config.get('filename_template'),
        dedup_index=dedup_index,
        dedup_by_url=dedup_by_url,
    )

    if policy:
        # The pending-counter key is derived from the dedup index url by
        # swapping its ':cdxj' part for ':pending'.
        pending_counter = self.warcserver.dedup_index_url.replace(':cdxj', ':pending')
        pending_timeout = recorder_config.get('pending_timeout', 30)

        def create_buff_func(params, name):
            return RedisPendingCounterTempBuffer(512 * 1024, pending_counter,
                                                 params, name, pending_timeout)
    else:
        create_buff_func = None

    upstream = self.RECORD_SERVER % str(self.warcserver_server.port)
    self.recorder = RecorderApp(upstream, warc_writer,
                                accept_colls=recorder_config.get('source_filter'),
                                create_buff_func=create_buff_func)

    recorder_server = GeventServer(self.recorder, port=0)
    self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)

    # enable PUT of custom data as 'resource' records
    if recorder_config.get('enable_put_custom_record'):
        put_suffix = '&put_record={rec_type}&url={url}'
        self.put_custom_record_path = self.recorder_path + put_suffix
def init_recorder(self, recorder_config):
    """Set up recording for pywb from ``recorder_config``.

    A falsy config turns recording off (``self.recorder`` and
    ``self.recorder_path`` are set to None).  A string config is the
    collection name; a dict must provide it under ``source_coll`` and may
    set ``cache``, ``dedup_policy`` (``none``, ``keep``, ``revisit`` or
    ``skip``), ``rollover_size``, ``rollover_idle_secs``,
    ``filename_template`` and ``source_filter``.

    :param str|dict|None recorder_config: The configuration for the recorder app
    :raises Exception: if ``dedup_policy`` is set to an unrecognized value
    :rtype: None
    """
    if not recorder_config:
        self.recorder = None
        self.recorder_path = None
        return

    if isinstance(recorder_config, str):
        recorder_coll, recorder_config = recorder_config, {}
    else:
        recorder_coll = recorder_config['source_coll']

    # cache mode
    self.rec_cache_mode = recorder_config.get('cache', 'default')

    # Resolve the configured policy name to a policy object; 'none' is
    # handled like an unset policy.
    policy = recorder_config.get('dedup_policy')
    dedup_by_url = False

    if policy == 'none':
        policy = ''

    if policy == 'keep':
        policy = WriteDupePolicy()
    elif policy == 'revisit':
        policy = WriteRevisitDupePolicy()
    elif policy == 'skip':
        policy = SkipDupePolicy()
        dedup_by_url = True
    elif policy:
        raise Exception('Invalid option for dedup_policy: {0}'.format(policy))

    # A dedup index is only created when a policy is in effect.
    dedup_index = None
    if policy:
        dedup_index = WritableRedisIndexer(
            redis_url=self.warcserver.dedup_index_url,
            dupe_policy=policy,
            rel_path_template=self.warcserver.root_dir + '/{coll}/archive',
        )

    archive_paths = self.warcserver.archive_paths
    max_size = int(recorder_config.get('rollover_size', 1000000000))
    max_idle_secs = int(recorder_config.get('rollover_idle_secs', 600))
    warc_writer = MultiFileWARCWriter(
        archive_paths,
        max_size=max_size,
        max_idle_secs=max_idle_secs,
        filename_template=recorder_config.get('filename_template'),
        dedup_index=dedup_index,
        dedup_by_url=dedup_by_url,
    )

    upstream = self.RECORD_SERVER % str(self.warcserver_server.port)
    self.recorder = RecorderApp(upstream, warc_writer,
                                accept_colls=recorder_config.get('source_filter'))

    recorder_server = GeventServer(self.recorder, port=0)
    self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
def close_file(actual_self, filename):
    """Close ``filename`` through ``MultiFileWARCWriter.close_file`` and record it.

    After delegating to the real implementation, asserts that the writer
    reports no open files and sets the module-level ``all_closed`` flag.
    """
    global all_closed

    MultiFileWARCWriter.close_file(actual_self, filename)

    still_open = list(actual_self.iter_open_files())
    assert still_open == []

    all_closed = True