def test_record_warc_2(self):
    """Record ``/get?foo=bar`` with a per-record writer restricted to 'live'.

    The response returned by ``_test_warc_write`` must contain the upstream
    status line and the echoed query argument. The final
    ``_test_all_warcs`` call presumably checks that ``/warcs/`` now holds
    two WARC files -- confirm against the helper.
    """
    warc_dir = to_path(self.root_dir + '/warcs/')
    writer = PerRecordWARCWriter(warc_dir)
    app = RecorderApp(self.upstream_url, writer, accept_colls='live')

    response = self._test_warc_write(app, 'httpbin.org', '/get?foo=bar')
    body = response.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    self._test_all_warcs('/warcs/', 2)
def test_error_url(self):
    """A request through a broken upstream URL must fail with HTTP 400.

    The upstream URL is altered by appending ``'01'``; the recorder is
    expected to answer with status 400 and a non-empty JSON ``error``
    field. The WARC count passed to ``_test_all_warcs`` stays at 2,
    presumably meaning nothing new was written -- confirm against the
    helper.
    """
    broken_upstream = self.upstream_url + '01'
    writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
    app = RecorderApp(broken_upstream, writer, accept_colls='live')

    client = webtest.TestApp(app)
    # webtest itself asserts the status code when ``status`` is given.
    response = client.get('/live/resource?url=http://example.com/',
                          status=400)

    assert response.json['error'] != ''

    self._test_all_warcs('/warcs/', 2)
def test_record_param_user_coll_revisit(self):
    """Re-record ``/user-agent`` for USER/COLL and expect a revisit record.

    Starts from one WARC already present under ``/warcs/USER/COLL/``
    (presumably left by an earlier test in this class -- confirm), records
    the same URL again and then checks:

    * the Redis CDXJ index ``USER:COLL:cdxj`` holds two entries, one of
      them with mime ``warc/revisit``;
    * the ``USER:COLL:warc`` hash maps the revisit's filename to its full
      path on disk;
    * the stored record carries the WARC revisit / refers-to headers.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    # The extra params are appended to the query string, so each one must
    # start with a literal '&' (an HTML-entity round trip had turned
    # '&param' into a pilcrow followed by 'm').
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    # Either of the two index lines may be the revisit entry.
    revisit_line = res[0] if b'warc/revisit' in res[0] else res[1]
    cdx = CDXObject(revisit_line)

    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

    warcs = r.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)

        # Test refers-to headers
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert status_headers.get_header('WARC-Type') == 'revisit'
        assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Refers-To-Date') != ''
def test_record_file_warc_keep_open(self):
    """Write to a single fixed WARC file and check handle caching.

    After one recorded request the target file must exist and the
    writer's ``fh_cache`` must hold exactly one entry; ``close()`` must
    empty that cache.
    """
    warc_file = to_path(self.root_dir + '/warcs/A.warc.gz')
    writer = MultiFileWARCWriter(warc_file)
    app = RecorderApp(self.upstream_url, writer)

    response = self._test_warc_write(app, 'httpbin.org', '/get?foo=bar')
    body = response.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    assert os.path.isfile(warc_file)
    assert len(writer.fh_cache) == 1

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_multiple_writes_rollover_idle(self):
    """Idle WARC files are closed, so a later write starts a new file.

    Two records are written back to back for collection ``GOO``, with one
    WARC file expected afterwards. The test then sleeps past the writer's
    ``max_idle_secs`` (0.9s), calls ``close_idle_files()`` and writes a
    third record, after which two WARC files are expected.
    """
    warc_path = to_path(
        self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

    dedup_index = self._get_dedup_index(user=False)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index,
                                 max_idle_secs=0.9)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The extra params are appended to the query string and therefore
    # need a leading '&' (the literal had been entity-mangled).
    coll_param = '&param.recorder.coll=GOO'

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/GOO/', 1)

    # Sleep slightly longer than max_idle_secs so the open file counts
    # as idle when close_idle_files() runs.
    time.sleep(1.0)
    writer.close_idle_files()

    # Third Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?goo=bar', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"goo": "bar"' in resp.body

    self._test_all_warcs('/warcs/GOO/', 2)

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_param_user_coll_same_dir(self):
    """Two collections of one user record into the same directory.

    The writer path has no ``{user}``/``{coll}`` placeholders; the writer
    is instead given ``key_template='{user}:{coll}'``. The same URL is
    recorded once for ``COLL2`` and once for ``COLL3``, and ``/warcs2``
    is then expected to hold two WARC files.
    """
    warc_path = to_path(self.root_dir + '/warcs2/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index,
                            key_template='{user}:{coll}'))

    # Each extra param needs a literal '&' separator; the original
    # literal had been entity-mangled into a pilcrow followed by 'm'.
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/get?foo=bar',
        '&param.recorder.user=USER2&param.recorder.coll=COLL2')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/get?foo=bar',
        '&param.recorder.user=USER2&param.recorder.coll=COLL3')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs2', 2)
def init_recorder(self):
    """Create the dedup index, WARC writer and recorder app.

    Stores the results on the instance as ``self.dedup_index``,
    ``self.writer`` and ``self.recorder``.
    """
    self.dedup_index = self.init_indexer()

    writer_options = dict(
        dir_template=self.warc_path_templ,
        filename_template=self.warc_name_templ,
        dedup_index=self.dedup_index,
        redis=self.redis,
        skip_key_templ=self.skip_key_templ,
        key_template=self.info_keys['rec'],
        header_filter=ExcludeHttpOnlyCookieHeaders(),
    )
    warc_writer = SkipCheckingMultiFileWARCWriter(**writer_options)
    self.writer = warc_writer

    self.recorder = RecorderApp(
        self.upstream_url,
        warc_writer,
        accept_colls='(live|mount:)',
        create_buff_func=self.create_buffer,
    )
def test_record_video_metadata(self):
    """Record youtube-dl metadata and check the stored metadata record.

    Skipped when ``youtube_dl`` is not importable. The request is made
    for user ``USER`` / collection ``VIDEO`` so that the resulting WARC is
    registered under the Redis hash ``USER:VIDEO:warc`` read back below.
    """
    pytest.importorskip('youtube_dl')
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The user must be 'USER' to match the 'USER:VIDEO:warc' key checked
    # below; the literal had been replaced by a '******' mask.
    params = {
        'param.recorder.user': 'USER',
        'param.recorder.coll': 'VIDEO',
        'content_type': 'application/vnd.youtube-dl_formats+json',
    }

    self._test_warc_write(
        recorder_app, 'www.youtube.com', '/v/BfBgWtAIbRc',
        '&' + urlencode(params),
        link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

    r = FakeStrictRedis.from_url('redis://localhost/2')

    warcs = r.hgetall('USER:VIDEO:warc')
    assert len(warcs) == 1

    # The hash values are the WARC paths; exactly one is present here.
    filename = list(warcs.values())[0]

    with open(filename, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(decomp)

    status_headers = record.rec_headers
    assert status_headers.get_header('WARC-Type') == 'metadata'
    assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
    assert status_headers.get_header('WARC-Block-Digest') != ''
    assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
def test_record_param_user_coll(self):
    """Record ``/user-agent`` into a ``{user}/{coll}`` templated path.

    After one write for USER/COLL the test expects one WARC under
    ``/warcs/USER/COLL/``, a single CDXJ entry in ``USER:COLL:cdxj``
    describing it, and a ``USER:COLL:warc`` hash containing exactly that
    file mapped to its full path.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    # Nothing recorded for this user/collection yet.
    self._test_all_warcs('/warcs/USER/COLL/', None)

    # Each extra param needs a literal '&' separator; the original
    # literal had been entity-mangled into a pilcrow followed by 'm'.
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 1

    cdx = CDXObject(res[0])
    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'application/json'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    warcs = r.hgetall('USER:COLL:warc')
    full_path = to_path(self.root_dir + '/warcs/' + cdx['filename'])
    assert warcs == {
        cdx['filename'].encode('utf-8'): full_path.encode('utf-8'),
    }
def test_record_cookies_header(self):
    """Cookies must appear both in the live response and the stored WARC.

    Requests httpbin's ``/cookies/set`` endpoint (the query is
    percent-encoded inside the path argument), which answers with a 302
    and two ``Set-Cookie`` headers. Those headers are checked on the
    record parsed from the response body and on the stored response
    record; the stored request record must carry the ``X-Other`` and
    ``Cookie`` headers (presumably sent by ``_test_warc_write`` --
    confirm against the helper).
    """
    base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
    app = RecorderApp(self.upstream_url,
                      PerRecordWARCWriter(base_path),
                      accept_colls='live')

    response = self._test_warc_write(
        app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
    assert b'HTTP/1.1 302' in response.body

    name_cookie = ('Set-Cookie', 'name=value; Path=/')
    foo_cookie = ('Set-Cookie', 'foo=bar; Path=/')

    # Headers as returned directly to the client.
    live_record = ArcWarcRecordLoader().parse_record_stream(
        BytesIO(response.body))
    assert name_cookie in live_record.http_headers.headers
    assert foo_cookie in live_record.http_headers.headers

    # Headers as written to disk.
    stored_req, stored_resp = self._load_resp_req(base_path)

    assert name_cookie in stored_resp.http_headers.headers
    assert foo_cookie in stored_resp.http_headers.headers

    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookiecheck/', 1)
def init_recorder(self):
    """Create the dedup index, WARC writer and recorder app.

    Stores the results on the instance as ``self.dedup_index``,
    ``self.writer`` and ``self.recorder``. The recorder is given two skip
    filters and uses the writer's own ``create_write_buffer`` as its
    buffer factory.
    """
    self.dedup_index = self.init_indexer()

    writer_options = dict(
        dir_template=self.warc_path_templ,
        dedup_index=self.dedup_index,
        redis=self.redis,
        key_template=self.info_keys['rec'],
        header_filter=ExcludeHttpOnlyCookieHeaders(),
        config=self.config,
    )
    warc_writer = SkipCheckingMultiFileWARCWriter(**writer_options)
    self.writer = warc_writer

    filters = [SkipRangeRequestFilter(), ExtractPatchingFilter()]

    # NOTE: accept_colls is intentionally not passed here (it was
    # commented out in the original call).
    self.recorder = RecorderApp(
        self.upstream_url,
        warc_writer,
        skip_filters=filters,
        create_buff_func=warc_writer.create_write_buffer,
    )
def test_record_param_user_coll_skip(self):
    """With ``SkipDupePolicy`` a duplicate capture writes nothing new.

    The WARC count under ``/warcs/USER/COLL/`` and the number of entries
    in ``USER:COLL:cdxj`` are both expected to stay at 2 after recording
    ``/user-agent`` again (the two existing captures presumably come from
    earlier tests in this class -- confirm).
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    # Baseline count before the duplicate request.
    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Each extra param needs a literal '&' separator; the original
    # literal had been entity-mangled into a pilcrow followed by 'm'.
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text

    # No new entries written
    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2
def test_record_multiple_writes_keep_open(self):
    """Consecutive writes share one open WARC until the key is closed.

    Two records for collection ``FOO`` must land in a single WARC file.
    The Redis CDXJ entries are compared with an index generated from that
    file via ``write_cdx_index``. After ``close_key()`` the handle cache
    is empty, and a further write produces a second WARC file.
    """
    warc_path = to_path(
        self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

    rel_path = to_path(self.root_dir + '/warcs/')

    dedup_index = self._get_dedup_index(user=False)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The extra params are appended to the query string and therefore
    # need a leading '&' (the literal had been entity-mangled).
    coll_param = '&param.recorder.coll=FOO'

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/FOO/', 1)

    # Check two records in WARC
    r = FakeStrictRedis.from_url('redis://localhost/2')
    res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
    fullname = coll_dir + files[0]

    # Re-index the WARC from disk, using the filename relative to the
    # warcs root.
    cdxout = BytesIO()
    with open(fullname, 'rb') as fh:
        filename = os.path.relpath(fullname, rel_path)
        write_cdx_index(cdxout, fh, filename,
                        cdxj=True, append_post=True, sort=True)

    res = [CDXObject(x) for x in res]

    cdxres = cdxout.getvalue().strip()
    cdxres = cdxres.split(b'\n')
    cdxres = [CDXObject(x) for x in cdxres]

    assert cdxres == res

    assert len(writer.fh_cache) == 1

    writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

    assert len(writer.fh_cache) == 0

    writer.close()

    # A write after closing must open a new file.
    self._test_warc_write(recorder_app, 'httpbin.org',
                          '/get?boo=far', coll_param)

    self._test_all_warcs('/warcs/FOO/', 2)

    warcs = r.hgetall('FOO:warc')
    assert len(warcs) == 2

    writer.close()
    assert len(writer.fh_cache) == 0
def init_recorder(self, recorder_config):
    """Set up the recorder app, its WARC writer and its server.

    A falsy ``recorder_config`` disables recording: ``self.recorder`` and
    ``self.recorder_path`` are set to ``None`` and nothing else is done.

    Options read from a dict config: ``source_coll`` (required),
    ``cache``, ``dedup_policy`` (``none``/``keep``/``revisit``/``skip``),
    ``rollover_size``, ``rollover_idle_secs``, ``filename_template``,
    ``source_filter``, ``pending_timeout`` and
    ``enable_put_custom_record``.

    :param str|dict|None recorder_config: Either the source collection
        name, or a dict of recorder options
    :raises Exception: if ``dedup_policy`` has an unrecognised value
    :rtype: None
    """
    if not recorder_config:
        self.recorder = None
        self.recorder_path = None
        return

    # A bare string is shorthand for "record from this collection with
    # default options".
    if isinstance(recorder_config, str):
        recorder_coll, recorder_config = recorder_config, {}
    else:
        recorder_coll = recorder_config['source_coll']

    # cache mode
    self.rec_cache_mode = recorder_config.get('cache', 'default')

    dedup_policy = recorder_config.get('dedup_policy')
    if dedup_policy == 'none':
        dedup_policy = ''

    # Only the 'skip' policy dedups by URL.
    dedup_by_url = False

    if dedup_policy == 'keep':
        dedup_policy = WriteDupePolicy()
    elif dedup_policy == 'revisit':
        dedup_policy = WriteRevisitDupePolicy()
    elif dedup_policy == 'skip':
        dedup_policy = SkipDupePolicy()
        dedup_by_url = True
    elif dedup_policy:
        raise Exception(
            'Invalid option for dedup_policy: {0}'.format(dedup_policy))

    dedup_index = None
    if dedup_policy:
        dedup_index = WritableRedisIndexer(
            redis_url=self.warcserver.dedup_index_url,
            dupe_policy=dedup_policy,
            rel_path_template=self.warcserver.root_dir + '/{coll}/archive')

    warc_writer = MultiFileWARCWriter(
        self.warcserver.archive_paths,
        max_size=int(recorder_config.get('rollover_size', 1000000000)),
        max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
        filename_template=recorder_config.get('filename_template'),
        dedup_index=dedup_index,
        dedup_by_url=dedup_by_url)

    create_buff_func = None
    if dedup_policy:
        # The pending-counter key is derived from the dedup index URL.
        pending_counter = self.warcserver.dedup_index_url.replace(
            ':cdxj', ':pending')
        pending_timeout = recorder_config.get('pending_timeout', 30)

        def create_buff_func(params, name):
            return RedisPendingCounterTempBuffer(
                512 * 1024, pending_counter, params, name, pending_timeout)

    self.recorder = RecorderApp(
        self.RECORD_SERVER % str(self.warcserver_server.port),
        warc_writer,
        accept_colls=recorder_config.get('source_filter'),
        create_buff_func=create_buff_func)

    # port=0 is passed, and the port actually used is read back from the
    # server object.
    recorder_server = GeventServer(self.recorder, port=0)

    self.recorder_path = self.RECORD_API % (recorder_server.port,
                                            recorder_coll)

    # enable PUT of custom data as 'resource' records
    if recorder_config.get('enable_put_custom_record'):
        self.put_custom_record_path = (
            self.recorder_path + '&put_record={rec_type}&url={url}')
def init_recorder(self, recorder_config):
    """Set up the recorder app, its WARC writer and its server.

    A falsy ``recorder_config`` disables recording: ``self.recorder`` and
    ``self.recorder_path`` are set to ``None`` and nothing else is done.

    Options read from a dict config: ``source_coll`` (required),
    ``cache``, ``dedup_policy`` (``none``/``keep``/``revisit``/``skip``),
    ``rollover_size``, ``rollover_idle_secs``, ``filename_template`` and
    ``source_filter``.

    :param str|dict|None recorder_config: Either the source collection
        name, or a dict of recorder options
    :raises Exception: if ``dedup_policy`` has an unrecognised value
    :rtype: None
    """
    if not recorder_config:
        self.recorder = None
        self.recorder_path = None
        return

    # A bare string is shorthand for "record from this collection with
    # default options".
    if isinstance(recorder_config, str):
        recorder_coll, recorder_config = recorder_config, {}
    else:
        recorder_coll = recorder_config['source_coll']

    # cache mode
    self.rec_cache_mode = recorder_config.get('cache', 'default')

    dedup_policy = recorder_config.get('dedup_policy')
    if dedup_policy == 'none':
        dedup_policy = ''

    # Only the 'skip' policy dedups by URL.
    dedup_by_url = False

    if dedup_policy == 'keep':
        dedup_policy = WriteDupePolicy()
    elif dedup_policy == 'revisit':
        dedup_policy = WriteRevisitDupePolicy()
    elif dedup_policy == 'skip':
        dedup_policy = SkipDupePolicy()
        dedup_by_url = True
    elif dedup_policy:
        raise Exception(
            'Invalid option for dedup_policy: {0}'.format(dedup_policy))

    dedup_index = None
    if dedup_policy:
        dedup_index = WritableRedisIndexer(
            redis_url=self.warcserver.dedup_index_url,
            dupe_policy=dedup_policy,
            rel_path_template=self.warcserver.root_dir + '/{coll}/archive')

    warc_writer = MultiFileWARCWriter(
        self.warcserver.archive_paths,
        max_size=int(recorder_config.get('rollover_size', 1000000000)),
        max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
        filename_template=recorder_config.get('filename_template'),
        dedup_index=dedup_index,
        dedup_by_url=dedup_by_url)

    self.recorder = RecorderApp(
        self.RECORD_SERVER % str(self.warcserver_server.port),
        warc_writer,
        accept_colls=recorder_config.get('source_filter'))

    # port=0 is passed, and the port actually used is read back from the
    # server object.
    recorder_server = GeventServer(self.recorder, port=0)

    self.recorder_path = self.RECORD_API % (recorder_server.port,
                                            recorder_coll)