def test_single_file_combined(self):
    """Check allow/block/exclude resolution from a single combined .aclj file."""
    agg = SimpleAggregator(
        {'source': FileAccessIndexSource(TEST_EXCL_PATH + 'list1.aclj')})
    checker = AccessChecker(agg, default_access='block')

    # (url, expected urlkey, expected access), checked in the original order
    cases = [
        ('http://example.com/abc/page.html', 'com,example)/abc/page.html', 'allow'),
        ('http://example.com/abc/page.htm', 'com,example)/abc', 'block'),
        ('http://example.com/abc/', 'com,example)/abc', 'block'),
        ('http://foo.example.com/', 'com,example,', 'exclude'),
        ('http://example.com/', 'com,', 'allow'),
        ('foo.net', '', 'block'),
        ('https://example.net/abc/path/other', '', 'block'),
    ]

    for url, expected_urlkey, expected_access in cases:
        rule = checker.find_access_rule(url)
        assert rule['urlkey'] == expected_urlkey
        assert rule['access'] == expected_access
def load_auto_colls(self):
    """Build the auto-collection resource handler rooted at ``self.root_dir``.

    Returns None (after logging a message) when no root dir is configured.
    """
    if not self.root_dir:
        print('No Root Dir, Skip Auto Colls!')
        return

    directory_index = CacheDirectoryIndexSource(base_prefix=self.root_dir,
                                                base_dir=self.index_paths,
                                                config=self.config)

    checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths),
                            self.default_access)

    # Combine with a redis dedup index when one is configured; otherwise
    # the directory index is used on its own.
    if self.dedup_index_url:
        index_source = SimpleAggregator({
            'dedup': RedisMultiKeyIndexSource(self.dedup_index_url),
            'dir': directory_index,
        })
    else:
        index_source = directory_index

    return DefaultResourceHandler(index_source,
                                  self.archive_paths,
                                  rules_file=self.rules_file,
                                  access_checker=checker)
def test_blocks_only(self):
    """Check exclude/block rules with the default (allow) fallback access."""
    agg = SimpleAggregator(
        {'source': FileAccessIndexSource(TEST_EXCL_PATH + 'blocks.aclj')})
    checker = AccessChecker(agg)

    # (url, expected urlkey, expected access), checked in the original order
    cases = [
        ('https://example.com/foo', 'com,example)/foo', 'exclude'),
        ('https://example.com/food', 'com,example)/foo', 'exclude'),
        ('https://example.com/foo/path', 'com,example)/foo', 'exclude'),
        ('https://example.net/abc/path', 'net,example)/abc/path', 'block'),
        ('https://example.net/abc/path/other', 'net,example)/abc/path', 'block'),
        ('https://example.net/fo', '', 'allow'),
    ]

    for url, expected_urlkey, expected_access in cases:
        rule = checker.find_access_rule(url)
        assert rule['urlkey'] == expected_urlkey
        assert rule['access'] == expected_access
def make_live_app():
    """Return a BaseWarcServer exposing a single /live route over the live index."""
    live_handler = DefaultResourceHandler(
        SimpleAggregator({'live': LiveIndexSource()}))

    app = BaseWarcServer()
    app.add_route('/live', live_handler)
    return app
def test_extra_agg_collB(self):
    """Query the directory aggregator restricted to collection 'B'."""
    aggregator = SimpleAggregator({'dir': self.dir_loader})
    results, errors = aggregator({'url': 'iana.org/', 'param.coll': 'B'})

    expected = [{'source': to_path('dir:colls:B/indexes/iana.cdxj'),
                 'timestamp': '20140126200624',
                 'filename': 'iana.warc.gz'}]

    assert to_json_list(results) == expected
    assert errors == {}
def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None):
    """Construct an aggregator over the index sources named in *source_configs*.

    Each config value is resolved via init_index_source(); the resulting mapping
    is wrapped in a GeventTimeoutAggregator (honoring *timeout*) when
    *use_gevent* is set, otherwise in a plain SimpleAggregator.
    """
    sources = {name: init_index_source(config, source_list=source_list)
               for name, config in iteritems(source_configs)}

    if use_gevent:
        return GeventTimeoutAggregator(sources, timeout=timeout)

    return SimpleAggregator(sources)
def setup(self):
    """Wire /upstream and /upstream_opt routes against the test server."""
    base_url = 'http://localhost:{0}'.format(self.server.port)

    upstream_source = SimpleAggregator(
        {'upstream': UpstreamAggIndexSource(base_url + '/live')})
    upstream_opt_source = SimpleAggregator({
        'upstream_opt': UpstreamMementoIndexSource.upstream_resource(base_url + '/live')
    })

    app = BaseWarcServer()
    app.add_route('/upstream', DefaultResourceHandler(upstream_source))
    app.add_route('/upstream_opt', DefaultResourceHandler(upstream_opt_source))

    self.base_url = base_url
    self.testapp = webtest.TestApp(app)
def __init__(self, *args, **kwargs):
    """Initialize the redis-backed indexer and its recorder-side settings.

    All configuration is keyword-based; positional *args are accepted but
    not used (matching the original interface).
    """
    super(WritableRedisIndexer, self).__init__(kwargs.get('redis_url'),
                                               kwargs.get('redis'),
                                               kwargs.get('cdx_key_template'))

    # Self-referencing aggregator used for dedup lookups against this index
    source_name = kwargs.get('name', 'recorder')
    self.cdx_lookup = SimpleAggregator({source_name: self})

    # Path/key templates for recorded warcs; empty string when unset
    self.rel_path_template = kwargs.get('rel_path_template', '')
    self.file_key_template = kwargs.get('file_key_template', '')
    self.full_warc_prefix = kwargs.get('full_warc_prefix', '')

    self.dupe_policy = kwargs.get('dupe_policy', WriteRevisitDupePolicy())
def test_agg_dir_and_memento(self):
    """Aggregate a remote memento timegate with the local directory loader."""
    agg = SimpleAggregator({
        'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
        'local': self.dir_loader,
    })

    results, errors = agg({'url': 'example.com/',
                           'param.local.coll': '*',
                           'closest': '20100512',
                           'limit': 6})

    expected = [
        {'source': 'ia', 'timestamp': '20100514231857',
         'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
        {'source': 'ia', 'timestamp': '20100519202418',
         'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
        {'source': 'ia', 'timestamp': '20100501123414',
         'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
        {'source': to_path('local:colls:C/indexes/dupes.cdxj'),
         'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
        {'source': to_path('local:colls:C/indexes/dupes.cdxj'),
         'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
        {'source': to_path('local:colls:A/indexes/example2.cdxj'),
         'timestamp': '20160225042329', 'filename': 'example2.warc.gz'},
    ]

    assert to_json_list(results) == expected
    assert errors == {}
def setup_class(cls):
    """Build the test BaseWarcServer app and register every route this suite uses.

    NOTE(review): relies on module-level 'sources' and 'ia_cdx' mappings
    defined elsewhere in this file.
    """
    super(TestBaseWarcServer, cls).setup_class()

    # /live: live-web index source
    live_source = SimpleAggregator({'live': LiveIndexSource()})
    live_handler = DefaultResourceHandler(live_source)
    app = BaseWarcServer()
    app.add_route('/live', live_handler)

    # /many: gevent-timeout aggregation over the module-level 'sources' mapping
    source1 = GeventTimeoutAggregator(sources)
    handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH)
    app.add_route('/many', handler1)

    # /cdx_api: module-level 'ia_cdx' source wrapped in a simple aggregator
    app.add_route(
        '/cdx_api',
        DefaultResourceHandler(SimpleAggregator(ia_cdx), TEST_WARC_PATH))

    # /posttest: file index containing POST-request records
    source2 = SimpleAggregator(
        {'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')})
    handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH)
    app.add_route('/posttest', handler2)

    # example2 file index, reused by several routes below
    source3 = SimpleAggregator(
        {'example': FileIndexSource(TEST_CDX_PATH + 'example2.cdxj')})
    handler3 = DefaultResourceHandler(source3, TEST_WARC_PATH)

    # Handler sequences, in listed order: /fallback includes live, /seq does not
    app.add_route('/fallback', HandlerSeq([handler3, handler2, live_handler]))
    app.add_route('/seq', HandlerSeq([handler3, handler2]))

    # /allredis: same example index, warc records resolved via a redis key
    app.add_route(
        '/allredis',
        DefaultResourceHandler(source3, 'redis://localhost/2/test:warc'))

    # /empty: a handler sequence containing no handlers
    app.add_route('/empty', HandlerSeq([]))

    # /invalid: deliberately broken handler — list-wrapped aggregator whose
    # source is a plain string rather than a callable index source
    app.add_route(
        '/invalid',
        DefaultResourceHandler(
            [SimpleAggregator({'invalid': 'should not be a callable'})]))

    # /urlagnost: url-agnostic index with an {arg} placeholder in the redis path
    url_agnost = SimpleAggregator({
        'url-agnost': FileIndexSource(TEST_CDX_PATH + 'url-agnost-example.cdxj')
    })
    app.add_route(
        '/urlagnost',
        DefaultResourceHandler(url_agnost, 'redis://localhost/2/test:{arg}:warc'))

    cls.testapp = webtest.TestApp(app)
def test_allows_only_default_block(self):
    """With default_access='block', only explicit allow rules should match."""
    agg = SimpleAggregator(
        {'source': FileAccessIndexSource(TEST_EXCL_PATH + 'allows.aclj')})
    checker = AccessChecker(agg, default_access='block')

    # (url, expected urlkey, expected access or None when only urlkey is
    # checked), in the original order
    cases = [
        ('http://example.net', 'net,', None),
        ('http://foo.example.net/abc', 'net,', None),
        ('https://example.net/test/', 'net,example)/test', None),
        ('https://example.org/', '', 'block'),
        ('https://abc.domain.net/path', 'net,domain,', None),
        ('https://domain.neta/path', '', 'block'),
    ]

    for url, expected_urlkey, expected_access in cases:
        rule = checker.find_access_rule(url)
        assert rule['urlkey'] == expected_urlkey
        if expected_access is not None:
            assert rule['access'] == expected_access
def __init__(self):
    """Configure the warcserver app: redis-backed index sources, extract/patch
    aggregators over web-archive sources, and the replay/patch route handlers.

    Side effects: sets the module-level PROXY_PREFIX from CACHE_PROXY_URL.
    Reads REDIS_BASE_URL from the environment (KeyError if unset).
    """
    init_logging()
    config = load_wr_config()

    app = BaseWarcServer(debug=True)

    # Redis key URLs derived from the configured key templates
    redis_base = os.environ['REDIS_BASE_URL'] + '/'
    rec_url = redis_base + config['cdxj_key_templ']
    coll_url = redis_base + config['coll_cdxj_key_templ']
    warc_url = redis_base + config['warc_key_templ']
    rec_list_key = config['rec_list_key_templ']

    redis_resolver = RedisResolver(redis_url=warc_url,
                                   member_key_templ=rec_list_key)
    # Share the resolver's redis connection with the index sources below
    redis = redis_resolver.redis
    warc_resolvers = [redis_resolver]

    cache_proxy_url = os.environ.get('CACHE_PROXY_URL', '')
    global PROXY_PREFIX
    PROXY_PREFIX = cache_proxy_url

    timeout = 20.0

    # Per-recording and per-collection cdxj indexes in redis
    rec_redis_source = RedisIndexSource(timeout=timeout,
                                        redis_url=rec_url,
                                        redis=redis)

    coll_redis_source = RedisIndexSource(timeout=timeout,
                                         redis_url=coll_url,
                                         redis=redis)

    # Live Web
    live_rec = DefaultResourceHandler(
        SimpleAggregator({'live': LiveIndexSource()}, ),
        warc_resolvers,
        cache_proxy_url)

    # Extractable archives (all available)
    wam_loader = WAMSourceLoader(memento_cls=ProxyMementoIndexSource,
                                 remote_cls=ProxyRemoteIndexSource,
                                 wb_memento_cls=ProxyWBMementoIndexSource)

    extractable_archives = wam_loader.sources

    # Extract Source
    extractor = GeventTimeoutAggregator(extractable_archives,
                                        timeout=timeout)

    extract_primary = DefaultResourceHandler(extractor,
                                             warc_resolvers,
                                             cache_proxy_url)

    # Patch fallback archives
    fallback_archives = self.filter_archives(extractable_archives,
                                             config['patch_archives_index'])

    # patch + live
    #patch_archives = fallback_archives.copy()
    # NOTE(review): patch_archives aliases fallback_archives (no copy), so
    # adding 'live' below also mutates fallback_archives — confirm intended.
    patch_archives = fallback_archives
    patch_archives['live'] = LiveIndexSource()

    # Inverted-source extractor: queries everything EXCEPT the named sources
    extractor2 = GeventTimeoutAggregator(patch_archives,
                                         timeout=timeout,
                                         sources_key='inv_sources',
                                         invert_sources=True)

    extract_other = DefaultResourceHandler(extractor2,
                                           warc_resolvers,
                                           cache_proxy_url)

    patcher = GeventTimeoutAggregator(patch_archives,
                                      timeout=timeout)

    patch_rec = DefaultResourceHandler(patcher,
                                       warc_resolvers,
                                       cache_proxy_url)

    # Single Rec Replay
    replay_rec = DefaultResourceHandler(
        SimpleAggregator({'local': rec_redis_source}),
        warc_resolvers,
        cache_proxy_url)

    # Coll Replay
    replay_coll = DefaultResourceHandler(
        SimpleAggregator({'local': coll_redis_source}),
        warc_resolvers,
        cache_proxy_url)

    app.add_route('/live', live_rec)
    app.add_route('/extract',
                  HandlerSeq([extract_primary, extract_other, replay_rec]))
    app.add_route('/replay', replay_rec)
    app.add_route('/replay-coll', replay_coll)
    app.add_route('/patch', HandlerSeq([replay_coll, patch_rec]))

    self.app = app
def query_single_source(source, params):
    """Query a single index source.

    Wraps *source* in a one-entry SimpleAggregator (under the key 'source')
    and invokes it with *params*, returning the aggregator's result.
    """
    # Removed dead code: a str(source) result was computed and discarded.
    return SimpleAggregator({'source': source})(params)
'local': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'), 'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), 'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'), 'bl': MementoIndexSource.from_timegate_url( 'http://www.webarchive.org.uk/wayback/archive/'), 'rhiz': MementoIndexSource.from_timegate_url('https://webenact.rhizome.org/vvork/', path='*') } aggs = { 'simple': SimpleAggregator(sources), 'gevent': GeventTimeoutAggregator(sources, timeout=5.0), } aggs_inv = { 'simple': SimpleAggregator(sources, invert_sources=True), 'gevent': GeventTimeoutAggregator(sources, invert_sources=True, timeout=5.0), } agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.05)} nf = {'notfound': FileIndexSource('testdata/not-found-x')} agg_nf = { 'simple': SimpleAggregator(nf),
def setup_class(cls):
    """Create the shared echo-params aggregator and fuzzy matcher fixtures."""
    cls.fuzzy = FuzzyMatcher()
    cls.source = SimpleAggregator({'source': EchoParamsSource()})
from mock import patch

from pywb.warcserver.handlers import IndexHandler

# Aggregator Mappings
# Named index sources shared by the tests in this module: one local CDXJ
# file plus several remote memento timegates.
sources = {
    'local': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
    'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
    'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'),
    'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*')
}

# Same sources behind both aggregator implementations
aggs = {'simple': SimpleAggregator(sources),
        'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
       }

# Inverted variants: query all sources EXCEPT those named in the request
aggs_inv = {'simple': SimpleAggregator(sources, invert_sources=True),
            'gevent': GeventTimeoutAggregator(sources, invert_sources=True, timeout=5.0),
           }

# Very short timeout, used to exercise timeout handling
agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.05)}

# A source pointing at a file that does not exist, for error-path tests
nf = {'notfound': FileIndexSource('testdata/not-found-x')}
agg_nf = {'simple': SimpleAggregator(nf),
          'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
         }
def do_query(self, params):
    """Run *params* through a single XML-query index source."""
    xml_source = XmlQueryIndexSource('http://localhost:8080/path')
    return SimpleAggregator({'source': xml_source})(params)