def test_another_remote_not_found(self):
    """Query the rhizome memento source (https) for a URL it does not have.

    Asserts that ``key_ts_res`` of the result is the empty string and that
    the error recorded under ``errs['source']`` names a
    ``NotFoundException`` for the timemap URL that was requested.
    """
    source = MementoIndexSource.from_timegate_url('https://webenact.rhizome.org/all/')
    url = 'http://x-not-found-x.notfound/'

    res, errs = self.query_single_source(source, dict(url=url, limit=3))

    expected = ''
    assert key_ts_res(res) == expected

    # The recorded error has the shape of an exception repr. Python < 3.7
    # renders a one-argument exception as "Exc('arg',)", while 3.7+ drops
    # the trailing comma. Normalise it so the check passes on both.
    expected_err = "NotFoundException('https://webenact.rhizome.org/all/timemap/link/http://x-not-found-x.notfound/')"
    assert errs['source'].replace(',)', ')') == expected_err
def test_another_remote_not_found(self):
    """Query the rhizome memento source (http) for a URL it does not have.

    Asserts that ``key_ts_res`` of the result is the empty string and that
    the error recorded under ``errs['source']`` names a
    ``NotFoundException`` for the timemap URL that was requested.
    """
    source = MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/all/')
    url = 'http://x-not-found-x.notfound/'

    res, errs = self.query_single_source(source, dict(url=url, limit=3))

    expected = ''
    assert key_ts_res(res) == expected

    # The recorded error has the shape of an exception repr. Python < 3.7
    # renders a one-argument exception as "Exc('arg',)", while 3.7+ drops
    # the trailing comma. Normalise it so the check passes on both.
    expected_err = "NotFoundException('http://webenact.rhizome.org/all/timemap/link/http://x-not-found-x.notfound/')"
    assert errs['source'].replace(',)', ')') == expected_err
def setup_class(cls):
    """Set up the index sources stored on ``cls.all_sources``.

    Runs the parent ``setup_class``, passes ``iana.cdxj`` and the key
    ``test:rediscdx`` to ``cls.add_cdx_to_redis``, then creates one index
    source per backend: file, redis, remote CDX and memento.
    """
    super(TestIndexSources, cls).setup_class()

    cls.add_cdx_to_redis(TEST_CDX_PATH + 'iana.cdxj', 'test:rediscdx')

    # Built one entry at a time, in the same order as the original literal.
    index_sources = {}

    index_sources['file'] = FileIndexSource(TEST_CDX_PATH + 'iana.cdxj')

    index_sources['redis'] = RedisIndexSource('redis://localhost:6379/2/test:rediscdx')

    index_sources['remote_cdx'] = RemoteIndexSource(
        'https://webenact.rhizome.org/all/cdx?url={url}',
        'https://webenact.rhizome.org/all/{timestamp}id_/{url}',
    )

    index_sources['memento'] = MementoIndexSource(
        'https://webenact.rhizome.org/all/{url}',
        'https://webenact.rhizome.org/all/timemap/link/{url}',
        'https://webenact.rhizome.org/all/{timestamp}id_/{url}',
    )

    cls.all_sources = index_sources
def add_index(self, replay, apis, pk, collection=''):
    """Create an index source for an archive and store it under ``pk``.

    Every ``{collection}`` placeholder in ``replay`` and in the API
    templates is replaced with ``collection``. The kind of index depends on
    which key is present in ``apis``:

    * ``'memento'`` -- ``MementoIndexSource`` built from the ``timegate``
      and ``timemap`` templates, each with ``'{url}'`` appended;
    * ``'cdx'`` -- ``RemoteIndexSource`` built from the ``query`` template;
    * neither -- ``WBMementoIndexSource`` with empty timegate/timemap.

    The index is saved in ``self.all_archives[pk]`` when it is truthy.
    """
    def with_collection(template):
        # Substitute the collection name into a URL template.
        return template.replace('{collection}', collection)

    replay = with_collection(replay)

    if 'memento' in apis:
        timegate = with_collection(apis['memento']['timegate']) + '{url}'
        timemap = with_collection(apis['memento']['timemap']) + '{url}'
        index = MementoIndexSource(timegate, timemap, replay)
    elif 'cdx' in apis:
        index = RemoteIndexSource(with_collection(apis['cdx']['query']), replay)
    else:
        index = WBMementoIndexSource('', '', replay)

    if index:
        self.all_archives[pk] = index
def test_agg_dir_and_memento(self):
    """Aggregate a remote memento source with the local directory loader.

    Queries ``example.com/`` closest to ``20100512`` with a limit of 6 and
    checks that the JSON-listed results are the three ``ia`` captures
    followed by the three local captures, with no errors reported.
    """
    ia_source = MementoIndexSource.from_timegate_url('http://web.archive.org/web/')
    agg_source = SimpleAggregator({'ia': ia_source, 'local': self.dir_loader})

    query = {
        'url': 'example.com/',
        'param.local.coll': '*',
        'closest': '20100512',
        'limit': 6,
    }
    res, errs = agg_source(query)

    from_ia = [
        {'source': 'ia',
         'timestamp': '20100514231857',
         'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
        {'source': 'ia',
         'timestamp': '20100519202418',
         'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
        {'source': 'ia',
         'timestamp': '20100501123414',
         'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
    ]

    from_local = [
        {'source': to_path('local:colls:C/indexes/dupes.cdxj'),
         'timestamp': '20140127171200',
         'filename': 'dupes.warc.gz'},
        {'source': to_path('local:colls:C/indexes/dupes.cdxj'),
         'timestamp': '20140127171251',
         'filename': 'dupes.warc.gz'},
        {'source': to_path('local:colls:A/indexes/example2.cdxj'),
         'timestamp': '20160225042329',
         'filename': 'example2.warc.gz'},
    ]

    exp = from_ia + from_local

    assert to_json_list(res) == exp
    assert errs == {}
from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq from pywb.warcserver.index.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource from pywb.warcserver.index.indexsource import RemoteIndexSource from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator from pywb.warcserver.index.aggregator import DirectoryIndexSource from pywb.warcserver.basewarcserver import BaseWarcServer from pywb.utils.memento import MementoUtils sources = { 'local': DirectoryIndexSource(TEST_CDX_PATH), 'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), 'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'), 'live': LiveIndexSource(), } ia_cdx = { 'ia-cdx': RemoteIndexSource('http://web.archive.org/cdx?url={url}&closest={closest}&sort=closest', 'http://web.archive.org/web/{timestamp}id_/{url}') } class TestBaseWarcServer(HttpBinLiveTests, MementoOverrideTests, FakeRedisTests, BaseTestClass): @classmethod def setup_class(cls):
from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq from pywb.warcserver.index.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource from pywb.warcserver.index.indexsource import RemoteIndexSource from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator from pywb.warcserver.index.aggregator import DirectoryIndexSource from pywb.warcserver.basewarcserver import BaseWarcServer from pywb.utils.memento import MementoUtils sources = { 'local': DirectoryIndexSource(TEST_CDX_PATH), 'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), 'rhiz': MementoIndexSource.from_timegate_url('https://webenact.rhizome.org/vvork/'), 'live': LiveIndexSource(), } ia_cdx = { 'ia-cdx': RemoteIndexSource('http://web.archive.org/cdx?url={url}&closest={closest}&sort=closest', 'http://web.archive.org/web/{timestamp}id_/{url}') } class TestBaseWarcServer(HttpBinLiveTests, MementoOverrideTests, FakeRedisTests, BaseTestClass): @classmethod def setup_class(cls):
from pywb.warcserver.test.testutils import to_json_list, to_path, TEST_CDX_PATH, MementoOverrideTests, BaseTestClass

import json
import pytest
import time
import six

from mock import patch

from pywb.warcserver.handlers import IndexHandler


# Aggregator Mappings
# One local CDXJ file plus four remote memento timegates.
sources = {
    'local': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
    'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
    'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'),
    'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),
}

# Aggregators over all sources; the gevent variant uses a 5 second timeout.
aggs = {
    'simple': SimpleAggregator(sources),
    'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
}

# Same aggregators, constructed with invert_sources=True.
aggs_inv = {
    'simple': SimpleAggregator(sources, invert_sources=True),
    'gevent': GeventTimeoutAggregator(sources, invert_sources=True, timeout=5.0),
}

# Gevent aggregator with a 0.05 second timeout
# (presumably to exercise timeout handling -- confirm against the tests).
agg_tm = {
    'gevent': GeventTimeoutAggregator(sources, timeout=0.05),
}
from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq

from pywb.warcserver.index.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from pywb.warcserver.index.indexsource import RemoteIndexSource

from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator
from pywb.warcserver.index.aggregator import DirectoryIndexSource

from pywb.warcserver.basewarcserver import BaseWarcServer

from pywb.utils.memento import MementoUtils


# Index sources: the local CDX directory, two remote memento timegates
# and a live source.
sources = {
    'local': DirectoryIndexSource(TEST_CDX_PATH),
    'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),
    'live': LiveIndexSource(),
}

# Remote CDX index on web.archive.org, queried with closest-match sorting;
# the second template is the replay URL for a given timestamp and url.
ia_cdx = {
    'ia-cdx': RemoteIndexSource(
        'http://web.archive.org/cdx?url={url}&closest={closest}&sort=closest',
        'http://web.archive.org/web/{timestamp}id_/{url}',
    ),
}