Пример #1
0
    def test_agg_collB_found(self):
        """Querying iana.org/ in collection B returns its single index record."""
        query = {'url': 'iana.org/', 'param.coll': 'B'}
        results, errors = self.dir_loader(query)

        expected = [
            {
                'source': to_path('colls:B/indexes/iana.cdxj'),
                'timestamp': '20140126200624',
                'filename': 'iana.warc.gz',
            },
        ]

        assert to_json_list(results) == expected
        assert errors == {}
Пример #2
0
    def test_redis_not_found(self, indexloader):
        """A bare example.com/ query is expected to give no records and no errors."""
        results, errors = indexloader({'url': 'example.com/'})

        # Errors are checked first, then the (empty) record list.
        assert errors == {}
        assert to_json_list(results) == []
Пример #3
0
    def test_agg_collB(self):
        """Querying example.com/ in collection B returns nothing and no errors."""
        query = {'url': 'example.com/', 'param.coll': 'B'}
        results, errors = self.dir_loader(query)

        assert to_json_list(results) == []
        assert errors == {}
Пример #4
0
    def test_mem_agg_index_3(self, agg):
        """Closest-to-20141001 lookup of vvork.com, limited to five records.

        The expected records come from the 'rhiz', 'ia' and 'ait' sources,
        in the order listed below, and no source reports an error.
        """
        results, errors = agg({
            'url': 'http://vvork.com/',
            'closest': '20141001',
            'limit': 5,
        })

        expected = [
            {'timestamp': '20141006184357',
             'load_url': 'https://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/',
             'source': 'rhiz'},
            {'timestamp': '20141018133107',
             'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/',
             'source': 'ia'},
            {'timestamp': '20141020161243',
             'load_url': 'http://web.archive.org/web/20141020161243id_/http://vvork.com/',
             'source': 'ia'},
            {'timestamp': '20140806161228',
             'load_url': 'http://web.archive.org/web/20140806161228id_/http://vvork.com/',
             'source': 'ia'},
            {'timestamp': '20131004231540',
             'load_url': 'http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/',
             'source': 'ait'},
        ]

        assert to_json_list(results) == expected
        assert errors == {}
Пример #5
0
    def test_agg_collA_found(self):
        """Querying example.com/ in collection A returns its single index record."""
        query = {'url': 'example.com/', 'param.coll': 'A'}
        results, errors = self.dir_loader(query)

        expected = [
            {
                'source': to_path('colls:A/indexes/example2.cdxj'),
                'timestamp': '20160225042329',
                'filename': 'example2.warc.gz',
            },
        ]

        assert to_json_list(results) == expected
        assert errors == {}
Пример #6
0
    def test_redis_not_found(self, indexloader):
        """Expect an empty result and an empty error map for example.com/."""
        query = dict(url='example.com/')
        records, failures = indexloader(query)

        no_records = []

        assert failures == {}
        assert to_json_list(records) == no_records
Пример #7
0
    def test_timeout_skipping(self):
        """Track per-source call counts across repeated queries with timeouts.

        Both sources start with 3 recorded calls. On the first two queries
        'slower' is called and reports 'timeout'. On the next two its call
        counter stays at 5 and no error is reported. After a 1.5s pause it is
        called again and times out once more. 'slow' is called and returns
        its record on every query.

        NOTE(review): the skipping presumably comes from t_count=2 timeouts
        within t_duration=1.0 -- confirm against GeventTimeoutAggregator.
        """
        assert self.sources['slow'].calls == 3
        assert self.sources['slower'].calls == 3

        agg = GeventTimeoutAggregator(
            self.sources, timeout=0.40, t_count=2, t_duration=1.0)

        expected = [{'source': 'slow', 'timestamp': '20160225042329'}]

        # (pause before the query, 'slow' calls, 'slower' calls, errors)
        rounds = [
            (None, 4, 4, {'slower': 'timeout'}),
            (None, 5, 5, {'slower': 'timeout'}),
            (None, 6, 5, {}),
            (None, 7, 5, {}),
            (1.5, 8, 6, {'slower': 'timeout'}),
        ]

        for pause, slow_calls, slower_calls, expected_errors in rounds:
            if pause is not None:
                time.sleep(pause)

            results, errors = agg({'url': 'http://example.com/'})
            assert to_json_list(results, fields=['source', 'timestamp']) == expected
            assert self.sources['slow'].calls == slow_calls
            assert self.sources['slower'].calls == slower_calls

            assert errors == expected_errors
Пример #8
0
    def test_agg_no_dir_2(self):
        """A loader built with an empty second argument finds nothing for coll X."""
        query = {'url': 'example.com/', 'param.coll': 'X'}
        results, errors = DirectoryIndexSource(self.root_dir, '')(query)

        assert to_json_list(results) == []
        assert errors == {}
Пример #9
0
    def test_mem_agg_not_found(self, agg):
        """No records come back and the 'notfound' source reports its error."""
        results, errors = agg({
            'url': 'http://vvork.com/',
            'closest': '20141001',
            'limit': 2,
        })

        expected_errors = {
            'notfound': "NotFoundException('testdata/not-found-x',)",
        }

        assert to_json_list(results) == []
        assert errors == expected_errors
Пример #10
0
    def test_extra_agg_collB(self):
        """Wrapping the dir loader in an aggregator prefixes the source with 'dir:'."""
        query = {'url': 'iana.org/', 'param.coll': 'B'}
        results, errors = SimpleAggregator({'dir': self.dir_loader})(query)

        expected = [
            {
                'source': to_path('dir:colls:B/indexes/iana.cdxj'),
                'timestamp': '20140126200624',
                'filename': 'iana.warc.gz',
            },
        ]

        assert to_json_list(results) == expected
        assert errors == {}
Пример #11
0
    def test_timeout_skipping(self):
        """Verify call counters and errors over five consecutive queries.

        'slow' is called and returns its record every time. 'slower' times
        out twice, then its counter stays at 5 for two queries with no error
        reported, and after a 1.5s sleep it is called again and times out.

        NOTE(review): the skip window is presumably driven by t_count=2 and
        t_duration=1.0 -- confirm against GeventTimeoutAggregator.
        """
        slow = self.sources['slow']
        slower = self.sources['slower']
        assert slow.calls == 3
        assert slower.calls == 3

        agg = GeventTimeoutAggregator(self.sources,
                                      timeout=0.40,
                                      t_count=2,
                                      t_duration=1.0)

        expected = [{'source': 'slow', 'timestamp': '20160225042329'}]

        def query_and_check(slow_calls, slower_calls, expected_errors):
            # One query round: records, then both call counters, then errors.
            results, errors = agg({'url': 'http://example.com/'})
            assert to_json_list(results, fields=['source', 'timestamp']) == expected
            assert self.sources['slow'].calls == slow_calls
            assert self.sources['slower'].calls == slower_calls
            assert errors == expected_errors

        query_and_check(4, 4, {'slower': 'timeout'})
        query_and_check(5, 5, {'slower': 'timeout'})

        # 'slower' is no longer called: its counter stays at 5.
        query_and_check(6, 5, {})
        query_and_check(7, 5, {})

        time.sleep(1.5)

        # After the pause 'slower' is called (and times out) again.
        query_and_check(8, 6, {'slower': 'timeout'})
Пример #12
0
    def test_mem_agg_index_4(self, agg):
        """Passing sources='rhiz,ait' yields one record from each, no errors."""
        results, errors = agg({
            'url': 'http://vvork.com/',
            'closest': '20141001',
            'limit': 2,
            'sources': 'rhiz,ait',
        })

        expected = [
            {
                'timestamp': '20141006184357',
                'load_url': 'http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/',
                'source': 'rhiz',
            },
            {
                'timestamp': '20131004231540',
                'load_url': 'http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/',
                'source': 'ait',
            },
        ]

        assert to_json_list(results) == expected
        assert errors == {}
Пример #13
0
    def test_mem_agg_index_5_inverse_preset(self, agg):
        """With sources='rhiz,ait' the result comes from 'ia' and 'bl' errors.

        NOTE(review): the name suggests the `agg` fixture is preconfigured to
        invert the `sources` list (treat it as an exclusion) -- confirm
        against the fixture definition.
        """
        results, errors = agg({
            'url': 'http://vvork.com/',
            'closest': '20141001',
            'limit': 2,
            'sources': 'rhiz,ait',
        })

        expected = [
            {
                'timestamp': '20141018133107',
                'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/',
                'source': 'ia',
            },
        ]
        expected_errors = {
            'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://vvork.com/',)",
        }

        assert to_json_list(results) == expected
        assert errors == expected_errors
Пример #14
0
    def test_timeout_slower_skipped_1(self):
        """With a 0.40 timeout only 'slow' answers; 'slower' reports 'timeout'."""
        aggregator = GeventTimeoutAggregator(self.sources, timeout=0.40)
        results, errors = aggregator({'url': 'http://example.com/'})

        expected = [{'source': 'slow', 'timestamp': '20160225042329'}]

        assert to_json_list(results, fields=['source', 'timestamp']) == expected
        assert errors == {'slower': 'timeout'}
Пример #15
0
    def test_timeout_slower_all_skipped(self):
        """With a 0.10 timeout both sources report 'timeout' and nothing returns."""
        aggregator = GeventTimeoutAggregator(self.sources, timeout=0.10)
        results, errors = aggregator({'url': 'http://example.com/'})

        assert to_json_list(results, fields=['source', 'timestamp']) == []
        assert errors == {'slower': 'timeout', 'slow': 'timeout'}
Пример #16
0
    def test_timeout_slower_skipped_1(self):
        """Only the 'slow' record is returned; 'slower' is reported as timed out."""
        query = {'url': 'http://example.com/'}
        records, failures = GeventTimeoutAggregator(self.sources, timeout=0.40)(query)

        wanted_fields = ['source', 'timestamp']
        expected = [dict(source='slow', timestamp='20160225042329')]

        assert to_json_list(records, fields=wanted_fields) == expected
        assert failures == {'slower': 'timeout'}
Пример #17
0
    def test_timeout_slower_all_skipped(self):
        """No records are returned and both sources are reported as timed out."""
        query = {'url': 'http://example.com/'}
        records, failures = GeventTimeoutAggregator(self.sources, timeout=0.10)(query)

        wanted_fields = ['source', 'timestamp']

        assert to_json_list(records, fields=wanted_fields) == []
        assert failures == dict(slower='timeout', slow='timeout')
Пример #18
0
    def test_redis_agg_one(self, indexloader):
        """The 'dupes' collection yields two records keyed 'FOO:dupes:cdxj'.

        NOTE(review): the 'param.user' value looks masked while the expected
        source prefix is 'FOO' -- confirm the intended user value.
        """
        query = {
            'url': 'example.com/',
            'param.user': '******',
            'param.coll': 'dupes',
        }
        results, errors = indexloader(query)

        expected = [
            {'source': 'FOO:dupes:cdxj', 'timestamp': stamp, 'filename': 'dupes.warc.gz'}
            for stamp in ('20140127171200', '20140127171251')
        ]

        assert errors == {}
        assert to_json_list(results) == expected
Пример #19
0
    def test_agg_all_found_2(self):
        """With coll '*', example.com/ matches records in collections C and A."""
        results, errors = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})

        expected = [
            {
                'source': to_path('colls:C/indexes/dupes.cdxj'),
                'timestamp': '20140127171200',
                'filename': 'dupes.warc.gz',
            },
            {
                'source': to_path('colls:C/indexes/dupes.cdxj'),
                'timestamp': '20140127171251',
                'filename': 'dupes.warc.gz',
            },
            {
                'source': to_path('colls:A/indexes/example2.cdxj'),
                'timestamp': '20160225042329',
                'filename': 'example2.warc.gz',
            },
        ]

        assert to_json_list(results) == expected
        assert errors == {}
Пример #20
0
    def test_agg_all_found_1(self):
        """With coll '*', iana.org/ matches records in collections B and C.

        The collection C record is expected twice, with the same timestamp.
        """
        results, errors = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})

        expected = [
            {
                'source': to_path('colls:B/indexes/iana.cdxj'),
                'timestamp': '20140126200624',
                'filename': 'iana.warc.gz',
            },
            {
                'source': to_path('colls:C/indexes/dupes.cdxj'),
                'timestamp': '20140127171238',
                'filename': 'dupes.warc.gz',
            },
            {
                'source': to_path('colls:C/indexes/dupes.cdxj'),
                'timestamp': '20140127171238',
                'filename': 'dupes.warc.gz',
            },
        ]

        assert to_json_list(results) == expected
        assert errors == {}
Пример #21
0
    def test_timeout_long_all_pass(self):
        """With a 1.0 timeout both sources answer and no errors are reported."""
        aggregator = GeventTimeoutAggregator(self.sources, timeout=1.0)
        results, errors = aggregator({'url': 'http://example.com/'})

        expected = [
            {'source': 'slower', 'timestamp': '20140127171200'},
            {'source': 'slower', 'timestamp': '20140127171251'},
            {'source': 'slow', 'timestamp': '20160225042329'},
        ]

        assert to_json_list(results, fields=['source', 'timestamp']) == expected
        assert errors == {}
Пример #22
0
    def test_mem_agg_index_2(self, agg):
        """Closest-to-20100512 lookup of example.com, limited to six records.

        Records are expected from 'bl', 'ia' and 'ait'; the 'rhiz' source is
        expected to report a NotFoundException.
        """
        results, errors = agg({
            'url': 'http://example.com/',
            'closest': '20100512',
            'limit': 6,
        })

        expected = [
            {'timestamp': '20100513010014',
             'load_url': 'http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/',
             'source': 'bl'},
            {'timestamp': '20100512204410',
             'load_url': 'http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/',
             'source': 'bl'},
            {'timestamp': '20100513224108',
             'load_url': 'http://web.archive.org/web/20100513224108id_/http://example.com/',
             'source': 'ia'},
            {'timestamp': '20100511201151',
             'load_url': 'http://wayback.archive-it.org/all/20100511201151id_/http://example.com/',
             'source': 'ait'},
            {'timestamp': '20100514231857',
             'load_url': 'http://wayback.archive-it.org/all/20100514231857id_/http://example.com/',
             'source': 'ait'},
            {'timestamp': '20100514231857',
             'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/',
             'source': 'ia'},
        ]
        expected_errors = {
            'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)",
        }

        assert to_json_list(results) == expected
        assert errors == expected_errors
Пример #23
0
    def test_mem_agg_index_1(self, agg):
        """Closest-to-20140126000000 lookup of iana.org, limited to five records.

        Records are expected from 'ia', 'local' and 'ait'; the 'bl' and
        'rhiz' sources are each expected to report a NotFoundException.
        """
        results, errors = agg({
            'url': 'http://iana.org/',
            'closest': '20140126000000',
            'limit': 5,
        })

        expected = [
            {'timestamp': '20140126093743',
             'load_url': 'http://web.archive.org/web/20140126093743id_/http://iana.org/',
             'source': 'ia'},
            {'timestamp': '20140126200624',
             'filename': 'iana.warc.gz',
             'source': 'local'},
            {'timestamp': '20140123034755',
             'load_url': 'http://web.archive.org/web/20140123034755id_/http://iana.org/',
             'source': 'ia'},
            {'timestamp': '20140129175203',
             'load_url': 'http://web.archive.org/web/20140129175203id_/http://iana.org/',
             'source': 'ia'},
            {'timestamp': '20140107040552',
             'load_url': 'http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/',
             'source': 'ait'},
        ]
        expected_errors = {
            'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
            'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)",
        }

        assert to_json_list(results) == expected
        assert errors == expected_errors
Пример #24
0
    def test_mem_agg_timeout(self, agg):
        """Every source reports 'timeout' when child loads are slowed down.

        ``BaseAggregator.load_child_source`` is temporarily replaced with a
        wrapper that sleeps 0.1s before delegating to the original. The patch
        is applied to the class, so it is undone in a ``finally`` clause:
        otherwise an exception raised by ``agg`` would leave the slow wrapper
        installed for every test that runs afterwards.

        NOTE(review): this relies on the `agg` fixture using a timeout
        shorter than 0.1s -- confirm against the fixture definition.
        """
        url = 'http://vvork.com/'

        orig_source = BaseAggregator.load_child_source

        def load_child_source(self, name, source, params):
            # Delay each child load, then defer to the real implementation.
            time.sleep(0.1)
            return orig_source(self, name, source, params)

        BaseAggregator.load_child_source = load_child_source
        try:
            res, errs = agg(dict(url=url, closest='20141001', limit=2))
        finally:
            # Always restore the original method, even if the query raised.
            BaseAggregator.load_child_source = orig_source

        assert to_json_list(res) == []
        assert errs == {'local': 'timeout',
                        'ait': 'timeout', 'bl': 'timeout', 'ia': 'timeout', 'rhiz': 'timeout'}
Пример #25
0
    def test_agg_dir_and_memento(self):
        """Aggregate a memento source ('ia') with the local directory loader.

        The query targets example.com/ with 'param.local.coll' set to '*',
        closest to 20100512 and limited to six records. Three 'ia' records
        and three local records are expected, with no errors.
        """
        ia_source = MementoIndexSource.from_timegate_url('http://web.archive.org/web/')
        aggregator = SimpleAggregator({'ia': ia_source, 'local': self.dir_loader})

        query = {
            'url': 'example.com/',
            'param.local.coll': '*',
            'closest': '20100512',
            'limit': 6,
        }
        results, errors = aggregator(query)

        expected = [
            {'source': 'ia',
             'timestamp': '20100514231857',
             'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
            {'source': 'ia',
             'timestamp': '20100519202418',
             'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
            {'source': 'ia',
             'timestamp': '20100501123414',
             'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
            {'source': to_path('local:colls:C/indexes/dupes.cdxj'),
             'timestamp': '20140127171200',
             'filename': 'dupes.warc.gz'},
            {'source': to_path('local:colls:C/indexes/dupes.cdxj'),
             'timestamp': '20140127171251',
             'filename': 'dupes.warc.gz'},
            {'source': to_path('local:colls:A/indexes/example2.cdxj'),
             'timestamp': '20160225042329',
             'filename': 'example2.warc.gz'},
        ]

        assert to_json_list(results) == expected
        assert errors == {}
Пример #26
0
    def test_timeout_long_all_pass(self):
        """Both 'slower' records and the 'slow' record arrive within timeout=1.0."""
        query = {'url': 'http://example.com/'}
        records, failures = GeventTimeoutAggregator(self.sources, timeout=1.0)(query)

        # (source, timestamp) pairs in the expected order.
        expected = [
            {'source': name, 'timestamp': stamp}
            for name, stamp in (
                ('slower', '20140127171200'),
                ('slower', '20140127171251'),
                ('slow', '20160225042329'),
            )
        ]

        assert to_json_list(records, fields=['source', 'timestamp']) == expected
        assert failures == {}
Пример #27
0
    def test_redis_agg_one(self, indexloader):
        """Two 'dupes' records with source 'FOO:dupes:cdxj' are expected.

        NOTE(review): the 'param.user' value looks masked while the expected
        source prefix is 'FOO' -- confirm the intended user value.
        """
        query = {'url': 'example.com/', 'param.user': '******', 'param.coll': 'dupes'}
        records, failures = indexloader(query)

        expected = [
            {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
            {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
        ]

        assert failures == {}
        assert to_json_list(records) == expected
Пример #28
0
    def test_mem_agg_not_found(self, agg):
        """Expect an empty record list and a 'notfound' NotFoundException entry."""
        query = {'url': 'http://vvork.com/', 'closest': '20141001', 'limit': 2}
        records, failures = agg(query)

        assert to_json_list(records) == []
        assert failures == {
            'notfound': "NotFoundException('testdata/not-found-x',)",
        }
Пример #29
0
 def test_agg_no_coll_set(self):
     """A query without 'param.coll' returns no records and no errors."""
     results, errors = self.dir_loader({'url': 'example.com/'})
     assert to_json_list(results) == []
     assert errors == {}