예제 #1
0
    def test_resolver_dir_and_file(self):
        """Check the resolver type chosen for a file, a directory and an invalid file url."""
        this_file = os.path.realpath(__file__)
        parent_dir = os.path.dirname(this_file)

        # an existing file is expected to be treated as a path index
        file_resolver = DefaultResolverMixin.make_best_resolver(this_file)
        assert isinstance(file_resolver, PathIndexResolver)

        # an existing directory is expected to be treated as a prefix
        dir_resolver = DefaultResolverMixin.make_best_resolver(parent_dir)
        assert isinstance(dir_resolver, PrefixResolver)

        # something that is not a valid file is expected to fall back to a prefix
        fallback_resolver = DefaultResolverMixin.make_best_resolver('file://test/x_invalid')
        assert isinstance(fallback_resolver, PrefixResolver)
예제 #2
0
    def test_resolver_dir_and_file(self):
        """Verify make_best_resolver's choice of resolver class for three kinds of path."""
        source_path = os.path.realpath(__file__)
        source_dir = os.path.dirname(source_path)

        # Each entry pairs an input path with the resolver class expected for it.
        expectations = (
            # a file -- assume path index
            (source_path, PathIndexResolver),
            # a dir -- assume prefix
            (source_dir, PrefixResolver),
            # not a valid file -- default to prefix
            ('file://test/x_invalid', PrefixResolver),
        )

        for candidate, expected_cls in expectations:
            resolver = DefaultResolverMixin.make_best_resolver(candidate)
            assert isinstance(resolver, expected_cls)
예제 #3
0
    def test_resolver_dir_wildcard(self):
        """Resolve a filename through a '*' wildcard directory under the test dir."""
        # trailing '' makes os.path.join end the pattern with a path separator
        wildcard_pattern = os.path.join(get_test_dir(), '*', '')
        resolve = DefaultResolverMixin.make_best_resolver(wildcard_pattern)

        matches = resolve('example.warc.gz', CDXObject())

        # exactly one match is expected, located in the 'warcs' subdirectory
        assert len(matches) == 1
        assert matches[0] == os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
예제 #4
0
    def test_resolver_dir_wildcard(self):
        """A wildcard directory pattern should yield the single matching warc path."""
        pattern = os.path.join(get_test_dir(), '*', '')
        wildcard_resolver = DefaultResolverMixin.make_best_resolver(pattern)

        empty_cdx = CDXObject()
        found = wildcard_resolver('example.warc.gz', empty_cdx)

        assert len(found) == 1

        # the only hit is expected to be the copy under 'warcs'
        expected_path = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
        assert found[0] == expected_path
예제 #5
0
    def test_resolver_http_prefix_not_wildcard(self):
        """An http prefix containing '*' is expected to be used verbatim, not expanded."""
        http_prefix = 'http://example.com/*/'
        resolve = DefaultResolverMixin.make_best_resolver(http_prefix)

        resolved = resolve('example.warc.gz', CDXObject())

        # the filename is simply appended; the '*' stays in the result
        assert resolved == 'http://example.com/*/example.warc.gz'
예제 #6
0
    def test_resolver_dir_wildcard_as_file_url(self):
        """Resolve through a wildcard directory that is given as a file url."""
        test_dir_url = to_file_url(get_test_dir())
        resolve = DefaultResolverMixin.make_best_resolver(test_dir_url + '/*/')

        matches = resolve('example.warc.gz', CDXObject())
        assert len(matches) == 1

        # the match is compared against the absolute path of the file under 'warcs'
        relative_expected = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
        assert matches[0] == os.path.abspath(relative_expected)
예제 #7
0
    def test_resolver_dir_wildcard_as_file_url(self):
        """A file-url wildcard pattern should resolve to one absolute local path."""
        wildcard_url = to_file_url(get_test_dir()) + '/*/'
        file_url_resolver = DefaultResolverMixin.make_best_resolver(wildcard_url)

        blank_cdx = CDXObject()
        found = file_url_resolver('example.warc.gz', blank_cdx)

        assert len(found) == 1

        # expected hit: the example warc inside the 'warcs' subdirectory
        expected = os.path.abspath(
            os.path.join(get_test_dir(), 'warcs', 'example.warc.gz'))
        assert found[0] == expected
예제 #8
0
    def test_resolver_dir_wildcard_with_coll(self):
        """The '*' in a non-local prefix is expected to become the collection name."""
        prefix = 's3://bucket/colls/*/archives/'
        resolve = DefaultResolverMixin.make_best_resolver(prefix)

        coll_cdx = CDXObject()
        coll_cdx['source'] = 'my-coll/indexes/index.cdxj'
        coll_cdx['source-coll'] = 'my-coll'

        resolved = resolve('example.warc.gz', coll_cdx)

        # NOTE(review): 'my-coll' presumably comes from 'source-coll' -- confirm in the resolver
        assert resolved == 's3://bucket/colls/my-coll/archives/example.warc.gz'
예제 #9
0
    def test_resolver_list(self):
        """make_resolvers should return one resolver per path, in the same order."""
        this_file_url = to_file_url(os.path.realpath(__file__))
        sources = [
            this_file_url,
            'http://myhost.example.com/warcs/',
            'redis://localhost:1234/0',
        ]

        resolvers = DefaultResolverMixin.make_resolvers(sources)

        # file url -> path index, http url -> prefix, redis url -> redis
        assert isinstance(resolvers[0], PathIndexResolver)
        assert isinstance(resolvers[1], PrefixResolver)
        assert isinstance(resolvers[2], RedisResolver)
예제 #10
0
    def test_resolver_list(self):
        """Check the resolver class produced at each position of a mixed path list."""
        path_list = [to_file_url(os.path.realpath(__file__)),
                     'http://myhost.example.com/warcs/',
                     'redis://localhost:1234/0']

        built = DefaultResolverMixin.make_resolvers(path_list)

        # Index explicitly (rather than zip) so a short result still raises IndexError.
        expected_classes = (PathIndexResolver, PrefixResolver, RedisResolver)
        for position, expected_cls in enumerate(expected_classes):
            assert isinstance(built[position], expected_cls)
예제 #11
0
    def test_resolver_dir_wildcard_with_coll(self):
        """Resolve against an s3 prefix whose '*' should be filled from the cdx fields."""
        coll_resolver = DefaultResolverMixin.make_best_resolver(
            's3://bucket/colls/*/archives/')

        # populate the cdx fields in the same order as a plain assignment sequence
        cdx_entry = CDXObject()
        for field, value in (('source', 'my-coll/indexes/index.cdxj'),
                             ('source-coll', 'my-coll')):
            cdx_entry[field] = value

        full_path = coll_resolver('example.warc.gz', cdx_entry)
        assert full_path == 's3://bucket/colls/my-coll/archives/example.warc.gz'
예제 #12
0
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    """Load the record for a cdx line and print its headers and first two stream lines.

    :param cdx: cdx line as text; it is utf-8 encoded before being parsed
    :param revisit_func: passed through to the resolving loader
    :param reraise: when true, an ArchiveLoadFailed is re-raised instead of
        being reported on stdout
    :param failed_files: passed through to the resolving loader
    """
    resolvers = DefaultResolverMixin.make_resolvers(test_warc_dir)
    loader = ResolvingLoader(resolvers)
    cdx_obj = CDXObject(cdx.encode('utf-8'))

    try:
        headers, stream = loader(cdx_obj, failed_files, revisit_func)
        print(repr_format(headers))
        # echo the first two lines of the record stream
        for _ in range(2):
            sys.stdout.write(stream.readline().decode('utf-8'))
    except ArchiveLoadFailed as exc:
        if reraise:
            raise
        # report only the exception class name
        print('Exception: ' + exc.__class__.__name__)
예제 #13
0
def load_from_cdx_test(cdx,
                       revisit_func=load_orig_cdx,
                       reraise=False,
                       failed_files=None):
    """Resolve and load a record from a cdx line, writing a short summary to stdout.

    The headers are printed via ``repr_format`` and then two lines read from
    the stream are written out. If loading raises ``ArchiveLoadFailed``, the
    error is re-raised when ``reraise`` is true; otherwise only
    ``Exception: <class name>`` is printed.
    """
    warc_resolvers = DefaultResolverMixin.make_resolvers(test_warc_dir)
    resolving_loader = ResolvingLoader(warc_resolvers)
    parsed_cdx = CDXObject(cdx.encode('utf-8'))

    try:
        loaded = resolving_loader(parsed_cdx, failed_files, revisit_func)
        (record_headers, record_stream) = loaded
        print(repr_format(record_headers))

        first_line = record_stream.readline().decode('utf-8')
        sys.stdout.write(first_line)
        second_line = record_stream.readline().decode('utf-8')
        sys.stdout.write(second_line)
    except ArchiveLoadFailed as load_error:
        if not reraise:
            print('Exception: ' + load_error.__class__.__name__)
        else:
            raise
예제 #14
0
 def test_make_best_resolver_redis(self):
     """A redis url is expected to produce a RedisResolver whose repr shows the url."""
     redis_url = 'redis://myhost.example.com:1234/1'
     resolver = DefaultResolverMixin.make_best_resolver(redis_url)

     assert isinstance(resolver, RedisResolver)
     assert repr(resolver) == "RedisResolver('redis://myhost.example.com:1234/1')"
예제 #15
0
 def test_make_best_resolver_pathindex(self):
     """A path index file is expected to produce a PathIndexResolver showing that path."""
     index_path = os.path.join(get_test_dir(), 'text_content', 'pathindex.txt')
     resolver = DefaultResolverMixin.make_best_resolver(index_path)

     assert isinstance(resolver, PathIndexResolver)

     # the repr is expected to embed the exact path that was passed in
     expected_repr = "PathIndexResolver('{0}')".format(index_path)
     assert repr(resolver) == expected_repr
예제 #16
0
 def test_make_best_resolver_redis(self):
     """Check both the type and the repr of the resolver built for a redis url."""
     best = DefaultResolverMixin.make_best_resolver('redis://myhost.example.com:1234/1')
     assert isinstance(best, RedisResolver)

     actual_repr = repr(best)
     assert actual_repr == "RedisResolver('redis://myhost.example.com:1234/1')"
예제 #17
0
 def test_make_best_resolver_http(self):
     """An http url is expected to produce a PrefixResolver whose repr shows the url."""
     http_url = 'http://myhost.example.com/warcs/'
     resolver = DefaultResolverMixin.make_best_resolver(http_url)

     assert isinstance(resolver, PrefixResolver)
     assert repr(resolver) == "PrefixResolver('http://myhost.example.com/warcs/')"
예제 #18
0
 def test_make_best_resolver_pathindex(self):
     """Check the type and repr of the resolver built for the sample path index file."""
     test_dir = get_test_dir()
     pathindex_file = os.path.join(test_dir, 'text_content', 'pathindex.txt')

     best = DefaultResolverMixin.make_best_resolver(pathindex_file)

     assert isinstance(best, PathIndexResolver)
     assert repr(best) == "PathIndexResolver('{0}')".format(pathindex_file)
예제 #19
0
    def test_resolver_http_prefix_not_wildcard(self):
        """With an http prefix the '*' should remain literal in the resolved url."""
        prefix_resolver = DefaultResolverMixin.make_best_resolver(
            'http://example.com/*/')

        blank_cdx = CDXObject()
        full_url = prefix_resolver('example.warc.gz', blank_cdx)

        # a plain string comes back (not a list), with the '*' left untouched
        assert full_url == 'http://example.com/*/example.warc.gz'
예제 #20
0
 def test_make_best_resolver_http(self):
     """Check both the type and the repr of the resolver built for an http url."""
     best = DefaultResolverMixin.make_best_resolver(
         'http://myhost.example.com/warcs/')
     assert isinstance(best, PrefixResolver)

     actual_repr = repr(best)
     assert actual_repr == "PrefixResolver('http://myhost.example.com/warcs/')"