Пример #1
0
def test_parse_query():
    """Verify how ``parse_query`` decomposes S3 URI queries.

    Every case appends a suffix to a fixed root URI and compares the parsed
    result (``base``, ``depth``, ``glob``, ``file``) against the expected
    namespace. A query with ``**`` followed by further directory levels
    must be rejected with ``ValueError``.
    """
    root = "s3://bucket/path/a/"

    def expected(base=root, depth=None, glob=None, file=None):
        # Only the fields that differ from "plain directory under root"
        # need to be spelled out by each case below.
        return SimpleNamespace(base=base, depth=depth, glob=glob, file=file)

    # (suffix appended to root, expected parse result)
    cases = [
        ("", expected()),
        ("some", expected(base=root + "some/")),
        ("*", expected(depth=0, glob="*")),
        ("*/*txt", expected(depth=1, glob="*txt")),
        ("*/*/*txt", expected(depth=2, glob="*txt")),
        ("*/*/file.txt", expected(depth=2, file="file.txt")),
        ("**/*txt", expected(depth=-1, glob="*txt")),
        ("*/*/something/*yaml", expected(depth=3, glob="*yaml")),
    ]

    for suffix, wanted in cases:
        assert parse_query(root + suffix) == wanted

    # `**` combined with extra fixed-depth components is not a valid query.
    with pytest.raises(ValueError):
        parse_query(root + "**/*/something/*yaml")
Пример #2
0
def cli(uri, skip_check):
    """ List files on S3 bucket.

    Example:

       \b
       List files in directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """
    # NOTE: the docstring above is deliberately left untouched -- it is
    # presumably rendered as the command's --help text (the `\b` markers
    # are click's "do not re-wrap this paragraph" convention).
    #
    # uri        -- S3 query; see the examples above for the supported forms.
    # skip_check -- when truthy, fixed-depth "named file" queries emit the
    #               candidate URLs without checking that the objects exist.

    def do_file_query(qq, pred):
        # List every directory found `qq.depth` levels below `qq.base`
        # and keep only the entries accepted by `pred`.
        for d in s3.dir_dir(qq.base, qq.depth):
            _, _files = s3.list_dir(d).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq):
        # Look for one specific file name inside every directory found
        # `qq.depth` levels below `qq.base`.
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth)

        if skip_check:
            # No existence check: just emit the URL each file would have.
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        # One head_object request per candidate; lookups that come back
        # as None are dropped (presumably "object does not exist").
        stream = (s3.head_object(d + fname) for d in stream)

        # 32 is passed straight to future_results -- presumably the
        # number of requests kept in flight; confirm against its docs.
        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq):
        # Directory listing: wrap plain URLs so they expose `.url` like
        # the objects produced by the other query kinds.
        return (SimpleNamespace(url=url)
                for url in s3.dir_dir(qq.base, qq.depth))

    # Force a stdout flush on every 100th printed line.
    flush_freq = 100

    try:
        qq = parse_query(uri)
    except ValueError as e:
        click.echo(str(e), err=True)
        sys.exit(1)

    s3 = S3Fetcher()

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        # Plain prefix: list everything underneath it.
        stream = s3.find(qq.base)
    elif qq.depth is None or qq.depth < 0:
        # Recursive search, filtered by glob or by exact file name.
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob)
        elif qq.file:
            postfix = '/' + qq.file
            stream = s3.find(qq.base, pred=lambda o: o.url.endswith(postfix))
        else:
            # Previously `stream` stayed unassigned here and the user saw
            # a confusing UnboundLocalError message; report it properly.
            click.echo(
                "Unsupported query, recursive search needs a file name "
                f"or glob: {uri}",
                err=True)
            sys.exit(1)
    else:
        # fixed depth query
        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred)
        elif qq.file is not None:
            stream = do_file_query2(qq)
        else:
            stream = do_dir_query(qq)

    try:
        for i, o in enumerate(stream):
            print(o.url, flush=(i % flush_freq == 0))
    except Exception as e:
        # Top-level CLI boundary: report on stderr (stdout carries the URL
        # listing and may be piped into another tool), then fail.
        click.echo(f"ERROR: {e}", err=True)
        sys.exit(1)
Пример #3
0
def s3_find_glob(glob_pattern: str, skip_check: bool):
    """Build generator from supplied S3 URI glob pattern.

    Arguments:
        glob_pattern {str} -- Glob pattern to filter S3 Keys by
        skip_check {bool} -- Skip validity check for S3 Key (only affects
            fixed-depth queries that name a specific file)
    Returns:
        Lazy stream of matches; items produced directly by this function
        are ``SimpleNamespace(url=...)``, the rest come from ``S3Fetcher``
        (presumably also exposing ``url`` -- confirm against S3Fetcher).
    Raises:
        ValueError: if the glob pattern cannot be parsed, or if it asks
            for a recursive search without a file name or glob to match
    """
    def do_file_query(qq, pred):
        # List every directory found `qq.depth` levels below `qq.base`
        # and keep only the entries accepted by `pred`.
        for d in s3.dir_dir(qq.base, qq.depth):
            _, _files = s3.list_dir(d).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq):
        # Look for one specific file name inside every directory found
        # `qq.depth` levels below `qq.base`.
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth)

        if skip_check:
            # No existence check: just emit the URL each file would have.
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        # One head_object request per candidate; lookups that come back
        # as None are dropped (presumably "object does not exist").
        stream = (s3.head_object(d + fname) for d in stream)

        # 32 is passed straight to future_results -- presumably the
        # number of requests kept in flight; confirm against its docs.
        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq):
        # Directory listing: wrap plain URLs so they expose `.url`.
        return (SimpleNamespace(url=url)
                for url in s3.dir_dir(qq.base, qq.depth))

    try:
        qq = parse_query(glob_pattern)
    except ValueError as ve:
        logging.error(f"URI glob-pattern not understood : {ve}")
        # Bare raise re-raises the active exception unchanged.
        raise

    s3 = S3Fetcher()

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        # Plain prefix: list everything underneath it.
        stream = s3.find(qq.base)
    elif qq.depth is None or qq.depth < 0:
        # Recursive search, filtered by glob or by exact file name.
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob)
        elif qq.file:
            postfix = '/' + qq.file
            stream = s3.find(qq.base, pred=lambda o: o.url.endswith(postfix))
        else:
            # Previously `stream` stayed unassigned on this path and the
            # function died with UnboundLocalError at `return stream`.
            raise ValueError(
                "Unsupported query, recursive search needs a file name "
                f"or glob: {glob_pattern}")
    else:
        # fixed depth query
        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred)
        elif qq.file is not None:
            stream = do_file_query2(qq)
        else:
            stream = do_dir_query(qq)

    return stream
Пример #4
0
def s3_find_glob(glob_pattern: str,
                 skip_check: bool = False,
                 s3: Optional[S3Fetcher] = None,
                 **kw) -> Iterator[Any]:
    """
    Build generator from supplied S3 URI glob pattern

    Arguments:
        glob_pattern {str} -- Glob pattern to filter S3 Keys by
        skip_check {bool} -- Skip validity check for S3 Key (only affects
            fixed-depth queries that name a specific file)
        s3 {S3Fetcher} -- Fetcher to use; a new one is created when omitted
        **kw -- Forwarded unchanged to every ``S3Fetcher`` call made here
            (``find``, ``dir_dir``, ``list_dir``, ``head_object``)
    Returns:
        Lazy stream of matches; items produced directly by this function
        are ``SimpleNamespace(url=...)``, the rest come from ``S3Fetcher``
        (presumably also exposing ``url`` -- confirm against S3Fetcher).
    Raises:
        ValueError: if the glob pattern cannot be parsed, or if it asks
            for a recursive search without a file name or glob to match
    """
    if s3 is None:
        s3 = S3Fetcher()

    def do_file_query(qq, pred, dirs_pred=None):
        # List every directory found `qq.depth` levels below `qq.base`
        # (filtered by `dirs_pred`) and keep entries accepted by `pred`.
        for d in s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw):
            _, _files = s3.list_dir(d, **kw).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq, dirs_pred=None):
        # Look for one specific file name inside every matching directory.
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw)

        if skip_check:
            # No existence check: just emit the URL each file would have.
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        # One head_object request per candidate; lookups that come back
        # as None are dropped (presumably "object does not exist").
        stream = (s3.head_object(d + fname, **kw) for d in stream)

        # 32 is passed straight to future_results -- presumably the
        # number of requests kept in flight; confirm against its docs.
        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq, dirs_pred=None):
        # Directory listing: wrap plain URLs so they expose `.url`.
        return (SimpleNamespace(url=url)
                for url in s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw))

    try:
        qq = parse_query(glob_pattern)
    except ValueError as ve:
        logging.error(f"URI glob-pattern not understood : {ve}")
        # Bare raise re-raises the active exception unchanged.
        raise

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        # Plain prefix: list everything underneath it.
        stream = s3.find(qq.base, **kw)
    elif qq.depth is None or qq.depth < 0:
        # Recursive search, filtered by glob or by exact file name.
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob, **kw)
        elif qq.file:
            postfix = "/" + qq.file
            stream = s3.find(qq.base,
                             pred=lambda o: o.url.endswith(postfix),
                             **kw)
        else:
            # Previously `stream` stayed unassigned on this path and the
            # function died with UnboundLocalError at `return stream`.
            raise ValueError(
                "Unsupported query, recursive search needs a file name "
                f"or glob: {glob_pattern}")
    else:
        # fixed depth query
        _, prefix = s3_url_parse(glob_pattern)
        # Directory components of the query; the last component (file name,
        # glob, or empty string after a trailing "/") is dropped.
        dirs_glob = prefix.split("/")[:-1]

        def dirs_pred(f):
            # Compare a directory path against the leading part of the
            # query with the same number of "/" separators.
            # NOTE(review): assumes `f` is a "/"-terminated key prefix in
            # the same form as `prefix` above -- confirm against dir_dir.
            n = f.count("/")
            _glob = "/".join(dirs_glob[:n]) + "/"
            return fnmatch(f, _glob)

        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred, dirs_pred=dirs_pred)
        elif qq.file is not None:
            stream = do_file_query2(qq, dirs_pred=dirs_pred)
        else:
            stream = do_dir_query(qq, dirs_pred=dirs_pred)

    return stream