Example #1
async def _s3_find_via_cbk(url, cbk, s3, pred=None, glob=None):
    """ List all objects under certain path

        each s3 object is represented by a SimpleNamespace with attributes:
        - url
        - size
        - last_modified
        - etag
    """
    pred = norm_predicate(pred=pred, glob=glob)

    bucket, prefix = s3_url_parse(url)

    if len(prefix) > 0 and not prefix.endswith('/'):
        prefix = prefix + '/'

    pp = s3.get_paginator('list_objects_v2')

    n_total, n = 0, 0

    async for o in pp.paginate(Bucket=bucket, Prefix=prefix):
        for f in o.get('Contents', []):
            n_total += 1
            f = s3_file_info(f, bucket)
            if pred is None or pred(f):
                n += 1
                await cbk(f)

    return n_total, n
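
A minimal driver sketch for the helper above. It assumes the module-level helpers (norm_predicate, s3_url_parse, s3_file_info) are in scope and that `s3` is an aiobotocore client; the bucket and prefix are placeholders.

import asyncio

from aiobotocore.session import get_session


async def main():
    found = []

    async def collect(f):
        # Callback invoked once per matching object.
        found.append(f)

    session = get_session()
    async with session.create_client('s3') as s3:
        n_total, n = await _s3_find_via_cbk(
            's3://mybucket/some/path/', collect, s3, glob='*.yaml')
    print(f'matched {n} of {n_total} objects')


asyncio.run(main())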
Example #2
async def s3_dir(url, s3, pred=None, glob=None):
    """ List s3 "directory" without descending into sub directories.

        pred: predicate for file objects file_info -> True|False
        glob: glob pattern for files only

        Returns: (dirs, files)

        where
          dirs -- list of subdirectories in `s3://bucket/path/` format

          files -- list of objects with attributes: url, size, last_modified, etag
    """
    bucket, prefix = s3_url_parse(url)
    pred = norm_predicate(pred=pred, glob=glob)

    if len(prefix) > 0 and not prefix.endswith('/'):
        prefix = prefix + '/'

    pp = s3.get_paginator('list_objects_v2')

    _dirs = []
    _files = []

    async for o in pp.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
        for d in o.get('CommonPrefixes', []):
            d = d.get('Prefix')
            _dirs.append('s3://{}/{}'.format(bucket, d))
        for f in o.get('Contents', []):
            f = s3_file_info(f, bucket)
            if pred is None or pred(f):
                _files.append(f)

    return _dirs, _files
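
The same pattern drives a one-level listing; a sketch under the same assumptions as above (bucket and prefix are placeholders):

import asyncio

from aiobotocore.session import get_session


async def main():
    session = get_session()
    async with session.create_client('s3') as s3:
        dirs, files = await s3_dir('s3://mybucket/some/path/', s3,
                                   glob='*.yaml')
    for d in dirs:
        print('dir :', d)
    for f in files:
        print('file:', f.url, f.size)


asyncio.run(main())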
Example #3
def cli(uri, skip_check):
    """ List files on S3 bucket.

    Example:

       \b
       List files in a directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in a directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from a known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from a known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """
    def do_file_query(qq, pred):
        for d in s3.dir_dir(qq.base, qq.depth):
            _, _files = s3.list_dir(d).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq):
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth)

        if skip_check:
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        stream = (s3.head_object(d + fname) for d in stream)

        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq):
        return (SimpleNamespace(url=url)
                for url in s3.dir_dir(qq.base, qq.depth))

    flush_freq = 100

    try:
        qq = parse_query(uri)
    except ValueError as e:
        click.echo(str(e), err=True)
        sys.exit(1)

    s3 = S3Fetcher()

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        stream = s3.find(qq.base)
    elif qq.depth is None or qq.depth < 0:
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob)
        elif qq.file:
            postfix = '/' + qq.file
            stream = s3.find(qq.base, pred=lambda o: o.url.endswith(postfix))
    else:
        # fixed depth query
        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred)
        elif qq.file is not None:
            stream = do_file_query2(qq)
        else:
            stream = do_dir_query(qq)

    try:
        for i, o in enumerate(stream):
            print(o.url, flush=(i % flush_freq == 0))
    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)
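
The `\b` markers in the docstring are click's no-rewrap hints, so this function is evidently registered as a click command. A plausible wiring (a sketch only; the command name and option help text are assumptions, and the real project may register it differently):

import click


@click.command('s3-find')
@click.option('--skip-check', is_flag=True,
              help="Assume the file exists, don't check the remote")
@click.argument('uri', type=str, nargs=1)
def cli(uri, skip_check):
    ...  # body as defined above


if __name__ == '__main__':
    cli()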
Example #4
def s3_find_glob(glob_pattern: str, skip_check: bool):
    """Build generator from supplied S3 URI glob pattern  
    Arguments:
        glob_pattern {str} -- Glob pattern to filter S3 Keys by
        skip_check {bool} -- Skip validity check for S3 Key
    Raises:
        ve: ValueError if the glob pattern cannot be parsed
    """
    def do_file_query(qq, pred):
        for d in s3.dir_dir(qq.base, qq.depth):
            _, _files = s3.list_dir(d).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq):
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth)

        if skip_check:
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        stream = (s3.head_object(d + fname) for d in stream)

        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq):
        return (SimpleNamespace(url=url)
                for url in s3.dir_dir(qq.base, qq.depth))

    try:
        qq = parse_query(glob_pattern)
    except ValueError as ve:
        logging.error(f"URI glob-pattern not understood : {ve}")
        raise ve

    s3 = S3Fetcher()

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        stream = s3.find(qq.base)
    elif qq.depth is None or qq.depth < 0:
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob)
        elif qq.file:
            postfix = '/' + qq.file
            stream = s3.find(qq.base, pred=lambda o: o.url.endswith(postfix))
    else:
        # fixed depth query
        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred)
        elif qq.file is not None:
            stream = do_file_query2(qq)
        else:
            stream = do_dir_query(qq)

    return stream
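
Consuming the returned generator is then straightforward (bucket and pattern are placeholders):

for o in s3_find_glob('s3://mybucket/some/path/*/*.yaml', skip_check=False):
    print(o.url)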
Example #5
import logging
from fnmatch import fnmatch
from types import SimpleNamespace
from typing import Any, Iterator, Optional

# Project-level helpers (S3Fetcher, parse_query, norm_predicate,
# future_results, s3_url_parse) are assumed to come from the surrounding
# module.


def s3_find_glob(glob_pattern: str,
                 skip_check: bool = False,
                 s3: Optional[S3Fetcher] = None,
                 **kw) -> Iterator[Any]:
    """
    Build a generator from the supplied S3 URI glob pattern.

    Arguments:
        glob_pattern {str} -- Glob pattern to filter S3 keys by
        skip_check {bool} -- Skip the validity check for each S3 key
        s3 {Optional[S3Fetcher]} -- Reuse an existing S3Fetcher; a new one is
            created when None
        **kw -- Extra keyword arguments passed through to the S3Fetcher calls
    Raises:
        ValueError: if the glob pattern cannot be parsed
    """
    if s3 is None:
        s3 = S3Fetcher()

    def do_file_query(qq, pred, dirs_pred=None):
        for d in s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw):
            _, _files = s3.list_dir(d, **kw).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq, dirs_pred=None):
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw)

        if skip_check:
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        stream = (s3.head_object(d + fname, **kw) for d in stream)

        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq, dirs_pred=None):
        return (SimpleNamespace(url=url)
                for url in s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw))

    try:
        qq = parse_query(glob_pattern)
    except ValueError as ve:
        logging.error(f"URI glob-pattern not understood : {ve}")
        raise ve

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        stream = s3.find(qq.base, **kw)
    elif qq.depth is None or qq.depth < 0:
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob, **kw)
        elif qq.file:
            postfix = "/" + qq.file
            stream = s3.find(qq.base,
                             pred=lambda o: o.url.endswith(postfix),
                             **kw)
    else:
        # fixed depth query
        _, prefix = s3_url_parse(glob_pattern)
        dirs_glob = prefix.split("/")[:-1]

        def dirs_pred(f):
            n = f.count("/")
            _glob = "/".join(dirs_glob[:n]) + "/"
            return fnmatch(f, _glob)

        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred, dirs_pred=dirs_pred)
        elif qq.file is not None:
            stream = do_file_query2(qq, dirs_pred=dirs_pred)
        else:
            stream = do_dir_query(qq, dirs_pred=dirs_pred)

    return stream
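
The `dirs_pred` closure is what makes the fixed-depth case cheaper than the previous version: at each level the directory prefix seen so far is matched against the corresponding slice of the glob, so non-matching branches are never listed. A small standalone illustration (the pattern is a placeholder):

from fnmatch import fnmatch

dirs_glob = "some/path/*/*/metadata.yaml".split("/")[:-1]


def dirs_pred(f):
    n = f.count("/")
    _glob = "/".join(dirs_glob[:n]) + "/"
    return fnmatch(f, _glob)


print(dirs_pred("some/"))        # True  -- matches "some/"
print(dirs_pred("some/path/"))   # True  -- matches "some/path/"
print(dirs_pred("other/path/"))  # False -- pruned, never descended into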