Example #1
async def s3_head_object(url, s3, **kw):
    """Run head_object return Result or Error

    (Result, None) -- on success
    (None, error) -- on failure

    """
    from botocore.exceptions import ClientError, BotoCoreError

    def unpack(url, rr):
        return SimpleNamespace(
            url=url,
            size=rr.get("ContentLength", 0),
            etag=rr.get("ETag", ""),
            last_modified=rr.get("LastModified"),
            expiration=rr.get("Expiration"),
        )

    bucket, key = s3_url_parse(url)
    try:
        rr = await s3.head_object(Bucket=bucket, Key=key, **kw)
    except (ClientError, BotoCoreError) as e:
        return (None, e)

    return (unpack(url, rr), None)
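
A minimal usage sketch (assuming an aiobotocore-style async client is already
available as `s3`; the `show_size` wrapper name is hypothetical):

async def show_size(url, s3):
    # Call the helper above and unpack the (result, error) tuple.
    info, err = await s3_head_object(url, s3)
    if err is not None:
        print(f"HEAD failed for {url}: {err}")
    else:
        print(f"{info.url}: {info.size} bytes, etag={info.etag}")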
Example #2
async def _s3_find_via_cbk(url, cbk, s3, pred=None, glob=None):
    """ List all objects under certain path

        each s3 object is represented by a SimpleNamespace with attributes:
        - url
        - size
        - last_modified
        - etag
    """
    pred = norm_predicate(pred=pred, glob=glob)

    bucket, prefix = s3_url_parse(url)

    if len(prefix) > 0 and not prefix.endswith('/'):
        prefix = prefix + '/'

    pp = s3.get_paginator('list_objects_v2')

    n_total, n = 0, 0

    async for o in pp.paginate(Bucket=bucket, Prefix=prefix):
        for f in o.get('Contents', []):
            n_total += 1
            f = s3_file_info(f, bucket)
            if pred is None or pred(f):
                n += 1
                await cbk(f)

    return n_total, n
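
A usage sketch for the callback-based finder (the `collect_tiffs` wrapper and
the `*.tif` glob are illustrative assumptions):

async def collect_tiffs(url, s3):
    # Accumulate matching objects via the async callback.
    found = []

    async def on_file(f):
        found.append(f)

    n_total, n_matched = await _s3_find_via_cbk(url, on_file, s3, glob='*.tif')
    return found, n_total, n_matched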
Example #3
async def s3_dir(url, s3, pred=None, glob=None):
    """ List s3 "directory" without descending into sub directories.

        pred: predicate for file objects file_info -> True|False
        glob: glob pattern for files only

        Returns: (dirs, files)

        where
          dirs -- list of subdirectories in `s3://bucket/path/` format

          files -- list of objects with attributes: url, size, last_modified, etag
    """
    bucket, prefix = s3_url_parse(url)
    pred = norm_predicate(pred=pred, glob=glob)

    if len(prefix) > 0 and not prefix.endswith('/'):
        prefix = prefix + '/'

    pp = s3.get_paginator('list_objects_v2')

    _dirs = []
    _files = []

    async for o in pp.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
        for d in o.get('CommonPrefixes', []):
            d = d.get('Prefix')
            _dirs.append('s3://{}/{}'.format(bucket, d))
        for f in o.get('Contents', []):
            f = s3_file_info(f, bucket)
            if pred is None or pred(f):
                _files.append(f)

    return _dirs, _files
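
A usage sketch for the single-level listing (the wrapper name and the
`*.yaml` glob are assumptions for illustration):

async def show_listing(url, s3):
    # One level only: sub-directories come back as s3:// urls,
    # files as SimpleNamespace objects.
    dirs, files = await s3_dir(url, s3, glob='*.yaml')
    for d in dirs:
        print('DIR ', d)
    for f in files:
        print('FILE', f.url, f.size)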
Example #4
async def s3_dir_dir(url, depth, dst_q, s3):
    """
    Find directories a certain depth below the base and push them to `dst_q`.

    ```
    s3://bucket/a
                 |- b1
                      |- c1/...
                      |- c2/...
                      |- some_file.txt
                 |- b2
                      |- c3/...
    ```

    Given the bucket structure above, calling this function with

    - url s3://bucket/a/
    - depth=1 will produce
         - s3://bucket/a/b1/
         - s3://bucket/a/b2/
    - depth=2 will produce
         - s3://bucket/a/b1/c1/
         - s3://bucket/a/b1/c2/
         - s3://bucket/a/b2/c3/

    Any files are ignored.
    """
    if not url.endswith('/'):
        url = url + '/'

    pp = s3.get_paginator('list_objects_v2')

    async def step(bucket, prefix, depth, work_q, dst_q):

        async for o in pp.paginate(Bucket=bucket, Prefix=prefix,
                                   Delimiter='/'):
            for d in o.get('CommonPrefixes', []):
                d = d.get('Prefix')
                if depth > 1:
                    await work_q.put((d, depth - 1))
                else:
                    d = 's3://{}/{}'.format(bucket, d)
                    await dst_q.put(d)

    bucket, prefix = s3_url_parse(url)
    work_q = asyncio.LifoQueue()
    work_q.put_nowait((prefix, depth))

    while work_q.qsize() > 0:
        _dir, depth = work_q.get_nowait()
        await step(bucket, _dir, depth, work_q, dst_q)
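
A usage sketch, assuming the coroutine is driven directly and the results are
drained from the queue afterwards (the queue handling shown is illustrative):

async def list_at_depth(url, s3, depth=2):
    dst_q = asyncio.Queue()
    # s3_dir_dir only returns once every matching prefix has been queued.
    await s3_dir_dir(url, depth, dst_q, s3)
    return [dst_q.get_nowait() for _ in range(dst_q.qsize())]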
Example #5
async def s3_fetch_object(url, s3, range=None):
    """ returns object with

     On success:
        .url = url
        .data = bytes
        .last_modified -- last modified timestamp
        .range = None | (in,out)
        .error = None

    On failure:
        .url = url
        .data = None
        .last_modified = None
        .range = None | (in, out)
        .error = str| botocore.Exception class
    """
    from botocore.exceptions import ClientError, BotoCoreError

    def result(data=None, last_modified=None, error=None):
        return SimpleNamespace(url=url,
                               data=data,
                               error=error,
                               last_modified=last_modified,
                               range=range)

    bucket, key = s3_url_parse(url)
    extra_args = {}

    if range is not None:
        try:
            extra_args['Range'] = s3_fmt_range(range)
        except Exception:
            return result(error='Bad range passed in: ' + str(range))

    try:
        obj = await s3.get_object(Bucket=bucket, Key=key, **extra_args)
        stream = obj.get('Body', None)
        if stream is None:
            return result(error='Missing Body in response')
        async with stream:
            data = await stream.read()
    except (ClientError, BotoCoreError) as e:
        return result(error=e)
    except Exception as e:
        return result(error="Some Error: " + str(e))

    last_modified = obj.get('LastModified', None)
    return result(data=data, last_modified=last_modified)
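
A usage sketch for a ranged read (the (0, 1024) byte range and the wrapper
name are assumptions):

async def fetch_first_kb(url, s3):
    rr = await s3_fetch_object(url, s3, range=(0, 1024))
    if rr.error is not None:
        raise RuntimeError(f"fetch of {rr.url} failed: {rr.error}")
    return rr.data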
Example #6
    def dump_to_s3(self, url, creds=None, **kw):
        import boto3
        from boto3.s3.transfer import TransferConfig
        from odc.aws import s3_url_parse

        assert self._mem is not None

        GB = 1 << 30
        transfer_config = TransferConfig(multipart_threshold=5 * GB)
        bucket, key = s3_url_parse(url)
        creds_opts = ({} if creds is None else dict(
            aws_access_key_id=creds.access_key,
            aws_secret_access_key=creds.secret_key,
            aws_session_token=creds.token,
        ))
        s3 = boto3.client("s3", **creds_opts)

        return s3.upload_fileobj(self._mem,
                                 bucket,
                                 key,
                                 ExtraArgs=kw,
                                 Config=transfer_config)
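
A usage sketch, assuming `cog` is an instance of the class that owns
`dump_to_s3` (i.e. `self._mem` already holds the serialised bytes); the
destination url and the ACL setting are illustrative:

cog.dump_to_s3(
    "s3://my-bucket/path/output.tif",    # hypothetical destination
    creds=None,                          # fall back to the default boto3 credential chain
    ACL="bucket-owner-full-control",     # forwarded to upload_fileobj via ExtraArgs
)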
Example #7
async def s3_dir_dir(url, depth, dst_q, s3, pred=None):
    """Find directories certain depth from the base, push them to the `dst_q`

    ```
    s3://bucket/a
                 |- b1
                      |- c1/...
                      |- c2/...
                      |- some_file.txt
                 |- b2
                      |- c3/...
    ```

    Given the bucket structure above, calling this function with

    - url s3://bucket/a/
    - depth=1 will produce
         - s3://bucket/a/b1/
         - s3://bucket/a/b2/
    - depth=2 will produce
         - s3://bucket/a/b1/c1/
         - s3://bucket/a/b1/c2/
         - s3://bucket/a/b2/c3/

    Any files are ignored.

    If `pred` is supplied it is expected to be a `str -> bool` mapping. It is
    called with the full path of the sub-directory (e.g. `a/b1/`), starting
    from the root but not including the bucket name; the sub-directory is
    traversed further only if the predicate returns True.
    """
    if not url.endswith('/'):
        url = url + '/'

    if depth == 0:
        await dst_q.put(url)
        return

    pp = s3.get_paginator('list_objects_v2')

    async def step(bucket, prefix, depth, work_q, dst_q):

        async for o in pp.paginate(Bucket=bucket, Prefix=prefix,
                                   Delimiter='/'):
            for d in o.get('CommonPrefixes', []):
                d = d.get('Prefix')
                if pred is not None and not pred(d):
                    continue

                if depth > 1:
                    await work_q.put((d, depth - 1))
                else:
                    d = 's3://{}/{}'.format(bucket, d)
                    await dst_q.put(d)

    bucket, prefix = s3_url_parse(url)
    work_q = asyncio.LifoQueue()
    work_q.put_nowait((prefix, depth))

    while work_q.qsize() > 0:
        _dir, depth = work_q.get_nowait()
        await step(bucket, _dir, depth, work_q, dst_q)
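
A usage sketch showing the extra `pred` hook, which prunes traversal by
sub-directory path (the year-based filter is an illustrative assumption):

async def list_2020_prefixes(url, s3, depth=2):
    dst_q = asyncio.Queue()
    # `pred` sees paths like 'a/b1/' (no bucket name); only matching
    # sub-directories are descended into or emitted.
    await s3_dir_dir(url, depth, dst_q, s3,
                     pred=lambda d: d.rstrip('/').rsplit('/', 1)[-1].startswith('2020'))
    return [dst_q.get_nowait() for _ in range(dst_q.qsize())]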
Example #8
def s3_find_glob(glob_pattern: str,
                 skip_check: bool = False,
                 s3: Optional[S3Fetcher] = None,
                 **kw) -> Iterator[Any]:
    """
    Build a generator from the supplied S3 URI glob pattern.

    Arguments:
        glob_pattern {str} -- Glob pattern to filter S3 keys by
        skip_check {bool} -- Skip the validity (HEAD) check for each S3 key
    Raises:
        ValueError: if the glob pattern cannot be parsed
    """
    if s3 is None:
        s3 = S3Fetcher()

    def do_file_query(qq, pred, dirs_pred=None):
        for d in s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw):
            _, _files = s3.list_dir(d, **kw).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq, dirs_pred=None):
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw)

        if skip_check:
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        stream = (s3.head_object(d + fname, **kw) for d in stream)

        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq, dirs_pred=None):
        return (SimpleNamespace(url=url)
                for url in s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw))

    try:
        qq = parse_query(glob_pattern)
    except ValueError as ve:
        logging.error(f"URI glob-pattern not understood: {ve}")
        raise

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        stream = s3.find(qq.base, **kw)
    elif qq.depth is None or qq.depth < 0:
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob, **kw)
        elif qq.file:
            postfix = "/" + qq.file
            stream = s3.find(qq.base,
                             pred=lambda o: o.url.endswith(postfix),
                             **kw)
    else:
        # fixed depth query
        _, prefix = s3_url_parse(glob_pattern)
        dirs_glob = prefix.split("/")[:-1]

        def dirs_pred(f):
            n = f.count("/")
            _glob = "/".join(dirs_glob[:n]) + "/"
            return fnmatch(f, _glob)

        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred, dirs_pred=dirs_pred)
        elif qq.file is not None:
            stream = do_file_query2(qq, dirs_pred=dirs_pred)
        else:
            stream = do_dir_query(qq, dirs_pred=dirs_pred)

    return stream
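
A usage sketch (the bucket name and glob pattern are illustrative; with
`skip_check=True` the per-key HEAD request is skipped):

for f in s3_find_glob("s3://my-bucket/products/*/*/metadata.yaml",
                      skip_check=True):
    print(f.url)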
Example #9
    def execute_task(self,
                     task: AlchemistTask,
                     dryrun: bool = False,
                     sns_arn: str = None):
        log = _LOG.bind(task=task.dataset.id)
        log.info("Task commencing", task=task)

        # Make sure our task makes sense and store it
        if task.settings.specification.transform != self.transform_name:
            raise ValueError(
                "Task transform is different to the Alchemist transform")
        transform = self._transform_with_args(task)

        # Work out whether the output destination is an S3 location or a local filesystem path
        s3_destination = None
        try:
            s3_bucket, s3_path = s3_url_parse(task.settings.output.location)
            s3_destination = True
        except ValueError:
            fs_destination = Path(task.settings.output.location)

        # Load and process data in a decimated array
        if dryrun:
            res_by_ten = self._native_resolution(task) * 10
            data = self.dc.load(
                product=task.dataset.type.name,
                id=task.dataset.id,
                measurements=task.settings.specification.measurements,
                output_crs=task.dataset.crs,
                resolution=(-1 * res_by_ten, res_by_ten),
            )
        else:
            data = native_load(
                task.dataset,
                measurements=task.settings.specification.measurements,
                dask_chunks=task.settings.processing.dask_chunks,
                basis=task.settings.specification.basis,
            )
        data = data.rename(task.settings.specification.measurement_renames)

        log.info("Data loaded")

        output_data = transform.compute(data)
        if "time" in output_data.dims:
            output_data = output_data.squeeze("time")

        log.info("Prepared lazy transformation", output_data=output_data)

        output_data = output_data.compute()
        crs = data.attrs["crs"]

        del data
        log.info("Loaded and transformed")

        # Because eodatasets3/images.py (write_from_ndarray) raises
        # TypeError("Datatype not supported: {dt}".format(dt=dtype)) for int8 data,
        # convert int8 outputs to uint8 before writing geotiffs.
        # TODO: investigate if this is ok
        dtypes = set(str(v.dtype) for v in output_data.data_vars.values())
        if "int8" in dtypes:
            log.info(
                "Found dtype=int8 in output data, converting to uint8 for geotiffs"
            )
            output_data = output_data.astype("uint8", copy=False)

        if "crs" not in output_data.attrs:
            output_data.attrs["crs"] = crs

        uuid, _ = self._deterministic_uuid(task)

        temp_metadata_path = Path(
            tempfile.gettempdir()) / f"{task.dataset.id}.yaml"
        with DatasetAssembler(
                metadata_path=temp_metadata_path,
                naming_conventions=self.naming_convention,
                dataset_id=uuid,
        ) as dataset_assembler:
            if task.settings.output.reference_source_dataset:
                source_doc = _munge_dataset_to_eo3(task.dataset)
                dataset_assembler.add_source_dataset(
                    source_doc,
                    auto_inherit_properties=True,
                    inherit_geometry=task.settings.output.inherit_geometry,
                    classifier=task.settings.specification.override_product_family,
                )

            # Copy in metadata and properties
            for k, v in task.settings.output.metadata.items():
                setattr(dataset_assembler, k, v)
            for k, v in task.settings.output.properties.items():
                dataset_assembler.properties[k] = v

            # Update the GSD
            dataset_assembler.properties["eo:gsd"] = self._native_resolution(
                task)

            dataset_assembler.processed = datetime.utcnow()

            dataset_assembler.note_software_version(
                "datacube-alchemist",
                "https://github.com/opendatacube/datacube-alchemist",
                __version__,
            )

            # Software Version of Transformer
            version_url = self._get_transform_info()
            dataset_assembler.note_software_version(
                name=task.settings.specification.transform,
                url=version_url["url"],
                version=version_url["version"],
            )

            # Write it all to a tempdir root, and then either shift or s3 sync it into place
            with tempfile.TemporaryDirectory() as temp_dir:
                # Set up a temporary directory
                dataset_assembler.collection_location = Path(temp_dir)
                # Dodgy hack!
                dataset_assembler._metadata_path = None

                # Write out the data
                dataset_assembler.write_measurements_odc_xarray(
                    output_data,
                    nodata=task.settings.output.nodata,
                    **task.settings.output.write_data_settings,
                )
                log.info("Finished writing measurements")

                # Write out the thumbnail
                _write_thumbnail(task, dataset_assembler)
                log.info("Wrote thumbnail")

                # Do all the deferred work from above
                dataset_id, metadata_path = dataset_assembler.done()
                log.info("Assembled dataset", metadata_path=metadata_path)

                # Write STAC, because it depends on this being .done()
                # Conveniently, this also checks that files are there!
                stac = None
                if task.settings.output.write_stac:
                    stac = _write_stac(metadata_path, task, dataset_assembler)
                    log.info("STAC file written")

                relative_path = dataset_assembler._dataset_location.relative_to(
                    temp_dir)
                if s3_destination:
                    s3_location = (
                        f"s3://{s3_bucket}/{s3_path.rstrip('/')}/{relative_path}"
                    )
                    s3_command = [
                        "aws",
                        "s3",
                        "sync",
                        "--only-show-errors",
                        "--acl bucket-owner-full-control",
                        str(dataset_assembler._dataset_location),
                        s3_location,
                    ]

                    if not dryrun:
                        log.info(f"Syncing files to {s3_location}")
                    else:
                        s3_command.append("--dryrun")
                        log.warning("PRETENDING to sync files to S3",
                                    s3_location=s3_destination)

                    log.info("Writing files to s3", location=s3_location)
                    # log.debug("S3 command: ", command=s3_command)
                    subprocess.run(" ".join(s3_command),
                                   shell=True,
                                   check=True)
                else:
                    dest_directory = fs_destination / relative_path
                    if not dryrun:
                        log.info("Writing files to disk",
                                 location=dest_directory)
                        if dest_directory.exists():
                            shutil.rmtree(dest_directory)
                        shutil.copytree(dataset_assembler._dataset_location,
                                        dest_directory)
                    else:
                        log.warning(
                            f"NOT moving data from {temp_dir} to {dest_directory}"
                        )

                log.info("Task complete")
                if stac is not None and sns_arn:
                    if not dryrun:
                        _stac_to_sns(sns_arn, stac)
                elif sns_arn:
                    _LOG.error(
                        "Not posting to SNS because there's no STAC to post")

        return dataset_id, metadata_path
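
A usage sketch, assuming `alchemist` is an instance of the class that owns
`execute_task` and `task` is an AlchemistTask produced by the usual
task-generation step:

dataset_id, metadata_path = alchemist.execute_task(task, dryrun=False)
print(f"Wrote dataset {dataset_id} with metadata at {metadata_path}")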