Example 1
def test_native_load(tmpdir):
    from datacube.testutils.io import native_load, native_geobox

    tmpdir = Path(str(tmpdir))
    spatial = dict(resolution=(15, -15),
                   offset=(11230, 1381110),)
    nodata = -999
    aa = mk_test_image(96, 64, 'int16', nodata=nodata)
    cc = mk_test_image(32, 16, 'int16', nodata=nodata)

    bands = [SimpleNamespace(name=name, values=aa, nodata=nodata)
             for name in ['aa', 'bb']]
    bands.append(SimpleNamespace(name='cc', values=cc, nodata=nodata))

    ds, gbox = gen_tiff_dataset(bands[:2],
                                tmpdir,
                                prefix='ds1-',
                                timestamp='2018-07-19',
                                **spatial)

    assert set(get_raster_info(ds)) == set(ds.measurements)

    xx = native_load(ds)
    assert xx.geobox == gbox
    np.testing.assert_array_equal(aa, xx.isel(time=0).aa.values)
    np.testing.assert_array_equal(aa, xx.isel(time=0).bb.values)

    ds, gbox_cc = gen_tiff_dataset(bands,
                                   tmpdir,
                                   prefix='ds2-',
                                   timestamp='2018-07-19',
                                   **spatial)

    # cc is different size from aa,bb
    with pytest.raises(ValueError):
        xx = native_load(ds)

    # cc is different size from aa,bb
    with pytest.raises(ValueError):
        xx = native_geobox(ds)

    # aa and bb are the same
    assert native_geobox(ds, ['aa', 'bb']) == gbox
    xx = native_load(ds, ['aa', 'bb'])
    assert xx.geobox == gbox
    np.testing.assert_array_equal(aa, xx.isel(time=0).aa.values)
    np.testing.assert_array_equal(aa, xx.isel(time=0).bb.values)

    # cc will be reprojected
    assert native_geobox(ds, basis='aa') == gbox
    xx = native_load(ds, basis='aa')
    assert xx.geobox == gbox
    np.testing.assert_array_equal(aa, xx.isel(time=0).aa.values)
    np.testing.assert_array_equal(aa, xx.isel(time=0).bb.values)

    # cc is compatible with self
    xx = native_load(ds, ['cc'])
    assert xx.geobox == gbox_cc
    np.testing.assert_array_equal(cc, xx.isel(time=0).cc.values)
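
A minimal, self-contained sketch of the pattern this test exercises: write a single-band GeoTIFF test dataset into a temporary directory, then read it back on its native grid. The helper import paths (datacube.testutils.mk_test_image, datacube.testutils.gen_tiff_dataset) are assumptions based on the calls above, not confirmed by this listing.

import tempfile
from pathlib import Path
from types import SimpleNamespace

import numpy as np
from datacube.testutils import mk_test_image, gen_tiff_dataset  # assumed import path
from datacube.testutils.io import native_load, native_geobox

with tempfile.TemporaryDirectory() as tmp:
    # Synthetic int16 image with a nodata border, as in the test above
    aa = mk_test_image(96, 64, 'int16', nodata=-999)
    ds, gbox = gen_tiff_dataset(
        SimpleNamespace(name='aa', values=aa, nodata=-999), Path(tmp))

    assert native_geobox(ds) == gbox           # native grid of the stored band
    xx = native_load(ds)                       # eager load of all measurements
    np.testing.assert_array_equal(aa, xx.isel(time=0).aa.values)
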
Example 2
def _do_fc_task(config, task):
    """
    Load data, run FC algorithm, attach metadata, and write output.
    :param dict config: Configuration dictionary
    :param dict task: Dictionary describing a single task
    :return: Dataset objects representing the generated data that can be added to the index
    :rtype: list(datacube.model.Dataset)
    """
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    output_product = config['fc_product']

    file_path = Path(task['filename_dataset'])

    uri, band_uris = calc_uris(file_path, variable_params)
    output_measurements = config['fc_product'].measurements.values()

    nbart = io.native_load(task['dataset'], measurements=config['load_bands'])
    if config['band_mapping'] is not None:
        nbart = nbart.rename(config['band_mapping'])

    fc_dataset = run_fc(nbart, output_measurements,
                        config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, sources):
        assert sources
        dataset = make_dataset(product=output_product,
                               sources=sources,
                               extent=nbart.geobox.extent,
                               center_time=labels['time'],
                               uri=uri,
                               band_uris=band_uris,
                               app_info=_get_app_metadata(config),
                               valid_data=polygon_from_sources_extents(
                                   sources, nbart.geobox))
        return dataset

    source = Datacube.group_datasets([task['dataset']], 'time')

    datasets = xr_apply(source, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(datasets)

    base, ext = os.path.splitext(file_path)
    if ext == '.tif':
        dataset_to_geotif_yaml(
            dataset=fc_dataset,
            odc_dataset=datasets.item(),
            filename=file_path,
            variable_params=variable_params,
        )
    else:
        write_dataset_to_netcdf(
            dataset=fc_dataset,
            filename=file_path,
            global_attributes=global_attributes,
            variable_params=variable_params,
        )

    return datasets
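
For orientation, a hypothetical shape of the `config` and `task` dictionaries consumed by `_do_fc_task`, reconstructed only from the keys the function reads; the values shown are illustrative placeholders, not the real fractional-cover configuration.

# Hypothetical structures inferred from the key accesses in _do_fc_task;
# all values below are placeholders only.
config = {
    'global_attributes': {'title': 'Fractional Cover'},       # written into NetCDF output
    'variable_params': {},                                     # per-measurement storage settings
    'fc_product': None,            # output product object (provides .measurements)
    'load_bands': ['green', 'red', 'nir', 'swir1', 'swir2'],   # bands passed to native_load
    'band_mapping': None,          # or e.g. {'nbart_green': 'green', ...}
    'sensor_regression_coefficients': None,
}
task = {
    'filename_dataset': '/tmp/fc/example_output.tif',          # '.tif' -> GeoTIFF + YAML, else NetCDF
    'dataset': None,               # datacube.model.Dataset to run FC over
}
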
Example 3
def gen_test_data(prefix, dask=False):
    w, h, dtype, nodata, ndw = 96, 64, 'int16', -999, 7

    aa = mk_test_image(w, h, dtype, nodata, nodata_width=ndw)

    ds, gbox = gen_tiff_dataset(
        SimpleNamespace(name='aa', values=aa, nodata=nodata), prefix)
    extras = {}

    if dask:
        extras.update(dask_chunks={'time': 1})

    xx = native_load(ds, **extras)

    return xx.aa.isel(time=0), ds
Example 4
def gen_test_data(prefix, dask=False, shape=None):
    w, h, dtype, nodata, ndw = 96, 64, "int16", -999, 7
    if shape is not None:
        h, w = shape

    aa = mk_test_image(w, h, dtype, nodata, nodata_width=ndw)

    ds, gbox = gen_tiff_dataset(
        SimpleNamespace(name="aa", values=aa, nodata=nodata), prefix)
    extras = {}

    if dask:
        extras.update(dask_chunks={"time": 1})

    xx = native_load(ds, **extras)

    return xx.aa.isel(time=0), ds
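
A short usage sketch for the `gen_test_data` helper defined above: with `dask=True` the returned DataArray is dask-backed and only materialises on `.compute()`. The scratch directory here is hypothetical.

import tempfile
from pathlib import Path

tmp = Path(tempfile.mkdtemp())            # hypothetical scratch directory
xx_lazy, ds = gen_test_data(tmp, dask=True)
print(type(xx_lazy.data))                 # dask array, nothing loaded yet
xx = xx_lazy.compute()                    # same pixel values as the eager path
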
Example 5
def execute_task(task: AlchemistTask):
    log = _LOG.bind(task=task)
    transform = _import_transform(task.settings.specification.transform)
    transform = transform(**task.settings.specification.transform_args)

    # Load and process data
    data = native_load(task.dataset, measurements=task.settings.specification.measurements,
                       dask_chunks=task.settings.processing.dask_chunks,
                       basis=task.settings.specification.basis)
    data = data.rename(task.settings.specification.measurement_renames)

    log.info('data loaded')

    output_data = transform.compute(data)
    if 'time' in output_data.dims:
        output_data = output_data.squeeze('time')

    log.info('prepared lazy transformation', output_data=output_data)

    output_data = output_data.compute()
    crs = data.attrs['crs']

    del data
    log.info('loaded and transformed')

    dtypes = set(str(v.dtype) for v in output_data.data_vars.values())
    if 'int8' in dtypes:
        log.info('Found dtype=int8 in output data, converting to uint8 for geotiffs')
        output_data = output_data.astype('uint8', copy=False)

    if 'crs' not in output_data.attrs:
        output_data.attrs['crs'] = crs

    # Ensure output path exists
    output_location = Path(task.settings.output.location)
    output_location.mkdir(parents=True, exist_ok=True)
    uuid, _ = deterministic_uuid(task)
    if task.dataset.metadata.platform.lower().startswith("sentinel"):
        name = "dea_s2"
    else:
        name = "dea"
    with DatasetAssembler(output_location, naming_conventions=name,
                          dataset_id=uuid) as p:
        if task.settings.output.reference_source_dataset:
            source_doc = _munge_dataset_to_eo3(task.dataset)
            p.add_source_dataset(source_doc, auto_inherit_properties=True,
                                 classifier=task.settings.specification.override_product_family)

        # Copy in metadata and properties
        for k, v in task.settings.output.metadata.items():
            setattr(p, k, v)
        for k, v in task.settings.output.properties.items():
            p.properties[k] = v

        p.processed = datetime.utcnow()

        p.note_software_version(
            'datacube-alchemist',
            "https://github.com/opendatacube/datacube-alchemist",
            __version__
        )

        # Software Version of Transformer
        version_url = get_transform_info(task.settings.specification.transform)
        p.note_software_version(name=task.settings.specification.transform, url=version_url['url'],
                                version=version_url['version'])

        # TODO Note configuration settings of this Task
        # p.extend_user_metadata()

        # TODO Check whether output already exists

        p.write_measurements_odc_xarray(
            output_data,
            nodata=task.settings.output.nodata,
            **task.settings.output.write_data_settings
        )

        if task.settings.output.preview_image is not None:
            p.write_thumbnail(*task.settings.output.preview_image)
        dataset_id, metadata_path = p.done()

    return dataset_id, metadata_path
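
The `task.settings` object above is only ever accessed by attribute. A hypothetical minimal settings tree, reconstructed from those accesses alone (attribute names come from the code, values are purely illustrative), might look like:

from types import SimpleNamespace

# Hypothetical settings tree for execute_task; field values are placeholders.
settings = SimpleNamespace(
    specification=SimpleNamespace(
        transform='fc.virtualproduct.FractionalCover',   # dotted path to the transform class (assumed)
        transform_args={},
        measurements=['nbart_green', 'nbart_red', 'nbart_nir'],
        measurement_renames={'nbart_green': 'green'},
        basis=None,
        override_product_family='ard',
    ),
    processing=SimpleNamespace(dask_chunks={'x': 2048, 'y': 2048}),
    output=SimpleNamespace(
        location='/tmp/alchemist-output',
        reference_source_dataset=True,
        metadata={},                 # copied onto the DatasetAssembler via setattr
        properties={},               # copied into assembler.properties
        nodata=-999,
        write_data_settings={},      # extra kwargs for write_measurements_odc_xarray
        preview_image=None,          # or a band tuple for write_thumbnail
    ),
)
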
Example 6
    def execute_task(self,
                     task: AlchemistTask,
                     dryrun: bool = False,
                     sns_arn: str = None):
        log = _LOG.bind(task=task.dataset.id)
        log.info("Task commencing", task=task)

        # Make sure our task makes sense and store it
        if task.settings.specification.transform != self.transform_name:
            raise ValueError(
                "Task transform is different to the Alchemist transform")
        transform = self._transform_with_args(task)

        # Ensure output path exists, this should be fine for file or s3 paths
        s3_destination = None
        try:
            s3_bucket, s3_path = s3_url_parse(task.settings.output.location)
            s3_destination = True
        except ValueError:
            fs_destination = Path(task.settings.output.location)

        # Load and process data in a decimated array
        if dryrun:
            res_by_ten = self._native_resolution(task) * 10
            data = self.dc.load(
                product=task.dataset.type.name,
                id=task.dataset.id,
                measurements=task.settings.specification.measurements,
                output_crs=task.dataset.crs,
                resolution=(-1 * res_by_ten, res_by_ten),
            )
        else:
            data = native_load(
                task.dataset,
                measurements=task.settings.specification.measurements,
                dask_chunks=task.settings.processing.dask_chunks,
                basis=task.settings.specification.basis,
            )
        data = data.rename(task.settings.specification.measurement_renames)

        log.info("Data loaded")

        output_data = transform.compute(data)
        if "time" in output_data.dims:
            output_data = output_data.squeeze("time")

        log.info("Prepared lazy transformation", output_data=output_data)

        output_data = output_data.compute()
        crs = data.attrs["crs"]

        del data
        log.info("Loaded and transformed")

        # Because "/env/lib/python3.6/site-packages/eodatasets3/images.py", line 489, in write_from_ndarray
        # raises TypeError("Datatype not supported: {dt}".format(dt=dtype))
        # TODO: investigate whether this conversion is OK
        dtypes = set(str(v.dtype) for v in output_data.data_vars.values())
        if "int8" in dtypes:
            log.info(
                "Found dtype=int8 in output data, converting to uint8 for geotiffs"
            )
            output_data = output_data.astype("uint8", copy=False)

        if "crs" not in output_data.attrs:
            output_data.attrs["crs"] = crs

        uuid, _ = self._deterministic_uuid(task)

        temp_metadata_path = Path(
            tempfile.gettempdir()) / f"{task.dataset.id}.yaml"
        with DatasetAssembler(
                metadata_path=temp_metadata_path,
                naming_conventions=self.naming_convention,
                dataset_id=uuid,
        ) as dataset_assembler:
            if task.settings.output.reference_source_dataset:
                source_doc = _munge_dataset_to_eo3(task.dataset)
                dataset_assembler.add_source_dataset(
                    source_doc,
                    auto_inherit_properties=True,
                    inherit_geometry=task.settings.output.inherit_geometry,
                    classifier=task.settings.specification.override_product_family,
                )

            # Copy in metadata and properties
            for k, v in task.settings.output.metadata.items():
                setattr(dataset_assembler, k, v)
            for k, v in task.settings.output.properties.items():
                dataset_assembler.properties[k] = v

            # Update the GSD
            dataset_assembler.properties["eo:gsd"] = self._native_resolution(task)

            dataset_assembler.processed = datetime.utcnow()

            dataset_assembler.note_software_version(
                "datacube-alchemist",
                "https://github.com/opendatacube/datacube-alchemist",
                __version__,
            )

            # Software Version of Transformer
            version_url = self._get_transform_info()
            dataset_assembler.note_software_version(
                name=task.settings.specification.transform,
                url=version_url["url"],
                version=version_url["version"],
            )

            # Write it all to a tempdir root, and then either shift or s3 sync it into place
            with tempfile.TemporaryDirectory() as temp_dir:
                # Set up a temporary directory
                dataset_assembler.collection_location = Path(temp_dir)
                # Dodgy hack!
                dataset_assembler._metadata_path = None

                # Write out the data
                dataset_assembler.write_measurements_odc_xarray(
                    output_data,
                    nodata=task.settings.output.nodata,
                    **task.settings.output.write_data_settings,
                )
                log.info("Finished writing measurements")

                # Write out the thumbnail
                _write_thumbnail(task, dataset_assembler)
                log.info("Wrote thumbnail")

                # Do all the deferred work from above
                dataset_id, metadata_path = dataset_assembler.done()
                log.info("Assembled dataset", metadata_path=metadata_path)

                # Write STAC, because it depends on this being .done()
                # Conveniently, this also checks that files are there!
                stac = None
                if task.settings.output.write_stac:
                    stac = _write_stac(metadata_path, task, dataset_assembler)
                    log.info("STAC file written")

                relative_path = dataset_assembler._dataset_location.relative_to(
                    temp_dir)
                if s3_destination:
                    s3_location = (
                        f"s3://{s3_bucket}/{s3_path.rstrip('/')}/{relative_path}"
                    )
                    s3_command = [
                        "aws",
                        "s3",
                        "sync",
                        "--only-show-errors",
                        "--acl bucket-owner-full-control",
                        str(dataset_assembler._dataset_location),
                        s3_location,
                    ]

                    if not dryrun:
                        log.info(f"Syncing files to {s3_location}")
                    else:
                        s3_command.append("--dryrun")
                        log.warning("PRETENDING to sync files to S3",
                                    s3_location=s3_location)

                    log.info("Writing files to s3", location=s3_location)
                    # log.debug("S3 command: ", command=s3_command)
                    subprocess.run(" ".join(s3_command),
                                   shell=True,
                                   check=True)
                else:
                    dest_directory = fs_destination / relative_path
                    if not dryrun:
                        log.info("Writing files to disk",
                                 location=dest_directory)
                        if dest_directory.exists():
                            shutil.rmtree(dest_directory)
                        shutil.copytree(dataset_assembler._dataset_location,
                                        dest_directory)
                    else:
                        log.warning(
                            f"NOT moving data from {temp_dir} to {dest_directory}"
                        )

                log.info("Task complete")
                if stac is not None and sns_arn:
                    if not dryrun:
                        _stac_to_sns(sns_arn, stac)
                elif sns_arn:
                    _LOG.error(
                        "Not posting to SNS because there's no STAC to post")

        return dataset_id, metadata_path