def test_native_load(tmpdir):
    from datacube.testutils.io import native_load, native_geobox

    tmpdir = Path(str(tmpdir))
    spatial = dict(resolution=(15, -15),
                   offset=(11230, 1381110),)
    nodata = -999
    aa = mk_test_image(96, 64, 'int16', nodata=nodata)
    cc = mk_test_image(32, 16, 'int16', nodata=nodata)

    bands = [SimpleNamespace(name=name, values=aa, nodata=nodata)
             for name in ['aa', 'bb']]
    bands.append(SimpleNamespace(name='cc', values=cc, nodata=nodata))

    ds, gbox = gen_tiff_dataset(bands[:2],
                                tmpdir,
                                prefix='ds1-',
                                timestamp='2018-07-19',
                                **spatial)
    assert set(get_raster_info(ds)) == set(ds.measurements)

    xx = native_load(ds)
    assert xx.geobox == gbox
    np.testing.assert_array_equal(aa, xx.isel(time=0).aa.values)
    np.testing.assert_array_equal(aa, xx.isel(time=0).bb.values)

    ds, gbox_cc = gen_tiff_dataset(bands,
                                   tmpdir,
                                   prefix='ds2-',
                                   timestamp='2018-07-19',
                                   **spatial)

    # cc is different size from aa,bb
    with pytest.raises(ValueError):
        xx = native_load(ds)

    # cc is different size from aa,bb
    with pytest.raises(ValueError):
        xx = native_geobox(ds)

    # aa and bb are the same
    assert native_geobox(ds, ['aa', 'bb']) == gbox
    xx = native_load(ds, ['aa', 'bb'])
    assert xx.geobox == gbox
    np.testing.assert_array_equal(aa, xx.isel(time=0).aa.values)
    np.testing.assert_array_equal(aa, xx.isel(time=0).bb.values)

    # cc will be reprojected
    assert native_geobox(ds, basis='aa') == gbox
    xx = native_load(ds, basis='aa')
    assert xx.geobox == gbox
    np.testing.assert_array_equal(aa, xx.isel(time=0).aa.values)
    np.testing.assert_array_equal(aa, xx.isel(time=0).bb.values)

    # cc is compatible with self
    xx = native_load(ds, ['cc'])
    assert xx.geobox == gbox_cc
    np.testing.assert_array_equal(cc, xx.isel(time=0).cc.values)

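# --- Usage sketch (not part of the test above) -------------------------------
# Restates the native_load / native_geobox call patterns exercised by the test,
# wrapped in a small function so the snippet is self-contained; `ds` is whatever
# datacube.model.Dataset with file-based measurements the caller supplies.
def load_native_example(ds):
    from datacube.testutils.io import native_load, native_geobox

    # Native grid defined by the 'aa' band; `basis` picks the reference band
    # when measurements have different resolutions.
    gbox = native_geobox(ds, basis='aa')

    # Load every band onto that grid; bands on other grids are reprojected.
    xx = native_load(ds, basis='aa')
    assert xx.geobox == gbox

    # Alternatively, restrict the load to a subset of bands that share a grid.
    return native_load(ds, ['aa', 'bb'])
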
def _do_fc_task(config, task):
    """
    Load data, run FC algorithm, attach metadata, and write output.

    :param dict config: Config object
    :param dict task: Dictionary describing a single task
    :return: Dataset objects representing the generated data that can be added to the index
    :rtype: list(datacube.model.Dataset)
    """
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    output_product = config['fc_product']

    file_path = Path(task['filename_dataset'])

    uri, band_uris = calc_uris(file_path, variable_params)
    output_measurements = config['fc_product'].measurements.values()

    nbart = io.native_load(task['dataset'], measurements=config['load_bands'])
    if config['band_mapping'] is not None:
        nbart = nbart.rename(config['band_mapping'])

    fc_dataset = run_fc(nbart, output_measurements,
                        config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, sources):
        assert sources
        dataset = make_dataset(product=output_product,
                               sources=sources,
                               extent=nbart.geobox.extent,
                               center_time=labels['time'],
                               uri=uri,
                               band_uris=band_uris,
                               app_info=_get_app_metadata(config),
                               valid_data=polygon_from_sources_extents(sources, nbart.geobox))
        return dataset

    source = Datacube.group_datasets([task['dataset']], 'time')
    datasets = xr_apply(source, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(datasets)

    base, ext = os.path.splitext(file_path)
    if ext == '.tif':
        dataset_to_geotif_yaml(
            dataset=fc_dataset,
            odc_dataset=datasets.item(),
            filename=file_path,
            variable_params=variable_params,
        )
    else:
        write_dataset_to_netcdf(
            dataset=fc_dataset,
            filename=file_path,
            global_attributes=global_attributes,
            variable_params=variable_params,
        )

    return datasets

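# --- Illustrative config/task shapes (assumption) ----------------------------
# These dictionaries mirror only the keys that _do_fc_task reads above; every
# value is a placeholder for illustration, not a real fractional-cover config.
example_fc_config = {
    'global_attributes': {'title': 'Fractional Cover'},       # forwarded to write_dataset_to_netcdf
    'variable_params': {},                                     # per-band storage/encoding options
    'fc_product': None,                                        # datacube.model.Product with the FC measurements
    'load_bands': ['green', 'red', 'nir', 'swir1', 'swir2'],   # hypothetical band list for native_load
    'band_mapping': None,                                      # or a rename dict, e.g. {'nbart_red': 'red'}
    'sensor_regression_coefficients': None,                    # optional, read via config.get(...)
}

example_fc_task = {
    'dataset': None,                        # datacube.model.Dataset to process
    'filename_dataset': '/tmp/output.nc',   # '.tif' selects GeoTIFF + YAML output, anything else NetCDF
}
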
def gen_test_data(prefix, dask=False):
    w, h, dtype, nodata, ndw = 96, 64, 'int16', -999, 7

    aa = mk_test_image(w, h, dtype, nodata, nodata_width=ndw)
    ds, gbox = gen_tiff_dataset(SimpleNamespace(name='aa', values=aa, nodata=nodata),
                                prefix)
    extras = {}
    if dask:
        extras.update(dask_chunks={'time': 1})

    xx = native_load(ds, **extras)

    return xx.aa.isel(time=0), ds

def gen_test_data(prefix, dask=False, shape=None):
    w, h, dtype, nodata, ndw = 96, 64, "int16", -999, 7
    if shape is not None:
        h, w = shape

    aa = mk_test_image(w, h, dtype, nodata, nodata_width=ndw)
    ds, gbox = gen_tiff_dataset(SimpleNamespace(name="aa", values=aa, nodata=nodata),
                                prefix)
    extras = {}
    if dask:
        extras.update(dask_chunks={"time": 1})

    xx = native_load(ds, **extras)

    return xx.aa.isel(time=0), ds

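# --- Usage sketch (hypothetical test, not from the original suite) -----------
# Assumes the helpers above and their imports (mk_test_image, gen_tiff_dataset,
# native_load, SimpleNamespace) are available in the module; `tmp_path` is the
# standard pytest fixture used as the output folder.
def test_gen_test_data_example(tmp_path):
    # Eager, numpy-backed load of the synthetic 'aa' band
    xx, ds = gen_test_data(tmp_path)
    assert xx.dtype == 'int16'
    assert 'aa' in ds.measurements

    # Dask-backed variant: data stays lazy until computed
    yy, _ = gen_test_data(tmp_path, dask=True)
    assert yy.dtype == xx.dtype
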
def execute_task(task: AlchemistTask):
    log = _LOG.bind(task=task)
    transform = _import_transform(task.settings.specification.transform)
    transform = transform(**task.settings.specification.transform_args)

    # Load and process data
    data = native_load(task.dataset,
                       measurements=task.settings.specification.measurements,
                       dask_chunks=task.settings.processing.dask_chunks,
                       basis=task.settings.specification.basis)
    data = data.rename(task.settings.specification.measurement_renames)

    log.info('data loaded')

    output_data = transform.compute(data)
    if 'time' in output_data.dims:
        output_data = output_data.squeeze('time')

    log.info('prepared lazy transformation', output_data=output_data)

    output_data = output_data.compute()
    crs = data.attrs['crs']

    del data
    log.info('loaded and transformed')

    dtypes = set(str(v.dtype) for v in output_data.data_vars.values())
    if 'int8' in dtypes:
        log.info('Found dtype=int8 in output data, converting to uint8 for geotiffs')
        output_data = output_data.astype('uint8', copy=False)

    if 'crs' not in output_data.attrs:
        output_data.attrs['crs'] = crs

    # Ensure output path exists
    output_location = Path(task.settings.output.location)
    output_location.mkdir(parents=True, exist_ok=True)

    uuid, _ = deterministic_uuid(task)

    if task.dataset.metadata.platform.lower().startswith("sentinel"):
        name = "dea_s2"
    else:
        name = "dea"

    with DatasetAssembler(output_location, naming_conventions=name, dataset_id=uuid) as p:
        if task.settings.output.reference_source_dataset:
            source_doc = _munge_dataset_to_eo3(task.dataset)
            p.add_source_dataset(source_doc,
                                 auto_inherit_properties=True,
                                 classifier=task.settings.specification.override_product_family)

        # Copy in metadata and properties
        for k, v in task.settings.output.metadata.items():
            setattr(p, k, v)
        for k, v in task.settings.output.properties.items():
            p.properties[k] = v

        p.processed = datetime.utcnow()

        p.note_software_version(
            'datacube-alchemist',
            "https://github.com/opendatacube/datacube-alchemist",
            __version__
        )

        # Software Version of Transformer
        version_url = get_transform_info(task.settings.specification.transform)
        p.note_software_version(name=task.settings.specification.transform,
                                url=version_url['url'],
                                version=version_url['version'])

        # TODO Note configuration settings of this Task
        # p.extend_user_metadata()

        # TODO Check whether output already exists
        p.write_measurements_odc_xarray(
            output_data,
            nodata=task.settings.output.nodata,
            **task.settings.output.write_data_settings
        )

        if task.settings.output.preview_image is not None:
            p.write_thumbnail(*task.settings.output.preview_image)

        dataset_id, metadata_path = p.done()

    return dataset_id, metadata_path

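# --- Illustrative task layout (assumption) -----------------------------------
# execute_task above only dereferences the attribute paths mirrored below; this
# placeholder reproduces those paths with SimpleNamespace.  The nesting and all
# values are assumptions for illustration, not the real AlchemistTask definition.
from types import SimpleNamespace

example_alchemist_task = SimpleNamespace(
    dataset=None,  # datacube.model.Dataset to transform
    settings=SimpleNamespace(
        specification=SimpleNamespace(
            transform='mypackage.transforms.MyTransform',  # hypothetical dotted import path
            transform_args={},
            measurements=['red', 'green'],                  # hypothetical band names
            measurement_renames={},
            basis=None,
            override_product_family=None,
        ),
        processing=SimpleNamespace(dask_chunks={'x': 2048, 'y': 2048}),
        output=SimpleNamespace(
            location='/tmp/alchemist-output',
            reference_source_dataset=False,
            metadata={},
            properties={},
            nodata=-999,
            write_data_settings={},
            preview_image=None,
        ),
    ),
)
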
def execute_task(self, task: AlchemistTask, dryrun: bool = False, sns_arn: str = None):
    log = _LOG.bind(task=task.dataset.id)
    log.info("Task commencing", task=task)

    # Make sure our task makes sense and store it
    if task.settings.specification.transform != self.transform_name:
        raise ValueError("Task transform is different to the Alchemist transform")
    transform = self._transform_with_args(task)

    # Ensure output path exists, this should be fine for file or s3 paths
    s3_destination = None
    try:
        s3_bucket, s3_path = s3_url_parse(task.settings.output.location)
        s3_destination = True
    except ValueError:
        fs_destination = Path(task.settings.output.location)

    # Load and process data in a decimated array
    if dryrun:
        res_by_ten = self._native_resolution(task) * 10
        data = self.dc.load(
            product=task.dataset.type.name,
            id=task.dataset.id,
            measurements=task.settings.specification.measurements,
            output_crs=task.dataset.crs,
            resolution=(-1 * res_by_ten, res_by_ten),
        )
    else:
        data = native_load(
            task.dataset,
            measurements=task.settings.specification.measurements,
            dask_chunks=task.settings.processing.dask_chunks,
            basis=task.settings.specification.basis,
        )
    data = data.rename(task.settings.specification.measurement_renames)

    log.info("Data loaded")

    output_data = transform.compute(data)
    if "time" in output_data.dims:
        output_data = output_data.squeeze("time")

    log.info("Prepared lazy transformation", output_data=output_data)

    output_data = output_data.compute()
    crs = data.attrs["crs"]

    del data
    log.info("Loaded and transformed")

    # Because "/env/lib/python3.6/site-packages/eodatasets3/images.py", line 489, in write_from_ndarray
    # raise TypeError("Datatype not supported: {dt}".format(dt=dtype))
    # TODO: investigate if this is ok
    dtypes = set(str(v.dtype) for v in output_data.data_vars.values())
    if "int8" in dtypes:
        log.info("Found dtype=int8 in output data, converting to uint8 for geotiffs")
        output_data = output_data.astype("uint8", copy=False)

    if "crs" not in output_data.attrs:
        output_data.attrs["crs"] = crs

    uuid, _ = self._deterministic_uuid(task)

    temp_metadata_path = Path(tempfile.gettempdir()) / f"{task.dataset.id}.yaml"

    with DatasetAssembler(
        metadata_path=temp_metadata_path,
        naming_conventions=self.naming_convention,
        dataset_id=uuid,
    ) as dataset_assembler:
        if task.settings.output.reference_source_dataset:
            source_doc = _munge_dataset_to_eo3(task.dataset)
            dataset_assembler.add_source_dataset(
                source_doc,
                auto_inherit_properties=True,
                inherit_geometry=task.settings.output.inherit_geometry,
                classifier=task.settings.specification.override_product_family,
            )

        # Copy in metadata and properties
        for k, v in task.settings.output.metadata.items():
            setattr(dataset_assembler, k, v)
        for k, v in task.settings.output.properties.items():
            dataset_assembler.properties[k] = v

        # Update the GSD
        dataset_assembler.properties["eo:gsd"] = self._native_resolution(task)

        dataset_assembler.processed = datetime.utcnow()

        dataset_assembler.note_software_version(
            "datacube-alchemist",
            "https://github.com/opendatacube/datacube-alchemist",
            __version__,
        )

        # Software Version of Transformer
        version_url = self._get_transform_info()
        dataset_assembler.note_software_version(
            name=task.settings.specification.transform,
            url=version_url["url"],
            version=version_url["version"],
        )

        # Write it all to a tempdir root, and then either shift or s3 sync it into place
        with tempfile.TemporaryDirectory() as temp_dir:
            # Set up a temporary directory
            dataset_assembler.collection_location = Path(temp_dir)
            # Dodgy hack!
            dataset_assembler._metadata_path = None

            # Write out the data
            dataset_assembler.write_measurements_odc_xarray(
                output_data,
                nodata=task.settings.output.nodata,
                **task.settings.output.write_data_settings,
            )
            log.info("Finished writing measurements")

            # Write out the thumbnail
            _write_thumbnail(task, dataset_assembler)
            log.info("Wrote thumbnail")

            # Do all the deferred work from above
            dataset_id, metadata_path = dataset_assembler.done()
            log.info("Assembled dataset", metadata_path=metadata_path)

            # Write STAC, because it depends on this being .done()
            # Conveniently, this also checks that files are there!
            stac = None
            if task.settings.output.write_stac:
                stac = _write_stac(metadata_path, task, dataset_assembler)
                log.info("STAC file written")

            relative_path = dataset_assembler._dataset_location.relative_to(temp_dir)
            if s3_destination:
                s3_location = f"s3://{s3_bucket}/{s3_path.rstrip('/')}/{relative_path}"
                s3_command = [
                    "aws",
                    "s3",
                    "sync",
                    "--only-show-errors",
                    "--acl bucket-owner-full-control",
                    str(dataset_assembler._dataset_location),
                    s3_location,
                ]

                if not dryrun:
                    log.info(f"Syncing files to {s3_location}")
                else:
                    s3_command.append("--dryrun")
                    log.warning("PRETENDING to sync files to S3", s3_location=s3_location)

                log.info("Writing files to s3", location=s3_location)
                # log.debug("S3 command: ", command=s3_command)
                subprocess.run(" ".join(s3_command), shell=True, check=True)
            else:
                dest_directory = fs_destination / relative_path
                if not dryrun:
                    log.info("Writing files to disk", location=dest_directory)
                    if dest_directory.exists():
                        shutil.rmtree(dest_directory)
                    shutil.copytree(dataset_assembler._dataset_location, dest_directory)
                else:
                    log.warning(f"NOT moving data from {temp_dir} to {dest_directory}")

    log.info("Task complete")
    if stac is not None and sns_arn:
        if not dryrun:
            _stac_to_sns(sns_arn, stac)
    elif sns_arn:
        _LOG.error("Not posting to SNS because there's no STAC to post")

    return dataset_id, metadata_path