def ingest_work(config, source_type, output_type, tile, tile_index):
    """
    Load the data for one tile, rename its variables and write it to NetCDF.

    :param config: ingestion configuration; ``global_attributes``,
        ``taskfile_version`` and ``filename`` are read from it directly
    :param source_type: forwarded to ``get_measurements`` along with ``config``
    :param output_type: product handed to ``make_dataset`` for every record
    :param tile: object providing ``sources`` and ``geobox``
    :param tile_index: identifies the task in log messages and in the filename
    :return: result of applying ``_make_dataset`` over ``tile.sources`` with
        ``xr_apply``
    :raises KeyError: if the configured fuser is anything other than ``'copy'``
    """
    _LOG.info('Starting task %s', tile_index)

    rename_map = get_namemap(config)
    bands = get_measurements(source_type, config)
    var_params = get_variable_params(config)
    file_attrs = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        # 'copy' is the only fuser understood here and maps to "no fuse function".
        fuser = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        loaded = Datacube.load_data(tile.sources, tile.geobox, bands,
                                    fuse_func=fuser)

    renamed = loaded.rename(rename_map)
    out_path = get_filename(config, tile_index, tile.sources,
                            version=config['taskfile_version'])

    def _make_dataset(labels, group):
        # Called by xr_apply with the coordinate labels and the source group.
        return make_dataset(
            product=output_type,
            sources=group,
            extent=tile.geobox.extent,
            center_time=labels['time'],
            uri=out_path.absolute().as_uri(),
            app_info=get_app_metadata(config, config['filename']),
            valid_data=GeoPolygon.from_sources_extents(group, tile.geobox),
        )

    records = xr_apply(tile.sources, _make_dataset, dtype='O')

    # Keep the records in the written data so each time maps to its dataset.
    renamed['dataset'] = datasets_to_doc(records)

    write_dataset_to_netcdf(renamed, out_path, file_attrs, var_params)
    _LOG.info('Finished task %s', tile_index)
    return records
def ingest_work(config, source_type, output_type, index, sources, geobox):
    """
    Load the given sources onto ``geobox``, rename the variables and write
    the result to a NetCDF file.

    :param config: ingestion configuration; ``global_attributes`` and
        ``filename`` are read from it directly
    :param source_type: forwarded to ``get_measurements`` along with ``config``
    :param output_type: dataset type handed to ``make_dataset``
    :param index: passed to ``get_filename`` to build the output path
    :param sources: grouped source datasets to load and to describe
    :param geobox: target grid; supplies ``extent`` and ``crs``
    :return: result of applying ``_make_dataset`` over ``sources`` with
        ``xr_apply``
    :raises KeyError: if the configured fuser is anything other than ``'copy'``
    """
    rename_map = get_namemap(config)
    bands = get_measurements(source_type, config)
    var_params = get_variable_params(config)
    file_attrs = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        # 'copy' is the only fuser understood here and maps to "no fuse function".
        fuser = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        loaded = Datacube.product_data(sources, geobox, bands, fuse_func=fuser)

    renamed = loaded.rename(rename_map)
    out_path = get_filename(config, index, sources)

    def _make_dataset(labels, group):
        # Footprint = tile extent clipped to the union of the source extents,
        # with every source extent first converted to the tile CRS.
        source_points = [member.extent.to_crs(geobox.crs).points for member in group]
        merged = union_points(*source_points)
        clipped = intersect_points(geobox.extent.points, merged)
        return make_dataset(
            dataset_type=output_type,
            sources=group,
            extent=geobox.extent,
            center_time=labels['time'],
            uri=out_path.absolute().as_uri(),
            app_info=get_app_metadata(config, config['filename']),
            valid_data=GeoPolygon(clipped, geobox.crs),
        )

    records = xr_apply(sources, _make_dataset, dtype='O')

    # Keep the records in the written data so each time maps to its dataset.
    renamed['dataset'] = datasets_to_doc(records)

    write_dataset_to_netcdf(renamed, file_attrs, var_params, out_path)
    return records
def ingest_work(config, source_type, output_type, tile, tile_index):  # pylint: disable=too-many-locals
    """
    Load one tile and write it through the configured storage driver.

    :param config: ingestion configuration; ``storage``, ``global_attributes``
        and ``filename`` are read from it directly
    :param source_type: forwarded to ``get_measurements`` along with ``config``
    :param output_type: product handed to ``make_dataset`` for every record
    :param tile: object providing ``sources`` and ``geobox``
    :param tile_index: identifies the task in log messages and in the filename
    :return: result of applying ``_make_dataset`` over ``tile.sources`` with
        ``xr_apply``; ``attrs['driver_data']`` is set when the driver returns
        a non-empty value
    :raises ValueError: if ``storage_writer_by_name`` returns ``None``
    :raises KeyError: if the configured fuser is anything other than ``'copy'``
    """
    _LOG.info('Starting task %s', tile_index)

    writer = storage_writer_by_name(config['storage']['driver'])
    if writer is None:
        _LOG.error('Failed to load storage driver %s', config['storage']['driver'])
        raise ValueError('Something went wrong: no longer can find driver pointed by storage.driver option')

    rename_map = get_namemap(config)
    # TODO: get_measurements possibly changes dtype, not sure load_data would like that
    bands = get_measurements(source_type, config)
    resample_method = get_resampling(config)
    var_params = get_variable_params(config)
    file_attrs = config['global_attributes']

    # 'copy' is the only fuser understood here and maps to "no fuse function".
    fuser = {'copy': None}[config.get(FUSER_KEY, 'copy')]

    # Report (but do not reject) source datasets that have no location.
    for source_dataset in tile.sources.sum().item():
        if not source_dataset.uris:
            _LOG.error('Locationless dataset found in the database: %r', source_dataset)

    loaded = Datacube.load_data(tile.sources, tile.geobox, bands,
                                resampling=resample_method, fuse_func=fuser)
    renamed = loaded.rename(rename_map)

    out_path = get_filename(config, tile_index, tile.sources)
    out_uri = writer.mk_uri(out_path, config['storage'])

    def _make_dataset(labels, group):
        return make_dataset(
            product=output_type,
            sources=group,
            extent=tile.geobox.extent,
            center_time=labels['time'],
            uri=out_uri,
            app_info=get_app_metadata(config['filename']),
            valid_data=polygon_from_sources_extents(group, tile.geobox),
        )

    records = xr_apply(tile.sources, _make_dataset, dtype='O')

    # Keep the records in the written data so each time maps to its dataset.
    renamed['dataset'] = datasets_to_doc(records)

    var_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    written = writer.write_dataset_to_storage(renamed, out_uri,
                                              global_attributes=file_attrs,
                                              variable_params=var_params,
                                              storage_config=config['storage'])

    if written is not None:
        if len(written) > 0:
            records.attrs['driver_data'] = written

    _LOG.info('Finished task %s', tile_index)
    return records
def do_fixer_task(config, task):
    """
    Rewrite the data of one tile into a single NetCDF file.

    The file is written under a temporary name and renamed to the final name
    only after the write has finished. If anything fails before that, the
    temporary file is removed and the exception is re-raised.

    :param config: task configuration; ``global_attributes``,
        ``variable_params``, ``storage.chunking.time`` and the optional
        ``check_data_identical`` flag are read from it
    :param task: mapping with ``output_filename`` and ``tile``
    :return: tuple ``(unwrapped_datasets, output_uri)``
    :raises DatacubeException: if the loaded data has no geobox or no CRS
    """
    # NOTE: this writes 'history' into config['global_attributes'] in place.
    global_attributes = config['global_attributes']
    # Don't keep the original history if we are trying to fix it
    global_attributes['history'] = build_history_string(config, task, keep_original=False)

    variable_params = config['variable_params']

    output_filename = Path(task['output_filename'])
    output_uri = output_filename.absolute().as_uri()
    temp_filename = get_temp_file(output_filename)
    tile = task['tile']

    # Only use the time chunk size (eg 5), but not spatial chunks
    # This means the file only gets opened once per band, and all data is available when compressing on write
    # 5 * 4000 * 4000 * 2bytes == 152MB, so mem usage is not an issue
    chunk_profile = {'time': config['storage']['chunking']['time']}

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=chunk_profile)

    unwrapped_datasets = xr_apply(tile.sources, _unwrap_dataset_list, dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    try:
        if data.geobox is None:
            raise DatacubeException(
                'Dataset geobox property is None, cannot write to NetCDF file.'
            )

        if data.geobox.crs is None:
            raise DatacubeException(
                'Dataset geobox.crs property is None, cannot write to NetCDF file.'
            )

        nco = create_netcdf_storage_unit(temp_filename,
                                         data.geobox.crs,
                                         data.coords,
                                         data.data_vars,
                                         variable_params,
                                         global_attributes)
        try:
            write_data_variables(data.data_vars, nco)
        finally:
            # Always release the file handle, so a failed write does not leave
            # the temporary file open when it is unlinked below.
            nco.close()

        temp_filename.rename(output_filename)

        if config.get('check_data_identical', False):
            new_tile = make_updated_tile(unwrapped_datasets, output_uri, tile.geobox)
            new_data = datacube.api.GridWorkflow.load(new_tile, dask_chunks=chunk_profile)
            check_identical(data, new_data, output_filename)

    except Exception:
        if temp_filename.exists():
            temp_filename.unlink()
        # Bare raise keeps the original traceback intact.
        raise

    return unwrapped_datasets, output_uri
def _do_fc_task(config, task):
    """
    Load data, run the FC algorithm, attach metadata, and write the output.

    The output is written as GeoTIFF + YAML when the target filename ends in
    ``.tif`` and as NetCDF otherwise.

    :param dict config: Config object
    :param dict task: Task description; ``filename_dataset`` and ``dataset``
        are read from it
    :return: result of applying ``_make_dataset`` with ``xr_apply`` over the
        task dataset grouped by time
    """
    file_attrs = config['global_attributes']
    var_params = config['variable_params']
    fc_product = config['fc_product']

    out_path = Path(task['filename_dataset'])
    uri, band_uris = calc_uris(out_path, var_params)
    fc_measurements = config['fc_product'].measurements.values()

    nbart = io.native_load(task['dataset'], measurements=config['load_bands'])
    band_mapping = config['band_mapping']
    if band_mapping is not None:
        nbart = nbart.rename(band_mapping)

    fc_dataset = run_fc(nbart, fc_measurements,
                        config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, group):
        assert group
        return make_dataset(
            product=fc_product,
            sources=group,
            extent=nbart.geobox.extent,
            center_time=labels['time'],
            uri=uri,
            band_uris=band_uris,
            app_info=_get_app_metadata(config),
            valid_data=polygon_from_sources_extents(group, nbart.geobox),
        )

    grouped = Datacube.group_datasets([task['dataset']], 'time')
    records = xr_apply(grouped, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(records)

    extension = os.path.splitext(out_path)[1]
    if extension != '.tif':
        write_dataset_to_netcdf(
            dataset=fc_dataset,
            filename=out_path,
            global_attributes=file_attrs,
            variable_params=var_params,
        )
        return records

    dataset_to_geotif_yaml(
        dataset=fc_dataset,
        odc_dataset=records.item(),
        filename=out_path,
        variable_params=var_params,
    )
    return records
def _find_source_datasets(self, stat: OutputProduct, uri: str = None, band_uris: dict = None) -> xarray.DataArray:
    """
    Build the dataset documents describing the output of this task.

    All data and mask sources of the task are merged, de-duplicated, given a
    single ``time`` coordinate at the start of the stats period, and turned
    into one output dataset per entry via ``make_dataset``.

    :param stat: output product; its ``product`` is used for the new datasets
    :param uri: location recorded on the new datasets
    :param band_uris: per-band locations passed through to ``make_dataset``
    :return: result of ``datasets_to_doc`` applied to the new datasets
    :raises StatsOutputError: if the merged sources evaluate as false
    """
    task = self._task
    geobox = task.geobox
    app_info = self._app_info

    def _sum_up(items):
        return reduce_(operator.add, items)

    def _product_sources(prod):
        # Merge data sources and mask sources:
        # align the data `Tile` with potentially many mask `Tile`s along their time axis.
        data_sources = prod.data.sources
        mask_sources = [mask_tile.sources for mask_tile in prod.masks if mask_tile]
        aligned = xarray.align(data_sources, *mask_sources)

        # TODO: The following can fail if prod.data and prod.masks have different times
        # Which can happen in the case of a missing PQ Scene, where there is a scene overlap
        # ie. Two overlapped NBAR scenes, One PQ scene (the later)
        return _sum_up(each.sum() for each in aligned)

    def _dedupe(index, dataset_tuple):
        return tuple(set(dataset_tuple))

    merged = _sum_up(_product_sources(prod) for prod in task.sources)
    merged = xr_apply(merged, _dedupe, dtype='O')

    # The merged sources have no time axis any more, so put one back at the
    # start of the stats epoch.
    start_time, _ = task.time_period
    merged = unsqueeze_data_array(merged, dim='time', pos=0, coord=start_time,
                                  attrs=task.time_attributes)

    if not merged:
        raise StatsOutputError('No valid sources found, or supplied sources do not align to the same time.\n'
                               'Unable to write dataset metadata.')

    def _make_dataset(labels, group):
        return make_dataset(
            product=stat.product,
            sources=group,
            extent=geobox.extent,
            center_time=labels['time'],
            uri=uri,
            band_uris=band_uris,
            app_info=app_info,
            valid_data=polygon_from_sources_extents(group, geobox),
        )

    records = xr_apply(merged, _make_dataset, dtype='O')

    # Convert to documents, still associated Time -> Dataset.
    return datasets_to_doc(records)
def ingest_work(config, source_type, output_type, tile, tile_index):  # pylint: disable=too-many-locals
    """
    Load one tile and write it through the configured storage driver.

    :param config: ingestion configuration; ``storage``, ``global_attributes``
        and ``filename`` are read from it directly
    :param source_type: forwarded to ``get_measurements`` along with ``config``
    :param output_type: product handed to ``make_dataset`` for every record
    :param tile: object providing ``sources`` and ``geobox``
    :param tile_index: identifies the task in log messages and in the filename
    :return: result of applying ``_make_dataset`` over ``tile.sources`` with
        ``xr_apply``; ``attrs['storage_metadata']`` is set when the driver
        returns a non-empty value
    :raises ValueError: if ``storage_writer_by_name`` returns ``None``
    :raises KeyError: if the configured fuser is anything other than ``'copy'``
    """
    _LOG.info('Starting task %s', tile_index)

    writer = storage_writer_by_name(config['storage']['driver'])
    if writer is None:
        _LOG.error('Failed to load storage driver %s', config['storage']['driver'])
        raise ValueError('Something went wrong: no longer can find driver pointed by storage.driver option')

    rename_map = get_namemap(config)
    bands = get_measurements(source_type, config)
    var_params = get_variable_params(config)
    file_attrs = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        # 'copy' is the only fuser understood here and maps to "no fuse function".
        fuser = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        loaded = Datacube.load_data(tile.sources, tile.geobox, bands,
                                    fuse_func=fuser)

    renamed = loaded.rename(rename_map)
    out_path = get_filename(config, tile_index, tile.sources)

    def mk_uri(file_path):
        # Local files get a proper file:// URI; any other scheme is prefixed as-is.
        if driver_scheme_is_file():
            return file_path.absolute().as_uri()
        return '{}://{}'.format(writer.uri_scheme, file_path)

    def driver_scheme_is_file():
        return writer.uri_scheme == "file"

    def _make_dataset(labels, group):
        return make_dataset(
            product=output_type,
            sources=group,
            extent=tile.geobox.extent,
            center_time=labels['time'],
            uri=mk_uri(out_path),
            app_info=get_app_metadata(config, config['filename']),
            valid_data=GeoPolygon.from_sources_extents(group, tile.geobox),
        )

    records = xr_apply(tile.sources, _make_dataset, dtype='O')

    # Keep the records in the written data so each time maps to its dataset.
    renamed['dataset'] = datasets_to_doc(records)

    var_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    written = writer.write_dataset_to_storage(renamed, out_path,
                                              global_attributes=file_attrs,
                                              variable_params=var_params,
                                              storage_config=config['storage'])

    if written is not None:
        if len(written) > 0:
            records.attrs['storage_metadata'] = written

    _LOG.info('Finished task %s', tile_index)
    return records
def do_stack_task(config, task):
    """
    Write the data of one tile into a single stacked NetCDF file.

    The file is written under a temporary name and renamed to the final name
    only after the write has finished. If anything fails before that, the
    temporary file is removed and the exception is re-raised.

    :param config: task configuration; ``global_attributes``,
        ``variable_params``, ``storage.chunking.time`` and the optional
        ``check_data_identical`` flag are read from it
    :param task: mapping with ``output_filename`` and ``tile``
    :return: tuple ``(unwrapped_datasets, output_uri)``
    """
    # NOTE: 'history' and the 'dataset' variable parameters below are written
    # into the dictionaries held by config, i.e. config is modified in place.
    global_attributes = config['global_attributes']
    global_attributes['history'] = get_history_attribute(config, task)

    variable_params = config['variable_params']
    variable_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    output_filename = Path(task['output_filename'])
    output_uri = output_filename.absolute().as_uri()
    temp_filename = get_temp_file(output_filename)
    tile = task['tile']

    # Only use the time chunk size (eg 5), but not spatial chunks
    # This means the file only gets opened once per band, and all data is available when compressing on write
    # 5 * 4000 * 4000 * 2bytes == 152MB, so mem usage is not an issue
    chunk_profile = {'time': config['storage']['chunking']['time']}

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=chunk_profile)

    unwrapped_datasets = xr_apply(tile.sources, _unwrap_dataset_list, dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    try:
        nco = create_netcdf_storage_unit(temp_filename,
                                         data.crs,
                                         data.coords,
                                         data.data_vars,
                                         variable_params,
                                         global_attributes)
        try:
            write_data_variables(data.data_vars, nco)
        finally:
            # Always release the file handle, so a failed write does not leave
            # the temporary file open when it is unlinked below.
            nco.close()

        temp_filename.rename(output_filename)

        if config.get('check_data_identical', False):
            new_tile = make_updated_tile(unwrapped_datasets, output_uri, tile.geobox)
            new_data = datacube.api.GridWorkflow.load(new_tile, dask_chunks=chunk_profile)
            check_identical(data, new_data, output_filename)

    except Exception:
        if temp_filename.exists():
            temp_filename.unlink()
        # Bare raise keeps the original traceback intact.
        raise

    return unwrapped_datasets, output_uri
def do_stack_task(config, task):
    """
    Write the data of one tile into a single NetCDF file and verify it.

    After writing, the tile is loaded again from the new file and compared
    with the data that was written; a mismatch aborts the task.

    :param config: task configuration; ``global_attributes``,
        ``variable_params`` and ``storage.chunking`` are read from it
    :param task: mapping with ``output_filename`` and ``tile``
    :return: tuple ``(unwrapped_datasets, output_uri)``
    :raises ValueError: if the data read back differs from the data written
    """
    # NOTE: 'history' is written into config['global_attributes'] in place.
    global_attributes = config['global_attributes']
    global_attributes['history'] = get_history_attribute(config, task)

    variable_params = config['variable_params']

    output_filename = Path(task['output_filename'])
    output_uri = output_filename.absolute().as_uri()
    tile = task['tile']

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=config['storage']['chunking'])

    unwrapped_datasets = xr_apply(tile.sources, _unwrap_dataset_list, dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    nco = create_netcdf_storage_unit(output_filename,
                                     data.crs,
                                     data.coords,
                                     data.data_vars,
                                     variable_params,
                                     global_attributes)
    try:
        for name, variable in data.data_vars.items():
            try:
                # dask.get is dask's synchronous scheduler. It replaces the
                # spelling ``dask.async.get_sync``, which cannot be parsed on
                # Python 3.7+ because ``async`` is a reserved keyword.
                with dask.set_options(get=dask.get):
                    da.store(variable.data, nco[name], lock=True)
            except ValueError:
                # Fall back to writing the whole variable in one assignment.
                nco[name][:] = netcdf_writer.netcdfy_data(variable.values)
            nco.sync()
    finally:
        # Release the file handle even when a write fails.
        nco.close()

    def update_dataset_location(labels, dataset):
        # Point a copy of the dataset at the new file. The copy must be the
        # value returned, otherwise the check below re-reads the original
        # files instead of the one just written.
        new_dataset = copy.copy(dataset)
        new_dataset.local_uri = output_uri
        return [new_dataset]

    updated_datasets = xr_apply(unwrapped_datasets, update_dataset_location, dtype='O')
    new_tile = datacube.api.Tile(sources=updated_datasets, geobox=tile.geobox)
    new_data = datacube.api.GridWorkflow.load(new_tile, dask_chunks=config['storage']['chunking'])

    if not data.identical(new_data):
        _LOG.error("Mismatch found for %s, not indexing", output_filename)
        raise ValueError("Mismatch found for %s, not indexing" % output_filename)

    return unwrapped_datasets, output_uri
def do_ndvi_task(config, task):
    """
    Compute NDVI for one NBAR tile and write it to a new NetCDF file.

    :param config: task configuration; ``global_attributes``,
        ``variable_params`` and ``ndvi_dataset_type`` are read from it
    :param task: mapping with ``filename`` and ``nbar``
    :return: result of applying ``_make_dataset`` over the NBAR tile sources
        with ``xr_apply``
    :raises OSError: with ``errno.EEXIST`` if the output file already exists
    """
    file_attrs = config['global_attributes']
    var_params = config['variable_params']
    out_path = Path(task['filename'])
    ndvi_product = config['ndvi_dataset_type']

    ndvi_measurement = ndvi_product.measurements['ndvi']
    out_dtype = np.dtype(ndvi_measurement['dtype'])
    # Express the nodata value in the output dtype.
    nodata = np.dtype(out_dtype).type(ndvi_measurement['nodata'])

    if file_path_exists(out_path):
        raise OSError(errno.EEXIST, 'Output file already exists', str(out_path))

    nbar_tile = task['nbar']
    nbar = GridWorkflow.load(nbar_tile, ['red', 'nir'])

    ndvi = calculate_ndvi(nbar, nodata=nodata, dtype=out_dtype,
                          units=ndvi_measurement['units'])

    def _make_dataset(labels, group):
        assert len(group)
        geobox = nbar.geobox
        # Footprint = tile extent clipped to the union of the source extents,
        # with every source extent first converted to the tile CRS.
        source_points = [member.extent.to_crs(geobox.crs).points for member in group]
        merged = union_points(*source_points)
        clipped = intersect_points(geobox.extent.points, merged)
        return make_dataset(
            product=ndvi_product,
            sources=group,
            extent=geobox.extent,
            center_time=labels['time'],
            uri=out_path.absolute().as_uri(),
            app_info=get_app_metadata(config),
            valid_data=GeoPolygon(clipped, geobox.crs),
        )

    records = xr_apply(nbar_tile.sources, _make_dataset, dtype='O')
    ndvi['dataset'] = datasets_to_doc(records)

    write_dataset_to_netcdf(
        dataset=ndvi,
        filename=Path(out_path),
        global_attributes=file_attrs,
        variable_params=var_params,
    )
    return records


def file_path_exists(path):
    """Return whether ``path`` (a ``Path``) exists on disk."""
    return path.exists()
def _do_fc_task(config, task):
    """
    Load data, run the FC algorithm, attach metadata, and write the output.

    :param dict config: Config object
    :param dict task: Task description; ``filename`` and ``nbart`` are read
        from it
    :return: result of applying ``_make_dataset`` over the NBART tile sources
        with ``xr_apply``
    :raises OSError: with ``errno.EEXIST`` if the output file already exists
    """
    file_attrs = config['global_attributes']
    var_params = config['variable_params']
    out_path = Path(task['filename'])
    fc_product = config['fc_product']

    if out_path.exists():
        raise OSError(errno.EEXIST, 'Output file already exists', str(out_path))

    nbart_tile: Tile = task['nbart']
    input_bands = ['green', 'red', 'nir', 'swir1', 'swir2']
    nbart = GridWorkflow.load(nbart_tile, input_bands)

    fc_measurements = config['fc_product'].measurements.values()
    fc_dataset = _make_fc_tile(nbart, fc_measurements,
                               config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, group):
        assert group
        return make_dataset(
            product=fc_product,
            sources=group,
            extent=nbart.geobox.extent,
            center_time=labels['time'],
            uri=out_path.absolute().as_uri(),
            app_info=_get_app_metadata(config),
            valid_data=polygon_from_sources_extents(group, nbart.geobox),
        )

    records = xr_apply(nbart_tile.sources, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(records)

    write_dataset_to_netcdf(
        dataset=fc_dataset,
        filename=out_path,
        global_attributes=file_attrs,
        variable_params=var_params,
    )
    return records
def ingest_work(driver_manager, config, source_type, output_type, tile, tile_index):
    """
    Load one tile and write it to storage through ``driver_manager``.

    :param driver_manager: used both for loading the data and for writing the
        result; closed before returning when its ``is_clone`` flag is set
    :param config: ingestion configuration; ``global_attributes`` and
        ``filename`` are read from it directly
    :param source_type: forwarded to ``get_measurements`` along with ``config``
    :param output_type: product handed to ``make_dataset`` for every record
    :param tile: object providing ``sources`` and ``geobox``
    :param tile_index: identifies the task in log messages and in the filename
    :return: result of applying ``_make_dataset`` over ``tile.sources`` with
        ``xr_apply``; ``attrs['storage_output']`` holds the value returned by
        the driver manager's write call
    :raises KeyError: if the configured fuser is anything other than ``'copy'``
    """
    _LOG.info('Starting task %s', tile_index)

    try:
        namemap = get_namemap(config)
        measurements = get_measurements(source_type, config)
        variable_params = get_variable_params(config)
        global_attributes = config['global_attributes']

        with datacube.set_options(reproject_threads=1):
            # 'copy' is the only fuser understood here and maps to "no fuse function".
            fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
            data = Datacube.load_data(tile.sources, tile.geobox, measurements,
                                      fuse_func=fuse_func, driver_manager=driver_manager)
        nudata = data.rename(namemap)
        file_path = get_filename(config, tile_index, tile.sources)

        def _make_dataset(labels, sources):
            return make_dataset(product=output_type,
                                sources=sources,
                                extent=tile.geobox.extent,
                                center_time=labels['time'],
                                uri=file_path.absolute().as_uri(),
                                app_info=get_app_metadata(config, config['filename']),
                                valid_data=GeoPolygon.from_sources_extents(sources, tile.geobox))

        datasets = xr_apply(tile.sources, _make_dataset, dtype='O')

        # Store in DataArray to associate Time -> Dataset
        nudata['dataset'] = datasets_to_doc(datasets)

        datasets.attrs['storage_output'] = driver_manager.write_dataset_to_storage(
            nudata, file_path, global_attributes, variable_params)
        _LOG.info('Finished task %s', tile_index)
        return datasets
    finally:
        # When using multiproc executor, Driver Manager is a clone. Close it
        # on every exit path so a failed task does not leave the clone open.
        if driver_manager.is_clone:
            driver_manager.close()
def do_ndvi_task(config, task):
    """
    Compute NDVI for one NBAR tile and write it to a new NetCDF file.

    :param config: task configuration; ``global_attributes``,
        ``variable_params`` and ``ndvi_dataset_type`` are read from it
    :param task: mapping with ``filename`` and ``nbar``
    :return: result of applying ``_make_dataset`` over the NBAR tile sources
        with ``xr_apply``
    :raises OSError: with ``errno.EEXIST`` if the output file already exists
    """
    file_attrs = config['global_attributes']
    var_params = config['variable_params']
    out_path = Path(task['filename'])
    ndvi_product = config['ndvi_dataset_type']

    ndvi_measurement = ndvi_product.measurements['ndvi']
    out_dtype = np.dtype(ndvi_measurement['dtype'])
    # Express the nodata value in the output dtype.
    nodata = np.dtype(out_dtype).type(ndvi_measurement['nodata'])

    if out_path.exists():
        raise OSError(errno.EEXIST, 'Output file already exists', str(out_path))

    input_bands = ['red', 'nir']
    nbar_tile = task['nbar']
    nbar = GridWorkflow.load(nbar_tile, input_bands)

    ndvi = calculate_ndvi(nbar, nodata=nodata, dtype=out_dtype,
                          units=ndvi_measurement['units'])

    def _make_dataset(labels, group):
        assert len(group)
        geobox = nbar.geobox
        # Footprint = tile extent clipped to the union of the source extents,
        # with every source extent first converted to the tile CRS.
        merged = union_points(
            *[member.extent.to_crs(geobox.crs).points for member in group])
        clipped = intersect_points(geobox.extent.points, merged)
        record = make_dataset(
            product=ndvi_product,
            sources=group,
            extent=geobox.extent,
            center_time=labels['time'],
            uri=out_path.absolute().as_uri(),
            app_info=get_app_metadata(config),
            valid_data=GeoPolygon(clipped, geobox.crs),
        )
        return record

    records = xr_apply(nbar_tile.sources, _make_dataset, dtype='O')
    ndvi['dataset'] = datasets_to_doc(records)

    write_dataset_to_netcdf(
        dataset=ndvi,
        filename=Path(out_path),
        global_attributes=file_attrs,
        variable_params=var_params,
    )
    return records
def do_stack_task(task):
    """
    Write the data of one tile into a single NetCDF file.

    Depending on ``task['make_new_datasets']`` the output is described either
    by newly made datasets (the existing ones are then returned for
    archiving) or by the existing datasets (returned for updating).

    :param task: mapping with ``global_attributes``, ``variable_params``,
        ``output_filename``, ``tile`` and the optional ``make_new_datasets``
    :return: tuple ``(datasets_to_add, datasets_to_update,
        datasets_to_archive)``; entries that do not apply are ``None``
    """
    datasets_to_add = None
    datasets_to_update = None
    datasets_to_archive = None

    global_attributes = task['global_attributes']
    variable_params = task['variable_params']

    output_filename = Path(task['output_filename'])
    tile = task['tile']

    if task.get('make_new_datasets', False):
        datasets_to_add = make_datasets(tile, output_filename, task)
        datasets_to_archive = xr_apply(tile.sources, _single_dataset, dtype='O')

        output_datasets = datasets_to_add
    else:
        datasets_to_update = xr_apply(tile.sources, _single_dataset, dtype='O')

        output_datasets = datasets_to_update

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=dict(time=1))  # TODO: chunk along output NetCDF chunk?
    data['dataset'] = datasets_to_doc(output_datasets)

    nco = create_netcdf_storage_unit(output_filename,
                                     data.crs,
                                     data.coords,
                                     data.data_vars,
                                     variable_params,
                                     global_attributes)
    try:
        for name, variable in data.data_vars.items():
            try:
                # dask.get is dask's synchronous scheduler. It replaces the
                # spelling ``dask.async.get_sync``, which cannot be parsed on
                # Python 3.7+ because ``async`` is a reserved keyword.
                with dask.set_options(get=dask.get):
                    da.store(variable.data, nco[name], lock=True)
            except ValueError:
                # Fall back to writing the whole variable in one assignment.
                nco[name][:] = netcdf_writer.netcdfy_data(variable.values)
            nco.sync()
    finally:
        # Release the file handle even when a write fails.
        nco.close()

    return datasets_to_add, datasets_to_update, datasets_to_archive
def ingest_work(config, source_type, output_type, index, sources, geobox):
    """
    Load the given sources onto ``geobox``, rename the variables and write
    the result to a NetCDF file.

    :param config: ingestion configuration; ``global_attributes`` and
        ``filename`` are read from it directly
    :param source_type: forwarded to ``get_measurements`` along with ``config``
    :param output_type: dataset type handed to ``make_dataset``
    :param index: passed to ``get_filename`` to build the output path
    :param sources: grouped source datasets to load and to describe
    :param geobox: target grid; supplies ``extent`` and ``crs``
    :return: result of applying ``_make_dataset`` over ``sources`` with
        ``xr_apply``
    :raises KeyError: if the configured fuser is anything other than ``'copy'``
    """
    rename_map = get_namemap(config)
    bands = get_measurements(source_type, config)
    var_params = get_variable_params(config)
    file_attrs = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        # 'copy' is the only fuser understood here and maps to "no fuse function".
        fuser_name = config.get(FUSER_KEY, 'copy')
        fuser = {'copy': None}[fuser_name]
        loaded = Datacube.product_data(sources, geobox, bands, fuse_func=fuser)

    renamed = loaded.rename(rename_map)
    out_path = get_filename(config, index, sources)

    def _make_dataset(labels, group):
        # Footprint = tile extent clipped to the union of the source extents,
        # with every source extent first converted to the tile CRS.
        merged = union_points(
            *[member.extent.to_crs(geobox.crs).points for member in group])
        clipped = intersect_points(geobox.extent.points, merged)
        record = make_dataset(
            dataset_type=output_type,
            sources=group,
            extent=geobox.extent,
            center_time=labels['time'],
            uri=out_path.absolute().as_uri(),
            app_info=get_app_metadata(config, config['filename']),
            valid_data=GeoPolygon(clipped, geobox.crs),
        )
        return record

    records = xr_apply(sources, _make_dataset, dtype='O')

    # Keep the records in the written data so each time maps to its dataset.
    renamed['dataset'] = datasets_to_doc(records)

    write_dataset_to_netcdf(renamed, file_attrs, var_params, out_path)
    return records
def do_fc_task(config, task):
    """
    Load one NBAR tile, run the FC algorithm and write the result to NetCDF.

    :param config: task configuration; ``global_attributes``,
        ``variable_params``, ``fc_product`` and the optional
        ``sensor_regression_coefficients`` are read from it
    :param task: mapping with ``filename`` and ``nbar``
    :return: result of applying ``_make_dataset`` over the NBAR tile sources
        with ``xr_apply``
    :raises OSError: with ``errno.EEXIST`` if the output file already exists
    """
    file_attrs = config['global_attributes']
    var_params = config['variable_params']
    out_path = Path(task['filename'])
    fc_product = config['fc_product']

    if out_path.exists():
        raise OSError(errno.EEXIST, 'Output file already exists', str(out_path))

    nbar_tile: Tile = task['nbar']
    input_bands = ['green', 'red', 'nir', 'swir1', 'swir2']
    nbar = GridWorkflow.load(nbar_tile, input_bands)

    fc_measurements = config['fc_product'].measurements.values()
    fc_dataset = make_fc_tile(nbar, fc_measurements,
                              config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, group):
        assert group
        return make_dataset(
            product=fc_product,
            sources=group,
            extent=nbar.geobox.extent,
            center_time=labels['time'],
            uri=out_path.absolute().as_uri(),
            app_info=get_app_metadata(config),
            valid_data=GeoPolygon.from_sources_extents(group, nbar.geobox),
        )

    records = xr_apply(nbar_tile.sources, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(records)

    write_dataset_to_netcdf(
        dataset=fc_dataset,
        filename=out_path,
        global_attributes=file_attrs,
        variable_params=var_params,
    )
    return records