def make_sample_netcdf(tmpdir):
    """Make a test geospatial NetCDF file, 4000x4000 int16 random data, in a variable named `sample`.

    Return the GDAL access string."""
    sample_nc = str(tmpdir.mkdir('netcdfs').join('sample.nc'))
    geobox = GeoBox(4000, 4000,
                    affine=Affine(25.0, 0.0, 1200000, 0.0, -25.0, -4200000),
                    crs=epsg3577)
    sample_data = np.random.randint(10000, size=(4000, 4000), dtype=np.int16)

    variables = {'sample': Variable(sample_data.dtype, nodata=-999,
                                    dims=geobox.dimensions, units=1)}

    nco = create_netcdf_storage_unit(sample_nc, geobox.crs, geobox.coordinates,
                                     variables=variables, variable_params={})

    nco['sample'][:] = sample_data

    nco.close()

    return 'NetCDF:"%s":sample' % sample_nc, geobox, sample_data
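
# Hypothetical usage sketch (not part of the original source): reads the variable
# back through GDAL using the access string returned above. Assumes the `osgeo`
# GDAL Python bindings are installed and `tmpdir` is pytest's tmpdir fixture;
# the helper name `_example_read_sample` is invented for illustration.
def _example_read_sample(tmpdir):
    from osgeo import gdal

    access_string, geobox, sample_data = make_sample_netcdf(tmpdir)
    dataset = gdal.Open(access_string)
    # A single-variable NetCDF subdataset exposes its data as band 1.
    array = dataset.GetRasterBand(1).ReadAsArray()
    assert array.shape == (4000, 4000)
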
def do_fixer_task(config, task):
    global_attributes = config['global_attributes']

    # Don't keep the original history if we are trying to fix it
    global_attributes['history'] = build_history_string(config, task, keep_original=False)

    variable_params = config['variable_params']

    output_filename = Path(task['output_filename'])
    output_uri = output_filename.absolute().as_uri()
    temp_filename = get_temp_file(output_filename)
    tile = task['tile']

    # Only use the time chunk size (eg 5), but not spatial chunks.
    # This means the file only gets opened once per band, and all data is
    # available when compressing on write.
    # 5 * 4000 * 4000 * 2 bytes == 152MB, so memory usage is not an issue.
    chunk_profile = {'time': config['storage']['chunking']['time']}

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=chunk_profile)

    unwrapped_datasets = xr_apply(tile.sources, _unwrap_dataset_list, dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    try:
        if data.geobox is None:
            raise DatacubeException('Dataset geobox property is None, cannot write to NetCDF file.')

        if data.geobox.crs is None:
            raise DatacubeException('Dataset geobox.crs property is None, cannot write to NetCDF file.')

        nco = create_netcdf_storage_unit(temp_filename,
                                         data.geobox.crs,
                                         data.coords,
                                         data.data_vars,
                                         variable_params,
                                         global_attributes)
        write_data_variables(data.data_vars, nco)
        nco.close()

        # Write to a temp file, then move the finished file into place.
        temp_filename.rename(output_filename)

        if config.get('check_data_identical', False):
            new_tile = make_updated_tile(unwrapped_datasets, output_uri, tile.geobox)
            new_data = datacube.api.GridWorkflow.load(new_tile, dask_chunks=chunk_profile)
            check_identical(data, new_data, output_filename)
    except Exception:
        # Clean up the partially written temporary file before re-raising.
        if temp_filename.exists():
            temp_filename.unlink()
        raise

    return unwrapped_datasets, output_uri
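
# Illustrative sketch (an assumption inferred from the lookups above, not taken
# from the original source): the minimal shapes `config` and `task` must have
# for do_fixer_task. All values below are placeholders; `tile` is a
# GridWorkflow tile obtained elsewhere.
#
#     config = {
#         'global_attributes': {'title': 'placeholder'},
#         'variable_params': {},
#         'storage': {'chunking': {'time': 5}},
#         'check_data_identical': False,   # optional
#     }
#     task = {
#         'output_filename': '/output/sample_fixed.nc',
#         'tile': tile,
#     }
#     unwrapped_datasets, output_uri = do_fixer_task(config, task)
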
def do_stack_task(config, task):
    global_attributes = config['global_attributes']
    global_attributes['history'] = get_history_attribute(config, task)

    variable_params = config['variable_params']
    variable_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    output_filename = Path(task['output_filename'])
    output_uri = output_filename.absolute().as_uri()
    temp_filename = get_temp_file(output_filename)
    tile = task['tile']

    # Only use the time chunk size (eg 5), but not spatial chunks.
    # This means the file only gets opened once per band, and all data is
    # available when compressing on write.
    # 5 * 4000 * 4000 * 2 bytes == 152MB, so memory usage is not an issue.
    chunk_profile = {'time': config['storage']['chunking']['time']}

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=chunk_profile)

    unwrapped_datasets = xr_apply(tile.sources, _unwrap_dataset_list, dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    try:
        nco = create_netcdf_storage_unit(temp_filename,
                                         data.crs,
                                         data.coords,
                                         data.data_vars,
                                         variable_params,
                                         global_attributes)
        write_data_variables(data.data_vars, nco)
        nco.close()

        # Write to a temp file, then move the finished file into place.
        temp_filename.rename(output_filename)

        if config.get('check_data_identical', False):
            new_tile = make_updated_tile(unwrapped_datasets, output_uri, tile.geobox)
            new_data = datacube.api.GridWorkflow.load(new_tile, dask_chunks=chunk_profile)
            check_identical(data, new_data, output_filename)
    except Exception:
        # Clean up the partially written temporary file before re-raising.
        if temp_filename.exists():
            temp_filename.unlink()
        raise

    return unwrapped_datasets, output_uri
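
# Illustrative sketch (an assumption, not from the original source): data-band
# entries in config['variable_params'] take the same netCDF4 keyword shape as
# the 'dataset' entry forced above. The band name 'red' and the chunk sizes are
# placeholders.
#
#     config['variable_params']['red'] = {
#         'zlib': True,
#         'complevel': 9,
#         'chunksizes': (5, 200, 200),   # (time, y, x)
#     }
#
# Unlike do_fixer_task, this keeps the existing history (via
# get_history_attribute) rather than discarding it.
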
def _nco_from_sources(self, sources, geobox, measurements, variable_params, filename):
    # Combine the non-spatial coordinates (eg time) with the geobox's spatial coordinates.
    coordinates = OrderedDict((name, geometry.Coordinate(coord.values, coord.units))
                              for name, coord in sources.coords.items())
    coordinates.update(geobox.coordinates)

    variables = OrderedDict((variable['name'],
                             Variable(dtype=numpy.dtype(variable['dtype']),
                                      nodata=variable['nodata'],
                                      dims=sources.dims + geobox.dimensions,
                                      units=variable['units']))
                            for variable in measurements)

    return create_netcdf_storage_unit(filename,
                                      crs=geobox.crs,
                                      coordinates=coordinates,
                                      variables=variables,
                                      variable_params=variable_params,
                                      global_attributes=self.global_attributes)
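
# Illustrative sketch (an assumption inferred from the dict lookups above):
# each entry in `measurements` must provide 'name', 'dtype', 'nodata' and
# 'units'. The band names and values below are placeholders.
#
#     measurements = [
#         {'name': 'red', 'dtype': 'int16', 'nodata': -999, 'units': '1'},
#         {'name': 'nir', 'dtype': 'int16', 'nodata': -999, 'units': '1'},
#     ]
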