Example #1
def execute_task(task: StatsTask, output_driver, chunking) -> StatsTask:
    """
    Load data, run the statistical operations and write results out to the filesystem.

    :param datacube_stats.models.StatsTask task: the task to execute
    :type output_driver: OutputDriver
    :param dict chunking: dimension sizes to chunk the computation by
    """
    timer = MultiTimer().start('total')
    datacube.set_options(reproject_threads=1)

    process_chunk = load_process_save_chunk_iteratively if task.is_iterative else load_process_save_chunk

    try:
        with output_driver(task=task) as output_files:
            # polygon tasks currently have no chunking, so load the whole tile in one chunk
            if len(chunking) == 0:
                chunking = {'x': task.sample_tile.shape[2], 'y': task.sample_tile.shape[1]}
            for sub_tile_slice in tile_iter(task.sample_tile, chunking):
                process_chunk(output_files, sub_tile_slice, task, timer)
    except OutputFileAlreadyExists as e:
        _LOG.warning(str(e))
    except OutputDriverResult as e:
        # was run interactively
        # re-raise result to be caught again by StatsApp.execute_task
        raise e
    except Exception as e:
        _LOG.error("Error processing task: %s", task)
        raise StatsProcessingException("Error processing task: %s" % task) from e

    timer.pause('total')
    _LOG.debug('Completed %s %s task with %s data sources; %s', task.tile_index,
               [d.strftime('%Y-%m-%d') for d in task.time_period], task.data_sources_length(), timer)
    return task
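
When no chunking is supplied, the example above falls back to a single chunk covering the whole sample tile by reading the y and x sizes from the tile's (time, y, x) shape. A minimal sketch of that fallback, assuming a plain 3-D numpy array stands in for task.sample_tile:

import numpy as np

# Hypothetical stand-in for task.sample_tile: a (time, y, x) stack.
sample_tile = np.zeros((2, 4000, 4000), dtype='int16')

chunking = {}  # e.g. no chunking supplied on the command line
if len(chunking) == 0:
    # Fall back to one chunk spanning the full tile, as in Example #1.
    chunking = {'x': sample_tile.shape[2], 'y': sample_tile.shape[1]}

print(chunking)  # {'x': 4000, 'y': 4000}
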
Example #2
def assert_same_read_results(source, dst_shape, dst_dtype, dst_transform,
                             dst_nodata, dst_projection, resampling):
    expected = np.empty(dst_shape, dtype=dst_dtype)
    with source.open() as src:
        rasterio.warp.reproject(src.data,
                                expected,
                                src_transform=src.transform,
                                src_crs=str(src.crs),
                                src_nodata=src.nodata,
                                dst_transform=dst_transform,
                                dst_crs=str(dst_projection),
                                dst_nodata=dst_nodata,
                                resampling=resampling)

    result = np.full(dst_shape, dst_nodata, dtype=dst_dtype)
    H, W = dst_shape
    dst_gbox = GeoBox(W, H, dst_transform, dst_projection)
    with datacube.set_options(reproject_threads=1):
        with source.open() as rdr:
            read_time_slice(rdr,
                            result,
                            dst_gbox,
                            dst_nodata=dst_nodata,
                            resampling=resampling)

    assert np.isclose(result, expected, atol=0, rtol=0.05,
                      equal_nan=True).all()
    return result
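
The final assertion allows a 5% relative difference between the datacube read and the direct rasterio reprojection, and treats matching NaN (nodata) pixels as equal. A small standalone sketch of how numpy.isclose applies that tolerance:

import numpy as np

a = np.array([100.0, np.nan])
b = np.array([104.0, np.nan])

# rtol=0.05 tolerates up to 5% difference relative to b; equal_nan=True
# makes NaN pixels (e.g. nodata) compare as equal.
print(np.isclose(a, b, atol=0, rtol=0.05, equal_nan=True).all())  # True
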
Example #3
def ingest_work(config, source_type, output_type, tile, tile_index):
    _LOG.info('Starting task %s', tile_index)
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources, tile.geobox, measurements, fuse_func=fuse_func)
    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources, version=config['taskfile_version'])

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=file_path.absolute().as_uri(),
                            app_info=get_app_metadata(config, config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(nudata, file_path, global_attributes, variable_params)
    _LOG.info('Finished task %s', tile_index)

    return datasets
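
The {'copy': None} lookup above doubles as validation: 'copy' is the only recognised fuser option and maps to None (datacube's default fuse function), while any other value raises a KeyError. A minimal sketch of the same idiom, with FUSER_KEY assumed to be the config key naming the fuser:

FUSER_KEY = 'fuse_data'  # assumed key name, for illustration only

config = {}  # no fuser specified, so the 'copy' default applies

# Maps the only supported option to None; an unsupported value such as
# 'max' would raise KeyError here rather than silently misbehaving.
fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
print(fuse_func)  # None
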
Example #4
    def data(self, datasets, mask=False, manual_merge=False, skip_corrections=False, **kwargs):
        # pylint: disable=too-many-locals, consider-using-enumerate
        if mask:
            prod = self._product.pq_product
            measurements = [prod.measurements[self._product.pq_band].copy()]
        else:
            prod = self._product.product
            measurements = [prod.measurements[name].copy() for name in self.needed_bands()]

        with datacube.set_options(reproject_threads=1, fast_load=True):
            if manual_merge:
                return self.manual_data_stack(datasets, measurements, mask, skip_corrections, **kwargs)
            elif self._product.solar_correction and not mask and not skip_corrections:
                # Merge performed already by dataset extent, but we need to
                # process the data for the datasets individually to do solar correction.
                merged = None
                for ds in datasets:
                    d = read_data(ds, measurements, self._geobox, **kwargs)
                    for band in self.needed_bands():
                        if band != self._product.pq_band:
                            d[band] = solar_correct_data(d[band], ds)
                    if merged is None:
                        merged = d
                    else:
                        merged = merged.combine_first(d)
                return merged
            else:
                data = read_data(datasets, measurements, self._geobox, self._resampling, **kwargs)
                return data
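
The solar-correction branch stitches the per-dataset reads together with xarray's combine_first, which fills gaps (NaN/nodata) in the running result with values from each subsequent dataset. A standalone sketch of that merge behaviour, using toy one-dimensional data:

import numpy as np
import xarray as xr

# Two partially overlapping reads of the same band; NaN marks missing pixels.
a = xr.Dataset({'red': ('x', [1.0, np.nan, np.nan])})
b = xr.Dataset({'red': ('x', [9.0, 2.0, 3.0])})

merged = None
for d in (a, b):
    merged = d if merged is None else merged.combine_first(d)

print(merged['red'].values)  # [1. 2. 3.] - gaps in `a` filled from `b`
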
Example #5
def ingest_work(config, source_type, output_type, index, sources, geobox):
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.product_data(sources, geobox, measurements, fuse_func=fuse_func)
    nudata = data.rename(namemap)
    file_path = get_filename(config, index, sources)

    def _make_dataset(labels, sources):
        sources_union = union_points(*[source.extent.to_crs(geobox.crs).points for source in sources])
        valid_data = intersect_points(geobox.extent.points, sources_union)
        dataset = make_dataset(dataset_type=output_type,
                               sources=sources,
                               extent=geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(config, config['filename']),
                               valid_data=GeoPolygon(valid_data, geobox.crs))
        return dataset
    datasets = xr_apply(sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(nudata, global_attributes, variable_params, file_path)

    return datasets
Example #6
def _test_helper(source, dst_shape, dst_dtype, dst_transform, dst_nodata,
                 dst_projection, resampling):
    expected = numpy.empty(dst_shape, dtype=dst_dtype)
    with source.open() as src:
        rasterio.warp.reproject(src.data,
                                expected,
                                src_transform=src.transform,
                                src_crs=str(src.crs),
                                src_nodata=src.nodata,
                                dst_transform=dst_transform,
                                dst_crs=str(dst_projection),
                                dst_nodata=dst_nodata,
                                resampling=resampling)

    result = numpy.empty(dst_shape, dtype=dst_dtype)
    with datacube.set_options(reproject_threads=1):
        read_from_source(source,
                         result,
                         dst_transform=dst_transform,
                         dst_nodata=dst_nodata,
                         dst_projection=dst_projection,
                         resampling=resampling)

    assert numpy.isclose(result, expected, atol=0, rtol=0.05,
                         equal_nan=True).all()
    return result
Example #7
def ingest_work(config, source_type, output_type, tile, tile_index):
    # pylint: disable=too-many-locals
    _LOG.info('Starting task %s', tile_index)
    driver = storage_writer_by_name(config['storage']['driver'])

    if driver is None:
        _LOG.error('Failed to load storage driver %s', config['storage']['driver'])
        raise ValueError('Unable to find the storage driver specified by the storage.driver option')

    namemap = get_namemap(config)
    # TODO: get_measurements possibly changes dtype, not sure load_data would like that
    measurements = get_measurements(source_type, config)
    resampling = get_resampling(config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources, tile.geobox, measurements,
                                  resampling=resampling,
                                  fuse_func=fuse_func)

    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)

    def mk_uri(file_path):
        if driver.uri_scheme == "file":
            return file_path.absolute().as_uri()
        return '{}://{}'.format(driver.uri_scheme, file_path)

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=mk_uri(file_path),
                            app_info=get_app_metadata(config, config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    variable_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    storage_metadata = driver.write_dataset_to_storage(nudata, file_path,
                                                       global_attributes=global_attributes,
                                                       variable_params=variable_params,
                                                       storage_config=config['storage'])

    if (storage_metadata is not None) and len(storage_metadata) > 0:
        datasets.attrs['storage_metadata'] = storage_metadata

    _LOG.info('Finished task %s', tile_index)

    return datasets
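
mk_uri only produces a file:// URI when the driver writes to the local filesystem; for any other scheme it simply prefixes the path with the driver's scheme. A small sketch of the same logic with pathlib, using a hypothetical 's3' scheme for the non-file case:

from pathlib import Path

def mk_uri(file_path, uri_scheme):
    # Local files get a proper file:// URI; other drivers (e.g. a
    # hypothetical 's3' scheme) just get scheme://path.
    if uri_scheme == 'file':
        return Path(file_path).absolute().as_uri()
    return '{}://{}'.format(uri_scheme, file_path)

print(mk_uri('output/tile_0_0.nc', 'file'))  # file:///<cwd>/output/tile_0_0.nc
print(mk_uri('bucket/tile_0_0.nc', 's3'))    # s3://bucket/tile_0_0.nc
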
Example #8
    def data(self, datasets):
        holder = numpy.empty(shape=tuple(), dtype=object)
        holder[()] = datasets
        sources = xarray.DataArray(holder)

        prod = datasets[0].type
        measurements = [
            self._set_resampling(prod.measurements[name])
            for name in self._bands
        ]
        with datacube.set_options(reproject_threads=1, fast_load=True):
            return datacube.Datacube.load_data(sources, self._geobox,
                                               measurements)
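
Several of these examples wrap a plain list of datasets in a zero-dimensional object DataArray before handing it to load_data; the 0-d array acts as a single "cell" holding the whole list. A minimal sketch of that wrapping pattern, with strings standing in for datacube Dataset objects:

import numpy
import xarray

datasets = ['dataset_a', 'dataset_b']  # stand-ins for datacube Dataset objects

# A 0-d object array holds the whole list in a single element, so the
# resulting DataArray has one dimensionless cell containing the list.
holder = numpy.empty(shape=tuple(), dtype=object)
holder[()] = datasets
sources = xarray.DataArray(holder)

print(sources.shape)       # ()
print(sources.values[()])  # ['dataset_a', 'dataset_b']
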
Example #9
def _load_data(dc, geobox, product, bands, time_):
    to_load = _get_datasets(dc, geobox, product, time_)

    holder = numpy.empty(shape=tuple(), dtype=object)
    holder[()] = to_load
    sources = xarray.DataArray(holder)

    prod = dc.index.products.get_by_name(product)
    measurements = [
        _set_resampling(m, 'cubic') for name, m in prod.measurements.items()
        if name in bands
    ]
    with datacube.set_options(reproject_threads=1):
        return dc.load_data(sources, geobox, measurements)
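
Example #9 keeps only the measurements for the requested bands and forces cubic resampling on each. A sketch of that filtering step, assuming _set_resampling simply copies the measurement dict and records the method under a hypothetical 'resampling_method' key (the real helper may differ):

def _set_resampling(measurement, method):
    # Assumed behaviour: copy the measurement definition and attach the
    # resampling method without mutating the product's definition.
    m = dict(measurement)
    m['resampling_method'] = method
    return m

# Hypothetical product measurement definitions keyed by band name.
product_measurements = {
    'red':   {'dtype': 'int16', 'nodata': -999},
    'green': {'dtype': 'int16', 'nodata': -999},
    'swir1': {'dtype': 'int16', 'nodata': -999},
}
bands = ['red', 'green']

measurements = [
    _set_resampling(m, 'cubic') for name, m in product_measurements.items()
    if name in bands
]
print([m['resampling_method'] for m in measurements])  # ['cubic', 'cubic']
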
Example #10
    def data(self,
             datasets,
             mask=False,
             manual_merge=False,
             skip_corrections=False,
             use_overviews=False,
             **kwargs):
        #pylint: disable=too-many-locals, consider-using-enumerate
        if mask:
            prod = self._product.pq_product
            measurements = [prod.measurements[self._product.pq_band].copy()]
        else:
            prod = self._product.product
            measurements = [
                prod.measurements[name].copy() for name in self.needed_bands()
            ]

        with datacube.set_options(reproject_threads=1, fast_load=True):
            if manual_merge:
                return self.manual_data_stack(datasets, measurements, mask,
                                              skip_corrections, use_overviews,
                                              **kwargs)
            elif self._product.solar_correction and not mask and not skip_corrections:
                # Merge performed already by dataset extent, but we need to
                # process the data for the datasets individually to do solar correction.
                merged = None
                for ds in datasets:
                    d = read_data(ds, measurements, self._geobox,
                                  use_overviews, **kwargs)
                    for band in self.needed_bands():
                        if band != self._product.pq_band:
                            d[band] = solar_correct_data(d[band], ds)
                    if merged is None:
                        merged = d
                    else:
                        merged = merged.combine_first(d)
                return merged
            else:
                # Merge performed already by dataset extent
                if isinstance(datasets, xarray.DataArray):
                    sources = datasets
                else:
                    holder = numpy.empty(shape=tuple(), dtype=object)
                    holder[()] = datasets
                    sources = xarray.DataArray(holder)
                data = read_data(datasets, measurements, self._geobox,
                                 use_overviews, **kwargs)
                return data
Example #11
    def data(self, datasets, mask=False, manual_merge=False):
        if mask:
            prod = self._product.pq_product
            measurements = [
                self._set_resampling(prod.measurements[self._product.pq_band])
            ]
        else:
            prod = self._product.product
            measurements = [
                self._set_resampling(prod.measurements[name])
                for name in self.needed_bands()
            ]
        with datacube.set_options(reproject_threads=1, fast_load=True):
            if manual_merge:
                datas = []
                for i in range(0, len(datasets)):
                    j = i + 1
                    holder = numpy.empty(shape=tuple(), dtype=object)
                    holder[()] = datasets[i:j]
                    sources = xarray.DataArray(holder)
                    datas.append(
                        datacube.Datacube.load_data(sources, self._geobox,
                                                    measurements))
                merged = None
                if mask:
                    band = self._product.pq_band
                else:
                    # take the first needed band to drive the extent mask
                    for band in self.needed_bands():
                        break
                for d in datas:
                    extent_mask = self._product.extent_mask_func(d, band)
                    dm = d.where(extent_mask)
                    if merged is None:
                        merged = dm
                    else:
                        merged = merged.combine_first(dm)
                if mask:
                    merged = merged.astype('uint8', copy=True)
                    merged[band].attrs = d[band].attrs
                return merged
            else:
                if isinstance(datasets, xarray.DataArray):
                    sources = datasets
                else:
                    holder = numpy.empty(shape=tuple(), dtype=object)
                    holder[()] = datasets
                    sources = xarray.DataArray(holder)
                return datacube.Datacube.load_data(sources, self._geobox,
                                                   measurements)
Example #12
def ingest_work(driver_manager, config, source_type, output_type, tile,
                tile_index):
    _LOG.info('Starting task %s', tile_index)

    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources,
                                  tile.geobox,
                                  measurements,
                                  fuse_func=fuse_func,
                                  driver_manager=driver_manager)
    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=file_path.absolute().as_uri(),
                            app_info=get_app_metadata(config,
                                                      config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(
                                sources, tile.geobox))

    datasets = xr_apply(
        tile.sources, _make_dataset,
        dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    # Until ingest becomes a class and DriverManager an instance
    # variable, we call the constructor each time. DriverManager being
    # a singleton, there is little overhead, though.
    datasets.attrs['storage_output'] = driver_manager.write_dataset_to_storage(
        nudata, file_path, global_attributes, variable_params)
    _LOG.info('Finished task %s', tile_index)

    # When using multiproc executor, Driver Manager is a clone.
    if driver_manager.is_clone:
        driver_manager.close()

    return datasets
Example #13
    def data(self, datasets, mask=False):
        if isinstance(datasets, xarray.DataArray):
            sources = datasets
        else:
            holder = numpy.empty(shape=tuple(), dtype=object)
            holder[()] = datasets
            sources = xarray.DataArray(holder)
        if mask:
            prod = self._product.pq_product
            measurements = [
                self._set_resampling(prod.measurements[self._product.pq_band])
            ]
        else:
            prod = self._product.product
            measurements = [
                self._set_resampling(prod.measurements[name])
                for name in self.needed_bands()
            ]
        with datacube.set_options(reproject_threads=1, fast_load=True):
            return datacube.Datacube.load_data(sources, self._geobox,
                                               measurements)
Example #14
def ingest_work(config, source_type, output_type, index, sources, geobox):
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.product_data(sources,
                                     geobox,
                                     measurements,
                                     fuse_func=fuse_func)
    nudata = data.rename(namemap)
    file_path = get_filename(config, index, sources)

    def _make_dataset(labels, sources):
        sources_union = union_points(
            *[source.extent.to_crs(geobox.crs).points for source in sources])
        valid_data = intersect_points(geobox.extent.points, sources_union)
        dataset = make_dataset(dataset_type=output_type,
                               sources=sources,
                               extent=geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(
                                   config, config['filename']),
                               valid_data=GeoPolygon(valid_data, geobox.crs))
        return dataset

    datasets = xr_apply(
        sources, _make_dataset,
        dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(nudata, global_attributes, variable_params,
                            file_path)

    return datasets
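
The valid_data polygon in Example #14 is the intersection of the output geobox's extent with the union of the source dataset footprints, all in the output CRS. An analogous sketch using shapely instead of datacube's point helpers, with made-up rectangles standing in for the footprints:

from shapely.geometry import box
from shapely.ops import unary_union

# Hypothetical footprints, already reprojected to the output CRS.
geobox_extent = box(0, 0, 100, 100)
source_extents = [box(-10, -10, 60, 60), box(40, 40, 120, 120)]

# Union of the sources, clipped to the geobox: only this area holds data.
sources_union = unary_union(source_extents)
valid_data = geobox_extent.intersection(sources_union)

print(valid_data.bounds)  # (0.0, 0.0, 100.0, 100.0)
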
Dependencies in this code:
- csv file with the lat/lon coordinates of the case study bounding box/es (a sketch of reading such a file follows this docstring)

Accompanying code:
- Extract_AGDC_for_study_sites.ipynb - steps through this code with explanations of how it is compiled; see the accompanying notebook for more detailed explanations and examples
- Run_AGDC_extraction - PBS submission code to generate single-CPU jobs for each study site
'''
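
The bounding-box CSV mentioned in the dependencies is not shown here; a minimal sketch of loading one with pandas, assuming hypothetical column names (site, lat_min, lat_max, lon_min, lon_max):

import pandas as pd

# Hypothetical layout - the real file's columns may be named differently.
study_sites = pd.read_csv('study_site_bounding_boxes.csv')

for _, row in study_sites.iterrows():
    query = {
        'lat': (row['lat_min'], row['lat_max']),
        'lon': (row['lon_min'], row['lon_max']),
    }
    print(row['site'], query)
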

# Import the libraries we need in the code
import fiona
import shapely.geometry
import rasterio
import rasterio.features
import datacube
datacube.set_options(reproject_threads=1)
import numpy as np
from datacube.storage import masking
import matplotlib.pyplot as plt
import xarray as xr
import scipy.stats
import pandas
import os
import sys
from affine import Affine


# Set up some functions to use later in the code
def warp_geometry(geom, src_crs, dst_crs):
    """
    Warp a geometry from src_crs to dst_crs.