Example No. 1
# Imports assumed by this example; load_ard and calculate_indices come from the
# Digital Earth Australia notebooks helper scripts (exact module paths may vary)
import warnings
import datetime as dt

import datacube
from dea_datahandling import load_ard
from dea_bandindices import calculate_indices


def load_crophealth_data():
    """
    Loads Sentinel-2 analysis-ready data (ARD) product for the crop health
    case-study area. The ARD product is provided for the last year.
    Last modified: January 2020

    outputs
    ds - data set containing combined, masked data from Sentinel-2a and -2b.
    Masked values are set to 'nan'
    """

    # Suppress warnings
    warnings.filterwarnings('ignore')

    # Initialise the data cube. 'app' argument is used to identify this app
    dc = datacube.Datacube(app='Crophealth-app')

    # Specify latitude and longitude ranges
    latitude = (-24.974997, -24.995971)
    longitude = (152.429994, 152.395805)

    # Specify the date range
    # Calculated as today's date minus 365 days, giving the last year of data
    # Dates are converted to strings as required by loading function below
    end_date = dt.date.today()
    start_date = end_date - dt.timedelta(days=365)

    time = (start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))

    # Construct the data cube query
    products = ["s2a_ard_granule", "s2b_ard_granule"]

    query = {
        'x': longitude,
        'y': latitude,
        'time': time,
        'measurements': [
            'nbar_red', 'nbar_green', 'nbar_blue',
            'nbar_nir_1', 'nbar_swir_2', 'nbar_swir_3'
        ],
        'output_crs': 'EPSG:3577',
        'resolution': (-10, 10)
    }

    # Load the data and mask out bad quality pixels
    ds_s2 = load_ard(dc, products=products, min_gooddata=0.5, **query)

    # Calculate the normalised difference vegetation index (NDVI) across
    # all pixels for each image.
    # This is stored as a new data variable within the dataset
    ds_s2 = calculate_indices(ds_s2, index='NDVI', collection='ga_s2_1')

    # Return the data
    return ds_s2
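
# Minimal usage sketch for the function above (assumes a Digital Earth Australia
# environment where the datacube and helper modules are available); the
# plotting call is illustrative only.
ds = load_crophealth_data()
# Plot the spatially averaged NDVI time series for the case-study area
ds.NDVI.mean(dim=['x', 'y']).plot()
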
Example No. 2
    "output_crs": "EPSG:32755",
    "resolution": (-10, 10)
}

prefire_data = load_ard(
    dc=dc,
    products=['s2a_ard_granule', 's2b_ard_granule'],
    measurements=['nbart_nir_1', 'nbart_swir_3'],
    min_gooddata=0,
    # dask_chunks={'x': 'auto', 'y': 'auto'},
    group_by='solar_day',
    **query_1)

prefire_image = prefire_data.median(dim='time')
prefire_image = calculate_indices(prefire_image,
                                  index='NBR',
                                  collection='ga_s2_1',
                                  drop=False)
prefire_burnratio = prefire_image.NBR
prefire_burnratio.data

query_2 = {
    "x": (central_lon - buffer, central_lon + buffer),
    "y": (central_lat - buffer, central_lat + buffer),
    "time": (postfire_start, postfire_end),
    "output_crs": "EPSG:32755",
    "resolution": (-10, 10)
}

postfire_data = load_ard(
    dc=dc,
    products=['s2a_ard_granule', 's2b_ard_granule'],
    # remaining arguments assumed to mirror the pre-fire load above
    measurements=['nbart_nir_1', 'nbart_swir_3'],
    min_gooddata=0,
    group_by='solar_day',
    **query_2)
Example No. 3

def get_training_data_for_shp(gdf,
                              index,
                              row,
                              out_arrs,
                              out_vars,
                              products,
                              dc_query,
                              custom_func=None,
                              field=None,
                              calc_indices=None,
                              reduce_func=None,
                              drop=True,
                              zonal_stats=None):
    """
    Function to extract data from the ODC for training a machine learning classifier 
    using a geopandas geodataframe of labelled geometries. 
    This function provides a number of pre-defined methods for producing training data, 
    including: calculating band indices, reducing time series using summary statistics, 
    and/or generating zonal statistics across polygons.  The 'custom_func' parameter provides 
    a method for the user to supply a custom function for generating features rather than using the
    pre-defined methods.

    Parameters
    ----------
    gdf : geopandas geodataframe
        geometry data in the form of a geopandas geodataframe
    products : list
        a list of products to load from the datacube. 
        e.g. ['ga_ls7e_ard_3', 'ga_ls8c_ard_3']
    dc_query : dictionary
        Datacube query object, should not contain lat and long (x or y)
        variables as these are supplied by the 'gdf' variable
    field : string 
        A string containing the name of column with class labels. 
        Field must contain numeric values.
    out_arrs : list 
        An empty list into which the training data arrays are stored.
    out_vars : list 
        An empty list into which the data variable names are stored.
    custom_func : function, optional 
        A custom function for generating feature layers. If this parameter
        is set, all other options (excluding 'zonal_stats'), will be ignored.
        The result of the 'custom_func' must be a single xarray dataset 
        containing 2D coordinates (i.e. x and y, with no time dimension). The custom function
        has access to the datacube dataset extracted using the 'dc_query' params.
        Example custom function to return multiple products:
        `def custom_function(ds):
            dc = datacube.Datacube(app='custom_function')
            mad = dc.load(product='ls8_nbart_tmad_annual', like=ds.geobox)
            output = xr.merge([ds, mad])
            return output`
    calc_indices: list, optional
        If not using a custom func, then this parameter provides a method for
        calculating a number of remote sensing indices (e.g. `['NDWI', 'NDVI']`).
    reduce_func : string, optional 
        Function to reduce the data from multiple time steps to
        a single timestep. Options are 'mean', 'median', 'std',
        'max', 'min', 'geomedian'.  Ignored if 'custom_func' is provided.
    drop : boolean, optional 
        If this variable is set to True, and 'calc_indices' are supplied, the
        spectral bands will be dropped from the dataset leaving only the
        band indices as data variables in the dataset. Default is True.
    zonal_stats : string, optional
        An optional string giving the names of zonal statistics to calculate 
        for each polygon. Default is None (all pixel values are returned). Supported 
        values are 'mean', 'median', 'max', 'min', and 'std'. Will work in 
        conjunction with a 'custom_func'.


    Returns
    --------
    Two lists, a list of numpy.arrays containing classes and extracted data for 
    each pixel or polygon, and another containing the data variable names.

    """

    # prevent function altering dictionary kwargs
    dc_query = deepcopy(dc_query)

    # remove dask chunks if supplied, since multiprocessing
    # is used for parallelization
    if 'dask_chunks' in dc_query.keys():
        dc_query.pop('dask_chunks', None)

    # connect to datacube
    dc = datacube.Datacube(app='training_data')

    # set up query based on polygon (convert to albers)
    geom = geometry.Geometry(gdf.geometry.values[index].__geo_interface__,
                             geometry.CRS('epsg:3577'))

    q = {"geopolygon": geom}

    # merge polygon query with user supplied query params
    dc_query.update(q)

    # load_ard doesn't handle derivative products, so check
    # products aren't one of those below
    others = [
        'ls5_nbart_geomedian_annual', 'ls7_nbart_geomedian_annual',
        'ls8_nbart_geomedian_annual', 'ls5_nbart_tmad_annual',
        'ls7_nbart_tmad_annual', 'ls8_nbart_tmad_annual',
        'landsat_barest_earth', 'ls8_barest_earth_albers'
    ]

    if products[0] in others:
        ds = dc.load(product=products[0], **dc_query)
        ds = ds.where(ds != 0, np.nan)

    else:
        # load data
        with HiddenPrints():
            ds = load_ard(dc=dc,
                          products=products,
                          output_crs='EPSG:3577',
                          **dc_query)

    # create polygon mask
    with HiddenPrints():
        mask = xr_rasterize(gdf.iloc[[index]], ds)

    # Use custom function for training data if it exists
    if custom_func is not None:
        with HiddenPrints():
            data = custom_func(ds)
            # Mask dataset
            data = data.where(mask)
    else:
        # Mask dataset
        ds = ds.where(mask)
        # first check enough variables are set to run functions
        if (len(ds.time.values) > 1) and (reduce_func is None):
            raise ValueError(
                "Your dataset has " + str(len(ds.time.values)) +
                " time-steps, please provide a reduction function," +
                " e.g. reduce_func='mean'")

        if calc_indices is not None:
            # determine which collection is being loaded
            if products[0] in others:
                collection = 'ga_ls_2'
            elif '3' in products[0]:
                collection = 'ga_ls_3'
            elif 's2' in products[0]:
                collection = 'ga_s2_1'

            if len(ds.time.values) > 1:

                if reduce_func in ['mean', 'median', 'std', 'max', 'min']:
                    with HiddenPrints():
                        data = calculate_indices(ds,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)
                        # getattr is equivalent to calling data.reduce_func
                        method_to_call = getattr(data, reduce_func)
                        data = method_to_call(dim='time')

                elif reduce_func == 'geomedian':
                    data = GeoMedian().compute(ds)
                    with HiddenPrints():
                        data = calculate_indices(data,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)

                else:
                    raise Exception(
                        reduce_func + " is not one of the supported" +
                        " reduce functions ('mean','median','std','max','min', 'geomedian')"
                    )

            else:
                with HiddenPrints():
                    data = calculate_indices(ds,
                                             index=calc_indices,
                                             drop=drop,
                                             collection=collection)

        # when band indices are not required, reduce the
        # dataset to a 2d array through reduce function
        if calc_indices is None:

            if len(ds.time.values) > 1:

                if reduce_func == 'geomedian':
                    data = GeoMedian().compute(ds)

                elif reduce_func in ['mean', 'median', 'std', 'max', 'min']:
                    method_to_call = getattr(ds, reduce_func)
                    data = method_to_call('time')
            else:
                data = ds.squeeze()

    if zonal_stats is None:
        # If no zonal stats were requested then extract all pixel values
        flat_train = sklearn_flatten(data)
        # Make a labelled array of identical size
        flat_val = np.repeat(row[field], flat_train.shape[0])
        stacked = np.hstack((np.expand_dims(flat_val, axis=1), flat_train))

    elif zonal_stats in ['mean', 'median', 'std', 'max', 'min']:
        method_to_call = getattr(data, zonal_stats)
        flat_train = method_to_call()
        flat_train = flat_train.to_array()
        stacked = np.hstack((row[field], flat_train))

    else:
        raise Exception(
            zonal_stats + " is not one of the supported" +
            " reduce functions ('mean','median','std','max','min')")

    # Append training data and labels to list
    out_arrs.append(stacked)
    out_vars.append([field] + list(data.data_vars))
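
# A hedged usage sketch for the function above: the shapefile path, query,
# products and class field are placeholders, and this plain loop stands in for
# the multiprocessing driver the function is designed to be used with.
import numpy as np
import geopandas as gpd

gdf = gpd.read_file('training_polygons.shp')  # hypothetical labelled polygons

query = {'time': ('2018-01-01', '2018-12-31'),
         'measurements': ['nbart_red', 'nbart_nir'],
         'resolution': (-25, 25)}

results, column_names = [], []
for index, row in gdf.iterrows():
    get_training_data_for_shp(gdf, index, row, results, column_names,
                              products=['ga_ls8c_ard_3'],
                              dc_query=query,
                              field='classnum',
                              calc_indices=['NDVI'],
                              reduce_func='median',
                              zonal_stats='mean')

# Stack the per-polygon rows into a single labelled training array
model_input = np.vstack(results)
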
Example No. 4

def get_training_data_for_shp(path,
                              out,
                              product,
                              time,
                              crs='EPSG:3577',
                              field='classnum',
                              calc_indices=None,
                              feature_stats=None,
                              collection='ga_ls_2'):
    """
    Function to extract data for training a classifier using a shapefile
    of labelled polygons. Currently works for single time steps.

    Parameters
    ----------
    path : string
        Path to shapefile containing labelled polygons.
    out : list
        Empty list to contain output data.
    product : string
        String of product name from which to load and extract datacube 
        data e.g. 'ls8_nbart_tmad_annual'
    time : tuple 
        A tuple containing the time period from which to extract 
        training data e.g. ('2015-01-01', '2015-12-31').
    crs : string
        A string containing desired crs e.g. 'EPSG:3577'
    field : string 
        A string containing name of column with labels in shapefile 
        attribute table. Field must contain numeric values.
    calc_indices: list, optional
        An optional list giving the names of any remote sensing indices 
        to be calculated on the loaded data (e.g. `['NDWI', 'NDVI']`). 
        This step will be skipped if any of the indices cannot be 
        computed on the input product.
    feature_stats: string, optional
        An optional string giving the names of statistics to calculate 
        for the polygon. Default is None (all pixel values). Supported 
        values are 'mean' or 'geomedian' (from the `hdstats` module).

    Returns
    --------
    A list of numpy.arrays containing classes and extracted data for 
    each pixel or polygon.

    """
    # Import hdstats here, as it is only needed for this function
    if feature_stats == 'geomedian':
        try:
            import hdstats
        except ImportError as err:
            raise ImportError(
                'Cannot import the hdstats module needed to calculate the'
                ' geomedian.\n{}'.format(err))

    dc = datacube.Datacube(app='training_data')
    query = {'time': time}
    query['crs'] = crs
    shp = gp.read_file(path)
    bounds = shp.total_bounds
    minx = bounds[0]
    maxx = bounds[2]
    miny = bounds[1]
    maxy = bounds[3]
    query['x'] = (minx, maxx)
    query['y'] = (miny, maxy)

    print("Loading data...")

    data = dc.load(product=product, group_by='solar_day', **query)

    # Check if geomedian is in the product and if indices are wanted
    if calc_indices is not None:
        try:
            print("Calculating indices...")
            # Calculate indices - will use for all features
            for index in calc_indices:
                data = dea_bandindices.calculate_indices(data,
                                                         index,
                                                         collection=collection)
        except ValueError:
            print(
                "Input dataset not suitable for selected indices, just extracting product data"
            )
            pass

    # Remove time step if present
    try:
        data = data.isel(time=0)
    # Don't worry if it isn't
    except ValueError:
        pass

    print("Rasterizing features and extracting data...")
    # Initialize counter for status messages.
    i = 0
    # Go through each feature
    for poly_geom, poly_class_id in zip(shp.geometry, shp[field]):
        print(" Feature {:04}/{:04}\r".format(i + 1, len(shp.geometry)),
              end='')

        # Rasterise the feature
        mask = rasterize([(poly_geom, poly_class_id)],
                         out_shape=(data.y.size, data.x.size),
                         transform=data.affine)

        # Convert mask from numpy to DataArray
        mask = xr.DataArray(mask, coords=(data.y, data.x))
        # Mask out areas that were not within the labelled feature
        data_masked = data.where(mask == poly_class_id, np.nan)

        if feature_stats is None:
            # If no summary stats were requested then
            # extract all pixel values
            flat_train = sklearn_flatten(data_masked)
            # Make a labelled array of identical size
            flat_val = np.repeat(poly_class_id, flat_train.shape[0])
            stacked = np.hstack((np.expand_dims(flat_val, axis=1), flat_train))
        elif feature_stats == 'mean':
            # For the mean of each polygon take the mean over all
            # axis, ignoring masked out values (nan).
            # This gives a single pixel value for each band
            flat_train = data_masked.mean(axis=None, skipna=True)
            flat_train = flat_train.to_array()
            stacked = np.hstack((poly_class_id, flat_train))
        elif feature_stats == 'geomedian':
            # For the geomedian flatten so have a 2D array with
            # bands and pixel values. Then use hdstats
            # to calculate the geomedian
            flat_train = sklearn_flatten(data_masked)
            flat_train_median = hdstats.geomedian(flat_train, axis=0)
            # Geomedian will return a single value for each band so join
            # this with class id to create a single row in output
            stacked = np.hstack((poly_class_id, flat_train_median))

        # Append training data and label to list
        out.append(stacked)

        # Update status counter (feature number)
        i = i + 1

    # Return a list of labels for columns in output array
    return [field] + list(data.data_vars)
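
# A hedged usage sketch for the shapefile-based variant above; the shapefile
# path, product name and time range are placeholders.
import numpy as np

out = []
column_names = get_training_data_for_shp('training_polygons.shp',
                                         out,
                                         product='ls8_nbart_geomedian_annual',
                                         time=('2017-01-01', '2017-12-31'),
                                         field='classnum',
                                         feature_stats='mean')

# One row per polygon: class label followed by the mean value of each band
model_input = np.vstack(out)
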
Example No. 5
def dNBR_processing(coordinates):
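    """
    Loads pre- and post-fire Sentinel-2 ARD around a point, computes the
    median NBR image for each period, and writes the difference (dNBR)
    to a GeoTIFF named after the point's coordinates.
    """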

    # Load all data in baseline period available from s2a/b_ard_granule datasets
    prefire_ard = load_ard(
        dc=dc,
        products=['s2a_ard_granule', 's2b_ard_granule'],
        x=(coordinates.x - 0.1, coordinates.x + 0.1),
        y=(coordinates.y - 0.1, coordinates.y + 0.1),
        time=(prefire_start, prefire_end),
        measurements=['nbart_nir_1', 'nbart_swir_3'],
        min_gooddata=0.1,
        output_crs='EPSG:32755',  # UTM Zone 55S
        resolution=(-10, 10),
        group_by='solar_day')

    prefire_ard = calculate_indices(prefire_ard,
                                    index='NBR',
                                    collection='ga_s2_1',
                                    drop=False)

    # Compute median using all observations in the dataset along the time axis
    prefire_image = prefire_ard.median(dim='time')

    # Delete the pre-fire ARD data to free memory
    del prefire_ard

    # Select NBR
    prefire_NBR = prefire_image.NBR

    del prefire_image

    # Load all data in post-fire period available from s2a/b_ard_granule datasets
    postfire_ard = load_ard(
        dc=dc,
        products=['s2a_ard_granule', 's2b_ard_granule'],
        x=(coordinates.x - 0.1, coordinates.x + 0.1),
        y=(coordinates.y - 0.1, coordinates.y + 0.1),
        time=(postfire_start, postfire_end),
        measurements=['nbart_nir_1', 'nbart_swir_3'],
        min_gooddata=0.1,
        output_crs='EPSG:32755',  # UTM Zone 55S
        resolution=(-10, 10),
        group_by='solar_day')

    # Calculate NBR on all post-fire images
    postfire_ard = calculate_indices(postfire_ard,
                                     index='NBR',
                                     collection='ga_s2_1',
                                     drop=False)

    # Calculate the median post-fire image
    postfire_image = postfire_ard.median(dim='time')

    del postfire_ard

    # Select NBR
    postfire_NBR = postfire_image.NBR

    del postfire_image

    # Calculate delta
    delta_NBR = prefire_NBR - postfire_NBR

    del prefire_NBR
    del postfire_NBR

    x = np.round(coordinates.x, decimals=4)
    y = np.round(coordinates.y, decimals=4)

    # Turn dNBR into an xarray dataset for export to GeoTIFF
    dnbr_dataset = delta_NBR.to_dataset(name='delta_NBR')
    # cog.write_cog(dnbr_dataset, './NBR_geotiffs/{x}_{y}_dNBR.tif')
    write_geotiff(f'/scratch/wj97/ab4513/dNBR_geotiffs/{x}_{y}_dNBR.tif',
                  dnbr_dataset)

    del delta_NBR
    del dnbr_dataset
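
# A hedged usage sketch for dNBR_processing: the datacube app name, fire dates
# and point locations are placeholders; `coordinates` only needs `.x` and `.y`
# attributes, so a namedtuple is used here for simplicity.
from collections import namedtuple

import datacube

dc = datacube.Datacube(app='dNBR-processing')

prefire_start, prefire_end = '2019-11-01', '2019-12-15'
postfire_start, postfire_end = '2020-01-15', '2020-02-29'

Point = namedtuple('Point', ['x', 'y'])
fire_points = [Point(148.10, -35.75), Point(148.35, -35.90)]  # hypothetical points

for point in fire_points:
    dNBR_processing(point)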