Example #1
def test_mask_valid_data():
    from xarray import DataArray, Dataset
    import numpy as np
    from datacube.storage.masking import mask_invalid_data
    test_attrs = {
        'one': 1,
        'nodata': -999,
    }

    expected_data_array = DataArray(np.array(
        [[1., np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, np.nan]],
        dtype='float'),
                                    attrs=test_attrs,
                                    name='var_one')

    data_array = DataArray([[1, -999, -999], [2, 3, -999], [-999, -999, -999]],
                           attrs=test_attrs)
    dataset = Dataset(data_vars={'var_one': data_array},
                      attrs={'ds_attr': 'still here'})

    # Make sure test is actually changing something
    assert not data_array.equals(expected_data_array)

    output_ds = mask_invalid_data(dataset, keep_attrs=True)
    assert output_ds.attrs['ds_attr'] == 'still here'
    assert output_ds.data_vars['var_one'].equals(expected_data_array)
    assert output_ds.data_vars['var_one'].attrs['one'] == 1

    output_da = mask_invalid_data(data_array, keep_attrs=True)
    assert output_da.equals(expected_data_array)
    assert output_da.attrs['one'] == 1
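
For reference, `mask_invalid_data` replaces a variable's declared nodata value with NaN, which is what the test above verifies (with `keep_attrs=True` preserving attributes). A minimal sketch of the equivalent manual masking for a single DataArray, assuming the array carries a `nodata` attribute; this is an illustration, not the library's implementation:

from xarray import DataArray

da = DataArray([[1, -999], [2, -999]], attrs={'nodata': -999})
masked = da.where(da != da.attrs['nodata'])  # -999 entries become NaN; dtype is promoted to float
print(masked.values)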
Example #2
def plotIndex(data, b1, b2, p):
    bands = getBands(p)

    band1 = mask_invalid_data(data.data_vars[bands[b1]])
    band2 = mask_invalid_data(data.data_vars[bands[b2]])

    index = (band1 - band2) / (band1 + band2)

    return index
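
The function above is the standard normalized-difference pattern. A self-contained sketch of the same calculation on a synthetic dataset; the band names and nodata value below are assumptions for illustration only:

import xarray as xr
from datacube.storage.masking import mask_invalid_data

# Synthetic two-band dataset with a -999 nodata attribute (illustrative values)
attrs = {'nodata': -999}
ds = xr.Dataset({
    'nir': xr.DataArray([[4000, -999], [3500, 3000]], attrs=attrs),
    'red': xr.DataArray([[1000, -999], [-999, 1200]], attrs=attrs),
})

nir = mask_invalid_data(ds.data_vars['nir'])
red = mask_invalid_data(ds.data_vars['red'])
ndvi = (nir - red) / (nir + red)  # NaN wherever either band was nodata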
Example #3
def remove_cloud_nodata(source_prod, data, mask_band):
    ls8_USGS_cloud_pixel_qa_value = [
        324, 352, 368, 386, 388, 392, 400, 416, 432, 480, 864, 880, 898, 900,
        904, 928, 944, 992, 1350
    ]
    non_ls8_USGS_cloud_pixel_qa_value = [
        72, 96, 112, 130, 132, 136, 144, 160, 176, 224
    ]
    non_ls8_USGS_sr_cloud_qa_value = [2, 4, 12, 20, 34, 36, 52]
    mask_data = data[mask_band]
    nodata_value = mask_data.nodata
    nodata_cloud_value = []

    if 'usgs' in source_prod:
        if 'ls8' in source_prod:
            nodata_cloud_value = ls8_USGS_cloud_pixel_qa_value
        else:
            if mask_band == 'sr_cloud_qa':
                nodata_cloud_value = non_ls8_USGS_sr_cloud_qa_value
            else:
                nodata_cloud_value = non_ls8_USGS_cloud_pixel_qa_value

        nodata_cloud_value.append(nodata_value)
        nodata_cloud = np.isin(mask_data, nodata_cloud_value)
        cld_free = data.where(~nodata_cloud).dropna(dim='time', how='all')
    else:
        cld_free = data.where(mask_data == 1).dropna(dim='time', how='all')

    # remove nodata for the pixel of interest
    cld_free_valid = masking.mask_invalid_data(cld_free)

    return cld_free_valid
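
A hypothetical call to the function above; the product string and QA band name are assumptions chosen for illustration, and `ds` stands for a dataset already loaded via `dc.load` that includes the mask band:

# Illustrative only: actual product/band names depend on how the USGS data was indexed
cloud_free = remove_cloud_nodata(source_prod='ls8_usgs_sr_scene',
                                 data=ds,
                                 mask_band='pixel_qa')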
Example #4
def sensible_mask_invalid_data(data):
    # TODO This should be pushed up to datacube-core
    # xarray.DataArray.where() converts ints to floats, since NaNs are used to
    # represent nodata. By default this uses float64, which is over the top for
    # an int16 value, so let's convert to float32 first to save a bunch of memory.
    data = _convert_to_floats(data)  # This is stripping out variable attributes
    return mask_invalid_data(data)
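
The `_convert_to_floats` helper is not included in this snippet. A minimal sketch of what such a helper might look like, assuming it casts every data variable to float32 so the subsequent `.where()` inside `mask_invalid_data` does not upcast to float64; unlike the version referenced above (whose inline comment notes it strips variable attributes), this sketch re-attaches them:

import numpy as np

def _convert_to_floats(ds):
    # Cast each variable to float32 and copy its attributes back,
    # halving the memory cost compared with a float64 upcast
    converted = {}
    for name, da in ds.data_vars.items():
        converted[name] = da.astype(np.float32).assign_attrs(**da.attrs)
    return ds.assign(converted)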
Example #5
File: main.py Project: klh5/COLD
def getDataset(time, poly, crs):

    fetch_ds = combined_ls_sref.query(dc, geopolygon=poly, time=time)

    grouped_ds = combined_ls_sref.group(fetch_ds, resolution=(-30, 30), output_crs='EPSG:{}'.format(crs))

    ds = combined_ls_sref.fetch(grouped_ds)

    ds = ds.sortby('time')

    ds = mask_invalid_data(ds)

    ds = ds.dropna('time', how='all')

    return ds
Example #6
def plotRGB(data):

    fake_saturation = 40000

    # Set all nodata values to NaN
    data = mask_invalid_data(data)

    # Isolate the color dimension in an xarray.DataArray, use transpose to make color the last dimension
    rgb = (data.to_array(dim='color'))
    rgb = rgb.transpose(*(rgb.dims[1:] + rgb.dims[:1]))

    # Filter out pixels where any band is 'saturated'
    rgb = rgb.where((rgb <= fake_saturation).all(dim='color'))

    # Scale to [0, 1] range for imshow
    rgb /= fake_saturation
    return rgb
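
A hypothetical follow-up showing how the returned array might be displayed; it assumes `data` was loaded with red/green/blue measurements and uses xarray's `imshow` with the `rgb` argument to mark the colour dimension:

# Illustrative usage only
rgb = plotRGB(data)
rgb.isel(time=0).plot.imshow(rgb='color')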
Example #7
def time_series(query, fp):
    """Returns multiple images with R,G,B values mapped to measurements parameter
    :param dict query: x (or longitude), y (or latitude), time
    :param file object fp: file object to which plots and other bulky files are saved
    :return: raw HTTP response (json or image/*)
    """
    # keep these imports here to avoid breaking the rest of the file when the
    # libraries are not installed
    import matplotlib
    # pyplot will try to plot on an X11 display without this:
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import datacube
    from datacube.storage.masking import mask_invalid_data

    if 'granule' in DATASET['product']:
        query['resolution'] = (-0.000135, 0.000135)
        query['output_crs'] = 'EPSG:4326'

    dc = datacube.Datacube(env=DATASET['env'], app="ndvi_time_series")

    data = dc.load(DATASET['product'], **query)
    data = mask_invalid_data(data)
    rgb = data.to_array(dim='color')
    fake_saturation = 4000
    rgb = rgb.transpose(*(rgb.dims[1:] +
                          rgb.dims[:1]))  # make 'color' the last dimension
    rgb = rgb.where((rgb <= fake_saturation).all(
        dim='color'))  # mask out pixels where any band is 'saturated'
    rgb /= fake_saturation  # scale to [0, 1] range for imshow
    try:
        rgb.plot.imshow(x=data.crs.dimensions[1],
                        y=data.crs.dimensions[0],
                        col='time',
                        col_wrap=5)
    except Exception as err:
        return error("Plotting failed: {}".format(err))
    ############################
    # save to supplied file object:
    plt.savefig(fp, dpi=150, format='jpg')
    plt.gcf().clear()  # clear figure instead of combining new images with old
    size = fp.tell()
    return {'error': 0, 'mimetype': 'image/jpg', 'size': size}
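
A hypothetical invocation of the function above, assuming the module-level `DATASET` dict and `error()` helper are configured elsewhere; the query values and output filename are illustrative:

# Illustrative only
with open('time_series.jpg', 'wb') as fp:
    response = time_series({'x': (149.0, 149.2),
                            'y': (-35.4, -35.2),
                            'time': ('2018-01-01', '2018-03-01')}, fp)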
Example #8
def getDataset(crs, xmin, xmax, ymin, ymax):

    "Fetch all data for the given area."

    print("Fetching data...")

    fetch_ds = combined_ls_sref.query(dc,
                                      x=(xmin, xmax),
                                      y=(ymin, ymax),
                                      crs='EPSG:{}'.format(crs),
                                      time=('2009-01-01', '2011-12-31'))

    grouped_ds = combined_ls_sref.group(fetch_ds,
                                        resolution=(-30, 30),
                                        output_crs='EPSG:{}'.format(crs))

    ds = combined_ls_sref.fetch(grouped_ds)

    ds = mask_invalid_data(ds)

    print("Done.")

    return ds
Example #9
def load_ard(dc,
             products=None,
             min_gooddata=0.0,
             fmask_gooddata=[1, 4, 5],
             mask_pixel_quality=True,
             mask_invalid_data=True,
             ls7_slc_off=True,
             product_metadata=False,
             dask_chunks={'time': 1},
             lazy_load=False,
             **dcload_kwargs):
    '''
    Loads Landsat Collection 3 or Sentinel 2 Definitive and Near Real 
    Time data for multiple sensors (i.e. ls5t, ls7e and ls8c for 
    Landsat; s2a and s2b for Sentinel 2), and returns a single masked 
    xarray dataset containing only observations that contain greater 
    than a given proportion of good quality pixels. This can be used 
    to extract clean time series of observations that are not affected 
    by cloud, for example as an input to the `animated_timeseries` 
    function from `dea_plotting`.
    
    The proportion of good quality pixels is calculated by summing the 
    pixels flagged as good quality in `fmask`. By default non-cloudy or 
    shadowed land, snow and water pixels are treated as good quality, 
    but this can be customised using the `fmask_gooddata` parameter.
    
    MEMORY ISSUES: For large data extractions, it can be advisable to 
    set `mask_pixel_quality=False`. The masking step coerces all 
    numeric values to float32 when NaN values are inserted into the 
    array, potentially causing your data to use twice the memory. 
    Be aware that the resulting arrays will contain invalid values 
    which may affect future analyses.
    
    Last modified: September 2019
    
    Parameters
    ----------  
    dc : datacube Datacube object
        The Datacube to connect to, i.e. `dc = datacube.Datacube()`.
        This allows you to also use development datacubes if required.    
    products : list
        A list of product names to load data from. Valid options are 
        ['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] for Landsat,
        ['s2a_ard_granule', 's2b_ard_granule'] for Sentinel 2 Definitive, 
        and ['s2a_nrt_granule', 's2b_nrt_granule'] for Sentinel 2 Near 
        Real Time.
    min_gooddata : float, optional
        An optional float giving the minimum percentage of good quality 
        pixels required for a satellite observation to be loaded. 
        Defaults to 0.0 which will return all observations regardless of
        pixel quality (set to e.g. 0.99 to return only observations with
        more than 99% good quality pixels).
    fmask_gooddata : list, optional
        An optional list of fmask values to treat as good quality 
        observations in the above `min_gooddata` calculation. The 
        default is `[1, 4, 5]` which will return non-cloudy or shadowed 
        land, snow and water pixels. Choose from: 
        `{'0': 'nodata', '1': 'valid', '2': 'cloud', 
          '3': 'shadow', '4': 'snow', '5': 'water'}`.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the good data 
        mask to all observations that were not filtered out for having 
        fewer good quality pixels than `min_gooddata`. E.g. if 
        `min_gooddata=0.99`, the filtered observations may still contain 
        up to 1% poor quality pixels. The default of True masks these 
        pixels out and sets them to NaN using the good data mask; set to 
        False to simply return the observations without masking. Note 
        that masking converts numeric values to float32, which can 
        cause memory issues for large extractions.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata 
        values should be replaced with NaN. These invalid values can be
        caused by missing data along the edges of scenes, or terrain 
        effects (for NBAR-T). Setting `mask_invalid_data=True` will 
        convert all numeric values to float32 when -999 values are 
        replaced with NaN which can cause memory issues; set to False 
        to prevent this. Defaults to True. 
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from 
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to 
        True, which keeps all Landsat 7 observations > May 31 2003. 
    product_metadata : bool, optional
        An optional boolean indicating whether to return the dataset 
        with a `product` variable that gives the name of the product 
        that each observation in the time series came from (e.g. 
        'ga_ls5t_ard_3'). Defaults to False.
    dask_chunks : dict, optional
        An optional dictionary containing the coords and sizes you wish 
        to create dask chunks over. Usually used in combination with 
        `lazy_load=True` (see below). For example: 
        `dask_chunks = {'x': 500, 'y': 500}`
    lazy_load : boolean, optional
        Setting this variable to True will delay the computation of the 
        function until you explicitly run `ds.compute()`. If used in 
        conjunction with `dask.distributed.Client()` this will allow for 
        automatic parallel computation.
    **dcload_kwargs : 
        A set of keyword arguments to `dc.load` that define the 
        spatiotemporal query used to extract data. This can include `x`,
        `y`, `time`, `resolution`, `resampling`, `group_by`, `crs`
        etc, and can either be listed directly in the `load_ard` call 
        (e.g. `x=(150.0, 151.0)`), or by passing in a query kwarg 
        (e.g. `**query`). For a full list of possible options, see: 
        https://datacube-core.readthedocs.io/en/latest/dev/api/generate/datacube.Datacube.load.html          
        
    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only satellite observations that 
        contain greater than `min_gooddata` proportion of good quality 
        pixels.
        
    '''

    # Due to a possible bug in xarray 0.13.0, define a temporary function
    # which converts dtypes in a way that preserves attributes
    def astype_attrs(da, dtype=np.float32):
        '''
        Loop through all data variables in the dataset, record 
        attributes, convert to float32, then reassign attributes. If 
        the data variable cannot be converted to float32 (e.g. for a
        non-numeric dtype like strings), skip and return the variable 
        unchanged.
        '''

        try:
            da_attr = da.attrs
            da = da.astype(dtype)
            da = da.assign_attrs(**da_attr)
            return da

        except ValueError:
            return da

    # Verify that products were provided
    if not products:
        raise ValueError("Please provide a list of product names "
                         "to load data from. Valid options are: \n"
                         "['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] "
                         "for Landsat, ['s2a_ard_granule', "
                         "'s2b_ard_granule'] \nfor Sentinel 2 Definitive, or "
                         "['s2a_nrt_granule', 's2b_nrt_granule'] for "
                         "Sentinel 2 Near Real Time")

    # If `measurements` are specified but do not include fmask, add it
    if (('measurements' in dcload_kwargs)
            and ('fmask' not in dcload_kwargs['measurements'])):
        dcload_kwargs['measurements'].append('fmask')

    # Create a list to hold data for each product
    product_data = []

    # Iterate through each requested product
    for product in products:

        try:

            # Load data including fmask band
            print(f'Loading {product} data')
            try:
                ds = dc.load(product=f'{product}',
                             dask_chunks=dask_chunks,
                             **dcload_kwargs)
            except KeyError as e:
                raise ValueError(f'Band {e} does not exist in this product. '
                                 f'Verify all requested `measurements` exist '
                                 f'in {products}')

            # Keep a record of the original number of observations
            total_obs = len(ds.time)

            # Remove Landsat 7 SLC-off observations if ls7_slc_off=False
            if not ls7_slc_off and product == 'ga_ls7e_ard_3':
                print('    Ignoring SLC-off observations for ls7')
                ds = ds.sel(time=ds.time < np.datetime64('2003-05-30'))

            # If no measurements are specified, `fmask` is given a
            # different name. If necessary, rename it:
            if 'oa_fmask' in ds:
                ds = ds.rename({'oa_fmask': 'fmask'})

            # Identify all pixels not affected by cloud/shadow/invalid
            good_quality = ds.fmask.isin(fmask_gooddata)

            # The good data percentage calculation has to load in all `fmask`
            # data, which can be slow. If the user has chosen no filtering
            # by using the default `min_gooddata = 0`, we can skip this step
            # completely to save processing time
            if min_gooddata > 0.0:

                # Compute good data for each observation as % of total pixels
                data_perc = (good_quality.sum(axis=1).sum(axis=1) /
                             (good_quality.shape[1] * good_quality.shape[2]))

                # Filter by `min_gooddata` to drop low quality observations
                ds = ds.sel(time=data_perc >= min_gooddata)
                print(f'    Filtering to {len(ds.time)} '
                      f'out of {total_obs} observations')

            # Optionally apply pixel quality mask to observations remaining
            # after the filtering step above to mask out all remaining
            # bad quality pixels
            if mask_pixel_quality & (len(ds.time) > 0):
                print('    Applying pixel quality mask')

                # First change dtype to float32, then mask out values using
                # `.where()`. By casting to float32, we prevent `.where()`
                # from automatically casting to float64, using 2x the memory.
                # We need to do this by applying a custom function to every
                # variable in the dataset instead of using `.astype()`, due
                # to a possible bug in xarray 0.13.0 that drops attributes
                ds = ds.apply(astype_attrs, dtype=np.float32, keep_attrs=True)
                ds = ds.where(good_quality)

            # Optionally add satellite/product name as a new variable
            if product_metadata:
                ds['product'] = xr.DataArray([product] * len(ds.time),
                                             [('time', ds.time)])

            # If any data was returned, add result to list
            if len(ds.time) > 0:
                product_data.append(ds.drop('fmask'))

        # If an AttributeError is raised because there is no `fmask` variable
        # in the dataset, skip this product and move on to the next
        except AttributeError:
            print(f'    No data for {product}')

    # If any data was returned above, combine into one xarray
    if (len(product_data) > 0):

        # Concatenate results and sort by time
        print(f'Combining and sorting data')
        combined_ds = xr.concat(product_data, dim='time').sortby('time')

        # Optionally filter to replace no data values with nans
        if mask_invalid_data:
            print('    Masking out invalid values')

            # First change dtype to float32, then mask out values using
            # `.where()`. By casting to float32, we prevent `.where()`
            # from automatically casting to float64, using 2x the memory.
            # We need to do this by applying a custom function to every
            # variable in the dataset instead of using `.astype()`, due
            # to a possible bug in xarray 0.13.0 that drops attributes
            combined_ds = combined_ds.apply(astype_attrs,
                                            dtype=np.float32,
                                            keep_attrs=True)
            combined_ds = masking.mask_invalid_data(combined_ds)

        # If `lazy_load` is True, return data as a dask array without
        # actually loading it in
        if lazy_load:
            print(f'    Returning {len(combined_ds.time)} observations'
                  ' as a dask array')
            return combined_ds

        else:
            print(f'    Returning {len(combined_ds.time)} observations ')
            return combined_ds.compute()

    # If no data was returned:
    else:
        print('No data returned for query')
        return None
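
A hypothetical call to this version of `load_ard`, assuming a default datacube connection and Collection 3 Landsat products are available; the spatial and temporal query values are illustrative:

import datacube

dc = datacube.Datacube(app='load_ard_example')
ds = load_ard(dc=dc,
              products=['ga_ls8c_ard_3'],
              measurements=['nbart_red', 'nbart_green', 'nbart_blue'],
              x=(153.30, 153.45),
              y=(-27.55, -27.45),
              time=('2018-01', '2018-06'),
              min_gooddata=0.90)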
Example #10
import datacube
from datacube.storage.masking import mask_invalid_data

query = {
    'time': ('1990-01-01', '1991-01-01'),
    'lat': (-35.2, -35.4),
    'lon': (149.0, 149.2),
}

dc = datacube.Datacube(app='plot-rgb-recipe')
data = dc.load(product='ls5_nbar_albers',
               measurements=['red', 'green', 'blue'],
               **query)
data = mask_invalid_data(data)

fake_saturation = 4000
rgb = data.to_array(dim='color')
rgb = rgb.transpose(*(rgb.dims[1:] +
                      rgb.dims[:1]))  # make 'color' the last dimension
rgb = rgb.where((rgb <= fake_saturation).all(
    dim='color'))  # mask out pixels where any band is 'saturated'
rgb /= fake_saturation  # scale to [0, 1] range for imshow

rgb.plot.imshow(x=data.crs.dimensions[1],
                y=data.crs.dimensions[0],
                col='time',
                col_wrap=5,
                add_colorbar=False)
Example #11
def load_ard(dc,
             products=None,
             min_gooddata=0.0,
             fmask_gooddata=[1, 4, 5],
             mask_pixel_quality=True,
             mask_invalid_data=True,
             mask_contiguity=False,
             mask_dtype=np.float32,
             ls7_slc_off=True,
             product_metadata=False,
             **dcload_kwargs):
    '''
    Loads Landsat Collection 3 or Sentinel 2 Definitive and Near Real 
    Time data for multiple sensors (i.e. ls5t, ls7e and ls8c for 
    Landsat; s2a and s2b for Sentinel 2), and returns a single masked 
    xarray dataset containing only observations that contain greater 
    than a given proportion of good quality pixels. This can be used 
    to extract clean time series of observations that are not affected 
    by cloud, for example as an input to the `animated_timeseries` 
    function from `dea_plotting`.
    
    The proportion of good quality pixels is calculated by summing the 
    pixels flagged as good quality in `fmask`. By default non-cloudy or 
    shadowed land, snow and water pixels are treated as good quality, 
    but this can be customised using the `fmask_gooddata` parameter.
    
    Last modified: March 2020
    
    Parameters
    ----------  
    dc : datacube Datacube object
        The Datacube to connect to, i.e. `dc = datacube.Datacube()`.
        This allows you to also use development datacubes if required.    
    products : list
        A list of product names to load data from. Valid options are 
        ['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] for Landsat,
        ['s2a_ard_granule', 's2b_ard_granule'] for Sentinel 2 Definitive, 
        and ['s2a_nrt_granule', 's2b_nrt_granule'] for Sentinel 2 Near 
        Real Time (on the DEA Sandbox only).
    min_gooddata : float, optional
        An optional float giving the minimum percentage of good quality 
        pixels required for a satellite observation to be loaded. 
        Defaults to 0.0 which will return all observations regardless of
        pixel quality (set to e.g. 0.99 to return only observations with
        more than 99% good quality pixels).
    fmask_gooddata : list, optional
        An optional list of fmask values to treat as good quality 
        observations in the above `min_gooddata` calculation. The 
        default is `[1, 4, 5]` which will return non-cloudy or shadowed 
        land, snow and water pixels. Choose from: 
        `{'0': 'nodata', '1': 'valid', '2': 'cloud', 
          '3': 'shadow', '4': 'snow', '5': 'water'}`.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the good data 
        mask to all observations that were not filtered out for having 
        fewer good quality pixels than `min_gooddata`. E.g. if 
        `min_gooddata=0.99`, the filtered observations may still contain 
        up to 1% poor quality pixels. The default of True masks these 
        pixels out and sets them to NaN using the good data mask; set to 
        False to simply return the observations without masking. Note 
        that masking converts numeric values to floating point values, 
        which can cause memory issues for large extractions.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata 
        values should be replaced with NaN. These invalid values can be
        caused by missing data along the edges of scenes, or terrain 
        effects (for NBART). Be aware that masking out invalid values 
        will convert all numeric values to floating point values when 
        -999 values are replaced with NaN, which can cause memory issues.
    mask_contiguity : str or bool, optional
        An optional string or boolean indicating whether to mask out
        pixels missing data in any band (i.e. "non-contiguous" values).
        This can be important for generating clean composite datasets. 
        The default is False, which will ignore non-contiguous values 
        completely. If loading NBART data, set the parameter to:       
        `mask_contiguity='nbart_contiguity'`. If loading NBAR data, 
        specify `mask_contiguity='nbar_contiguity'` instead. 
        Non-contiguous pixels will be set to NaN if `dtype='auto'`, or 
        set to the data's native nodata value if `dtype='native'` 
        (which can be useful for reducing memory).
    mask_dtype : numpy dtype, optional
        An optional parameter that controls the data type/dtype that
        layers are coerced to when when `mask_pixel_quality=True` or 
        `mask_contiguity=True`. Defaults to `np.float32`, which uses
        approximately 1/2 the memory of `np.float64`.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from 
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to 
        True, which keeps all Landsat 7 observations > May 31 2003. 
    product_metadata : bool, optional
        An optional boolean indicating whether to return the dataset 
        with a `product` variable that gives the name of the product 
        that each observation in the time series came from (e.g. 
        'ga_ls5t_ard_3'). Defaults to False.
    **dcload_kwargs : 
        A set of keyword arguments to `dc.load` that define the 
        spatiotemporal query used to extract data. This typically
        includes `measurements`, `x`, `y`, `time`, `resolution`, 
        `resampling`, `group_by` and `crs`. Keyword arguments can 
        either be listed directly in the `load_ard` call like any 
        other parameter (e.g. `measurements=['nbart_red']`), or by 
        passing in a query kwarg dictionary (e.g. `**query`). For a 
        list of possible options, see the `dc.load` documentation: 
        https://datacube-core.readthedocs.io/en/latest/dev/api/generate/datacube.Datacube.load.html          
        
    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only satellite observations that 
        contain greater than `min_gooddata` proportion of good quality 
        pixels.
        
    '''

    # Due to a possible bug in xarray 0.13.0, define a temporary function
    # which converts dtypes in a way that preserves attributes
    def astype_attrs(da, dtype=np.float32):
        '''
        Loop through all data variables in the dataset, record 
        attributes, convert to a custom dtype, then reassign attributes. 
        If the data variable cannot be converted to the custom dtype 
        (e.g. trying to convert non-numeric dtype like strings to 
        floats), skip and return the variable unchanged.
        
        This can be combined with `.where()` to save memory. By casting 
        to e.g. np.float32, we prevent `.where()` from automatically 
        casting to np.float64, using 2x the memory. np.float16 could be 
        used to save even more memory (although this may not be 
        compatible with all downstream applications).
        
        This custom function is required instead of using xarray's 
        built-in `.astype()`, due to a bug in xarray 0.13.0 that drops
        attributes: https://github.com/pydata/xarray/issues/3348
        '''

        try:
            da_attr = da.attrs
            da = da.astype(dtype)
            da = da.assign_attrs(**da_attr)
            return da

        except ValueError:
            return da

    # To prevent modifications to dcload_kwargs being made by this
    # function remaining after the function is run (potentially causing
    # different results each time the function is run), first take a
    # deep copy of the dcload_kwargs object.
    dcload_kwargs = deepcopy(dcload_kwargs)

    # Determine if lazy loading is required
    lazy_load = 'dask_chunks' in dcload_kwargs

    # Warn user if they combine lazy load with min_gooddata
    if (min_gooddata > 0.0) & lazy_load:
        warnings.warn("Setting 'min_gooddata' percentage to > 0.0 "
                      "will cause dask arrays to compute when "
                      "loading pixel-quality data to calculate "
                      "'good pixel' percentage. This can "
                      "significantly slow the return of your dataset.")

    # Verify that products were provided, and that only Sentinel-2 or
    # only Landsat products are being loaded at the same time
    if not products:
        raise ValueError("Please provide a list of product names "
                         "to load data from. Valid options are: \n"
                         "['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] "
                         "for Landsat, ['s2a_ard_granule', "
                         "'s2b_ard_granule'] \nfor Sentinel 2 Definitive, or "
                         "['s2a_nrt_granule', 's2b_nrt_granule'] for "
                         "Sentinel 2 Near Real Time")
    elif all(['ls' in product for product in products]):
        product_type = 'ls'
    elif all(['s2' in product for product in products]):
        product_type = 's2'
    else:
        raise ValueError("Loading both Sentinel-2 and Landsat data "
                         "at the same time is currently not supported")

    # If `measurements` are specified but do not include fmask or
    # contiguity variables, add these to `measurements`
    to_drop = []  # store loaded var names here to later drop
    fmask_band = 'fmask'

    if 'measurements' in dcload_kwargs:

        if fmask_band not in dcload_kwargs['measurements']:
            dcload_kwargs['measurements'].append(fmask_band)
            to_drop.append(fmask_band)

        if (mask_contiguity
                and (mask_contiguity not in dcload_kwargs['measurements'])):
            dcload_kwargs['measurements'].append(mask_contiguity)
            to_drop.append(mask_contiguity)

    # If no `measurements` are specified, Landsat ancillary bands are loaded
    # with a 'oa_' prefix, but Sentinel-2 bands are not. As a work-around,
    # we need to rename the default contiguity and fmask bands if loading
    # Landsat data without specifying `measurements`
    elif product_type == 'ls':
        mask_contiguity = f'oa_{mask_contiguity}' if mask_contiguity else False
        fmask_band = f'oa_{fmask_band}'

    # Create a list to hold data for each product
    product_data = []

    # Iterate through each requested product
    for product in products:

        try:

            # Load data including fmask band
            print(f'Loading {product} data')
            try:

                # If dask_chunks is specified, load data using query
                if lazy_load:
                    ds = dc.load(product=f'{product}', **dcload_kwargs)

                # If no dask chunks specified, add this param so that
                # we can lazy load data before filtering by good data
                else:
                    ds = dc.load(product=f'{product}',
                                 dask_chunks={},
                                 **dcload_kwargs)

            except KeyError as e:
                raise ValueError(f'Band {e} does not exist in this product. '
                                 f'Verify all requested `measurements` exist '
                                 f'in {products}')

            # Keep a record of the original number of observations
            total_obs = len(ds.time)

            # Remove Landsat 7 SLC-off observations if ls7_slc_off=False
            if not ls7_slc_off and product == 'ga_ls7e_ard_3':
                print('    Ignoring SLC-off observations for ls7')
                ds = ds.sel(time=ds.time < np.datetime64('2003-05-31'))

            # Identify all pixels not affected by cloud/shadow/invalid
            good_quality = ds[fmask_band].isin(fmask_gooddata)

            # The good data percentage calculation has to load in all `fmask`
            # data, which can be slow. If the user has chosen no filtering
            # by using the default `min_gooddata = 0`, we can skip this step
            # completely to save processing time
            if min_gooddata > 0.0:

                # Compute good data for each observation as % of total pixels
                data_perc = (good_quality.sum(axis=1).sum(axis=1) /
                             (good_quality.shape[1] * good_quality.shape[2]))

                # Filter by `min_gooddata` to drop low quality observations
                ds = ds.sel(time=data_perc >= min_gooddata)
                print(f'    Filtering to {len(ds.time)} '
                      f'out of {total_obs} observations')

            # If any data was returned
            if len(ds.time) > 0:

                # Optionally apply pixel quality mask to observations
                # remaining after the filtering step above to mask out
                # all remaining bad quality pixels
                if mask_pixel_quality:
                    print('    Applying pixel quality/cloud mask')

                    # Change dtype to custom float before masking to
                    # save memory. See `astype_attrs` func docstring
                    # above for details
                    ds = ds.apply(astype_attrs,
                                  dtype=mask_dtype,
                                  keep_attrs=True)
                    ds = ds.where(good_quality)

                # Optionally filter to replace no data values with nans
                if mask_invalid_data:
                    print('    Applying invalid data mask')

                    # Change dtype to custom float before masking to
                    # save memory. See `astype_attrs` func docstring
                    # above for details
                    ds = ds.apply(astype_attrs,
                                  dtype=mask_dtype,
                                  keep_attrs=True)
                    ds = masking.mask_invalid_data(ds)

                # Optionally apply contiguity mask to observations to
                # remove pixels missing data in any band
                if mask_contiguity:
                    print('    Applying contiguity mask')

                    # Change dtype to custom float before masking to
                    # save memory. See `astype_attrs` func docstring
                    # above for details
                    ds = ds.apply(astype_attrs,
                                  dtype=mask_dtype,
                                  keep_attrs=True)
                    ds = ds.where(ds[mask_contiguity] == 1)

                # Optionally add satellite/product name as a new variable
                if product_metadata:
                    ds['product'] = xr.DataArray([product] * len(ds.time),
                                                 [('time', ds.time)])

                # If any data was returned, add result to list
                product_data.append(ds.drop(to_drop))

            # If no data is returned, print status
            else:
                print(f'    No data for {product}')

        # If an AttributeError is raised because there are no variables in
        # the dataset, skip this product and move on to the next
        except AttributeError:
            print(f'    No data for {product}')

    # If any data was returned above, combine into one xarray
    if (len(product_data) > 0):

        # Concatenate results and sort by time
        print(f'Combining and sorting data')
        combined_ds = xr.concat(product_data, dim='time').sortby('time')

        # If `lazy_load` is True, return data as a dask array without
        # actually loading it in
        if lazy_load:
            print(f'    Returning {len(combined_ds.time)} observations'
                  ' as a dask array')
            return combined_ds

        else:
            print(f'    Returning {len(combined_ds.time)} observations ')
            return combined_ds.compute()

    # If no data was returned:
    else:
        print('No data returned for query')
        return None
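
A hypothetical call to this later version, showing the additional contiguity-mask and dtype options described in the docstring; the query values and measurement names are illustrative assumptions:

import datacube
import numpy as np

dc = datacube.Datacube(app='load_ard_example')
ds = load_ard(dc=dc,
              products=['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'],
              measurements=['nbart_red', 'nbart_nir'],
              x=(153.30, 153.45),
              y=(-27.55, -27.45),
              time=('2015-01', '2015-12'),
              min_gooddata=0.8,
              mask_contiguity='nbart_contiguity',
              mask_dtype=np.float32)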
Example #12
def load_clearsentinel2(dc, query, sensors=('s2a', 's2b'), product='ard',
                        bands_of_interest=('nbart_red', 'nbart_green', 'nbart_blue', 'nbart_nir_1', 'nbart_swir_2', 'nbart_swir_3'),
                        masked_prop=0.99, mask_values=(0, 2, 3), pixel_quality_band='fmask',
                        mask_pixel_quality=False, mask_invalid_data=True, satellite_metadata=False):
    
    """
    Loads Sentinel 2 data for multiple sensors (i.e. s2a, s2b), and returns a single xarray dataset containing 
    only observations that contain greater than a given proportion of good quality pixels. This can be used to extract
    visually appealing time series of observations that are not affected by cloud, for example as an input to the
    `animated_timeseries` function from `DEAPlotting`.
    
    The proportion of good quality pixels is calculated by summing the pixels that are not flagged as poor quality
    in the Sentinel pixel quality array. By default pixels flagged as nodata, cloud or shadow are used to 
    calculate the number of poor quality pixels, but this can be customised using the `mask_values` parameter.
    
    MEMORY ISSUES: For large data extractions, it is recommended that you set both `mask_pixel_quality=False` and 
    `mask_invalid_data=False`. Otherwise, all output variables will be coerced to float64 when NaN values are 
    inserted into the array, potentially causing your data to use 4x as much memory. Be aware that the resulting
    arrays will contain invalid -999 values which should be considered in analyses.
    
    Last modified: November 2018
    Author: Robbi Bishop-Taylor
    
    :param dc: 
        A specific Datacube to import from, i.e. `dc = datacube.Datacube(app='Sentinel datacube')`. This allows you 
        to also use development datacubes if they have been imported into the environment.
    
    :param query: 
        A dict containing the query bounds. Can include lat/lon, time etc. If no `time` query is given, the 
        function defaults to all time steps available to all sensors (e.g. 2015 onward)

    :param sensors:
        An optional list of Sentinel 2 sensors to load data for. Options are 's2a', and 's2b'; defaults to both.

    :param product:
        An optional string specifying the product to load. Defaults to 'ard', which is equivalent to loading
        e.g. `s2a_ard_granule`. 
        
    :param bands_of_interest:
        An optional list of strings containing the bands to be read in; to view full list run the following:
        `dc.list_measurements().loc['s2b_ard_granule']`. Defaults to `('nbart_red', 'nbart_green', 'nbart_blue', 
        'nbart_nir_1', 'nbart_swir_2', 'nbart_swir_3')`.

    :param masked_prop:
        An optional float giving the minimum percentage of good quality pixels required for a Sentinel 2 observation
        to be loaded. Defaults to 0.99 (i.e. only return observations with less than 1% of poor quality pixels).
    
    :param mask_values:
        An optional list of pixel quality values to treat as poor quality observations in the above `masked_prop`
        calculation. The default is `[0, 2, 3]` which treats nodata, cloud and cloud shadow as poor quality.
        Choose from: `{'0': 'nodata', '1': 'valid', '2': 'cloud', '3': 'shadow', '4': 'snow', '5': 'water'}`.
        
    :param pixel_quality_band:
        An optional string giving the name of the pixel quality band contained in the Sentinel 2 dataset. The default
        value is 'fmask'.
      
    :param mask_pixel_quality:
        An optional boolean indicating whether to apply the pixel quality mask to all observations that were not
        filtered out for having fewer good quality pixels than `masked_prop`. For example, if `masked_prop=0.99`, the
        filtered images may still contain up to 1% poor quality pixels. The default of False simply returns the
        resulting observations without masking out these pixels; True masks them out and sets them to NaN using the
        pixel quality mask, but has the side effect of changing the data type of the output arrays from int16 to
        float64 which can cause memory issues. To reduce memory usage, set to False.
        
    :param mask_invalid_data:
        An optional boolean indicating whether invalid -999 nodata values should be replaced with NaN. Defaults to
        True; this has the side effect of changing the data type of the output arrays from int16 to float64 which can
        cause memory issues. To reduce memory usage, set to False.
        
    :param satellite_metadata:
        An optional boolean indicating whether to return the dataset with a `satellite` variable that gives the name
        of the satellite that made each observation in the time series (i.e. s2a, s2b). Defaults to False.
        
    :returns:
        An xarray dataset containing only Sentinel 2 observations that contain greater than `masked_prop`
        proportion of clear pixels.  
        
    :example:
    
    >>> # Import modules
    >>> import datacube
    >>> import sys

    >>> # Import dea-notebooks functions using relative link to 10_Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling

    >>> # Connect to a datacube containing Sentinel data
    >>> dc = datacube.Datacube(app='load_clearsentinel')

    >>> # Set up spatial and temporal query; note that 'output_crs' and 'resolution' need to be set
    >>> query = {'x': (-191400.0, -183400.0),
    ...          'y': (-1423460.0, -1415460.0),
    ...          'time': ('2018-01-01', '2018-03-01'),
    ...          'crs': 'EPSG:3577',
    ...          'output_crs': 'EPSG:3577',
    ...          'resolution': (10, 10)}   

    >>> # Load observations with less than 70% cloud from both S2A and S2B as a single combined dataset
    >>> sentinel_ds = DEADataHandling.load_clearsentinel2(dc=dc, query=query, sensors=['s2a', 's2b'], 
    ...                                    bands_of_interest=['nbart_red', 'nbart_green', 'nbart_blue'], 
    ...                                    masked_prop=0.3, mask_pixel_quality=True)
    Loading s2a pixel quality
        Loading 3 filtered s2a timesteps
    Loading s2b pixel quality
        Loading 2 filtered s2b timesteps
    Combining and sorting s2a, s2b data
        Replacing invalid -999 values with NaN (data will be coerced to float64)

    >>> # Test that function returned data
    >>> len(sentinel_ds.time) > 0
    True
      
    """

    # List to save results from each sensor and list to keep names of successfully processed sensors
    filtered_sensors = []
    successfully_returned = []

    # Iterate through all sensors, returning only observations with > masked_prop clear pixels
    for sensor in sensors:

        try:
        
            # If bands of interest are given, assign measurements in dc.load call. This is
            # for compatibility with the existing dea-notebooks load_nbarx function.
            if bands_of_interest:

                # Lazily load Sentinel 2 data using dask
                data = dc.load(product=f'{sensor}_{product}_granule',
                               measurements=bands_of_interest,
                               group_by='solar_day',
                               dask_chunks={'time': 1},
                               **query)

            # If no bands of interest given, run without specifying measurements, and
            # therefore return all available bands
            else:

                # Lazily load Sentinel 2 data using dask
                data = dc.load(product=f'{sensor}_{product}_granule',
                               group_by='solar_day',
                               dask_chunks={'time': 1},
                               **query)

            # Load PQ data
            print(f'Loading {sensor} pixel quality')
            pq = dc.load(product=f'{sensor}_{product}_granule',
                         measurements=[pixel_quality_band],
                         group_by='solar_day',
                         **query)

            # Identify pixels with valid data
            good_quality = np.isin(pq[pixel_quality_band], test_elements=mask_values, invert=True)
            good_quality = pq[pixel_quality_band].where(good_quality).notnull()

            # Compute good data for each observation as a percentage of total array pixels
            data_perc = good_quality.sum(dim=['x', 'y']) / (good_quality.shape[1] * good_quality.shape[2])

            # Add data_perc data to Sentinel 2 dataset as a new xarray variable
            data['data_perc'] = xr.DataArray(data_perc, [('time', data.time)])

            # Filter by data_perc to drop low quality observations and finally import data using dask
            filtered = data.sel(time=data.data_perc >= masked_prop)
            print(f'    Loading {len(filtered.time)} filtered {sensor} timesteps')
            filtered = filtered.compute()

            # Optionally apply pixel quality mask to all observations that were not dropped in previous step
            if mask_pixel_quality:
                filtered = filtered.where(good_quality)

            # Optionally add satellite name
            if satellite_metadata:
                filtered['satellite'] = xr.DataArray([sensor] * len(filtered.time), [('time', filtered.time)])

            # Append result to list and add sensor name to list of successfully processed sensors
            filtered_sensors.append(filtered)
            successfully_returned.append(sensor)

            # Close datasets
            filtered = None
            good_quality = None
            data = None
     
        except Exception:

            # If there is no data for sensor or if another error occurs:
            print(f'    Skipping {sensor}; no valid data for query')

    # Concatenate all sensors into one big xarray dataset, and then sort by time
    sensor_string = ", ".join(successfully_returned)
    print(f'Combining and sorting {sensor_string} data')
    combined_ds = xr.concat(filtered_sensors, dim='time')
    combined_ds = combined_ds.sortby('time')
                                                               
    # Optionally filter to replace invalid data values with nans
    if mask_invalid_data:
        print('    Replacing invalid -999 values with NaN (data will be coerced to float64)')
        combined_ds = masking.mask_invalid_data(combined_ds)

    # Return combined dataset
    return combined_ds
Example #13
def load_clearlandsat(dc, query, sensors=('ls5', 'ls7', 'ls8'), product='nbart',
                      bands_of_interest=None, masked_prop=0.99, mask_dict=None,
                      mask_pixel_quality=False, mask_invalid_data=True, ls7_slc_off=False, satellite_metadata=False):
    
    """Load cloud-free data from multiple Landsat satellites as an xarray dataset
    
    Loads Landsat NBAR, NBART or FC25 and PQ data for multiple sensors (i.e. ls5, ls7, ls8) and returns a single 
    xarray dataset containing only observations that contain greater than a given proportion of good quality pixels.
    This function can be used to extract visually appealing time series of observations that are not affected by cloud,
    for example as an input to the `animated_timeseries` function from `DEAPlotting`.
    
    The proportion of clear pixels is calculated by summing the pixels that are not flagged as being poor quality
    in the Landsat PQ25 layer. By default only cloudy pixels or pixels that are missing data in any band are
    used to calculate the number of poor quality pixels, but this can be customised using the `mask_dict` parameter.
    
    Last modified: October 2018
    Author: Robbi Bishop-Taylor, Bex Dunn    
    
    Parameters
    ----------    
    dc : datacube Datacube object
        A specific Datacube to import from, i.e. `dc = datacube.Datacube(app='Clear Landsat')`. This allows you to 
        also use development datacubes if they have been imported into the environment.    
    query : dict
        A dict containing the query bounds. Can include lat/lon, time etc. If no `time` query is given, the 
        function defaults to all timesteps available to all sensors (e.g. 1987-2018)
    sensors : list, optional
        An optional list of Landsat sensor names to load data for. Options are 'ls5', 'ls7', 'ls8'; defaults to all.
    product : str, optional
        An optional string specifying 'nbar', 'nbart' or 'fc'. Defaults to 'nbart'. For information on the difference, 
        see the '02_DEA_datasets/Introduction_to_Landsat' or '02_DEA_datasets/Introduction_to_Fractional_Cover'
        notebooks from DEA-notebooks.
    bands_of_interest : list, optional
        An optional list of strings containing the bands to be read in; options include 'red', 'green', 'blue', 
        'nir', 'swir1', 'swir2'; defaults to all available bands if no bands are specified.
    masked_prop : float, optional
        An optional float giving the minimum percentage of clear pixels required for a Landsat observation to be 
        loaded. Defaults to 0.99 (i.e. only return observations with less than 1% of poor quality pixels).
    mask_dict : dict, optional
        An optional dict of arguments to the `masking.make_mask` function that can be used to identify good/poor
        quality pixels from the PQ layer using alternative masking criteria. The default value of None masks
        out pixels flagged as cloud by either the ACCA or Fmask algorithms, or pixels that are missing data in any
        band (equivalent to: `mask_dict={'cloud_acca': 'no_cloud', 'cloud_fmask': 'no_cloud', 'contiguous': True}`).
        See the `02_DEA_datasets/Introduction_to_LandsatPQ.ipynb` notebook on DEA Notebooks for a list of all
        possible options.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the pixel quality mask to all observations that were not
        filtered out for having fewer good quality pixels than `masked_prop`. For example, if `masked_prop=0.99`, the
        filtered images may still contain up to 1% poor quality pixels. The default of False simply returns the
        resulting observations without masking out these pixels; True masks them out and sets them to NaN using the
        pixel quality mask, but has the side effect of changing the data type of the output arrays from int16 to
        float64 which can cause memory issues. To reduce memory usage, set to False.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata values should be replaced with NaN. Defaults to
        True; this has the side effect of changing the data type of the output arrays from int16 to float64 which
        can cause memory issues. To reduce memory usage, set to False.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from after the Landsat 7 SLC failure (i.e. SLC-off).
        Defaults to False, which removes all Landsat 7 observations after May 31 2003. 
    satellite_metadata : bool, optional
        An optional boolean indicating whether to return the dataset with a `satellite` variable that gives the name 
        of the satellite that made each observation in the timeseries (i.e. ls5, ls7, ls8). Defaults to False. 
    
    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only Landsat observations that contain greater than `masked_prop`
        proportion of clear pixels.   
        
    Notes
    -----
    Memory issues: For large data extractions, it is recommended that you set both `mask_pixel_quality=False` and 
    `mask_invalid_data=False`. Otherwise, all output variables will be coerced to float64 when NaN values are 
    inserted into the array, potentially causing your data to use 4x as much memory. Be aware that the resulting
    arrays will contain invalid -999 values which should be considered in analyses.
        
    Example
    -------    
    >>> # Import modules
    >>> import datacube
    >>> import sys

    >>> # Import dea-notebooks functions using relative link to 10_Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling

    >>> # Connect to a datacube containing Landsat data
    >>> dc = datacube.Datacube(app='load_clearlandsat')

    >>> # Set up spatial and temporal query
    >>> query = {'x': (954163, 972163),
    ...          'y': (-3573891, -3555891),
    ...          'time': ('2011-06-01', '2013-06-01'),
    ...          'crs': 'EPSG:3577'}   

    >>> # Load observations with less than 25% cloud from ls5, ls7 and ls8 as a single combined dataset
    >>> landsat_ds = DEADataHandling.load_clearlandsat(dc=dc, query=query, sensors=['ls5', 'ls7', 'ls8'], 
    ...                                    bands_of_interest=['red', 'green', 'blue'], 
    ...                                    masked_prop=0.75, mask_pixel_quality=True, ls7_slc_off=True)
    Loading ls5 pixel quality
        Loading 4 filtered ls5 timesteps
    Loading ls7 pixel quality
        Loading 29 filtered ls7 timesteps
    Loading ls8 pixel quality
        Loading 3 filtered ls8 timesteps
    Combining and sorting ls5, ls7, ls8 data
        Replacing invalid -999 values with NaN (data will be coerced to float64)

    >>> # Test that function returned data
    >>> len(landsat_ds.time) > 0
    True
                
    """    

    # List to save results from each sensor and list to keep names of successfully processed sensors
    filtered_sensors = []
    successfully_returned = []

    # Iterate through all sensors, returning only observations with > masked_prop clear pixels
    for sensor in sensors:
        
        try:
            
            # If bands of interest are given, assign measurements in dc.load call. This is
            # for compatibility with the existing dea-notebooks load_nbarx function.
            if bands_of_interest:
                
                # Lazily load Landsat data using dask              
                data = dc.load(product=f'{sensor}_{product}_albers',
                               measurements=bands_of_interest,
                               group_by='solar_day', 
                               dask_chunks={'time': 1},
                               **query)

            # If no bands of interest given, run without specifying measurements, and 
            # therefore return all available bands
            else:
                
                # Lazily load Landsat data using dask  
                data = dc.load(product=f'{sensor}_{product}_albers',
                               group_by='solar_day', 
                               dask_chunks={'time': 1},
                               **query)             

            # Load PQ data
            pq = dc.load(product=f'{sensor}_pq_albers',
                         group_by='solar_day',
                         fuse_func=ga_pq_fuser,
                         dask_chunks={'time': 1},
                         **query)

            # Remove Landsat 7 SLC-off from PQ layer if ls7_slc_off=False
            if not ls7_slc_off and sensor == 'ls7':

                print('Ignoring SLC-off observations for ls7')
                data = data.sel(time=data.time < np.datetime64('2003-05-30'))

            # Return only Landsat observations that have matching PQ data 
            time = (data.time - pq.time).time
            data = data.sel(time=time)
            pq = pq.sel(time=time)

            # Load PQ data using dask
            print('Loading {} pixel quality'.format(sensor))
            pq = pq.compute()
            
            # If a custom dict is provided for mask_dict, use these values to make mask from PQ
            if mask_dict:
                
                # Mask PQ using custom values by unpacking mask_dict **kwarg
                good_quality = masking.make_mask(pq.pixelquality, **mask_dict)
                
            else:

                # Identify pixels with no clouds in either ACCA or Fmask
                good_quality = masking.make_mask(pq.pixelquality,
                                                 cloud_acca='no_cloud',
                                                 cloud_fmask='no_cloud',
                                                 contiguous=True)

            # Compute good data for each observation as a percentage of total array pixels
            data_perc = good_quality.sum(dim=['x', 'y']) / (good_quality.shape[1] * good_quality.shape[2])
            
            # Add data_perc data to Landsat dataset as a new xarray variable
            data['data_perc'] = xr.DataArray(data_perc, [('time', data.time)])

            # Filter by data_perc to drop low quality observations and finally import data using dask
            filtered = data.sel(time=data.data_perc >= masked_prop)
            print(f'    Loading {len(filtered.time)} filtered {sensor} timesteps')
            filtered = filtered.compute()
            
            # Optionally apply pixel quality mask to all observations that were not dropped in previous step
            if mask_pixel_quality:
                filtered = filtered.where(good_quality)

            # Optionally add satellite name variable
            if satellite_metadata:
                filtered['satellite'] = xr.DataArray([sensor] * len(filtered.time), [('time', filtered.time)])

            # Append result to list and add sensor name to list of successfully processed sensors
            filtered_sensors.append(filtered)
            successfully_returned.append(sensor)
            
            # Close datasets
            filtered = None
            good_quality = None
            data = None
            pq = None            
                        
        except:
            
            # If there is no data for sensor or if another error occurs:
            print(f'Loading {sensor} pixel quality\n    Skipping {sensor}; no valid data for query')

    # Concatenate all sensors into one big xarray dataset, and then sort by time 
    sensor_string = ", ".join(successfully_returned)
    print(f'Combining and sorting {sensor_string} data')
    combined_ds = xr.concat(filtered_sensors, dim='time')
    combined_ds = combined_ds.sortby('time')                                                               
       
    # Optionally filter to replace no data values with nans
    if mask_invalid_data:
        print('    Replacing invalid -999 values with NaN (data will be coerced to float64)')
        combined_ds = masking.mask_invalid_data(combined_ds)

    # Return combined dataset
    return combined_ds
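
One way to avoid the float64 coercion mentioned in the step above is to cast the integer bands to float32 before calling `masking.mask_invalid_data`, as the later variant of this function (Example #16 below) does. A minimal, self-contained sketch of that pattern on a toy dataset; the -999 nodata attribute and the band name are illustrative assumptions, and the `datacube.storage.masking` import path is the one these examples appear to use:

import numpy as np
import xarray as xr
from datacube.storage import masking

# Toy int16 band with a -999 nodata flag (illustrative values only)
da = xr.DataArray(np.array([[1, -999], [250, 3]], dtype='int16'),
                  dims=('y', 'x'), attrs={'nodata': -999})
ds = xr.Dataset({'red': da})

# Casting to float32 first stops .where() inside mask_invalid_data from
# promoting the data to float64, roughly halving memory use
ds_f32 = ds.astype(np.float32)
ds_f32['red'].attrs = da.attrs  # some xarray versions drop attrs on astype
ds_masked = masking.mask_invalid_data(ds_f32)
print(ds_masked.red.dtype)      # float32, with -999 replaced by NaN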
Example #14
0
File: ccdc.py Project: klh5/CCDC
def runAll(num_bands, args):
    """Run on all tiles in the specified datasets/area. Keys are based on the last dataset listed."""

    global rows

    # Calculate the right number of columns to be returned from the data cube
    input_num_cols = num_bands + 1

    dc = datacube.Datacube()

    # Create Gridworkflow object for most recent dataset
    gw = GridWorkflow(dc.index, product=args.input_products[-1])

    # Get list of cell keys for most recent dataset
    keys = list(
        gw.list_cells(product=args.input_products[-1],
                      lat=(args.lowerlat, args.upperlat),
                      lon=(args.lowerlon, args.upperlon)).keys())

    dc.close()

    # Run on each key/tile in turn
    for key in keys:

        ccdc_args = []
        input_ds = []
        tmask_ds = []
        cloud_ds = []

        input_ds = loadAll(args.input_products, key, args.bands)

        if (input_ds):

            if (args.tmask_products):

                tmask_ds = loadAll(args.tmask_products, key,
                                   ['green', 'nir', 'swir1'])

            if (args.cloud_products):

                cloud_ds = loadAll(args.cloud_products, key, ['cloud_mask'])

            # Tidy up input data
            input_data = xr.concat(input_ds, dim='time')
            input_data = mask_invalid_data(input_data)

            if (cloud_ds):

                cloud_masks = xr.concat(cloud_ds, dim='time')

            # Do the same for TOA data if present - tmask_ds will be empty if no TOA data sets were specified
            if (tmask_ds):

                tmask_data = xr.concat(tmask_ds, dim='time')
                tmask_data = mask_invalid_data(tmask_data)

            # We want to process each pixel separately
            for i in range(len(input_data.x)):
                for j in range(len(input_data.y)):

                    input_ts = input_data.isel(x=i, y=j)  # Get just one pixel

                    x_val = float(input_ts.x)
                    y_val = float(input_ts.y)

                    input_ts = transformToArray(
                        input_ts
                    )  # Transform the time series into a numpy array

                    if (input_ts.shape[0] > 0
                            and input_ts.shape[1] == input_num_cols):

                        if (cloud_ds):

                            cloud_ts = cloud_masks.isel(
                                x=i, y=j
                            )  # Get cloud mask values through time for this pixel

                            cloud_ts = transformToArray(cloud_ts)
                            cloud_ts = cloud_ts[np.isin(
                                cloud_ts[:, 0], input_ts[:, 0]
                            )]  # Remove any rows which aren't in the SREF data
                            input_ts = input_ts[
                                cloud_ts[:, 1] ==
                                0]  # Do masking (0 value is clear)

                        if (tmask_ds):

                            tmask_ts = tmask_data.isel(x=i, y=j)

                            tmask_ts = transformToArray(tmask_ts)
                            tmask_ts = tmask_ts[np.isin(
                                tmask_ts[:, 0], input_ts[:, 0]
                            )]  # Remove any rows which aren't in the SREF data
                            input_ts = doTmask(
                                input_ts, tmask_ts
                            )  # Use Tmask to further screen the input data

                        argslist = (input_ts, num_bands, x_val, y_val, args)
                        ccdc_args.append(argslist)

            # Do some tidying up
            del input_data

            if (cloud_ds):
                del cloud_ds
                del cloud_masks

            if (tmask_ds):
                del tmask_ds
                del tmask_data

            # Run processes for this key
            with Pool(processes=args.num_procs) as pool:
                pool.starmap(runCCDC, ccdc_args)

            # Generate output file name for this key
            output_file = os.path.join(
                args.outdir, "{}_{}_{}.csv".format(args.output_file, key[0],
                                                   key[1]))

            # Write headers to file
            headers = [
                "x", "y", "band", "start_date", "end_date", "start_val",
                "end_val", "coeffs", "RMSE", "intercept", "alpha",
                "change_date", "magnitude"
            ]

            with open(output_file, 'w') as output:
                writer = csv.writer(output)
                writer.writerow(headers)
                writer.writerows(rows)

            # Reset shared list
            rows = []
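
`transformToArray` is used throughout these CCDC examples but its definition is not shown. A plausible minimal sketch of such a helper (an assumption about its behaviour, not the project's actual implementation) that turns a single-pixel xarray time series into a date-sorted numpy array with incomplete rows dropped:

import numpy as np
import pandas as pd

def transform_to_array_sketch(pixel_ts):
    """Hypothetical stand-in for transformToArray: convert a single-pixel
    xarray Dataset time series into a 2D numpy array with observation dates
    (as ordinal days) in column 0 and band values in the remaining columns,
    sorted by date, with rows containing NaN removed."""
    dates = np.array([pd.Timestamp(t).toordinal()
                      for t in pixel_ts.time.values], dtype=float)
    bands = np.column_stack([pixel_ts[var].values.astype(float)
                             for var in pixel_ts.data_vars])
    arr = np.column_stack([dates, bands])
    arr = arr[~np.isnan(arr).any(axis=1)]  # drop incomplete observations
    return arr[arr[:, 0].argsort()]        # sort chronologically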
Example #15
0
File: ccdc.py Project: klh5/CCDC
def runByTile(key, num_bands, args):
    """Lets you process data using cell keys and x/y extent.
       A key represent one cell/area. Each cell has a tile for each time point. The x and y values define the extent of
       the tile that should be loaded and processed."""

    global rows

    # Calculate the right number of columns to be returned from the data cube
    input_num_cols = num_bands + 1

    ccdc_args = []
    input_ds = []
    tmask_ds = []
    cloud_ds = []

    input_ds = loadByTile(args.input_products, key, args.tile_y_min,
                          args.tile_y_max, args.tile_x_min, args.tile_x_max,
                          args.bands)

    if (input_ds):  # Check that there is actually some input data

        if (args.tmask_products
            ):  # If tmask should be used to screen for outliers

            tmask_ds = loadByTile(args.tmask_products, key, args.tile_y_min,
                                  args.tile_y_max, args.tile_x_min,
                                  args.tile_x_max, ['green', 'nir', 'swir1'])

        if (args.cloud_products):

            cloud_ds = loadByTile(args.cloud_products, key, args.tile_y_min,
                                  args.tile_y_max, args.tile_x_min,
                                  args.tile_x_max, ['cloud_mask'])

        # Tidy up input data
        input_data = xr.concat(input_ds, dim='time')
        input_data = mask_invalid_data(input_data)

        del input_ds

        if (cloud_ds):
            cloud_masks = xr.concat(cloud_ds, dim='time')

        # Do the same for TOA data if present - tmask_ds will be empty if no TOA data sets were specified
        if (tmask_ds):
            tmask_data = xr.concat(tmask_ds, dim='time')
            tmask_data = mask_invalid_data(tmask_data)

        for i in range(len(input_data.x)):
            for j in range(len(input_data.y)):

                input_ts = input_data.isel(x=i, y=j)

                x_val = float(input_ts.x)
                y_val = float(input_ts.y)

                input_ts = transformToArray(
                    input_ts)  # Transform to Numpy array, sort and remove NaNs

                # Check that the input data has at least 1 row and the right number of columns
                if (input_ts.shape[0] > 0
                        and input_ts.shape[1] == input_num_cols):

                    if (cloud_ds):
                        cloud_ts = cloud_masks.isel(
                            x=i, y=j
                        )  # Get cloud mask values through time for this pixel

                        cloud_ts = transformToArray(cloud_ts)
                        cloud_ts = cloud_ts[np.isin(
                            cloud_ts[:, 0], input_ts[:, 0]
                        )]  # Remove any rows which aren't in the SREF data
                        input_ts = input_ts[cloud_ts[:, 1] ==
                                            0]  # Do masking (0 value is clear)

                    if (tmask_ds):
                        tmask_ts = tmask_data.isel(x=i, y=j)

                        tmask_ts = transformToArray(tmask_ts)

                        tmask_ts = tmask_ts[np.isin(
                            tmask_ts[:, 0], input_ts[:, 0]
                        )]  # Remove any rows which aren't in the SREF data
                        input_ts = doTmask(
                            input_ts, tmask_ts
                        )  # Use Tmask to further screen the input data

                    argslist = (input_ts, num_bands, x_val, y_val, args)
                    ccdc_args.append(argslist)

    # Do some tidying up
    del input_data

    if (cloud_ds):
        del cloud_ds
        del cloud_masks

    if (tmask_ds):
        del tmask_ds
        del tmask_data

    # Run processes
    with Pool(processes=args.num_procs) as pool:
        pool.starmap(runCCDC, ccdc_args)

    # Generate output file name
    output_file = os.path.join(args.outdir, "{}.csv".format(args.output_file))

    # Write headers to file
    headers = [
        "x", "y", "band", "start_date", "end_date", "start_val", "end_val",
        "coeffs", "RMSE", "intercept", "alpha", "change_date", "magnitude"
    ]

    with open(output_file, 'w') as output:
        writer = csv.writer(output)
        writer.writerow(headers)
        writer.writerows(rows)
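
The date-alignment trick used above (keeping only cloud-mask rows whose dates appear in the surface-reflectance array, then masking on the flag column) can be demonstrated in isolation on small numpy arrays with made-up values:

import numpy as np

# Column 0 is the observation date (ordinal day), remaining columns are values
sref_ts = np.array([[737060., 100., 200.],
                    [737076., 110., 210.],
                    [737092., 120., 220.]])
cloud_ts = np.array([[737060., 0.],   # clear
                     [737068., 1.],   # date absent from sref_ts
                     [737076., 1.],   # cloudy
                     [737092., 0.]])  # clear

# Keep only cloud-mask rows whose dates exist in the SREF data, then keep
# only SREF rows where the cloud flag is 0 (clear)
cloud_ts = cloud_ts[np.isin(cloud_ts[:, 0], sref_ts[:, 0])]
sref_clear = sref_ts[cloud_ts[:, 1] == 0]
print(sref_clear)  # rows for 737060 and 737092 only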
Example #16
0
def load_clearlandsat(dc,
                      query,
                      sensors=('ls5', 'ls7', 'ls8'),
                      product='nbart',
                      dask_chunks={'time': 1},
                      lazy_load=False,
                      bands_of_interest=None,
                      masked_prop=0.0,
                      mask_dict=None,
                      mask_pixel_quality=True,
                      mask_invalid_data=True,
                      ls7_slc_off=False,
                      satellite_metadata=False):
    """Loads Landsat NBAR, NBART or FC25 and PQ data for multiple sensors (i.e. ls5, ls7, ls8) and returns a single 
    xarray dataset containing only observations that contain greater than a given proportion of good quality pixels.
    
    This function can be used to extract visually appealing time series of observations that are not affected by cloud,
    for example as an input to the `animated_timeseries` function from `DEAPlotting`.
    
    The proportion of clear pixels is calculated by summing the pixels that are marked as being good quality
    in the Landsat PQ25 layer. By default cloud, cloud shadow, saturated pixels and pixels missing data for any band 
    are considered poor quality data, but this can be customised using the `mask_dict` parameter.
    
    Last modified: March 2019
    Author: Robbi Bishop-Taylor, Bex Dunn    
    
    Parameters
    ----------    
    dc : datacube Datacube object
        A specific Datacube to import from, i.e. `dc = datacube.Datacube(app='Clear Landsat')`. This allows you to 
        also use development datacubes if they have been imported into the environment.    
    query : dict
        A dict containing the query bounds. Can include lat/lon, time etc. If no `time` query is given, the 
        function defaults to all timesteps available to all sensors (e.g. 1987-2018)
    sensors : list, optional
        An optional list of Landsat sensor names to load data for. Options are 'ls5', 'ls7', 'ls8'; defaults to all.
    product : str, optional
        An optional string specifying 'nbar', 'nbart' or 'fc'. Defaults to 'nbart'. For information on the difference, 
        see the '02_DEA_datasets/Introduction_to_Landsat' or '02_DEA_datasets/Introduction_to_Fractional_Cover'
        notebooks from DEA-notebooks.
    dask_chunks : dict, optional
        An optional dictionary containing the coords and sizes you wish to create dask chunks over. Usually
        used in combination with lazy_load=True (see below). example: dask_chunks = {'x': 500, 'y': 500}
    lazy_load : boolean, optional
        Setting this variable to 'True' will delay the computation of the function until you explicitly
        run ds.compute(). If used in conjuction with dask.distributed.Client() will allow 
        for automatic parallel computation. 
    bands_of_interest : list, optional
        An optional list of strings containing the bands to be read in; options include 'red', 'green', 'blue', 
        'nir', 'swir1', 'swir2'; defaults to all available bands if no bands are specified.
    masked_prop : float, optional
        An optional float giving the minimum percentage of good quality pixels required for a Landsat observation to 
        be loaded. Defaults to 0.0 which will return all observations regardless of pixel quality (set to e.g. 0.99 
        to return only observations with more than 99% good quality pixels).
    mask_dict : dict, optional
        An optional dict of arguments to the `masking.make_mask` function that can be used to identify poor
        quality pixels from the PQ layer using alternative masking criteria. The default value of None masks
        out pixels flagged as cloud or cloud shadow by either the ACCA or Fmask algorithms, any saturated pixels, 
        or any pixels that are missing data in any band (equivalent to: `mask_dict={'cloud_acca': 'no_cloud', 
        'cloud_shadow_acca': 'no_cloud_shadow', 'cloud_shadow_fmask': 'no_cloud_shadow', 'cloud_fmask': 'no_cloud', 
        'blue_saturated': False, 'green_saturated': False, 'red_saturated': False, 'nir_saturated': False, 
        'swir1_saturated': False, 'swir2_saturated': False, 'contiguous': True}`. See the 
        `02_DEA_datasets/Introduction_to_LandsatPQ.ipynb` notebook on DEA Notebooks for a list of all possible options.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the pixel quality mask to all observations that were not
        filtered out for having fewer good quality pixels than `masked_prop`. For example, if `masked_prop=0.99`, the
        filtered images may still contain up to 1% poor quality pixels. The default of True masks these pixels out
        and sets them to NaN using the pixel quality mask, but has the side effect of changing the data type of the
        output arrays from int16 to float32, which can cause memory issues. To reduce memory usage, set to False to
        return the observations without masking.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata values should be replaced with NaN. Defaults to
        True; this has the side effect of changing the data type of the output arrays from int16 to float32 which
        can cause memory issues. To reduce memory usage, set to False.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from after the Landsat 7 SLC failure (i.e. SLC-off).
        Defaults to False, which removes all Landsat 7 observations after May 31 2003. 
    satellite_metadata : bool, optional
        An optional boolean indicating whether to return the dataset with a `satellite` variable that gives the name 
        of the satellite that made each observation in the timeseries (i.e. ls5, ls7, ls8). Defaults to False. 
    
    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only Landsat observations that contain greater than `masked_prop`
        proportion of clear pixels.   
        
    Notes
    -----
    Memory issues: For large data extractions, it is recommended that you set both `mask_pixel_quality=False` and 
    `mask_invalid_data=False`. Otherwise, all output variables will be coerced to float32 when NaN values are 
    inserted into the array, potentially causing your data to use 2x as much memory. Be aware that the resulting
    arrays will contain invalid -999 values which should be considered in analyses.
        
    Example
    -------    
    >>> # Import modules
    >>> import datacube
    >>> import sys
    >>> # Import dea-notebooks functions using relative link to 10_Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling
    >>> # Connect to a datacube containing Landsat data
    >>> dc = datacube.Datacube(app='load_clearlandsat')
    >>> # Set up spatial and temporal query
    >>> query = {'x': (954163, 972163),
    ...          'y': (-3573891, -3555891),
    ...          'time': ('2011-06-01', '2013-06-01'),
    ...          'crs': 'EPSG:3577'}   
    >>> # Load observations with more than 75% good quality pixels from ls5, ls7 and ls8 as a combined dataset
    >>> landsat_ds = DEADataHandling.load_clearlandsat(dc=dc, query=query, sensors=['ls5', 'ls7', 'ls8'], 
    ...                                    bands_of_interest=['red', 'green', 'blue'], 
    ...                                    masked_prop=0.75, mask_pixel_quality=True, ls7_slc_off=True)
    Loading ls5
        Loading 4 filtered ls5 timesteps
    Loading ls7
        Loading 29 filtered ls7 timesteps
    Loading ls8
        Loading 3 filtered ls8 timesteps
    Combining and sorting ls5, ls7, ls8 data
        Replacing invalid -999 values with NaN (data will be coerced to float32)
    >>> # Test that function returned data
    >>> len(landsat_ds.time) > 0
    True
                
    """

    #######################
    # Process each sensor #
    #######################

    # Warn if loading a PQ bitstring product and attempting to mask it (and therefore cast to float)
    if product == 'pq' and (mask_invalid_data or mask_pixel_quality):
        warnings.warn("You are attempting to load a pixel quality product with a "
                      "mask flag (mask_invalid_data or mask_pixel_quality). "
                      "Pixel quality is a bitstring (only makes sense as int) "
                      "and masking casts to float32.")

    # Dictionary to save results from each sensor
    filtered_sensors = {}

    # Iterate through all sensors, returning only observations with > mask_prop clear pixels
    for sensor in sensors:

        # Load PQ data using dask
        print(f'Loading {sensor}')

        # If bands of interest are given, assign measurements in dc.load call. This is
        # for compatibility with the existing dea-notebooks load_nbarx function.
        if bands_of_interest:

            # Lazily load Landsat data using dask
            data = dc.load(product=f'{sensor}_{product}_albers',
                           measurements=bands_of_interest,
                           group_by='solar_day',
                           dask_chunks=dask_chunks,
                           **query)

        # If no bands of interest given, run without specifying measurements, and
        # therefore return all available bands
        else:

            # Lazily load Landsat data using dask
            data = dc.load(product=f'{sensor}_{product}_albers',
                           group_by='solar_day',
                           dask_chunks=dask_chunks,
                           **query)

        # Load PQ data
        pq = dc.load(product=f'{sensor}_pq_albers',
                     group_by='solar_day',
                     fuse_func=ga_pq_fuser,
                     dask_chunks=dask_chunks,
                     **query)

        # If resulting dataset has data, continue:
        if data.variables:

            # Remove Landsat 7 SLC-off from PQ layer if ls7_slc_off=False
            if not ls7_slc_off and sensor == 'ls7':

                print('    Ignoring SLC-off observations for ls7')
                data = data.sel(time=data.time < np.datetime64('2003-05-30'))

            # If more than 0 timesteps
            if len(data.time) > 0:

                # Return only Landsat observations that have matching PQ data
                time = (data.time - pq.time).time
                data = data.sel(time=time)
                pq = pq.sel(time=time)

                # If a custom dict is provided for mask_dict, use these values to make mask from PQ
                if mask_dict:

                    # Mask PQ using custom values by unpacking mask_dict **kwarg
                    good_quality = masking.make_mask(pq.pixelquality,
                                                     **mask_dict)

                else:

                    # Identify pixels with no clouds in either ACCA or Fmask
                    good_quality = masking.make_mask(
                        pq.pixelquality,
                        cloud_acca='no_cloud',
                        cloud_shadow_acca='no_cloud_shadow',
                        cloud_shadow_fmask='no_cloud_shadow',
                        cloud_fmask='no_cloud',
                        blue_saturated=False,
                        green_saturated=False,
                        red_saturated=False,
                        nir_saturated=False,
                        swir1_saturated=False,
                        swir2_saturated=False,
                        contiguous=True)

                # Compute good data for each observation as a percentage of total array pixels. Need to
                # sum over x and y axes individually so that the function works with lat-lon dimensions,
                # and because it isn't currently possible to pass a list of axes (bug with xarray?)
                data_perc = good_quality.sum(axis=1).sum(
                    axis=1) / (good_quality.shape[1] * good_quality.shape[2])

                # Add data_perc data to Landsat dataset as a new xarray variable
                data['data_perc'] = xr.DataArray(data_perc,
                                                 [('time', data.time)])

                # Filter by data_perc to drop low quality observations and finally import data using dask
                filtered = data.sel(time=data.data_perc >= masked_prop)
                print(
                    f'    Loading {len(filtered.time)} filtered {sensor} timesteps'
                )

                # Optionally apply pixel quality mask to all observations that were not dropped in previous step
                if mask_pixel_quality:

                    # First change dtype to float32, then mask out values using
                    # `.where()`. By casting to float32, we prevent `.where()`
                    # from automatically casting to float64, using 2x the memory
                    # We also need to manually reset attributes due to a possible
                    # bug in recent xarray version
                    filtered = filtered.astype(
                        np.float32).assign_attrs(crs=filtered.crs)
                    filtered = filtered.where(good_quality)

                # Optionally add satellite name variable
                if satellite_metadata:
                    filtered['satellite'] = xr.DataArray(
                        [sensor] * len(filtered.time),
                        [('time', filtered.time)])

                # Add result to dictionary
                if lazy_load:
                    filtered_sensors[sensor] = filtered
                else:
                    filtered_sensors[sensor] = filtered.compute()

                # Close datasets
                filtered = None
                good_quality = None
                data = None
                pq = None

            else:

                # If there is no data for sensor or if another error occurs:
                print(f'    Skipping {sensor}; no valid data for query')

        else:

            # If there is no data for sensor or if another error occurs:
            print(f'    Skipping {sensor}; no valid data for query')

    ############################
    # Combine multiple sensors #
    ############################

    # Proceed with concatenating only if there is more than 1 sensor processed
    if len(filtered_sensors) > 1:

        # Concatenate all sensors into one big xarray dataset, and then sort by time
        sensor_string = ", ".join(filtered_sensors.keys())
        print(f'Combining and sorting {sensor_string} data')
        combined_ds = xr.concat(filtered_sensors.values(), dim='time')
        combined_ds = combined_ds.sortby('time')

        # Optionally filter to replace no data values with nans
        if mask_invalid_data:

            print(
                '    Replacing invalid -999 values with NaN (data will be coerced to float32)'
            )

            # First change dtype to float32, then mask out values using
            # `.where()`. By casting to float32, we prevent `.where()`
            # from automatically casting to float64, using 2x the memory
            # We also need to manually reset attributes due to a possible
            # bug in recent xarray version
            combined_ds = (combined_ds.astype(
                np.float32).assign_attrs(crs=combined_ds.crs))
            combined_ds = masking.mask_invalid_data(combined_ds)

        # reset pixel quality attributes
        if product == 'pq':
            combined_ds.pixelquality.attrs.update(
                list(filtered_sensors.values())[0].pixelquality.attrs)

        # Return combined dataset
        return combined_ds

    # Return the single dataset if only one sensor was processed
    elif len(filtered_sensors) == 1:

        sensor_string = ", ".join(filtered_sensors.keys())
        print(f'Returning {sensor_string} data')
        sensor_ds = list(filtered_sensors.values())[0]

        # Optionally filter to replace no data values with nans
        if mask_invalid_data:

            print(
                '    Replacing invalid -999 values with NaN (data will be coerced to float32)'
            )

            # First change dtype to float32, then mask out values using
            # `.where()`. By casting to float32, we prevent `.where()`
            # from automatically casting to float64, using 2x the memory
            # We also need to manually reset attributes due to a possible
            # bug in recent xarray version
            sensor_ds = (sensor_ds.astype(
                np.float32).assign_attrs(crs=sensor_ds.crs))
            sensor_ds = masking.mask_invalid_data(sensor_ds)

        return sensor_ds

    else:

        print(
            f'No data returned for query for any sensor in {", ".join(sensors)} '
            f'and time range {"-".join(query["time"])}')
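
A `mask_dict` other than the default described in the docstring can be passed to relax or tighten the pixel-quality criteria. A sketch of a call using only the Fmask cloud and contiguity flags, assuming `dc` and `query` are defined as in the docstring example (the flag names are taken from the defaults listed above):

custom_mask = {'cloud_fmask': 'no_cloud',
               'contiguous': True}

landsat_ds = load_clearlandsat(dc=dc,
                               query=query,
                               sensors=['ls8'],
                               masked_prop=0.9,
                               mask_dict=custom_mask,
                               mask_pixel_quality=True)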
Example #17
0
def load_nbarx(dc, sensor, query, product='nbart', bands_of_interest='', filter_pq=True):
    """
    Loads NBAR (Nadir BRDF Adjusted Reflectance) or NBAR-T (terrain corrected NBAR) data for a
    sensor, masks using pixel quality (PQ), then optionally filters out terrain -999s (for NBAR-T).
    Returns an xarray dataset and CRS and Affine objects defining map projection and geotransform

    Last modified: May 2018
    Author: Bex Dunn
    Modified by: Claire Krause, Robbi Bishop-Taylor, Bex Dunn

    inputs
    dc - Handle for the Datacube to import from. This allows you to also use dev environments
    if they have been imported into the environment.
    sensor - Options are 'ls5', 'ls7', 'ls8'
    query - A dict containing the query bounds. Can include lat/lon, time etc. 

    optional
    product - 'nbar' or 'nbart'. Defaults to nbart unless otherwise specified
    bands_of_interest - List of strings containing the bands to be read in; defaults to all bands,
                        options include 'red', 'green', 'blue', 'nir', 'swir1', 'swir2'
    filter_pq - boolean. Will filter clouds and saturated pixels using PQ unless set to False


    outputs
    ds - Extracted and optionally PQ filtered dataset
    crs - CRS object defining dataset coordinate reference system
    affine - Affine object defining dataset affine transformation
    """

    product_name = '{}_{}_albers'.format(sensor, product)
    mask_product = '{}_{}_albers'.format(sensor, 'pq')
    print('Loading {}'.format(product_name))

    # If bands of interest are given, assign measurements in dc.load call
    if bands_of_interest:

        ds = dc.load(product=product_name, measurements=bands_of_interest,
                     group_by='solar_day', **query)

    # If no bands of interest given, run without specifying measurements
    else:

        ds = dc.load(product=product_name, group_by='solar_day', **query)

    # Proceed if the resulting call returns data
    if ds.variables:

        crs = ds.crs
        affine = ds.affine
        print('Loaded {}'.format(product_name))

        # If pixel quality filtering is enabled, extract PQ data to use as mask
        if filter_pq:

            sensor_pq = dc.load(product=mask_product, fuse_func=ga_pq_fuser,
                                group_by='solar_day', **query)

            # If PQ call returns data, use to mask input data
            if sensor_pq.variables:
                print('Generating mask {}'.format(mask_product))
                good_quality = masking.make_mask(sensor_pq.pixelquality,
                                                 cloud_acca='no_cloud',
                                                 cloud_shadow_acca='no_cloud_shadow',
                                                 cloud_shadow_fmask='no_cloud_shadow',
                                                 cloud_fmask='no_cloud',
                                                 blue_saturated=False,
                                                 green_saturated=False,
                                                 red_saturated=False,
                                                 nir_saturated=False,
                                                 swir1_saturated=False,
                                                 swir2_saturated=False,
                                                 contiguous=True)

                # Apply mask to preserve only good data
                ds = ds.where(good_quality)

            ds.attrs['crs'] = crs
            ds.attrs['affine'] = affine

        # Replace nodata values with nans

            ds = masking.mask_invalid_data(ds)

        return ds, crs, affine

    else:

        print('Failed to load {}'.format(product_name))
        return None, None, None
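
A usage sketch for `load_nbarx`; the query extents and dates are placeholders, and 'red' and 'nir' are among the band names listed in the docstring:

import datacube

dc = datacube.Datacube(app='load_nbarx_example')

query = {'x': (153.35, 153.50),   # placeholder extents
         'y': (-27.55, -27.45),
         'time': ('2017-01-01', '2017-12-31')}

ds, crs, affine = load_nbarx(dc, sensor='ls8', query=query,
                             bands_of_interest=['red', 'nir'])

if ds is not None:
    # NDVI from the PQ-masked, nodata-masked reflectance bands
    ndvi = (ds.nir - ds.red) / (ds.nir + ds.red)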
Example #18
0
        for tile_index, tile in tile_list.items():
            dataset = gw.load(tile[0:1, 400:401, 0:1],
                              measurements=['red', 'nir', 'blue',
                                            'green'])  # 200ish/400ish

            if (dataset.variables):
                sref_ds.append(dataset)

    # Close datacube connection to database
    dc.close()

    # Concatenate the three datasets
    sref = xr.concat(sref_ds, dim='time')

    # Change nodata values (0's) to NaN
    sref = mask_invalid_data(sref)

    # We want to process each pixel separately
    for i in range(len(sref.x)):
        for j in range(len(sref.y)):

            # Get the time series of observations for this pixel
            sref_ts = sref.isel(x=i, y=j)

            # Transform to pandas dataframe
            sref_data = transformToDf(sref_ts)

            # Drop rows with NA values in any column
            sref_data.dropna(axis=0, how='any', inplace=True)

            # Check columns weren't dropped
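
`transformToDf` is not shown in this fragment; a plausible sketch of such a helper (an assumption about its behaviour, not the original code) that converts a single-pixel time series into a pandas DataFrame:

def transform_to_df_sketch(pixel_ts):
    """Hypothetical stand-in for transformToDf: convert a single-pixel
    xarray Dataset time series into a pandas DataFrame with one row per
    date and one column per band."""
    df = pixel_ts.to_dataframe()
    # Drop the scalar spatial coordinate columns if present
    df = df.drop(columns=[c for c in ('x', 'y') if c in df.columns])
    return df.reset_index()  # 'time' becomes an ordinary column

# sref_data = transform_to_df_sketch(sref_ts)
# sref_data.dropna(axis=0, how='any', inplace=True)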
Example #19
0
def load_ard(
    dc,
    products=None,
    min_gooddata=0.0,
    fmask_gooddata=[1, 4, 5],
    mask_pixel_quality=True,
    mask_invalid_data=True,
    mask_contiguity="nbart_contiguity",
    mask_dtype=np.float32,
    ls7_slc_off=True,
    product_metadata=False,
    **dcload_kwargs,
):
    """
    Loads Landsat Collection 3 or Sentinel 2 Definitive and Near Real
    Time data for multiple sensors (i.e. ls5t, ls7e and ls8c for
    Landsat; s2a and s2b for Sentinel 2), and returns a single masked
    xarray dataset containing only observations that contain greater
    than a given proportion of good quality pixels. This can be used
    to extract clean time series of observations that are not affected
    by cloud, for example as an input to the `animated_timeseries`
    function from `dea_plotting`.

    The proportion of good quality pixels is calculated by summing the
    pixels flagged as good quality in `fmask`. By default non-cloudy or
    shadowed land, snow and water pixels are treated as good quality,
    but this can be customised using the `fmask_gooddata` parameter.

    Last modified: February 2020

    Parameters
    ----------
    dc : datacube Datacube object
        The Datacube to connect to, i.e. `dc = datacube.Datacube()`.
        This allows you to also use development datacubes if required.
    products : list
        A list of product names to load data from. Valid options are
        ['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] for Landsat,
        ['s2a_ard_granule', 's2b_ard_granule'] for Sentinel 2 Definitive,
        and ['s2a_nrt_granule', 's2b_nrt_granule'] for Sentinel 2 Near
        Real Time (on the DEA Sandbox only).
    min_gooddata : float, optional
        An optional float giving the minimum percentage of good quality
        pixels required for a satellite observation to be loaded.
        Defaults to 0.0 which will return all observations regardless of
        pixel quality (set to e.g. 0.99 to return only observations with
        more than 99% good quality pixels).
    fmask_gooddata : list, optional
        An optional list of fmask values to treat as good quality
        observations in the above `min_gooddata` calculation. The
        default is `[1, 4, 5]` which will return non-cloudy or shadowed
        land, snow and water pixels. Choose from:
        `{'0': 'nodata', '1': 'valid', '2': 'cloud',
          '3': 'shadow', '4': 'snow', '5': 'water'}`.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the good data
        mask to all observations that were not filtered out for having
        fewer good quality pixels than `min_gooddata`. E.g. if
        `min_gooddata=0.99`, the filtered observations may still contain
        up to 1% poor quality pixels. The default of True masks these
        pixels out and sets them to NaN using the good data mask. This
        will convert numeric values to floating point values, which can
        cause memory issues; set to False to prevent this.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata
        values should be replaced with NaN. These invalid values can be
        caused by missing data along the edges of scenes, or terrain
        effects (for NBART). Be aware that masking out invalid values
        will convert all numeric values to floating point values when
        -999 values are replaced with NaN, which can cause memory issues.
    mask_contiguity : str or bool, optional
        An optional string or boolean indicating whether to mask out
        pixels missing data in any band (i.e. "non-contiguous" values).
        Although most missing data issues are resolved by
        `mask_invalid_data`, this step is important for generating
        clean and consistent composite datasets. The default
        is `mask_contiguity='nbart_contiguity'` which will set any
        pixels with non-contiguous values to NaN based on NBART data.
        If you are loading NBAR data instead, you should specify
        `mask_contiguity='nbar_contiguity'` instead. To ignore non-
        contiguous values completely, set `mask_contiguity=False`.
        Be aware that masking out non-contiguous values will convert
        all numeric values to floating point values when -999 values
        are replaced with NaN, which can cause memory issues.
    mask_dtype : numpy dtype, optional
        An optional parameter that controls the data type/dtype that
        layers are coerced to when `mask_pixel_quality=True` or
        `mask_contiguity=True`. Defaults to `np.float32`, which uses
        approximately 1/2 the memory of `np.float64`.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to
        True, which keeps all Landsat 7 observations > May 31 2003.
    product_metadata : bool, optional
        An optional boolean indicating whether to return the dataset
        with a `product` variable that gives the name of the product
        that each observation in the time series came from (e.g.
        'ga_ls5t_ard_3'). Defaults to False.
    **dcload_kwargs :
        A set of keyword arguments to `dc.load` that define the
        spatiotemporal query used to extract data. This typically
        includes `measurements`, `x`, `y`, `time`, `resolution`,
        `resampling`, `group_by` and `crs`. Keyword arguments can
        either be listed directly in the `load_ard` call like any
        other parameter (e.g. `measurements=['nbart_red']`), or by
        passing in a query kwarg dictionary (e.g. `**query`). For a
        list of possible options, see the `dc.load` documentation:
        https://datacube-core.readthedocs.io/en/latest/dev/api/generate/datacube.Datacube.load.html

    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only satellite observations that
        contains greater than `min_gooddata` proportion of good quality
        pixels.

    """

    # Due to possible bug in xarray 0.13.0, define temporary function
    # which converts dtypes in a way that preserves attributes
    def astype_attrs(da, dtype=np.float32):
        """
        Loop through all data variables in the dataset, record
        attributes, convert to a custom dtype, then reassign attributes.
        If the data variable cannot be converted to the custom dtype
        (e.g. trying to convert non-numeric dtype like strings to
        floats), skip and return the variable unchanged.

        This can be combined with `.where()` to save memory. By casting
        to e.g. np.float32, we prevent `.where()` from automatically
        casting to np.float64, using 2x the memory. np.float16 could be
        used to save even more memory (although this may not be
        compatible with all downstream applications).

        This custom function is required instead of using xarray's
        built-in `.astype()`, due to a bug in xarray 0.13.0 that drops
        attributes: https://github.com/pydata/xarray/issues/3348
        """

        try:
            da_attr = da.attrs
            da = da.astype(dtype)
            da = da.assign_attrs(**da_attr)
            return da

        except ValueError:
            return da

    dcload_kwargs = deepcopy(dcload_kwargs)

    # Determine if lazy loading is required
    lazy_load = "dask_chunks" in dcload_kwargs

    # Warn user if they combine lazy load with min_gooddata
    if (min_gooddata > 0.0) & lazy_load:
        warnings.warn("Setting 'min_gooddata' percentage to > 0.0 "
                      "will cause dask arrays to compute when "
                      "loading pixel-quality data to calculate "
                      "'good pixel' percentage. This can "
                      "significantly slow the return of your dataset.")

    # Verify that products were provided, and that only Sentinel-2 or
    # only Landsat products are being loaded at the same time
    if not products:
        raise ValueError("Please provide a list of product names "
                         "to load data from. Valid options are: \n"
                         "['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] "
                         "for Landsat, ['s2a_ard_granule', "
                         "'s2b_ard_granule'] \nfor Sentinel 2 Definitive, or "
                         "['s2a_nrt_granule', 's2b_nrt_granule'] for "
                         "Sentinel 2 Near Real Time")
    elif all(["ls" in product for product in products]):
        pass
    elif all(["s2" in product for product in products]):
        pass
    else:
        raise ValueError("Loading both Sentinel-2 and Landsat data "
                         "at the same time is currently not supported")

    # Create a list to hold data for each product
    product_data = []

    # Iterate through each requested product
    for product in products:

        try:

            # Load data including fmask band
            print(f"Loading {product} data")
            try:

                # If dask_chunks is specified, load data using query
                if lazy_load:
                    ds = dc.load(product=f"{product}", **dcload_kwargs)

                # If no dask chunks specified, add this param so that
                # we can lazy load data before filtering by good data
                else:
                    ds = dc.load(product=f"{product}",
                                 dask_chunks={},
                                 **dcload_kwargs)

            except KeyError as e:
                raise ValueError(f"Band {e} does not exist in this product. "
                                 f"Verify all requested `measurements` exist "
                                 f"in {products}")

            # Keep a record of the original number of observations
            total_obs = len(ds.time)

            # Identify all pixels not affected by cloud/shadow/invalid data,
            # i.e. pixels whose fmask value is listed in `fmask_gooddata`
            good_quality = ds.fmask.isin(fmask_gooddata)

            # Optionally drop observations whose proportion of good quality
            # pixels falls below `min_gooddata`
            if min_gooddata > 0.0:
                data_perc = (good_quality.sum(axis=1).sum(axis=1) /
                             (good_quality.shape[1] * good_quality.shape[2]))
                ds = ds.sel(time=(data_perc >= min_gooddata).compute())
                print(f"    Filtering to {len(ds.time)} out of "
                      f"{total_obs} time steps")

            # If any data was returned
            if len(ds.time) > 0:

                # Optionally apply pixel quality mask to observations
                # remaining after the filtering step above to mask out
                # all remaining bad quality pixels
                if mask_pixel_quality:
                    print("    Applying pixel quality/cloud mask")

                    # Change dtype to custom float before masking to
                    # save memory. See `astype_attrs` func docstring
                    # above for details
                    ds = ds.apply(astype_attrs,
                                  dtype=mask_dtype,
                                  keep_attrs=True)
                    ds = ds.where(good_quality)

                # Optionally filter to replace no data values with nans
                if mask_invalid_data:
                    print("    Applying invalid data mask")

                    # Change dtype to custom float before masking to
                    # save memory. See `astype_attrs` func docstring
                    # above for details
                    ds = ds.apply(astype_attrs,
                                  dtype=mask_dtype,
                                  keep_attrs=True)
                    ds = masking.mask_invalid_data(ds)

                # If any data was returned, add result to list
                product_data.append(ds)

            # If no data is returned, print status
            else:
                print(f"    No data for {product}")

        # If an AttributeError is raised due to there being no variables in
        # the dataset, skip this product and move on to the next
        except AttributeError:
            print(f"    No data for {product}")

    # If any data was returned above, combine into one xarray
    if len(product_data) > 0:
        # Concatenate results and sort by time
        print(f"Combining and sorting data")
        combined_ds = xr.concat(product_data, dim="time").sortby("time")

        # If `lazy_load` is True, return data as a dask array without
        # actually loading it in
        if lazy_load:
            print(f"    Returning {len(combined_ds.time)} observations"
                  " as a dask array")
            return combined_ds
        else:
            print(f"    Returning {len(combined_ds.time)} observations ")
            return combined_ds.compute()

    # If no data was returned:
    else:
        print("No data returned for query")
        return None
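
A usage sketch for `load_ard`; the coordinates and dates are placeholders, the product names are those listed in the docstring, and 'fmask' is included in the measurements because the good-quality mask above is built from it:

import datacube

dc = datacube.Datacube(app='load_ard_example')

query = {'x': (153.35, 153.50),   # placeholder extents
         'y': (-27.55, -27.45),
         'time': ('2019-01-01', '2019-06-30'),
         'measurements': ['nbart_red', 'nbart_green', 'nbart_blue', 'fmask'],
         'output_crs': 'EPSG:3577',
         'resolution': (-30, 30),
         'group_by': 'solar_day'}

ds = load_ard(dc=dc,
              products=['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'],
              min_gooddata=0.9,
              **query)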
Example #20
0
def load_clearlandsat(dc, query, sensors=['ls5', 'ls7', 'ls8'], bands_of_interest=None,
                      product='nbart', masked_prop=0.99, mask_dict=None, apply_mask=False, ls7_slc_off=False):
    
    """
    Loads Landsat NBAR, NBART or FC25 and PQ data for multiple sensors (i.e. ls5, ls7, ls8), and returns a single 
    xarray dataset containing only observations that contain greater than a given proportion of clear pixels.    
  
    This function was designed to extract visually appealing time series of observations that are not
    affected by cloud, for example as an input to the `animated_timeseries` function from `DEAPlotting`.
    
    The proportion of clear pixels is calculated by summing the pixels that are flagged as good quality
    in the Landsat PQ25 layer. By default, pixels flagged as cloud by either ACCA or Fmask, or pixels missing
    valid data in any band, are treated as unclear, but this can be customised using the `mask_dict` parameter.
    
    Last modified: August 2018
    Author: Robbi Bishop-Taylor, Bex Dunn
    
    :param dc: 
        A specific Datacube to import from, i.e. `dc = datacube.Datacube(app='Clear Landsat')`. This allows you to 
        also use development datacubes if they have been imported into the environment.
    
    :param query: 
        A dict containing the query bounds. Can include lat/lon, time etc. If no `time` query is given, the 
        function defaults to all timesteps available to all sensors (e.g. 1987-2018)

    :param sensors:
        An optional list of Landsat sensor names to load data for. Options are 'ls5', 'ls7', 'ls8'; defaults to all.

    :param product:
        An optional string specifying 'nbar', 'nbart' or 'fc'. Defaults to 'nbart'. For information on the difference, 
        see the 'GettingStartedWithLandsat' or 'Introduction_to_Fractional_Cover' notebooks on DEA-notebooks.
        
    :param bands_of_interest:
        An optional list of strings containing the bands to be read in; options include 'red', 'green', 'blue', 
        'nir', 'swir1', 'swir2'; defaults to all available bands if no bands are specified.

    :param masked_prop:
        An optional float giving the minimum percentage of clear pixels required for a Landsat observation to be 
        loaded. Defaults to 0.99 (i.e. only return observations with less than 1% of unclear pixels).
            
    :param mask_dict:
        An optional dict of arguments to the `masking.make_mask` function that can be used to identify clear 
        observations from the PQ layer using alternative masking criteria. The default value of None masks out 
        pixels flagged as cloud by either the ACCA or Fmask algorithms, and requires valid values in every band 
        (equivalent to: `mask_dict={'cloud_acca': 'no_cloud', 'cloud_fmask': 'no_cloud', 'contiguous': True}`).
        See the `Landsat5-7-8-PQ` notebook on DEA Notebooks for a list of all possible options.
        
    :param apply_mask:
        An optional boolean indicating whether resulting observations should have the PQ mask applied to filter
        out any remaining unclear cells. For example, if `masked_prop=0.99`, the filtered images may still contain
        up to 1% unclear/cloudy pixels. The default of False simply returns the resulting observations without
        masking out these pixels; True removes them using the mask. 

    :param ls7_slc_off:
        An optional boolean indicating whether to include data from after the Landsat 7 SLC failure (i.e. SLC-off).
        Defaults to False, which removes all Landsat 7 observations after May 31 2003. 
    
    :returns:
        An xarray dataset containing only Landsat observations that contain greater than `masked_prop`
        proportion of clear pixels.  
        
    :example:
    
    >>> # Import modules
    >>> import datacube
    >>> import sys
    >>> 
    >>> # Import dea-notebooks functions using relative link to Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling   
    >>> 
    >>> # Define datacube to import from
    >>> dc = datacube.Datacube(app='Clear Landsat')
    >>> 
    >>> # Set up spatial and temporal query
    >>> query = {'x': (-191400.0, -183400.0),
    >>>          'y': (-1423460.0, -1415460.0),
    >>>          'time': ('1998-01-01', '2003-01-01'),
    >>>          'crs': 'EPSG:3577'}
    >>> 
    >>> # Load in red, green and blue bands for all clear Landsat observations with < 1% unclear values. 
    >>> combined_ds = DEADataHandling.load_clearlandsat(dc=dc, query=query, 
    >>>                                                 bands_of_interest=['red', 'green', 'blue'], 
    >>>                                                 masked_prop=0.99) 
    >>> combined_ds
        
    """
    

    # List to save results from each sensor
    filtered_sensors = []

    # Iterate through all sensors, returning only observations with > mask_prop clear pixels
    for sensor in sensors:
        
        try:
            
            # If bands of interest are given, assign measurements in dc.load call. This is
            # for compatibility with the existing dea-notebooks load_nbarx function.
            if bands_of_interest:
                
                # Lazily load Landsat data using dask              
                data = dc.load(product = '{}_{}_albers'.format(sensor, product),
                               measurements=bands_of_interest,
                               group_by = 'solar_day', 
                               dask_chunks={'time': 1},
                               **query)

            # If no bands of interest given, run without specifying measurements, and 
            # therefore return all available bands
            else:
                
                # Lazily load Landsat data using dask  
                data = dc.load(product = '{}_{}_albers'.format(sensor, product),
                               group_by = 'solar_day', 
                               dask_chunks={'time': 1},
                               **query)             

            # Load PQ data
            pq = dc.load(product = '{}_pq_albers'.format(sensor),
                         group_by = 'solar_day',
                         fuse_func=ga_pq_fuser,
                         dask_chunks={'time': 1},
                         **query)

            # Remove Landsat 7 SLC-off from PQ layer if ls7_slc_off=False
            if not ls7_slc_off and sensor == 'ls7':

                print('Ignoring SLC-off observations for ls7')
                data = data.where(data.time < np.datetime64('2003-05-30'), drop=True) 

            # Return only Landsat observations that have matching PQ data 
            time = (data.time - pq.time).time
            data = data.sel(time=time)
            pq = pq.sel(time=time)

            # Load PQ data using dask
            print('Loading {} PQ'.format(sensor))
            pq = pq.compute()
            
            # If a custom dict is provided for mask_dict, use these values to make mask from PQ
            if mask_dict:
                
                # Mask PQ using custom values by unpacking mask_dict **kwarg
                good_quality = masking.make_mask(pq.pixelquality, **mask_dict)
                
            else:

                # Identify pixels with no clouds in either ACCA or Fmask
                good_quality = masking.make_mask(pq.pixelquality,
                                                 cloud_acca='no_cloud',
                                                 cloud_fmask='no_cloud',
                                                 contiguous=True)

            # Compute good data for each observation as a percentage of total array pixels
            data_perc = good_quality.sum(dim=['x', 'y']) / (good_quality.shape[1] * good_quality.shape[2])
            
            # Add data_perc data to Landsat dataset as a new xarray variable
            data['data_perc'] = xr.DataArray(data_perc, [('time', data.time)])

            # Filter and finally import data using dask
            filtered = data.where(data.data_perc >= masked_prop, drop=True)
            print('    Loading {} filtered {} timesteps'.format(len(filtered.time), sensor))
            filtered = filtered.compute()
            
            # Optionally apply mask (instead of only filtering)
            if apply_mask:
                filtered = filtered.where(good_quality)
            
            # Append result to list
            filtered_sensors.append(filtered)
            
            # Close datasets
            filtered = None
            good_quality = None
            data = None
            pq = None            
                        
        except:
            
            # If there is no data for sensor or if another error occurs:
            print('    Skipping {}'.format(sensor))

    # Concatenate all sensors into one big xarray dataset, and then sort by time
    print('Combining and sorting ls5, ls7 and ls8 data')
    combined_ds = xr.concat(filtered_sensors, dim='time')
    combined_ds = combined_ds.sortby('time')
                                                               
    #Filter to replace no data values with nans
    combined_ds = masking.mask_invalid_data(combined_ds)

    # Return combined dataset
    return combined_ds
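
The filtering step above uses `.where(..., drop=True)`, whereas the later version of this function uses `.sel(time=...)`; both select the same timesteps, but `.where` coerces integer bands to float. A toy comparison with made-up values:

import numpy as np
import xarray as xr

times = np.array(['2018-01-01', '2018-02-01', '2018-03-01'],
                 dtype='datetime64[ns]')
ds = xr.Dataset({'red': ('time', [10, 20, 30]),
                 'data_perc': ('time', [0.5, 0.95, 1.0])},
                coords={'time': times})

kept_where = ds.where(ds.data_perc >= 0.9, drop=True)  # red becomes float
kept_sel = ds.sel(time=ds.data_perc >= 0.9)            # red keeps integer dtype

print(kept_where.red.dtype, kept_sel.red.dtype)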
Example #21
0
File: ccdc.py Project: klh5/CCDC
def runOnSubset(num_bands, args):
    """If the user chooses to run the algorithm on a random subsample of the data, this function is called.
       It divides the number of subsamples to be taken by the number of cells/keys for a product or products.
       It then runs CCDC on the appropriate number of pixels per cell to get an even spread of samples."""

    global rows

    # Calculate the right number of columns to be returned from the data cube
    input_num_cols = num_bands + 1

    tile_size = 3500  # Size of real tiles - too big to fit in memory
    new_tile_size = 875  # New size - this will divide each tile into 16

    dc = datacube.Datacube()

    # Create Gridworkflow object for most recent dataset
    gw = GridWorkflow(dc.index, product=args.input_products[-1])

    # Get list of cell keys for most recent dataset
    keys = list(gw.list_cells(product=args.input_products[-1]).keys())

    dc.close()

    num_keys = len(keys)

    # Calculate number of pixels to use from each cell
    num_subs = (
        (tile_size / new_tile_size) *
        (tile_size / new_tile_size)) * num_keys  # Get number of sub-tiles
    samples_per_cell = np.ceil(args.num_samples / num_subs).astype(
        int)  # Get number of samples to be taken per sub-tile

    # Load data for each cell
    for key in keys:

        # Each tile needs to be divided into mini-tiles
        for x in range(0, tile_size, new_tile_size):  # Division in x dimension
            for y in range(0, tile_size,
                           new_tile_size):  # Division in y dimension

                min_x = x
                max_x = x + new_tile_size
                min_y = y
                max_y = y + new_tile_size

                ccdc_args = []
                input_ds = []
                tmask_ds = []
                cloud_ds = []

                input_ds = loadByTile(args.input_products, key, min_y, max_y,
                                      min_x, max_x, args.bands)

                if (input_ds):

                    if (args.tmask_products):

                        tmask_ds = loadByTile(args.tmask_products, key, min_y,
                                              max_y, min_x, max_x,
                                              ['green', 'nir', 'swir1'])

                    if (args.cloud_products):

                        cloud_ds = loadByTile(args.cloud_products, key, min_y,
                                              max_y, min_x, max_x,
                                              ['cloud_mask'])

                    # Tidy up input data
                    input_data = xr.concat(input_ds, dim='time')
                    input_data = mask_invalid_data(input_data)

                    if (cloud_ds):

                        cloud_masks = xr.concat(cloud_ds, dim='time')

                    # Do the same for TOA data if present - tmask_ds will be empty if no TOA data sets were specified
                    if (tmask_ds):

                        tmask_data = xr.concat(tmask_ds, dim='time')
                        tmask_data = mask_invalid_data(tmask_data)

                    # We want to process a random subset of pixels
                    for i in range(samples_per_cell):

                        random_x = np.random.randint(0, new_tile_size)
                        random_y = np.random.randint(0, new_tile_size)

                        input_ts = input_data.isel(
                            x=random_x, y=random_y)  # Get just one pixel

                        x_val = float(input_ts.x)
                        y_val = float(input_ts.y)

                        input_ts = transformToArray(
                            input_ts
                        )  # Transform the time series into a numpy array

                        if (input_ts.shape[0] > 0
                                and input_ts.shape[1] == input_num_cols):

                            if (cloud_ds):

                                cloud_ts = cloud_masks.isel(
                                    x=random_x, y=random_y
                                )  # Get cloud mask values through time for this pixel

                                cloud_ts = transformToArray(cloud_ts)
                                cloud_ts = cloud_ts[np.isin(
                                    cloud_ts[:, 0], input_ts[:, 0]
                                )]  # Remove any rows which aren't in the SREF data
                                input_ts = input_ts[
                                    cloud_ts[:, 1] ==
                                    0]  # Do masking (0 value is clear)

                            if (tmask_ds):

                                tmask_ts = tmask_data.isel(x=random_x,
                                                           y=random_y)

                                tmask_ts = transformToArray(tmask_ts)
                                tmask_ts = tmask_ts[np.isin(
                                    tmask_ts[:, 0], input_ts[:, 0]
                                )]  # Remove any rows which aren't in the SREF data
                                input_ts = doTmask(
                                    input_ts, tmask_ts
                                )  # Use Tmask to further screen the input data

                            argslist = (input_ts, num_bands, x_val, y_val,
                                        args)
                            ccdc_args.append(argslist)

        # All samples for this cell have been queued; do some tidying up before running the pool
        del input_data

        if (cloud_ds):
            del cloud_ds
            del cloud_masks

        if (tmask_ds):
            del tmask_ds
            del tmask_data

        # Run processes
        with Pool(processes=args.num_procs) as pool:
            pool.starmap(runCCDC, ccdc_args)

        # Generate output file name
        output_file = os.path.join(
            args.outdir,
            "{}_{}_{}_{}_{}_{}.csv".format(args.output_file, key, min_y, max_y,
                                           min_x, max_x))

        # Write headers to file
        headers = [
            "x", "y", "band", "start_date", "end_date", "start_val", "end_val",
            "coeffs", "RMSE", "intercept", "alpha", "change_date", "magnitude"
        ]

        with open(output_file, 'w') as output:
            writer = csv.writer(output)
            writer.writerow(headers)
            writer.writerows(rows)

        # Reset shared list
        rows = []
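
The sampling arithmetic at the top of `runOnSubset` spreads the requested number of samples evenly across every mini-tile of every cell. A minimal sketch of that calculation with hypothetical numbers (`num_keys` and `num_samples` are made up; the tile sizes match the constants above):

import numpy as np

tile_size = 3500       # full cell width/height in pixels
new_tile_size = 875    # mini-tile width/height, so each cell splits into 4 x 4 mini-tiles
num_keys = 6           # hypothetical number of cells returned by gw.list_cells()
num_samples = 1000     # hypothetical total number of random pixels requested

num_subs = (tile_size / new_tile_size) * (tile_size / new_tile_size) * num_keys  # 96 mini-tiles in total
samples_per_cell = np.ceil(num_samples / num_subs).astype(int)                   # 11 random pixels per mini-tile
print(num_subs, samples_per_cell)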
Example #22
0
def load_clearsentinel(dc, query, sensors=['s2a', 's2b'], bands_of_interest=['red', 'green', 'blue'],
                       product='ard', masked_prop=0.99, mask_values=[0, 2, 3], apply_mask=False, 
                       pixel_quality_band='pixel_quality'):
    
    """
    Loads Sentinel 2 data for multiple sensors (i.e. s2a, s2b), and returns a single xarray dataset containing 
    only observations that contain greater than a given proportion of clear pixels.    
  
    This can be used to extract visually appealing time series of observations that are not affected by cloud, 
    for example as an input to the `animated_timeseries` function from `DEAPlotting`.
    
    The proportion of clear pixels is calculated by identifying pixels flagged as problematic in the Sentinel 
    pixel quality array and counting the remainder. By default, pixels flagged as nodata, cloud or shadow are 
    treated as unclear, but this can be customised using the `mask_values` parameter.
    
    Last modified: August 2018
    Author: Robbi Bishop-Taylor
    
    :param dc: 
        A specific Datacube to import from, i.e. `dc = datacube.Datacube(app='Sentinel datacube')`. This allows you 
        to also use development datacubes if they have been imported into the environment.
    
    :param query: 
        A dict containing the query bounds. Can include lat/lon, time etc. If no `time` query is given, the 
        function defaults to all timesteps available to all sensors (e.g. 2015 onward)

    :param sensors:
        An optional list of Sentinel 2 sensors to load data for. Options are 's2a' and 's2b'; defaults to both.

    :param product:
        An optional string specifying the product to load. Defaults to 'ard', which is equivalent to loading
        e.g. `s2a_ard_granule`. 
        
    :param bands_of_interest:
        An optional list of strings containing the bands to be read in; options can include 'red', 'green', 'blue', 
        'nir1', etc, but these may vary depending on the database. Defaults to `['red', 'green', 'blue']`.

    :param masked_prop:
        An optional float giving the minimum proportion of clear pixels required for a Sentinel 2 observation to be 
        loaded. Defaults to 0.99 (i.e. only return observations with less than 1% of unclear pixels).  
    
    :param mask_values:
        An optional list of pixel quality values to treat as invalid or unclear observations in the above `masked_prop`
        calculation. The default is `[0, 2, 3]` which treats nodata, cloud and cloud shadow as unclear observations. 
        Choose from: `{'0': 'nodata', '1': 'valid', '2': 'cloud', '3': 'shadow', '4': 'snow', '5': 'water'}`.
      
    :param apply_mask:
        An optional boolean indicating whether resulting observations should have the pixel_quality mask applied to 
        mask out any remaining unclear cells. For example, if `masked_prop=0.99`, the filtered images may still 
        contain up to 1% unclear/cloudy pixels. The default of False simply returns the resulting observations 
        without masking out these pixels; True removes them using the mask. 
    
    :param pixel_quality_band:
        An optional string giving the name of the pixel quality band contained in the Sentinel 2 dataset. The default
        value is 'pixel_quality', however the same band may also be referred to as 'fmask' in some databases.
    
    :returns:
        An xarray dataset containing only Sentinel 2 observations that contain greater than `masked_prop`
        proportion of clear pixels.  
        
    :example:
    
    >>> # Import modules
    >>> import datacube
    >>> import sys
    >>> 
    >>> # Import dea-notebooks functions using relative link to Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling
    >>> 
    >>> # Connect to a datacube containing Sentinel data
    >>> s2dc = datacube.Datacube(config='/g/data/r78/dc_configs/sentinel2.conf')
    >>> 
    >>> # Set up spatial and temporal query; note that 'output_crs' and 'resolution' need to be set 
    >>> query = {'x': (-191400.0, -183400.0),
    >>>          'y': (-1423460.0, -1415460.0),
    >>>          'time': ('2017-01-01', '2018-01-01'),
    >>>          'crs': 'EPSG:3577',
    >>>          'output_crs': 'EPSG:3577',
    >>>          'resolution': (10, 10)}                
    >>> 
    >>> # Load in red, green, blue and NIR1 bands for Sentinel observations with < 1% unclear values. 
    >>> # Here we use apply_mask=True to mask out any remaining unclear pixels with NaN.
    >>> sentinel_ds = DEADataHandling.load_clearsentinel(dc=s2dc, query=query, 
    >>>                                                  bands_of_interest=['red', 'green', 'blue', 'nir1'],
    >>>                                                  masked_prop=0.99, apply_mask=True)         
      
    """
    

    # List to save results from each sensor
    filtered_sensors = []

    # Iterate through all sensors, returning only observations with >= masked_prop clear pixels
    for sensor in sensors:
        
        # If bands of interest are given, assign measurements in dc.load call. This is
        # for compatibility with the existing dea-notebooks load_nbarx function.
        if bands_of_interest:

            # Lazily load Sentinel 2 data using dask
            data = dc.load(product='{}_{}_granule'.format(sensor, product), 
                           measurements=bands_of_interest,
                           group_by='solar_day', 
                           dask_chunks={'time': 1},
                           **query )

        # If no bands of interest given, run without specifying measurements, and 
        # therefore return all available bands
        else:

            # Lazily load Sentinel 2 data using dask
            data = dc.load(product='{}_{}_granule'.format(sensor, product),
                           group_by='solar_day', 
                           dask_chunks={'time': 1},
                           **query )              
        
        # Load PQ data
        pq = dc.load(product='{}_{}_granule'.format(sensor, product),
                     measurements=[pixel_quality_band],
                     group_by='solar_day',
                     dask_chunks={'time': 1},
                     **query)

        # Load PQ data using dask
        print('Loading {} PQ'.format(sensor))
        pq = pq.compute()
        
        # Identify pixels with valid data
        good_quality = np.isin(pq[pixel_quality_band], test_elements=mask_values, invert=True)
        good_quality = pq[pixel_quality_band].where(good_quality).notnull()

        # Compute good data for each observation as a percentage of total array pixels
        data_perc = good_quality.sum(dim=['x', 'y']) / (good_quality.shape[1] * good_quality.shape[2])

        # Add data_perc data to Sentinel dataset as a new xarray variable
        data['data_perc'] = xr.DataArray(data_perc, [('time', data.time)])

        # Filter and finally import data using dask
        filtered = data.where(data.data_perc >= masked_prop, drop=True)
        print('    Loading {} filtered {} timesteps'.format(len(filtered.time), sensor))
        filtered = filtered.compute()

        # Optionally apply mask (instead of only filtering)
        if apply_mask:
            filtered = filtered.where(good_quality)

        # Append result to list
        filtered_sensors.append(filtered)

        # Close datasets
        filtered = None
        good_quality = None
        data = None
        pq = None
                        

    # Concatenate all sensors into one big xarray dataset, and then sort by time
    print('Combining and sorting s2a and s2b data')
    combined_ds = xr.concat(filtered_sensors, dim='time')
    combined_ds = combined_ds.sortby('time')
                                                               
    # Filter to replace no-data values with NaNs
    combined_ds = masking.mask_invalid_data(combined_ds)

    # Return combined dataset
    return combined_ds
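
The heart of the filtering in `load_clearsentinel` (and in the Landsat equivalent earlier in this collection) is the per-timestep clear-pixel proportion. A minimal sketch with a made-up 2 x 2 pixel-quality stack, assuming the same fmask-style coding described in the docstring (0 nodata, 1 valid, 2 cloud, 3 shadow):

import numpy as np
import xarray as xr

# Two timesteps of a tiny pixel-quality band: 1 = valid, 2 = cloud, 3 = shadow
pq = xr.DataArray(np.array([[[1, 1], [1, 2]],
                            [[3, 2], [2, 2]]]), dims=('time', 'y', 'x'))

mask_values = [0, 2, 3]                                                   # values treated as unclear
good_quality = xr.DataArray(np.isin(pq, mask_values, invert=True), dims=pq.dims)

data_perc = good_quality.sum(dim=['x', 'y']) / (pq.shape[1] * pq.shape[2])
print(data_perc.values)   # [0.75 0.  ] -> only the first timestep would pass masked_prop=0.5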
Example #23
0
File: ccdc.py Project: klh5/CCDC
def runOnArea(num_bands, args):
    """If the user chooses to run the algorithm on the whole of the specified area, this function is called.
    This function will load the whole area specified by the user, and run the algorithm on each pixel."""

    # Refers to Manager list object
    global rows

    # Calculate the right number of columns to be returned from the data cube
    input_num_cols = num_bands + 1

    ccdc_args = []
    input_ds = []
    tmask_ds = []
    cloud_ds = []

    input_ds = loadArea(args.input_products, args.bands, args.lowerlat,
                        args.upperlat, args.lowerlon, args.upperlon)

    if (len(input_ds) == len(args.input_products)):

        if (args.tmask_products):

            tmask_ds = loadArea(args.tmask_products, ['green', 'nir', 'swir1'],
                                args.lowerlat, args.upperlat, args.lowerlon,
                                args.upperlon)

        if (args.cloud_products):

            cloud_ds = loadArea(args.cloud_products, ['cloud_mask'],
                                args.lowerlat, args.upperlat, args.lowerlon,
                                args.upperlon)

        # Tidy up input data
        input_data = xr.concat(input_ds, dim='time')
        input_data = mask_invalid_data(input_data)

        if (cloud_ds):
            cloud_masks = xr.concat(cloud_ds, dim='time')

        if (tmask_ds):
            tmask_data = xr.concat(tmask_ds, dim='time')
            tmask_data = mask_invalid_data(tmask_data)

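        # Iterate over every pixel in the loaded area and queue one CCDC job per usable time series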
        for i in range(len(input_data.x)):
            for j in range(len(input_data.y)):

                input_ts = input_data.isel(x=i, y=j)

                x_val = float(input_ts.x)
                y_val = float(input_ts.y)

                input_ts = transformToArray(input_ts)

                if (input_ts.shape[0] > 0
                        and input_ts.shape[1] == input_num_cols):

                    if (cloud_ds):
                        cloud_ts = cloud_masks.isel(
                            x=i, y=j
                        )  # Get cloud mask values through time for this pixel

                        cloud_ts = transformToArray(cloud_ts)
                        cloud_ts = cloud_ts[np.isin(
                            cloud_ts[:, 0], input_ts[:, 0]
                        )]  # Remove any rows which aren't in the SREF data
                        input_ts = input_ts[cloud_ts[:, 1] ==
                                            0]  # Do masking (0 value is clear)

                    if (tmask_ds):
                        tmask_ts = tmask_data.isel(x=i, y=j)

                        tmask_ts = transformToArray(tmask_ts)
                        tmask_ts = tmask_ts[np.isin(
                            tmask_ts[:, 0], input_ts[:, 0]
                        )]  # Remove any rows which aren't in the SREF data
                        input_ts = doTmask(
                            input_ts, tmask_ts
                        )  # Use Tmask to further screen the input data

                    argslist = (input_ts, num_bands, x_val, y_val, args)
                    ccdc_args.append(argslist)

    # Do some tidying up
    del input_data

    if (cloud_ds):
        del cloud_ds
        del cloud_masks

    if (tmask_ds):
        del tmask_ds
        del tmask_data

    # Run processes
    with Pool(processes=args.num_procs) as pool:
        pool.starmap(runCCDC, ccdc_args)

    # Generate output file name
    output_file = os.path.join(args.outdir, "{}.csv".format(args.output_file))

    # Write headers to file
    headers = [
        "x", "y", "band", "start_date", "end_date", "start_val", "end_val",
        "coeffs", "RMSE", "intercept", "alpha", "change_date", "magnitude"
    ]

    with open(output_file, 'w') as output:
        writer = csv.writer(output)
        writer.writerow(headers)
        writer.writerows(rows)
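
Both `runOnArea` and `runOnSubset` pull out one pixel at a time with `isel` before handing it to CCDC. A minimal standalone sketch of that per-pixel extraction (the `transformToArray` helper is not shown in this collection, so converting to a DataFrame and dropping NaNs below is only an assumption about roughly what it does):

import numpy as np
import xarray as xr

# Hypothetical single-band dataset after mask_invalid_data(): 3 timesteps, 2 x 2 pixels
times = np.array(['2020-01-01', '2020-02-01', '2020-03-01'], dtype='datetime64[ns]')
red = np.array([[[0.1, np.nan], [0.2, 0.3]],
                [[np.nan, 0.4], [0.5, 0.6]],
                [[0.7, 0.8], [0.9, 1.0]]])
data = xr.Dataset({'red': (('time', 'y', 'x'), red)},
                  coords={'time': times, 'y': [0, 1], 'x': [0, 1]})

pixel_ts = data.isel(x=0, y=0)                # single-pixel time series, as in runOnArea
pixel_df = pixel_ts.to_dataframe().dropna()   # drop observations masked out as NaN
print(pixel_df['red'])                        # two valid observations remain for this pixel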