import numpy as np
from datacube.utils import masking

# NO_DATA, MASKED_CLOUD and MASKED_CLOUD_SHADOW are bit-flag constants
# assumed to be defined at module level.


def fmask_filter_c2(fmask):
    # Start with an empty summary mask, then set bits per category
    mask = np.zeros(fmask.shape, dtype=np.uint8)
    col2_nodata = masking.make_mask(fmask, nodata=True)
    col2_cloud = masking.make_mask(fmask, cloud_or_cirrus="cloud_or_cirrus")
    col2_cloud_shadow = masking.make_mask(fmask, cloud_shadow="cloud_shadow")
    mask[col2_cloud.values] += MASKED_CLOUD
    mask[col2_cloud_shadow.values] += MASKED_CLOUD_SHADOW
    mask[col2_nodata.values] = NO_DATA
    return mask

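# A minimal, hedged sketch of calling fmask_filter_c2. The product name,
# extents and dates below are illustrative assumptions, not values from
# the original code; any USGS Collection 2 product with a
# 'quality_l2_aerosol' band should work.
import datacube

dc = datacube.Datacube(app='fmask_filter_c2_example')
c2 = dc.load(product='usgs_ls8c_level2_2',
             measurements=['quality_l2_aerosol'],
             x=(153.0, 153.1), y=(-27.5, -27.4),
             time=('2020-01-01', '2020-02-01'))
# Each pixel in the result carries the relevant bit-flag constants
summary = fmask_filter_c2(c2.quality_l2_aerosol)
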
from datacube.utils.masking import make_mask


def create_mask(data, flags):
    # An "or" key ORs together the masks for each individual flag;
    # an "and" key (or a plain flag dict) passes all flags to make_mask
    # in a single call, which ANDs them.
    if "or" in flags:
        fs = flags["or"]
        mask = None
        for flag_name, flag_value in fs.items():
            f = {flag_name: flag_value}
            if mask is None:
                mask = make_mask(data, **f)
            else:
                mask |= make_mask(data, **f)
    else:
        fs = flags if "and" not in flags else flags["and"]
        mask = make_mask(data, **fs)
    return mask

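# A small sketch of the `flags` structures create_mask accepts: a plain
# flag dict is passed straight to make_mask (flags ANDed), while an "or"
# key ORs the per-flag masks together. The flag names are standard GA PQ
# flags; `pq.pixelquality` is assumed to be an already-loaded PQ band.
cloud_flags = {
    "or": {"cloud_acca": "no_cloud", "cloud_fmask": "no_cloud"}
}
saturation_flags = {
    "and": {"blue_saturated": False, "green_saturated": False}
}
either_clear = create_mask(pq.pixelquality, cloud_flags)
unsaturated = create_mask(pq.pixelquality, saturation_flags)
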
def to_mask(self, data, pq_data, extra_mask=None):
    date_count = len(data.coords["time"])
    if date_count > 1:
        mdh = self.get_multi_date_handler(date_count)
        if extra_mask is not None:
            extra_mask = mdh.collapse_mask(extra_mask)
        if pq_data is not None:
            pq_data = mdh.collapse_mask(pq_data)
    else:
        if extra_mask is not None:
            extra_mask = extra_mask.squeeze(dim="time", drop=True)
        if pq_data is not None:
            pq_data = pq_data.squeeze(dim="time", drop=True)

    result = extra_mask
    if pq_data is not None:
        for mask in self.masks:
            odc_mask = make_mask(pq_data, **mask.flags)
            mask_data = getattr(odc_mask, self.product.pq_band)
            if mask.invert:
                mask_data = ~mask_data
            if result is None:
                result = mask_data
            else:
                result = result & mask_data
    return result

def single_date_make_mask(data, mask):
    pq_data = getattr(data, mask.band_name)
    if mask.flags:
        odc_mask = make_mask(pq_data, **mask.flags)
    else:
        odc_mask = pq_data == mask.enum
    odc_mask = odc_mask.squeeze(dim="time", drop=True)
    return odc_mask

from datacube.helpers import ga_pq_fuser
from datacube.utils import masking


def create_long_arrays(ldc, udc, lquery, lquery2):
    usgs_names = ['coastal_aerosol', 'blue', 'green', 'red',
                  'nir', 'swir1', 'swir2']
    ls8_temp = ldc.load(product='ls8_nbart_scene', **lquery)
    ls8_bigtemp = ldc.load(product='ls8_nbart_scene', **lquery2)
    ls8_usgs_temp = udc.load(product='ls8_usgs_l2c1',
                             measurements=usgs_names, **lquery)
    ls8_usgs_bigtemp = udc.load(product='ls8_usgs_l2c1',
                                measurements=usgs_names, **lquery2)

    # NOTE: the PQ mask below is derived from the lquery2 load, but is
    # applied to both the lquery and lquery2 datasets
    ls8_pq = ldc.load(product='ls8_pq_scene', fuse_func=ga_pq_fuser,
                      **lquery2)
    good_quality = masking.make_mask(
        ls8_pq.pqa,
        cloud_acca='no_cloud',
        cloud_fmask='no_cloud',
        cloud_shadow_acca='no_cloud_shadow',
        cloud_shadow_fmask='no_cloud_shadow',
        blue_saturated=False,
        green_saturated=False,
        red_saturated=False,
        nir_saturated=False,
        swir1_saturated=False,
        swir2_saturated=False,
        contiguous=True)

    ls8_array = ls8_temp.where(good_quality)
    ls8_bigarray = ls8_bigtemp.where(good_quality)
    ls8_usgs_array = ls8_usgs_temp.where(good_quality)
    ls8_usgs_bigarray = ls8_usgs_bigtemp.where(good_quality)

    # Rename the numbered GA scene bands to their USGS equivalents
    band_names = {'1': 'coastal_aerosol', '2': 'blue', '3': 'green',
                  '4': 'red', '5': 'nir', '6': 'swir1', '7': 'swir2'}
    ls8_array = ls8_array.rename(band_names)
    ls8_bigarray = ls8_bigarray.rename(band_names)

    return ls8_array, ls8_usgs_array, ls8_bigarray, ls8_usgs_bigarray

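# A hedged sketch of calling create_long_arrays with two datacube handles
# (one indexing the GA scenes, one the USGS Collection 1 product) and a
# small/large query pair. The app names, the 'usgs' environment and the
# query extents are illustrative assumptions.
import datacube

ldc = datacube.Datacube(app='ga_scenes')
udc = datacube.Datacube(app='usgs_c1', env='usgs')  # env is an assumption
lquery = {'x': (149.00, 149.05), 'y': (-35.30, -35.25),
          'time': ('2018-01-01', '2018-12-31')}
lquery2 = {'x': (149.0, 149.2), 'y': (-35.4, -35.2),
           'time': ('2018-01-01', '2018-12-31')}
ls8, ls8_usgs, ls8_big, ls8_usgs_big = create_long_arrays(
    ldc, udc, lquery, lquery2)
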
def create_mask(self, data):
    if self.values:
        mask = None
        for v in self.values:
            vmask = data == v
            if mask is None:
                mask = vmask
            else:
                mask |= vmask
    elif self.or_flags:
        mask = None
        for flag_name, flag_value in self.flags.items():
            f = {flag_name: flag_value}
            if mask is None:
                mask = make_mask(data, **f)
            else:
                mask |= make_mask(data, **f)
    else:
        mask = make_mask(data, **self.flags)
    return mask

def make_mask(self, data, mask):
    # Note: this method shadows the module-level `make_mask` (from
    # datacube.utils.masking) that it calls internally. Per-date masks
    # are ORed together across the time dimension.
    odc_mask = None
    for dt in data.coords["time"].values:
        tpqdata = getattr(data.sel(time=dt), mask.band_name)
        if mask.flags:
            dt_mask = make_mask(tpqdata, **mask.flags)
        else:
            dt_mask = tpqdata == mask.enum
        if odc_mask is None:
            odc_mask = dt_mask
        else:
            odc_mask |= dt_mask
    return odc_mask

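# A minimal sketch of driving the make_mask method above. The mask spec
# object is a hypothetical stand-in for the real config object; only the
# attributes the method reads (band_name, flags, enum) are provided.
# `handler` (the instance owning this method) and `pq_data` (an xarray
# Dataset with a time dimension and a 'water' band) are assumed to exist.
from types import SimpleNamespace

mask_spec = SimpleNamespace(band_name='water',
                            flags=dict(cloud=False, cloud_shadow=False),
                            enum=None)
combined = handler.make_mask(pq_data, mask_spec)
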
import click
import xarray as xr
from datacube import Datacube
from datacube.helpers import ga_pq_fuser
from datacube.utils.masking import make_mask


def compute_mosaic(products, measurements, **parsed_expressions):
    with Datacube() as dc:
        acq_range = parsed_expressions['time']
        click.echo("Processing time range {}".format(acq_range))
        datasets = []

        for prodname in products:
            dataset = dc.load(product=prodname,
                              measurements=measurements,
                              group_by='solar_day',
                              **parsed_expressions)
            if len(dataset) == 0:
                continue
            else:
                click.echo("Found {} time slices of {} during {}.".format(
                    len(dataset['time']), prodname, acq_range))

            pq = dc.load(product=prodname.replace('nbar', 'pq'),
                         group_by='solar_day',
                         fuse_func=ga_pq_fuser,
                         **parsed_expressions)
            if len(pq) == 0:
                click.echo('No PQ found, skipping')
                continue

            # Mask out -999 nodata, preserving the CRS attribute that
            # `.where()` drops
            crs = dataset.attrs['crs']
            dataset = dataset.where(dataset != -999)
            dataset.attrs['product'] = prodname
            dataset.attrs['crs'] = crs

            cloud_free = make_mask(pq.pixelquality, ga_good_pixel=True)
            dataset = dataset.where(cloud_free)
            if len(dataset) == 0:
                click.echo("Nothing left after PQ masking")
                continue

            datasets.append(dataset)

        dataset = xr.concat(datasets, dim='time')

        return dataset.median(dim='time')

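# A hedged example of calling compute_mosaic. The product name and query
# values are illustrative; parsed_expressions mirrors what a click-based
# CLI would parse into keyword arguments, and the product name must
# contain 'nbar' so the matching PQ product can be derived from it.
mosaic = compute_mosaic(
    products=['ls8_nbar_albers'],
    measurements=['red', 'green', 'blue'],
    time=('2018-01-01', '2018-03-01'),
    x=(148.0, 148.2), y=(-35.3, -35.1))
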
import datetime
import warnings
from copy import deepcopy

import odc.algo
import pytz
import xarray as xr
from datacube.utils import masking

# `_common_bands` and `_dc_query_only` are helper functions assumed to
# be defined elsewhere in this module.


def load_ard(dc,
             products=None,
             min_gooddata=0.0,
             pq_categories_s2=['vegetation', 'snow or ice', 'water',
                               'bare soils', 'unclassified',
                               'dark area pixels'],
             pq_categories_ls=None,
             mask_pixel_quality=True,
             ls7_slc_off=True,
             predicate=None,
             dtype='auto',
             scaling='raw',
             **kwargs):
    '''
    Loads and combines Landsat Collection 1 or 2 and Sentinel-2 data for
    multiple sensors (i.e. ls5t, ls7e and ls8c for Landsat; s2a and s2b
    for Sentinel-2), optionally applies pixel quality masks, and drops
    time steps that contain less than a minimum proportion of good
    quality (e.g. non-cloudy or non-shadowed) pixels.

    The function supports loading the following DE Africa products:

        ls5_usgs_sr_scene
        ls7_usgs_sr_scene
        ls8_usgs_sr_scene
        usgs_ls8c_level2_2
        ga_ls8c_fractional_cover_2
        s2_l2a

    Last modified: March 2020

    Parameters
    ----------
    dc : datacube Datacube object
        The Datacube to connect to, i.e. `dc = datacube.Datacube()`.
        This allows you to also use development datacubes if required.
    products : list
        A list of product names to load data from. Valid options are
        Landsat C1: ['ls5_usgs_sr_scene', 'ls7_usgs_sr_scene',
        'ls8_usgs_sr_scene'], Landsat C2: ['usgs_ls8c_level2_2'] and
        Sentinel-2: ['s2_l2a'].
    min_gooddata : float, optional
        An optional float giving the minimum percentage of good quality
        pixels required for a satellite observation to be loaded.
        Defaults to 0.0 which will return all observations regardless
        of pixel quality (set to e.g. 0.99 to return only observations
        with more than 99% good quality pixels).
    pq_categories_s2 : list, optional
        An optional list of Sentinel-2 Scene Classification Layer (SCL)
        names to treat as good quality observations in the above
        `min_gooddata` calculation. The default is ['vegetation',
        'snow or ice', 'water', 'bare soils', 'unclassified',
        'dark area pixels'] which will return non-cloudy or
        non-shadowed land, snow, water, veg, and non-veg pixels.
    pq_categories_ls : dict, optional
        An optional dictionary that is used to generate a good quality
        pixel mask from the selected USGS product's pixel quality band
        (i.e. 'pixel_qa' for USGS Collection 1, and 'quality_l2_aerosol'
        for USGS Collection 2). This mask is used for both masking out
        low quality pixels (e.g. cloud or shadow), and for dropping
        observations entirely based on the above `min_gooddata`
        calculation. Default is None, which will apply the following
        mask for USGS Collection 1: `{'cloud': 'no_cloud',
        'cloud_shadow': 'no_cloud_shadow', 'nodata': False}`, and for
        USGS Collection 2: `{'cloud_shadow': 'not_cloud_shadow',
        'cloud_or_cirrus': 'not_cloud_or_cirrus', 'nodata': False}`.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the good data
        mask to all observations that were not filtered out for having
        less good quality pixels than `min_gooddata`. E.g. if
        `min_gooddata=0.99`, the filtered observations may still contain
        up to 1% poor quality pixels. The default of True masks these
        pixels out and sets them to NaN using the good data mask; this
        converts numeric values to floating point, which can cause
        memory issues. Set to False to return the resulting observations
        without masking out these pixels.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to
        True, which keeps all Landsat 7 observations > May 31 2003.
    predicate : function, optional
        An optional function that can be passed in to restrict the
        datasets that are loaded by the function. A filter function
        should take a `datacube.model.Dataset` object as an input (i.e.
        as returned from `dc.find_datasets`), and return a boolean. For
        example, a filter function could be used to return True on only
        datasets acquired in January:
        `dataset.time.begin.month == 1`
    dtype : string, optional
        An optional parameter that controls the data type/dtype that
        layers are coerced to after loading. Valid values: 'native',
        'auto', 'float{16|32|64}'. When 'auto' is used, the data will be
        converted to `float32` if masking is used, otherwise data will
        be returned in the native data type of the data. Be aware that
        if data is loaded in its native dtype, nodata and masked pixels
        will be returned with the data's native nodata value (typically
        -999), not NaN.
    scaling : str, optional
        If 'normalised', then surface reflectance values are scaled from
        their original values to 0-1. If 'raw' then the dataset is
        returned in its native scaling. WARNING: USGS Landsat Collection
        2 surface reflectance values have an offset, so normalised band
        indices will return nonsensical results if scaling='raw' is
        used.
    **kwargs :
        A set of keyword arguments to `dc.load` that define the
        spatiotemporal query used to extract data. This typically
        includes `measurements`, `x`, `y`, `time`, `resolution`,
        `resampling`, `group_by` and `crs`. Keyword arguments can either
        be listed directly in the `load_ard` call like any other
        parameter (e.g. `measurements=['nbart_red']`), or by passing in
        a query kwarg dictionary (e.g. `**query`). For a list of
        possible options, see the `dc.load` documentation:
        https://datacube-core.readthedocs.io/en/latest/dev/api/generate/datacube.Datacube.load.html

    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only satellite observations that
        contain greater than `min_gooddata` proportion of good quality
        pixels.
    '''

    #########
    # Setup #
    #########

    # Prevent function altering original query object
    kwargs = deepcopy(kwargs)

    # We deal with `dask_chunks` separately
    dask_chunks = kwargs.pop('dask_chunks', None)
    requested_measurements = kwargs.pop('measurements', None)

    # Warn user if they combine lazy load with min_gooddata
    if (min_gooddata > 0.0) and dask_chunks is not None:
        warnings.warn("Setting 'min_gooddata' percentage to > 0.0 "
                      "will cause dask arrays to compute when "
                      "loading pixel-quality data to calculate "
                      "'good pixel' percentage. This can "
                      "slow the return of your dataset.")

    # Verify that products were provided and determine if Sentinel-2
    # or Landsat data is being loaded
    if not products:
        raise ValueError('Please provide a list of product names '
                         'to load data from.')
    elif all(['level2' in product for product in products]):
        product_type = 'c2'
    elif all(['sr' in product for product in products]):
        product_type = 'c1'
    elif all(['s2' in product for product in products]):
        product_type = 's2'
    elif all(['fractional_cover' in product for product in products]):
        product_type = 'fc'
    else:
        # Mixed product lists are not supported, as each collection
        # requires different pixel quality handling
        raise ValueError("Mixed products detected: load Landsat C1, "
                         "Landsat C2, Sentinel-2 or Fractional Cover "
                         "products separately")

    # Set the pixel quality band according to collection
    if (product_type == 'c2') or (product_type == 'fc'):
        print('Using pixel quality parameters for USGS Collection 2')
        fmask_band = 'quality_l2_aerosol'
    elif product_type == 'c1':
        print('Using pixel quality parameters for USGS Collection 1')
        fmask_band = 'pixel_qa'
    elif product_type == 's2':
        print('Using pixel quality parameters for Sentinel 2')
        fmask_band = 'SCL'

    measurements = (requested_measurements.copy()
                    if requested_measurements else None)

    # Deal with "load all" case: pick a set of bands common across
    # all products
    if measurements is None:
        if product_type == 'fc':
            measurements = ['pv', 'npv', 'bs', 'ue']
        else:
            measurements = _common_bands(dc, products)

    # If `measurements` are specified but do not include the pixel
    # quality band, add it (not needed for FC)
    if measurements:
        if product_type == 'fc':
            pass
        else:
            if fmask_band not in measurements:
                measurements.append(fmask_band)

    # Get list of data and mask bands so that we can later exclude
    # mask bands from being masked themselves
    if product_type == 'fc':
        pass
    else:
        data_bands = [band for band in measurements
                      if band not in (fmask_band,)]
        mask_bands = [band for band in measurements
                      if band not in data_bands]

    #################
    # Find datasets #
    #################

    # Pull out query params only to pass to dc.find_datasets
    query = _dc_query_only(**kwargs)

    # Extract datasets for each product using subset of dcload_kwargs
    dataset_list = []

    # Get list of datasets for each product
    print('Finding datasets')
    for product in products:

        # Obtain list of datasets for product
        print(f'    {product}')
        datasets = dc.find_datasets(product=product, **query)

        # Remove Landsat 7 SLC-off observations if ls7_slc_off=False
        if not ls7_slc_off and product in ['ls7_usgs_sr_scene',
                                           'usgs_ls7e_level2_2']:
            print('    Ignoring SLC-off observations for ls7')
            datasets = [i for i in datasets
                        if i.time.begin < datetime.datetime(
                            2003, 5, 31, tzinfo=pytz.UTC)]

        # Add any returned datasets to list
        dataset_list.extend(datasets)

    # Raise exception if no datasets are returned
    if len(dataset_list) == 0:
        raise ValueError("No data available for query: ensure that "
                         "the products specified have data for the "
                         "time and location requested")

    # If predicate is specified, use this function to filter the list
    # of datasets prior to load
    if predicate:
        print('Filtering datasets using filter function')
        dataset_list = [ds for ds in dataset_list if predicate(ds)]

    # Raise exception if filtering removes all datasets
    if len(dataset_list) == 0:
        raise ValueError("No data available after filtering with "
                         "filter function")

    # Load fmask from C2 for masking FC, and filter if required.
    # NOTE: This works because only one sensor (ls8) has FC; if/when
    # FC is calculated for LS7, LS5, this section will need to move
    # into the for loop above.
    if product_type == 'fc':

        print('    PQ data from USGS C2')
        dataset_list_fc_pq = dc.find_datasets(product='usgs_ls8c_level2_2',
                                              **query)

        if predicate:
            print('Filtering datasets using filter function')
            dataset_list_fc_pq = [ds for ds in dataset_list_fc_pq
                                  if predicate(ds)]

    #############
    # Load data #
    #############

    # Note we always load using dask here so that we can lazy load data
    # before filtering by good data
    ds = dc.load(datasets=dataset_list,
                 measurements=measurements,
                 dask_chunks={} if dask_chunks is None else dask_chunks,
                 **kwargs)

    if product_type == 'fc':
        ds_fc_pq = dc.load(datasets=dataset_list_fc_pq,
                           dask_chunks={} if dask_chunks is None
                           else dask_chunks,
                           **kwargs)

    ####################
    # Filter good data #
    ####################

    # Need to distinguish between products due to different pq band
    # properties

    # Collection 2 USGS or FC
    if (product_type == 'c2') or (product_type == 'fc'):
        if pq_categories_ls is None:
            quality_flags_prod = {'cloud_shadow': 'not_cloud_shadow',
                                  'cloud_or_cirrus': 'not_cloud_or_cirrus',
                                  'nodata': False}
        else:
            quality_flags_prod = pq_categories_ls

        if product_type == 'fc':
            pq_mask = masking.make_mask(ds_fc_pq[fmask_band],
                                        **quality_flags_prod)
        else:
            pq_mask = masking.make_mask(ds[fmask_band],
                                        **quality_flags_prod)

    # Collection 1 USGS
    if product_type == 'c1':
        if pq_categories_ls is None:
            quality_flags_prod = {'cloud': 'no_cloud',
                                  'cloud_shadow': 'no_cloud_shadow',
                                  'nodata': False}
        else:
            quality_flags_prod = pq_categories_ls

        pq_mask = masking.make_mask(ds[fmask_band],
                                    **quality_flags_prod)

    # Sentinel 2
    if product_type == 's2':
        # Currently broken for mask band values >= 8
        # pq_mask = odc.algo.fmask_to_bool(ds[fmask_band],
        #                                  categories=pq_categories_s2)
        flags_s2 = (dc.list_measurements()
                    .loc[products[0]]
                    .loc[fmask_band]['flags_definition']['qa']['values'])
        pq_mask = ds[fmask_band].isin(
            [int(k) for k, v in flags_s2.items()
             if v in pq_categories_s2])

    # The good data percentage calculation has to load in all `fmask`
    # data, which can be slow. If the user has chosen no filtering
    # by using the default `min_gooddata = 0`, we can skip this step
    # completely to save processing time
    if min_gooddata > 0.0:

        # Compute good data for each observation as % of total pixels
        print('Counting good quality pixels for each time step')
        data_perc = (pq_mask.sum(axis=[1, 2], dtype='int32') /
                     (pq_mask.shape[1] * pq_mask.shape[2]))
        keep = data_perc >= min_gooddata

        # Filter by `min_gooddata` to drop low quality observations
        total_obs = len(ds.time)
        ds = ds.sel(time=keep)
        pq_mask = pq_mask.sel(time=keep)
        print(f'Filtering to {len(ds.time)} out of {total_obs} '
              f'time steps with at least {min_gooddata:.1%} '
              f'good quality pixels')

    ###############
    # Apply masks #
    ###############

    # Generate good quality data mask
    mask = None
    if mask_pixel_quality:
        print('Applying pixel quality/cloud mask')
        mask = pq_mask

    # Split into data/mask bands, as conversion to float and masking
    # should only be applied to data bands
    if product_type == 'fc':
        ds_data = ds
    else:
        ds_data = ds[data_bands]
        ds_masks = ds[mask_bands]

    # Mask data if either of the above masks were generated
    if mask is not None:
        ds_data = odc.algo.keep_good_only(ds_data, where=mask)

    # Automatically set dtype to either native or float32 depending
    # on whether masking was requested
    if dtype == 'auto':
        dtype = 'native' if mask is None else 'float32'

    # Set nodata values using odc.algo tools to reduce peak memory
    # use when converting data dtype
    if dtype != 'native':
        ds_data = odc.algo.to_float(ds_data, dtype=dtype)

    # Put data and mask bands back together
    if product_type == 'fc':
        attrs = ds.attrs
        ds = ds_data
        ds.attrs.update(attrs)
    else:
        attrs = ds.attrs
        ds = xr.merge([ds_data, ds_masks])
        ds.attrs.update(attrs)

    ###############
    # Return data #
    ###############

    # Drop bands not originally requested by user
    if requested_measurements:
        ds = ds[requested_measurements]

    # Scale data 0-1 if requested
    if scaling == 'normalised':

        if product_type == 'c1':
            print("Re-scaling Landsat C1 data")
            not_sr_bands = ['pixel_qa', 'sr_aerosol', 'radsat_qa']
            for band in ds.data_vars:
                if band not in not_sr_bands:
                    ds[band] = ds[band] / 10000

        if product_type == 's2':
            print("Re-scaling Sentinel-2 data")
            not_sr_bands = ['scl', 'qa', 'mask', 'water_vapour',
                            'aerosol_optical_thickness']
            for band in ds.data_vars:
                if band not in not_sr_bands:
                    ds[band] = ds[band] / 10000

        # Collection 2 Landsat raw values aren't useful so rescale;
        # surface temperature and SR need different factors
        if product_type == 'c2':
            print("Re-scaling Landsat C2 data")
            not_sr_bands = ['thermal_radiance', 'upwell_radiance',
                            'atmospheric_transmittance', 'emissivity',
                            'emissivity_stdev', 'cloud_distance',
                            'quality_l2_aerosol',
                            'quality_l2_surface_temperature',
                            'quality_l1_pixel',
                            'quality_l1_radiometric_saturation',
                            'surface_temperature']
            for band in ds.data_vars:
                if band == 'surface_temperature':
                    ds[band] = ds[band] * 0.00341802 + 149.0 - 273.15
                if band not in not_sr_bands:
                    ds[band] = ds[band] * 2.75e-5 - 0.2

    # If user supplied dask_chunks, return data as a dask array without
    # actually loading it in
    if dask_chunks is not None:
        print(f'Returning {len(ds.time)} time steps as a dask array')
        return ds
    else:
        print(f'Loading {len(ds.time)} time steps')
        return ds.compute()

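# A short, hedged usage sketch for load_ard. The datacube connection and
# query values below are illustrative assumptions; any of the supported
# products listed in the docstring can be substituted.
import datacube

dc = datacube.Datacube(app='load_ard_example')
ds = load_ard(dc=dc,
              products=['ls8_usgs_sr_scene'],
              measurements=['red', 'green', 'blue'],
              x=(25.60, 25.65),
              y=(-33.95, -33.90),
              time=('2018-01', '2018-03'),
              min_gooddata=0.90,
              mask_pixel_quality=True)
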
# Method of a WPS-style fractional cover drill class; relies on
# module-level imports of numpy as np, xarray, make_mask
# (datacube.utils.masking) and default_timer (timeit).
def process_data(self, data, parameters):
    wofs_mask_flags = [
        dict(dry=True),
        dict(terrain_or_low_angle=False,
             high_slope=False,
             cloud_shadow=False,
             cloud=False,
             sea=False)
    ]

    water = data.data_vars['water']
    data = data.drop_vars(['water'])
    total = data.count(dim=['x', 'y'])
    total_valid = (data != -1).sum(dim=['x', 'y'])

    # TODO: enable this check, investigate why it fails
    # if total_valid <= 0:
    #     raise ProcessError('query returned no data')

    for m in wofs_mask_flags:
        mask = make_mask(water, **m)
        data = data.where(mask)

    total_invalid = np.isnan(data).sum(dim=['x', 'y'])
    not_pixels = total_valid - (total - total_invalid)

    # Following Robbi's advice, cast the dataset to a dataarray
    maxFC = data.to_array(dim='variable', name='maxFC')

    # Turn the FC array into integers only, as nanargmax doesn't seem to
    # handle floats the way we want it to
    FC_int = maxFC.astype('int16')

    # Get the index of the maximum value along the variable dimension
    # BSPVNPV = np.nanargmax(FC_int, axis=0)
    BSPVNPV = FC_int.argmax(dim='variable')

    FC_mask = np.isfinite(maxFC).all(dim='variable')

    # pylint: disable=no-member,unexpected-keyword-arg
    # Re-mask with nans to remove no-data
    BSPVNPV = BSPVNPV.where(FC_mask)

    FC_dominant = xarray.Dataset({
        'BS': (BSPVNPV == 0).where(FC_mask),
        'PV': (BSPVNPV == 1).where(FC_mask),
        'NPV': (BSPVNPV == 2).where(FC_mask)
    })

    FC_count = FC_dominant.sum(dim=['x', 'y'])

    # Fractional cover pixel count method: get the number of FC pixels,
    # then divide by the total number of pixels per polygon
    new_ds = xarray.Dataset({
        'BS': (FC_count.BS / total_valid)['BS'] * 100,
        'PV': (FC_count.PV / total_valid)['PV'] * 100,
        'NPV': (FC_count.NPV / total_valid)['NPV'] * 100,
        'Unobservable': (not_pixels / total_valid)['BS'] * 100
    })

    print('dask compute')
    dask_time = default_timer()
    new_ds = new_ds.compute()
    print('dask took', default_timer() - dask_time, 'seconds')
    print(new_ds)

    df = new_ds.to_dataframe()
    df = df.drop('spatial_ref', axis=1)
    df.reset_index(inplace=True)
    return df

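# A toy illustration of the dominance trick used in process_data above:
# argmax over the 'variable' dimension picks whichever of BS/PV/NPV is
# largest per pixel. The values here are made up for demonstration.
import numpy as np
import xarray

toy = xarray.Dataset({
    'BS': (('y', 'x'), np.array([[10., 60.]])),
    'PV': (('y', 'x'), np.array([[80., 30.]])),
    'NPV': (('y', 'x'), np.array([[10., 10.]])),
})
arr = toy.to_array(dim='variable')
dominant = arr.argmax(dim='variable')          # -> [[1, 0]]: PV, then BS
valid = np.isfinite(arr).all(dim='variable')   # mask out any-NaN pixels
dominant = dominant.where(valid)
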
from datacube.helpers import ga_pq_fuser
from datacube.utils import masking


def load_nbarx(dc, sensor, query, product='nbart', bands_of_interest='',
               filter_pq=True):
    """
    Loads NBAR (Nadir BRDF Adjusted Reflectance) or NBAR-T (terrain
    corrected NBAR) data for a sensor, masks using pixel quality (PQ),
    then optionally filters out terrain -999s (for NBAR-T). Returns an
    xarray dataset and CRS and Affine objects defining map projection
    and geotransform.

    Last modified: May 2018
    Author: Bex Dunn
    Modified by: Claire Krause, Robbi Bishop-Taylor, Bex Dunn

    inputs
    dc - Handle for the Datacube to import from. This allows you to also
         use dev environments if they have been imported into the
         environment.
    sensor - Options are 'ls5', 'ls7', 'ls8'
    query - A dict containing the query bounds. Can include lat/lon,
            time etc.

    optional
    product - 'nbar' or 'nbart'. Defaults to 'nbart' unless otherwise
              specified.
    bands_of_interest - List of strings containing the bands to be read
                        in; defaults to all bands. Options include
                        'red', 'green', 'blue', 'nir', 'swir1', 'swir2'.
    filter_pq - Boolean. Will filter clouds and saturated pixels using
                PQ unless set to False.

    outputs
    ds - Extracted and optionally PQ-filtered dataset
    crs - CRS object defining dataset coordinate reference system
    affine - Affine object defining dataset affine transformation
    """

    product_name = '{}_{}_albers'.format(sensor, product)
    mask_product = '{}_{}_albers'.format(sensor, 'pq')
    print('Loading {}'.format(product_name))

    # If bands of interest are given, assign measurements in dc.load call
    if bands_of_interest:
        ds = dc.load(product=product_name,
                     measurements=bands_of_interest,
                     group_by='solar_day', **query)
    # If no bands of interest given, run without specifying measurements
    else:
        ds = dc.load(product=product_name,
                     group_by='solar_day', **query)

    # Proceed if the resulting call returns data
    if ds.variables:
        crs = ds.crs
        affine = ds.affine
        print('Loaded {}'.format(product_name))

        # If pixel quality filtering is enabled, extract PQ data to use
        # as a mask
        if filter_pq:
            sensor_pq = dc.load(product=mask_product,
                                fuse_func=ga_pq_fuser,
                                group_by='solar_day', **query)

            # If PQ call returns data, use it to mask input data
            if sensor_pq.variables:
                print('Generating mask {}'.format(mask_product))
                good_quality = masking.make_mask(
                    sensor_pq.pixelquality,
                    cloud_acca='no_cloud',
                    cloud_shadow_acca='no_cloud_shadow',
                    cloud_shadow_fmask='no_cloud_shadow',
                    cloud_fmask='no_cloud',
                    blue_saturated=False,
                    green_saturated=False,
                    red_saturated=False,
                    nir_saturated=False,
                    swir1_saturated=False,
                    swir2_saturated=False,
                    contiguous=True)

                # Apply mask to preserve only good data
                ds = ds.where(good_quality)

            ds.attrs['crs'] = crs
            ds.attrs['affine'] = affine

        # Replace nodata values with nans
        ds = masking.mask_invalid_data(ds)

        return ds, crs, affine

    else:
        print('Failed to load {}'.format(product_name))
        return None, None, None

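# Example call for load_nbarx, assuming a DEA-style datacube with the
# ls8_nbart_albers and ls8_pq_albers products indexed. The query values
# are illustrative.
import datacube

dc = datacube.Datacube(app='load_nbarx_example')
query = {'x': (153.0, 153.1), 'y': (-27.5, -27.4),
         'time': ('2017-01-01', '2017-06-30')}
ds, crs, affine = load_nbarx(dc, 'ls8', query,
                             bands_of_interest=['red', 'green', 'blue'])
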
import warnings

import numpy as np
import xarray as xr
from datacube.helpers import ga_pq_fuser
from datacube.utils import masking


def load_clearlandsat(dc, query, sensors=('ls5', 'ls7', 'ls8'),
                      product='nbart', dask_chunks={'time': 1},
                      lazy_load=False, bands_of_interest=None,
                      masked_prop=0.0, mask_dict=None,
                      mask_pixel_quality=True, mask_invalid_data=True,
                      ls7_slc_off=False, satellite_metadata=False):
    """
    Loads Landsat NBAR, NBART or FC25 and PQ data for multiple sensors
    (i.e. ls5, ls7, ls8) and returns a single xarray dataset containing
    only observations that contain greater than a given proportion of
    good quality pixels.

    This function can be used to extract visually appealing time series
    of observations that are not affected by cloud, for example as an
    input to the `animated_timeseries` function from `DEAPlotting`.

    The proportion of clear pixels is calculated by summing the pixels
    that are marked as being good quality in the Landsat PQ25 layer. By
    default cloud, cloud shadow, saturated pixels and pixels missing
    data for any band are considered poor quality data, but this can be
    customised using the `mask_dict` parameter.

    Last modified: March 2019
    Author: Robbi Bishop-Taylor, Bex Dunn

    Parameters
    ----------
    dc : datacube Datacube object
        A specific Datacube to import from, i.e.
        `dc = datacube.Datacube(app='Clear Landsat')`. This allows you
        to also use development datacubes if they have been imported
        into the environment.
    query : dict
        A dict containing the query bounds. Can include lat/lon, time
        etc. If no `time` query is given, the function defaults to all
        timesteps available to all sensors (e.g. 1987-2018).
    sensors : list, optional
        An optional list of Landsat sensor names to load data for.
        Options are 'ls5', 'ls7', 'ls8'; defaults to all.
    product : str, optional
        An optional string specifying 'nbar', 'nbart' or 'fc'. Defaults
        to 'nbart'. For information on the difference, see the
        '02_DEA_datasets/Introduction_to_Landsat' or
        '02_DEA_datasets/Introduction_to_Fractional_Cover' notebooks
        from DEA-notebooks.
    dask_chunks : dict, optional
        An optional dictionary containing the coords and sizes you wish
        to create dask chunks over. Usually used in combination with
        `lazy_load=True` (see below).
        Example: `dask_chunks = {'x': 500, 'y': 500}`.
    lazy_load : boolean, optional
        Setting this variable to True will delay the computation of the
        function until you explicitly run `ds.compute()`. If used in
        conjunction with `dask.distributed.Client()` it will allow for
        automatic parallel computation.
    bands_of_interest : list, optional
        An optional list of strings containing the bands to be read in;
        options include 'red', 'green', 'blue', 'nir', 'swir1', 'swir2';
        defaults to all available bands if no bands are specified.
    masked_prop : float, optional
        An optional float giving the minimum percentage of good quality
        pixels required for a Landsat observation to be loaded. Defaults
        to 0.0 which will return all observations regardless of pixel
        quality (set to e.g. 0.99 to return only observations with more
        than 99% good quality pixels).
    mask_dict : dict, optional
        An optional dict of arguments to the `masking.make_mask`
        function that can be used to identify poor quality pixels from
        the PQ layer using alternative masking criteria. The default
        value of None masks out pixels flagged as cloud or cloud shadow
        by either the ACCA or Fmask algorithms, any saturated pixels, or
        any pixels that are missing data in any band (equivalent to:
        `mask_dict={'cloud_acca': 'no_cloud',
        'cloud_shadow_acca': 'no_cloud_shadow',
        'cloud_shadow_fmask': 'no_cloud_shadow',
        'cloud_fmask': 'no_cloud', 'blue_saturated': False,
        'green_saturated': False, 'red_saturated': False,
        'nir_saturated': False, 'swir1_saturated': False,
        'swir2_saturated': False, 'contiguous': True}`). See the
        `02_DEA_datasets/Introduction_to_LandsatPQ.ipynb` notebook on
        DEA Notebooks for a list of all possible options.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the pixel
        quality mask to all observations that were not filtered out for
        having less good quality pixels than `masked_prop`. For example,
        if `masked_prop=0.99`, the filtered images may still contain up
        to 1% poor quality pixels. The default of True masks these
        pixels out and sets them to NaN using the pixel quality mask,
        but has the side effect of changing the data type of the output
        arrays from int16 to float32, which can cause memory issues. To
        reduce memory usage, set to False, which simply returns the
        resulting observations without masking.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata
        values should be replaced with NaN. Defaults to True; this has
        the side effect of changing the data type of the output arrays
        from int16 to float32 which can cause memory issues. To reduce
        memory usage, set to False.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to
        False, which removes all Landsat 7 observations after May 31
        2003.
    satellite_metadata : bool, optional
        An optional boolean indicating whether to return the dataset
        with a `satellite` variable that gives the name of the satellite
        that made each observation in the timeseries (i.e. ls5, ls7,
        ls8). Defaults to False.

    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only Landsat observations that
        contain greater than `masked_prop` proportion of clear pixels.

    Notes
    -----
    Memory issues: For large data extractions, it is recommended that
    you set both `mask_pixel_quality=False` and
    `mask_invalid_data=False`. Otherwise, all output variables will be
    coerced to float32 when NaN values are inserted into the array,
    potentially causing your data to use 2x as much memory. Be aware
    that the resulting arrays will contain invalid -999 values which
    should be considered in analyses.

    Example
    -------
    >>> # Import modules
    >>> import datacube
    >>> import sys

    >>> # Import dea-notebooks functions using relative link to
    >>> # 10_Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling

    >>> # Connect to a datacube containing Landsat data
    >>> dc = datacube.Datacube(app='load_clearlandsat')

    >>> # Set up spatial and temporal query
    >>> query = {'x': (954163, 972163),
    ...          'y': (-3573891, -3555891),
    ...          'time': ('2011-06-01', '2013-06-01'),
    ...          'crs': 'EPSG:3577'}

    >>> # Load observations with more than 75% good quality pixels from
    >>> # ls5, ls7 and ls8 as a combined dataset
    >>> landsat_ds = DEADataHandling.load_clearlandsat(dc=dc, query=query,
    ...     sensors=['ls5', 'ls7', 'ls8'],
    ...     bands_of_interest=['red', 'green', 'blue'],
    ...     masked_prop=0.75, mask_pixel_quality=True, ls7_slc_off=True)
    Loading ls5
        Loading 4 filtered ls5 timesteps
    Loading ls7
        Loading 29 filtered ls7 timesteps
    Loading ls8
        Loading 3 filtered ls8 timesteps
    Combining and sorting ls5, ls7, ls8 data
        Replacing invalid -999 values with NaN (data will be coerced to float32)

    >>> # Test that function returned data
    >>> len(landsat_ds.time) > 0
    True
    """

    #######################
    # Process each sensor #
    #######################

    # Warn if loading a PQ bitstring product and attempting to mask it
    # (and therefore cast to float)
    if product == 'pq' and (mask_invalid_data or mask_pixel_quality):
        warnings.warn("""You are attempting to load pixel quality product
                      with a mask flag (mask_invalid_data or
                      mask_pixel_quality). Pixel quality is a bitstring
                      (only makes sense as int) and masking casts to
                      float32.""")

    # Dictionary to save results from each sensor
    filtered_sensors = {}

    # Iterate through all sensors, returning only observations with
    # > masked_prop clear pixels
    for sensor in sensors:

        # Load PQ data using dask
        print(f'Loading {sensor}')

        # If bands of interest are given, assign measurements in dc.load
        # call. This is for compatibility with the existing dea-notebooks
        # load_nbarx function.
        if bands_of_interest:
            # Lazily load Landsat data using dask
            data = dc.load(product=f'{sensor}_{product}_albers',
                           measurements=bands_of_interest,
                           group_by='solar_day',
                           dask_chunks=dask_chunks,
                           **query)
        # If no bands of interest given, run without specifying
        # measurements, and therefore return all available bands
        else:
            # Lazily load Landsat data using dask
            data = dc.load(product=f'{sensor}_{product}_albers',
                           group_by='solar_day',
                           dask_chunks=dask_chunks,
                           **query)

        # Load PQ data
        pq = dc.load(product=f'{sensor}_pq_albers',
                     group_by='solar_day',
                     fuse_func=ga_pq_fuser,
                     dask_chunks=dask_chunks,
                     **query)

        # If resulting dataset has data, continue:
        if data.variables:

            # Remove Landsat 7 SLC-off from PQ layer if ls7_slc_off=False
            if not ls7_slc_off and sensor == 'ls7':
                print('    Ignoring SLC-off observations for ls7')
                data = data.sel(time=data.time < np.datetime64('2003-05-30'))

            # If more than 0 timesteps
            if len(data.time) > 0:

                # Return only Landsat observations that have matching
                # PQ data
                time = (data.time - pq.time).time
                data = data.sel(time=time)
                pq = pq.sel(time=time)

                # If a custom dict is provided for mask_dict, use these
                # values to make mask from PQ
                if mask_dict:
                    # Mask PQ using custom values by unpacking mask_dict
                    # **kwarg
                    good_quality = masking.make_mask(pq.pixelquality,
                                                     **mask_dict)
                else:
                    # Identify pixels with no clouds in either ACCA or
                    # Fmask
                    good_quality = masking.make_mask(
                        pq.pixelquality,
                        cloud_acca='no_cloud',
                        cloud_shadow_acca='no_cloud_shadow',
                        cloud_shadow_fmask='no_cloud_shadow',
                        cloud_fmask='no_cloud',
                        blue_saturated=False,
                        green_saturated=False,
                        red_saturated=False,
                        nir_saturated=False,
                        swir1_saturated=False,
                        swir2_saturated=False,
                        contiguous=True)

                # Compute good data for each observation as a percentage
                # of total array pixels. Need to sum over the x and y
                # axes individually so that the function works with
                # lat-lon dimensions, and because it isn't currently
                # possible to pass a list of axes (bug with xarray?)
                data_perc = (good_quality.sum(axis=1).sum(axis=1) /
                             (good_quality.shape[1] *
                              good_quality.shape[2]))

                # Add data_perc data to Landsat dataset as a new xarray
                # variable
                data['data_perc'] = xr.DataArray(data_perc,
                                                 [('time', data.time)])

                # Filter by data_perc to drop low quality observations
                # and finally import data using dask
                filtered = data.sel(time=data.data_perc >= masked_prop)
                print(f'    Loading {len(filtered.time)} filtered '
                      f'{sensor} timesteps')

                # Optionally apply pixel quality mask to all observations
                # that were not dropped in previous step
                if mask_pixel_quality:
                    # First change dtype to float32, then mask out values
                    # using `.where()`. By casting to float32, we prevent
                    # `.where()` from automatically casting to float64,
                    # using 2x the memory. We also need to manually reset
                    # attributes due to a possible bug in recent xarray
                    # versions
                    filtered = (filtered.astype(np.float32)
                                .assign_attrs(crs=filtered.crs))
                    filtered = filtered.where(good_quality)

                # Optionally add satellite name variable
                if satellite_metadata:
                    filtered['satellite'] = xr.DataArray(
                        [sensor] * len(filtered.time),
                        [('time', filtered.time)])

                # Add result to dictionary
                if lazy_load:
                    filtered_sensors[sensor] = filtered
                else:
                    filtered_sensors[sensor] = filtered.compute()

                # Close datasets
                filtered = None
                good_quality = None
                data = None
                pq = None

            else:
                # If there is no data for sensor or if another error
                # occurs:
                print(f'    Skipping {sensor}; no valid data for query')

        else:
            # If there is no data for sensor or if another error occurs:
            print(f'    Skipping {sensor}; no valid data for query')

    ############################
    # Combine multiple sensors #
    ############################

    # Proceed with concatenating only if there is more than 1 sensor
    # processed
    if len(filtered_sensors) > 1:

        # Concatenate all sensors into one big xarray dataset, and then
        # sort by time
        sensor_string = ", ".join(filtered_sensors.keys())
        print(f'Combining and sorting {sensor_string} data')
        combined_ds = xr.concat(filtered_sensors.values(), dim='time')
        combined_ds = combined_ds.sortby('time')

        # Optionally filter to replace no data values with nans
        if mask_invalid_data:
            print('    Replacing invalid -999 values with NaN '
                  '(data will be coerced to float32)')
            # First change dtype to float32, then mask out values using
            # `.where()`. By casting to float32, we prevent `.where()`
            # from automatically casting to float64, using 2x the
            # memory. We also need to manually reset attributes due to a
            # possible bug in recent xarray versions
            combined_ds = (combined_ds.astype(np.float32)
                           .assign_attrs(crs=combined_ds.crs))
            combined_ds = masking.mask_invalid_data(combined_ds)

        # Reset pixel quality attributes
        if product == 'pq':
            combined_ds.pixelquality.attrs.update(
                list(filtered_sensors.values())[0].pixelquality.attrs)

        # Return combined dataset
        return combined_ds

    # Return the single dataset if only one sensor was processed
    elif len(filtered_sensors) == 1:

        sensor_string = ", ".join(filtered_sensors.keys())
        print(f'Returning {sensor_string} data')
        sensor_ds = list(filtered_sensors.values())[0]

        # Optionally filter to replace no data values with nans
        if mask_invalid_data:
            print('    Replacing invalid -999 values with NaN '
                  '(data will be coerced to float32)')
            # First change dtype to float32, then mask out values using
            # `.where()`. By casting to float32, we prevent `.where()`
            # from automatically casting to float64, using 2x the
            # memory. We also need to manually reset attributes due to a
            # possible bug in recent xarray versions
            sensor_ds = (sensor_ds.astype(np.float32)
                         .assign_attrs(crs=sensor_ds.crs))
            sensor_ds = masking.mask_invalid_data(sensor_ds)

        return sensor_ds

    else:
        print(f'No data returned for query for any sensor in '
              f'{", ".join(sensors)} and time range '
              f'{"-".join(query["time"])}')