def __init__(self, start: Union[str, datetime], freq: Optional[str] = None):
    """
    DateTimeRange('2019-03--P3M')
    DateTimeRange('2019-03', '3M')
    DateTimeRange(datetime(2019, 3, 1), '3M')
    """
    if freq is None:
        assert isinstance(start, str)
        start, freq = split_and_check(start, '--P', 2)

    freq = freq.upper().lstrip('P')

    # Pandas period snaps to frequency resolution; we need to undo
    # that by re-adding the snapping delta
    t0 = pd.Timestamp(start)
    period = pd.Period(t0, freq=freq)
    dt = t0 - period.start_time

    self.freq: str = freq
    self.start: datetime = normalise_dt(t0.to_pydatetime(warn=False))
    self.end: datetime = normalise_dt(
        (period.end_time + dt).to_pydatetime(warn=False))
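# Usage sketch (illustrative, based on the docstring above): the '--P'
# separator splits the start timestamp from a Pandas-style frequency
# string, so all three constructor forms produce the same range.
#
#     r = DateTimeRange('2019-03--P3M')   # March through May 2019
#     r.start   # -> datetime(2019, 3, 1, 0, 0)
#     r.end     # -> last microsecond of the 3-month window from r.start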
def test_normalise_dt():
    dt_notz = datetime(2020, 2, 14, 10, 33, 11, tzinfo=None)
    assert normalise_dt(dt_notz) is dt_notz
    assert normalise_dt("2020-01-20") == datetime(2020, 1, 20)
    assert normalise_dt('2020-03-26T10:15:32.556793+01:00').tzinfo is None
    assert normalise_dt('2020-03-26T10:15:32.556793+01:00') == datetime(
        2020, 3, 26, 9, 15, 32, 556793)
    assert normalise_dt('2020-03-26T10:15:32.556793+09:00') == datetime(
        2020, 3, 26, 1, 15, 32, 556793)
def format_datetime(dt: datetime, with_tz=True, timespec='microseconds') -> str:
    dt = normalise_dt(dt)
    dt = dt.isoformat(timespec=timespec)
    if with_tz:
        dt = dt + 'Z'
    return dt
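# Usage sketch: the output is always timezone-naive UTC, with a trailing
# 'Z' appended when with_tz=True.
#
#     format_datetime(datetime(2020, 1, 20, 11, 23, 45))
#     # -> '2020-01-20T11:23:45.000000Z'
#     format_datetime(datetime(2020, 1, 20, 11, 23, 45), timespec='seconds')
#     # -> '2020-01-20T11:23:45Z'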
def group_by_nothing(dss: List[Dataset],
                     solar_day_offset: Optional[timedelta] = None) -> xr.DataArray:
    """
    Construct "sources" just like ``Datacube.group_datasets``, but with
    every time slice containing exactly one Dataset object wrapped in a
    tuple.

    Time -> (Dataset,)
    """
    dss = sorted(dss, key=lambda ds: (normalise_dt(ds.center_time), ds.id))
    time = [normalise_dt(ds.center_time) for ds in dss]
    solar_day = None

    if solar_day_offset is not None:
        solar_day = np.asarray([(dt + solar_day_offset).date() for dt in time],
                               dtype="datetime64[D]")

    idx = np.arange(0, len(dss), dtype="uint32")
    uuids = np.empty(len(dss), dtype="O")
    data = np.empty(len(dss), dtype="O")

    grid2crs: Dict[int, Any] = {}
    grid = list(key2num((ds.crs for ds in dss), grid2crs))

    for i, ds in enumerate(dss):
        data[i] = (ds,)
        uuids[i] = ds.id

    coords = [np.asarray(time, dtype="datetime64[ms]"), idx, uuids, grid]
    names = ["time", "idx", "uuid", "grid"]

    if solar_day is not None:
        coords.append(solar_day)
        names.append("solar_day")

    coord = pd.MultiIndex.from_arrays(coords, names=names)

    return xr.DataArray(data=data,
                        coords=dict(spec=coord),
                        attrs={"grid2crs": grid2crs},
                        dims=("spec",))
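# Usage sketch (assumes `dss` is a list of datacube.model.Dataset
# objects, e.g. from dc.find_datasets): every element of the result
# wraps a single dataset in a 1-tuple, so code written against grouped
# "sources" arrays keeps working unchanged.
#
#     sources = group_by_nothing(dss, solar_day_offset=timedelta(hours=10))
#     first_ds, = sources.values[0]   # unwrap the 1-tuple
#     sources.uuid.values             # dataset ids, aligned with time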
def compress_ds(ds: Dataset) -> CompressedDataset:
    dt = normalise_dt(ds.center_time)
    return CompressedDataset(ds.id, dt)
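# A minimal sketch of the assumed CompressedDataset container: the
# function above only stores an (id, time) pair, so something like the
# following NamedTuple would suffice (the real definition may differ):
#
#     class CompressedDataset(NamedTuple):
#         id: UUID
#         time: datetime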
def norm_axis_value(x):
    if isinstance(x, datetime.datetime):
        # For datetime we convert to UTC, then strip timezone info
        # to avoid numpy/pandas warning about timezones
        return numpy.datetime64(normalise_dt(x), 'ns')
    return x
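# Usage sketch: timezone-aware datetimes come back as naive UTC
# numpy.datetime64 values; non-datetime inputs pass through unchanged.
#
#     norm_axis_value(datetime.datetime(2020, 3, 26, 11, 15,
#                                       tzinfo=datetime.timezone.utc))
#     # -> numpy.datetime64('2020-03-26T11:15:00.000000000')
#     norm_axis_value(42)   # -> 42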
def load_ard(dc,
             products=None,
             min_gooddata=0.0,
             fmask_categories=['valid', 'snow', 'water'],
             mask_pixel_quality=True,
             mask_contiguity=False,
             ls7_slc_off=True,
             predicate=None,
             dtype='auto',
             **kwargs):
    """
    Loads and combines Landsat Collection 3 or Sentinel 2 Definitive
    and Near Real Time data for multiple sensors (i.e. ls5t, ls7e and
    ls8c for Landsat; s2a and s2b for Sentinel 2), optionally applies
    pixel quality and contiguity masks, and drops time steps that
    contain less than a minimum proportion of good quality (e.g.
    non-cloudy or shadowed) pixels.

    The function supports loading the following DEA products:

        ga_ls5t_ard_3
        ga_ls7e_ard_3
        ga_ls8c_ard_3
        s2a_ard_granule
        s2b_ard_granule
        s2a_nrt_granule
        s2b_nrt_granule

    Last modified: June 2020

    Parameters
    ----------
    dc : datacube Datacube object
        The Datacube to connect to, i.e. `dc = datacube.Datacube()`.
        This allows you to also use development datacubes if required.
    products : list
        A list of product names to load data from. Valid options are
        ['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] for Landsat,
        ['s2a_ard_granule', 's2b_ard_granule'] for Sentinel 2
        Definitive, and ['s2a_nrt_granule', 's2b_nrt_granule'] for
        Sentinel 2 Near Real Time (on the DEA Sandbox only).
    min_gooddata : float, optional
        An optional float giving the minimum percentage of good quality
        pixels required for a satellite observation to be loaded.
        Defaults to 0.0 which will return all observations regardless
        of pixel quality (set to e.g. 0.99 to return only observations
        with more than 99% good quality pixels).
    fmask_categories : list, optional
        An optional list of fmask category names to treat as good
        quality pixels in the above `min_gooddata` calculation, and for
        masking data by pixel quality (if `mask_pixel_quality=True`).
        The default is `['valid', 'snow', 'water']` which will return
        non-cloudy or shadowed land, snow and water pixels. Choose
        from: 'nodata', 'valid', 'cloud', 'shadow', 'snow', and
        'water'.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to mask out poor quality
        pixels using fmask based on the `fmask_categories` provided
        above. The default is True, which will set poor quality pixels
        to NaN if `dtype='auto'` (which will convert the data to
        'float32'), or set poor quality pixels to the data's native
        nodata value if `dtype='native'` (which can be useful for
        reducing memory).
    mask_contiguity : str or bool, optional
        An optional string or boolean indicating whether to mask out
        pixels missing data in any band (i.e. "non-contiguous" values).
        This can be important for generating clean composite datasets.
        The default is False, which will ignore non-contiguous values
        completely. If loading NBART data, set the parameter to:
        `mask_contiguity='nbart_contiguity'`. If loading NBAR data,
        specify `mask_contiguity='nbar_contiguity'` instead.
        Non-contiguous pixels will be set to NaN if `dtype='auto'`, or
        set to the data's native nodata value if `dtype='native'`
        (which can be useful for reducing memory).
    dtype : string, optional
        An optional parameter that controls the data type/dtype that
        layers are coerced to after loading. Valid values: 'native',
        'auto', 'float{16|32|64}'. When 'auto' is used, the data will
        be converted to `float32` if masking is used, otherwise data
        will be returned in the native data type of the data. Be aware
        that if data is loaded in its native dtype, nodata and masked
        pixels will be returned with the data's native nodata value
        (typically -999), not NaN.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to
        True, which keeps all Landsat 7 observations > May 31 2003.
    predicate : function, optional
        An optional function that can be passed in to restrict the
        datasets that are loaded by the function. A predicate function
        should take a `datacube.model.Dataset` object as an input (i.e.
        as returned from `dc.find_datasets`), and return a boolean. For
        example, a predicate function could be used to return True for
        only datasets acquired in January:
        `dataset.time.begin.month == 1`
    **kwargs :
        A set of keyword arguments to `dc.load` that define the
        spatiotemporal query and load parameters used to extract data.
        Keyword arguments can either be listed directly in the
        `load_ard` call like any other parameter (e.g.
        `measurements=['nbart_red']`), or by passing in a query kwarg
        dictionary (e.g. `**query`). Keywords can include
        `measurements`, `x`, `y`, `time`, `resolution`, `resampling`,
        `group_by`, `crs`; see the `dc.load` documentation for all
        possible options:
        https://datacube-core.readthedocs.io/en/latest/dev/api/generate/datacube.Datacube.load.html

    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only satellite observations that
        contain greater than `min_gooddata` proportion of good quality
        pixels.
    """

    #########
    # Setup #
    #########

    # Use 'nbart_contiguity' by default if mask_contiguity is True
    if mask_contiguity is True:
        mask_contiguity = 'nbart_contiguity'

    # We deal with `dask_chunks` separately
    dask_chunks = kwargs.pop('dask_chunks', None)
    requested_measurements = kwargs.pop('measurements', None)

    # Warn user if they combine lazy load with min_gooddata
    if (min_gooddata > 0.0) and dask_chunks is not None:
        warnings.warn("Setting 'min_gooddata' percentage to > 0.0 "
                      "will cause dask arrays to compute when "
                      "loading pixel-quality data to calculate "
                      "'good pixel' percentage. This can "
                      "slow the return of your dataset.")

    # Verify that products were provided, and determine if Sentinel-2
    # or Landsat data is being loaded
    if not products:
        raise ValueError("Please provide a list of product names "
                         "to load data from. Valid options are: \n"
                         "['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] "
                         "for Landsat, ['s2a_ard_granule', "
                         "'s2b_ard_granule'] \nfor Sentinel 2 Definitive, or "
                         "['s2a_nrt_granule', 's2b_nrt_granule'] for "
                         "Sentinel 2 Near Real Time")
    elif all(['ls' in product for product in products]):
        product_type = 'ls'
    elif all(['s2' in product for product in products]):
        product_type = 's2'

    fmask_band = 'fmask'

    measurements = (requested_measurements.copy()
                    if requested_measurements else None)

    if measurements is None:
        # Deal with "load all" case: pick a set of bands common across
        # all products
        measurements = _common_bands(dc, products)

    # If no `measurements` are specified, Landsat ancillary bands are
    # loaded with an 'oa_' prefix, but Sentinel-2 bands are not.
    # As a work-around, we need to rename the default contiguity and
    # fmask bands if loading Landsat data without specifying
    # `measurements`
    if product_type == 'ls':
        mask_contiguity = (f'oa_{mask_contiguity}'
                           if mask_contiguity else False)
        fmask_band = f'oa_{fmask_band}'

    # If `measurements` are specified but do not include fmask or
    # contiguity variables, add these to `measurements`
    if fmask_band not in measurements:
        measurements.append(fmask_band)

    if mask_contiguity and mask_contiguity not in measurements:
        measurements.append(mask_contiguity)

    # Get list of data and mask bands so that we can later exclude
    # mask bands from being masked themselves
    data_bands = [
        band for band in measurements
        if band not in (fmask_band, mask_contiguity)
    ]
    mask_bands = [band for band in measurements if band not in data_bands]

    #################
    # Find datasets #
    #################

    # Pull out query params only to pass to dc.find_datasets
    query = _dc_query_only(**kwargs)

    # Extract datasets for each product using subset of dcload_kwargs
    dataset_list = []

    # Get list of datasets for each product
    print('Finding datasets')
    for product in products:

        # Obtain list of datasets for product
        print(f'    {product} (ignoring SLC-off observations)'
              if not ls7_slc_off and product == 'ga_ls7e_ard_3'
              else f'    {product}')
        datasets = dc.find_datasets(product=product, **query)

        # Remove Landsat 7 SLC-off observations if ls7_slc_off=False
        if not ls7_slc_off and product == 'ga_ls7e_ard_3':
            datasets = [
                i for i in datasets
                if normalise_dt(i.time.begin) < datetime.datetime(2003, 5, 31)
            ]

        # Add any returned datasets to list
        dataset_list.extend(datasets)

    # Raise exception if no datasets are returned
    if len(dataset_list) == 0:
        raise ValueError("No data available for query: ensure that "
                         "the products specified have data for the "
                         "time and location requested")

    # If predicate is specified, use this function to filter the list
    # of datasets prior to load
    if predicate:
        print('Filtering datasets using predicate function')
        dataset_list = [ds for ds in dataset_list if predicate(ds)]

        # Raise exception if filtering removes all datasets
        if len(dataset_list) == 0:
            raise ValueError("No data available after filtering with "
                             "predicate function")

    #############
    # Load data #
    #############

    # Note we always load using dask here so that we can lazy load data
    # before filtering by good data
    ds = dc.load(datasets=dataset_list,
                 measurements=measurements,
                 dask_chunks={} if dask_chunks is None else dask_chunks,
                 **kwargs)

    ####################
    # Filter good data #
    ####################

    # Calculate pixel quality mask
    pq_mask = odc.algo.fmask_to_bool(ds[fmask_band],
                                     categories=fmask_categories)

    # The good data percentage calculation has to load in all `fmask`
    # data, which can be slow.
    # If the user has chosen no filtering by using the default
    # `min_gooddata = 0`, we can skip this step completely to save
    # processing time
    if min_gooddata > 0.0:

        # Compute good data for each observation as % of total pixels
        print('Counting good quality pixels for each time step')
        data_perc = (pq_mask.sum(axis=[1, 2], dtype='int32') /
                     (pq_mask.shape[1] * pq_mask.shape[2]))
        keep = data_perc >= min_gooddata

        # Filter by `min_gooddata` to drop low quality observations
        total_obs = len(ds.time)
        ds = ds.sel(time=keep)
        pq_mask = pq_mask.sel(time=keep)
        print(f'Filtering to {len(ds.time)} out of {total_obs} '
              f'time steps with at least {min_gooddata:.1%} '
              f'good quality pixels')

    ###############
    # Apply masks #
    ###############

    # Create an overall mask to hold both pixel quality and contiguity
    mask = None

    # Add pixel quality mask to overall mask
    if mask_pixel_quality:
        print('Applying pixel quality/cloud mask')
        mask = pq_mask

    # Add contiguity mask to overall mask
    if mask_contiguity:
        print('Applying contiguity mask')
        cont_mask = ds[mask_contiguity] == 1

        # If a pixel quality mask already exists (mask_pixel_quality=True),
        # multiply with cont_mask to perform a logical 'and' operation
        # (keeping only pixels good in both)
        mask = cont_mask if mask is None else mask * cont_mask

    # Split into data/masks bands, as conversion to float and masking
    # should only be applied to data bands
    ds_data = ds[data_bands]
    ds_masks = ds[mask_bands]

    # Mask data if either of the above masks were generated
    if mask is not None:
        ds_data = odc.algo.keep_good_only(ds_data, where=mask)

    # Automatically set dtype to either native or float32 depending
    # on whether masking was requested
    if dtype == 'auto':
        dtype = 'native' if mask is None else 'float32'

    # Set nodata values using odc.algo tools to reduce peak memory
    # use when converting data dtype
    if dtype != 'native':
        ds_data = odc.algo.to_float(ds_data, dtype=dtype)

    # Put data and mask bands back together
    attrs = ds.attrs
    ds = xr.merge([ds_data, ds_masks])
    ds.attrs.update(attrs)

    ###############
    # Return data #
    ###############

    # Drop bands not originally requested by user
    if requested_measurements:
        ds = ds[requested_measurements]

    # If user supplied dask_chunks, return data as a dask array without
    # actually loading it in
    if dask_chunks is not None:
        print(f'Returning {len(ds.time)} time steps as a dask array')
        return ds

    else:
        print(f'Loading {len(ds.time)} time steps')
        return ds.compute()
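# Example query (a sketch; assumes a configured Datacube connected to an
# index containing the DEA products listed in the docstring, and uses
# standard dc.load keywords for the spatiotemporal query):
#
#     import datacube
#     dc = datacube.Datacube(app='load_ard_example')
#     ds = load_ard(dc=dc,
#                   products=['ga_ls8c_ard_3'],
#                   x=(153.3, 153.5),
#                   y=(-27.6, -27.4),
#                   time=('2019-01', '2019-03'),
#                   measurements=['nbart_red', 'nbart_green', 'nbart_blue'],
#                   min_gooddata=0.90,
#                   output_crs='EPSG:3577',
#                   resolution=(-30, 30))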