def compute_confidence_filtered(self):
    """
    Return the WOfS summary band data, masked where the confidence band falls at or below
    the configured threshold (default 10%).
    """
    con_layer = self.compute_confidence()
    env = self.cfg.get_env_of_product('wofs_summary')

    with Datacube(app='wofs_summary', env=env) as dc:
        gwf = GridWorkflow(dc.index, self.grid_spec)
        indexed_tile = gwf.list_cells(self.tile_index, product='wofs_summary')
        # load the data of the tile
        dataset = gwf.load(tile=indexed_tile[self.tile_index], measurements=['frequency'])
        data = dataset.data_vars['frequency'].data.ravel().reshape(self.grid_spec.tile_resolution)

    con_filtering = self.cfg.cfg.get('confidence_filtering')
    threshold = None
    if con_filtering:
        threshold = con_filtering.get('threshold')

    if threshold:
        data[con_layer <= threshold] = DEFAULT_FLOAT_NODATA
    else:
        data[con_layer <= 0.10] = DEFAULT_FLOAT_NODATA

    return data
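# A minimal, self-contained sketch (not part of the class above) of the masking rule used in
# compute_confidence_filtered: pixels whose confidence is at or below the threshold are set to a
# nodata value. The array values and the nodata constant below are illustrative assumptions.
import numpy as np

DEFAULT_FLOAT_NODATA = -999.0  # assumed placeholder; the real constant comes from the package

frequency = np.array([[0.2, 0.8], [0.5, 0.9]])       # hypothetical WOfS frequency tile
confidence = np.array([[0.05, 0.6], [0.12, 0.95]])   # hypothetical confidence band

threshold = 0.10  # default used when no 'confidence_filtering' threshold is configured
frequency[confidence <= threshold] = DEFAULT_FLOAT_NODATA
print(frequency)  # -> [[-999.     0.8 ] [   0.5    0.9 ]]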
def get_dataset_tiles(self, product, product_type=None, platform=None, time=None, longitude=None,
                      latitude=None, measurements=None, output_crs=None, resolution=None, **kwargs):
    """
    Gets and returns data based on lat/long bounding box inputs.
    All params are optional. Leaving one out will just query the dc without it (e.g. leaving out
    lat/lng but giving product returns a dataset containing the entire product).

    Args:
        product (string): The name of the product associated with the desired dataset.
        product_type (string): The type of product associated with the desired dataset.
        platform (string): The platform associated with the desired dataset.
        time (tuple): A tuple consisting of the start time and end time for the dataset.
        longitude (tuple): A tuple of floats specifying the min,max longitude bounds.
        latitude (tuple): A tuple of floats specifying the min,max latitude bounds.
        measurements (list): A list of strings that represents all measurements.
        output_crs (string): Determines reprojection of the data before it is returned.
        resolution (tuple): A tuple of min,max ints to determine the resolution of the data.

    Returns:
        data (xarray): dataset with the desired data in tiled sections.
    """
    # there is probably a better way to do this but I'm not aware of it.
    query = {}
    if product_type is not None:
        query['product_type'] = product_type
    if platform is not None:
        query['platform'] = platform
    if time is not None:
        query['time'] = time
    if longitude is not None and latitude is not None:
        query['longitude'] = longitude
        query['latitude'] = latitude

    # set up the grid workflow
    gw = GridWorkflow(self.dc.index, product=product)

    # dict of tiles.
    request_tiles = gw.list_cells(product=product, measurements=measurements,
                                  output_crs=output_crs, resolution=resolution, **query)

    # cells now return stacked xarrays of data.
    data_tiles = {}
    for tile_key in request_tiles:
        tile = request_tiles[tile_key]
        data_tiles[tile_key] = gw.load(tile, measurements=measurements)

    return data_tiles
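# Hedged usage sketch for get_dataset_tiles. It assumes `DataAccessApi` is the surrounding class
# (the name is an assumption) and that it holds an open Datacube handle in self.dc; the product,
# bounds and measurements below are illustrative only and require a configured datacube index.
api = DataAccessApi()  # hypothetical constructor for the surrounding class
tiles = api.get_dataset_tiles(product='ls7_nbar_albers',
                              time=('2016-01-01', '2016-12-31'),
                              longitude=(146.0, 147.0),
                              latitude=(-36.0, -35.0),
                              measurements=['red', 'nir'])
for cell_index, dataset in tiles.items():
    print(cell_index, dict(dataset.dims))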
def load_tile_data(self, factors):
    """
    Load and return factor data for confidence band prediction.
    :param factors: List of factor info as given by Config
    """
    model_data = []
    for fac in factors:
        factor = self.cfg.get_factor_info(fac)
        with Datacube(app='confidence_layer', env=factor['env']) as dc:
            gwf = GridWorkflow(dc.index, self.grid_spec)
            indexed_tiles = gwf.list_cells(self.tile_index, product=factor['product'])
            # load the data of the tile
            dataset = gwf.load(tile=indexed_tiles[self.tile_index], measurements=[factor['band']])
            data = dataset.data_vars[factor['band']].data

        # Rescale where needed: keep an eye on this, since it reflects the scaling factors used
        # during training rather than what is stored on the datacube.
        if factor['name'].startswith('phat'):
            data = data * 100.0
            data[data < 0.0] = 0.0
        if factor['name'].startswith('mrvbf'):
            data[data > 10] = 10
        if factor['name'].startswith('modis'):
            data[data > 100] = 100

        model_data.append(data.ravel())
        del data
    return np.column_stack(model_data)
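# Stand-alone numpy sketch of the per-factor rescaling applied in load_tile_data: 'phat*' factors
# are scaled to percent and floored at 0, 'mrvbf*' capped at 10, 'modis*' capped at 100. The
# sample values below are invented purely for illustration.
import numpy as np

phat = np.array([-0.02, 0.35, 0.90]) * 100.0  # probability -> percent, as used during training
phat[phat < 0.0] = 0.0                        # negative probabilities clamped to 0

mrvbf = np.array([3, 7, 12])
mrvbf[mrvbf > 10] = 10

modis = np.array([40, 101, 250])
modis[modis > 100] = 100

print(np.column_stack([phat, mrvbf, modis]))  # columns stacked as in the model input matrix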
def test_wofs_filtered():
    cfg = Config('../configs/template_client.yaml')
    grid_spec = GridSpec(crs=CRS('EPSG:3577'), tile_size=(100000, 100000), resolution=(-25, 25))
    cell_index = (17, -39)
    wf = WofsFiltered(cfg, grid_spec, cell_index)
    confidence = wf.compute_confidence(cell_index)
    filtered = wf.compute_confidence_filtered()

    # Display images: to be removed later
    with Datacube(app='wofs_summary', env='dev') as dc:
        gwf = GridWorkflow(dc.index, grid_spec)
        indexed_tile = gwf.list_cells(cell_index, product='wofs_statistical_summary')
        # load the data of the tile
        dataset = gwf.load(tile=indexed_tile[cell_index], measurements=['frequency'])
        frequency = dataset.data_vars['frequency'].data.ravel().reshape(grid_spec.tile_resolution)

    # Check with previous run
    with rasterio.open('confidenceFilteredWOfS_17_-39_epsilon=10.tiff') as f:
        data = f.read(1)

    plt.subplot(221)
    plt.imshow(frequency)
    plt.subplot(222)
    plt.imshow(data)
    plt.subplot(223)
    plt.imshow(confidence)
    plt.subplot(224)
    plt.imshow(filtered)
    plt.show()

    wf.compute_and_write()
def save_grid_count_to_file(filename, index, **queryargs):
    gw = GridWorkflow(product=queryargs['product'], index=index)
    cells = gw.list_cells(group_by='solar_day', **queryargs)
    geojson = cells_list_to_featurecollection(cells)
    with open(filename, 'w') as dest:
        json.dump(geojson, dest)
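# Hedged usage sketch for save_grid_count_to_file: it assumes a reachable datacube index and that
# cells_list_to_featurecollection is defined in the same module; the product name and time range
# below are illustrative only.
import datacube

dc = datacube.Datacube(app='grid_count_example')
save_grid_count_to_file('ls8_cells.geojson', dc.index,
                        product='ls8_nbar_albers', time=('2017-01-01', '2017-12-31'))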
def list_all_cells(index, products, period, dt1, dt2):
    print("date range is FROM " + str(dt2) + " TO " + str(dt1))
    gw = GridWorkflow(index=index, product=products[0])
    my_cell_info = defaultdict(dict)
    pq_cell_info = defaultdict(dict)
    cell_list = []
    print(" database querying for listing all cells starts at " + str(datetime.now()))
    for prod in PQ_PRODUCTS:
        pq = gw.list_cells(product=prod, time=(dt2, dt1), group_by='solar_day')
        my_cell_info[prod] = pq
        pq_cell_info.update(pq)
    for prod in NBAR_PRODUCTS:
        data_info = gw.list_cells(product=prod, time=(dt2, dt1), group_by='solar_day')
        my_cell_info[prod] = data_info
    for k, v in pq_cell_info.items():
        cell_list.append(k)
    cell_list = ['({0},{1})'.format(a, b) for (a, b) in cell_list]
    print(" database query done for all " + str(len(cell_list)) + " cells at " + str(datetime.now()))
    return cell_list, my_cell_info
def gwf_query(product, lat=None, long=None, region=None, begin=None, end=None, view=True):
    """Run a spatial query on a datacube product using either coordinates or a region name

    Wrapper function to call at the beginning of nearly all spatial processing command lines

    Args:
        product (str): Name of an ingested datacube product. The product to query
        lat (tuple): Optional. For coordinate based spatial query. Tuple of min and max
            latitudes in decimal degrees.
        long (tuple): Optional. For coordinate based spatial query. Tuple of min and max
            longitudes in decimal degrees.
        region (str): Optional name of a region or country whose geometry is present in the
            database region or country table. Overrides lat and long when present (not None).
            Countries must be queried using ISO code (e.g.: 'MEX' for Mexico)
        begin (str): Date string in the form '%Y-%m-%d'. For temporally bounded queries
        end (str): Date string in the form '%Y-%m-%d'. For temporally bounded queries
        view (bool): Returns a view instead of the dictionary returned by
            ``GridWorkflow.list_cells``. Useful when the output is to be used directly as an
            iterable (e.g. in ``distributed.map``). Defaults to True

    Returns:
        dict or view: Dictionary (view) of Tile index, Tile key value pair

    Example:
        >>> from madmex.wrappers import gwf_query
        >>> # Using region name, time unbounded
        >>> tiles_list = gwf_query(product='ls8_espa_mexico', region='Jalisco')
        >>> # Using region name, time windowed
        >>> tiles_list = gwf_query(product='ls8_espa_mexico', region='Jalisco',
        ...                        begin='2017-01-01', end='2017-03-31')
        >>> # Using lat long box, time windowed
        >>> tiles_list = gwf_query(product='ls8_espa_mexico', lat=[19, 22], long=[-104, -102],
        ...                        begin='2017-01-01', end='2017-03-31')
    """
    query_params = {'product': product}
    if region is not None:
        # Query database and build a datacube.utils.Geometry(geopolygon)
        try:
            query_set = Country.objects.get(name=region)
        except Country.DoesNotExist:
            query_set = Region.objects.get(name=region)
        region_json = json.loads(query_set.the_geom.geojson)
        crs = CRS('EPSG:%d' % query_set.the_geom.srid)
        geom = Geometry(region_json, crs)
        query_params.update(geopolygon=geom)
    elif lat is not None and long is not None:
        query_params.update(x=long, y=lat)
    else:
        raise ValueError('Either a region name or a lat and long must be provided')

    if begin is not None and end is not None:
        begin = datetime.strptime(begin, "%Y-%m-%d")
        end = datetime.strptime(end, "%Y-%m-%d")
        query_params.update(time=(begin, end))

    # GridWorkflow object
    dc = datacube.Datacube()
    gwf = GridWorkflow(dc.index, product=product)
    tile_dict = gwf.list_cells(**query_params)

    # Iterable (dictionary view (analog to list of tuples))
    if view:
        tile_dict = tile_dict.items()
    return tile_dict
from datetime import datetime
from pprint import pprint

import datacube
from datacube.api import GridWorkflow
from rasterio.features import rasterize
from affine import Affine

from madmex.io.vector_db import VectorDb

# Load a test dataset
dc = datacube.Datacube()
gw = GridWorkflow(dc.index, product='ls8_espa_mexico')
tile_dict = gw.list_cells(product='ls8_espa_mexico', x=(-104, -102), y=(19, 21),
                          time=(datetime(2017, 1, 1), datetime(2017, 2, 1)))
tile_list = list(tile_dict.items())
sr = gw.load(tile_list[3][1])

# Visualize Dataset metadata
print(sr)

# Load training data corresponding to that dataset
db = VectorDb()
fc = db.load_training_from_dataset(sr)

# Visualize first element of feature collection
pprint(fc[0])

# Rasterize the feature collection
geom_list = [x['geometry'] for x in fc]
def runAll(num_bands, args):
    """Run on all tiles in the specified datasets/area. Keys are based on the last dataset listed."""

    global rows

    # Calculate the right number of columns to be returned from the data cube
    input_num_cols = num_bands + 1

    dc = datacube.Datacube()

    # Create GridWorkflow object for most recent dataset
    gw = GridWorkflow(dc.index, product=args.input_products[-1])

    # Get list of cell keys for most recent dataset
    keys = list(gw.list_cells(product=args.input_products[-1],
                              lat=(args.lowerlat, args.upperlat),
                              lon=(args.lowerlon, args.upperlon)).keys())

    dc.close()

    # Run on each key/tile in turn
    for key in keys:
        ccdc_args = []
        input_ds = []
        tmask_ds = []
        cloud_ds = []

        input_ds = loadAll(args.input_products, key, args.bands)

        if input_ds:
            if args.tmask_products:
                tmask_ds = loadAll(args.tmask_products, key, ['green', 'nir', 'swir1'])
            if args.cloud_products:
                cloud_ds = loadAll(args.cloud_products, key, ['cloud_mask'])

            # Tidy up input data
            input_data = xr.concat(input_ds, dim='time')
            input_data = mask_invalid_data(input_data)

            if cloud_ds:
                cloud_masks = xr.concat(cloud_ds, dim='time')

            # Do the same for TOA data if present - tmask_ds will be empty if no TOA data sets were specified
            if tmask_ds:
                tmask_data = xr.concat(tmask_ds, dim='time')
                tmask_data = mask_invalid_data(tmask_data)

            # We want to process each pixel separately
            for i in range(len(input_data.x)):
                for j in range(len(input_data.y)):
                    input_ts = input_data.isel(x=i, y=j)  # Get just one pixel
                    x_val = float(input_ts.x)
                    y_val = float(input_ts.y)

                    # Transform the time series into a numpy array
                    input_ts = transformToArray(input_ts)

                    if input_ts.shape[0] > 0 and input_ts.shape[1] == input_num_cols:
                        if cloud_ds:
                            # Get cloud mask values through time for this pixel
                            cloud_ts = cloud_masks.isel(x=i, y=j)
                            cloud_ts = transformToArray(cloud_ts)

                            # Remove any rows which aren't in the SREF data
                            cloud_ts = cloud_ts[np.isin(cloud_ts[:, 0], input_ts[:, 0])]

                            # Do masking (0 value is clear)
                            input_ts = input_ts[cloud_ts[:, 1] == 0]

                        if tmask_ds:
                            tmask_ts = tmask_data.isel(x=i, y=j)
                            tmask_ts = transformToArray(tmask_ts)

                            # Remove any rows which aren't in the SREF data
                            tmask_ts = tmask_ts[np.isin(tmask_ts[:, 0], input_ts[:, 0])]

                            # Use Tmask to further screen the input data
                            input_ts = doTmask(input_ts, tmask_ts)

                        argslist = (input_ts, num_bands, x_val, y_val, args)
                        ccdc_args.append(argslist)

            # Do some tidying up
            del input_data
            if cloud_ds:
                del cloud_ds
                del cloud_masks
            if tmask_ds:
                del tmask_ds
                del tmask_data

            # Run processes for this key
            with Pool(processes=args.num_procs) as pool:
                pool.starmap(runCCDC, ccdc_args)

            # Generate output file name for this key
            output_file = os.path.join(args.outdir,
                                       "{}_{}_{}.csv".format(args.output_file, key[0], key[1]))

            # Write headers to file
            headers = ["x", "y", "band", "start_date", "end_date", "start_val", "end_val",
                       "coeffs", "RMSE", "intercept", "alpha", "change_date", "magnitude"]

            with open(output_file, 'w') as output:
                writer = csv.writer(output)
                writer.writerow(headers)
                writer.writerows(rows)

            # Reset shared list
            rows = []
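# Small numpy illustration (values invented) of the per-pixel screening used above: cloud-mask
# rows are first aligned to the surface-reflectance rows by date (column 0), then only the
# observations whose mask value is 0 (clear) are kept.
import numpy as np

input_ts = np.array([[736000, 120, 340], [736016, 130, 350], [736032, 125, 345]])  # date, band1, band2
cloud_ts = np.array([[736000, 0], [736016, 1], [736032, 0], [736048, 0]])           # date, cloud flag

cloud_ts = cloud_ts[np.isin(cloud_ts[:, 0], input_ts[:, 0])]  # drop dates absent from the SREF data
input_ts = input_ts[cloud_ts[:, 1] == 0]                      # keep clear (flag == 0) observations
print(input_ts)  # rows for dates 736000 and 736032 remain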
def runOnSubset(num_bands, args):
    """If the user chooses to run the algorithm on a random subsample of the data, this function
    is called. It divides the number of subsamples to be taken by the number of cells/keys for a
    product or products. It then runs CCDC on the appropriate number of pixels per cell to get an
    even spread of samples."""

    global rows

    # Calculate the right number of columns to be returned from the data cube
    input_num_cols = num_bands + 1

    tile_size = 3500     # Size of real tiles - too big to fit in memory
    new_tile_size = 875  # New size - this will divide each tile into 16

    dc = datacube.Datacube()

    # Create GridWorkflow object for most recent dataset
    gw = GridWorkflow(dc.index, product=args.input_products[-1])

    # Get list of cell keys for most recent dataset
    keys = list(gw.list_cells(product=args.input_products[-1]).keys())

    dc.close()

    num_keys = len(keys)

    # Calculate number of pixels to use from each cell
    num_subs = ((tile_size / new_tile_size) * (tile_size / new_tile_size)) * num_keys  # Get number of sub-tiles
    samples_per_cell = np.ceil(args.num_samples / num_subs).astype(int)  # Get number of samples to be taken per sub-tile

    # Load data for each cell
    for key in keys:
        # Each tile needs to be divided into mini-tiles
        for x in range(0, tile_size, new_tile_size):      # Division in x dimension
            for y in range(0, tile_size, new_tile_size):  # Division in y dimension
                min_x = x
                max_x = x + new_tile_size
                min_y = y
                max_y = y + new_tile_size

                ccdc_args = []
                input_ds = []
                tmask_ds = []
                cloud_ds = []

                input_ds = loadByTile(args.input_products, key, min_y, max_y, min_x, max_x, args.bands)

                if input_ds:
                    if args.tmask_products:
                        tmask_ds = loadByTile(args.tmask_products, key, min_y, max_y, min_x, max_x,
                                              ['green', 'nir', 'swir1'])
                    if args.cloud_products:
                        cloud_ds = loadByTile(args.cloud_products, key, min_y, max_y, min_x, max_x,
                                              ['cloud_mask'])

                    # Tidy up input data
                    input_data = xr.concat(input_ds, dim='time')
                    input_data = mask_invalid_data(input_data)

                    if cloud_ds:
                        cloud_masks = xr.concat(cloud_ds, dim='time')

                    # Do the same for TOA data if present - tmask_ds will be empty if no TOA data sets were specified
                    if tmask_ds:
                        tmask_data = xr.concat(tmask_ds, dim='time')
                        tmask_data = mask_invalid_data(tmask_data)

                    # We want to process a random subset of pixels
                    for i in range(samples_per_cell):
                        random_x = np.random.randint(0, new_tile_size)
                        random_y = np.random.randint(0, new_tile_size)

                        input_ts = input_data.isel(x=random_x, y=random_y)  # Get just one pixel
                        x_val = float(input_ts.x)
                        y_val = float(input_ts.y)

                        # Transform the time series into a numpy array
                        input_ts = transformToArray(input_ts)

                        if input_ts.shape[0] > 0 and input_ts.shape[1] == input_num_cols:
                            if cloud_ds:
                                # Get cloud mask values through time for this pixel
                                cloud_ts = cloud_masks.isel(x=random_x, y=random_y)
                                cloud_ts = transformToArray(cloud_ts)

                                # Remove any rows which aren't in the SREF data
                                cloud_ts = cloud_ts[np.isin(cloud_ts[:, 0], input_ts[:, 0])]

                                # Do masking (0 value is clear)
                                input_ts = input_ts[cloud_ts[:, 1] == 0]

                            if tmask_ds:
                                tmask_ts = tmask_data.isel(x=random_x, y=random_y)
                                tmask_ts = transformToArray(tmask_ts)

                                # Remove any rows which aren't in the SREF data
                                tmask_ts = tmask_ts[np.isin(tmask_ts[:, 0], input_ts[:, 0])]

                                # Use Tmask to further screen the input data
                                input_ts = doTmask(input_ts, tmask_ts)

                            argslist = (input_ts, num_bands, x_val, y_val, args)
                            ccdc_args.append(argslist)

                    # Do some tidying up
                    del input_data
                    if cloud_ds:
                        del cloud_ds
                        del cloud_masks
                    if tmask_ds:
                        del tmask_ds
                        del tmask_data

                    # Use multiprocessing to process all samples from this mini-tile
                    with Pool(processes=args.num_procs) as pool:
                        pool.starmap(runCCDC, ccdc_args)

                    # Generate output file name
                    output_file = os.path.join(args.outdir,
                                               "{}_{}_{}_{}_{}_{}.csv".format(args.output_file, key,
                                                                              min_y, max_y, min_x, max_x))

                    # Write headers to file
                    headers = ["x", "y", "band", "start_date", "end_date", "start_val", "end_val",
                               "coeffs", "RMSE", "intercept", "alpha", "change_date", "magnitude"]

                    with open(output_file, 'w') as output:
                        writer = csv.writer(output)
                        writer.writerow(headers)
                        writer.writerows(rows)

                    # Reset shared list
                    rows = []
def build_my_dataset(index, dt1, dt2, products, cell=None):
    gw = GridWorkflow(index=index, product=products[0])
    ls_7 = defaultdict()
    ls_8 = defaultdict()
    new_ds = np.empty(1, dtype=object)
    for st in products:
        pq_cell_info = defaultdict(dict)
        cell_info = defaultdict(dict)
        prod = None
        if st == 'ls7_nbar_albers':
            prod = 'ls7_pq_albers'
        else:
            prod = 'ls8_pq_albers'
        for cl in cell:
            print("my cell and sensor", cl, st)
            filepath = odir + '/' + 'LATEST_PIXEL_' + ''.join(map(str, eval(cl))) \
                       + "_CLOUD_FREE_LAST_" + str(period) + "_DAYS.nc"
            if os.path.isfile(filepath):
                print("file exists " + filepath)
                continue
            # pq = gw.list_cells(eval(cl), product=prod, time=(dt2, dt1), group_by='solar_day')
            indexers = {'product': prod, 'time': (dt2, dt1), 'group_by': 'solar_day'}
            pq = list_gqa_filtered_cells(index, gw, pix_th=1, cell_index=eval(cl), **indexers)
            if len(pq) == 0:
                _log.info("NO PQ INFO FOUND FOR CELL %s AND IGNORING FOR THIS SENSOR %s" % (cl, st))
                print("NO PQ DATA FOUND AND IGNORING FOR THIS SENSOR ", cl, st)
                continue
            pq_cell_info.update(pq)
        for cl, vl in pq_cell_info.items():
            cell_info.update(gw.list_cells(cl, product=st, time=(dt2, dt1), group_by='solar_day'))
        for k, v in cell_info.items():
            if type(k) == tuple:
                print(" loading data for sensor at ", st, str(datetime.now().time()))
                data = gw.load(cell_info[k], measurements=['swir1', 'nir', 'green'])
                print(" loaded nbar data", str(datetime.now().time()))
                pq = gw.load(pq_cell_info[k], fuse_func=pq_fuser)
                print(" loaded pq data for sensor at ", st, str(datetime.now().time()))
                mask_clear = pq['pixelquality'] & 15871 == 15871
                ndata = data.where(mask_clear).astype(np.int16)
                # sort so that the latest date comes first
                ndata = ndata.sel(time=sorted(ndata.time.values, reverse=True))
                if len(ndata.attrs) == 0:
                    ndata.attrs = data.attrs
                if st == 'ls7_nbar_albers':
                    ls_7[k] = copy(ndata)
                else:
                    ls_8[k] = copy(ndata)
    my_set = set()
    for k, v in ls_8.items():
        my_set.add(k)
    for k, v in ls_7.items():
        my_set.add(k)
    ls_new = {}
    for k in list(my_set):
        if k in ls_8 and k in ls_7:
            ls_new[k] = xr.concat([ls_8[k], ls_7[k]], dim='time')
            ls_new[k] = ls_new[k][['swir1', 'nir', 'green']]
            ls_new[k] = ls_new[k].sel(time=sorted(ls_new[k].time.values, reverse=True))
        elif k in ls_7:
            ls_new[k] = ls_7[k]
        elif k in ls_8:
            ls_new[k] = ls_8[k]
    return ls_new
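# Minimal numpy sketch of the pixel-quality test used above: a pixel passes only when every bit
# in the 15871 mask is set in its PQ word, i.e. (pq & 15871) == 15871. The sample PQ values are
# invented; the meaning of individual bits is defined by the ls*_pq_albers products.
import numpy as np

pq_word = np.array([16383, 15871, 15359], dtype=np.uint16)  # hypothetical pixelquality values
mask_clear = pq_word & 15871 == 15871                        # & binds tighter than == in Python
print(mask_clear)  # [ True  True False]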
import os

from datacube.index.postgres._connections import PostgresDb
from datacube.index._api import Index
from datacube.api import GridWorkflow
from datacube.storage.storage import write_dataset_to_netcdf
from pprint import pprint
import numpy

nc_filename = os.path.expanduser(
    '~/datacube_ingest/recipes/ndvi_mean/ndvi_mean_%d_%d_%s.nc' % (12, -16, '1987'))

db = PostgresDb.from_config()
i = Index(db)
gwf = GridWorkflow(i, product='ls8_espa_mexico')
cells_list = gwf.list_cells(product='ls8_espa_mexico', x=(-106, -101), y=(19, 23))
sr = gwf.load(cells_list[(12, -16)], dask_chunks={'x': 1000, 'y': 1000})

sr['ndvi'] = (sr.nir - sr.red) / (sr.nir + sr.red) * 10000
ndvi = sr.drop(['pixel_qa', 'blue', 'red', 'green', 'nir', 'swir1', 'swir2'])

# Run temporal reductions and rename DataArrays
ndvi_mean = ndvi.mean('time', keep_attrs=True)
ndvi_mean = ndvi_mean.astype('int16')
ndvi_mean.attrs['crs'] = sr.attrs['crs']

write_dataset_to_netcdf(ndvi_mean, nc_filename)
print(nc_filename)
        ndvi_clear = ndvi.where(clear)
        # Run temporal reductions and rename DataArrays
        ndvi_mean = ndvi_clear.mean('time', keep_attrs=True)
        ndvi_mean['ndvi'].attrs['nodata'] = -9999
        ndvi_mean_int = ndvi_mean.apply(to_int)
        ndvi_mean_int.attrs['crs'] = sr.attrs['crs']
        write_dataset_to_netcdf(ndvi_mean_int, nc_filename, netcdfparams={'zlib': True})
        return nc_filename
    except Exception as e:
        print('Tile (%d, %d) not processed. %s' % (tile[0][0], tile[0][1], e))
        raise
        return None


product = 'ls8_espa_mexico'
begin = datetime(2017, 1, 1)
end = datetime(2017, 3, 1)
long = (-106, -102)
lat = (19, 23)
center_dt = datetime(2017, 2, 1)

# GridWorkflow object
dc = datacube.Datacube()
gwf = GridWorkflow(dc.index, product=product)
tile_dict = gwf.list_cells(product=product, time=(begin, end), x=long, y=lat)

# Iterable (dictionary view (analog to list of tuples))
iterable = list(tile_dict.items())
run(iterable[0], gwf=gwf, center_dt=center_dt)
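# The fragment above applies a `to_int` helper that is not shown in this excerpt. A plausible
# minimal sketch (an assumption, not the original implementation) would replace NaNs with the
# nodata value and cast each DataArray to int16 before writing:
import numpy as np

def to_int(x, nodata=-9999):
    """Fill NaNs with nodata and cast a DataArray to int16 (illustrative sketch only)."""
    return x.fillna(nodata).astype(np.int16)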