def __init__(self, source, statistic="sum", frequency=None, timezone="UTC"):
    """Validate the arguments and hand them to the parent initialiser.

    Args:
        source: the input block; must be a ``RasterBlock``.
        statistic (str): either a key of ``self.STATISTICS`` or a value
            that ``parse_percentile_statistic`` recognises. A recognised
            percentile is stored as ``"p<number>"``.
        frequency (str, optional): parsed with ``to_offset`` and stored as
            its ``freqstr``.
        timezone (str): resolved through ``pytz.timezone``. It is only
            validated when a frequency is given; without a frequency
            ``None`` is stored instead.

    Raises:
        TypeError: if an argument has the wrong type.
        ValueError: if the statistic is not recognised.
    """
    not_allowed = "'{}' object is not allowed."

    if not isinstance(source, RasterBlock):
        raise TypeError(not_allowed.format(type(source)))
    if not isinstance(statistic, str):
        raise TypeError(not_allowed.format(type(statistic)))

    # NOTE(review): the truthiness test sends a percentile of 0 ("p0") to
    # the membership check below, assuming the parser returns 0 for it.
    # Confirm whether "p0" should be accepted.
    parsed_percentile = parse_percentile_statistic(statistic)
    if parsed_percentile:
        statistic = "p{0}".format(parsed_percentile)
    elif statistic not in self.STATISTICS:
        raise ValueError("Unknown statistic '{}'".format(statistic))

    if frequency is None:
        # the timezone has no meaning without a frequency
        timezone = None
    else:
        if not isinstance(frequency, str):
            raise TypeError(not_allowed.format(type(frequency)))
        frequency = to_offset(frequency).freqstr
        if not isinstance(timezone, str):
            raise TypeError(not_allowed.format(type(timezone)))
        timezone = pytz.timezone(timezone).zone

    super().__init__(source, statistic, frequency, timezone)
def __init__(
    self,
    source,
    raster,
    statistic="sum",
    projection=None,
    pixel_size=None,
    max_pixels=None,
    column_name="agg",
    auto_pixel_size=False,
    *args
):
    """Validate and normalise the arguments, then call the parent initialiser.

    Args:
        source: must be a ``GeometryBlock``.
        raster: must be a ``RasterBlock``.
        statistic (str): lower-cased, then matched against
            ``self.STATISTICS`` or parsed as a percentile (stored as
            ``"p<number>"``). The literal ``"percentile"`` is rejected.
        projection (str, optional): defaults to ``raster.projection``.
        pixel_size (optional): stored as a positive float. Defaults to the
            smallest absolute value of elements 1 and 5 of
            ``raster.geo_transform``.
        max_pixels (optional): converted with ``int`` when given.
        column_name: passed on unchanged.
        auto_pixel_size (bool): passed on after a type check.
        *args: passed on unchanged.

    Raises:
        TypeError: if an argument has the wrong type.
        ValueError: if the statistic is unknown, the pixel size is 0, or
            it cannot be derived from the raster.
    """
    not_allowed = "'{}' object is not allowed"

    # the checks run in signature order, the first failure is reported
    for argument, expected in (
        (source, GeometryBlock),
        (raster, RasterBlock),
        (statistic, str),
    ):
        if not isinstance(argument, expected):
            raise TypeError(not_allowed.format(type(argument)))

    statistic = statistic.lower()
    # NOTE(review): a percentile of 0 is falsy and thus ends up in the
    # "unknown statistic" branch, assuming the parser returns 0 for "p0".
    # Confirm whether "p0" should be accepted.
    parsed_percentile = utils.parse_percentile_statistic(statistic)
    if parsed_percentile:
        statistic = "p{0}".format(parsed_percentile)
    elif statistic not in self.STATISTICS or statistic == "percentile":
        raise ValueError("Unknown statistic '{}'".format(statistic))

    if projection is None:
        projection = raster.projection
    if not isinstance(projection, str):
        raise TypeError(not_allowed.format(type(projection)))

    if pixel_size is not None:
        pixel_size = abs(float(pixel_size))
    else:
        # fall back to the cell size given by the raster's geo_transform
        geo_transform = raster.geo_transform
        if geo_transform is None:
            raise ValueError(
                "Cannot get the pixel_size from the source "
                "raster. Please provide a pixel_size."
            )
        pixel_size = min(abs(float(geo_transform[i])) for i in (1, 5))
    if pixel_size == 0.0:
        raise ValueError("Pixel size cannot be 0")

    if max_pixels is not None:
        max_pixels = int(max_pixels)
    if not isinstance(auto_pixel_size, bool):
        raise TypeError(not_allowed.format(type(auto_pixel_size)))

    super(AggregateRaster, self).__init__(
        source,
        raster,
        statistic,
        projection,
        pixel_size,
        max_pixels,
        column_name,
        auto_pixel_size,
        *args
    )
def __init__(
    self,
    source,
    frequency,
    statistic="sum",
    closed=None,
    label=None,
    timezone="UTC",
):
    """Validate the arguments and hand them to the parent initialiser.

    Args:
        source: must be a ``RasterBlock``.
        frequency (str or None): parsed with ``to_offset`` and stored as
            its ``freqstr``. With ``None``, the ``closed``, ``label`` and
            ``timezone`` arguments are ignored and stored as ``None``.
        statistic (str): a key of ``self.STATISTICS`` or a value that
            ``parse_percentile_statistic`` recognises (stored as
            ``"p<number>"``).
        closed: ``None``, ``"left"`` or ``"right"``.
        label: ``None``, ``"left"`` or ``"right"``.
        timezone (str): resolved through ``pytz.timezone``.

    Raises:
        TypeError: if an argument has the wrong type.
        ValueError: if ``closed``, ``label`` or ``statistic`` has an
            unsupported value.
    """
    not_allowed = "'{}' object is not allowed."

    if not isinstance(source, RasterBlock):
        raise TypeError(not_allowed.format(type(source)))

    if frequency is None:
        # binning options are meaningless without a frequency
        closed = label = timezone = None
    else:
        if not isinstance(frequency, str):
            raise TypeError(not_allowed.format(type(frequency)))
        frequency = to_offset(frequency).freqstr
        if closed not in {None, "left", "right"}:
            raise ValueError("closed must be None, 'left', or 'right'.")
        if label not in {None, "left", "right"}:
            raise ValueError("label must be None, 'left', or 'right'.")
        if not isinstance(timezone, str):
            raise TypeError(not_allowed.format(type(timezone)))
        timezone = pytz.timezone(timezone).zone

    if not isinstance(statistic, str):
        raise TypeError(not_allowed.format(type(statistic)))

    # NOTE(review): a percentile of 0 is falsy and thus ends up in the
    # "unknown statistic" branch, assuming the parser returns 0 for "p0".
    # Confirm whether "p0" should be accepted.
    parsed_percentile = parse_percentile_statistic(statistic)
    if parsed_percentile:
        statistic = "p{0}".format(parsed_percentile)
    elif statistic not in self.STATISTICS:
        raise ValueError("Unknown statistic '{}'".format(statistic))

    super(TemporalAggregate, self).__init__(
        source, frequency, statistic, closed, label, timezone
    )
def process(process_kwargs, time_data=None, data=None):
    """Apply the configured statistic per time bin to a stack of frames.

    Args:
        process_kwargs (dict): always provides "mode" ('time', 'meta' or
            'vals') and optionally "empty". Past the early exits it also
            provides "start", "stop", "frequency", "timezone", "closed"
            and "label", and in 'vals' mode "statistic" and "dtype".
        time_data (dict, optional): the source timestamps under "time".
        data (dict, optional): "meta" in 'meta' mode; "values" and
            "no_data_value" in 'vals' mode.

    Returns:
        * 'time' mode: ``time_data`` unchanged.
        * 'meta' mode: ``{"meta": [...]}`` with, for every frame inside
          the requested period, the metadata of all frames of its bin up
          to and including that frame.
        * 'vals' mode: ``{"values": ndarray, "no_data_value": ...}``, or
          ``None`` when there is nothing to return (no data, or no frame
          inside the requested period).

    Raises:
        RuntimeError: if the number of frames in ``data["values"]`` does
            not equal the number of timestamps.
    """
    mode = process_kwargs["mode"]
    # 'no result' is None in 'vals' mode and an empty list otherwise
    empty_result = None if mode == "vals" else {mode: []}

    if process_kwargs.get("empty"):
        return empty_result
    if mode == "time":
        return time_data
    if time_data is None or not time_data.get("time"):
        return empty_result

    start = process_kwargs["start"]
    stop = process_kwargs["stop"]
    frequency = process_kwargs["frequency"]
    timezone = process_kwargs["timezone"]
    closed = process_kwargs["closed"]
    label = process_kwargs["label"]

    # Only the index matters. The explicit dtype avoids relying on the
    # default dtype of a data-less Series.
    times = (
        pd.Series(index=time_data["time"], dtype=float)
        .tz_localize("UTC")
        .tz_convert(timezone)
    )
    if frequency is None:
        # a single bin that contains every frame
        indices = {None: range(len(times))}
    else:
        # a pandas Resampler maps each bin label to its frame indices
        indices = times.resample(frequency, closed=closed, label=label).indices

    start_ts = _dt_to_ts(start, timezone)
    stop_ts = _dt_to_ts(stop, timezone)
    # Frames inside the requested period. An absent stop leaves the period
    # open-ended, in 'vals' mode as well as in 'meta' mode.
    in_range = times.index >= start_ts
    if stop_ts is not None:
        in_range &= times.index <= stop_ts

    if mode == "meta":
        if data is None or "meta" not in data:
            return {"meta": []}
        meta = data["meta"]
        result = []
        for indices_in_bin in indices.values():
            # one entry per frame: the frames of the bin seen so far
            for length in range(1, len(indices_in_bin) + 1):
                seen = indices_in_bin[:length]
                if in_range[seen[-1]]:
                    result.append([meta[i] for i in seen])
        return {"meta": result}

    # mode == 'vals'
    if data is None or "values" not in data:
        return None
    raw_values = data["values"]
    if raw_values.shape[0] != len(times):
        raise RuntimeError(
            "Shape of raster does not match number of timestamps")
    if not in_range.any():
        # nothing to return; also prevents indexing an empty array below
        return None

    statistic = process_kwargs["statistic"]
    percentile = parse_percentile_statistic(statistic)
    # compare with None: a percentile of 0 is a valid (falsy) value
    if percentile is not None:
        extensive = False
        # NOTE(review): np.nanpercentile reduces the frame axis, so the
        # per-frame mask applied to its result below may not line up.
        # Confirm that percentiles are supported in this block.
        agg_func = partial(np.nanpercentile, q=percentile)
    else:
        extensive = Cumulative.STATISTICS[statistic]["extensive"]
        agg_func = Cumulative.STATISTICS[statistic]["func"]

    dtype = process_kwargs["dtype"]
    no_data_value = get_dtype_max(dtype)
    fillvalue = 0 if extensive else no_data_value

    # cast to at least float32 so that we can fit in NaN (and make copy)
    values = raw_values.astype(np.result_type(np.float32, dtype))
    # put NaN for no data
    values[raw_values == data["no_data_value"]] = np.nan

    output_offset = np.flatnonzero(in_range)[0]
    result = np.full(
        shape=(in_range.sum(), values.shape[1], values.shape[2]),
        fill_value=fillvalue,
        dtype=dtype,
    )

    for indices_in_bin in indices.values():
        bin_mask = in_range[indices_in_bin]
        accumulated = agg_func(values[indices_in_bin], axis=0)[bin_mask]
        # keep track of NaN or inf values before casting to target dtype
        no_data_mask = ~np.isfinite(accumulated)
        if dtype != accumulated.dtype:
            accumulated = accumulated.astype(dtype)
        accumulated[no_data_mask] = fillvalue
        # frame positions are relative to the first frame that is returned
        indices_in_result = np.asarray(indices_in_bin)[bin_mask] - output_offset
        result[indices_in_result] = accumulated

    return {"values": result, "no_data_value": no_data_value}
def process(process_kwargs, time_data=None, data=None):
    """Aggregate a stack of raster frames into one frame per time label.

    Args:
        process_kwargs (dict): always provides "mode" ('time', 'meta' or
            'vals') and optionally "empty". Past the first early exit it
            also provides "start", "stop" and "frequency"; for 'meta' and
            'vals' also "timezone", "closed" and "label"; in 'vals' mode
            also "statistic" and "dtype".
        time_data (dict, optional): the source timestamps under "time".
        data (dict, optional): "meta" in 'meta' mode; "values" and
            "no_data_value" in 'vals' mode.

    Returns:
        * 'time' mode: ``{"time": [...]}`` with the labels as datetimes.
        * 'meta' mode: ``{"meta": [...]}`` with, per label, the metadata
          of the frames in that bin.
        * 'vals' mode: ``{"values": ndarray, "no_data_value": ...}`` with
          one frame per label, or ``None`` when there is no data.

    Raises:
        RuntimeError: if the number of frames in ``data["values"]`` does
            not equal the number of timestamps.
    """
    mode = process_kwargs["mode"]
    # 'no result' is None in 'vals' mode and an empty list otherwise
    empty_result = None if mode == "vals" else {mode: []}

    if process_kwargs.get("empty"):
        return empty_result

    start = process_kwargs["start"]
    stop = process_kwargs["stop"]
    frequency = process_kwargs["frequency"]
    if frequency is None:
        labels = pd.DatetimeIndex([start])
    else:
        labels = pd.date_range(start, stop or start, freq=frequency)

    if mode == "time":
        return {"time": labels.to_pydatetime().tolist()}
    if time_data is None or not time_data.get("time"):
        return empty_result

    timezone = process_kwargs["timezone"]
    closed = process_kwargs["closed"]
    label = process_kwargs["label"]
    times = time_data["time"]

    # Only the index matters. The explicit dtype avoids relying on the
    # default dtype of a data-less Series.
    series = (
        pd.Series(index=times, dtype=float)
        .tz_localize("UTC")
        .tz_convert(timezone)
    )
    # localize the labels so we can use them as keys of ``indices``
    labels = labels.tz_localize("UTC").tz_convert(timezone)

    if frequency is None:
        # the first (and only) label is the statistic of all frames
        indices = {labels[0]: range(len(times))}
    else:
        # a pandas Resampler maps each bin label to its frame indices
        indices = series.resample(frequency, closed=closed, label=label).indices

    if mode == "meta":
        if data is None or "meta" not in data:
            return {"meta": []}
        meta = data["meta"]
        return {"meta": [[meta[i] for i in indices[ts]] for ts in labels]}

    # mode == 'vals'
    if data is None or "values" not in data:
        return None
    raw_values = data["values"]
    if raw_values.shape[0] != len(times):
        raise RuntimeError(
            "Shape of raster does not match number of timestamps")

    statistic = process_kwargs["statistic"]
    percentile = parse_percentile_statistic(statistic)
    # compare with None: a percentile of 0 is a valid (falsy) value
    if percentile is not None:
        extensive = False
        agg_func = partial(np.nanpercentile, q=percentile)
    else:
        extensive = TemporalAggregate.STATISTICS[statistic]["extensive"]
        agg_func = TemporalAggregate.STATISTICS[statistic]["func"]

    dtype = process_kwargs["dtype"]
    no_data_value = get_dtype_max(dtype)
    fillvalue = 0 if extensive else no_data_value

    # cast to at least float32 so that we can fit in NaN (and make copy)
    values = raw_values.astype(np.result_type(np.float32, dtype))
    # put NaN for no data
    values[raw_values == data["no_data_value"]] = np.nan

    result = np.full(
        shape=(len(labels), values.shape[1], values.shape[2]),
        fill_value=fillvalue,
        dtype=dtype,
    )

    for i, timestamp in enumerate(labels):
        inds = indices[timestamp]
        if len(inds) == 0:
            # no frames in this bin: keep the fill value
            continue
        aggregated = agg_func(values[inds], axis=0)
        # keep track of NaN or inf values before casting to target dtype
        no_data_mask = ~np.isfinite(aggregated)
        if dtype != aggregated.dtype:
            aggregated = aggregated.astype(dtype)
        aggregated[no_data_mask] = fillvalue
        result[i] = aggregated

    return {"values": result, "no_data_value": no_data_value}
def reduce_rasters(stack, statistic, no_data_value=None, dtype=None):
    """Apply a statistic (e.g. "mean") to a stack of rasters, skipping
    'no data' values.

    In this context, reduce means that the dimensionality of the input data
    is reduced by one.

    Args:
      stack (list): a list of dicts containing "values" (ndarray)
        and "no_data_value". If the list has zero length or if the ndarrays
        do not have the same shape, a ValueError is raised.
      statistic (str): the applied statistic (no data is ignored). One of:
        {"last", "first", "count", "sum", "mean", "min",
        "max", "argmin", "argmax", "product", "std", "var", "p<number>"}
      no_data_value (number, optional): the 'no data' value in the output
        array. Defaults to the no data value of the first element in the
        stack.
      dtype (str or dtype): the datatype of the output array. Defaults to the
        dtype of the first element in the stack. If the input data cannot be
        cast to this dtype, a ValueError is raised.

    Returns:
      dict with "values" and "no_data_value"

    Raises:
      KeyError: if the statistic is neither a key of ``STATISTICS`` nor a
        percentile.
      ValueError: if the stack is empty or the arrays differ in shape.
    """
    if statistic not in STATISTICS:
        percentile = parse_percentile_statistic(statistic)
        if percentile is None:
            raise KeyError('Unknown statistic "{}"'.format(statistic))
        statistic = "percentile"

    if len(stack) == 0:
        raise ValueError("Cannot reduce a zero-length stack")

    # get the output array properties (dtype, no_data_value, shape)
    first = stack[0]
    if dtype is None:
        dtype = first["values"].dtype
    if no_data_value is None:
        no_data_value = first["no_data_value"]
    shape = first["values"].shape

    # Enforce the documented contract up front. Without this check a
    # mismatch surfaces as an indexing error, or is silently broadcast
    # by the "count" branch.
    for data in stack[1:]:
        if data["values"].shape != shape:
            raise ValueError(
                "Cannot reduce rasters of unequal shape ({} != {})".format(
                    data["values"].shape, shape
                )
            )

    # sum, count and nans output do not contain no data: fill zeroes right away
    fill_value = 0 if statistic in {"sum", "count", "nans"} else no_data_value

    # create the output array
    out = np.full(shape, fill_value, dtype)

    if statistic == "last":
        # later rasters overwrite earlier ones wherever they have data
        for data in stack:
            index = get_index(data["values"], data["no_data_value"])
            out[index] = data["values"][index]
    elif statistic == "first":
        # walking backwards lets the earliest raster with data win
        for data in reversed(stack):
            index = get_index(data["values"], data["no_data_value"])
            out[index] = data["values"][index]
    elif statistic == "count":
        # count the number of values that are not 'no data'
        for data in stack:
            out += get_index(data["values"], data["no_data_value"])
    else:
        if statistic == "percentile":
            func = partial(np.nanpercentile, q=percentile)
        else:
            func = STATISTICS[statistic]
        # transform 'no data' into 'nan' to be able to use numpy functions
        # NB: the dtype is at least float16 to accommodate NaN
        stack_array = np.full(
            (len(stack),) + shape, np.nan, np.result_type(dtype, np.float16)
        )
        for i, data in enumerate(stack):
            index = get_index(data["values"], data["no_data_value"])
            stack_array[i, index] = data["values"][index]

        # protect against all-NaN slice warnings and errors
        not_all_nan = ~np.all(np.isnan(stack_array), axis=0)

        # perform the math
        out[not_all_nan] = func(stack_array[:, not_all_nan], axis=0)

    return {"values": out, "no_data_value": no_data_value}
def check_statistic(statistic):
    """Raise a ValueError if ``statistic`` is not supported.

    A statistic is accepted when it is a key of ``STATISTICS`` or when
    ``parse_percentile_statistic`` returns something other than ``None``
    for it. Nothing is returned.
    """
    if statistic in STATISTICS:
        return
    if parse_percentile_statistic(statistic) is None:
        raise ValueError('Unknown statistic "{}"'.format(statistic))
def process(geom_data, raster_data, process_kwargs):
    """Aggregate raster values per geometry into a new feature column.

    Args:
        geom_data (dict): holds the geometries under "features". It is
            returned as-is in 'extent' mode or when there are no features.
        raster_data (dict or None): holds "values" (indexed as depth,
            height, width) and "no_data_value".
        process_kwargs (dict): reads "empty", "mode", "projection",
            "req_srs", "agg_srs", "agg_bbox", "statistic", "result_column",
            "pixel_size", "actual_pixel_size" and optionally
            "threshold_name".

    Returns:
        dict with "features" (a copy of the input features with the result
        column added) and "projection".
    """
    if process_kwargs.get("empty"):
        return {
            "features": gpd.GeoDataFrame([]),
            "projection": process_kwargs["projection"],
        }
    if process_kwargs["mode"] == "extent":
        return geom_data

    features = geom_data["features"]
    if len(features) == 0:
        return geom_data

    result = features.copy()

    # transform the features into the aggregation projection
    req_srs = process_kwargs["req_srs"]
    agg_srs = process_kwargs["agg_srs"]
    agg_geometries = utils.geoseries_transform(
        features["geometry"], req_srs, agg_srs
    )

    statistic = process_kwargs["statistic"]
    percentile = utils.parse_percentile_statistic(statistic)
    # Compare with None: a percentile of 0 is a valid (falsy) value that
    # would otherwise be looked up as a plain statistic and fail.
    if percentile is not None:
        statistic = "percentile"
        agg_func = partial(
            AggregateRaster.STATISTICS[statistic]["func"], qval=percentile
        )
    else:
        agg_func = AggregateRaster.STATISTICS[statistic]["func"]
    extensive = AggregateRaster.STATISTICS[statistic]["extensive"]
    result_column = process_kwargs["result_column"]

    # this is only there for the AggregateRasterAboveThreshold
    threshold_name = process_kwargs.get("threshold_name")
    if threshold_name:
        # get the threshold, appending NaN for unlabeled pixels
        threshold_values = np.empty((len(features) + 1,), dtype="f4")
        threshold_values[:-1] = features[threshold_name].values
        threshold_values[-1] = np.nan
    else:
        threshold_values = None

    # investigate the raster data
    if raster_data is None:
        values = no_data_value = None
    else:
        values = raster_data["values"]
        no_data_value = raster_data["no_data_value"]
    if values is None or np.all(values == no_data_value):
        # nothing to aggregate: skip the rest
        result[result_column] = 0 if extensive else np.nan
        return {"features": result, "projection": req_srs}

    depth, height, width = values.shape
    pixel_size = process_kwargs["pixel_size"]
    actual_pixel_size = process_kwargs["actual_pixel_size"]

    # process in groups of disjoint subsets of the features
    agg = np.full((depth, len(features)), np.nan, dtype="f4")
    for select in bucketize(features.bounds.values):
        rasterize_result = utils.rasterize_geoseries(
            agg_geometries.iloc[select],
            process_kwargs["agg_bbox"],
            agg_srs,
            height,
            width,
            values=np.asarray(select, dtype=np.int32),  # GDAL needs int32
        )
        labels = rasterize_result["values"][0]

        # if there is a threshold, generate a raster with thresholds
        if threshold_name:
            # mode="clip" ensures that unlabeled cells use the appended NaN
            # NOTE(review): this presumes the label of unlabeled cells is
            # larger than any feature position. A negative label would be
            # clipped to the first feature instead. Confirm against
            # utils.rasterize_geoseries.
            thresholds = np.take(threshold_values, labels, mode="clip")
        else:
            thresholds = None

        for frame_no, frame in enumerate(values):
            # limit statistics to active pixels
            active = frame != no_data_value
            # if there is a threshold, mask the frame
            if threshold_name:
                valid = ~np.isnan(thresholds)  # to suppress warnings
                active[~valid] = False  # no threshold -> no aggregation
                active[valid] &= frame[valid] >= thresholds[valid]

            # if there is no single active value: do not aggregate
            if not active.any():
                continue

            # select features that actually have data
            # (min, max, median, and percentile cannot handle it otherwise)
            active_labels = labels[active]
            select_and_active = list(
                set(np.unique(active_labels)) & set(select)
            )
            if not select_and_active:
                continue

            agg[frame_no][select_and_active] = agg_func(
                1 if statistic == "count" else frame[active],
                labels=active_labels,
                index=select_and_active,
            )

    if extensive:  # sum and count
        agg[~np.isfinite(agg)] = 0
        # extensive aggregations have to be scaled
        if actual_pixel_size != pixel_size:
            agg *= (actual_pixel_size / pixel_size) ** 2
    else:
        agg[~np.isfinite(agg)] = np.nan  # replaces inf by nan

    if depth == 1:
        result[result_column] = agg[0]
    else:
        # store an array in a dataframe cell: set each cell with [np.array]
        result[result_column] = [[x] for x in agg.T]

    return {"features": result, "projection": req_srs}
def process(geom_data, raster_data, process_kwargs):
    """Aggregate raster values per geometry into a new feature column.

    Args:
        geom_data (dict): holds the geometries under "features". It is
            returned as-is in 'extent' mode or when there are no features.
        raster_data (dict or None): holds "values" (indexed as depth,
            height, width) and "no_data_value".
        process_kwargs (dict): reads "empty", "mode", "projection",
            "req_srs", "agg_srs", "agg_bbox", "statistic", "result_column",
            "pixel_size", "actual_pixel_size" and optionally
            "threshold_name".

    Returns:
        dict with "features" (a copy of the input features with the result
        column added) and "projection".
    """
    if process_kwargs.get("empty"):
        return {
            "features": gpd.GeoDataFrame([]),
            "projection": process_kwargs["projection"],
        }
    if process_kwargs["mode"] == "extent":
        return geom_data

    features = geom_data["features"]
    if len(features) == 0:
        return geom_data

    result = features.copy()

    # transform the features into the aggregation projection
    req_srs = process_kwargs["req_srs"]
    agg_srs = process_kwargs["agg_srs"]
    agg_geometries = utils.geoseries_transform(
        features["geometry"], req_srs, agg_srs
    )

    statistic = process_kwargs["statistic"]
    percentile = utils.parse_percentile_statistic(statistic)
    # Compare with None: a percentile of 0 is a valid (falsy) value that
    # would otherwise be looked up as a plain statistic and fail.
    if percentile is not None:
        statistic = "percentile"
        agg_func = partial(
            AggregateRaster.STATISTICS[statistic]["func"], qval=percentile
        )
    else:
        agg_func = AggregateRaster.STATISTICS[statistic]["func"]
    extensive = AggregateRaster.STATISTICS[statistic]["extensive"]
    result_column = process_kwargs["result_column"]

    # this is only there for the AggregateRasterAboveThreshold
    threshold_name = process_kwargs.get("threshold_name")

    # investigate the raster data
    if raster_data is None:
        values = no_data_value = None
    else:
        values = raster_data["values"]
        no_data_value = raster_data["no_data_value"]
    if values is None or np.all(values == no_data_value):
        # nothing to aggregate: skip the rest
        result[result_column] = 0 if extensive else np.nan
        return {"features": result, "projection": req_srs}

    depth, height, width = values.shape
    pixel_size = process_kwargs["pixel_size"]
    actual_pixel_size = process_kwargs["actual_pixel_size"]

    # process in groups of disjoint subsets of the features
    agg = np.full((depth, len(features)), np.nan, dtype="f4")
    for select in bucketize(features.bounds.values):
        index = features.index[select]
        rasterize_result = utils.rasterize_geoseries(
            agg_geometries.iloc[select],
            process_kwargs["agg_bbox"],
            agg_srs,
            height,
            width,
            values=index,
        )
        labels = rasterize_result["values"][0]

        # if there is a threshold, generate a raster with thresholds
        if threshold_name:
            # reindex (unlike .loc) yields NaN for labels that are not in
            # the feature index, i.e. pixels outside every geometry. The
            # masking below relies on that NaN.
            thresholds = (
                features[threshold_name]
                .reindex(labels.ravel())
                .values.reshape(labels.shape)
            )
        else:
            thresholds = None

        for frame_no, frame in enumerate(values):
            # limit statistics to active pixels
            active = frame != no_data_value
            # if there is a threshold, mask the frame
            if threshold_name:
                valid = ~np.isnan(thresholds)  # to suppress warnings
                active[~valid] = False  # no threshold -> no aggregation
                active[valid] &= frame[valid] >= thresholds[valid]

            # if there is no single active value: do not aggregate
            if not active.any():
                continue

            with warnings.catch_warnings():
                # we may get divide by 0 if any geometry does not contain
                # any 'active' values
                warnings.simplefilter("ignore")
                agg[frame_no][select] = agg_func(
                    1 if statistic == "count" else frame[active],
                    labels=labels[active],
                    index=index,
                )

    if extensive:  # sum and count
        agg[~np.isfinite(agg)] = 0
        # extensive aggregations have to be scaled
        if actual_pixel_size != pixel_size:
            agg *= (actual_pixel_size / pixel_size) ** 2
    else:
        agg[~np.isfinite(agg)] = np.nan  # replaces inf by nan

    if depth == 1:
        result[result_column] = agg[0]
    else:
        # store an array in a dataframe cell: set each cell with [np.array]
        result[result_column] = [[x] for x in agg.T]

    return {"features": result, "projection": req_srs}