Python parse_percentile_statistic 예제들, dask_geomodeling.utils.parse_percentile_statistic Python 예제들

예제 #1

0

파일 보기

파일: temporal.py 프로젝트: mdkrol/dask-geomodeling

 def __init__(self,
              source,
              statistic="sum",
              frequency=None,
              timezone="UTC"):
     """Validate and normalise the arguments, then delegate to the parent.

     ``source`` has to be a RasterBlock and ``statistic`` a str. A statistic
     for which ``parse_percentile_statistic`` returns a truthy value is
     rewritten to ``"p<value>"``; any other statistic has to be a key of
     ``self.STATISTICS``. When ``frequency`` is given it is normalised via
     ``to_offset(...).freqstr`` and ``timezone`` via
     ``pytz.timezone(...).zone``; without a frequency the timezone is
     replaced by None.

     Raises:
       TypeError: if source, statistic, frequency or timezone has the
         wrong type.
       ValueError: if the statistic is not recognised.
     """
     not_allowed = "'{}' object is not allowed."

     if not isinstance(source, RasterBlock):
         raise TypeError(not_allowed.format(type(source)))
     if not isinstance(statistic, str):
         raise TypeError(not_allowed.format(type(statistic)))

     # NOTE(review): any falsy parse result (None, and presumably also a
     # percentile of 0) falls through to the STATISTICS lookup -- confirm
     # that rejecting "p0" here is intended.
     parsed = parse_percentile_statistic(statistic)
     if parsed:
         statistic = "p{0}".format(parsed)
     elif statistic not in self.STATISTICS:
         raise ValueError("Unknown statistic '{}'".format(statistic))

     if frequency is None:
         # without a frequency the supplied timezone is discarded
         timezone = None
     else:
         if not isinstance(frequency, str):
             raise TypeError(not_allowed.format(type(frequency)))
         frequency = to_offset(frequency).freqstr
         if not isinstance(timezone, str):
             raise TypeError(not_allowed.format(type(timezone)))
         timezone = pytz.timezone(timezone).zone

     super().__init__(source, statistic, frequency, timezone)

예제 #2

0

파일 보기

    def __init__(
        self,
        source,
        raster,
        statistic="sum",
        projection=None,
        pixel_size=None,
        max_pixels=None,
        column_name="agg",
        auto_pixel_size=False,
        *args
    ):
        """Validate and normalise the arguments, then delegate to the parent.

        Type checks are done in this order: ``source`` (GeometryBlock),
        ``raster`` (RasterBlock), ``statistic`` (str). The statistic is
        lowercased; if ``utils.parse_percentile_statistic`` returns a truthy
        value it is rewritten to ``"p<value>"``, otherwise it must be a key
        of ``self.STATISTICS`` other than the literal ``"percentile"``.

        ``projection`` defaults to ``raster.projection``. ``pixel_size``
        defaults to the smaller absolute value of elements 1 and 5 of
        ``raster.geo_transform``; a given pixel_size is converted to a
        non-negative float. ``max_pixels`` is converted with ``int`` when it
        is not None.

        Raises:
          TypeError: if source, raster, statistic, projection or
            auto_pixel_size has the wrong type.
          ValueError: if the statistic is unknown, if no pixel_size is given
            and the raster has no geo_transform, or if the pixel size is 0.
        """
        not_allowed = "'{}' object is not allowed"

        type_checks = (
            (source, GeometryBlock),
            (raster, RasterBlock),
            (statistic, str),
        )
        for value, expected_type in type_checks:
            if not isinstance(value, expected_type):
                raise TypeError(not_allowed.format(type(value)))

        statistic = statistic.lower()
        # NOTE(review): a falsy parse result (None, presumably also 0) is
        # handled as "not a percentile" -- confirm that is intended.
        parsed = utils.parse_percentile_statistic(statistic)
        if parsed:
            statistic = "p{0}".format(parsed)
        elif statistic == "percentile" or statistic not in self.STATISTICS:
            raise ValueError("Unknown statistic '{}'".format(statistic))

        if projection is None:
            projection = raster.projection
        if not isinstance(projection, str):
            raise TypeError(not_allowed.format(type(projection)))

        if pixel_size is not None:
            pixel_size = abs(float(pixel_size))
        else:
            # fall back on the pixel size found in the raster geo_transform
            geo_transform = raster.geo_transform
            if geo_transform is None:
                raise ValueError(
                    "Cannot get the pixel_size from the source "
                    "raster. Please provide a pixel_size."
                )
            candidates = [abs(float(geo_transform[i])) for i in (1, 5)]
            pixel_size = min(candidates)
        if pixel_size == 0.0:
            raise ValueError("Pixel size cannot be 0")

        if max_pixels is not None:
            max_pixels = int(max_pixels)
        if not isinstance(auto_pixel_size, bool):
            raise TypeError(not_allowed.format(type(auto_pixel_size)))

        super(AggregateRaster, self).__init__(
            source,
            raster,
            statistic,
            projection,
            pixel_size,
            max_pixels,
            column_name,
            auto_pixel_size,
            *args
        )

예제 #3

0

파일 보기

파일: temporal.py 프로젝트: mdkrol/dask-geomodeling

 def __init__(
     self,
     source,
     frequency,
     statistic="sum",
     closed=None,
     label=None,
     timezone="UTC",
 ):
     """Validate and normalise the arguments, then delegate to the parent.

     ``source`` has to be a RasterBlock. When ``frequency`` is not None it
     must be a str and is normalised via ``to_offset(...).freqstr``;
     ``closed`` and ``label`` must each be None, "left" or "right"; and
     ``timezone`` must be a str, normalised via
     ``pytz.timezone(...).zone``. When ``frequency`` is None, ``closed``,
     ``label`` and ``timezone`` are all replaced by None.

     ``statistic`` has to be a str. If ``parse_percentile_statistic``
     returns a truthy value for it, it is rewritten to ``"p<value>"``;
     otherwise it must be a key of ``self.STATISTICS``.

     Raises:
       TypeError: if source, frequency, timezone or statistic has the
         wrong type.
       ValueError: if closed, label or the statistic is not recognised.
     """
     not_allowed = "'{}' object is not allowed."
     allowed_sides = {None, "left", "right"}

     if not isinstance(source, RasterBlock):
         raise TypeError(not_allowed.format(type(source)))

     if frequency is None:
         # without a frequency the resampling options are discarded
         closed = label = timezone = None
     else:
         if not isinstance(frequency, str):
             raise TypeError(not_allowed.format(type(frequency)))
         frequency = to_offset(frequency).freqstr
         if closed not in allowed_sides:
             raise ValueError("closed must be None, 'left', or 'right'.")
         if label not in allowed_sides:
             raise ValueError("label must be None, 'left', or 'right'.")
         if not isinstance(timezone, str):
             raise TypeError(not_allowed.format(type(timezone)))
         timezone = pytz.timezone(timezone).zone

     if not isinstance(statistic, str):
         raise TypeError(not_allowed.format(type(statistic)))
     # NOTE(review): a falsy parse result (None, presumably also 0) is
     # handled as "not a percentile" -- confirm that is intended.
     parsed = parse_percentile_statistic(statistic)
     if parsed:
         statistic = "p{0}".format(parsed)
     elif statistic not in self.STATISTICS:
         raise ValueError("Unknown statistic '{}'".format(statistic))

     super(TemporalAggregate, self).__init__(source, frequency, statistic,
                                             closed, label, timezone)

예제 #4

0

파일 보기

파일: temporal.py 프로젝트: mdkrol/dask-geomodeling

    def process(process_kwargs, time_data=None, data=None):
        """Compute the cumulative result for a single request.

        Args:
          process_kwargs (dict): read keys are "mode", optionally "empty",
            "start", "stop", "frequency", "timezone", "closed", "label" and,
            for mode "vals", "statistic" and "dtype".
          time_data (dict, optional): expected to contain "time", the
            timestamps of the source frames.
          data (dict, optional): expected to contain "meta" (mode "meta") or
            "values" and "no_data_value" (mode "vals").

        Returns:
          - mode "time": ``time_data`` unchanged.
          - mode "meta": ``{"meta": [...]}`` with, per emitted frame, the
            list of source meta entries accumulated so far within its bin.
          - mode "vals": ``{"values": ndarray, "no_data_value": ...}`` or
            None when there is nothing to compute.

        Raises:
          RuntimeError: if the number of frames in ``data["values"]`` does
            not equal the number of timestamps.
        """
        mode = process_kwargs["mode"]
        # handle empty data
        if process_kwargs.get("empty"):
            return None if mode == "vals" else {mode: []}
        if mode == "time":
            return time_data
        if time_data is None or not time_data.get("time"):
            return None if mode == "vals" else {mode: []}

        start = process_kwargs["start"]
        stop = process_kwargs["stop"]
        frequency = process_kwargs["frequency"]
        timezone = process_kwargs["timezone"]
        closed = process_kwargs["closed"]
        label = process_kwargs["label"]
        # a value-less Series: only its (localized) index is used below
        # NOTE(review): the input timestamps are localized as UTC here, so
        # they are presumably naive UTC datetimes -- confirm against caller
        times = (pd.Series(
            index=time_data["time"]).tz_localize("UTC").tz_convert(timezone))

        if frequency is None:
            # the first (and only label) will be the statistic of all frames
            indices = {None: range(len(times))}
        else:
            # construct a pandas Resampler object to map labels to frames
            resampler = times.resample(frequency, closed=closed, label=label)
            # get the frame indices belonging to each bin
            indices = resampler.indices

        start_ts = _dt_to_ts(start, timezone)
        stop_ts = _dt_to_ts(stop, timezone)

        if mode == "meta":
            if data is None or "meta" not in data:
                return {"meta": []}
            meta = data["meta"]
            result = []
            for indices_in_bin in indices.values():  # [0, 1], [2, 3], ...
                # emit one entry per frame: the metadata of all frames in
                # this bin up to and including that frame
                for length in range(1, len(indices_in_bin) + 1):
                    indices_for_cumulative = indices_in_bin[:length]
                    ts = times.index[indices_for_cumulative[-1]]
                    # skip frames that fall outside of [start, stop]
                    if ts < start_ts or (stop_ts is not None and ts > stop_ts):
                        continue
                    result.append([meta[i] for i in indices_for_cumulative])
            return {"meta": result}

        # mode == 'vals'
        if data is None or "values" not in data:
            return

        values = data["values"]
        if values.shape[0] != len(times):
            raise RuntimeError(
                "Shape of raster does not match number of timestamps")
        statistic = process_kwargs["statistic"]
        # NOTE(review): a falsy parse result (None, presumably also 0) takes
        # the STATISTICS branch -- confirm that "p0" cannot reach this point
        percentile = parse_percentile_statistic(statistic)
        if percentile:
            extensive = False
            agg_func = partial(np.nanpercentile, q=percentile)
        else:
            extensive = Cumulative.STATISTICS[statistic]["extensive"]
            agg_func = Cumulative.STATISTICS[statistic]["func"]

        dtype = process_kwargs["dtype"]
        # extensive statistics are filled with 0, others with the dtype max
        fillvalue = 0 if extensive else get_dtype_max(dtype)

        # cast to at least float32 so that we can fit in NaN (and make copy)
        values = values.astype(np.result_type(np.float32, dtype))
        # put NaN for no data
        values[data["values"] == data["no_data_value"]] = np.nan

        # NOTE(review): unlike the "meta" branch, stop_ts is not guarded
        # against None here -- confirm that stop is always set for "vals"
        output_mask = (times.index >= start_ts) & (times.index <= stop_ts)
        # index of the first frame inside [start, stop]
        # NOTE(review): raises IndexError when no frame is inside the range
        output_offset = np.where(output_mask)[0][0]
        n_frames = output_mask.sum()
        result = np.full(
            shape=(n_frames, values.shape[1], values.shape[2]),
            fill_value=fillvalue,
            dtype=dtype,
        )

        for indices_in_bin in indices.values():
            mask = output_mask[indices_in_bin]
            # NOTE: this rebinds the 'data' parameter; the input dict is not
            # read anymore after this point
            data = values[indices_in_bin]
            # the result is indexed with a per-frame mask, so agg_func is
            # presumably cumulative (one output frame per input frame)
            accumulated = agg_func(data, axis=0)[mask]
            # keep track of NaN or inf values before casting to target dtype
            no_data_mask = ~np.isfinite(accumulated)
            # cast to target dtype
            if dtype != accumulated.dtype:
                accumulated = accumulated.astype(dtype)
            # set fillvalue to NaN values
            accumulated[no_data_mask] = fillvalue
            # shift the frame indices so that they index into 'result'
            indices_in_result = np.array(indices_in_bin)[mask] - output_offset
            result[indices_in_result] = accumulated

        return {"values": result, "no_data_value": get_dtype_max(dtype)}

예제 #5

0

파일 보기

파일: temporal.py 프로젝트: mdkrol/dask-geomodeling

    def process(process_kwargs, time_data=None, data=None):
        """Compute the temporally aggregated result for a single request.

        Args:
          process_kwargs (dict): read keys are "mode", optionally "empty",
            "start", "stop", "frequency" and, beyond mode "time",
            "timezone", "closed", "label"; mode "vals" additionally reads
            "statistic" and "dtype".
          time_data (dict, optional): expected to contain "time", the
            timestamps of the source frames.
          data (dict, optional): expected to contain "meta" (mode "meta") or
            "values" and "no_data_value" (mode "vals").

        Returns:
          - mode "time": ``{"time": [...]}`` with one datetime per label.
          - mode "meta": ``{"meta": [...]}`` with, per label, the list of
            source meta entries that fall in that bin.
          - mode "vals": ``{"values": ndarray, "no_data_value": ...}`` with
            one frame per label, or None when there is nothing to compute.

        Raises:
          RuntimeError: if the number of frames in ``data["values"]`` does
            not equal the number of timestamps.
        """
        mode = process_kwargs["mode"]
        # handle empty data
        if process_kwargs.get("empty"):
            return None if mode == "vals" else {mode: []}
        start = process_kwargs["start"]
        stop = process_kwargs["stop"]
        frequency = process_kwargs["frequency"]
        if frequency is None:
            labels = pd.DatetimeIndex([start])
        else:
            # without a stop, only the label at 'start' is generated
            labels = pd.date_range(start, stop or start, freq=frequency)
        if mode == "time":
            return {"time": labels.to_pydatetime().tolist()}

        if time_data is None or not time_data.get("time"):
            return None if mode == "vals" else {mode: []}

        timezone = process_kwargs["timezone"]
        closed = process_kwargs["closed"]
        label = process_kwargs["label"]
        times = time_data["time"]

        # convert times to a pandas series; only its index is used. The
        # dtype is given explicitly because constructing a value-less Series
        # without one emits a DeprecationWarning in pandas 1.x and yields a
        # different dtype in later versions.
        series = (
            pd.Series(index=times, dtype=float)
            .tz_localize("UTC")
            .tz_convert(timezone)
        )

        # localize the labels so we can use it as an index
        labels = labels.tz_localize("UTC").tz_convert(timezone)

        if frequency is None:
            # the first (and only label) will be the statistic of all frames
            indices = {labels[0]: range(len(times))}
        else:
            # construct a pandas Resampler object to map labels to frames
            resampler = series.resample(frequency, closed=closed, label=label)
            # get the frame indices belonging to each bin
            indices = resampler.indices

        if mode == "meta":
            if data is None or "meta" not in data:
                return {"meta": []}
            meta = data["meta"]
            return {"meta": [[meta[i] for i in indices[ts]] for ts in labels]}

        # mode == 'vals'
        if data is None or "values" not in data:
            return

        values = data["values"]
        if values.shape[0] != len(times):
            raise RuntimeError(
                "Shape of raster does not match number of timestamps")
        statistic = process_kwargs["statistic"]
        percentile = parse_percentile_statistic(statistic)
        # compare against None (not truthiness) so that a parsed percentile
        # of 0 is not mistaken for "no percentile"
        if percentile is not None:
            extensive = False
            agg_func = partial(np.nanpercentile, q=percentile)
        else:
            extensive = TemporalAggregate.STATISTICS[statistic]["extensive"]
            agg_func = TemporalAggregate.STATISTICS[statistic]["func"]

        dtype = process_kwargs["dtype"]
        # extensive statistics are filled with 0, others with the dtype max
        fillvalue = 0 if extensive else get_dtype_max(dtype)

        # cast to at least float32 so that we can fit in NaN (and make copy)
        values = values.astype(np.result_type(np.float32, dtype))
        # put NaN for no data
        values[data["values"] == data["no_data_value"]] = np.nan

        result = np.full(
            shape=(len(labels), values.shape[1], values.shape[2]),
            fill_value=fillvalue,
            dtype=dtype,
        )

        for i, timestamp in enumerate(labels):
            inds = indices[timestamp]
            # labels without any source frame keep the fill value
            if len(inds) == 0:
                continue
            aggregated = agg_func(values[inds], axis=0)
            # keep track of NaN or inf values before casting to target dtype
            no_data_mask = ~np.isfinite(aggregated)
            # cast to target dtype
            if dtype != aggregated.dtype:
                aggregated = aggregated.astype(dtype)
            # set fillvalue to NaN values
            aggregated[no_data_mask] = fillvalue
            result[i] = aggregated

        return {"values": result, "no_data_value": get_dtype_max(dtype)}

예제 #6

0

파일 보기

파일: reduction.py 프로젝트: nens/dask-geomodeling

def reduce_rasters(stack, statistic, no_data_value=None, dtype=None):
    """Apply a statistic (e.g. "mean") to a stack of rasters, skipping
    'no data' values.

    In this context, reduce means that the dimensionality of the input data
    is reduced by one.

    Args:
      stack (list): a list of dicts containing "values" (ndarray)
        and "no_data_value". The output takes its shape from the first
        element; all ndarrays are expected to have that same shape.
      statistic (str): the applied statistic (no data is ignored). One of:
        {"last", "first", "count", "sum", "mean", "min",
        "max", "argmin", "argmax", "product", "std", "var", "p<number>"}
      no_data_value (number, optional): the 'no data' value in the output
        array. Defaults to the no data value of the first element in the stack.
      dtype (str or dtype): the datatype of the output array. Defaults to the
        dtype of the first element in the stack.

    Returns:
      dict with "values" and "no_data_value"

    Raises:
      KeyError: if the statistic is not in STATISTICS and is not parsed as
        a percentile.
      ValueError: if the stack has zero length.

    NOTE(review): there is no explicit check in this function for ndarrays
    of unequal shape or for values that cannot be cast to ``dtype``; any
    error in those cases would come from numpy -- confirm the exact type.
    """
    if statistic not in STATISTICS:
        # 'percentile' is only bound on this path; it is read again further
        # down when statistic == "percentile"
        percentile = parse_percentile_statistic(statistic)
        if percentile is None:
            raise KeyError('Unknown statistic "{}"'.format(statistic))
        else:
            statistic = "percentile"

    if len(stack) == 0:
        raise ValueError("Cannot reduce a zero-length stack")

    # get the output array properties (dtype, no_data_value, shape)
    if dtype is None:
        dtype = stack[0]["values"].dtype
    if no_data_value is None:
        no_data_value = stack[0]["no_data_value"]
    shape = stack[0]["values"].shape

    # sum, count and nans output do not contain no data: fill zeroes right away
    if statistic in {"sum", "count", "nans"}:
        fill_value = 0
    else:
        fill_value = no_data_value

    # create the output array
    out = np.full(shape, fill_value, dtype)

    if statistic == "last":
        # populate 'out' with the last value that is not 'no data'
        for data in stack:
            index = get_index(data["values"], data["no_data_value"])
            out[index] = data["values"][index]
    elif statistic == "first":
        # populate 'out' with the first value that is not 'no data'
        # (iterating in reverse lets earlier elements overwrite later ones)
        for data in stack[::-1]:
            index = get_index(data["values"], data["no_data_value"])
            out[index] = data["values"][index]
    elif statistic == "count":
        # count the number of values that are not 'no data'
        for data in stack:
            out += get_index(data["values"], data["no_data_value"])
    else:
        if statistic == "percentile":
            func = partial(np.nanpercentile, q=percentile)
        else:
            func = STATISTICS[statistic]
        # transform 'no data' into 'nan' to be able to use numpy functions
        # NB: the dtype is at least float16 to accommodate NaN
        stack_array = np.full((len(stack), ) + shape, np.nan,
                              np.result_type(dtype, np.float16))
        for i, data in enumerate(stack):
            index = get_index(data["values"], data["no_data_value"])
            stack_array[i, index] = data["values"][index]

        # protect against all-NaN slice warnings and errors
        not_all_nan = ~np.all(np.isnan(stack_array), axis=0)

        # perform the math
        # (cells that are NaN in every layer keep the initial fill value)
        out[not_all_nan] = func(stack_array[:, not_all_nan], axis=0)

    return {"values": out, "no_data_value": no_data_value}

예제 #7

0

파일 보기

파일: reduction.py 프로젝트: nens/dask-geomodeling

def check_statistic(statistic):
    """Raise a ValueError unless ``statistic`` is recognised.

    A statistic is accepted when it is a key of STATISTICS, or when
    parse_percentile_statistic returns something other than None for it.
    """
    if statistic in STATISTICS:
        return
    if parse_percentile_statistic(statistic) is None:
        raise ValueError('Unknown statistic "{}"'.format(statistic))

예제 #8

0

파일 보기

파일: aggregate.py 프로젝트: Geostatistic/dask-geomodeling

    def process(geom_data, raster_data, process_kwargs):
        """Aggregate raster values per feature and store them in a column.

        Args:
          geom_data (dict): expected to contain "features" (indexed with
            "geometry", ``.bounds`` and ``.copy()``, so presumably a
            GeoDataFrame).
          raster_data (dict or None): expected to contain "values" (unpacked
            as depth, height, width) and "no_data_value".
          process_kwargs (dict): read keys are "mode", optionally "empty"
            and "threshold_name", "projection" (only when empty), "req_srs",
            "agg_srs", "statistic", "result_column", "pixel_size",
            "actual_pixel_size" and "agg_bbox".

        Returns:
          dict with "features" and "projection". ``geom_data`` is returned
          unchanged for mode "extent" or when there are no features.
        """
        if process_kwargs.get("empty"):
            return {
                "features": gpd.GeoDataFrame([]),
                "projection": process_kwargs["projection"],
            }
        elif process_kwargs["mode"] == "extent":
            return geom_data

        features = geom_data["features"]
        if len(features) == 0:
            return geom_data

        result = features.copy()

        # transform the features into the aggregation projection
        req_srs = process_kwargs["req_srs"]
        agg_srs = process_kwargs["agg_srs"]

        agg_geometries = utils.geoseries_transform(features["geometry"],
                                                   req_srs, agg_srs)

        statistic = process_kwargs["statistic"]
        # NOTE(review): a falsy parse result (None, presumably also 0) takes
        # the STATISTICS[statistic] branch -- confirm "p0" cannot reach here
        percentile = utils.parse_percentile_statistic(statistic)
        if percentile:
            statistic = "percentile"
            agg_func = partial(AggregateRaster.STATISTICS[statistic]["func"],
                               qval=percentile)
        else:
            agg_func = AggregateRaster.STATISTICS[statistic]["func"]

        extensive = AggregateRaster.STATISTICS[statistic]["extensive"]
        result_column = process_kwargs["result_column"]

        # this is only there for the AggregateRasterAboveThreshold
        threshold_name = process_kwargs.get("threshold_name")
        if threshold_name:
            # get the threshold, appending NaN for unlabeled pixels
            threshold_values = np.empty((len(features) + 1, ), dtype="f4")
            threshold_values[:-1] = features[threshold_name].values
            threshold_values[-1] = np.nan
        else:
            threshold_values = None

        # investigate the raster data
        if raster_data is None:
            values = no_data_value = None
        else:
            values = raster_data["values"]
            no_data_value = raster_data["no_data_value"]
        if values is None or np.all(values == no_data_value):  # skip the rest
            result[result_column] = 0 if extensive else np.nan
            return {"features": result, "projection": req_srs}
        depth, height, width = values.shape

        pixel_size = process_kwargs["pixel_size"]
        actual_pixel_size = process_kwargs["actual_pixel_size"]

        # process in groups of disjoint subsets of the features
        # 'agg' holds one row per raster frame and one column per feature
        agg = np.full((depth, len(features)), np.nan, dtype="f4")
        for select in bucketize(features.bounds.values):
            # rasterize the positional feature indices so that every pixel
            # is labeled with the feature that covers it
            rasterize_result = utils.rasterize_geoseries(
                agg_geometries.iloc[select],
                process_kwargs["agg_bbox"],
                agg_srs,
                height,
                width,
                values=np.asarray(select, dtype=np.int32),  # GDAL needs int32
            )
            labels = rasterize_result["values"][0]

            # if there is a threshold, generate a raster with thresholds
            if threshold_name:
                # mode="clip" ensures that unlabeled cells use the appended NaN
                thresholds = np.take(threshold_values, labels, mode="clip")
            else:
                thresholds = None

            for frame_no, frame in enumerate(values):
                # limit statistics to active pixels
                active = frame != no_data_value
                # if there is a threshold, mask the frame
                if threshold_name:
                    valid = ~np.isnan(thresholds)  # to suppress warnings
                    active[~valid] = False  # no threshold -> no aggregation
                    active[valid] &= frame[valid] >= thresholds[valid]

                # if there is no single active value: do not aggregate
                if not active.any():
                    continue

                # select features that actually have data
                # (min, max, median, and percentile cannot handle it otherwise)
                active_labels = labels[active]
                select_and_active = list(
                    set(np.unique(active_labels)) & set(select))

                if not select_and_active:
                    continue

                # for "count" a constant 1 is passed instead of the values
                agg[frame_no][select_and_active] = agg_func(
                    1 if statistic == "count" else frame[active],
                    labels=active_labels,
                    index=select_and_active,
                )

        if extensive:  # sum and count
            agg[~np.isfinite(agg)] = 0
            # extensive aggregations have to be scaled
            if actual_pixel_size != pixel_size:
                agg *= (actual_pixel_size / pixel_size)**2
        else:
            agg[~np.isfinite(agg)] = np.nan  # replaces inf by nan

        if depth == 1:
            result[result_column] = agg[0]
        else:
            # store an array in a dataframe cell: set each cell with [np.array]
            result[result_column] = [[x] for x in agg.T]

        return {"features": result, "projection": req_srs}

예제 #9

0

파일 보기

파일: aggregate.py 프로젝트: ivarlokhorst/dask-geomodeling

    def process(geom_data, raster_data, process_kwargs):
        """Aggregate raster values per feature and store them in a column.

        Args:
          geom_data (dict): expected to contain "features" (indexed with
            "geometry", ``.bounds``, ``.index`` and ``.loc``, so presumably
            a GeoDataFrame).
          raster_data (dict or None): expected to contain "values" (unpacked
            as depth, height, width) and "no_data_value".
          process_kwargs (dict): read keys are "mode", optionally "empty"
            and "threshold_name", "projection" (only when empty), "req_srs",
            "agg_srs", "statistic", "result_column", "pixel_size",
            "actual_pixel_size" and "agg_bbox".

        Returns:
          dict with "features" and "projection". ``geom_data`` is returned
          unchanged for mode "extent" or when there are no features.
        """
        if process_kwargs.get("empty"):
            return {
                "features": gpd.GeoDataFrame([]),
                "projection": process_kwargs["projection"],
            }
        elif process_kwargs["mode"] == "extent":
            return geom_data

        features = geom_data["features"]
        if len(features) == 0:
            return geom_data

        result = features.copy()

        # transform the features into the aggregation projection
        req_srs = process_kwargs["req_srs"]
        agg_srs = process_kwargs["agg_srs"]

        agg_geometries = utils.geoseries_transform(
            features["geometry"],
            req_srs,
            agg_srs,
        )

        statistic = process_kwargs["statistic"]
        # NOTE(review): a falsy parse result (None, presumably also 0) takes
        # the STATISTICS[statistic] branch -- confirm "p0" cannot reach here
        percentile = utils.parse_percentile_statistic(statistic)
        if percentile:
            statistic = "percentile"
            agg_func = partial(AggregateRaster.STATISTICS[statistic]["func"],
                               qval=percentile)
        else:
            agg_func = AggregateRaster.STATISTICS[statistic]["func"]

        extensive = AggregateRaster.STATISTICS[statistic]["extensive"]
        result_column = process_kwargs["result_column"]

        # this is only there for the AggregateRasterAboveThreshold
        threshold_name = process_kwargs.get("threshold_name")

        # investigate the raster data
        if raster_data is None:
            values = no_data_value = None
        else:
            values = raster_data["values"]
            no_data_value = raster_data["no_data_value"]
        if values is None or np.all(values == no_data_value):  # skip the rest
            result[result_column] = 0 if extensive else np.nan
            return {"features": result, "projection": req_srs}
        depth, height, width = values.shape

        pixel_size = process_kwargs["pixel_size"]
        actual_pixel_size = process_kwargs["actual_pixel_size"]

        # process in groups of disjoint subsets of the features
        # 'agg' holds one row per raster frame and one column per feature
        agg = np.full((depth, len(features)), np.nan, dtype="f4")
        for select in bucketize(features.bounds.values):
            agg_geometries_bucket = agg_geometries.iloc[select]
            # the dataframe index values of this bucket are used as labels
            index = features.index[select]

            rasterize_result = utils.rasterize_geoseries(
                agg_geometries_bucket,
                process_kwargs["agg_bbox"],
                agg_srs,
                height,
                width,
                values=index,
            )
            labels = rasterize_result["values"][0]

            # if there is a threshold, generate a raster with thresholds
            if threshold_name:
                # look up the threshold of every pixel through its label
                # NOTE(review): how pixels without a feature are labeled
                # (and whether .loc can resolve that label) is not visible
                # here -- confirm against rasterize_geoseries
                thresholds = features.loc[labels.ravel(),
                                          threshold_name].values.reshape(
                                              labels.shape)
            else:
                thresholds = None

            for frame_no, frame in enumerate(values):
                # limit statistics to active pixels
                active = frame != no_data_value
                # if there is a threshold, mask the frame
                if threshold_name:
                    valid = ~np.isnan(thresholds)  # to suppress warnings
                    active[~valid] = False  # no threshold -> no aggregation
                    active[valid] &= frame[valid] >= thresholds[valid]

                # if there is no single active value: do not aggregate
                if not active.any():
                    continue

                with warnings.catch_warnings():
                    # we may get divide by 0 if any geometry does not contain
                    # any 'active' values
                    warnings.simplefilter("ignore")
                    # for "count" a constant 1 is passed instead of the values
                    agg[frame_no][select] = agg_func(
                        1 if statistic == "count" else frame[active],
                        labels=labels[active],
                        index=index,
                    )

        if extensive:  # sum and count
            agg[~np.isfinite(agg)] = 0
            # extensive aggregations have to be scaled
            if actual_pixel_size != pixel_size:
                agg *= (actual_pixel_size / pixel_size)**2
        else:
            agg[~np.isfinite(agg)] = np.nan  # replaces inf by nan

        if depth == 1:
            result[result_column] = agg[0]
        else:
            # store an array in a dataframe cell: set each cell with [np.array]
            result[result_column] = [[x] for x in agg.T]

        return {"features": result, "projection": req_srs}