示例#1
0
def geohash_encode(
    df: DataFrame,
    geohash: str,
    longitude: str,
    latitude: str,
) -> DataFrame:
    """
    Encode longitude and latitude into geohash

    :param df: DataFrame containing longitude and latitude data
    :param geohash: Name of new column to be created containing geohash location.
    :param longitude: Name of source column containing longitude.
    :param latitude: Name of source column containing latitude.
    :return: DataFrame produced by `_append_columns`, with the encoded geohash
             values mapped to the column named by `geohash`
    :raises QueryObjectValidationError: If a ValueError is raised while encoding
    """
    try:
        # Take an explicit copy: the selection is renamed and extended below,
        # and writing to a selection of `df` directly makes pandas emit a
        # SettingWithCopyWarning.
        encode_df = df[[latitude, longitude]].copy()
        # Normalise the column names so the lambda does not depend on the
        # caller-supplied names.
        encode_df.columns = ["latitude", "longitude"]
        encode_df["geohash"] = encode_df.apply(
            lambda row: geohash_lib.encode(row["latitude"], row["longitude"]),
            axis=1,
        )
        return _append_columns(df, encode_df, {"geohash": geohash})
    except ValueError as ex:
        raise QueryObjectValidationError(
            _("Invalid longitude/latitude")) from ex
示例#2
0
def cum(
    df: DataFrame,
    operator: str,
    columns: Dict[str, str],
) -> DataFrame:
    """
    Apply a cumulative sum/product/min/max to the selected columns.

    :param df: DataFrame on which the cumulative operation will be based.
    :param operator: cumulative operator, e.g. `sum`, `prod`, `min`, `max`. It is
           prefixed with `cum` to obtain the DataFrame method name.
    :param columns: mapping of source column to target column. For instance,
           `{'y': 'y'}` replaces column `y` with its cumulative value, while
           `{'y': 'y2'}` adds a column `y2` holding the cumulative values of `y`
           and leaves `y` unchanged.
    :return: DataFrame with cumulated columns
    :raises InvalidPostProcessingError: If the resulting method name is not in
            `ALLOWLIST_CUMULATIVE_FUNCTIONS` or is not an attribute of the
            selected frame
    """
    columns = columns or {}
    selected = df.loc[:, columns.keys()]
    method_name = "cum" + operator

    # Only allow-listed method names that actually exist on the frame may be
    # looked up dynamically.
    is_allowed = method_name in ALLOWLIST_CUMULATIVE_FUNCTIONS
    if not (is_allowed and hasattr(selected, method_name)):
        raise InvalidPostProcessingError(
            _("Invalid cumulative operator: %(operator)s", operator=operator))

    cumulated = getattr(selected, method_name)()
    return _append_columns(df, cumulated, columns)
示例#3
0
def cum(
    df: DataFrame,
    operator: str,
    columns: Optional[Dict[str, str]] = None,
    is_pivot_df: bool = False,
) -> DataFrame:
    """
    Apply a cumulative sum/product/min/max to the selected columns.

    :param df: DataFrame on which the cumulative operation will be based.
    :param operator: cumulative operator, e.g. `sum`, `prod`, `min`, `max`. It is
           prefixed with `cum` to obtain the DataFrame method name.
    :param columns: mapping of source column to target column. For instance,
           `{'y': 'y'}` replaces column `y` with its cumulative value, while
           `{'y': 'y2'}` adds a column `y2` holding the cumulative values of `y`
           and leaves `y` unchanged. Not used to select columns when
           `is_pivot_df` is true.
    :param is_pivot_df: Dataframe is pivoted or not. When true the operation runs
           on the whole frame and the resulting columns are flattened.
    :return: DataFrame with cumulated columns
    :raises QueryObjectValidationError: If the resulting method name is not in
            `ALLOWLIST_CUMULATIVE_FUNCTIONS` or is not an attribute of the frame
    """
    columns = columns or {}
    source = df if is_pivot_df else df[columns.keys()]
    method_name = "cum" + operator

    # Only allow-listed method names that actually exist on the frame may be
    # looked up dynamically.
    is_allowed = method_name in ALLOWLIST_CUMULATIVE_FUNCTIONS
    if not (is_allowed and hasattr(source, method_name)):
        raise QueryObjectValidationError(
            _("Invalid cumulative operator: %(operator)s", operator=operator))

    cumulated = getattr(source, method_name)()
    if not is_pivot_df:
        return _append_columns(df, cumulated, columns)

    # Pivoted input: rebuild flat column labels from the first column level and
    # move index level 0 back into a regular column.
    top_level_labels = df.columns.get_level_values(
        0).drop_duplicates().to_list()
    agg: Dict[str, Dict[str, Any]] = {
        label: {} for label in top_level_labels
    }
    flattened = []
    for col in cumulated.columns:
        flattened.append(_flatten_column_after_pivot(col, agg))
    cumulated.columns = flattened
    cumulated.reset_index(level=0, inplace=True)
    return cumulated
示例#4
0
def geohash_decode(df: DataFrame, geohash: str, longitude: str,
                   latitude: str) -> DataFrame:
    """
    Decode a geohash column into longitude and latitude

    :param df: DataFrame containing geohash data
    :param geohash: Name of source column containing geohash location.
    :param longitude: Name of new column to be created containing longitude.
    :param latitude: Name of new column to be created containing latitude.
    :return: DataFrame with decoded longitudes and latitudes
    :raises QueryObjectValidationError: If a ValueError is raised while decoding
    """
    try:
        # Each decoded value is unpacked as a (latitude, longitude) pair.
        decoded = df[geohash].apply(geohash_lib.decode)
        # Build the frame in one step instead of unpacking `zip(*decoded)`:
        # with an empty input the unpacking raised ValueError, which was then
        # misreported as an invalid geohash. Reusing `df.index` keeps the
        # decoded rows labelled like their source rows, so a helper that aligns
        # on index labels cannot mismatch them.
        lonlat_df = DataFrame(
            decoded.tolist(),
            columns=["latitude", "longitude"],
            index=df.index,
        )
        return _append_columns(df, lonlat_df, {
            "latitude": latitude,
            "longitude": longitude
        })
    except ValueError as ex:
        raise QueryObjectValidationError(_("Invalid geohash string")) from ex
示例#5
0
def geodetic_parse(
    df: DataFrame,
    geodetic: str,
    longitude: str,
    latitude: str,
    altitude: Optional[str] = None,
) -> DataFrame:
    """
    Parse a column containing a geodetic point string
    [Geopy](https://geopy.readthedocs.io/en/stable/#geopy.point.Point).

    :param df: DataFrame containing geodetic point data
    :param geodetic: Name of source column containing geodetic point string.
    :param longitude: Name of new column to be created containing longitude.
    :param latitude: Name of new column to be created containing latitude.
    :param altitude: Name of new column to be created containing altitude. When
           omitted, no altitude column is requested from `_append_columns`.
    :return: DataFrame with decoded longitudes and latitudes
    :raises QueryObjectValidationError: If a ValueError is raised while parsing
    """
    def _parse_location(location: str) -> Tuple[float, float, float]:
        """
        Parse a string containing a geodetic point and return latitude, longitude
        and altitude
        """
        point = Point(location)
        return point[0], point[1], point[2]

    try:
        parsed = df[geodetic].apply(_parse_location)
        # Build the frame in one step instead of unpacking `zip(*parsed)`:
        # with an empty input the unpacking raised ValueError, which was then
        # misreported as an invalid geodetic string. Reusing `df.index` keeps
        # the parsed rows labelled like their source rows, so a helper that
        # aligns on index labels cannot mismatch them.
        geodetic_df = DataFrame(
            parsed.tolist(),
            columns=["latitude", "longitude", "altitude"],
            index=df.index,
        )
        columns = {"latitude": latitude, "longitude": longitude}
        if altitude:
            columns["altitude"] = altitude
        return _append_columns(df, geodetic_df, columns)
    except ValueError as ex:
        raise QueryObjectValidationError(_("Invalid geodetic string")) from ex
示例#6
0
def diff(
    df: DataFrame,
    columns: Dict[str, str],
    periods: int = 1,
    axis: PandasAxis = PandasAxis.ROW,
) -> DataFrame:
    """
    Compute the difference between elements along an axis for select columns.

    :param df: DataFrame on which the diff will be based.
    :param columns: mapping of source column to target column. For instance,
           `{'y': 'y'}` replaces column `y` with its diff value, while
           `{'y': 'y2'}` adds a column `y2` holding the diff values of `y` and
           leaves `y` unchanged.
    :param periods: periods to shift for calculating difference.
    :param axis: axis passed through to `DataFrame.diff`; defaults to
           `PandasAxis.ROW`.
    :return: DataFrame with diffed columns
    :raises QueryObjectValidationError: If the request is incorrect
            (NOTE(review): not raised directly in this function; presumably
            comes from `_append_columns` - confirm)
    """
    selected = df[columns.keys()]
    return _append_columns(
        df,
        selected.diff(periods=periods, axis=axis),
        columns,
    )
示例#7
0
def rolling(  # pylint: disable=too-many-arguments
    df: DataFrame,
    rolling_type: str,
    columns: Optional[Dict[str, str]] = None,
    window: Optional[int] = None,
    rolling_type_options: Optional[Dict[str, Any]] = None,
    center: bool = False,
    win_type: Optional[str] = None,
    min_periods: Optional[int] = None,
    is_pivot_df: bool = False,
) -> DataFrame:
    """
    Apply a rolling window on the dataset. See the Pandas docs for further details:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html

    :param df: DataFrame on which the rolling period will be based.
    :param columns: columns on which to perform rolling, mapping source column to
           target column. For instance, `{'y': 'y'}` will replace the column `y` with
           the rolling value in `y`, while `{'y': 'y2'}` will add a column `y2` based
           on rolling values calculated from `y`, leaving the original column `y`
           unchanged.
    :param rolling_type: Name of the method to call on the rolling window object,
           e.g. `mean` or `sum`. Names listed in `DENYLIST_ROLLING_FUNCTIONS`
           are rejected.
    :param window: Size of the window.
    :param rolling_type_options: Optional options to pass to rolling method. Needed
           for e.g. quantile operation.
    :param center: Should the label be at the center of the window.
    :param win_type: Type of window function.
    :param min_periods: The minimum amount of periods required for a row to be included
                        in the result set.
    :param is_pivot_df: Dataframe is pivoted or not
    :return: DataFrame with the rolling columns
    :raises QueryObjectValidationError: If the request is incorrect
    """
    rolling_type_options = rolling_type_options or {}
    columns = columns or {}
    if is_pivot_df:
        df_rolling = df
    else:
        df_rolling = df[columns.keys()]
    kwargs: Dict[str, Union[str, int]] = {}
    if window is None:
        raise QueryObjectValidationError(
            _("Undefined window for rolling operation"))
    if window == 0:
        raise QueryObjectValidationError(_("Window must be > 0"))

    # Only forward the options that were actually provided.
    kwargs["window"] = window
    if min_periods is not None:
        kwargs["min_periods"] = min_periods
    if center is not None:
        kwargs["center"] = center
    if win_type is not None:
        kwargs["win_type"] = win_type

    df_rolling = df_rolling.rolling(**kwargs)
    # Reject deny-listed names and names that are not attributes of the rolling
    # object. The original condition was inverted (`not in`), which rejected
    # every permitted function and let only deny-listed ones through.
    if rolling_type in DENYLIST_ROLLING_FUNCTIONS or not hasattr(
            df_rolling, rolling_type):
        raise QueryObjectValidationError(
            _("Invalid rolling_type: %(type)s", type=rolling_type))
    try:
        df_rolling = getattr(df_rolling, rolling_type)(**rolling_type_options)
    except TypeError as ex:
        # Unsupported keyword options surface as TypeError from the call above.
        raise QueryObjectValidationError(
            _(
                "Invalid options for %(rolling_type)s: %(options)s",
                rolling_type=rolling_type,
                options=rolling_type_options,
            )) from ex

    if is_pivot_df:
        # Pivoted input: rebuild flat column labels from the first column level
        # and move index level 0 back into a regular column.
        agg_in_pivot_df = df.columns.get_level_values(
            0).drop_duplicates().to_list()
        agg: Dict[str, Dict[str, Any]] = {col: {} for col in agg_in_pivot_df}
        df_rolling.columns = [
            _flatten_column_after_pivot(col, agg) for col in df_rolling.columns
        ]
        df_rolling.reset_index(level=0, inplace=True)
    else:
        df_rolling = _append_columns(df, df_rolling, columns)

    # Drop the first `min_periods` rows from the result.
    if min_periods:
        df_rolling = df_rolling[min_periods:]
    return df_rolling