def geohash_encode(
    df: DataFrame,
    geohash: str,
    longitude: str,
    latitude: str,
) -> DataFrame:
    """
    Encode latitude/longitude pairs into geohash strings.

    :param df: DataFrame containing longitude and latitude data
    :param geohash: Name of the new column that will receive the geohash.
    :param longitude: Name of source column containing longitude.
    :param latitude: Name of source column containing latitude.
    :return: Result of appending the computed geohash column to `df` under
             the name given by `geohash`
    :raises QueryObjectValidationError: If a ValueError is raised while the
            coordinates are being encoded or appended
    """

    def _encode_row(row):
        # Rows are addressed by the normalised names assigned below.
        return geohash_lib.encode(row["latitude"], row["longitude"])

    try:
        coords = df[[latitude, longitude]]
        # Normalise the column names so the encoder does not depend on the
        # caller-supplied source column names.
        coords.columns = ["latitude", "longitude"]
        coords["geohash"] = coords.apply(_encode_row, axis=1)
        return _append_columns(df, coords, {"geohash": geohash})
    except ValueError as ex:
        raise QueryObjectValidationError(
            _("Invalid longitude/latitude")
        ) from ex
def cum(
    df: DataFrame,
    operator: str,
    columns: Dict[str, str],
) -> DataFrame:
    """
    Apply a cumulative operation (sum/product/min/max) to selected columns.

    :param df: DataFrame on which the cumulative operation will be based.
    :param operator: cumulative operator, e.g. `sum`, `prod`, `min`, `max`.
           It is prefixed with `cum` to obtain the DataFrame method name.
    :param columns: columns on which to perform a cumulative operation,
           mapping source column to target column. For instance,
           `{'y': 'y'}` will replace the column `y` with the cumulative
           value in `y`, while `{'y': 'y2'}` will add a column `y2` based on
           cumulative values calculated from `y`, leaving the original
           column `y` unchanged.
    :return: DataFrame with cumulated columns
    :raises InvalidPostProcessingError: If the resulting method name is not
            in ALLOWLIST_CUMULATIVE_FUNCTIONS or is not available on the
            selected columns
    """
    columns = columns or {}
    source = df.loc[:, columns.keys()]
    method_name = "cum" + operator

    is_allowed = method_name in ALLOWLIST_CUMULATIVE_FUNCTIONS
    if not (is_allowed and hasattr(source, method_name)):
        raise InvalidPostProcessingError(
            _("Invalid cumulative operator: %(operator)s", operator=operator)
        )

    cumulated = getattr(source, method_name)()
    return _append_columns(df, cumulated, columns)
def cum(
    df: DataFrame,
    operator: str,
    columns: Optional[Dict[str, str]] = None,
    is_pivot_df: bool = False,
) -> DataFrame:
    """
    Apply a cumulative operation (sum/product/min/max) to selected columns.

    :param df: DataFrame on which the cumulative operation will be based.
    :param operator: cumulative operator, e.g. `sum`, `prod`, `min`, `max`.
           It is prefixed with `cum` to obtain the DataFrame method name.
    :param columns: columns on which to perform a cumulative operation,
           mapping source column to target column. For instance,
           `{'y': 'y'}` will replace the column `y` with the cumulative
           value in `y`, while `{'y': 'y2'}` will add a column `y2` based on
           cumulative values calculated from `y`, leaving the original
           column `y` unchanged. Not used for selection when `is_pivot_df`
           is set.
    :param is_pivot_df: Dataframe is pivoted or not. When set, the operation
           runs on the whole frame, the resulting columns are flattened and
           index level 0 is reset into a column.
    :return: DataFrame with cumulated columns
    :raises QueryObjectValidationError: If the resulting method name is not
            in ALLOWLIST_CUMULATIVE_FUNCTIONS or is not available on the
            frame being processed
    """
    columns = columns or {}
    source = df if is_pivot_df else df[columns.keys()]
    method_name = "cum" + operator

    is_allowed = method_name in ALLOWLIST_CUMULATIVE_FUNCTIONS
    if not (is_allowed and hasattr(source, method_name)):
        raise QueryObjectValidationError(
            _("Invalid cumulative operator: %(operator)s", operator=operator)
        )

    cumulated = getattr(source, method_name)()
    if not is_pivot_df:
        return _append_columns(df, cumulated, columns)

    # Pivoted frame: collect the distinct level-0 column labels and hand
    # them to the flattening helper as an (empty-valued) aggregate mapping.
    top_level_labels = df.columns.get_level_values(0).drop_duplicates().to_list()
    agg: Dict[str, Dict[str, Any]] = {label: {} for label in top_level_labels}
    cumulated.columns = [
        _flatten_column_after_pivot(col, agg) for col in cumulated.columns
    ]
    cumulated.reset_index(level=0, inplace=True)
    return cumulated
def geohash_decode(
    df: DataFrame, geohash: str, longitude: str, latitude: str
) -> DataFrame:
    """
    Decode a geohash column into separate latitude and longitude columns.

    :param df: DataFrame containing geohash data
    :param geohash: Name of source column containing geohash location.
    :param longitude: Name of new column to be created containing longitude.
    :param latitude: Name of new column to be created containing latitude.
    :return: DataFrame with decoded longitudes and latitudes
    :raises QueryObjectValidationError: If a ValueError is raised while the
            geohash values are being decoded, unpacked or appended
    """
    try:
        lonlat_df = DataFrame()
        decoded = df[geohash].apply(geohash_lib.decode)
        # Each decoded value is unpacked as a (latitude, longitude) pair;
        # zip(*...) transposes the pairs into two parallel sequences.
        latitudes, longitudes = zip(*decoded)
        lonlat_df["latitude"] = latitudes
        lonlat_df["longitude"] = longitudes
        target_names = {"latitude": latitude, "longitude": longitude}
        return _append_columns(df, lonlat_df, target_names)
    except ValueError as ex:
        raise QueryObjectValidationError(_("Invalid geohash string")) from ex
def geodetic_parse(
    df: DataFrame,
    geodetic: str,
    longitude: str,
    latitude: str,
    altitude: Optional[str] = None,
) -> DataFrame:
    """
    Split a geodetic point string column into latitude, longitude and
    (optionally) altitude columns. Strings are parsed with
    [Geopy](https://geopy.readthedocs.io/en/stable/#geopy.point.Point).

    :param df: DataFrame containing geodetic point data
    :param geodetic: Name of source column containing geodetic point string.
    :param longitude: Name of new column to be created containing longitude.
    :param latitude: Name of new column to be created containing latitude.
    :param altitude: Name of new column to be created containing altitude.
           The altitude column is only appended when this is provided.
    :return: DataFrame with decoded longitudes and latitudes
    :raises QueryObjectValidationError: If a ValueError is raised while the
            geodetic strings are being parsed, unpacked or appended
    """

    def _to_lat_lon_alt(location: str) -> Tuple[float, float, float]:
        """
        Parse one geodetic point string and return its first three
        components, used below as latitude, longitude and altitude.
        """
        parsed = Point(location)
        return parsed[0], parsed[1], parsed[2]

    try:
        geodetic_df = DataFrame()
        triples = df[geodetic].apply(_to_lat_lon_alt)
        # Transpose the per-row triples into three parallel sequences.
        latitudes, longitudes, altitudes = zip(*triples)
        geodetic_df["latitude"] = latitudes
        geodetic_df["longitude"] = longitudes
        geodetic_df["altitude"] = altitudes

        target_names = {"latitude": latitude, "longitude": longitude}
        if altitude:
            target_names["altitude"] = altitude
        return _append_columns(df, geodetic_df, target_names)
    except ValueError as ex:
        raise QueryObjectValidationError(_("Invalid geodetic string")) from ex
def diff(
    df: DataFrame,
    columns: Dict[str, str],
    periods: int = 1,
    axis: PandasAxis = PandasAxis.ROW,
) -> DataFrame:
    """
    Compute the difference between elements of selected columns and append
    the result to the original DataFrame.

    :param df: DataFrame on which the diff will be based.
    :param columns: columns on which to perform diff, mapping source column
           to target column. For instance, `{'y': 'y'}` will replace the
           column `y` with the diff value in `y`, while `{'y': 'y2'}` will
           add a column `y2` based on diff values calculated from `y`,
           leaving the original column `y` unchanged.
    :param periods: periods to shift for calculating difference.
    :param axis: axis forwarded unchanged to `DataFrame.diff`; defaults to
           `PandasAxis.ROW`.
    :return: DataFrame with diffed columns
    """
    selected = df[columns.keys()]
    differences = selected.diff(periods=periods, axis=axis)
    return _append_columns(df, differences, columns)
def rolling(  # pylint: disable=too-many-arguments
    df: DataFrame,
    rolling_type: str,
    columns: Optional[Dict[str, str]] = None,
    window: Optional[int] = None,
    rolling_type_options: Optional[Dict[str, Any]] = None,
    center: bool = False,
    win_type: Optional[str] = None,
    min_periods: Optional[int] = None,
    is_pivot_df: bool = False,
) -> DataFrame:
    """
    Apply a rolling window on the dataset. See the Pandas docs for further
    details:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html

    :param df: DataFrame on which the rolling period will be based.
    :param rolling_type: Name of the method to call on the pandas rolling
           window object. Names listed in DENYLIST_ROLLING_FUNCTIONS, and
           names that are not attributes of the rolling object, are rejected.
    :param columns: columns on which to perform rolling, mapping source
           column to target column. For instance, `{'y': 'y'}` will replace
           the column `y` with the rolling value in `y`, while
           `{'y': 'y2'}` will add a column `y2` based on rolling values
           calculated from `y`, leaving the original column `y` unchanged.
           Not used for selection when `is_pivot_df` is set.
    :param window: Size of the window. Required and must not be 0.
    :param rolling_type_options: Optional options to pass to rolling method.
           Needed for e.g. quantile operation.
    :param center: Should the label be at the center of the window.
    :param win_type: Type of window function.
    :param min_periods: The minimum amount of periods required for a row to
           be included in the result set. When truthy, the first
           `min_periods` rows of the result are dropped.
    :param is_pivot_df: Dataframe is pivoted or not. When set, the window is
           applied to the whole frame, the resulting columns are flattened
           and index level 0 is reset into a column.
    :return: DataFrame with the rolling columns
    :raises QueryObjectValidationError: If the window is undefined or 0, the
            rolling type is denied or unknown, or the rolling type options
            are rejected by the rolling method
    """
    rolling_type_options = rolling_type_options or {}
    columns = columns or {}
    if is_pivot_df:
        df_rolling = df
    else:
        df_rolling = df[columns.keys()]

    kwargs: Dict[str, Union[str, int]] = {}
    if window is None:
        raise QueryObjectValidationError(
            _("Undefined window for rolling operation")
        )
    if window == 0:
        raise QueryObjectValidationError(_("Window must be > 0"))

    kwargs["window"] = window
    if min_periods is not None:
        kwargs["min_periods"] = min_periods
    if center is not None:
        kwargs["center"] = center
    if win_type is not None:
        kwargs["win_type"] = win_type

    df_rolling = df_rolling.rolling(**kwargs)

    # Reject functions that are explicitly denied or that the rolling object
    # does not provide. The check must be `in` the denylist: testing
    # `not in` would reject every permitted function and let only the
    # denied ones through.
    if rolling_type in DENYLIST_ROLLING_FUNCTIONS or not hasattr(
        df_rolling, rolling_type
    ):
        raise QueryObjectValidationError(
            _("Invalid rolling_type: %(type)s", type=rolling_type)
        )

    try:
        df_rolling = getattr(df_rolling, rolling_type)(**rolling_type_options)
    except TypeError as ex:
        # A TypeError here means the options did not match the signature of
        # the selected rolling method.
        raise QueryObjectValidationError(
            _(
                "Invalid options for %(rolling_type)s: %(options)s",
                rolling_type=rolling_type,
                options=rolling_type_options,
            )
        ) from ex

    if is_pivot_df:
        # Collect the distinct level-0 column labels and hand them to the
        # flattening helper as an (empty-valued) aggregate mapping.
        agg_in_pivot_df = df.columns.get_level_values(0).drop_duplicates().to_list()
        agg: Dict[str, Dict[str, Any]] = {col: {} for col in agg_in_pivot_df}
        df_rolling.columns = [
            _flatten_column_after_pivot(col, agg) for col in df_rolling.columns
        ]
        df_rolling.reset_index(level=0, inplace=True)
    else:
        df_rolling = _append_columns(df, df_rolling, columns)

    if min_periods:
        # Drop the leading `min_periods` rows of the result.
        df_rolling = df_rolling[min_periods:]
    return df_rolling