Example #1
 def _sanitize_filters(self) -> None:
     for param in ("where", "having"):
         clause = self.extras.get(param)
         if clause:
             try:
                 sanitized_clause = sanitize_clause(clause)
                 if sanitized_clause != clause:
                     self.extras[param] = sanitized_clause
             except QueryClauseValidationException as ex:
                 raise QueryObjectValidationError(ex.message) from ex
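The pattern in Example #1 applies a sanitizer to each filter clause, swaps the clause only when sanitization changed it, and re-raises the lower-level validation error as a QueryObjectValidationError. A self-contained sketch of the same flow follows; the regex-based sanitize_clause and the exception classes are stand-ins for illustration, not Superset's implementations.

import re


class QueryClauseValidationException(Exception):
    pass


class QueryObjectValidationError(Exception):
    pass


def sanitize_clause(clause: str) -> str:
    # Stand-in sanitizer: strip SQL comments and a trailing semicolon.
    sanitized = re.sub(r"--.*$", "", clause, flags=re.MULTILINE).strip().rstrip(";")
    if ";" in sanitized:
        raise QueryClauseValidationException("clause contains a statement separator")
    return sanitized


def sanitize_filters(extras: dict) -> None:
    for param in ("where", "having"):
        clause = extras.get(param)
        if clause:
            try:
                sanitized_clause = sanitize_clause(clause)
                if sanitized_clause != clause:
                    extras[param] = sanitized_clause
            except QueryClauseValidationException as ex:
                raise QueryObjectValidationError(str(ex)) from ex


extras = {"where": "col > 10  -- comment"}
sanitize_filters(extras)
print(extras)  # {'where': 'col > 10'}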
Example #2
def _get_aggregate_funcs(
    df: DataFrame,
    aggregates: Dict[str, Dict[str, Any]],
) -> Dict[str, NamedAgg]:
    """
    Converts a set of aggregate config objects into functions that pandas can use as
    aggregators. Currently only numpy aggregators are supported.

    :param df: DataFrame on which to perform aggregate operation.
    :param aggregates: Mapping from column name to aggregate config.
    :return: Mapping from metric name to function that takes a single input argument.
    """
    agg_funcs: Dict[str, NamedAgg] = {}
    for name, agg_obj in aggregates.items():
        column = agg_obj.get("column", name)
        if column not in df:
            raise QueryObjectValidationError(
                _(
                    "Column referenced by aggregate is undefined: %(column)s",
                    column=column,
                ))
        if "operator" not in agg_obj:
            raise QueryObjectValidationError(
                _(
                    "Operator undefined for aggregator: %(name)s",
                    name=name,
                ))
        operator = agg_obj["operator"]
        if callable(operator):
            aggfunc = operator
        else:
            func = NUMPY_FUNCTIONS.get(operator)
            if not func:
                raise QueryObjectValidationError(
                    _(
                        "Invalid numpy function: %(operator)s",
                        operator=operator,
                    ))
            options = agg_obj.get("options", {})
            aggfunc = partial(func, **options)
        agg_funcs[name] = NamedAgg(column=column, aggfunc=aggfunc)

    return agg_funcs
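To make the aggregate-config convention in Example #2 concrete, the sketch below builds the same kind of NamedAgg mapping by hand and feeds it to a pandas groupby; the data, column names, and config are made up for illustration.

from functools import partial

import numpy as np
import pandas as pd

df = pd.DataFrame({"country": ["FI", "FI", "SE"], "sales": [1.0, 2.0, 4.0]})
aggregates = {
    "total_sales": {"column": "sales", "operator": "sum"},
    "p90_sales": {"column": "sales", "operator": "percentile", "options": {"q": 90}},
}
NUMPY_FUNCTIONS = {"sum": np.sum, "percentile": np.percentile}

agg_funcs = {}
for name, agg_obj in aggregates.items():
    func = NUMPY_FUNCTIONS[agg_obj["operator"]]
    aggfunc = partial(func, **agg_obj.get("options", {}))
    agg_funcs[name] = pd.NamedAgg(column=agg_obj.get("column", name), aggfunc=aggfunc)

# NamedAgg objects are passed as keyword arguments to DataFrameGroupBy.agg
print(df.groupby("country").agg(**agg_funcs))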
Example #3
 def wrapped(df: DataFrame, **options: Any) -> Any:
     columns = df.columns.tolist()
     for name in argnames:
         if name in options and not all(
             elem in columns for elem in options.get(name) or []
         ):
             raise QueryObjectValidationError(
                 _("Referenced columns not available in DataFrame.")
             )
     return func(df, **options)
Example #4
 def _validate_there_are_no_missing_series(self) -> None:
     missing_series = [col for col in self.series_columns if col not in self.columns]
     if missing_series:
         raise QueryObjectValidationError(
             _(
                 "The following entries in `series_columns` are missing "
                 "in `columns`: %(columns)s. ",
                 columns=", ".join(f'"{x}"' for x in missing_series),
             )
         )
Example #5
 def _validate_no_have_duplicate_labels(self) -> None:
     all_labels = self.metric_names + self.column_names
     if len(set(all_labels)) < len(all_labels):
         dup_labels = find_duplicates(all_labels)
         raise QueryObjectValidationError(
             _(
                 "Duplicate column/metric labels: %(labels)s. Please make "
                 "sure all columns and metrics have a unique label.",
                 labels=", ".join(f'"{x}"' for x in dup_labels),
             )
         )
Example #6
 def get_viz_annotation_data(annotation_layer: Dict[str, Any],
                             force: bool) -> Dict[str, Any]:
     chart = ChartDAO.find_by_id(annotation_layer["value"])
     if not chart:
         raise QueryObjectValidationError(_("The chart does not exist"))
     if not chart.datasource:
         raise QueryObjectValidationError(
             _("The chart datasource does not exist"))
     form_data = chart.form_data.copy()
     try:
         viz_obj = get_viz(
             datasource_type=chart.datasource.type,
             datasource_id=chart.datasource.id,
             form_data=form_data,
             force=force,
         )
         payload = viz_obj.get_payload()
         return payload["data"]
     except SupersetException as ex:
         raise QueryObjectValidationError(
             error_msg_from_exception(ex)) from ex
Example #7
    def get_df_payload(
        self, query_obj: QueryObject, force_cached: Optional[bool] = False,
    ) -> Dict[str, Any]:
        """Handles caching around the df payload retrieval"""
        cache_key = self.query_cache_key(query_obj)
        cache = QueryCacheManager.get(
            cache_key, CacheRegion.DATA, self.force, force_cached,
        )

        if query_obj and cache_key and not cache.is_loaded:
            try:
                invalid_columns = [
                    col
                    for col in query_obj.columns
                    + get_column_names_from_metrics(query_obj.metrics or [])
                    if col not in self.datasource.column_names and col != DTTM_ALIAS
                ]
                if invalid_columns:
                    raise QueryObjectValidationError(
                        _(
                            "Columns missing in datasource: %(invalid_columns)s",
                            invalid_columns=invalid_columns,
                        )
                    )
                query_result = self.get_query_result(query_obj)
                annotation_data = self.get_annotation_data(query_obj)
                cache.set_query_result(
                    key=cache_key,
                    query_result=query_result,
                    annotation_data=annotation_data,
                    force_query=self.force,
                    timeout=self.cache_timeout,
                    datasource_uid=self.datasource.uid,
                    region=CacheRegion.DATA,
                )
            except QueryObjectValidationError as ex:
                cache.error_message = str(ex)
                cache.status = QueryStatus.FAILED

        return {
            "cache_key": cache_key,
            "cached_dttm": cache.cache_dttm,
            "cache_timeout": self.cache_timeout,
            "df": cache.df,
            "annotation_data": cache.annotation_data,
            "error": cache.error_message,
            "is_cached": cache.is_cached,
            "query": cache.query,
            "status": cache.status,
            "stacktrace": cache.stacktrace,
            "rowcount": len(cache.df.index),
        }
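Example #7 delegates caching to QueryCacheManager; the sketch below strips that pattern down to its essence with an in-memory dict keyed by a hash of the query object. The run_query callable and the dict cache are stand-ins, not Superset APIs.

import hashlib
import json
from typing import Any, Callable, Dict

_CACHE: Dict[str, Dict[str, Any]] = {}


def query_cache_key(query_obj: Dict[str, Any]) -> str:
    # Stable key derived from the query object's contents.
    return hashlib.md5(json.dumps(query_obj, sort_keys=True).encode()).hexdigest()


def get_df_payload(
    query_obj: Dict[str, Any],
    run_query: Callable[[Dict[str, Any]], Any],
    force: bool = False,
) -> Dict[str, Any]:
    cache_key = query_cache_key(query_obj)
    if not force and cache_key in _CACHE:
        return {"cache_key": cache_key, "is_cached": True, **_CACHE[cache_key]}
    payload = {"df": run_query(query_obj), "error": None}
    _CACHE[cache_key] = payload
    return {"cache_key": cache_key, "is_cached": False, **payload}


print(get_df_payload({"metrics": ["count"]}, run_query=lambda q: "df")["is_cached"])  # False
print(get_df_payload({"metrics": ["count"]}, run_query=lambda q: "df")["is_cached"])  # True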
Example #8
    def exec_post_processing(self, df: DataFrame) -> DataFrame:
        """
        Perform post processing operations on DataFrame.

        :param df: DataFrame returned from database model.
        :return: new DataFrame to which all post processing operations have been
                 applied
        :raises QueryObjectValidationError: If the post processing operation is incorrect
        """
        for post_process in self.post_processing:
            operation = post_process.get("operation")
            if not operation:
                raise QueryObjectValidationError(
                    _("`operation` property of post processing object undefined"
                      ))
            if not hasattr(pandas_postprocessing, operation):
                raise QueryObjectValidationError(
                    _(
                        "Unsupported post processing operation: %(operation)s",
                        operation=operation,
                    ))
            options = post_process.get("options", {})
            df = getattr(pandas_postprocessing, operation)(df, **options)
        return df
Example #9
 def validate(
     self,
     raise_exceptions: Optional[bool] = True
 ) -> Optional[QueryObjectValidationError]:
     """Validate query object"""
     error: Optional[QueryObjectValidationError] = None
     all_labels = self.metric_names + self.column_names
     if len(set(all_labels)) < len(all_labels):
         dup_labels = find_duplicates(all_labels)
         error = QueryObjectValidationError(
             _(
                 "Duplicate column/metric labels: %(labels)s. Please make "
                 "sure all columns and metrics have a unique label.",
                 labels=", ".join(f'"{x}"' for x in dup_labels),
             ))
     if error and raise_exceptions:
         raise error
     return error
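Example #9 relies on a find_duplicates helper to report duplicate labels; a minimal equivalent using collections.Counter is sketched below (the helper name matches the one used above, but this implementation is illustrative).

from collections import Counter
from typing import List


def find_duplicates(items: List[str]) -> List[str]:
    return [label for label, count in Counter(items).items() if count > 1]


all_labels = ["country", "sum__sales", "country"]
if len(set(all_labels)) < len(all_labels):
    dup_labels = find_duplicates(all_labels)
    print("Duplicate column/metric labels: " + ", ".join(f'"{x}"' for x in dup_labels))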
Example #10
def cum(df: DataFrame, columns: Dict[str, str], operator: str) -> DataFrame:
    """

    :param df: DataFrame on which the cumulative operation will be based.
    :param columns: columns on which to perform a cumulative operation, mapping source
           column to target column. For instance, `{'y': 'y'}` will replace the column
           `y` with the cumulative value in `y`, while `{'y': 'y2'}` will add a column
           `y2` based on cumulative values calculated from `y`, leaving the original
           column `y` unchanged.
    :param operator: cumulative operator, e.g. `sum`, `prod`, `min`, `max`
    :return: DataFrame with cumulated columns
    """
    df_cum = df[columns.keys()]
    operation = "cum" + operator
    if operation not in WHITELIST_CUMULATIVE_FUNCTIONS or not hasattr(
            df_cum, operation):
        raise QueryObjectValidationError(
            _("Invalid cumulative operator: %(operator)s", operator=operator))
    return _append_columns(df, getattr(df_cum, operation)(), columns)
Example #11
def cum(
    df: DataFrame,
    operator: str,
    columns: Optional[Dict[str, str]] = None,
    is_pivot_df: bool = False,
) -> DataFrame:
    """
    Calculate cumulative sum/product/min/max for select columns.

    :param df: DataFrame on which the cumulative operation will be based.
    :param columns: columns on which to perform a cumulative operation, mapping source
           column to target column. For instance, `{'y': 'y'}` will replace the column
           `y` with the cumulative value in `y`, while `{'y': 'y2'}` will add a column
           `y2` based on cumulative values calculated from `y`, leaving the original
           column `y` unchanged.
    :param operator: cumulative operator, e.g. `sum`, `prod`, `min`, `max`
    :param is_pivot_df: Dataframe is pivoted or not
    :return: DataFrame with cumulated columns
    """
    columns = columns or {}
    if is_pivot_df:
        df_cum = df
    else:
        df_cum = df[columns.keys()]
    operation = "cum" + operator
    if operation not in ALLOWLIST_CUMULATIVE_FUNCTIONS or not hasattr(
        df_cum, operation
    ):
        raise QueryObjectValidationError(
            _("Invalid cumulative operator: %(operator)s", operator=operator)
        )
    if is_pivot_df:
        df_cum = getattr(df_cum, operation)()
        agg_in_pivot_df = df.columns.get_level_values(0).drop_duplicates().to_list()
        agg: Dict[str, Dict[str, Any]] = {col: {} for col in agg_in_pivot_df}
        df_cum.columns = [
            _flatten_column_after_pivot(col, agg) for col in df_cum.columns
        ]
        df_cum.reset_index(level=0, inplace=True)
    else:
        df_cum = _append_columns(df, getattr(df_cum, operation)(), columns)
    return df_cum
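The source-to-target column mapping described in the cum docstrings can be reproduced with plain pandas; this sketch shows how {'y': 'y2'} keeps the original column and adds the cumulative values under a new name (data is illustrative).

import pandas as pd

df = pd.DataFrame({"y": [1, 2, 3, 4]})
columns = {"y": "y2"}  # keep `y`, add cumulative values as `y2`
operator = "sum"

df_cum = getattr(df[list(columns.keys())], "cum" + operator)()
df_cum = df_cum.rename(columns=columns)
print(pd.concat([df, df_cum], axis=1))
#    y  y2
# 0  1   1
# 1  2   3
# 2  3   6
# 3  4  10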
Example #12
def geohash_decode(df: DataFrame, geohash: str, longitude: str,
                   latitude: str) -> DataFrame:
    """
    Decode a geohash column into longitude and latitude

    :param df: DataFrame containing geohash data
    :param geohash: Name of source column containing geohash location.
    :param longitude: Name of new column to be created containing longitude.
    :param latitude: Name of new column to be created containing latitude.
    :return: DataFrame with decoded longitudes and latitudes
    """
    try:
        lonlat_df = DataFrame()
        lonlat_df["latitude"], lonlat_df["longitude"] = zip(
            *df[geohash].apply(geohash_lib.decode))
        return _append_columns(df, lonlat_df, {
            "latitude": latitude,
            "longitude": longitude
        })
    except ValueError as ex:
        raise QueryObjectValidationError(_("Invalid geohash string")) from ex
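Example #12 splits one column into two by unpacking a tuple-returning apply with zip(*...). The sketch below isolates that pattern; fake_decode is a stand-in and does not implement real geohash decoding.

import pandas as pd


def fake_decode(code: str) -> tuple:
    # Stand-in for geohash_lib.decode: pretend to derive latitude/longitude.
    return float(len(code)), float(len(code)) * 2


df = pd.DataFrame({"position": ["u4pruyd", "u4pruyk"]})
lonlat_df = pd.DataFrame()
lonlat_df["latitude"], lonlat_df["longitude"] = zip(*df["position"].apply(fake_decode))
result = pd.concat(
    [df, lonlat_df.rename(columns={"latitude": "lat", "longitude": "lon"})], axis=1
)
print(result)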
Example #13
def get_query_results(
    result_type: ChartDataResultType,
    query_context: "QueryContext",
    query_obj: "QueryObject",
    force_cached: bool,
) -> Dict[str, Any]:
    """
    Return result payload for a chart data request.

    :param result_type: the type of result to return
    :param query_context: query context to which the query object belongs
    :param query_obj: query object for which to retrieve the results
    :param force_cached: should results be forcefully retrieved from cache
    :raises QueryObjectValidationError: if an unsupported result type is requested
    :return: JSON serializable result payload
    """
    result_func = _result_type_functions.get(result_type)
    if result_func:
        return result_func(query_context, query_obj, force_cached)
    raise QueryObjectValidationError(
        _("Invalid result type: %(result_type)s", result_type=result_type))
Example #14
def geohash_encode(
    df: DataFrame, geohash: str, longitude: str, latitude: str,
) -> DataFrame:
    """
    Encode longitude and latitude into geohash

    :param df: DataFrame containing longitude and latitude data
    :param geohash: Name of new column to be created containing geohash location.
    :param longitude: Name of source column containing longitude.
    :param latitude: Name of source column containing latitude.
    :return: DataFrame with decoded longitudes and latitudes
    """
    try:
        encode_df = df[[latitude, longitude]]
        encode_df.columns = ["latitude", "longitude"]
        encode_df["geohash"] = encode_df.apply(
            lambda row: geohash_lib.encode(row["latitude"], row["longitude"]), axis=1,
        )
        return _append_columns(df, encode_df, {"geohash": geohash})
    except ValueError as ex:
        raise QueryObjectValidationError(_("Invalid longitude/latitude")) from ex
Example #15
def geodetic_parse(
    df: DataFrame,
    geodetic: str,
    longitude: str,
    latitude: str,
    altitude: Optional[str] = None,
) -> DataFrame:
    """
    Parse a column containing a geodetic point string
    [Geopy](https://geopy.readthedocs.io/en/stable/#geopy.point.Point).

    :param df: DataFrame containing geodetic point data
    :param geodetic: Name of source column containing geodetic point string.
    :param longitude: Name of new column to be created containing longitude.
    :param latitude: Name of new column to be created containing latitude.
    :param altitude: Name of new column to be created containing altitude.
    :return: DataFrame with decoded longitudes and latitudes
    """

    def _parse_location(location: str) -> Tuple[float, float, float]:
        """
        Parse a string containing a geodetic point and return latitude, longitude
        and altitude
        """
        point = Point(location)
        return point[0], point[1], point[2]

    try:
        geodetic_df = DataFrame()
        (
            geodetic_df["latitude"],
            geodetic_df["longitude"],
            geodetic_df["altitude"],
        ) = zip(*df[geodetic].apply(_parse_location))
        columns = {"latitude": latitude, "longitude": longitude}
        if altitude:
            columns["altitude"] = altitude
        return _append_columns(df, geodetic_df, columns)
    except ValueError as ex:
        raise QueryObjectValidationError(_("Invalid geodetic string")) from ex
Example #16
def _prophet_fit_and_predict(  # pylint: disable=too-many-arguments
    df: DataFrame,
    confidence_interval: float,
    yearly_seasonality: Union[bool, str, int],
    weekly_seasonality: Union[bool, str, int],
    daily_seasonality: Union[bool, str, int],
    periods: int,
    freq: str,
) -> DataFrame:
    """
    Fit a prophet model and return a DataFrame with predicted results.
    """
    try:
        prophet_logger = logging.getLogger("fbprophet.plot")

        prophet_logger.setLevel(logging.CRITICAL)
        from fbprophet import Prophet  # pylint: disable=import-error

        prophet_logger.setLevel(logging.NOTSET)
    except ModuleNotFoundError as ex:
        raise QueryObjectValidationError(
            _("`fbprophet` package not installed")) from ex
    model = Prophet(
        interval_width=confidence_interval,
        yearly_seasonality=yearly_seasonality,
        weekly_seasonality=weekly_seasonality,
        daily_seasonality=daily_seasonality,
    )
    if df["ds"].dt.tz:
        df["ds"] = df["ds"].dt.tz_convert(None)
    model.fit(df)
    future = model.make_future_dataframe(periods=periods, freq=freq)
    forecast = model.predict(future)[[
        "ds", "yhat", "yhat_lower", "yhat_upper"
    ]]
    return forecast.join(df.set_index("ds"), on="ds").set_index(["ds"])
Example #17
    def get_df_payload(  # pylint: disable=too-many-statements,too-many-locals
        self, query_obj: QueryObject, force_cached: Optional[bool] = False,
    ) -> Dict[str, Any]:
        """Handles caching around the df payload retrieval"""
        cache_key = self.query_cache_key(query_obj)
        logger.info("Cache key: %s", cache_key)
        is_loaded = False
        stacktrace = None
        df = pd.DataFrame()
        cache_value = None
        status = None
        query = ""
        annotation_data = {}
        error_message = None
        if cache_key and cache_manager.data_cache and not self.force:
            cache_value = cache_manager.data_cache.get(cache_key)
            if cache_value:
                stats_logger.incr("loading_from_cache")
                try:
                    df = cache_value["df"]
                    query = cache_value["query"]
                    annotation_data = cache_value.get("annotation_data", {})
                    status = utils.QueryStatus.SUCCESS
                    is_loaded = True
                    stats_logger.incr("loaded_from_cache")
                except KeyError as ex:
                    logger.exception(ex)
                    logger.error(
                        "Error reading cache: %s", utils.error_msg_from_exception(ex)
                    )
                logger.info("Serving from cache")

        if force_cached and not is_loaded:
            logger.warning(
                "force_cached (QueryContext): value not found for key %s", cache_key
            )
            raise CacheLoadError("Error loading data from cache")

        if query_obj and not is_loaded:
            try:
                invalid_columns = [
                    col
                    for col in query_obj.columns
                    + query_obj.groupby
                    + utils.get_column_names_from_metrics(query_obj.metrics)
                    if col not in self.datasource.column_names and col != DTTM_ALIAS
                ]
                if invalid_columns:
                    raise QueryObjectValidationError(
                        _(
                            "Columns missing in datasource: %(invalid_columns)s",
                            invalid_columns=invalid_columns,
                        )
                    )
                query_result = self.get_query_result(query_obj)
                status = query_result["status"]
                query = query_result["query"]
                error_message = query_result["error_message"]
                df = query_result["df"]
                annotation_data = self.get_annotation_data(query_obj)

                if status != utils.QueryStatus.FAILED:
                    stats_logger.incr("loaded_from_source")
                    if not self.force:
                        stats_logger.incr("loaded_from_source_without_force")
                    is_loaded = True
            except QueryObjectValidationError as ex:
                error_message = str(ex)
                status = utils.QueryStatus.FAILED
            except Exception as ex:  # pylint: disable=broad-except
                logger.exception(ex)
                if not error_message:
                    error_message = str(ex)
                status = utils.QueryStatus.FAILED
                stacktrace = utils.get_stacktrace()

            if is_loaded and cache_key and status != utils.QueryStatus.FAILED:
                set_and_log_cache(
                    cache_manager.data_cache,
                    cache_key,
                    {"df": df, "query": query, "annotation_data": annotation_data},
                    self.cache_timeout,
                    self.datasource.uid,
                )
        return {
            "cache_key": cache_key,
            "cached_dttm": cache_value["dttm"] if cache_value is not None else None,
            "cache_timeout": self.cache_timeout,
            "df": df,
            "annotation_data": annotation_data,
            "error": error_message,
            "is_cached": cache_value is not None,
            "query": query,
            "status": status,
            "stacktrace": stacktrace,
            "rowcount": len(df.index),
        }
Example #18
    def get_df_payload(  # pylint: disable=too-many-statements
            self, query_obj: QueryObject, **kwargs: Any) -> Dict[str, Any]:
        """Handles caching around the df payload retrieval"""
        cache_key = self.cache_key(query_obj, **kwargs)
        logger.info("Cache key: %s", cache_key)
        is_loaded = False
        stacktrace = None
        df = pd.DataFrame()
        cached_dttm = datetime.utcnow().isoformat().split(".")[0]
        cache_value = None
        status = None
        query = ""
        error_message = None
        if cache_key and cache and not self.force:
            cache_value = cache.get(cache_key)
            if cache_value:
                stats_logger.incr("loading_from_cache")
                try:
                    df = cache_value["df"]
                    query = cache_value["query"]
                    status = utils.QueryStatus.SUCCESS
                    is_loaded = True
                    stats_logger.incr("loaded_from_cache")
                except KeyError as ex:
                    logger.exception(ex)
                    logger.error("Error reading cache: %s",
                                 utils.error_msg_from_exception(ex))
                logger.info("Serving from cache")

        if query_obj and not is_loaded:
            try:
                invalid_columns = [
                    col for col in query_obj.columns + query_obj.groupby +
                    utils.get_column_names_from_metrics(query_obj.metrics)
                    if col not in self.datasource.column_names
                ]
                if invalid_columns:
                    raise QueryObjectValidationError(
                        _(
                            "Columns missing in datasource: %(invalid_columns)s",
                            invalid_columns=invalid_columns,
                        ))
                query_result = self.get_query_result(query_obj)
                status = query_result["status"]
                query = query_result["query"]
                error_message = query_result["error_message"]
                df = query_result["df"]
                if status != utils.QueryStatus.FAILED:
                    stats_logger.incr("loaded_from_source")
                    if not self.force:
                        stats_logger.incr("loaded_from_source_without_force")
                    is_loaded = True
            except QueryObjectValidationError as ex:
                error_message = str(ex)
                status = utils.QueryStatus.FAILED
            except Exception as ex:  # pylint: disable=broad-except
                logger.exception(ex)
                if not error_message:
                    error_message = str(ex)
                status = utils.QueryStatus.FAILED
                stacktrace = utils.get_stacktrace()

            if is_loaded and cache_key and cache and status != utils.QueryStatus.FAILED:
                set_and_log_cache(
                    cache_key,
                    df,
                    query,
                    cached_dttm,
                    self.cache_timeout,
                    self.datasource.uid,
                )
        return {
            "cache_key": cache_key,
            "cached_dttm":
            cache_value["dttm"] if cache_value is not None else None,
            "cache_timeout": self.cache_timeout,
            "df": df,
            "error": error_message,
            "is_cached": cache_value is not None,
            "query": query,
            "status": status,
            "stacktrace": stacktrace,
            "rowcount": len(df.index),
        }
Example #19
def boxplot(
    df: DataFrame,
    groupby: List[str],
    metrics: List[str],
    whisker_type: PostProcessingBoxplotWhiskerType,
    percentiles: Optional[Union[List[Union[int, float]],
                                Tuple[Union[int, float],
                                      Union[int, float]]]] = None,
) -> DataFrame:
    """
    Calculate boxplot statistics. For each metric, the operation creates eight
    new columns with the column name suffixed with the following values:

    - `__mean`: the mean
    - `__median`: the median
    - `__max`: the maximum value excluding outliers (see whisker type)
    - `__min`: the minimum value excluding outliers (see whisker type)
    - `__q1`: the first quartile (25th percentile)
    - `__q3`: the third quartile (75th percentile)
    - `__count`: count of observations
    - `__outliers`: the values that fall outside the minimum/maximum value
                    (see whisker type)

    :param df: DataFrame containing all-numeric data (temporal column ignored)
    :param groupby: The categories to group by (x-axis)
    :param metrics: The metrics for which to calculate the distribution
    :param whisker_type: The confidence level type
    :param percentiles: Lower and upper percentiles to use as whiskers when
           `whisker_type` is `percentile`
    :return: DataFrame with boxplot statistics per groupby
    """
    def quartile1(series: Series) -> float:
        return np.nanpercentile(series, 25, interpolation="midpoint")

    def quartile3(series: Series) -> float:
        return np.nanpercentile(series, 75, interpolation="midpoint")

    if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY:

        def whisker_high(series: Series) -> float:
            upper_outer_lim = quartile3(series) + 1.5 * (quartile3(series) -
                                                         quartile1(series))
            return series[series <= upper_outer_lim].max()

        def whisker_low(series: Series) -> float:
            lower_outer_lim = quartile1(series) - 1.5 * (quartile3(series) -
                                                         quartile1(series))
            return series[series >= lower_outer_lim].min()

    elif whisker_type == PostProcessingBoxplotWhiskerType.PERCENTILE:
        if (not isinstance(percentiles, (list, tuple)) or len(percentiles) != 2
                or not isinstance(percentiles[0], (int, float))
                or not isinstance(percentiles[1], (int, float))
                or percentiles[0] >= percentiles[1]):
            raise QueryObjectValidationError(
                _("percentiles must be a list or tuple with two numeric values, "
                  "of which the first is lower than the second value"))
        low, high = percentiles[0], percentiles[1]

        def whisker_high(series: Series) -> float:
            return np.nanpercentile(series, high)

        def whisker_low(series: Series) -> float:
            return np.nanpercentile(series, low)

    else:
        whisker_high = np.max
        whisker_low = np.min

    def outliers(series: Series) -> List[float]:
        above = series[series > whisker_high(series)]
        below = series[series < whisker_low(series)]
        return above.tolist() + below.tolist()

    operators: Dict[str, Callable[[Any], Any]] = {
        "mean": np.mean,
        "median": np.median,
        "max": whisker_high,
        "min": whisker_low,
        "q1": quartile1,
        "q3": quartile3,
        "count": np.ma.count,
        "outliers": outliers,
    }
    aggregates: Dict[str, Dict[str, Union[str, Callable[..., Any]]]] = {
        f"{metric}__{operator_name}": {
            "column": metric,
            "operator": operator
        }
        for operator_name, operator in operators.items() for metric in metrics
    }
    return aggregate(df, groupby=groupby, aggregates=aggregates)
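To make the whisker logic in Example #19 concrete, the sketch below computes Tukey whiskers and outliers for a single numeric series with numpy, mirroring the whisker_high/whisker_low/outliers helpers above (data is made up).

import numpy as np
import pandas as pd

series = pd.Series([1, 2, 3, 4, 5, 100])
q1 = np.nanpercentile(series, 25, interpolation="midpoint")
q3 = np.nanpercentile(series, 75, interpolation="midpoint")
upper_lim = q3 + 1.5 * (q3 - q1)
lower_lim = q1 - 1.5 * (q3 - q1)

whisker_high = series[series <= upper_lim].max()  # max excluding outliers
whisker_low = series[series >= lower_lim].min()   # min excluding outliers
outliers = series[(series > upper_lim) | (series < lower_lim)].tolist()

print(q1, q3, whisker_low, whisker_high, outliers)  # 2.5 4.5 1 5 [100]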
Example #20
    def processing_time_offsets(
        self,
        df: pd.DataFrame,
        query_object: QueryObject,
    ) -> CachedTimeOffset:
        # ensure query_object is immutable
        query_object_clone = copy.copy(query_object)
        queries = []
        cache_keys = []

        time_offsets = query_object.time_offsets
        outer_from_dttm = query_object.from_dttm
        outer_to_dttm = query_object.to_dttm
        for offset in time_offsets:
            try:
                query_object_clone.from_dttm = get_past_or_future(
                    offset,
                    outer_from_dttm,
                )
                query_object_clone.to_dttm = get_past_or_future(
                    offset, outer_to_dttm)
            except ValueError as ex:
                raise QueryObjectValidationError(str(ex)) from ex
            # make sure subquery use main query where clause
            query_object_clone.inner_from_dttm = outer_from_dttm
            query_object_clone.inner_to_dttm = outer_to_dttm
            query_object_clone.time_offsets = []
            query_object_clone.post_processing = []

            if not query_object.from_dttm or not query_object.to_dttm:
                raise QueryObjectValidationError(
                    _("An enclosed time range (both start and end) must be specified "
                      "when using a Time Comparison."))
            # `offset` is added to the hash function
            cache_key = self.query_cache_key(query_object_clone,
                                             time_offset=offset)
            cache = QueryCacheManager.get(cache_key, CacheRegion.DATA,
                                          self.force)
            # check whether the cache was hit
            if cache.is_loaded:
                df = self.left_join_on_dttm(df, cache.df)
                queries.append(cache.query)
                cache_keys.append(cache_key)
                continue

            query_object_clone_dct = query_object_clone.to_dict()
            result = self.datasource.query(query_object_clone_dct)
            queries.append(result.query)
            cache_keys.append(None)

            # rename metrics: SUM(value) => SUM(value) 1 year ago
            columns_name_mapping = {
                metric: TIME_COMPARISION.join([metric, offset])
                for metric in get_metric_names(
                    query_object_clone_dct.get("metrics", []))
            }
            columns_name_mapping[DTTM_ALIAS] = DTTM_ALIAS

            offset_metrics_df = result.df
            if offset_metrics_df.empty:
                offset_metrics_df = pd.DataFrame(
                    {col: [np.NaN]
                     for col in columns_name_mapping.values()})
            else:
                # 1. normalize df, set dttm column
                offset_metrics_df = self.normalize_df(offset_metrics_df,
                                                      query_object_clone)

                # 2. extract `metrics` columns and `dttm` column from extra query
                offset_metrics_df = offset_metrics_df[
                    columns_name_mapping.keys()]

                # 3. rename extra query columns
                offset_metrics_df = offset_metrics_df.rename(
                    columns=columns_name_mapping)

                # 4. set offset for dttm column
                offset_metrics_df[DTTM_ALIAS] = offset_metrics_df[
                    DTTM_ALIAS] - DateOffset(**normalize_time_delta(offset))

            # df left join `offset_metrics_df` on `DTTM`
            df = self.left_join_on_dttm(df, offset_metrics_df)

            # set offset df to cache.
            value = {
                "df": offset_metrics_df,
                "query": result.query,
            }
            cache.set(
                key=cache_key,
                value=value,
                timeout=self.cache_timeout,
                datasource_uid=self.datasource.uid,
                region=CacheRegion.DATA,
            )

        return CachedTimeOffset(df=df, queries=queries, cache_keys=cache_keys)
Example #21
def prophet(  # pylint: disable=too-many-arguments
    df: DataFrame,
    time_grain: str,
    periods: int,
    confidence_interval: float,
    yearly_seasonality: Optional[Union[bool, int]] = None,
    weekly_seasonality: Optional[Union[bool, int]] = None,
    daily_seasonality: Optional[Union[bool, int]] = None,
) -> DataFrame:
    """
    Add forecasts to each series in a timeseries dataframe, along with confidence
    intervals for the prediction. For each series, the operation creates three
    new columns with the column name suffixed with the following values:

    - `__yhat`: the forecast for the given date
    - `__yhat_lower`: the lower bound of the forecast for the given date
    - `__yhat_upper`: the upper bound of the forecast for the given date

    :param df: DataFrame containing all-numeric data (temporal column ignored)
    :param time_grain: Time grain used to specify time period increments in prediction
    :param periods: Time periods (in units of `time_grain`) to predict into the future
    :param confidence_interval: Width of predicted confidence interval
    :param yearly_seasonality: Should yearly seasonality be applied.
           An integer value will specify Fourier order of seasonality.
    :param weekly_seasonality: Should weekly seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :param daily_seasonality: Should daily seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :return: DataFrame with forecasts, with temporal column at beginning if present
    """
    # validate inputs
    if not time_grain:
        raise QueryObjectValidationError(_("Time grain missing"))
    if time_grain not in PROPHET_TIME_GRAIN_MAP:
        raise QueryObjectValidationError(
            _(
                "Unsupported time grain: %(time_grain)s",
                time_grain=time_grain,
            ))
    freq = PROPHET_TIME_GRAIN_MAP[time_grain]
    # check type at runtime due to marshmallow schema not being able to handle
    # union types
    if not periods or periods < 0 or not isinstance(periods, int):
        raise QueryObjectValidationError(
            _("Periods must be a positive integer value"))
    if not confidence_interval or confidence_interval <= 0 or confidence_interval >= 1:
        raise QueryObjectValidationError(
            _("Confidence interval must be between 0 and 1 (exclusive)"))
    if DTTM_ALIAS not in df.columns:
        raise QueryObjectValidationError(
            _("DataFrame must include temporal column"))
    if len(df.columns) < 2:
        raise QueryObjectValidationError(
            _("DataFrame include at least one series"))

    target_df = DataFrame()
    for column in [column for column in df.columns if column != DTTM_ALIAS]:
        fit_df = _prophet_fit_and_predict(
            df=df[[DTTM_ALIAS, column]].rename(columns={
                DTTM_ALIAS: "ds",
                column: "y"
            }),
            confidence_interval=confidence_interval,
            yearly_seasonality=_prophet_parse_seasonality(yearly_seasonality),
            weekly_seasonality=_prophet_parse_seasonality(weekly_seasonality),
            daily_seasonality=_prophet_parse_seasonality(daily_seasonality),
            periods=periods,
            freq=freq,
        )
        new_columns = [
            f"{column}__yhat",
            f"{column}__yhat_lower",
            f"{column}__yhat_upper",
            f"{column}",
        ]
        fit_df.columns = new_columns
        if target_df.empty:
            target_df = fit_df
        else:
            for new_column in new_columns:
                target_df = target_df.assign(
                    **{new_column: fit_df[new_column]})
    target_df.reset_index(level=0, inplace=True)
    return target_df.rename(columns={"ds": DTTM_ALIAS})
Example #22
    def get_df_payload(self,
                       query_obj: QueryObject,
                       force_cached: Optional[bool] = False) -> Dict[str, Any]:
        """Handles caching around the df payload retrieval"""
        cache_key = self.query_cache_key(query_obj)
        cache = QueryCacheManager.get(
            cache_key,
            CacheRegion.DATA,
            self._query_context.force,
            force_cached,
        )

        if query_obj and cache_key and not cache.is_loaded:
            try:
                invalid_columns = [
                    col for col in
                    get_column_names_from_columns(query_obj.columns) +
                    get_column_names_from_metrics(query_obj.metrics or [])
                    if (col not in self._qc_datasource.column_names
                        and col != DTTM_ALIAS)
                ]

                if invalid_columns:
                    raise QueryObjectValidationError(
                        _(
                            "Columns missing in datasource: %(invalid_columns)s",
                            invalid_columns=invalid_columns,
                        ))

                query_result = self.get_query_result(query_obj)
                annotation_data = self.get_annotation_data(query_obj)
                cache.set_query_result(
                    key=cache_key,
                    query_result=query_result,
                    annotation_data=annotation_data,
                    force_query=self._query_context.force,
                    timeout=self.get_cache_timeout(),
                    datasource_uid=self._qc_datasource.uid,
                    region=CacheRegion.DATA,
                )
            except QueryObjectValidationError as ex:
                cache.error_message = str(ex)
                cache.status = QueryStatus.FAILED

        # The N-dimensional DataFrame has been converted into a flat DataFrame
        # by the `flatten` operator; commas in the column names are escaped by
        # `escape_separator`, so the resulting DataFrame columns should be unescaped
        label_map = {
            unescape_separator(col):
            [unescape_separator(col) for col in re.split(r"(?<!\\),\s", col)]
            for col in cache.df.columns.values
        }
        cache.df.columns = [
            unescape_separator(col) for col in cache.df.columns.values
        ]

        return {
            "cache_key": cache_key,
            "cached_dttm": cache.cache_dttm,
            "cache_timeout": self.get_cache_timeout(),
            "df": cache.df,
            "applied_template_filters": cache.applied_template_filters,
            "annotation_data": cache.annotation_data,
            "error": cache.error_message,
            "is_cached": cache.is_cached,
            "query": cache.query,
            "status": cache.status,
            "stacktrace": cache.stacktrace,
            "rowcount": len(cache.df.index),
            "from_dttm": query_obj.from_dttm,
            "to_dttm": query_obj.to_dttm,
            "label_map": label_map,
        }
Example #23
def rolling(  # pylint: disable=too-many-arguments
    df: DataFrame,
    columns: Dict[str, str],
    rolling_type: str,
    window: Optional[int] = None,
    rolling_type_options: Optional[Dict[str, Any]] = None,
    center: bool = False,
    win_type: Optional[str] = None,
    min_periods: Optional[int] = None,
) -> DataFrame:
    """
    Apply a rolling window on the dataset. See the Pandas docs for further details:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html

    :param df: DataFrame on which the rolling period will be based.
    :param columns: columns on which to perform rolling, mapping source column to
           target column. For instance, `{'y': 'y'}` will replace the column `y` with
           the rolling value in `y`, while `{'y': 'y2'}` will add a column `y2` based
           on rolling values calculated from `y`, leaving the original column `y`
           unchanged.
    :param rolling_type: Type of rolling window. Any numpy function will work.
    :param window: Size of the window.
    :param rolling_type_options: Optional options to pass to rolling method. Needed
           for e.g. quantile operation.
    :param center: Should the label be at the center of the window.
    :param win_type: Type of window function.
    :param min_periods: The minimum amount of periods required for a row to be included
                        in the result set.
    :return: DataFrame with the rolling columns
    :raises QueryObjectValidationError: If the request is incorrect
    """
    rolling_type_options = rolling_type_options or {}
    df_rolling = df[columns.keys()]
    kwargs: Dict[str, Union[str, int]] = {}
    if window is None:
        raise QueryObjectValidationError(
            _("Undefined window for rolling operation"))
    if window == 0:
        raise QueryObjectValidationError(_("Window must be > 0"))

    kwargs["window"] = window
    if min_periods is not None:
        kwargs["min_periods"] = min_periods
    if center is not None:
        kwargs["center"] = center
    if win_type is not None:
        kwargs["win_type"] = win_type

    df_rolling = df_rolling.rolling(**kwargs)
    if rolling_type not in DENYLIST_ROLLING_FUNCTIONS or not hasattr(
            df_rolling, rolling_type):
        raise QueryObjectValidationError(
            _("Invalid rolling_type: %(type)s", type=rolling_type))
    try:
        df_rolling = getattr(df_rolling, rolling_type)(**rolling_type_options)
    except TypeError as ex:
        raise QueryObjectValidationError(
            _(
                "Invalid options for %(rolling_type)s: %(options)s",
                rolling_type=rolling_type,
                options=rolling_type_options,
            )) from ex
    df = _append_columns(df, df_rolling, columns)
    if min_periods:
        df = df[min_periods:]
    return df
Example #24
def pivot(  # pylint: disable=too-many-arguments
    df: DataFrame,
    index: List[str],
    aggregates: Dict[str, Dict[str, Any]],
    columns: Optional[List[str]] = None,
    metric_fill_value: Optional[Any] = None,
    column_fill_value: Optional[str] = None,
    drop_missing_columns: Optional[bool] = True,
    combine_value_with_metric: bool = False,
    marginal_distributions: Optional[bool] = None,
    marginal_distribution_name: Optional[str] = None,
    flatten_columns: bool = True,
) -> DataFrame:
    """
    Perform a pivot operation on a DataFrame.

    :param df: Object on which pivot operation will be performed
    :param index: Columns to group by on the table index (=rows)
    :param columns: Columns to group by on the table columns
    :param metric_fill_value: Value to replace missing values with
    :param column_fill_value: Value to replace missing pivot columns with
    :param drop_missing_columns: Do not include columns whose entries are all missing
    :param combine_value_with_metric: Display metrics side by side within each column,
           as opposed to each column being displayed side by side for each metric.
    :param aggregates: A mapping from aggregate column name to the aggregate
           config.
    :param marginal_distributions: Add totals for row/column. Default to False
    :param marginal_distribution_name: Name of row/column with marginal distribution.
           Default to 'All'.
    :param flatten_columns: Convert column names to strings
    :return: A pivot table
    :raises QueryObjectValidationError: If the request is incorrect
    """
    if not index:
        raise QueryObjectValidationError(
            _("Pivot operation requires at least one index"))
    if not aggregates:
        raise QueryObjectValidationError(
            _("Pivot operation must include at least one aggregate"))

    if column_fill_value:
        df[columns] = df[columns].fillna(value=column_fill_value)

    aggregate_funcs = _get_aggregate_funcs(df, aggregates)

    # TODO (villebro): Pandas 1.0.3 doesn't yet support NamedAgg in pivot_table.
    #  Remove once/if support is added.
    aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()}

    df = df.pivot_table(
        values=aggfunc.keys(),
        index=index,
        columns=columns,
        aggfunc=aggfunc,
        fill_value=metric_fill_value,
        dropna=drop_missing_columns,
        margins=marginal_distributions,
        margins_name=marginal_distribution_name,
    )

    if combine_value_with_metric:
        df = df.stack(0).unstack()

    # Make index regular column
    if flatten_columns:
        df.columns = [
            _flatten_column_after_pivot(col, aggregates) for col in df.columns
        ]
    # return index as regular column
    df.reset_index(level=0, inplace=True)
    return df
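A small illustration of the pivot performed in Example #24: pandas pivot_table with an aggfunc mapping, followed by flattening the resulting multi-level columns into plain strings (data and names are made up).

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "country": ["FI", "FI", "SE", "SE"],
        "gender": ["boy", "girl", "boy", "girl"],
        "sales": [1, 2, 3, 4],
    }
)
aggfunc = {"sales": np.sum}

pivoted = df.pivot_table(
    values=list(aggfunc.keys()),
    index=["country"],
    columns=["gender"],
    aggfunc=aggfunc,
    fill_value=0,
)
# Flatten ('sales', 'boy') -> 'sales, boy' so the frame has plain string columns.
pivoted.columns = [", ".join(map(str, col)) for col in pivoted.columns]
pivoted.reset_index(inplace=True)
print(pivoted)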
Example #25
    def processing_time_offsets(  # pylint: disable=too-many-locals
        self, df: pd.DataFrame, query_object: QueryObject,
    ) -> CachedTimeOffset:
        # ensure query_object is immutable
        query_object_clone = copy.copy(query_object)
        queries: List[str] = []
        cache_keys: List[Optional[str]] = []
        rv_dfs: List[pd.DataFrame] = [df]

        time_offsets = query_object.time_offsets
        outer_from_dttm = query_object.from_dttm
        outer_to_dttm = query_object.to_dttm
        for offset in time_offsets:
            try:
                query_object_clone.from_dttm = get_past_or_future(
                    offset, outer_from_dttm,
                )
                query_object_clone.to_dttm = get_past_or_future(offset, outer_to_dttm)
            except ValueError as ex:
                raise QueryObjectValidationError(str(ex)) from ex
            # make sure subquery use main query where clause
            query_object_clone.inner_from_dttm = outer_from_dttm
            query_object_clone.inner_to_dttm = outer_to_dttm
            query_object_clone.time_offsets = []
            query_object_clone.post_processing = []

            if not query_object.from_dttm or not query_object.to_dttm:
                raise QueryObjectValidationError(
                    _(
                        "An enclosed time range (both start and end) must be specified "
                        "when using a Time Comparison."
                    )
                )
            # `offset` is added to the hash function
            cache_key = self.query_cache_key(query_object_clone, time_offset=offset)
            cache = QueryCacheManager.get(cache_key, CacheRegion.DATA, self.force)
            # check whether the cache was hit
            if cache.is_loaded:
                rv_dfs.append(cache.df)
                queries.append(cache.query)
                cache_keys.append(cache_key)
                continue

            query_object_clone_dct = query_object_clone.to_dict()
            # rename metrics: SUM(value) => SUM(value) 1 year ago
            metrics_mapping = {
                metric: TIME_COMPARISION.join([metric, offset])
                for metric in get_metric_names(
                    query_object_clone_dct.get("metrics", [])
                )
            }
            join_keys = [col for col in df.columns if col not in metrics_mapping.keys()]

            result = self.datasource.query(query_object_clone_dct)
            queries.append(result.query)
            cache_keys.append(None)

            offset_metrics_df = result.df
            if offset_metrics_df.empty:
                offset_metrics_df = pd.DataFrame(
                    {
                        col: [np.NaN]
                        for col in join_keys + list(metrics_mapping.values())
                    }
                )
            else:
                # 1. normalize df, set dttm column
                offset_metrics_df = self.normalize_df(
                    offset_metrics_df, query_object_clone
                )

                # 2. rename extra query columns
                offset_metrics_df = offset_metrics_df.rename(columns=metrics_mapping)

                # 3. set time offset for dttm column
                offset_metrics_df[DTTM_ALIAS] = offset_metrics_df[
                    DTTM_ALIAS
                ] - DateOffset(**normalize_time_delta(offset))

            # df left join `offset_metrics_df`
            offset_df = self.left_join_df(
                left_df=df, right_df=offset_metrics_df, join_keys=join_keys,
            )
            offset_slice = offset_df[metrics_mapping.values()]

            # set offset_slice to cache and stack.
            value = {
                "df": offset_slice,
                "query": result.query,
            }
            cache.set(
                key=cache_key,
                value=value,
                timeout=self.cache_timeout,
                datasource_uid=self.datasource.uid,
                region=CacheRegion.DATA,
            )
            rv_dfs.append(offset_slice)

        rv_df = pd.concat(rv_dfs, axis=1, copy=False) if time_offsets else df
        return CachedTimeOffset(df=rv_df, queries=queries, cache_keys=cache_keys)
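The core of the time-comparison logic in Example #25 is to shift the offset query's timestamps so they line up with the main window, rename its metrics with the offset suffix, and join the frames on the non-metric columns. Below is a plain-pandas sketch of that step with made-up data and a hard-coded one-year offset (the `__` separator is illustrative).

import pandas as pd

main_df = pd.DataFrame(
    {"__timestamp": pd.to_datetime(["2021-01-01", "2021-02-01"]), "SUM(value)": [10, 20]}
)
offset_df = pd.DataFrame(
    {"__timestamp": pd.to_datetime(["2020-01-01", "2020-02-01"]), "SUM(value)": [7, 9]}
)

# rename metrics: SUM(value) => SUM(value)__1 year ago
offset_df = offset_df.rename(columns={"SUM(value)": "SUM(value)__1 year ago"})
# shift the offset frame's timestamps by one year so rows align with the main window
offset_df["__timestamp"] = offset_df["__timestamp"] + pd.DateOffset(years=1)

join_keys = ["__timestamp"]
print(main_df.merge(offset_df, on=join_keys, how="left"))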
Example #26
def pivot(  # pylint: disable=too-many-arguments
    df: DataFrame,
    index: List[str],
    aggregates: Dict[str, Dict[str, Any]],
    columns: Optional[List[str]] = None,
    metric_fill_value: Optional[Any] = None,
    column_fill_value: Optional[str] = NULL_STRING,
    drop_missing_columns: Optional[bool] = True,
    combine_value_with_metric: bool = False,
    marginal_distributions: Optional[bool] = None,
    marginal_distribution_name: Optional[str] = None,
    flatten_columns: bool = True,
) -> DataFrame:
    """
    Perform a pivot operation on a DataFrame.

    :param df: Object on which pivot operation will be performed
    :param index: Columns to group by on the table index (=rows)
    :param columns: Columns to group by on the table columns
    :param metric_fill_value: Value to replace missing values with
    :param column_fill_value: Value to replace missing pivot columns with. By default
           replaces missing values with "<NULL>". Set to `None` to remove columns
           with missing values.
    :param drop_missing_columns: Do not include columns whose entries are all missing
    :param combine_value_with_metric: Display metrics side by side within each column,
           as opposed to each column being displayed side by side for each metric.
    :param aggregates: A mapping from aggregate column name to the aggregate
           config.
    :param marginal_distributions: Add totals for row/column. Default to False
    :param marginal_distribution_name: Name of row/column with marginal distribution.
           Default to 'All'.
    :param flatten_columns: Convert column names to strings
    :return: A pivot table
    :raises QueryObjectValidationError: If the request is incorrect
    """
    if not index:
        raise QueryObjectValidationError(
            _("Pivot operation requires at least one index"))
    if not aggregates:
        raise QueryObjectValidationError(
            _("Pivot operation must include at least one aggregate"))

    if columns and column_fill_value:
        df[columns] = df[columns].fillna(value=column_fill_value)

    aggregate_funcs = _get_aggregate_funcs(df, aggregates)

    # TODO (villebro): Pandas 1.0.3 doesn't yet support NamedAgg in pivot_table.
    #  Remove once/if support is added.
    aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()}

    # When dropna = False, the pivot_table function will calculate cartesian-product
    # for MultiIndex.
    # https://github.com/apache/superset/issues/15956
    # https://github.com/pandas-dev/pandas/issues/18030
    series_set = set()
    if not drop_missing_columns and columns:
        for row in df[columns].itertuples():
            for metric in aggfunc.keys():
                series_set.add(str(tuple([metric]) + tuple(row[1:])))

    df = df.pivot_table(
        values=aggfunc.keys(),
        index=index,
        columns=columns,
        aggfunc=aggfunc,
        fill_value=metric_fill_value,
        dropna=drop_missing_columns,
        margins=marginal_distributions,
        margins_name=marginal_distribution_name,
    )

    if not drop_missing_columns and len(series_set) > 0 and not df.empty:
        for col in df.columns:
            series = str(col)
            if series not in series_set:
                df = df.drop(col, axis=PandasAxis.COLUMN)

    if combine_value_with_metric:
        df = df.stack(0).unstack()

    # Make index regular column
    if flatten_columns:
        df.columns = [
            _flatten_column_after_pivot(col, aggregates) for col in df.columns
        ]
    # return index as regular column
    df.reset_index(level=0, inplace=True)
    return df
Example #27
def _prophet_fit_and_predict(  # pylint: disable=too-many-arguments
    df,
    confidence_interval,
    yearly_seasonality,
    weekly_seasonality,
    daily_seasonality,
    periods,
    freq,
):
    """
    Fit a bidirectional LSTM model and return a DataFrame with predicted results.
    """
    
    try:
        # availability check only; the LSTM below is used for the actual forecast
        from statsmodels.tsa.arima.model import ARIMA  # pylint: disable=import-error,unused-import
    except ModuleNotFoundError as ex:
        raise QueryObjectValidationError(
            _("Statsmodels package not installed")) from ex

    import numpy as np
    import pandas as pd
    from keras.models import Sequential
    from keras.layers import LSTM, Dense, Bidirectional

    df.columns =['Date','Value']
    df['Date'] = pd.to_datetime(df['Date'])

    dff=df.copy()
    df['Value']=df['Value'].astype(float)
    ll =df['Date'].values.tolist()
    df.set_index('Date', inplace=True)
    df.index = pd.to_datetime(df.index)
    # `freq` is passed in as a parameter, already mapped from the time grain
    dataset = df.resample(freq).mean()


    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(dataset)
    scaled_data = scaler.transform(dataset)


    n_input=int(len(scaled_data)/2) #means how many previous values needs to be taken for predicting next
    n_features= 1 #for univariate
    
    lstm_model = Sequential()
    lstm_model.add(Bidirectional(LSTM(200, activation='relu'), input_shape=(n_input, n_features)))
    lstm_model.add(Dense(1))
    lstm_model.compile(optimizer='adam', loss='mse')

    from keras.preprocessing.sequence import TimeseriesGenerator

    
    n_features= 1
    generator = TimeseriesGenerator(scaled_data, scaled_data, length=n_input, batch_size=1)

    lstm_model.fit(generator, epochs=int(confidence_interval*10))

    lstm_predictions_scaled = list()

    batch = scaled_data[-n_input:]
    current_batch = batch.reshape((1, n_input, n_features))
    # loop for the number of forecasting periods, feeding each prediction back in
    for _ in range(periods):
        lstm_pred = lstm_model.predict(current_batch)[0]
        lstm_predictions_scaled.append(lstm_pred)
        current_batch = np.append(current_batch[:, 1:, :], [[lstm_pred]], axis=1)

    lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)
    fr=pd.DataFrame(lstm_predictions)
    fr.columns=['yhat']

    df.sort_index(inplace=True)

    didx = pd.date_range(start=ll[-1],periods=periods+1,freq=freq)
    future = pd.DataFrame(didx)
    future.columns =['ds']
    future = future.iloc[1:,:]

    fr['ds']=future['ds'].values.tolist()
    fr['ds']=pd.to_datetime(fr['ds'])
    forecast = fr.copy()

    confidence=0.8
    forecast['yhat_upper']=forecast['yhat'] + forecast['yhat']*(1-confidence)/2
    forecast['yhat_lower']=forecast['yhat'] - forecast['yhat']*(1-confidence)/2

    forecast=forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]]
    dff.columns =['ds','y']
    dff['ds'] = pd.to_datetime(dff['ds'], utc = True)
    forecast['ds'] = pd.to_datetime(forecast['ds'], utc = True)
    return forecast.join(dff.set_index("ds"), on="ds").set_index(['ds'])
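The forecasting loop in Example #27 predicts one step at a time and feeds each prediction back into the input window. The sketch below isolates that recursive pattern with a trivial moving-average stand-in instead of the LSTM, so it runs without Keras.

import numpy as np

history = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
n_input = 3   # size of the rolling input window
periods = 4   # number of future steps to forecast

predictions = []
current_batch = history[-n_input:].copy()
for _ in range(periods):
    pred = current_batch.mean()  # stand-in for lstm_model.predict(...)
    predictions.append(pred)
    # drop the oldest value and append the new prediction, as the LSTM loop does
    current_batch = np.append(current_batch[1:], pred)

print(predictions)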