def test_get_metric_names():
    """get_metric_names resolves str/adhoc metrics, honoring a verbose map."""
    metrics = [STR_METRIC, SIMPLE_SUM_ADHOC_METRIC, SQL_ADHOC_METRIC]

    # Without a verbose map, raw labels are returned.
    assert get_metric_names(metrics) == ["my_metric", "my SUM", "my_sql"]

    # With a verbose map, matching entries are translated; others pass through.
    verbose_map = {STR_METRIC: "My Metric"}
    assert get_metric_names(metrics, verbose_map) == [
        "My Metric",
        "my SUM",
        "my_sql",
    ]
def pivot_table(
    df: pd.DataFrame,
    form_data: Dict[str, Any],
    datasource: Optional["BaseDatasource"] = None,
) -> pd.DataFrame:
    """
    Pivot table (v1).

    Map legacy (v1) form_data keys onto ``pivot_df`` arguments, resolving
    column/metric labels through the datasource's verbose map when available.
    """
    verbose_map = datasource.data["verbose_map"] if datasource else None
    if form_data.get("granularity") == "all" and DTTM_ALIAS in df:
        del df[DTTM_ALIAS]

    # Translate v1 pandas aggfunc names into the v2 display names.
    v1_to_v2_aggfunc = {
        "sum": "Sum",
        "mean": "Average",
        "min": "Minimum",
        "max": "Maximum",
        "std": "Sample Standard Deviation",
        "var": "Sample Variance",
    }
    # v1 uses a single margins flag for both row and column totals.
    show_totals = bool(form_data.get("pivot_margins"))
    return pivot_df(
        df,
        rows=get_column_names(form_data.get("groupby"), verbose_map),
        columns=get_column_names(form_data.get("columns"), verbose_map),
        metrics=get_metric_names(form_data["metrics"], verbose_map),
        aggfunc=v1_to_v2_aggfunc.get(form_data.get("pandas_aggfunc", "sum"), "Sum"),
        transpose_pivot=bool(form_data.get("transpose_pivot")),
        combine_metrics=bool(form_data.get("combine_metric")),
        show_rows_total=show_totals,
        show_columns_total=show_totals,
        apply_metrics_on_rows=False,
    )
def pivot_table_v2(
    df: pd.DataFrame,
    form_data: Dict[str, Any],
    datasource: Optional["BaseDatasource"] = None,
) -> pd.DataFrame:
    """
    Pivot table v2.

    Forward v2 form_data options directly to ``pivot_df``, resolving
    column/metric labels through the datasource's verbose map when available.
    """
    verbose_map = datasource.data["verbose_map"] if datasource else None
    if form_data.get("granularity_sqla") == "all" and DTTM_ALIAS in df:
        del df[DTTM_ALIAS]

    row_names = get_column_names(form_data.get("groupbyRows"), verbose_map)
    column_names = get_column_names(form_data.get("groupbyColumns"), verbose_map)
    metric_labels = get_metric_names(form_data["metrics"], verbose_map)
    return pivot_df(
        df,
        rows=row_names,
        columns=column_names,
        metrics=metric_labels,
        aggfunc=form_data.get("aggregateFunction", "Sum"),
        transpose_pivot=bool(form_data.get("transposePivot")),
        combine_metrics=bool(form_data.get("combineMetric")),
        show_rows_total=bool(form_data.get("rowTotals")),
        show_columns_total=bool(form_data.get("colTotals")),
        apply_metrics_on_rows=form_data.get("metricsLayout") == "ROWS",
    )
def processing_time_offsets(  # pylint: disable=too-many-locals
    self,
    df: pd.DataFrame,
    query_object: QueryObject,
) -> CachedTimeOffset:
    """
    Run (or load from cache) one extra query per entry in
    ``query_object.time_offsets``, rename the offset metric columns
    ("SUM(value)" => "SUM(value)__1 year ago"-style), and stack the offset
    metric slices column-wise next to the main-query frame.

    :param df: result frame of the main query; used for join keys and as the
        first frame in the concatenated result
    :param query_object: main query object; copied so the original stays intact
    :returns: CachedTimeOffset with the combined frame, the executed queries,
        and one cache key per offset (``None`` for cache misses)
    :raises QueryObjectValidationError: if an offset string cannot be parsed,
        or if the main query lacks a closed (start and end) time range
    """
    # ensure query_object is immutable
    query_object_clone = copy.copy(query_object)
    queries: List[str] = []
    cache_keys: List[Optional[str]] = []
    # First element is the main-query frame; each offset appends its slice.
    rv_dfs: List[pd.DataFrame] = [df]

    time_offsets = query_object.time_offsets
    outer_from_dttm = query_object.from_dttm
    outer_to_dttm = query_object.to_dttm
    for offset in time_offsets:
        try:
            # Shift the clone's time window by the offset (e.g. "1 year ago").
            query_object_clone.from_dttm = get_past_or_future(
                offset,
                outer_from_dttm,
            )
            query_object_clone.to_dttm = get_past_or_future(offset, outer_to_dttm)
        except ValueError as ex:
            raise QueryObjectValidationError(str(ex)) from ex
        # make sure subquery use main query where clause
        query_object_clone.inner_from_dttm = outer_from_dttm
        query_object_clone.inner_to_dttm = outer_to_dttm
        # The offset query must not recurse into offsets/post-processing.
        query_object_clone.time_offsets = []
        query_object_clone.post_processing = []

        # NOTE(review): validates the original query_object, not the clone —
        # an open-ended range cannot be shifted meaningfully.
        if not query_object.from_dttm or not query_object.to_dttm:
            raise QueryObjectValidationError(
                _(
                    "An enclosed time range (both start and end) must be specified "
                    "when using a Time Comparison."
                )
            )
        # `offset` is added to the hash function
        cache_key = self.query_cache_key(query_object_clone, time_offset=offset)
        cache = QueryCacheManager.get(cache_key, CacheRegion.DATA, self.force)
        # whether hit on the cache
        if cache.is_loaded:
            rv_dfs.append(cache.df)
            queries.append(cache.query)
            cache_keys.append(cache_key)
            continue

        query_object_clone_dct = query_object_clone.to_dict()
        # rename metrics: SUM(value) => SUM(value) 1 year ago
        metrics_mapping = {
            metric: TIME_COMPARISION.join([metric, offset])
            for metric in get_metric_names(
                query_object_clone_dct.get("metrics", [])
            )
        }
        # Non-metric columns are the join keys between main and offset frames.
        join_keys = [col for col in df.columns if col not in metrics_mapping.keys()]

        result = self.datasource.query(query_object_clone_dct)
        queries.append(result.query)
        cache_keys.append(None)

        offset_metrics_df = result.df
        if offset_metrics_df.empty:
            # Empty result: synthesize a single all-NaN row so the join and
            # column selection below still produce the expected columns.
            offset_metrics_df = pd.DataFrame(
                {
                    col: [np.NaN]
                    for col in join_keys + list(metrics_mapping.values())
                }
            )
        else:
            # 1. normalize df, set dttm column
            offset_metrics_df = self.normalize_df(
                offset_metrics_df, query_object_clone
            )
            # 2. rename extra query columns
            offset_metrics_df = offset_metrics_df.rename(columns=metrics_mapping)
            # 3. set time offset for dttm column
            # Shift dttm forward so offset rows align with the main query rows.
            offset_metrics_df[DTTM_ALIAS] = offset_metrics_df[
                DTTM_ALIAS
            ] - DateOffset(**normalize_time_delta(offset))

        # df left join `offset_metrics_df`
        offset_df = self.left_join_df(
            left_df=df,
            right_df=offset_metrics_df,
            join_keys=join_keys,
        )
        # Keep only the renamed offset metric columns from the joined frame.
        offset_slice = offset_df[metrics_mapping.values()]

        # set offset_slice to cache and stack.
        value = {
            "df": offset_slice,
            "query": result.query,
        }
        cache.set(
            key=cache_key,
            value=value,
            timeout=self.cache_timeout,
            datasource_uid=self.datasource.uid,
            region=CacheRegion.DATA,
        )
        rv_dfs.append(offset_slice)

    # Column-wise concat of main frame + one slice per offset; no offsets
    # means the original frame is returned untouched.
    rv_df = pd.concat(rv_dfs, axis=1, copy=False) if time_offsets else df
    return CachedTimeOffset(df=rv_df, queries=queries, cache_keys=cache_keys)
def metric_names(self) -> List[str]:
    """Return metric names (labels), coercing adhoc metrics to strings."""
    metrics = self.metrics or []
    return get_metric_names(metrics)
def metric_names(self) -> List[str]:
    """Return metric names (labels), coercing adhoc metrics to strings.

    Guards against ``self.metrics`` being ``None`` (consistent with the
    sibling implementation, which passes ``self.metrics or []``).
    """
    return get_metric_names(self.metrics or [])
def processing_time_offsets(
    self,
    df: pd.DataFrame,
    query_object: QueryObject,
) -> CachedTimeOffset:
    """
    Run (or load from cache) one extra query per entry in
    ``query_object.time_offsets``, rename the offset metric columns
    ("SUM(value)" => "SUM(value)__1 year ago"-style), and left-join each
    offset frame onto ``df`` on the dttm column.

    :param df: result frame of the main query; rebound to the joined frame
    :param query_object: main query object; copied so the original stays intact
    :returns: CachedTimeOffset with the joined frame, the executed queries,
        and one cache key per offset (``None`` for cache misses)
    :raises QueryObjectValidationError: if an offset string cannot be parsed,
        or if the main query lacks a closed (start and end) time range
    """
    # ensure query_object is immutable
    query_object_clone = copy.copy(query_object)
    queries = []
    cache_keys = []

    time_offsets = query_object.time_offsets
    outer_from_dttm = query_object.from_dttm
    outer_to_dttm = query_object.to_dttm
    for offset in time_offsets:
        try:
            # Shift the clone's time window by the offset (e.g. "1 year ago").
            query_object_clone.from_dttm = get_past_or_future(
                offset,
                outer_from_dttm,
            )
            query_object_clone.to_dttm = get_past_or_future(offset, outer_to_dttm)
        except ValueError as ex:
            # Chain the cause (`from ex`) so the original parse error is kept
            # in the traceback; previously the cause was dropped.
            raise QueryObjectValidationError(str(ex)) from ex
        # make sure subquery use main query where clause
        query_object_clone.inner_from_dttm = outer_from_dttm
        query_object_clone.inner_to_dttm = outer_to_dttm
        # The offset query must not recurse into offsets/post-processing.
        query_object_clone.time_offsets = []
        query_object_clone.post_processing = []

        # An open-ended range cannot be shifted meaningfully.
        if not query_object.from_dttm or not query_object.to_dttm:
            raise QueryObjectValidationError(
                _(
                    "An enclosed time range (both start and end) must be specified "
                    "when using a Time Comparison."
                )
            )
        # `offset` is added to the hash function
        cache_key = self.query_cache_key(query_object_clone, time_offset=offset)
        cache = QueryCacheManager.get(cache_key, CacheRegion.DATA, self.force)
        # whether hit in the cache
        if cache.is_loaded:
            df = self.left_join_on_dttm(df, cache.df)
            queries.append(cache.query)
            cache_keys.append(cache_key)
            continue

        query_object_clone_dct = query_object_clone.to_dict()
        result = self.datasource.query(query_object_clone_dct)
        queries.append(result.query)
        cache_keys.append(None)

        # rename metrics: SUM(value) => SUM(value) 1 year ago
        columns_name_mapping = {
            metric: TIME_COMPARISION.join([metric, offset])
            for metric in get_metric_names(
                query_object_clone_dct.get("metrics", [])
            )
        }
        # Keep the dttm column under its own name for the join below.
        columns_name_mapping[DTTM_ALIAS] = DTTM_ALIAS

        offset_metrics_df = result.df
        if offset_metrics_df.empty:
            # Empty result: synthesize a single all-NaN row so the join
            # below still produces the expected columns.
            offset_metrics_df = pd.DataFrame(
                {col: [np.NaN] for col in columns_name_mapping.values()}
            )
        else:
            # 1. normalize df, set dttm column
            offset_metrics_df = self.normalize_df(
                offset_metrics_df, query_object_clone
            )
            # 2. extract `metrics` columns and `dttm` column from extra query
            offset_metrics_df = offset_metrics_df[columns_name_mapping.keys()]
            # 3. rename extra query columns
            offset_metrics_df = offset_metrics_df.rename(
                columns=columns_name_mapping
            )
            # 4. set offset for dttm column
            # Shift dttm so offset rows align with the main query rows.
            offset_metrics_df[DTTM_ALIAS] = offset_metrics_df[
                DTTM_ALIAS
            ] - DateOffset(**normalize_time_delta(offset))

        # df left join `offset_metrics_df` on `DTTM`
        df = self.left_join_on_dttm(df, offset_metrics_df)

        # set offset df to cache.
        value = {
            "df": offset_metrics_df,
            "query": result.query,
        }
        cache.set(
            key=cache_key,
            value=value,
            timeout=self.cache_timeout,
            datasource_uid=self.datasource.uid,
            region=CacheRegion.DATA,
        )

    return CachedTimeOffset(df=df, queries=queries, cache_keys=cache_keys)