def apply_post_process(
    result: Dict[Any, Any],
    form_data: Optional[Dict[str, Any]] = None,
) -> Dict[Any, Any]:
    """Run the viz-type specific post-processor over every query in ``result``.

    Each query's ``data`` payload is parsed from CSV into a DataFrame,
    transformed by the registered post-processor, and serialized back to
    CSV; the column/type/rowcount metadata is refreshed from the processed
    frame.  Results whose viz type has no registered post-processor are
    returned unchanged.

    :param result: chart data response containing a ``queries`` list
    :param form_data: chart form data; ``viz_type`` selects the processor
    :returns: ``result``, mutated in place
    """
    form_data = form_data or {}
    viz_type = form_data.get("viz_type")
    if viz_type not in post_processors:
        return result
    post_processor = post_processors[viz_type]

    for query in result["queries"]:
        frame = pd.read_csv(StringIO(query["data"]))
        processed = post_processor(frame, form_data)

        csv_buffer = StringIO()
        processed.to_csv(csv_buffer)
        csv_buffer.seek(0)

        query["data"] = csv_buffer.getvalue()
        query["colnames"] = list(processed.columns)
        query["coltypes"] = extract_dataframe_dtypes(processed)
        query["rowcount"] = len(processed.index)

    return result
def test_extract_dataframe_dtypes(self):
    """extract_dataframe_dtypes maps every DataFrame column to its
    GenericDataType, and uses the datasource so that non-timestamp columns
    flagged ``is_dttm`` are still reported as temporal."""
    slc = self.get_slice("Girls", db.session)
    # (column name, expected generic type, sample values)
    cols: Tuple[Tuple[str, GenericDataType, List[Any]], ...] = (
        ("dt", GenericDataType.TEMPORAL, [date(2021, 2, 4), date(2021, 2, 4)]),
        (
            "dttm",
            GenericDataType.TEMPORAL,
            [datetime(2021, 2, 4, 1, 1, 1), datetime(2021, 2, 4, 1, 1, 1)],
        ),
        ("str", GenericDataType.STRING, ["foo", "foo"]),
        ("int", GenericDataType.NUMERIC, [1, 1]),
        ("float", GenericDataType.NUMERIC, [0.5, 0.5]),
        ("mixed-int-float", GenericDataType.NUMERIC, [0.5, 1.0]),
        ("bool", GenericDataType.BOOLEAN, [True, False]),
        ("mixed-str-int", GenericDataType.STRING, ["abc", 1.0]),
        ("obj", GenericDataType.STRING, [{"a": 1}, {"a": 1}]),
        ("dt_null", GenericDataType.TEMPORAL, [None, date(2021, 2, 4)]),
        (
            "dttm_null",
            GenericDataType.TEMPORAL,
            [None, datetime(2021, 2, 4, 1, 1, 1)],
        ),
        ("str_null", GenericDataType.STRING, [None, "foo"]),
        ("int_null", GenericDataType.NUMERIC, [None, 1]),
        ("float_null", GenericDataType.NUMERIC, [None, 0.5]),
        ("bool_null", GenericDataType.BOOLEAN, [None, False]),
        ("obj_null", GenericDataType.STRING, [None, {"a": 1}]),
        # Non-timestamp columns should be identified as temporal if
        # `is_dttm` is set to `True` in the underlying datasource
        ("ds", GenericDataType.TEMPORAL, [None, {"ds": "2017-01-01"}]),
    )
    df = pd.DataFrame(data={name: values for name, _, values in cols})
    expected_types = [expected for _, expected, _ in cols]
    assert extract_dataframe_dtypes(df, slc.datasource) == expected_types
def _get_full(
    query_context: "QueryContext",
    query_obj: "QueryObject",
    force_cached: Optional[bool] = False,
) -> Dict[str, Any]:
    """Return the full chart-data payload for a single query object.

    Executes the query (or reads it from cache), attaches column metadata
    and serialized data on success, and annotates which of the requested
    filters were applied vs. rejected.  When ``RESULTS`` is requested and
    the query succeeded, only the data is returned.
    """
    datasource = _get_datasource(query_context, query_obj)
    result_type = query_obj.result_type or query_context.result_type
    payload = query_context.get_df_payload(query_obj, force_cached=force_cached)
    df = payload["df"]
    status = payload["status"]

    if status != QueryStatus.FAILED:
        payload["colnames"] = list(df.columns)
        payload["coltypes"] = extract_dataframe_dtypes(df)
        payload["data"] = query_context.get_data(df)
    # The raw DataFrame is never part of the serialized payload.
    del payload["df"]

    filter_columns = cast(
        List[str], [flt.get("col") for flt in query_obj.filter]
    )
    datasource_columns = set(datasource.column_names)
    applied_time_columns, rejected_time_columns = get_time_filter_status(
        datasource, query_obj.applied_time_extras
    )

    applied = [
        {"column": col} for col in filter_columns if col in datasource_columns
    ]
    rejected = [
        {"reason": "not_in_datasource", "column": col}
        for col in filter_columns
        if col not in datasource_columns
    ]
    payload["applied_filters"] = applied + applied_time_columns
    payload["rejected_filters"] = rejected + rejected_time_columns

    if result_type == ChartDataResultType.RESULTS and status != QueryStatus.FAILED:
        return {"data": payload.get("data")}
    return payload
def test_extract_dataframe_dtypes(self):
    """extract_dataframe_dtypes maps every DataFrame column — including
    null-containing and mixed-type ones — to its GenericDataType."""
    # (column name, expected generic type, sample values)
    cols: Tuple[Tuple[str, GenericDataType, List[Any]], ...] = (
        ("dt", GenericDataType.TEMPORAL, [date(2021, 2, 4), date(2021, 2, 4)]),
        (
            "dttm",
            GenericDataType.TEMPORAL,
            [datetime(2021, 2, 4, 1, 1, 1), datetime(2021, 2, 4, 1, 1, 1)],
        ),
        ("str", GenericDataType.STRING, ["foo", "foo"]),
        ("int", GenericDataType.NUMERIC, [1, 1]),
        ("float", GenericDataType.NUMERIC, [0.5, 0.5]),
        ("mixed-int-float", GenericDataType.NUMERIC, [0.5, 1.0]),
        ("bool", GenericDataType.BOOLEAN, [True, False]),
        ("mixed-str-int", GenericDataType.STRING, ["abc", 1.0]),
        ("obj", GenericDataType.STRING, [{"a": 1}, {"a": 1}]),
        ("dt_null", GenericDataType.TEMPORAL, [None, date(2021, 2, 4)]),
        (
            "dttm_null",
            GenericDataType.TEMPORAL,
            [None, datetime(2021, 2, 4, 1, 1, 1)],
        ),
        ("str_null", GenericDataType.STRING, [None, "foo"]),
        ("int_null", GenericDataType.NUMERIC, [None, 1]),
        ("float_null", GenericDataType.NUMERIC, [None, 0.5]),
        ("bool_null", GenericDataType.BOOLEAN, [None, False]),
        ("obj_null", GenericDataType.STRING, [None, {"a": 1}]),
    )
    df = pd.DataFrame(data={name: values for name, _, values in cols})
    expected_types = [expected for _, expected, _ in cols]
    assert extract_dataframe_dtypes(df) == expected_types
def apply_post_process(
    result: Dict[Any, Any],
    form_data: Optional[Dict[str, Any]] = None,
    datasource: Optional["BaseDatasource"] = None,
) -> Dict[Any, Any]:
    """Apply the viz-type specific post-processor to every query in ``result``.

    Each query's ``data`` payload is decoded (JSON dict or CSV text) into a
    DataFrame, optionally renamed to verbose column labels, transformed by
    the registered post-processor, and re-encoded in its original format.
    Column/index/type/rowcount metadata is refreshed from the processed
    frame, and hierarchical (tuple) column/index labels are flattened to
    strings so the payload can be JSON-encoded later.

    :param result: chart data response containing a ``queries`` list
    :param form_data: chart form data; ``viz_type`` selects the processor
    :param datasource: optional datasource used for verbose column names
        and dtype extraction
    :returns: ``result``, mutated in place
    :raises Exception: if a query carries an unsupported ``result_format``
    """
    form_data = form_data or {}
    viz_type = form_data.get("viz_type")
    if viz_type not in post_processors:
        return result
    post_processor = post_processors[viz_type]

    # Hoisted out of the loop: the original rebuilt this generator for every
    # query, although the set of supported format values is loop-invariant.
    supported_formats = frozenset(rf.value for rf in ChartDataResultFormat)

    for query in result["queries"]:
        result_format = query["result_format"]
        if result_format not in supported_formats:
            raise Exception(f"Result format {result_format} not supported")

        if not query["data"]:
            # do not try to process empty data
            continue

        if result_format == ChartDataResultFormat.JSON:
            df = pd.DataFrame.from_dict(query["data"])
        elif result_format == ChartDataResultFormat.CSV:
            df = pd.read_csv(StringIO(query["data"]))

        # convert all columns to verbose (label) name
        if datasource:
            df.rename(columns=datasource.data["verbose_map"], inplace=True)

        processed_df = post_processor(df, form_data, datasource)

        query["colnames"] = list(processed_df.columns)
        query["indexnames"] = list(processed_df.index)
        query["coltypes"] = extract_dataframe_dtypes(processed_df, datasource)
        query["rowcount"] = len(processed_df.index)

        # Flatten hierarchical columns/index since they are represented as
        # `Tuple[str]`. Otherwise encoding to JSON later will fail because
        # maps cannot have tuples as their keys in JSON.
        processed_df.columns = [
            " ".join(str(name) for name in column).strip()
            if isinstance(column, tuple)
            else column
            for column in processed_df.columns
        ]
        processed_df.index = [
            " ".join(str(name) for name in index).strip()
            if isinstance(index, tuple)
            else index
            for index in processed_df.index
        ]

        if result_format == ChartDataResultFormat.JSON:
            query["data"] = processed_df.to_dict()
        elif result_format == ChartDataResultFormat.CSV:
            buf = StringIO()
            processed_df.to_csv(buf)
            buf.seek(0)
            query["data"] = buf.getvalue()

    return result
def get_single_payload(
    self,
    query_obj: QueryObject,
    force_cached: Optional[bool] = False,
) -> Dict[str, Any]:
    """Build the response payload for a single query object.

    Depending on ``self.result_type`` this returns the generated SQL
    (``QUERY``), a capped sample of raw rows (``SAMPLES``), only the data
    on success (``RESULTS``), or the full payload including filter and
    status metadata.
    """
    if self.result_type == utils.ChartDataResultType.QUERY:
        return {
            "query": self.datasource.get_query_str(query_obj.to_dict()),
            "language": self.datasource.query_language,
        }

    if self.result_type == utils.ChartDataResultType.SAMPLES:
        # Rewrite the query as a plain row sample: strip aggregation,
        # ordering and post-processing, and cap by SAMPLES_ROW_LIMIT.
        requested_limit = query_obj.row_limit or math.inf
        query_obj = copy.copy(query_obj)
        query_obj.is_timeseries = False
        query_obj.orderby = []
        query_obj.groupby = []
        query_obj.metrics = []
        query_obj.post_processing = []
        query_obj.row_limit = min(requested_limit, config["SAMPLES_ROW_LIMIT"])
        query_obj.row_offset = 0
        query_obj.columns = [o.column_name for o in self.datasource.columns]

    payload = self.get_df_payload(query_obj, force_cached=force_cached)
    df = payload["df"]
    status = payload["status"]
    if status != utils.QueryStatus.FAILED:
        payload["colnames"] = list(df.columns)
        payload["coltypes"] = utils.extract_dataframe_dtypes(df)
        payload["data"] = self.get_data(df)
    # The raw DataFrame is never part of the serialized payload.
    del payload["df"]

    filter_columns = cast(
        List[str], [flt.get("col") for flt in query_obj.filter]
    )
    known_columns = set(self.datasource.column_names)
    applied_time_columns, rejected_time_columns = utils.get_time_filter_status(
        self.datasource, query_obj.applied_time_extras
    )
    payload["applied_filters"] = [
        {"column": col} for col in filter_columns if col in known_columns
    ] + applied_time_columns
    payload["rejected_filters"] = [
        {"reason": "not_in_datasource", "column": col}
        for col in filter_columns
        if col not in known_columns
    ] + rejected_time_columns

    if (
        self.result_type == utils.ChartDataResultType.RESULTS
        and status != utils.QueryStatus.FAILED
    ):
        return {"data": payload["data"]}
    return payload
def _get_full(
    query_context: QueryContext,
    query_obj: QueryObject,
    force_cached: Optional[bool] = False,
) -> Dict[str, Any]:
    """Return the full chart-data payload for a single query object.

    Executes the query (or reads it from cache), attaches column/index/type
    metadata and serialized data on success, and annotates which of the
    requested filters were applied vs. rejected.  When ``RESULTS`` is
    requested and the query succeeded, only data plus column metadata is
    returned.
    """
    datasource = _get_datasource(query_context, query_obj)
    result_type = query_obj.result_type or query_context.result_type
    payload = query_context.get_df_payload(query_obj, force_cached=force_cached)
    applied_template_filters = payload.get("applied_template_filters", [])
    df = payload["df"]
    status = payload["status"]

    if status != QueryStatus.FAILED:
        payload["colnames"] = list(df.columns)
        payload["indexnames"] = list(df.index)
        payload["coltypes"] = extract_dataframe_dtypes(df, datasource)
        payload["data"] = query_context.get_data(df)
        payload["result_format"] = query_context.result_format
    # The raw DataFrame is never part of the serialized payload.
    del payload["df"]

    filter_columns = cast(
        List[str], [flt.get("col") for flt in query_obj.filter]
    )
    datasource_columns = set(datasource.column_names)
    applied_time_columns, rejected_time_columns = get_time_filter_status(
        datasource, query_obj.applied_time_extras
    )

    def is_applied(col: Any) -> bool:
        # A filter column counts as applied when it is an ad-hoc column, a
        # physical datasource column, or was applied via a template filter.
        return (
            is_adhoc_column(col)
            or col in datasource_columns
            or col in applied_template_filters
        )

    payload["applied_filters"] = [
        {"column": get_column_name(col)}
        for col in filter_columns
        if is_applied(col)
    ] + applied_time_columns
    payload["rejected_filters"] = [
        {"reason": ExtraFiltersReasonType.COL_NOT_IN_DATASOURCE, "column": col}
        for col in filter_columns
        if not is_applied(col)
    ] + rejected_time_columns

    if result_type == ChartDataResultType.RESULTS and status != QueryStatus.FAILED:
        return {
            "data": payload.get("data"),
            "colnames": payload.get("colnames"),
            "coltypes": payload.get("coltypes"),
        }
    return payload