def test_get_samples(test_client, login_as_admin, virtual_dataset):
    """
    Dataset API: Test get dataset samples
    """
    # 1. should cache data
    uri = (
        f"/datasource/samples?datasource_id={virtual_dataset.id}&datasource_type=table"
    )
    # feed the cache
    test_client.post(uri)
    # read from the cache
    rv = test_client.post(uri)
    rv_data = json.loads(rv.data)
    assert rv.status_code == 200
    assert len(rv_data["result"]["data"]) == 10
    assert QueryCacheManager.has(
        rv_data["result"]["cache_key"],
        region=CacheRegion.DATA,
    )
    assert rv_data["result"]["is_cached"]

    # 2. should read through the cache when force=true
    uri2 = f"/datasource/samples?datasource_id={virtual_dataset.id}&datasource_type=table&force=true"
    # feed the cache
    test_client.post(uri2)
    # force a fresh query
    rv2 = test_client.post(uri2)
    rv_data2 = json.loads(rv2.data)
    assert rv2.status_code == 200
    assert len(rv_data2["result"]["data"]) == 10
    assert QueryCacheManager.has(
        rv_data2["result"]["cache_key"],
        region=CacheRegion.DATA,
    )
    assert not rv_data2["result"]["is_cached"]

    # 3. data precision
    assert "colnames" in rv_data2["result"]
    assert "coltypes" in rv_data2["result"]
    assert "data" in rv_data2["result"]

    eager_samples = virtual_dataset.database.get_df(
        f"select * from ({virtual_dataset.sql}) as tbl"
        f' limit {app.config["SAMPLES_ROW_LIMIT"]}'
    )
    # col3 is a Decimal
    eager_samples["col3"] = eager_samples["col3"].apply(float)
    eager_samples = eager_samples.to_dict(orient="records")
    assert eager_samples == rv_data2["result"]["data"]
def get_df_payload(
    self, query_obj: QueryObject, force_cached: Optional[bool] = False
) -> Dict[str, Any]:
    """Handles caching around the df payload retrieval"""
    cache_key = self.query_cache_key(query_obj)
    cache = QueryCacheManager.get(
        cache_key,
        CacheRegion.DATA,
        self._query_context.force,
        force_cached,
    )
    if query_obj and cache_key and not cache.is_loaded:
        try:
            invalid_columns = [
                col
                for col in get_column_names_from_columns(query_obj.columns)
                + get_column_names_from_metrics(query_obj.metrics or [])
                if col not in self._qc_datasource.column_names and col != DTTM_ALIAS
            ]
            if invalid_columns:
                raise QueryObjectValidationError(
                    _(
                        "Columns missing in datasource: %(invalid_columns)s",
                        invalid_columns=invalid_columns,
                    )
                )
            query_result = self.get_query_result(query_obj)
            annotation_data = self.get_annotation_data(query_obj)
            cache.set_query_result(
                key=cache_key,
                query_result=query_result,
                annotation_data=annotation_data,
                force_query=self._query_context.force,
                timeout=self.get_cache_timeout(),
                datasource_uid=self._qc_datasource.uid,
                region=CacheRegion.DATA,
            )
        except QueryObjectValidationError as ex:
            cache.error_message = str(ex)
            cache.status = QueryStatus.FAILED

    return {
        "cache_key": cache_key,
        "cached_dttm": cache.cache_dttm,
        "cache_timeout": self.get_cache_timeout(),
        "df": cache.df,
        "applied_template_filters": cache.applied_template_filters,
        "annotation_data": cache.annotation_data,
        "error": cache.error_message,
        "is_cached": cache.is_cached,
        "query": cache.query,
        "status": cache.status,
        "stacktrace": cache.stacktrace,
        "rowcount": len(cache.df.index),
        "from_dttm": query_obj.from_dttm,
        "to_dttm": query_obj.to_dttm,
    }
def test_get_samples_on_physical_dataset(test_client, login_as_admin, physical_dataset):
    uri = (
        f"/datasource/samples?datasource_id={physical_dataset.id}&datasource_type=table"
    )
    rv = test_client.post(uri)
    assert rv.status_code == 200
    assert QueryCacheManager.has(
        rv.json["result"]["cache_key"], region=CacheRegion.DATA
    )
    assert len(rv.json["result"]["data"]) == 10
def processing_time_offsets(  # pylint: disable=too-many-locals
    self,
    df: pd.DataFrame,
    query_object: QueryObject,
) -> CachedTimeOffset:
    query_context = self._query_context
    # ensure query_object is immutable
    query_object_clone = copy.copy(query_object)
    queries: List[str] = []
    cache_keys: List[Optional[str]] = []
    rv_dfs: List[pd.DataFrame] = [df]

    time_offsets = query_object.time_offsets
    outer_from_dttm = query_object.from_dttm
    outer_to_dttm = query_object.to_dttm
    for offset in time_offsets:
        try:
            query_object_clone.from_dttm = get_past_or_future(
                offset,
                outer_from_dttm,
            )
            query_object_clone.to_dttm = get_past_or_future(offset, outer_to_dttm)
        except ValueError as ex:
            raise QueryObjectValidationError(str(ex)) from ex
        # make sure the subquery uses the main query's where clause
        query_object_clone.inner_from_dttm = outer_from_dttm
        query_object_clone.inner_to_dttm = outer_to_dttm
        query_object_clone.time_offsets = []
        query_object_clone.post_processing = []

        if not query_object.from_dttm or not query_object.to_dttm:
            raise QueryObjectValidationError(
                _(
                    "An enclosed time range (both start and end) must be specified "
                    "when using a Time Comparison."
                )
            )
        # `offset` is added to the hash function
        cache_key = self.query_cache_key(query_object_clone, time_offset=offset)
        cache = QueryCacheManager.get(cache_key, CacheRegion.DATA, query_context.force)
        # on a cache hit, reuse the cached offset slice
        if cache.is_loaded:
            rv_dfs.append(cache.df)
            queries.append(cache.query)
            cache_keys.append(cache_key)
            continue

        query_object_clone_dct = query_object_clone.to_dict()
        # rename metrics: SUM(value) => SUM(value) 1 year ago
        metrics_mapping = {
            metric: TIME_COMPARISON.join([metric, offset])
            for metric in get_metric_names(query_object_clone_dct.get("metrics", []))
        }
        join_keys = [col for col in df.columns if col not in metrics_mapping.keys()]

        result = self._qc_datasource.query(query_object_clone_dct)
        queries.append(result.query)
        cache_keys.append(None)

        offset_metrics_df = result.df
        if offset_metrics_df.empty:
            offset_metrics_df = pd.DataFrame(
                {
                    col: [np.NaN]
                    for col in join_keys + list(metrics_mapping.values())
                }
            )
        else:
            # 1. normalize df, set dttm column
            offset_metrics_df = self.normalize_df(offset_metrics_df, query_object_clone)

            # 2. rename extra query columns
            offset_metrics_df = offset_metrics_df.rename(columns=metrics_mapping)

            # 3. set time offset for index
            # TODO: add x-axis to QueryObject, potentially as an array for
            #  multi-dimensional charts
            granularity = query_object.granularity
            index = granularity if granularity in df.columns else DTTM_ALIAS
            offset_metrics_df[index] = offset_metrics_df[index] - DateOffset(
                **normalize_time_delta(offset)
            )

        # left join `offset_metrics_df` onto df
        offset_df = df_utils.left_join_df(
            left_df=df,
            right_df=offset_metrics_df,
            join_keys=join_keys,
        )
        offset_slice = offset_df[metrics_mapping.values()]

        # cache the offset slice and stack it
        value = {
            "df": offset_slice,
            "query": result.query,
        }
        cache.set(
            key=cache_key,
            value=value,
            timeout=self.get_cache_timeout(),
            datasource_uid=query_context.datasource.uid,
            region=CacheRegion.DATA,
        )
        rv_dfs.append(offset_slice)

    rv_df = pd.concat(rv_dfs, axis=1, copy=False) if time_offsets else df
    return CachedTimeOffset(df=rv_df, queries=queries, cache_keys=cache_keys)
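# The index shift above is plain pandas: the offset query's rows are stamped in
# the shifted time range, and subtracting a DateOffset moves them back onto the
# main query's timeline before the left join. A self-contained sketch, assuming
# normalize_time_delta("1 year ago") resolves to a negative one-year offset
# (the sign convention and column name here are illustrative assumptions):
import pandas as pd
from pandas import DateOffset

offset_metrics_df = pd.DataFrame(
    {
        "__timestamp": pd.to_datetime(["2002-01-01", "2002-02-01"]),
        "SUM(num) 1 year ago": [10, 20],
    }
)
# subtracting a negative one-year offset shifts the series forward by one year,
# aligning the 2002 rows with the main query's 2003 rows for the join
offset_metrics_df["__timestamp"] = offset_metrics_df["__timestamp"] - DateOffset(years=-1)
print(offset_metrics_df["__timestamp"].tolist())
# [Timestamp('2003-01-01 00:00:00'), Timestamp('2003-02-01 00:00:00')]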
def get_samples(  # pylint: disable=too-many-arguments,too-many-locals
    datasource_type: str,
    datasource_id: int,
    force: bool = False,
    page: int = 1,
    per_page: int = 1000,
    payload: Optional[SamplesPayloadSchema] = None,
) -> Dict[str, Any]:
    datasource = DatasourceDAO.get_datasource(
        session=db.session,
        datasource_type=datasource_type,
        datasource_id=datasource_id,
    )

    limit_clause = get_limit_clause(page, per_page)

    # todo(yongjie): construct count(*) and samples in the same query_context,
    #  then remove query_type == SAMPLES

    # constructing the samples query
    samples_instance = QueryContextFactory().create(
        datasource={
            "type": datasource.type,
            "id": datasource.id,
        },
        queries=[{**payload, **limit_clause} if payload else limit_clause],
        result_type=ChartDataResultType.SAMPLES,
        force=force,
    )

    # constructing the count(*) query
    count_star_metric = {
        "metrics": [
            {
                "expressionType": "SQL",
                "sqlExpression": "COUNT(*)",
                "label": "COUNT(*)",
            }
        ]
    }
    count_star_instance = QueryContextFactory().create(
        datasource={
            "type": datasource.type,
            "id": datasource.id,
        },
        queries=[{**payload, **count_star_metric} if payload else count_star_metric],
        result_type=ChartDataResultType.FULL,
        force=force,
    )
    samples_results = samples_instance.get_payload()
    count_star_results = count_star_instance.get_payload()

    try:
        sample_data = samples_results["queries"][0]
        count_star_data = count_star_results["queries"][0]
        failed_status = (
            sample_data.get("status") == QueryStatus.FAILED
            or count_star_data.get("status") == QueryStatus.FAILED
        )
        error_msg = sample_data.get("error") or count_star_data.get("error")
        if failed_status and error_msg:
            cache_key = sample_data.get("cache_key")
            QueryCacheManager.delete(cache_key, region=CacheRegion.DATA)
            raise DatasetSamplesFailedError(error_msg)

        sample_data["page"] = page
        sample_data["per_page"] = per_page
        sample_data["total_count"] = count_star_data["data"][0]["COUNT(*)"]
        return sample_data
    except (IndexError, KeyError) as exc:
        raise DatasetSamplesFailedError from exc
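# get_samples pages through results by turning page/per_page into a row
# offset/limit pair that is merged into the samples query. A hedged sketch of
# what get_limit_clause is assumed to produce (the real helper lives in
# superset; the exact keys here are an assumption):
def sketch_limit_clause(page: int, per_page: int) -> dict:
    # page is 1-based; page 2 with per_page=1000 skips the first 1000 rows
    return {"row_offset": (page - 1) * per_page, "row_limit": per_page}

assert sketch_limit_clause(page=2, per_page=1000) == {
    "row_offset": 1000,
    "row_limit": 1000,
}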
def get_df_payload(
    self, query_obj: QueryObject, force_cached: Optional[bool] = False
) -> Dict[str, Any]:
    """Handles caching around the df payload retrieval"""
    cache_key = self.query_cache_key(query_obj)
    cache = QueryCacheManager.get(
        cache_key,
        CacheRegion.DATA,
        self._query_context.force,
        force_cached,
    )
    if query_obj and cache_key and not cache.is_loaded:
        try:
            invalid_columns = [
                col
                for col in get_column_names_from_columns(query_obj.columns)
                + get_column_names_from_metrics(query_obj.metrics or [])
                if col not in self._qc_datasource.column_names and col != DTTM_ALIAS
            ]
            if invalid_columns:
                raise QueryObjectValidationError(
                    _(
                        "Columns missing in datasource: %(invalid_columns)s",
                        invalid_columns=invalid_columns,
                    )
                )
            query_result = self.get_query_result(query_obj)
            annotation_data = self.get_annotation_data(query_obj)
            cache.set_query_result(
                key=cache_key,
                query_result=query_result,
                annotation_data=annotation_data,
                force_query=self._query_context.force,
                timeout=self.get_cache_timeout(),
                datasource_uid=self._qc_datasource.uid,
                region=CacheRegion.DATA,
            )
        except QueryObjectValidationError as ex:
            cache.error_message = str(ex)
            cache.status = QueryStatus.FAILED

    # The N-dimensional DataFrame has been converted into a flat DataFrame by the
    # `flatten` operator; commas in column names were escaped by `escape_separator`,
    # so the resulting DataFrame columns should be unescaped.
    label_map = {
        unescape_separator(col): [
            unescape_separator(col) for col in re.split(r"(?<!\\),\s", col)
        ]
        for col in cache.df.columns.values
    }
    cache.df.columns = [unescape_separator(col) for col in cache.df.columns.values]

    return {
        "cache_key": cache_key,
        "cached_dttm": cache.cache_dttm,
        "cache_timeout": self.get_cache_timeout(),
        "df": cache.df,
        "applied_template_filters": cache.applied_template_filters,
        "annotation_data": cache.annotation_data,
        "error": cache.error_message,
        "is_cached": cache.is_cached,
        "query": cache.query,
        "status": cache.status,
        "stacktrace": cache.stacktrace,
        "rowcount": len(cache.df.index),
        "from_dttm": query_obj.from_dttm,
        "to_dttm": query_obj.to_dttm,
        "label_map": label_map,
    }
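# label_map maps each flattened column label back to its component labels.
# A minimal sketch of the split-then-unescape step above, with a stand-in for
# superset's unescape_separator (assumed here to turn an escaped "\," back into ","):
import re

def unescape_separator_sketch(value: str) -> str:
    return value.replace("\\,", ",")

flattened_column = "SUM(num\\, metric), 1 year ago"
# split on ", " only when the comma is not escaped with a backslash
parts = re.split(r"(?<!\\),\s", flattened_column)
print([unescape_separator_sketch(part) for part in parts])
# ['SUM(num, metric)', '1 year ago']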