def test_get_aggregations(self):
    ds = DruidDatasource(datasource_name="datasource")
    metrics_dict = {
        "sum1": DruidMetric(
            metric_name="sum1",
            metric_type="doubleSum",
            json=json.dumps({"type": "doubleSum", "name": "sum1"}),
        ),
        "sum2": DruidMetric(
            metric_name="sum2",
            metric_type="doubleSum",
            json=json.dumps({"type": "doubleSum", "name": "sum2"}),
        ),
        "div1": DruidMetric(
            metric_name="div1",
            metric_type="postagg",
            json=json.dumps({
                "fn": "/",
                "type": "arithmetic",
                "name": "div1",
                "fields": [
                    {"fieldName": "sum1", "type": "fieldAccess"},
                    {"fieldName": "sum2", "type": "fieldAccess"},
                ],
            }),
        ),
    }
    metric_names = ["sum1", "sum2"]
    aggs = ds.get_aggregations(metrics_dict, metric_names)
    expected_agg = {name: metrics_dict[name].json_obj for name in metric_names}
    self.assertEqual(expected_agg, aggs)

    # requesting an unknown metric name should raise
    metric_names = ["sum1", "col1"]
    self.assertRaises(
        SupersetException, ds.get_aggregations, metrics_dict, metric_names)

    # post-aggregators are not valid aggregations and should raise as well
    metric_names = ["sum1", "div1"]
    self.assertRaises(
        SupersetException, ds.get_aggregations, metrics_dict, metric_names)
def test_run_query_with_adhoc_metric(self):
    client = Mock()
    from_dttm = Mock()
    to_dttm = Mock()
    from_dttm.replace = Mock(return_value=from_dttm)
    to_dttm.replace = Mock(return_value=to_dttm)
    from_dttm.isoformat = Mock(return_value="from")
    to_dttm.isoformat = Mock(return_value="to")
    timezone = "timezone"
    from_dttm.tzname = Mock(return_value=timezone)
    ds = DruidDatasource(datasource_name="datasource")
    metric1 = DruidMetric(metric_name="metric1")
    metric2 = DruidMetric(metric_name="metric2")
    ds.metrics = [metric1, metric2]
    col1 = DruidColumn(column_name="col1")
    col2 = DruidColumn(column_name="col2")
    ds.columns = [col1, col2]
    all_metrics = []
    post_aggs = ["some_agg"]
    ds._metrics_and_post_aggs = Mock(return_value=(all_metrics, post_aggs))
    groupby = []
    metrics = [{
        "expressionType": "SIMPLE",
        "column": {"type": "DOUBLE", "column_name": "col1"},
        "aggregate": "SUM",
        "label": "My Adhoc Metric",
    }]
    ds.get_having_filters = Mock(return_value=[])
    client.query_builder = Mock()
    client.query_builder.last_query = Mock()
    client.query_builder.last_query.query_dict = {"mock": 0}

    # with no groupby, run_query should issue a single timeseries query
    ds.run_query(
        groupby, metrics, None, from_dttm, to_dttm,
        client=client, filter=[], row_limit=100,
    )
    self.assertEqual(0, len(client.topn.call_args_list))
    self.assertEqual(0, len(client.groupby.call_args_list))
    self.assertEqual(1, len(client.timeseries.call_args_list))

    # check that there is no dimensions entry
    called_args = client.timeseries.call_args_list[0][1]
    self.assertNotIn("dimensions", called_args)
    self.assertIn("post_aggregations", called_args)
def create_druid_datasource(self, name, id=0, cols_names=None, metric_names=None):
    # avoid mutable default arguments
    cols_names = cols_names or []
    metric_names = metric_names or []
    name = "{0}{1}".format(NAME_PREFIX, name)
    cluster_name = "druid_test"
    params = {DBREF: id, "database_name": cluster_name}
    dict_rep = {
        "cluster_name": cluster_name,
        "datasource_name": name,
        "id": id,
        "params": json.dumps(params),
        "columns": [{"column_name": c} for c in cols_names],
        "metrics": [{"metric_name": c, "json": "{}"} for c in metric_names],
    }
    datasource = DruidDatasource(
        id=id,
        datasource_name=name,
        cluster_name=cluster_name,
        params=json.dumps(params),
    )
    for col_name in cols_names:
        datasource.columns.append(DruidColumn(column_name=col_name))
    for metric_name in metric_names:
        datasource.metrics.append(DruidMetric(metric_name=metric_name))
    return datasource, dict_rep
def decode_dashboards(  # pylint: disable=too-many-return-statements
        o: Dict[str, Any]) -> Any:
    """
    Function to be passed into the json.loads object_hook parameter.

    Recreates the dashboard object from a json representation.
    """
    from superset.connectors.druid.models import (
        DruidCluster,
        DruidColumn,
        DruidDatasource,
        DruidMetric,
    )

    if "__Dashboard__" in o:
        return Dashboard(**o["__Dashboard__"])
    if "__Slice__" in o:
        return Slice(**o["__Slice__"])
    if "__TableColumn__" in o:
        return TableColumn(**o["__TableColumn__"])
    if "__SqlaTable__" in o:
        return SqlaTable(**o["__SqlaTable__"])
    if "__SqlMetric__" in o:
        return SqlMetric(**o["__SqlMetric__"])
    if "__DruidCluster__" in o:
        return DruidCluster(**o["__DruidCluster__"])
    if "__DruidColumn__" in o:
        return DruidColumn(**o["__DruidColumn__"])
    if "__DruidDatasource__" in o:
        return DruidDatasource(**o["__DruidDatasource__"])
    if "__DruidMetric__" in o:
        return DruidMetric(**o["__DruidMetric__"])
    if "__datetime__" in o:
        return datetime.strptime(o["__datetime__"], "%Y-%m-%dT%H:%M:%S")
    return o
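# A minimal usage sketch for decode_dashboards (the file name and payload are
# hypothetical). json.loads invokes the object_hook on every decoded JSON
# object, innermost first, so nested marked objects (columns, metrics) are
# rebuilt before the datasources and dashboards that contain them:
#
#     import json
#     with open("dashboard_export.json") as f:
#         objects = json.loads(f.read(), object_hook=decode_dashboards)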
def test_run_query_multiple_groupby(self):
    client = Mock()
    from_dttm = Mock()
    to_dttm = Mock()
    from_dttm.replace = Mock(return_value=from_dttm)
    to_dttm.replace = Mock(return_value=to_dttm)
    from_dttm.isoformat = Mock(return_value="from")
    to_dttm.isoformat = Mock(return_value="to")
    timezone = "timezone"
    from_dttm.tzname = Mock(return_value=timezone)
    ds = DruidDatasource(datasource_name="datasource")
    metric1 = DruidMetric(metric_name="metric1")
    metric2 = DruidMetric(metric_name="metric2")
    ds.metrics = [metric1, metric2]
    col1 = DruidColumn(column_name="col1")
    col2 = DruidColumn(column_name="col2")
    ds.columns = [col1, col2]
    aggs = []
    post_aggs = ["some_agg"]
    ds._metrics_and_post_aggs = Mock(return_value=(aggs, post_aggs))
    groupby = ["col1", "col2"]
    metrics = ["metric1"]
    ds.get_having_filters = Mock(return_value=[])
    client.query_builder = Mock()
    client.query_builder.last_query = Mock()
    client.query_builder.last_query.query_dict = {"mock": 0}

    # multiple groupby columns should route to the groupby endpoint
    ds.run_query(
        groupby, metrics, None, from_dttm, to_dttm,
        client=client, row_limit=100, filter=[],
    )
    self.assertEqual(0, len(client.topn.call_args_list))
    self.assertEqual(1, len(client.groupby.call_args_list))
    self.assertEqual(0, len(client.timeseries.call_args_list))

    # check that the dimensions entry lists both groupby columns
    called_args = client.groupby.call_args_list[0][1]
    self.assertIn("dimensions", called_args)
    self.assertEqual(["col1", "col2"], called_args["dimensions"])
def create_druid_datasource(self, name, id=0, cols_names=None, metric_names=None):
    # avoid mutable default arguments
    cols_names = cols_names or []
    metric_names = metric_names or []
    params = {"remote_id": id, "database_name": "druid_test"}
    datasource = DruidDatasource(
        id=id,
        datasource_name=name,
        cluster_name="druid_test",
        params=json.dumps(params),
    )
    for col_name in cols_names:
        datasource.columns.append(DruidColumn(column_name=col_name))
    for metric_name in metric_names:
        datasource.metrics.append(DruidMetric(metric_name=metric_name, json="{}"))
    return datasource
def create_druid_datasource(self, name, id=0, cols_names=None, metric_names=None):
    # avoid mutable default arguments
    cols_names = cols_names or []
    metric_names = metric_names or []
    cluster_name = "druid_test"
    cluster = self.get_or_create(
        DruidCluster, {"cluster_name": cluster_name}, db.session
    )
    params = {"remote_id": id, "database_name": cluster_name}
    datasource = DruidDatasource(
        id=id,
        datasource_name=name,
        cluster_id=cluster.id,
        params=json.dumps(params),
    )
    for col_name in cols_names:
        datasource.columns.append(DruidColumn(column_name=col_name))
    for metric_name in metric_names:
        datasource.metrics.append(DruidMetric(metric_name=metric_name, json="{}"))
    return datasource
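# A usage sketch for the helpers above (the datasource, column, and metric
# names are made up; note the first variant also returns a dict representation
# alongside the datasource):
#
#     datasource = self.create_druid_datasource(
#         "test_datasource", id=1,
#         cols_names=["col1", "col2"], metric_names=["count"],
#     )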
def test_run_query_order_by_metrics(self):
    client = Mock()
    client.query_builder.last_query.query_dict = {"mock": 0}
    from_dttm = Mock()
    to_dttm = Mock()
    ds = DruidDatasource(datasource_name="datasource")
    ds.get_having_filters = Mock(return_value=[])
    dim1 = DruidColumn(column_name="dim1")
    dim2 = DruidColumn(column_name="dim2")
    metrics_dict = {
        "count1": DruidMetric(
            metric_name="count1",
            metric_type="count",
            json=json.dumps({"type": "count", "name": "count1"}),
        ),
        "sum1": DruidMetric(
            metric_name="sum1",
            metric_type="doubleSum",
            json=json.dumps({"type": "doubleSum", "name": "sum1"}),
        ),
        "sum2": DruidMetric(
            metric_name="sum2",
            metric_type="doubleSum",
            json=json.dumps({"type": "doubleSum", "name": "sum2"}),
        ),
        "div1": DruidMetric(
            metric_name="div1",
            metric_type="postagg",
            json=json.dumps({
                "fn": "/",
                "type": "arithmetic",
                "name": "div1",
                "fields": [
                    {"fieldName": "sum1", "type": "fieldAccess"},
                    {"fieldName": "sum2", "type": "fieldAccess"},
                ],
            }),
        ),
    }
    ds.columns = [dim1, dim2]
    ds.metrics = list(metrics_dict.values())
    groupby = ["dim1"]
    metrics = ["count1"]
    granularity = "all"

    # get the counts of the top 5 'dim1's, order by 'sum1'
    ds.run_query(
        groupby, metrics, granularity, from_dttm, to_dttm,
        timeseries_limit=5, timeseries_limit_metric="sum1",
        client=client, order_desc=True, filter=[],
    )
    qry_obj = client.topn.call_args_list[0][1]
    self.assertEqual("dim1", qry_obj["dimension"])
    self.assertEqual("sum1", qry_obj["metric"])
    aggregations = qry_obj["aggregations"]
    post_aggregations = qry_obj["post_aggregations"]
    self.assertEqual({"count1", "sum1"}, set(aggregations.keys()))
    self.assertEqual(set(), set(post_aggregations.keys()))

    # get the counts of the top 5 'dim1's, order by 'div1'
    ds.run_query(
        groupby, metrics, granularity, from_dttm, to_dttm,
        timeseries_limit=5, timeseries_limit_metric="div1",
        client=client, order_desc=True, filter=[],
    )
    qry_obj = client.topn.call_args_list[1][1]
    self.assertEqual("dim1", qry_obj["dimension"])
    self.assertEqual("div1", qry_obj["metric"])
    aggregations = qry_obj["aggregations"]
    post_aggregations = qry_obj["post_aggregations"]
    self.assertEqual({"count1", "sum1", "sum2"}, set(aggregations.keys()))
    self.assertEqual({"div1"}, set(post_aggregations.keys()))

    groupby = ["dim1", "dim2"]

    # get the counts of the top 5 ['dim1', 'dim2']s, order by 'sum1'
    ds.run_query(
        groupby, metrics, granularity, from_dttm, to_dttm,
        timeseries_limit=5, timeseries_limit_metric="sum1",
        client=client, order_desc=True, filter=[],
    )
    qry_obj = client.groupby.call_args_list[0][1]
    self.assertEqual({"dim1", "dim2"}, set(qry_obj["dimensions"]))
    self.assertEqual("sum1", qry_obj["limit_spec"]["columns"][0]["dimension"])
    aggregations = qry_obj["aggregations"]
    post_aggregations = qry_obj["post_aggregations"]
    self.assertEqual({"count1", "sum1"}, set(aggregations.keys()))
    self.assertEqual(set(), set(post_aggregations.keys()))

    # get the counts of the top 5 ['dim1', 'dim2']s, order by 'div1'
    ds.run_query(
        groupby, metrics, granularity, from_dttm, to_dttm,
        timeseries_limit=5, timeseries_limit_metric="div1",
        client=client, order_desc=True, filter=[],
    )
    qry_obj = client.groupby.call_args_list[1][1]
    self.assertEqual({"dim1", "dim2"}, set(qry_obj["dimensions"]))
    self.assertEqual("div1", qry_obj["limit_spec"]["columns"][0]["dimension"])
    aggregations = qry_obj["aggregations"]
    post_aggregations = qry_obj["post_aggregations"]
    self.assertEqual({"count1", "sum1", "sum2"}, set(aggregations.keys()))
    self.assertEqual({"div1"}, set(post_aggregations.keys()))
def test_metrics_and_post_aggs(self):
    """
    Test generation of metrics and post-aggregations from an initial list
    of superset metrics (which may include the results of either). This
    primarily tests that specifying a post-aggregator metric will also
    require the raw aggregation of the associated druid metric column.
    """
    metrics_dict = {
        "unused_count": DruidMetric(
            metric_name="unused_count",
            verbose_name="COUNT(*)",
            metric_type="count",
            json=json.dumps({"type": "count", "name": "unused_count"}),
        ),
        "some_sum": DruidMetric(
            metric_name="some_sum",
            verbose_name="SUM(*)",
            metric_type="sum",
            json=json.dumps({"type": "sum", "name": "sum"}),
        ),
        "a_histogram": DruidMetric(
            metric_name="a_histogram",
            verbose_name="APPROXIMATE_HISTOGRAM(*)",
            metric_type="approxHistogramFold",
            json=json.dumps({"type": "approxHistogramFold", "name": "a_histogram"}),
        ),
        "aCustomMetric": DruidMetric(
            metric_name="aCustomMetric",
            verbose_name="MY_AWESOME_METRIC(*)",
            metric_type="aCustomType",
            json=json.dumps({"type": "customMetric", "name": "aCustomMetric"}),
        ),
        "quantile_p95": DruidMetric(
            metric_name="quantile_p95",
            verbose_name="P95(*)",
            metric_type="postagg",
            json=json.dumps({
                "type": "quantile",
                "probability": 0.95,
                "name": "p95",
                "fieldName": "a_histogram",
            }),
        ),
        "aCustomPostAgg": DruidMetric(
            metric_name="aCustomPostAgg",
            verbose_name="CUSTOM_POST_AGG(*)",
            metric_type="postagg",
            json=json.dumps({
                "type": "customPostAgg",
                "name": "aCustomPostAgg",
                "field": {"type": "fieldAccess", "fieldName": "aCustomMetric"},
            }),
        ),
    }

    adhoc_metric = {
        "expressionType": "SIMPLE",
        "column": {"type": "DOUBLE", "column_name": "value"},
        "aggregate": "SUM",
        "label": "My Adhoc Metric",
    }

    metrics = ["some_sum"]
    saved_metrics, post_aggs = DruidDatasource.metrics_and_post_aggs(
        metrics, metrics_dict)
    assert set(saved_metrics.keys()) == {"some_sum"}
    assert post_aggs == {}

    metrics = [adhoc_metric]
    saved_metrics, post_aggs = DruidDatasource.metrics_and_post_aggs(
        metrics, metrics_dict)
    assert set(saved_metrics.keys()) == {adhoc_metric["label"]}
    assert post_aggs == {}

    metrics = ["some_sum", adhoc_metric]
    saved_metrics, post_aggs = DruidDatasource.metrics_and_post_aggs(
        metrics, metrics_dict)
    assert set(saved_metrics.keys()) == {"some_sum", adhoc_metric["label"]}
    assert post_aggs == {}

    # a post-aggregator pulls in the raw aggregation it depends on
    metrics = ["quantile_p95"]
    saved_metrics, post_aggs = DruidDatasource.metrics_and_post_aggs(
        metrics, metrics_dict)
    assert set(saved_metrics.keys()) == {"a_histogram"}
    assert set(post_aggs.keys()) == {"quantile_p95"}

    metrics = ["aCustomPostAgg"]
    saved_metrics, post_aggs = DruidDatasource.metrics_and_post_aggs(
        metrics, metrics_dict)
    assert set(saved_metrics.keys()) == {"aCustomMetric"}
    assert set(post_aggs.keys()) == {"aCustomPostAgg"}
def test_run_query_single_groupby(self):
    client = Mock()
    from_dttm = Mock()
    to_dttm = Mock()
    from_dttm.replace = Mock(return_value=from_dttm)
    to_dttm.replace = Mock(return_value=to_dttm)
    from_dttm.isoformat = Mock(return_value="from")
    to_dttm.isoformat = Mock(return_value="to")
    timezone = "timezone"
    from_dttm.tzname = Mock(return_value=timezone)
    ds = DruidDatasource(datasource_name="datasource")
    metric1 = DruidMetric(metric_name="metric1")
    metric2 = DruidMetric(metric_name="metric2")
    ds.metrics = [metric1, metric2]
    col1 = DruidColumn(column_name="col1")
    col2 = DruidColumn(column_name="col2")
    ds.columns = [col1, col2]
    aggs = ["metric1"]
    post_aggs = ["some_agg"]
    ds._metrics_and_post_aggs = Mock(return_value=(aggs, post_aggs))
    groupby = ["col1"]
    metrics = ["metric1"]
    ds.get_having_filters = Mock(return_value=[])
    client.query_builder.last_query.query_dict = {"mock": 0}

    # client.topn is called twice: once for the pre-query and once for the
    # final query
    ds.run_query(
        groupby, metrics, None, from_dttm, to_dttm,
        timeseries_limit=100, client=client, order_desc=True, filter=[],
    )
    self.assertEqual(2, len(client.topn.call_args_list))
    self.assertEqual(0, len(client.groupby.call_args_list))
    self.assertEqual(0, len(client.timeseries.call_args_list))
    # check that there is no dimensions entry, only a single dimension
    called_args_pre = client.topn.call_args_list[0][1]
    self.assertNotIn("dimensions", called_args_pre)
    self.assertIn("dimension", called_args_pre)
    called_args = client.topn.call_args_list[1][1]
    self.assertIn("dimension", called_args)
    self.assertEqual("col1", called_args["dimension"])

    # not order_desc: falls back to a groupby query
    client = Mock()
    client.query_builder.last_query.query_dict = {"mock": 0}
    ds.run_query(
        groupby, metrics, None, from_dttm, to_dttm,
        client=client, order_desc=False, filter=[], row_limit=100,
    )
    self.assertEqual(0, len(client.topn.call_args_list))
    self.assertEqual(1, len(client.groupby.call_args_list))
    self.assertEqual(0, len(client.timeseries.call_args_list))
    self.assertIn("dimensions", client.groupby.call_args_list[0][1])
    self.assertEqual(["col1"], client.groupby.call_args_list[0][1]["dimensions"])

    # order_desc with a timeseries limit and a dimension spec:
    # calls topn with the single dimension from the spec
    spec = {"outputName": "hello", "dimension": "matcho"}
    spec_json = json.dumps(spec)
    col3 = DruidColumn(column_name="col3", dimension_spec_json=spec_json)
    ds.columns.append(col3)
    groupby = ["col3"]
    client = Mock()
    client.query_builder.last_query.query_dict = {"mock": 0}
    ds.run_query(
        groupby, metrics, None, from_dttm, to_dttm,
        client=client, order_desc=True, timeseries_limit=5,
        filter=[], row_limit=100,
    )
    self.assertEqual(2, len(client.topn.call_args_list))
    self.assertEqual(0, len(client.groupby.call_args_list))
    self.assertEqual(0, len(client.timeseries.call_args_list))
    self.assertIn("dimension", client.topn.call_args_list[0][1])
    self.assertIn("dimension", client.topn.call_args_list[1][1])
    # uses the plain dimension for the pre-query and the full spec for the
    # final query
    self.assertEqual("matcho", client.topn.call_args_list[0][1]["dimension"])
    self.assertEqual(spec, client.topn.call_args_list[1][1]["dimension"])