def __init__(self, name, count_filter=None):
    """Set up the outer-groupby aggregation for a unique count named `name`.

    When `count_filter` is absent/empty a plain druid `count` suffices;
    otherwise an inner aggregation plus a post aggregation build a 0/1
    constant column that the outer groupby sums.
    """
    super(_HelperCalculation, self).__init__()
    self.outer_aggregations = {}

    # Counting *all* unique values needs nothing more than a simple
    # "count" on the outer groupby.
    if not count_filter or isinstance(count_filter, EmptyFilter):
        self.outer_aggregations[name] = count('count')
        return

    # Filtered unique count: conceptually we want to store a 1 for each
    # row matching the filter and sum that column in the outer groupby.
    # Druid has no constant-valued aggregator, so synthesize the constant:
    # use a filtered count (guaranteed nonzero for matching rows) and
    # divide it by itself in a post aggregator so the inner groupby emits
    # 1 for matching rows and 0 otherwise.
    inner_agg_key = '%s%s_agg' % (name, self.SUFFIX)
    self.add_aggregation(
        inner_agg_key,
        filtered_aggregator(filter=count_filter, agg=count('count')))

    post_agg_key = '%s%s_post_agg' % (name, self.SUFFIX)
    self.add_post_aggregation_from_formula(
        post_agg_key, '%s / %s' % (inner_agg_key, inner_agg_key))

    # Summing the constant column in the outer groupby yields the exact
    # unique count for the filtered set.
    self.outer_aggregations[name] = longsum(post_agg_key)
def test_build_filtered_aggregator(self):
    """build_aggregators must wrap each aggregator in its filter and attach
    the dict key as the inner aggregator's name.

    Bug fix: the old code rebound `expected` to the sorted *built* output
    and then sorted that same list into `actual`, so the assert compared
    build_aggregators' result with itself and could never fail. The
    independently-constructed expectation is now compared to the output.
    """
    filter_ = filters.Filter(dimension="dim", value="val")
    agg_input = {
        "agg1": aggregators.filtered(filter_, aggregators.count("metric1")),
        "agg2": aggregators.filtered(filter_, aggregators.longsum("metric2")),
        "agg3": aggregators.filtered(filter_, aggregators.doublesum("metric3")),
        "agg4": aggregators.filtered(filter_, aggregators.min("metric4")),
        "agg5": aggregators.filtered(filter_, aggregators.max("metric5")),
        "agg6": aggregators.filtered(filter_, aggregators.hyperunique("metric6")),
        "agg7": aggregators.filtered(filter_, aggregators.cardinality("dim1")),
        "agg8": aggregators.filtered(filter_, aggregators.cardinality(["dim1", "dim2"], by_row=True)),
    }
    base = {"type": "filtered", "filter": {"type": "selector", "dimension": "dim", "value": "val"}}
    aggs = [
        {"name": "agg1", "type": "count", "fieldName": "metric1"},
        {"name": "agg2", "type": "longSum", "fieldName": "metric2"},
        {"name": "agg3", "type": "doubleSum", "fieldName": "metric3"},
        {"name": "agg4", "type": "min", "fieldName": "metric4"},
        {"name": "agg5", "type": "max", "fieldName": "metric5"},
        {"name": "agg6", "type": "hyperUnique", "fieldName": "metric6"},
        {"name": "agg7", "type": "cardinality", "fieldNames": ["dim1"], "byRow": False},
        {"name": "agg8", "type": "cardinality", "fieldNames": ["dim1", "dim2"], "byRow": True},
    ]
    expected = []
    for agg in aggs:
        exp = deepcopy(base)
        exp.update({"aggregator": agg})
        expected.append(exp)

    built_agg = aggregators.build_aggregators(agg_input)

    # Sort both lists by the inner aggregator name so ordering is stable.
    def sort_key(entry):
        return entry["aggregator"]["name"]

    assert sorted(expected, key=sort_key) == sorted(built_agg, key=sort_key)
def test_build_query_none_type(self):
    """build_query must accept None for dimension, having, and filter."""
    # given
    expected_query_dict = {
        "queryType": None,
        "dataSource": "things",
        "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}],
        "filter": {"dimension": "one", "type": "selector", "value": 1},
        "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
        "dimension": "dim1",
    }
    builder_dict = {
        "datasource": "things",
        "aggregations": {"count": aggregators.count("thing")},
        "filter": filters.Dimension("one") == 1,
        "having": having.Aggregation("sum") > 1,
        "dimension": "dim1",
    }
    builder = QueryBuilder()

    # when: every optional key populated
    query = builder.build_query(None, builder_dict)

    # then
    assert query.query_dict == expected_query_dict

    # dimension/having/filter may each be None and must pass through as-is
    for optional_key in ("dimension", "having", "filter"):
        builder_dict[optional_key] = None
        expected_query_dict[optional_key] = None
        query = builder.build_query(None, builder_dict)
        assert query.query_dict == expected_query_dict
def test_nested_filtered_aggregator(self):
    """Nesting filtered() twice nests the filter wrappers; the registered
    name lands on the innermost aggregation only."""
    filter1 = filters.Filter(dimension="dim1", value="val")
    filter2 = filters.Filter(dimension="dim2", value="val")
    agg = aggregators.filtered(
        filter1, aggregators.filtered(filter2, aggregators.count("metric1")))

    actual = aggregators.build_aggregators({"agg_name": agg})

    # the innermost aggregation must have 'agg_name'
    inner = {
        "type": "filtered",
        "aggregator": {
            "fieldName": "metric1",
            "type": "count",
            "name": "agg_name",
        },
        "filter": {"dimension": "dim2", "value": "val", "type": "selector"},
    }
    outer = {
        "type": "filtered",
        "aggregator": inner,
        "filter": {"dimension": "dim1", "value": "val", "type": "selector"},
    }
    assert actual == [outer]
def values_for_column(self, column_name, limit=10000):
    """Retrieve some values for the given column"""
    logging.info(
        'Getting values for columns [{}] limited to [{}]'
        .format(column_name, limit))
    # TODO: Use Lexicographic TopNMetricSpec once supported by PyDruid
    # Start of the queried interval: either the configured fetch origin or
    # the Unix epoch when none is set.
    if self.fetch_values_from:
        start = utils.parse_human_datetime(self.fetch_values_from)
    else:
        start = datetime(1970, 1, 1)

    topn_params = {
        'datasource': self.datasource_name,
        'granularity': 'all',
        'intervals': start.isoformat() + '/' + datetime.now().isoformat(),
        'aggregations': {'count': count('count')},
        'dimension': column_name,
        'metric': 'count',
        'threshold': limit,
    }
    pydruid_client = self.cluster.get_pydruid_client()
    pydruid_client.topn(**topn_params)
    df = pydruid_client.export_pandas()
    return [record[column_name] for record in df.to_records(index=False)]
def test_nested_filtered_aggregator(self):
    """Two levels of filtered() should produce nested filter wrappers."""
    filter1 = filters.Filter(dimension='dim1', value='val')
    filter2 = filters.Filter(dimension='dim2', value='val')
    nested = aggregators.filtered(filter2, aggregators.count('metric1'))
    agg = aggregators.filtered(filter1, nested)

    actual = aggregators.build_aggregators({'agg_name': agg})

    # the innermost aggregation must have 'agg_name'
    expected_inner = {
        'type': 'filtered',
        'aggregator': {
            'fieldName': 'metric1',
            'type': 'count',
            'name': 'agg_name'
        },
        'filter': {'dimension': 'dim2', 'value': 'val', 'type': 'selector'}
    }
    assert actual == [{
        'type': 'filtered',
        'aggregator': expected_inner,
        'filter': {'dimension': 'dim1', 'value': 'val', 'type': 'selector'}
    }]
def _parse_metric(self):
    """Translate self._metric into a druid aggregation spec keyed 'result'.

    Raises ParseArgException for an unrecognized metric.
    """
    # Dispatch table: metric keyword -> aggregator factory over self._field.
    metric_builders = {
        'uv': cardinality,
        'exact_uv': thetasketch,
        'pv': count,
        'longsum': longsum,
    }
    builder = metric_builders.get(self._metric)
    if builder is None:
        raise ParseArgException("Parse metric failed")
    return {
        "aggregations": {
            "result": builder(self._field)
        },
        "metric": "result"
    }
def test_build_aggregators(self):
    """Every supported aggregator type should be buildable with its dict
    key attached as the spec's 'name'."""
    agg_input = {
        'agg1': aggregators.count('metric1'),
        'agg2': aggregators.longsum('metric2'),
        'agg3': aggregators.doublesum('metric3'),
        'agg4': aggregators.doublemin('metric4'),
        'agg5': aggregators.doublemax('metric5'),
        'agg6': aggregators.hyperunique('metric6'),
        'agg7': aggregators.cardinality('dim1'),
        'agg8': aggregators.cardinality(['dim1', 'dim2'], by_row=True),
        'agg9': aggregators.thetasketch('dim1'),
        'agg10': aggregators.thetasketch('metric7'),
        'agg11': aggregators.thetasketch('metric8', isinputthetasketch=True, size=8192),
    }
    expected = [
        {'name': 'agg1', 'type': 'count', 'fieldName': 'metric1'},
        {'name': 'agg2', 'type': 'longSum', 'fieldName': 'metric2'},
        {'name': 'agg3', 'type': 'doubleSum', 'fieldName': 'metric3'},
        {'name': 'agg4', 'type': 'doubleMin', 'fieldName': 'metric4'},
        {'name': 'agg5', 'type': 'doubleMax', 'fieldName': 'metric5'},
        {'name': 'agg6', 'type': 'hyperUnique', 'fieldName': 'metric6'},
        {'name': 'agg7', 'type': 'cardinality', 'fieldNames': ['dim1'],
         'byRow': False},
        {'name': 'agg8', 'type': 'cardinality', 'fieldNames': ['dim1', 'dim2'],
         'byRow': True},
        {'name': 'agg9', 'type': 'thetaSketch', 'fieldName': 'dim1',
         'isInputThetaSketch': False, 'size': 16384},
        {'name': 'agg10', 'type': 'thetaSketch', 'fieldName': 'metric7',
         'isInputThetaSketch': False, 'size': 16384},
        {'name': 'agg11', 'type': 'thetaSketch', 'fieldName': 'metric8',
         'isInputThetaSketch': True, 'size': 8192},
    ]

    actual = aggregators.build_aggregators(agg_input)

    by_name = itemgetter('name')
    assert sorted(actual, key=by_name) == sorted(expected, key=by_name)
def test_filtered_aggregator(self):
    """filtered() should wrap any aggregator type in a selector filter."""
    filter_ = filters.Filter(dimension="dim", value="val")
    selector = {"type": "selector", "dimension": "dim", "value": "val"}
    aggs = [
        aggregators.count("metric1"),
        aggregators.longsum("metric2"),
        aggregators.doublesum("metric3"),
        aggregators.doublemin("metric4"),
        aggregators.doublemax("metric5"),
        aggregators.hyperunique("metric6"),
        aggregators.cardinality("dim1"),
        aggregators.cardinality(["dim1", "dim2"], by_row=True),
        aggregators.thetasketch("dim1"),
        aggregators.thetasketch("metric7"),
        aggregators.thetasketch("metric8", isinputthetasketch=True, size=8192),
    ]
    for agg in aggs:
        actual = aggregators.filtered(filter_, agg)
        assert actual == {
            "type": "filtered",
            "filter": selector,
            "aggregator": agg,
        }
def test_filtered_aggregator(self):
    """Wrapping each aggregator kind with filtered() yields the same
    selector-filter envelope around the original aggregator."""
    filter_ = filters.Filter(dimension='dim', value='val')
    aggs = [
        aggregators.count('metric1'),
        aggregators.longsum('metric2'),
        aggregators.doublesum('metric3'),
        aggregators.doublemin('metric4'),
        aggregators.doublemax('metric5'),
        aggregators.hyperunique('metric6'),
        aggregators.cardinality('dim1'),
        aggregators.cardinality(['dim1', 'dim2'], by_row=True),
        aggregators.thetasketch('dim1'),
        aggregators.thetasketch('metric7'),
        aggregators.thetasketch('metric8', isinputthetasketch=True, size=8192),
    ]
    expected_filter = {
        'type': 'selector',
        'dimension': 'dim',
        'value': 'val'
    }
    for agg in aggs:
        result = aggregators.filtered(filter_, agg)
        assert result == {
            'type': 'filtered',
            'filter': expected_filter,
            'aggregator': agg
        }
def test_build_aggregators(self):
    """build_aggregators should emit one named spec per input entry,
    supporting min/max and cardinality variants."""
    agg_input = {
        'agg1': aggregators.count('metric1'),
        'agg2': aggregators.longsum('metric2'),
        'agg3': aggregators.doublesum('metric3'),
        'agg4': aggregators.min('metric4'),
        'agg5': aggregators.max('metric5'),
        'agg6': aggregators.hyperunique('metric6'),
        'agg7': aggregators.cardinality('dim1'),
        'agg8': aggregators.cardinality(['dim1', 'dim2'], by_row=True)
    }
    expected = [
        {'name': 'agg1', 'type': 'count', 'fieldName': 'metric1'},
        {'name': 'agg2', 'type': 'longSum', 'fieldName': 'metric2'},
        {'name': 'agg3', 'type': 'doubleSum', 'fieldName': 'metric3'},
        {'name': 'agg4', 'type': 'min', 'fieldName': 'metric4'},
        {'name': 'agg5', 'type': 'max', 'fieldName': 'metric5'},
        {'name': 'agg6', 'type': 'hyperUnique', 'fieldName': 'metric6'},
        {'name': 'agg7', 'type': 'cardinality', 'fieldNames': ['dim1'],
         'byRow': False},
        {'name': 'agg8', 'type': 'cardinality', 'fieldNames': ['dim1', 'dim2'],
         'byRow': True},
    ]

    actual = aggregators.build_aggregators(agg_input)

    name_key = itemgetter('name')
    assert sorted(actual, key=name_key) == sorted(expected, key=name_key)
def __init__(self, query_client=None):
    """Query pydruid and return the data as a pandas dataframe.

    Defaults cover the configured datasource at monthly granularity from
    START_DATE_STR through today, aggregating a double sum plus a count.
    """
    self.datasource = DATASOURCE.name
    self.granularity = 'month'
    self.intervals = '%s/%s' % (START_DATE_STR, TODAY_DATE_STR)
    self.dimensions = []
    self.field_dimension = DEFAULT_FIELD
    self.filter = DEFAULT_FILTER
    # Alias under which the primary sum aggregation is reported.
    self.agg_alias = 'sum'
    self.aggregations = {
        self.agg_alias: doublesum('sum'),
        'count': count('count'),
    }
    # Fall back to the shared client when the caller supplies none.
    self.query_client = query_client or DruidQueryClient
def test_build_filtered_aggregator(self):
    """build_aggregators must wrap each aggregator in its filter and attach
    the dict key as the inner aggregator's name.

    Bug fix: the old code rebound `expected` to the sorted *built* output
    and then sorted that same list into `actual`, so the assert compared
    build_aggregators' result with itself and could never fail. The
    independently-constructed expectation is now compared to the output.
    """
    filter_ = filters.Filter(dimension='dim', value='val')
    agg_input = {
        'agg1': aggregators.filtered(filter_, aggregators.count('metric1')),
        'agg2': aggregators.filtered(filter_, aggregators.longsum('metric2')),
        'agg3': aggregators.filtered(filter_, aggregators.doublesum('metric3')),
        'agg4': aggregators.filtered(filter_, aggregators.min('metric4')),
        'agg5': aggregators.filtered(filter_, aggregators.max('metric5')),
        'agg6': aggregators.filtered(filter_, aggregators.hyperunique('metric6')),
        'agg7': aggregators.filtered(filter_, aggregators.cardinality('dim1')),
        'agg8': aggregators.filtered(filter_,
                                     aggregators.cardinality(['dim1', 'dim2'], by_row=True)),
    }
    base = {
        'type': 'filtered',
        'filter': {
            'type': 'selector',
            'dimension': 'dim',
            'value': 'val'
        }
    }
    aggs = [
        {'name': 'agg1', 'type': 'count', 'fieldName': 'metric1'},
        {'name': 'agg2', 'type': 'longSum', 'fieldName': 'metric2'},
        {'name': 'agg3', 'type': 'doubleSum', 'fieldName': 'metric3'},
        {'name': 'agg4', 'type': 'min', 'fieldName': 'metric4'},
        {'name': 'agg5', 'type': 'max', 'fieldName': 'metric5'},
        {'name': 'agg6', 'type': 'hyperUnique', 'fieldName': 'metric6'},
        {'name': 'agg7', 'type': 'cardinality', 'fieldNames': ['dim1'], 'byRow': False},
        {'name': 'agg8', 'type': 'cardinality', 'fieldNames': ['dim1', 'dim2'], 'byRow': True},
    ]
    expected = []
    for agg in aggs:
        exp = deepcopy(base)
        exp.update({'aggregator': agg})
        expected.append(exp)

    built_agg = aggregators.build_aggregators(agg_input)

    # Sort both lists by inner aggregator name to make ordering stable.
    def sort_key(entry):
        return entry['aggregator']['name']

    assert sorted(expected, key=sort_key) == sorted(built_agg, key=sort_key)
def test_build_query_none_type(self):
    """None must be accepted for the dimension/having/filter keys."""
    # given
    expected_query_dict = {
        'queryType': None,
        'dataSource': 'things',
        'aggregations': [{
            'fieldName': 'thing',
            'name': 'count',
            'type': 'count'
        }],
        'filter': {
            'dimension': 'one',
            'type': 'selector',
            'value': 1
        },
        'having': {
            'aggregation': 'sum',
            'type': 'greaterThan',
            'value': 1
        },
        'dimension': 'dim1',
    }
    builder_dict = {
        'datasource': 'things',
        'aggregations': {
            'count': aggregators.count('thing'),
        },
        'filter': filters.Dimension('one') == 1,
        'having': having.Aggregation('sum') > 1,
        'dimension': 'dim1',
    }
    builder = QueryBuilder()

    # when / then: fully populated builder dict round-trips
    assert builder.build_query(None, builder_dict).query_dict == expected_query_dict

    # each optional key should also accept None
    for key in ('dimension', 'having', 'filter'):
        builder_dict[key] = None
        expected_query_dict[key] = None
        assert builder.build_query(None, builder_dict).query_dict == expected_query_dict
def test_build_query(self):
    """build_query should translate aggregations, post-aggregations,
    paging spec, filter, having, unknown keys, and virtual columns."""
    # given
    expected_query_dict = {
        'queryType': None,
        'dataSource': 'things',
        'aggregations': [{'fieldName': 'thing', 'name': 'count', 'type': 'count'}],
        'postAggregations': [{
            'fields': [
                {'fieldName': 'sum', 'type': 'fieldAccess'},
                {'fieldName': 'count', 'type': 'fieldAccess'},
            ],
            'fn': '/',
            'name': 'avg',
            'type': 'arithmetic',
        }],
        'pagingSpec': {'pagingIdentifies': {}, 'threshold': 1},
        'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
        'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
        'new_key': 'value',
        'virtualColumns': [{
            'type': 'expression',
            'name': 'foo',
            'expression': "concat('foo' + page)",
            'outputType': 'STRING'
        }],
    }
    builder_args = {
        'datasource': 'things',
        'aggregations': {'count': aggregators.count('thing')},
        'post_aggregations': {
            'avg': postaggregator.Field('sum') / postaggregator.Field('count'),
        },
        'paging_spec': {'pagingIdentifies': {}, 'threshold': 1},
        'filter': filters.Dimension('one') == 1,
        'having': having.Aggregation('sum') > 1,
        'new_key': 'value',
        'virtualColumns': [VirtualColumn(
            type='expression',
            name='foo',
            expression="concat('foo' + page)",
            outputType='STRING',
        )],
    }

    # when
    query = QueryBuilder().build_query(None, builder_args)

    # then
    assert query.query_dict == expected_query_dict
def _parse_metric(self):
    """Map self._metric onto its druid aggregator over self._field.

    Raises ParseArgException when the metric keyword is unknown.
    """
    # Keyword -> aggregator factory; replaces the former if/elif ladder.
    builders = {
        'uv': cardinality,
        'pv': count,
        'longsum': longsum,
        'doublesum': doublesum,
    }
    if self._metric not in builders:
        raise ParseArgException("Parse metric failed")
    return {"aggregations": {"result": builders[self._metric](self._field)}}
def test_nested_filtered_aggregator(self):
    """Nested filtered() wrappers keep their own filters while the name is
    applied only to the innermost aggregation."""
    outer_filter = filters.Filter(dimension='dim1', value='val')
    inner_filter = filters.Filter(dimension='dim2', value='val')
    agg = aggregators.filtered(
        outer_filter,
        aggregators.filtered(inner_filter, aggregators.count('metric1')))

    actual = aggregators.build_aggregators({'agg_name': agg})

    # the innermost aggregation must have 'agg_name'
    expected = [{
        'type': 'filtered',
        'aggregator': {
            'type': 'filtered',
            'aggregator': {
                'fieldName': 'metric1',
                'type': 'count',
                'name': 'agg_name',
            },
            'filter': {
                'dimension': 'dim2',
                'value': 'val',
                'type': 'selector',
            },
        },
        'filter': {
            'dimension': 'dim1',
            'value': 'val',
            'type': 'selector',
        },
    }]
    assert expected == actual
def test_build_query(self):
    """build_query should translate aggregations, post-aggregations,
    paging spec, filter, having, and pass unknown keys through."""
    # given
    expected_query_dict = {
        'queryType': None,
        'dataSource': 'things',
        'aggregations': [{'fieldName': 'thing', 'name': 'count', 'type': 'count'}],
        'postAggregations': [{
            'fields': [
                {'fieldName': 'sum', 'type': 'fieldAccess'},
                {'fieldName': 'count', 'type': 'fieldAccess'},
            ],
            'fn': '/',
            'name': 'avg',
            'type': 'arithmetic',
        }],
        'pagingSpec': {'pagingIdentifies': {}, 'threshold': 1},
        'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
        'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
        'new_key': 'value',
    }
    query_args = {
        'datasource': 'things',
        'aggregations': {'count': aggregators.count('thing')},
        'post_aggregations': {
            'avg': postaggregator.Field('sum') / postaggregator.Field('count'),
        },
        'paging_spec': {'pagingIdentifies': {}, 'threshold': 1},
        'filter': filters.Dimension('one') == 1,
        'having': having.Aggregation('sum') > 1,
        'new_key': 'value',
    }

    # when
    query = QueryBuilder().build_query(None, query_args)

    # then
    assert query.query_dict == expected_query_dict
def test_nested_filtered_aggregator(self):
    """Doubly-filtered aggregations nest; the dict key becomes the name of
    the innermost aggregator only."""
    filter1 = filters.Filter(dimension="dim1", value="val")
    filter2 = filters.Filter(dimension="dim2", value="val")
    inner = aggregators.filtered(filter2, aggregators.count("metric1"))
    agg = aggregators.filtered(filter1, inner)

    actual = aggregators.build_aggregators({"agg_name": agg})

    # the innermost aggregation must have 'agg_name'
    expected_inner = {
        "type": "filtered",
        "aggregator": {"fieldName": "metric1", "type": "count", "name": "agg_name"},
        "filter": {"dimension": "dim2", "value": "val", "type": "selector"},
    }
    expected = [{
        "type": "filtered",
        "aggregator": expected_inner,
        "filter": {"dimension": "dim1", "value": "val", "type": "selector"},
    }]
    assert expected == actual
def test_build_subquery(self):
    """subquery() should wrap a built groupBy query in a datasource of
    type 'query'."""
    # given
    expected_query_dict = {
        "query": {
            "queryType": "groupBy",
            "dataSource": "things",
            "aggregations": [
                {"fieldName": "thing", "name": "count", "type": "count"}
            ],
            "postAggregations": [
                {
                    "fields": [
                        {"fieldName": "sum", "type": "fieldAccess"},
                        {"fieldName": "count", "type": "fieldAccess"},
                    ],
                    "fn": "/",
                    "name": "avg",
                    "type": "arithmetic",
                }
            ],
            "filter": {"dimension": "one", "type": "selector", "value": 1},
            "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
        },
        "type": "query",
    }
    subquery_args = {
        "datasource": "things",
        "aggregations": {"count": aggregators.count("thing")},
        "post_aggregations": {
            "avg": (postaggregator.Field("sum") / postaggregator.Field("count"))
        },
        "filter": filters.Dimension("one") == 1,
        "having": having.Aggregation("sum") > 1,
    }

    # when
    subquery_dict = QueryBuilder().subquery(subquery_args)

    # then
    assert subquery_dict == expected_query_dict
def test_build_query(self):
    """All builder-dict entries — including unknown keys — should land in
    the generated query dict."""
    # given
    expected_query_dict = {
        "queryType": None,
        "dataSource": "things",
        "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}],
        "postAggregations": [
            {
                "fields": [
                    {"fieldName": "sum", "type": "fieldAccess"},
                    {"fieldName": "count", "type": "fieldAccess"},
                ],
                "fn": "/",
                "name": "avg",
                "type": "arithmetic",
            }
        ],
        "pagingSpec": {"pagingIdentifies": {}, "threshold": 1},
        "filter": {"dimension": "one", "type": "selector", "value": 1},
        "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
        "new_key": "value",
    }
    builder_args = {
        "datasource": "things",
        "aggregations": {"count": aggregators.count("thing")},
        "post_aggregations": {
            "avg": (postaggregator.Field("sum") / postaggregator.Field("count"))
        },
        "paging_spec": {"pagingIdentifies": {}, "threshold": 1},
        "filter": filters.Dimension("one") == 1,
        "having": having.Aggregation("sum") > 1,
        "new_key": "value",
    }

    # when
    query = QueryBuilder().build_query(None, builder_args)

    # then
    assert query.query_dict == expected_query_dict
def test_filtered_aggregator(self):
    """filtered() should wrap each basic aggregator in a selector filter."""
    filter_ = filters.Filter(dimension='dim', value='val')
    selector = {
        'type': 'selector',
        'dimension': 'dim',
        'value': 'val'
    }
    aggs = [
        aggregators.count('metric1'),
        aggregators.longsum('metric2'),
        aggregators.doublesum('metric3'),
        aggregators.min('metric4'),
        aggregators.max('metric5'),
        aggregators.hyperunique('metric6'),
    ]
    for agg in aggs:
        result = aggregators.filtered(filter_, agg)
        assert result == {
            'type': 'filtered',
            'filter': selector,
            'aggregator': agg
        }
def test_build_aggregators(self):
    """build_aggregators should attach each dict key as the 'name' of the
    corresponding aggregator spec."""
    agg_input = {
        'agg1': aggregators.count('metric1'),
        'agg2': aggregators.longsum('metric2'),
        'agg3': aggregators.doublesum('metric3'),
        'agg4': aggregators.min('metric4'),
        'agg5': aggregators.max('metric5'),
        'agg6': aggregators.hyperunique('metric6'),
    }
    expected = [
        {'name': 'agg1', 'type': 'count', 'fieldName': 'metric1'},
        {'name': 'agg2', 'type': 'longSum', 'fieldName': 'metric2'},
        {'name': 'agg3', 'type': 'doubleSum', 'fieldName': 'metric3'},
        {'name': 'agg4', 'type': 'min', 'fieldName': 'metric4'},
        {'name': 'agg5', 'type': 'max', 'fieldName': 'metric5'},
        {'name': 'agg6', 'type': 'hyperUnique', 'fieldName': 'metric6'},
    ]

    actual = aggregators.build_aggregators(agg_input)

    by_name = itemgetter('name')
    assert sorted(actual, key=by_name) == sorted(expected, key=by_name)
def test_filtered_aggregator(self):
    """Every supported aggregator kind should survive filtered() wrapping
    unchanged inside the 'aggregator' slot."""
    filter_ = filters.Filter(dimension="dim", value="val")
    selector = {"type": "selector", "dimension": "dim", "value": "val"}
    aggs = [
        aggregators.count("metric1"),
        aggregators.longsum("metric2"),
        aggregators.doublesum("metric3"),
        aggregators.min("metric4"),
        aggregators.max("metric5"),
        aggregators.hyperunique("metric6"),
        aggregators.cardinality("dim1"),
        aggregators.cardinality(["dim1", "dim2"], by_row=True),
    ]
    for agg in aggs:
        result = aggregators.filtered(filter_, agg)
        assert result == {
            "type": "filtered",
            "filter": selector,
            "aggregator": agg,
        }
def test_build_subquery(self):
    """subquery() must emit a groupBy query wrapped as a 'query' datasource."""
    # given
    expected_query_dict = {
        'query': {
            'queryType': 'groupBy',
            'dataSource': 'things',
            'aggregations': [{'fieldName': 'thing', 'name': 'count', 'type': 'count'}],
            'postAggregations': [{
                'fields': [
                    {'fieldName': 'sum', 'type': 'fieldAccess'},
                    {'fieldName': 'count', 'type': 'fieldAccess'},
                ],
                'fn': '/',
                'name': 'avg',
                'type': 'arithmetic',
            }],
            'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
            'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
        },
        'type': 'query'
    }
    subquery_args = {
        'datasource': 'things',
        'aggregations': {'count': aggregators.count('thing')},
        'post_aggregations': {
            'avg': postaggregator.Field('sum') / postaggregator.Field('count'),
        },
        'filter': filters.Dimension('one') == 1,
        'having': having.Aggregation('sum') > 1,
    }

    # when
    subquery_dict = QueryBuilder().subquery(subquery_args)

    # then
    assert subquery_dict == expected_query_dict
def test_build_aggregators(self):
    """Each entry in the input dict should build into a spec named after
    its key, including cardinality variants."""
    agg_input = {
        "agg1": aggregators.count("metric1"),
        "agg2": aggregators.longsum("metric2"),
        "agg3": aggregators.doublesum("metric3"),
        "agg4": aggregators.min("metric4"),
        "agg5": aggregators.max("metric5"),
        "agg6": aggregators.hyperunique("metric6"),
        "agg7": aggregators.cardinality("dim1"),
        "agg8": aggregators.cardinality(["dim1", "dim2"], by_row=True),
    }
    expected = [
        {"name": "agg1", "type": "count", "fieldName": "metric1"},
        {"name": "agg2", "type": "longSum", "fieldName": "metric2"},
        {"name": "agg3", "type": "doubleSum", "fieldName": "metric3"},
        {"name": "agg4", "type": "min", "fieldName": "metric4"},
        {"name": "agg5", "type": "max", "fieldName": "metric5"},
        {"name": "agg6", "type": "hyperUnique", "fieldName": "metric6"},
        {"name": "agg7", "type": "cardinality", "fieldNames": ["dim1"], "byRow": False},
        {"name": "agg8", "type": "cardinality", "fieldNames": ["dim1", "dim2"], "byRow": True},
    ]

    actual = aggregators.build_aggregators(agg_input)

    by_name = itemgetter("name")
    assert sorted(actual, key=by_name) == sorted(expected, key=by_name)
def values_for_column(self, column_name, from_dttm, to_dttm, limit=500):
    """Retrieve some values for the given column"""
    # TODO: Use Lexicographic TopNMetricSpec once supported by PyDruid
    # Pin both interval endpoints to the druid cluster's timezone.
    from_dttm = from_dttm.replace(tzinfo=DRUID_TZ)
    to_dttm = to_dttm.replace(tzinfo=DRUID_TZ)

    topn_spec = {
        "datasource": self.datasource_name,
        "granularity": "all",
        "intervals": from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        "aggregations": {"count": count("count")},
        "dimension": column_name,
        "metric": "count",
        "threshold": limit,
    }
    pydruid_client = self.cluster.get_pydruid_client()
    pydruid_client.topn(**topn_spec)

    df = pydruid_client.export_pandas()
    if df is None or df.size == 0:
        raise Exception(_("No data was returned."))
    return df
def test_build_aggregators(self):
    """build_aggregators should name each spec after its dict key and
    expand cardinality options correctly."""
    agg_input = {
        'agg1': aggregators.count('metric1'),
        'agg2': aggregators.longsum('metric2'),
        'agg3': aggregators.doublesum('metric3'),
        'agg4': aggregators.min('metric4'),
        'agg5': aggregators.max('metric5'),
        'agg6': aggregators.hyperunique('metric6'),
        'agg7': aggregators.cardinality('dim1'),
        'agg8': aggregators.cardinality(['dim1', 'dim2'], by_row=True)
    }
    expected = [
        {'name': 'agg1', 'type': 'count', 'fieldName': 'metric1'},
        {'name': 'agg2', 'type': 'longSum', 'fieldName': 'metric2'},
        {'name': 'agg3', 'type': 'doubleSum', 'fieldName': 'metric3'},
        {'name': 'agg4', 'type': 'min', 'fieldName': 'metric4'},
        {'name': 'agg5', 'type': 'max', 'fieldName': 'metric5'},
        {'name': 'agg6', 'type': 'hyperUnique', 'fieldName': 'metric6'},
        {'name': 'agg7', 'type': 'cardinality', 'fieldNames': ['dim1'],
         'byRow': False},
        {'name': 'agg8', 'type': 'cardinality', 'fieldNames': ['dim1', 'dim2'],
         'byRow': True},
    ]

    actual = aggregators.build_aggregators(agg_input)

    assert (sorted(actual, key=lambda spec: spec['name'])
            == sorted(expected, key=lambda spec: spec['name']))
def test_build_query_none_type(self):
    """build_query should tolerate None for dimension, having, and filter."""
    # given
    expected_query_dict = {
        'queryType': None,
        'dataSource': 'things',
        'aggregations': [{'fieldName': 'thing', 'name': 'count', 'type': 'count'}],
        'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
        'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
        'dimension': 'dim1',
    }
    builder_dict = {
        'datasource': 'things',
        'aggregations': {'count': aggregators.count('thing')},
        'filter': filters.Dimension('one') == 1,
        'having': having.Aggregation('sum') > 1,
        'dimension': 'dim1',
    }
    builder = QueryBuilder()

    # when
    query = builder.build_query(None, builder_dict)

    # then
    assert query.query_dict == expected_query_dict

    # you should be able to pass `None` to dimension/having/filter
    for optional in ('dimension', 'having', 'filter'):
        builder_dict[optional] = None
        expected_query_dict[optional] = None
        query = builder.build_query(None, builder_dict)
        assert query.query_dict == expected_query_dict
from pydruid.utils.filters import Dimension from db.druid.calculations.base_calculation import BaseCalculation from db.druid.query_builder import GroupByQueryBuilder from log import LOG from web.server.data.status import INTERVAL from web.server.security.permissions import SuperUserPermission from web.server.routes.views.query_policy import enumerate_query_needs # Key for the display name. Prefixed with __ to distinguish between actual # dimensions. DISPLAY_FIELD = '_display' COUNT_AGGREGATION_NAME = 'count' COUNT_CALCULATION = BaseCalculation( aggregations={COUNT_AGGREGATION_NAME: count('count')}) class DimensionValuesLookup(object): def __init__( self, query_client, datasource, filter_dimensions, dimension_slices, authorizable_dimensions, geo_field_ordering, ): self.query_client = query_client # Map from dimension to a list of dimension values. self.dimension_map = defaultdict(list)
def test_build_filtered_aggregator(self):
    """build_aggregators must wrap every aggregator kind — including the
    thetaSketch variants — in its filter, naming the inner aggregator
    after the dict key.

    Bug fix: the old code rebound `expected` to the sorted *built* output
    and then sorted that same list into `actual`, so the assert compared
    build_aggregators' result with itself and could never fail. The
    independently-constructed expectation is now compared to the output.
    """
    filter_ = filters.Filter(dimension='dim', value='val')
    agg_input = {
        'agg1': aggregators.filtered(filter_, aggregators.count('metric1')),
        'agg2': aggregators.filtered(filter_, aggregators.longsum('metric2')),
        'agg3': aggregators.filtered(filter_, aggregators.doublesum('metric3')),
        'agg4': aggregators.filtered(filter_, aggregators.doublemin('metric4')),
        'agg5': aggregators.filtered(filter_, aggregators.doublemax('metric5')),
        'agg6': aggregators.filtered(filter_, aggregators.hyperunique('metric6')),
        'agg7': aggregators.filtered(filter_, aggregators.cardinality('dim1')),
        'agg8': aggregators.filtered(filter_,
                                     aggregators.cardinality(['dim1', 'dim2'], by_row=True)),
        'agg9': aggregators.filtered(filter_, aggregators.thetasketch('dim1')),
        'agg10': aggregators.filtered(filter_, aggregators.thetasketch('metric7')),
        'agg11': aggregators.filtered(filter_,
                                      aggregators.thetasketch('metric8',
                                                              isinputthetasketch=True,
                                                              size=8192)),
    }
    base = {
        'type': 'filtered',
        'filter': {
            'type': 'selector',
            'dimension': 'dim',
            'value': 'val'
        }
    }
    aggs = [
        {'name': 'agg1', 'type': 'count', 'fieldName': 'metric1'},
        {'name': 'agg2', 'type': 'longSum', 'fieldName': 'metric2'},
        {'name': 'agg3', 'type': 'doubleSum', 'fieldName': 'metric3'},
        {'name': 'agg4', 'type': 'doubleMin', 'fieldName': 'metric4'},
        {'name': 'agg5', 'type': 'doubleMax', 'fieldName': 'metric5'},
        {'name': 'agg6', 'type': 'hyperUnique', 'fieldName': 'metric6'},
        {'name': 'agg7', 'type': 'cardinality', 'fieldNames': ['dim1'], 'byRow': False},
        {'name': 'agg8', 'type': 'cardinality', 'fieldNames': ['dim1', 'dim2'], 'byRow': True},
        {'name': 'agg9', 'type': 'thetaSketch', 'fieldName': 'dim1',
         'isInputThetaSketch': False, 'size': 16384},
        {'name': 'agg10', 'type': 'thetaSketch', 'fieldName': 'metric7',
         'isInputThetaSketch': False, 'size': 16384},
        {'name': 'agg11', 'type': 'thetaSketch', 'fieldName': 'metric8',
         'isInputThetaSketch': True, 'size': 8192},
    ]
    expected = []
    for agg in aggs:
        exp = deepcopy(base)
        exp.update({'aggregator': agg})
        expected.append(exp)

    built_agg = aggregators.build_aggregators(agg_input)

    # Sort both lists by inner aggregator name to make ordering stable.
    def sort_key(entry):
        return entry['aggregator']['name']

    assert sorted(expected, key=sort_key) == sorted(built_agg, key=sort_key)
def test_build_filtered_aggregator(self):
    """build_aggregators must wrap every aggregator kind — including the
    thetaSketch variants — in its filter, naming the inner aggregator
    after the dict key.

    Bug fix: the old code rebound `expected` to the sorted *built* output
    and then sorted that same list into `actual`, so the assert compared
    build_aggregators' result with itself and could never fail. The
    independently-constructed expectation is now compared to the output.
    """
    filter_ = filters.Filter(dimension="dim", value="val")
    agg_input = {
        "agg1": aggregators.filtered(filter_, aggregators.count("metric1")),
        "agg2": aggregators.filtered(filter_, aggregators.longsum("metric2")),
        "agg3": aggregators.filtered(filter_, aggregators.doublesum("metric3")),
        "agg4": aggregators.filtered(filter_, aggregators.doublemin("metric4")),
        "agg5": aggregators.filtered(filter_, aggregators.doublemax("metric5")),
        "agg6": aggregators.filtered(filter_, aggregators.hyperunique("metric6")),
        "agg7": aggregators.filtered(filter_, aggregators.cardinality("dim1")),
        "agg8": aggregators.filtered(
            filter_, aggregators.cardinality(["dim1", "dim2"], by_row=True)),
        "agg9": aggregators.filtered(filter_, aggregators.thetasketch("dim1")),
        "agg10": aggregators.filtered(filter_, aggregators.thetasketch("metric7")),
        "agg11": aggregators.filtered(
            filter_,
            aggregators.thetasketch("metric8", isinputthetasketch=True, size=8192),
        ),
    }
    base = {
        "type": "filtered",
        "filter": {"type": "selector", "dimension": "dim", "value": "val"},
    }
    aggs = [
        {"name": "agg1", "type": "count", "fieldName": "metric1"},
        {"name": "agg2", "type": "longSum", "fieldName": "metric2"},
        {"name": "agg3", "type": "doubleSum", "fieldName": "metric3"},
        {"name": "agg4", "type": "doubleMin", "fieldName": "metric4"},
        {"name": "agg5", "type": "doubleMax", "fieldName": "metric5"},
        {"name": "agg6", "type": "hyperUnique", "fieldName": "metric6"},
        {"name": "agg7", "type": "cardinality", "fieldNames": ["dim1"],
         "byRow": False},
        {"name": "agg8", "type": "cardinality", "fieldNames": ["dim1", "dim2"],
         "byRow": True},
        {"name": "agg9", "type": "thetaSketch", "fieldName": "dim1",
         "isInputThetaSketch": False, "size": 16384},
        {"name": "agg10", "type": "thetaSketch", "fieldName": "metric7",
         "isInputThetaSketch": False, "size": 16384},
        {"name": "agg11", "type": "thetaSketch", "fieldName": "metric8",
         "isInputThetaSketch": True, "size": 8192},
    ]
    expected = []
    for agg in aggs:
        exp = deepcopy(base)
        exp.update({"aggregator": agg})
        expected.append(exp)

    built_agg = aggregators.build_aggregators(agg_input)

    # Sort both lists by inner aggregator name to make ordering stable.
    def sort_key(entry):
        return entry["aggregator"]["name"]

    assert sorted(expected, key=sort_key) == sorted(built_agg, key=sort_key)
def test_build_aggregators(self):
    """All aggregator kinds — including thetaSketch defaults and explicit
    options — should build into specs named after their dict keys."""
    agg_input = {
        "agg1": aggregators.count("metric1"),
        "agg2": aggregators.longsum("metric2"),
        "agg3": aggregators.doublesum("metric3"),
        "agg4": aggregators.doublemin("metric4"),
        "agg5": aggregators.doublemax("metric5"),
        "agg6": aggregators.hyperunique("metric6"),
        "agg7": aggregators.cardinality("dim1"),
        "agg8": aggregators.cardinality(["dim1", "dim2"], by_row=True),
        "agg9": aggregators.thetasketch("dim1"),
        "agg10": aggregators.thetasketch("metric7"),
        "agg11": aggregators.thetasketch("metric8", isinputthetasketch=True, size=8192),
    }
    expected = [
        {"name": "agg1", "type": "count", "fieldName": "metric1"},
        {"name": "agg2", "type": "longSum", "fieldName": "metric2"},
        {"name": "agg3", "type": "doubleSum", "fieldName": "metric3"},
        {"name": "agg4", "type": "doubleMin", "fieldName": "metric4"},
        {"name": "agg5", "type": "doubleMax", "fieldName": "metric5"},
        {"name": "agg6", "type": "hyperUnique", "fieldName": "metric6"},
        {"name": "agg7", "type": "cardinality", "fieldNames": ["dim1"],
         "byRow": False},
        {"name": "agg8", "type": "cardinality", "fieldNames": ["dim1", "dim2"],
         "byRow": True},
        {"name": "agg9", "type": "thetaSketch", "fieldName": "dim1",
         "isInputThetaSketch": False, "size": 16384},
        {"name": "agg10", "type": "thetaSketch", "fieldName": "metric7",
         "isInputThetaSketch": False, "size": 16384},
        {"name": "agg11", "type": "thetaSketch", "fieldName": "metric8",
         "isInputThetaSketch": True, "size": 8192},
    ]

    actual = aggregators.build_aggregators(agg_input)

    by_name = itemgetter("name")
    assert sorted(actual, key=by_name) == sorted(expected, key=by_name)