Exemplo n.º 1
0
    def __init__(self, name, count_filter=None):
        """Register the aggregations needed to count unique values.

        Args:
            name: Key under which the outer groupby aggregation is stored
                in `self.outer_aggregations`; also used as the prefix of
                the inner aggregation and post aggregation keys.
            count_filter: Optional filter limiting which values are
                counted. A falsy value or an `EmptyFilter` instance means
                every unique value is counted.
        """
        super(_HelperCalculation, self).__init__()
        self.outer_aggregations = {}

        counts_everything = (not count_filter or
                             isinstance(count_filter, EmptyFilter))
        if counts_everything:
            # Counting *all* of the unique values only needs a simple
            # "count" on the outer groupby.
            self.outer_aggregations[name] = count('count')
            return

        # Counting only the unique values that meet a filter takes more
        # work. Conceptually each matching row should contribute a 1 that
        # the outer groupby then sums. Druid has no aggregator that returns
        # a constant, so the inner groupby derives that 1 with a post
        # aggregator instead.
        filtered_count = filtered_aggregator(filter=count_filter,
                                             agg=count('count'))
        filtered_count_key = '%s%s_agg' % (name, self.SUFFIX)
        self.add_aggregation(filtered_count_key, filtered_count)

        # Dividing the filtered count by itself turns it into a 1 or 0 for
        # each inner groupby row.
        # NOTE(review): a row that fails the filter has a count of 0, so
        # this presumably relies on 0 / 0 evaluating to 0 - confirm.
        self_division = '%s / %s' % (filtered_count_key, filtered_count_key)
        constant_key = '%s%s_post_agg' % (name, self.SUFFIX)
        self.add_post_aggregation_from_formula(constant_key, self_division)

        # Summing the constant column in the outer groupby gives the exact
        # unique count for the filtered set.
        self.outer_aggregations[name] = longsum(constant_key)
Exemplo n.º 2
0
    def test_build_filtered_aggregator(self):
        """Check that `build_aggregators` handles filtered aggregators.

        Every input aggregator is wrapped in the same selector filter. The
        built output must match the hand-written specs, in which each
        wrapped aggregator carries the name of its key in `agg_input`.
        Ordering is ignored by sorting both sides on that name.
        """
        filter_ = filters.Filter(dimension="dim", value="val")
        agg_input = {
            "agg1": aggregators.filtered(filter_, aggregators.count("metric1")),
            "agg2": aggregators.filtered(filter_, aggregators.longsum("metric2")),
            "agg3": aggregators.filtered(filter_, aggregators.doublesum("metric3")),
            "agg4": aggregators.filtered(filter_, aggregators.min("metric4")),
            "agg5": aggregators.filtered(filter_, aggregators.max("metric5")),
            "agg6": aggregators.filtered(filter_, aggregators.hyperunique("metric6")),
            "agg7": aggregators.filtered(filter_, aggregators.cardinality("dim1")),
            "agg8": aggregators.filtered(filter_, aggregators.cardinality(["dim1", "dim2"], by_row=True)),
        }
        base = {"type": "filtered", "filter": {"type": "selector", "dimension": "dim", "value": "val"}}

        aggs = [
            {"name": "agg1", "type": "count", "fieldName": "metric1"},
            {"name": "agg2", "type": "longSum", "fieldName": "metric2"},
            {"name": "agg3", "type": "doubleSum", "fieldName": "metric3"},
            {"name": "agg4", "type": "min", "fieldName": "metric4"},
            {"name": "agg5", "type": "max", "fieldName": "metric5"},
            {"name": "agg6", "type": "hyperUnique", "fieldName": "metric6"},
            {"name": "agg7", "type": "cardinality", "fieldNames": ["dim1"], "byRow": False},
            {"name": "agg8", "type": "cardinality", "fieldNames": ["dim1", "dim2"], "byRow": True},
        ]
        expected = []
        for agg in aggs:
            # Each expected entry is the shared filter wrapper around one spec.
            exp = deepcopy(base)
            exp.update({"aggregator": agg})
            expected.append(exp)

        def wrapped_name(entry):
            # Sort key: the name stored on the wrapped aggregator.
            return entry["aggregator"]["name"]

        built_agg = aggregators.build_aggregators(agg_input)
        # Keep the built output and the hand-written expectation in separate
        # variables so the assertion compares one against the other rather
        # than the built output against itself.
        actual = sorted(built_agg, key=wrapped_name)
        expected = sorted(expected, key=wrapped_name)
        assert expected == actual
Exemplo n.º 3
0
    def test_build_query_none_type(self):
        """Build a query whose type is `None` and compare its `query_dict`.

        Afterwards `dimension`, `having` and `filter` are set to `None` one
        after another (the changes accumulate) and the query is rebuilt
        each time, since `None` should be accepted for each of them.
        """
        # given
        count_spec = {"fieldName": "thing", "name": "count", "type": "count"}
        selector_spec = {"dimension": "one", "type": "selector", "value": 1}
        having_spec = {"aggregation": "sum", "type": "greaterThan", "value": 1}
        expected_query_dict = {
            "queryType": None,
            "dataSource": "things",
            "aggregations": [count_spec],
            "filter": selector_spec,
            "having": having_spec,
            "dimension": "dim1",
        }

        query_builder = QueryBuilder()

        # when
        query_args = {
            "datasource": "things",
            "aggregations": {"count": aggregators.count("thing")},
            "filter": filters.Dimension("one") == 1,
            "having": having.Aggregation("sum") > 1,
            "dimension": "dim1",
        }
        built = query_builder.build_query(None, query_args)

        # then
        assert built.query_dict == expected_query_dict

        # you should be able to pass `None` to dimension/having/filter
        for nullable_key in ("dimension", "having", "filter"):
            expected_query_dict[nullable_key] = None
            query_args[nullable_key] = None

            rebuilt = query_builder.build_query(None, query_args)

            assert rebuilt.query_dict == expected_query_dict
Exemplo n.º 4
0
 def test_nested_filtered_aggregator(self):
     """Check that a filter nested in a filter names the innermost aggregator."""
     outer_filter = filters.Filter(dimension="dim1", value="val")
     inner_filter = filters.Filter(dimension="dim2", value="val")
     inner_agg = aggregators.filtered(inner_filter,
                                      aggregators.count("metric1"))
     nested_agg = aggregators.filtered(outer_filter, inner_agg)
     actual = aggregators.build_aggregators({"agg_name": nested_agg})

     # the innermost aggregation must have 'agg_name'
     named_count = {"fieldName": "metric1", "type": "count", "name": "agg_name"}
     expected_inner = {
         "type": "filtered",
         "aggregator": named_count,
         "filter": {"dimension": "dim2", "value": "val", "type": "selector"},
     }
     expected_outer = {
         "type": "filtered",
         "aggregator": expected_inner,
         "filter": {"dimension": "dim1", "value": "val", "type": "selector"},
     }
     expected = [expected_outer]
     assert expected == actual
Exemplo n.º 5
0
    def values_for_column(self, column_name, limit=10000):
        """Retrieve some values for the given column.

        Issues a topN query through the cluster's pydruid client, ranked by
        the ``count`` aggregation, and reads the column out of the exported
        pandas dataframe.

        Args:
            column_name: Dimension whose values are requested.
            limit: Passed to the query as its ``threshold``.

        Returns:
            A list with the ``column_name`` entry of every exported record.
        """
        message = 'Getting values for columns [{}] limited to [{}]'.format(
            column_name, limit)
        logging.info(message)
        # TODO: Use Lexicographic TopNMetricSpec once supported by PyDruid
        # The query window starts at `fetch_values_from` when it is set,
        # otherwise at 1970-01-01.
        from_dttm = (utils.parse_human_datetime(self.fetch_values_from)
                     if self.fetch_values_from else datetime(1970, 1, 1))

        qry = {
            'datasource': self.datasource_name,
            'granularity': 'all',
            'intervals': '/'.join(
                [from_dttm.isoformat(), datetime.now().isoformat()]),
            'aggregations': {'count': count('count')},
            'dimension': column_name,
            'metric': 'count',
            'threshold': limit,
        }

        client = self.cluster.get_pydruid_client()
        client.topn(**qry)
        df = client.export_pandas()

        values = []
        for record in df.to_records(index=False):
            values.append(record[column_name])
        return values
Exemplo n.º 6
0
 def test_nested_filtered_aggregator(self):
     """Check that a filter nested in a filter names the innermost aggregator."""
     outer_filter = filters.Filter(dimension='dim1', value='val')
     inner_filter = filters.Filter(dimension='dim2', value='val')
     inner_agg = aggregators.filtered(inner_filter,
                                      aggregators.count('metric1'))
     nested_agg = aggregators.filtered(outer_filter, inner_agg)
     actual = aggregators.build_aggregators({'agg_name': nested_agg})

     # the innermost aggregation must have 'agg_name'
     named_count = {'fieldName': 'metric1', 'type': 'count', 'name': 'agg_name'}
     expected_inner = {
         'type': 'filtered',
         'aggregator': named_count,
         'filter': {'dimension': 'dim2', 'value': 'val', 'type': 'selector'},
     }
     expected_outer = {
         'type': 'filtered',
         'aggregator': expected_inner,
         'filter': {'dimension': 'dim1', 'value': 'val', 'type': 'selector'},
     }
     expected = [expected_outer]
     assert expected == actual
Exemplo n.º 7
0
    def values_for_column(self, column_name, limit=10000):
        """Retrieve some values for the given column.

        A topN query ranked by the ``count`` aggregation is sent through
        the cluster's pydruid client; the values are then read from the
        exported pandas dataframe.

        Args:
            column_name: Dimension whose values are requested.
            limit: Passed to the query as its ``threshold``.

        Returns:
            A list with the ``column_name`` entry of every exported record.
        """
        log_line = 'Getting values for columns [{}] limited to [{}]'.format(
            column_name, limit)
        logging.info(log_line)
        # TODO: Use Lexicographic TopNMetricSpec once supported by PyDruid
        # Start of the query window: `fetch_values_from` when set, else the
        # first day of 1970.
        from_dttm = datetime(1970, 1, 1)
        if self.fetch_values_from:
            from_dttm = utils.parse_human_datetime(self.fetch_values_from)

        qry = {
            'datasource': self.datasource_name,
            'granularity': 'all',
            'intervals': '{}/{}'.format(
                from_dttm.isoformat(), datetime.now().isoformat()),
            'aggregations': {'count': count('count')},
            'dimension': column_name,
            'metric': 'count',
            'threshold': limit,
        }

        client = self.cluster.get_pydruid_client()
        client.topn(**qry)
        records = client.export_pandas().to_records(index=False)
        return [record[column_name] for record in records]
Exemplo n.º 8
0
    def _parse_metric(self):
        """Translate `self._metric` into aggregation query arguments.

        Returns:
            A dict with an ``"aggregations"`` entry holding one aggregator
            over `self._field` named ``"result"``, plus ``"metric"`` set
            to ``"result"``.

        Raises:
            ParseArgException: If `self._metric` is not one of ``'uv'``,
                ``'exact_uv'``, ``'pv'`` or ``'longsum'``.
        """
        metric = self._metric

        # Pick the aggregator first; every branch shares the same envelope.
        if metric == 'uv':
            aggregator = cardinality(self._field)
        elif metric == 'exact_uv':
            aggregator = thetasketch(self._field)
        elif metric == 'pv':
            aggregator = count(self._field)
        elif metric == 'longsum':
            aggregator = longsum(self._field)
        else:
            raise ParseArgException("Parse metric failed")

        return {
            "aggregations": {"result": aggregator},
            "metric": "result",
        }
Exemplo n.º 9
0
    def test_build_aggregators(self):
        """Compare `build_aggregators` output with hand-written specs.

        Both lists are sorted by aggregator name before the comparison, so
        the order of the built list does not matter.
        """
        requested = {
            'agg1': aggregators.count('metric1'),
            'agg2': aggregators.longsum('metric2'),
            'agg3': aggregators.doublesum('metric3'),
            'agg4': aggregators.doublemin('metric4'),
            'agg5': aggregators.doublemax('metric5'),
            'agg6': aggregators.hyperunique('metric6'),
            'agg7': aggregators.cardinality('dim1'),
            'agg8': aggregators.cardinality(['dim1', 'dim2'], by_row=True),
            'agg9': aggregators.thetasketch('dim1'),
            'agg10': aggregators.thetasketch('metric7'),
            'agg11': aggregators.thetasketch(
                'metric8', isinputthetasketch=True, size=8192),
        }
        built = aggregators.build_aggregators(requested)

        expected_simple = [
            {'name': 'agg1', 'type': 'count', 'fieldName': 'metric1'},
            {'name': 'agg2', 'type': 'longSum', 'fieldName': 'metric2'},
            {'name': 'agg3', 'type': 'doubleSum', 'fieldName': 'metric3'},
            {'name': 'agg4', 'type': 'doubleMin', 'fieldName': 'metric4'},
            {'name': 'agg5', 'type': 'doubleMax', 'fieldName': 'metric5'},
            {'name': 'agg6', 'type': 'hyperUnique', 'fieldName': 'metric6'},
        ]
        expected_cardinality = [
            {'name': 'agg7', 'type': 'cardinality',
             'fieldNames': ['dim1'], 'byRow': False},
            {'name': 'agg8', 'type': 'cardinality',
             'fieldNames': ['dim1', 'dim2'], 'byRow': True},
        ]
        expected_theta = [
            {'name': 'agg9', 'type': 'thetaSketch', 'fieldName': 'dim1',
             'isInputThetaSketch': False, 'size': 16384},
            {'name': 'agg10', 'type': 'thetaSketch', 'fieldName': 'metric7',
             'isInputThetaSketch': False, 'size': 16384},
            {'name': 'agg11', 'type': 'thetaSketch', 'fieldName': 'metric8',
             'isInputThetaSketch': True, 'size': 8192},
        ]
        expected = expected_simple + expected_cardinality + expected_theta

        by_name = itemgetter('name')
        built_sorted = sorted(built, key=by_name)
        expected_sorted = sorted(expected, key=by_name)
        assert built_sorted == expected_sorted
Exemplo n.º 10
0
 def test_filtered_aggregator(self):
     """Check that `filtered` wraps each aggregator with the built filter."""
     selector = filters.Filter(dimension="dim", value="val")
     inner_aggs = (
         aggregators.count("metric1"),
         aggregators.longsum("metric2"),
         aggregators.doublesum("metric3"),
         aggregators.doublemin("metric4"),
         aggregators.doublemax("metric5"),
         aggregators.hyperunique("metric6"),
         aggregators.cardinality("dim1"),
         aggregators.cardinality(["dim1", "dim2"], by_row=True),
         aggregators.thetasketch("dim1"),
         aggregators.thetasketch("metric7"),
         aggregators.thetasketch("metric8", isinputthetasketch=True, size=8192),
     )
     for inner in inner_aggs:
         # The wrapper holds the selector spec plus the untouched aggregator.
         selector_spec = {"type": "selector", "dimension": "dim", "value": "val"}
         expected = {"type": "filtered", "filter": selector_spec, "aggregator": inner}
         actual = aggregators.filtered(selector, inner)
         assert actual == expected
Exemplo n.º 11
0
 def test_filtered_aggregator(self):
     """Check that `filtered` wraps each aggregator with the built filter."""
     selector = filters.Filter(dimension='dim', value='val')
     inner_aggs = (
         aggregators.count('metric1'),
         aggregators.longsum('metric2'),
         aggregators.doublesum('metric3'),
         aggregators.doublemin('metric4'),
         aggregators.doublemax('metric5'),
         aggregators.hyperunique('metric6'),
         aggregators.cardinality('dim1'),
         aggregators.cardinality(['dim1', 'dim2'], by_row=True),
         aggregators.thetasketch('dim1'),
         aggregators.thetasketch('metric7'),
         aggregators.thetasketch('metric8', isinputthetasketch=True, size=8192),
     )
     for inner in inner_aggs:
         # The wrapper holds the selector spec plus the untouched aggregator.
         selector_spec = {'type': 'selector', 'dimension': 'dim', 'value': 'val'}
         expected = {'type': 'filtered', 'filter': selector_spec, 'aggregator': inner}
         actual = aggregators.filtered(selector, inner)
         assert actual == expected
Exemplo n.º 12
0
 def test_build_aggregators(self):
     """Compare `build_aggregators` output with hand-written specs.

     Both lists are sorted by aggregator name before the comparison, so
     the order of the built list does not matter.
     """
     requested = {
         'agg1': aggregators.count('metric1'),
         'agg2': aggregators.longsum('metric2'),
         'agg3': aggregators.doublesum('metric3'),
         'agg4': aggregators.min('metric4'),
         'agg5': aggregators.max('metric5'),
         'agg6': aggregators.hyperunique('metric6'),
         'agg7': aggregators.cardinality('dim1'),
         'agg8': aggregators.cardinality(['dim1', 'dim2'], by_row=True),
     }
     built = aggregators.build_aggregators(requested)

     expected = [
         {'name': 'agg1', 'type': 'count', 'fieldName': 'metric1'},
         {'name': 'agg2', 'type': 'longSum', 'fieldName': 'metric2'},
         {'name': 'agg3', 'type': 'doubleSum', 'fieldName': 'metric3'},
         {'name': 'agg4', 'type': 'min', 'fieldName': 'metric4'},
         {'name': 'agg5', 'type': 'max', 'fieldName': 'metric5'},
         {'name': 'agg6', 'type': 'hyperUnique', 'fieldName': 'metric6'},
         {'name': 'agg7', 'type': 'cardinality',
          'fieldNames': ['dim1'], 'byRow': False},
         {'name': 'agg8', 'type': 'cardinality',
          'fieldNames': ['dim1', 'dim2'], 'byRow': True},
     ]

     by_name = itemgetter('name')
     built_sorted = sorted(built, key=by_name)
     expected_sorted = sorted(expected, key=by_name)
     assert built_sorted == expected_sorted
Exemplo n.º 13
0
 def __init__(self, query_client=None):
     """Set the default parameters used to query pydruid.

     Per the original note, the queried data is meant to be returned as
     a (pivoted) pandas dataframe.

     Args:
         query_client: Optional query client; `DruidQueryClient` is used
             when this is falsy.
     """
     self.datasource = DATASOURCE.name
     self.granularity = 'month'
     # Query window from START_DATE_STR up to TODAY_DATE_STR.
     date_range = (START_DATE_STR, TODAY_DATE_STR)
     self.intervals = '%s/%s' % date_range
     self.dimensions = []
     self.field_dimension = DEFAULT_FIELD
     self.filter = DEFAULT_FILTER
     self.agg_alias = 'sum'

     # One double sum stored under the alias, plus a row count.
     default_aggregations = {}
     default_aggregations[self.agg_alias] = doublesum('sum')
     default_aggregations['count'] = count('count')
     self.aggregations = default_aggregations

     if query_client:
         self.query_client = query_client
     else:
         self.query_client = DruidQueryClient
Exemplo n.º 14
0
    def test_build_filtered_aggregator(self):
        """Check that `build_aggregators` handles filtered aggregators.

        Every input aggregator is wrapped in the same selector filter. The
        built output must match the hand-written specs, in which each
        wrapped aggregator carries the name of its key in `agg_input`.
        Ordering is ignored by sorting both sides on that name.
        """
        filter_ = filters.Filter(dimension='dim', value='val')
        agg_input = {
            'agg1': aggregators.filtered(filter_,
                                         aggregators.count('metric1')),
            'agg2': aggregators.filtered(filter_,
                                         aggregators.longsum('metric2')),
            'agg3': aggregators.filtered(filter_,
                                         aggregators.doublesum('metric3')),
            'agg4': aggregators.filtered(filter_,
                                         aggregators.min('metric4')),
            'agg5': aggregators.filtered(filter_,
                                         aggregators.max('metric5')),
            'agg6': aggregators.filtered(filter_,
                                         aggregators.hyperunique('metric6')),
            'agg7': aggregators.filtered(filter_,
                                         aggregators.cardinality('dim1')),
            'agg8': aggregators.filtered(filter_,
                                         aggregators.cardinality(['dim1', 'dim2'], by_row=True)),
        }
        base = {
            'type': 'filtered',
            'filter': {
                'type': 'selector',
                'dimension': 'dim',
                'value': 'val'
            }
        }

        aggs = [
            {'name': 'agg1', 'type': 'count', 'fieldName': 'metric1'},
            {'name': 'agg2', 'type': 'longSum', 'fieldName': 'metric2'},
            {'name': 'agg3', 'type': 'doubleSum', 'fieldName': 'metric3'},
            {'name': 'agg4', 'type': 'min', 'fieldName': 'metric4'},
            {'name': 'agg5', 'type': 'max', 'fieldName': 'metric5'},
            {'name': 'agg6', 'type': 'hyperUnique', 'fieldName': 'metric6'},
            {'name': 'agg7', 'type': 'cardinality', 'fieldNames': ['dim1'], 'byRow': False},
            {'name': 'agg8', 'type': 'cardinality', 'fieldNames': ['dim1', 'dim2'], 'byRow': True},
        ]
        expected = []
        for agg in aggs:
            # Each expected entry is the shared filter wrapper around one spec.
            exp = deepcopy(base)
            exp.update({'aggregator': agg})
            expected.append(exp)

        def wrapped_name(entry):
            # Sort key: the name stored on the wrapped aggregator.
            return entry['aggregator']['name']

        built_agg = aggregators.build_aggregators(agg_input)
        # Keep the built output and the hand-written expectation in separate
        # variables so the assertion compares one against the other rather
        # than the built output against itself.
        actual = sorted(built_agg, key=wrapped_name)
        expected = sorted(expected, key=wrapped_name)
        assert expected == actual
Exemplo n.º 15
0
    def test_build_query_none_type(self):
        """Build a query whose type is `None` and compare its `query_dict`.

        Afterwards `dimension`, `having` and `filter` are set to `None` one
        after another (the changes accumulate) and the query is rebuilt
        each time, since `None` should be accepted for each of them.
        """
        # given
        count_spec = {'fieldName': 'thing', 'name': 'count', 'type': 'count'}
        selector_spec = {'dimension': 'one', 'type': 'selector', 'value': 1}
        having_spec = {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1}
        expected_query_dict = {
            'queryType': None,
            'dataSource': 'things',
            'aggregations': [count_spec],
            'filter': selector_spec,
            'having': having_spec,
            'dimension': 'dim1',
        }

        query_builder = QueryBuilder()

        # when
        query_args = {
            'datasource': 'things',
            'aggregations': {'count': aggregators.count('thing')},
            'filter': filters.Dimension('one') == 1,
            'having': having.Aggregation('sum') > 1,
            'dimension': 'dim1',
        }
        built = query_builder.build_query(None, query_args)

        # then
        assert built.query_dict == expected_query_dict

        # you should be able to pass `None` to dimension/having/filter
        for nullable_key in ('dimension', 'having', 'filter'):
            expected_query_dict[nullable_key] = None
            query_args[nullable_key] = None

            rebuilt = query_builder.build_query(None, query_args)

            assert rebuilt.query_dict == expected_query_dict
Exemplo n.º 16
0
    def test_build_query(self):
        """Build a query and compare its `query_dict` with the expected dict.

        The arguments cover aggregations, post aggregations, a paging spec,
        a filter, a having clause, an extra key and a virtual column.
        """
        # given
        avg_post_agg = {
            'fields': [
                {'fieldName': 'sum', 'type': 'fieldAccess'},
                {'fieldName': 'count', 'type': 'fieldAccess'},
            ],
            'fn': '/',
            'name': 'avg',
            'type': 'arithmetic',
        }
        foo_column = {
            'type': 'expression',
            'name': 'foo',
            'expression': "concat('foo' + page)",
            'outputType': 'STRING',
        }
        expected_query_dict = {
            'queryType': None,
            'dataSource': 'things',
            'aggregations': [
                {'fieldName': 'thing', 'name': 'count', 'type': 'count'},
            ],
            'postAggregations': [avg_post_agg],
            'pagingSpec': {'pagingIdentifies': {}, 'threshold': 1},
            'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
            'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
            'new_key': 'value',
            'virtualColumns': [foo_column],
        }

        query_builder = QueryBuilder()

        # when
        query_args = {
            'datasource': 'things',
            'aggregations': {'count': aggregators.count('thing')},
            'post_aggregations': {
                'avg': postaggregator.Field('sum') / postaggregator.Field('count'),
            },
            'paging_spec': {'pagingIdentifies': {}, 'threshold': 1},
            'filter': filters.Dimension('one') == 1,
            'having': having.Aggregation('sum') > 1,
            'new_key': 'value',
            'virtualColumns': [
                VirtualColumn(
                    type='expression',
                    name='foo',
                    expression="concat('foo' + page)",
                    outputType='STRING',
                ),
            ],
        }
        built = query_builder.build_query(None, query_args)

        # then
        assert built.query_dict == expected_query_dict
Exemplo n.º 17
0
    def _parse_metric(self):
        """Translate `self._metric` into aggregation query arguments.

        Returns:
            A dict whose ``"aggregations"`` entry holds one aggregator over
            `self._field`, named ``"result"``.

        Raises:
            ParseArgException: If `self._metric` is not one of ``'uv'``,
                ``'pv'``, ``'longsum'`` or ``'doublesum'``.
        """
        metric = self._metric

        # Pick the aggregator first; every branch shares the same envelope.
        if metric == 'uv':
            aggregator = cardinality(self._field)
        elif metric == 'pv':
            aggregator = count(self._field)
        elif metric == 'longsum':
            aggregator = longsum(self._field)
        elif metric == 'doublesum':
            aggregator = doublesum(self._field)
        else:
            raise ParseArgException("Parse metric failed")

        return {"aggregations": {"result": aggregator}}
Exemplo n.º 18
0
 def test_nested_filtered_aggregator(self):
     """Check that a filter nested in a filter names the innermost aggregator."""
     outer_filter = filters.Filter(dimension='dim1', value='val')
     inner_filter = filters.Filter(dimension='dim2', value='val')
     inner_agg = aggregators.filtered(inner_filter,
                                      aggregators.count('metric1'))
     nested_agg = aggregators.filtered(outer_filter, inner_agg)
     actual = aggregators.build_aggregators({'agg_name': nested_agg})

     # the innermost aggregation must have 'agg_name'
     expected_inner = {
         'type': 'filtered',
         'aggregator': {
             'fieldName': 'metric1',
             'type': 'count',
             'name': 'agg_name',
         },
         'filter': {
             'dimension': 'dim2',
             'value': 'val',
             'type': 'selector',
         },
     }
     expected = [{
         'type': 'filtered',
         'aggregator': expected_inner,
         'filter': {
             'dimension': 'dim1',
             'value': 'val',
             'type': 'selector',
         },
     }]
     assert expected == actual
Exemplo n.º 19
0
    def test_build_query(self):
        """Build a query and compare its `query_dict` with the expected dict.

        The arguments cover aggregations, post aggregations, a paging spec,
        a filter, a having clause and an extra key.
        """
        # given
        avg_post_agg = {
            'fields': [
                {'fieldName': 'sum', 'type': 'fieldAccess'},
                {'fieldName': 'count', 'type': 'fieldAccess'},
            ],
            'fn': '/',
            'name': 'avg',
            'type': 'arithmetic',
        }
        expected_query_dict = {
            'queryType': None,
            'dataSource': 'things',
            'aggregations': [
                {'fieldName': 'thing', 'name': 'count', 'type': 'count'},
            ],
            'postAggregations': [avg_post_agg],
            'pagingSpec': {'pagingIdentifies': {}, 'threshold': 1},
            'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
            'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
            'new_key': 'value',
        }

        query_builder = QueryBuilder()

        # when
        query_args = {
            'datasource': 'things',
            'aggregations': {'count': aggregators.count('thing')},
            'post_aggregations': {
                'avg': postaggregator.Field('sum') / postaggregator.Field('count'),
            },
            'paging_spec': {'pagingIdentifies': {}, 'threshold': 1},
            'filter': filters.Dimension('one') == 1,
            'having': having.Aggregation('sum') > 1,
            'new_key': 'value',
        }
        built = query_builder.build_query(None, query_args)

        # then
        assert built.query_dict == expected_query_dict
Exemplo n.º 20
0
 def test_nested_filtered_aggregator(self):
     """Check that a filter nested in a filter names the innermost aggregator."""
     outer_filter = filters.Filter(dimension="dim1", value="val")
     inner_filter = filters.Filter(dimension="dim2", value="val")
     inner_agg = aggregators.filtered(inner_filter,
                                      aggregators.count("metric1"))
     nested_agg = aggregators.filtered(outer_filter, inner_agg)
     actual = aggregators.build_aggregators({"agg_name": nested_agg})

     # the innermost aggregation must have 'agg_name'
     expected_inner = {
         "type": "filtered",
         "aggregator": {
             "fieldName": "metric1",
             "type": "count",
             "name": "agg_name",
         },
         "filter": {
             "dimension": "dim2",
             "value": "val",
             "type": "selector",
         },
     }
     expected = [{
         "type": "filtered",
         "aggregator": expected_inner,
         "filter": {
             "dimension": "dim1",
             "value": "val",
             "type": "selector",
         },
     }]
     assert expected == actual
Exemplo n.º 21
0
    def test_build_subquery(self):
        """Build a subquery and compare the returned dict with the expected one.

        The expected dict wraps a ``groupBy`` query under ``"query"`` and
        carries ``"type": "query"`` next to it.
        """
        # given
        avg_post_agg = {
            "fields": [
                {"fieldName": "sum", "type": "fieldAccess"},
                {"fieldName": "count", "type": "fieldAccess"},
            ],
            "fn": "/",
            "name": "avg",
            "type": "arithmetic",
        }
        inner_query = {
            "queryType": "groupBy",
            "dataSource": "things",
            "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}],
            "postAggregations": [avg_post_agg],
            "filter": {"dimension": "one", "type": "selector", "value": 1},
            "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
        }
        expected_query_dict = {"query": inner_query, "type": "query"}

        query_builder = QueryBuilder()

        # when
        subquery_args = {
            "datasource": "things",
            "aggregations": {"count": aggregators.count("thing")},
            "post_aggregations": {
                "avg": postaggregator.Field("sum") / postaggregator.Field("count"),
            },
            "filter": filters.Dimension("one") == 1,
            "having": having.Aggregation("sum") > 1,
        }
        subquery_dict = query_builder.subquery(subquery_args)

        # then
        assert subquery_dict == expected_query_dict
Exemplo n.º 22
0
    def test_build_query(self):
        """Build a query and compare its `query_dict` with the expected dict.

        The arguments cover aggregations, post aggregations, a paging spec,
        a filter, a having clause and an extra key.
        """
        # given
        avg_post_agg = {
            "fields": [
                {"fieldName": "sum", "type": "fieldAccess"},
                {"fieldName": "count", "type": "fieldAccess"},
            ],
            "fn": "/",
            "name": "avg",
            "type": "arithmetic",
        }
        count_spec = {"fieldName": "thing", "name": "count", "type": "count"}
        expected_query_dict = {
            "queryType": None,
            "dataSource": "things",
            "aggregations": [count_spec],
            "postAggregations": [avg_post_agg],
            "pagingSpec": {"pagingIdentifies": {}, "threshold": 1},
            "filter": {"dimension": "one", "type": "selector", "value": 1},
            "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
            "new_key": "value",
        }

        query_builder = QueryBuilder()

        # when
        query_args = {
            "datasource": "things",
            "aggregations": {"count": aggregators.count("thing")},
            "post_aggregations": {
                "avg": postaggregator.Field("sum") / postaggregator.Field("count"),
            },
            "paging_spec": {"pagingIdentifies": {}, "threshold": 1},
            "filter": filters.Dimension("one") == 1,
            "having": having.Aggregation("sum") > 1,
            "new_key": "value",
        }
        built = query_builder.build_query(None, query_args)

        # then
        assert built.query_dict == expected_query_dict
Exemplo n.º 23
0
 def test_filtered_aggregator(self):
     """Check that `filtered` wraps each aggregator with the built filter."""
     selector = filters.Filter(dimension='dim', value='val')
     inner_aggs = (
         aggregators.count('metric1'),
         aggregators.longsum('metric2'),
         aggregators.doublesum('metric3'),
         aggregators.min('metric4'),
         aggregators.max('metric5'),
         aggregators.hyperunique('metric6'),
     )
     for inner in inner_aggs:
         # The wrapper holds the selector spec plus the untouched aggregator.
         selector_spec = {'type': 'selector', 'dimension': 'dim', 'value': 'val'}
         expected = {'type': 'filtered', 'filter': selector_spec, 'aggregator': inner}
         actual = aggregators.filtered(selector, inner)
         assert actual == expected
Exemplo n.º 24
0
 def test_build_aggregators(self):
     """Compare `build_aggregators` output with hand-written specs.

     Both lists are sorted by aggregator name before the comparison, so
     the order of the built list does not matter.
     """
     requested = {
         'agg1': aggregators.count('metric1'),
         'agg2': aggregators.longsum('metric2'),
         'agg3': aggregators.doublesum('metric3'),
         'agg4': aggregators.min('metric4'),
         'agg5': aggregators.max('metric5'),
         'agg6': aggregators.hyperunique('metric6'),
     }
     built = aggregators.build_aggregators(requested)

     # (name, type, fieldName) of every expected aggregator spec.
     spec_rows = [
         ('agg1', 'count', 'metric1'),
         ('agg2', 'longSum', 'metric2'),
         ('agg3', 'doubleSum', 'metric3'),
         ('agg4', 'min', 'metric4'),
         ('agg5', 'max', 'metric5'),
         ('agg6', 'hyperUnique', 'metric6'),
     ]
     expected = [
         {'name': agg_name, 'type': agg_type, 'fieldName': field_name}
         for agg_name, agg_type, field_name in spec_rows
     ]

     by_name = itemgetter('name')
     built_sorted = sorted(built, key=by_name)
     expected_sorted = sorted(expected, key=by_name)
     assert built_sorted == expected_sorted
Exemplo n.º 25
0
 def test_filtered_aggregator(self):
     """Check that `filtered` wraps each aggregator with the built filter."""
     selector = filters.Filter(dimension="dim", value="val")
     inner_aggs = (
         aggregators.count("metric1"),
         aggregators.longsum("metric2"),
         aggregators.doublesum("metric3"),
         aggregators.min("metric4"),
         aggregators.max("metric5"),
         aggregators.hyperunique("metric6"),
         aggregators.cardinality("dim1"),
         aggregators.cardinality(["dim1", "dim2"], by_row=True),
     )
     for inner in inner_aggs:
         # The wrapper holds the selector spec plus the untouched aggregator.
         expected = {}
         expected["type"] = "filtered"
         expected["filter"] = {"type": "selector", "dimension": "dim", "value": "val"}
         expected["aggregator"] = inner
         actual = aggregators.filtered(selector, inner)
         assert actual == expected
Exemplo n.º 26
0
    def test_build_subquery(self):
        """``subquery`` returns a ``query``-typed dict wrapping a groupBy query."""

        def field_access(field_name):
            # Shape of one operand inside the arithmetic post-aggregation.
            return {'fieldName': field_name, 'type': 'fieldAccess'}

        # when: a subquery is built from aggregations, post-aggregations,
        # a filter and a having clause
        subquery_args = {
            'datasource': 'things',
            'aggregations': {
                'count': aggregators.count('thing'),
            },
            'post_aggregations': {
                'avg': (postaggregator.Field('sum') /
                        postaggregator.Field('count')),
            },
            'filter': filters.Dimension('one') == 1,
            'having': having.Aggregation('sum') > 1,
        }
        produced = QueryBuilder().subquery(subquery_args)

        # then: the inner query is a groupBy with every clause serialised
        inner_query = {
            'queryType': 'groupBy',
            'dataSource': 'things',
            'aggregations': [
                {'fieldName': 'thing', 'name': 'count', 'type': 'count'},
            ],
            'postAggregations': [{
                'fields': [field_access('sum'), field_access('count')],
                'fn': '/',
                'name': 'avg',
                'type': 'arithmetic',
            }],
            'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
            'having': {
                'aggregation': 'sum', 'type': 'greaterThan', 'value': 1,
            },
        }
        assert produced == {'query': inner_query, 'type': 'query'}
Exemplo n.º 27
0
 def test_build_aggregators(self):
     """Built aggregators keep their spec fields and gain the input key as ``name``."""
     name_of = itemgetter("name")
     requested = {
         "agg1": aggregators.count("metric1"),
         "agg2": aggregators.longsum("metric2"),
         "agg3": aggregators.doublesum("metric3"),
         "agg4": aggregators.min("metric4"),
         "agg5": aggregators.max("metric5"),
         "agg6": aggregators.hyperunique("metric6"),
         "agg7": aggregators.cardinality("dim1"),
         "agg8": aggregators.cardinality(["dim1", "dim2"], by_row=True),
     }
     result = aggregators.build_aggregators(requested)

     # Simple single-field aggregators: (name, druid type, field).
     simple = [
         ("agg1", "count", "metric1"),
         ("agg2", "longSum", "metric2"),
         ("agg3", "doubleSum", "metric3"),
         ("agg4", "min", "metric4"),
         ("agg5", "max", "metric5"),
         ("agg6", "hyperUnique", "metric6"),
     ]
     reference = [
         {"name": name, "type": kind, "fieldName": field}
         for name, kind, field in simple
     ]
     # Cardinality aggregators list their fields and carry a byRow flag.
     reference.append({
         "name": "agg7",
         "type": "cardinality",
         "fieldNames": ["dim1"],
         "byRow": False,
     })
     reference.append({
         "name": "agg8",
         "type": "cardinality",
         "fieldNames": ["dim1", "dim2"],
         "byRow": True,
     })

     assert sorted(result, key=name_of) == sorted(reference, key=name_of)
Exemplo n.º 28
0
    def values_for_column(self, column_name, from_dttm, to_dttm, limit=500):
        """Run a topN query on ``column_name`` and return the exported frame.

        Both datetimes have their tzinfo replaced with ``DRUID_TZ`` before the
        query interval is formed. The topN uses a ``count`` aggregation as its
        metric and ``limit`` as its threshold. Raises ``Exception`` when the
        exported frame is ``None`` or empty.
        """
        # TODO: Use Lexicographic TopNMetricSpec once supported by PyDruid
        start = from_dttm.replace(tzinfo=DRUID_TZ)
        end = to_dttm.replace(tzinfo=DRUID_TZ)

        topn_kwargs = {
            "datasource": self.datasource_name,
            "granularity": "all",
            "intervals": "/".join((start.isoformat(), end.isoformat())),
            "aggregations": {"count": count("count")},
            "dimension": column_name,
            "metric": "count",
            "threshold": limit,
        }

        pydruid_client = self.cluster.get_pydruid_client()
        pydruid_client.topn(**topn_kwargs)
        frame = pydruid_client.export_pandas()

        if frame is not None and frame.size != 0:
            return frame
        raise Exception(_("No data was returned."))
Exemplo n.º 29
0
 def test_build_aggregators(self):
     """``build_aggregators`` emits one named spec per input entry."""
     inputs = dict(
         agg1=aggregators.count('metric1'),
         agg2=aggregators.longsum('metric2'),
         agg3=aggregators.doublesum('metric3'),
         agg4=aggregators.min('metric4'),
         agg5=aggregators.max('metric5'),
         agg6=aggregators.hyperunique('metric6'),
         agg7=aggregators.cardinality('dim1'),
         agg8=aggregators.cardinality(['dim1', 'dim2'], by_row=True),
     )
     outputs = aggregators.build_aggregators(inputs)

     wanted = [
         dict(name='agg1', type='count', fieldName='metric1'),
         dict(name='agg2', type='longSum', fieldName='metric2'),
         dict(name='agg3', type='doubleSum', fieldName='metric3'),
         dict(name='agg4', type='min', fieldName='metric4'),
         dict(name='agg5', type='max', fieldName='metric5'),
         dict(name='agg6', type='hyperUnique', fieldName='metric6'),
         dict(name='agg7', type='cardinality', fieldNames=['dim1'],
              byRow=False),
         dict(name='agg8', type='cardinality', fieldNames=['dim1', 'dim2'],
              byRow=True),
     ]

     # Order both lists by name before comparing them.
     sort_key = itemgetter('name')
     outputs_sorted = sorted(outputs, key=sort_key)
     wanted_sorted = sorted(wanted, key=sort_key)
     assert outputs_sorted == wanted_sorted
Exemplo n.º 30
0
    def test_build_query_none_type(self):
        """``build_query`` accepts a ``None`` query type and ``None`` clauses."""
        # given
        wanted = {
            'queryType': None,
            'dataSource': 'things',
            'aggregations': [
                {'fieldName': 'thing', 'name': 'count', 'type': 'count'},
            ],
            'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
            'having': {
                'aggregation': 'sum', 'type': 'greaterThan', 'value': 1,
            },
            'dimension': 'dim1',
        }
        builder = QueryBuilder()
        query_args = {
            'datasource': 'things',
            'aggregations': {
                'count': aggregators.count('thing'),
            },
            'filter': filters.Dimension('one') == 1,
            'having': having.Aggregation('sum') > 1,
            'dimension': 'dim1',
        }

        # when / then: the fully populated arguments serialise as expected
        assert builder.build_query(None, query_args).query_dict == wanted

        # you should be able to pass `None` to dimension/having/filter;
        # the fields are nulled cumulatively, one more on each pass
        for key in ('dimension', 'having', 'filter'):
            wanted[key] = None
            query_args[key] = None
            assert builder.build_query(None, query_args).query_dict == wanted
Exemplo n.º 31
0
from pydruid.utils.filters import Dimension

from db.druid.calculations.base_calculation import BaseCalculation
from db.druid.query_builder import GroupByQueryBuilder
from log import LOG
from web.server.data.status import INTERVAL
from web.server.security.permissions import SuperUserPermission
from web.server.routes.views.query_policy import enumerate_query_needs

# Key for the display name. The value carries a leading underscore to
# distinguish it from actual dimensions.
# NOTE(review): this comment previously said "prefixed with __", but the value
# below has a single underscore - confirm which one is intended.
DISPLAY_FIELD = '_display'

# Name under which the count aggregation is registered in COUNT_CALCULATION.
COUNT_AGGREGATION_NAME = 'count'
# Calculation whose only aggregation is a `count` aggregator, keyed by
# COUNT_AGGREGATION_NAME.
COUNT_CALCULATION = BaseCalculation(
    aggregations={COUNT_AGGREGATION_NAME: count('count')})


class DimensionValuesLookup(object):
    """Holds a query client and a map from dimension to its list of values.

    NOTE(review): only the beginning of this class is visible here. Of the
    constructor arguments, only ``query_client`` is used in the visible code;
    how ``datasource``, ``filter_dimensions``, ``dimension_slices``,
    ``authorizable_dimensions`` and ``geo_field_ordering`` are consumed
    cannot be confirmed from this excerpt.
    """

    def __init__(
        self,
        query_client,
        datasource,
        filter_dimensions,
        dimension_slices,
        authorizable_dimensions,
        geo_field_ordering,
    ):
        # Keep a reference to the client passed in by the caller.
        self.query_client = query_client
        # Map from dimension to a list of dimension values.
        self.dimension_map = defaultdict(list)
Exemplo n.º 32
0
    def test_build_filtered_aggregator(self):
        """Filtered aggregators are built with the name on the inner aggregator.

        Every input entry wraps an aggregator in ``aggregators.filtered``.
        The built result must equal the shared filter envelope with the
        matching named inner aggregator under ``aggregator``.
        """
        filter_ = filters.Filter(dimension='dim', value='val')
        agg_input = {
            'agg1': aggregators.filtered(filter_,
                                         aggregators.count('metric1')),
            'agg2': aggregators.filtered(filter_,
                                         aggregators.longsum('metric2')),
            'agg3': aggregators.filtered(filter_,
                                         aggregators.doublesum('metric3')),
            'agg4': aggregators.filtered(filter_,
                                         aggregators.doublemin('metric4')),
            'agg5': aggregators.filtered(filter_,
                                         aggregators.doublemax('metric5')),
            'agg6': aggregators.filtered(filter_,
                                         aggregators.hyperunique('metric6')),
            'agg7': aggregators.filtered(filter_,
                                         aggregators.cardinality('dim1')),
            'agg8': aggregators.filtered(
                filter_,
                aggregators.cardinality(['dim1', 'dim2'], by_row=True)),
            'agg9': aggregators.filtered(filter_,
                                         aggregators.thetasketch('dim1')),
            'agg10': aggregators.filtered(filter_,
                                          aggregators.thetasketch('metric7')),
            'agg11': aggregators.filtered(
                filter_,
                aggregators.thetasketch('metric8',
                                        isinputthetasketch=True,
                                        size=8192)),
        }
        # Envelope shared by every expected filtered aggregator.
        base = {
            'type': 'filtered',
            'filter': {
                'type': 'selector',
                'dimension': 'dim',
                'value': 'val'
            }
        }

        aggs = [
            {'name': 'agg1', 'type': 'count', 'fieldName': 'metric1'},
            {'name': 'agg2', 'type': 'longSum', 'fieldName': 'metric2'},
            {'name': 'agg3', 'type': 'doubleSum', 'fieldName': 'metric3'},
            {'name': 'agg4', 'type': 'doubleMin', 'fieldName': 'metric4'},
            {'name': 'agg5', 'type': 'doubleMax', 'fieldName': 'metric5'},
            {'name': 'agg6', 'type': 'hyperUnique', 'fieldName': 'metric6'},
            {'name': 'agg7', 'type': 'cardinality', 'fieldNames': ['dim1'],
             'byRow': False},
            {'name': 'agg8', 'type': 'cardinality',
             'fieldNames': ['dim1', 'dim2'], 'byRow': True},
            {'name': 'agg9', 'type': 'thetaSketch', 'fieldName': 'dim1',
             'isInputThetaSketch': False, 'size': 16384},
            {'name': 'agg10', 'type': 'thetaSketch', 'fieldName': 'metric7',
             'isInputThetaSketch': False, 'size': 16384},
            {'name': 'agg11', 'type': 'thetaSketch', 'fieldName': 'metric8',
             'isInputThetaSketch': True, 'size': 8192},
        ]
        expected = []
        for agg in aggs:
            exp = deepcopy(base)
            exp['aggregator'] = agg
            expected.append(exp)

        def inner_name(entry):
            # Filtered specs keep their name on the wrapped aggregator.
            return entry['aggregator']['name']

        built_agg = aggregators.build_aggregators(agg_input)
        # Compare the builder output against the hand-written expectation.
        # The previous version overwrote `expected` with the builder output
        # and then compared that list with itself, so it could never fail.
        actual = sorted(built_agg, key=inner_name)
        expected = sorted(expected, key=inner_name)
        assert actual == expected
Exemplo n.º 33
0
    def test_build_filtered_aggregator(self):
        """Filtered aggregators are built with the name on the inner aggregator.

        Every input entry wraps an aggregator in ``aggregators.filtered``.
        The built result must equal the shared filter envelope with the
        matching named inner aggregator under ``aggregator``.
        """
        filter_ = filters.Filter(dimension="dim", value="val")
        agg_input = {
            "agg1":
            aggregators.filtered(filter_, aggregators.count("metric1")),
            "agg2":
            aggregators.filtered(filter_, aggregators.longsum("metric2")),
            "agg3":
            aggregators.filtered(filter_, aggregators.doublesum("metric3")),
            "agg4":
            aggregators.filtered(filter_, aggregators.doublemin("metric4")),
            "agg5":
            aggregators.filtered(filter_, aggregators.doublemax("metric5")),
            "agg6":
            aggregators.filtered(filter_, aggregators.hyperunique("metric6")),
            "agg7":
            aggregators.filtered(filter_, aggregators.cardinality("dim1")),
            "agg8":
            aggregators.filtered(
                filter_, aggregators.cardinality(["dim1", "dim2"],
                                                 by_row=True)),
            "agg9":
            aggregators.filtered(filter_, aggregators.thetasketch("dim1")),
            "agg10":
            aggregators.filtered(filter_, aggregators.thetasketch("metric7")),
            "agg11":
            aggregators.filtered(
                filter_,
                aggregators.thetasketch("metric8",
                                        isinputthetasketch=True,
                                        size=8192),
            ),
        }
        # Envelope shared by every expected filtered aggregator.
        base = {
            "type": "filtered",
            "filter": {
                "type": "selector",
                "dimension": "dim",
                "value": "val"
            },
        }

        aggs = [
            {
                "name": "agg1",
                "type": "count",
                "fieldName": "metric1"
            },
            {
                "name": "agg2",
                "type": "longSum",
                "fieldName": "metric2"
            },
            {
                "name": "agg3",
                "type": "doubleSum",
                "fieldName": "metric3"
            },
            {
                "name": "agg4",
                "type": "doubleMin",
                "fieldName": "metric4"
            },
            {
                "name": "agg5",
                "type": "doubleMax",
                "fieldName": "metric5"
            },
            {
                "name": "agg6",
                "type": "hyperUnique",
                "fieldName": "metric6"
            },
            {
                "name": "agg7",
                "type": "cardinality",
                "fieldNames": ["dim1"],
                "byRow": False,
            },
            {
                "name": "agg8",
                "type": "cardinality",
                "fieldNames": ["dim1", "dim2"],
                "byRow": True,
            },
            {
                "name": "agg9",
                "type": "thetaSketch",
                "fieldName": "dim1",
                "isInputThetaSketch": False,
                "size": 16384,
            },
            {
                "name": "agg10",
                "type": "thetaSketch",
                "fieldName": "metric7",
                "isInputThetaSketch": False,
                "size": 16384,
            },
            {
                "name": "agg11",
                "type": "thetaSketch",
                "fieldName": "metric8",
                "isInputThetaSketch": True,
                "size": 8192,
            },
        ]
        expected = []
        for agg in aggs:
            exp = deepcopy(base)
            exp["aggregator"] = agg
            expected.append(exp)

        def inner_name(entry):
            # Filtered specs keep their name on the wrapped aggregator.
            return entry["aggregator"]["name"]

        built_agg = aggregators.build_aggregators(agg_input)
        # Compare the builder output against the hand-written expectation.
        # The previous version overwrote `expected` with the builder output
        # and then compared that list with itself, so it could never fail.
        actual = sorted(built_agg, key=inner_name)
        expected = sorted(expected, key=inner_name)
        assert actual == expected
Exemplo n.º 34
0
 def test_build_aggregators(self):
     """``build_aggregators`` names each spec after its input key."""
     by_name = itemgetter("name")
     agg_specs = {
         "agg1": aggregators.count("metric1"),
         "agg2": aggregators.longsum("metric2"),
         "agg3": aggregators.doublesum("metric3"),
         "agg4": aggregators.doublemin("metric4"),
         "agg5": aggregators.doublemax("metric5"),
         "agg6": aggregators.hyperunique("metric6"),
         "agg7": aggregators.cardinality("dim1"),
         "agg8": aggregators.cardinality(["dim1", "dim2"], by_row=True),
         "agg9": aggregators.thetasketch("dim1"),
         "agg10": aggregators.thetasketch("metric7"),
         "agg11": aggregators.thetasketch(
             "metric8", isinputthetasketch=True, size=8192),
     }
     produced = aggregators.build_aggregators(agg_specs)

     # Single-field aggregators only differ in name, type and field.
     wanted = [
         {"name": "agg1", "type": "count", "fieldName": "metric1"},
         {"name": "agg2", "type": "longSum", "fieldName": "metric2"},
         {"name": "agg3", "type": "doubleSum", "fieldName": "metric3"},
         {"name": "agg4", "type": "doubleMin", "fieldName": "metric4"},
         {"name": "agg5", "type": "doubleMax", "fieldName": "metric5"},
         {"name": "agg6", "type": "hyperUnique", "fieldName": "metric6"},
     ]
     # Cardinality specs list their fields and carry the byRow flag.
     wanted += [
         {"name": "agg7", "type": "cardinality",
          "fieldNames": ["dim1"], "byRow": False},
         {"name": "agg8", "type": "cardinality",
          "fieldNames": ["dim1", "dim2"], "byRow": True},
     ]
     # Theta sketch specs expose the input-sketch flag and the sketch size.
     wanted += [
         {"name": "agg9", "type": "thetaSketch", "fieldName": "dim1",
          "isInputThetaSketch": False, "size": 16384},
         {"name": "agg10", "type": "thetaSketch", "fieldName": "metric7",
          "isInputThetaSketch": False, "size": 16384},
         {"name": "agg11", "type": "thetaSketch", "fieldName": "metric8",
          "isInputThetaSketch": True, "size": 8192},
     ]

     assert sorted(produced, key=by_name) == sorted(wanted, key=by_name)