Example No. 1
    def test_client_allows_to_export_last_query(self, mock_urlopen):
        # given
        response = Mock()
        response.read.return_value = """
            [ {
  "timestamp" : "2015-12-30T14:14:49.000Z",
  "result" : [ {
    "dimension" : "aaaa",
    "metric" : 100
  } ]
            } ]
        """.encode("utf-8")
        mock_urlopen.return_value = response
        client = create_client()
        client.topn(datasource="testdatasource",
                    granularity="all",
                    intervals="2015-12-29/pt1h",
                    aggregations={"count": doublesum("count")},
                    dimension="user_name",
                    metric="count",
                    filter=Dimension("user_lang") == "en",
                    threshold=1,
                    context={"timeout": 1000})

        # when / then
        # export_tsv on the last query should raise a TypeError because the path is None
        with pytest.raises(TypeError):
            client.export_tsv(None)
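
A note on what the test above exercises: export_tsv and export_pandas operate on the client's last issued query, and only fail here because the destination path is None. For reference, a minimal sketch of exporting a real topn result; the broker URL and output file name are assumptions, not part of the test:

from pydruid.client import PyDruid
from pydruid.utils.aggregators import doublesum
from pydruid.utils.filters import Dimension

# Hypothetical broker URL; adjust to your Druid deployment.
client = PyDruid("http://localhost:8082", "druid/v2")
client.topn(
    datasource="testdatasource",
    granularity="all",
    intervals="2015-12-29/pt1h",
    aggregations={"count": doublesum("count")},
    dimension="user_name",
    metric="count",
    filter=Dimension("user_lang") == "en",
    threshold=1,
)
client.export_tsv("topn_result.tsv")  # write the last query's rows to a TSV file
df = client.export_pandas()           # or load them into a pandas DataFrame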
Example No. 2
    def test_druid_returns_results(self, mock_urlopen):
        # given
        response = Mock()
        response.read.return_value = """
            [ {
  "timestamp" : "2015-12-30T14:14:49.000Z",
  "result" : [ {
    "dimension" : "aaaa",
    "metric" : 100
  } ]
            } ]
        """.encode("utf-8")
        mock_urlopen.return_value = response
        client = create_client()

        # when
        top = client.topn(datasource="testdatasource",
                          granularity="all",
                          intervals="2015-12-29/pt1h",
                          aggregations={"count": doublesum("count")},
                          dimension="user_name",
                          metric="count",
                          filter=Dimension("user_lang") == "en",
                          threshold=1,
                          context={"timeout": 1000})

        # then
        assert top is not None
        assert len(top.result) == 1
        assert len(top.result[0]['result']) == 1
Example No. 3
    def test_druid_returns_html_error(self, mock_urlopen):
        # given
        message = textwrap.dedent("""
            <html>
            <head>
            <meta http-equiv="Content-Type" content="text/html;charset=ISO-8859-1"/>
            <title>Error 500 </title>
            </head>
            <body>
            <h2>HTTP ERROR: 500</h2>
            <p>Problem accessing /druid/v2/. Reason:
            <pre>    javax.servlet.ServletException: java.lang.OutOfMemoryError: GC overhead limit exceeded</pre></p>
            <hr /><a href="http://eclipse.org/jetty">Powered by Jetty:// 9.3.19.v20170502</a><hr/>
            </body>
            </html>
        """).strip()
        mock_urlopen.side_effect = _http_error(500, 'Internal Server Error',
                                               message)
        client = create_client()

        # when / then
        with pytest.raises(IOError) as e:
            client.topn(datasource="testdatasource",
                        granularity="all",
                        intervals="2015-12-29/pt1h",
                        aggregations={"count": doublesum("count")},
                        dimension="user_name",
                        metric="count",
                        filter=Dimension("user_lang") == "en",
                        threshold=1,
                        context={"timeout": 1000})

        assert str(e.value) == textwrap.dedent("""
            HTTP Error 500: Internal Server Error 
             Druid Error: javax.servlet.ServletException: java.lang.OutOfMemoryError: GC overhead limit exceeded 
             Query is: {
                "aggregations": [
                    {
                        "fieldName": "count",
                        "name": "count",
                        "type": "doubleSum"
                    }
                ],
                "context": {
                    "timeout": 1000
                },
                "dataSource": "testdatasource",
                "dimension": "user_name",
                "filter": {
                    "dimension": "user_lang",
                    "type": "selector",
                    "value": "en"
                },
                "granularity": "all",
                "intervals": "2015-12-29/pt1h",
                "metric": "count",
                "queryType": "topN",
                "threshold": 1
            }
        """).strip()
Example No. 4
 def get_filters(self, raw_filters):  # noqa
     filters = None
     for flt in raw_filters:
         if not all(f in flt for f in ['col', 'op', 'val']):
             continue
         col = flt['col']
         op = flt['op']
         eq = flt['val']
         cond = None
         if op in ('in', 'not in'):
             eq = [
                 types.replace("'", '').strip()
                 if isinstance(types, string_types)
                 else types
                 for types in eq]
         elif not isinstance(flt['val'], string_types):
             eq = eq[0] if len(eq) > 0 else ''
         if col in self.num_cols:
             if op in ('in', 'not in'):
                 eq = [utils.string_to_num(v) for v in eq]
             else:
                 eq = utils.string_to_num(eq)
         if op == '==':
             cond = Dimension(col) == eq
         elif op == '!=':
             cond = ~(Dimension(col) == eq)
         elif op in ('in', 'not in'):
             fields = []
             if len(eq) > 1:
                 for s in eq:
                     fields.append(Dimension(col) == s)
                 cond = Filter(type="or", fields=fields)
             elif len(eq) == 1:
                 cond = Dimension(col) == eq[0]
             if op == 'not in':
                 cond = ~cond
         elif op == 'regex':
             cond = Filter(type="regex", pattern=eq, dimension=col)
         elif op == '>=':
             cond = Dimension(col) >= eq
         elif op == '<=':
             cond = Dimension(col) <= eq
         elif op == '>':
             cond = Dimension(col) > eq
         elif op == '<':
             cond = Dimension(col) < eq
         if filters:
             filters = Filter(type="and", fields=[
                 cond,
                 filters
             ])
         else:
             filters = cond
     return filters
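
For reference, a hand-built sketch of the kind of pydruid filter the method above assembles for a couple of hypothetical filter dicts (the column names and values here are made up):

from pydruid.utils.filters import Dimension, Filter

# Two raw filters in the same {'col', 'op', 'val'} shape the method consumes
# (shown only for the shape; the equivalent filter is built by hand below).
raw_filters = [
    {"col": "country", "op": "==", "val": "KE"},
    {"col": "user_lang", "op": "in", "val": ["en", "fr"]},
]

# Built by hand: the '==' filter becomes a selector, the 'in' filter becomes
# an OR of selectors, and successive filters are chained together with AND.
eq_cond = Dimension("country") == "KE"
in_cond = Filter(
    type="or",
    fields=[Dimension("user_lang") == "en", Dimension("user_lang") == "fr"],
)
combined = Filter(type="and", fields=[in_cond, eq_cond])
print(Filter.build_filter(combined))  # plain dict ready to drop into a query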
Example No. 5
    def load_dimensions_from_druid(self):
        base_query = GroupByQueryBuilder(
            datasource=self.datasource.name,
            granularity='all',
            grouping_fields=[],
            intervals=INTERVAL,
            calculation=COUNT_CALCULATION,
        )
        for ordered_dimensions in list(self.filter_dimensions.values()):
            # Special case: meta-dimensions (like Nation) are prefixed with '_'
            # and are handled elsewhere - don't query them in druid.
            queryable_dimensions = [
                d for d in ordered_dimensions if d[0] != '_'
            ]
            for dimension in queryable_dimensions:
                dimensions = self.dimension_slices.get(dimension, [dimension])
                base_query.dimensions = dimensions
                base_query.query_filter = Dimension(dimension) != None

                LOG.info('Querying distinct %s from Druid...', dimensions)
                query_result = self.query_client.run_query(base_query)

                output_rows = []
                for row in query_result.result:
                    event = row['event']
                    output_row = dict(event)
                    del output_row[COUNT_AGGREGATION_NAME]

                    # Create a display version of this dimension that includes
                    # the parent dimensions, to disambiguate values that share
                    # a name but belong to different hierarchies.
                    dimension_display = event[dimension]
                    num_dimensions = len(dimensions)
                    if num_dimensions > 1:
                        # NOTE(ian): This logic matches logic used on the
                        # frontend in SelectFilter.jsx
                        start = num_dimensions - 1
                        disambiguation = [
                            event[d] for d in dimensions[start::-1] if event[d]
                        ]
                        dimension_display = '%s (%s)' % (
                            dimension_display,
                            ', '.join(disambiguation),
                        )

                    output_row[DISPLAY_FIELD] = dimension_display
                    output_rows.append(output_row)

                self.dimension_map[dimension] = sorted(
                    output_rows, key=lambda a: a[DISPLAY_FIELD])
                LOG.info('%s values loaded for dimension: %s',
                         len(output_rows), dimension)

        LOG.info('Done preloading dimension values.')
Example No. 6
def query_druid():
    client = PyDruid(DRUID_URL, 'druid/v2')
    query = client.select(
        datasource='pageviews1',
        granularity='all',
        dimensions=["url", "user"],
        filter=Dimension('user') == 'ethan',
        paging_spec={"pagingIdentifiers": {}, "threshold": 5},
        intervals=["2016-07-08/2017-09-13"]
    )
    # print json.dumps(query.result, indent=2)
    return query.result
Example No. 7
def query_scan():
    data = query.scan(datasource='alg_car_price_detail',
                      granularity='none',
                      intervals='2019-03-03/p1d',
                      columns=['body_color', 'city_id'],
                      filter=(Dimension('brand_name') == '上汽大众'),
                      limit=10)
    print(json.dumps(data.query_dict, indent=2))
    df = query.export_pandas()
    print(len(df))
    for i in range(len(df)):
        print(df.loc[i, :])
Example No. 8
 def query_filters(self):
     args = self.form_data
     # Building filters
     filters = None
     for i in range(1, 10):
         col = args.get("flt_col_" + str(i))
         op = args.get("flt_op_" + str(i))
         eq = args.get("flt_eq_" + str(i))
         if col and op and eq:
             cond = None
             if op == '==':
                 cond = Dimension(col) == eq
             elif op == '!=':
                 cond = ~(Dimension(col) == eq)
             elif op in ('in', 'not in'):
                 fields = []
                 splitted = eq.split(',')
                 if len(splitted) > 1:
                     for s in eq.split(','):
                         s = s.strip()
                         fields.append(
                             Filter.build_filter(Dimension(col) == s))
                     cond = Filter(type="or", fields=fields)
                 else:
                     cond = Dimension(col) == eq
                 if op == 'not in':
                     cond = ~cond
             if filters:
                 filters = Filter(type="and",
                                  fields=[
                                      Filter.build_filter(cond),
                                      Filter.build_filter(filters)
                                  ])
             else:
                 filters = cond
     return filters
Example No. 9
    def test_druid_returns_error(self):
        # given
        client = AsyncPyDruid("http://localhost:%s" % (self.get_http_port(), ),
                              "druid/v2/fail_request")

        # when / then
        with pytest.raises(IOError):
            yield client.topn(datasource="testdatasource",
                              granularity="all",
                              intervals="2015-12-29/pt1h",
                              aggregations={"count": doublesum("count")},
                              dimension="user_name",
                              metric="count",
                              filter=Dimension("user_lang") == "en",
                              threshold=1,
                              context={"timeout": 1000})
Example No. 10
    def test_druid_returns_error(self, mock_urlopen):
        # given
        mock_urlopen.side_effect = _http_error(500, "Druid error")
        client = create_client()

        # when / then
        with pytest.raises(IOError):
            client.topn(datasource="testdatasource",
                        granularity="all",
                        intervals="2015-12-29/pt1h",
                        aggregations={"count": doublesum("count")},
                        dimension="user_name",
                        metric="count",
                        filter=Dimension("user_lang") == "en",
                        threshold=1,
                        context={"timeout": 1000})
Example No. 11
 def get(self, request, cid):
     campaign_id = cid
     query = create_druid_client()
     start_date = '2017-06-27'
     query_result = query.groupby(
         datasource='celtra3',
         granularity='all',
         dimensions=['adId'],
         intervals=["{0}/p1d".format(start_date)],
         aggregations={
             'swipes': doublesum('swipes'),
             'interactions': doublesum('interactions'),
             'impressions': doublesum('impressions')
         },
         filter=(Dimension('campaignId') == campaign_id))
     return Response(query_result.result)
Example No. 12
    def test_client_allows_to_export_last_query(self):
        # given
        client = AsyncPyDruid("http://localhost:%s" % (self.get_http_port(), ),
                              "druid/v2/return_results")
        yield client.topn(datasource="testdatasource",
                          granularity="all",
                          intervals="2015-12-29/pt1h",
                          aggregations={"count": doublesum("count")},
                          dimension="user_name",
                          metric="count",
                          filter=Dimension("user_lang") == "en",
                          threshold=1,
                          context={"timeout": 1000})

        # when / then
        # export_tsv on the last query should raise a TypeError because the path is None
        with pytest.raises(TypeError):
            client.export_tsv(None)
Example No. 13
    def strikes_of_a_type(strike_event):
        """Returns total count of a type of a selected strike event."""

        group = druid_client.groupby(
            datasource='denormalized_strike_events',
            granularity='all',
            intervals='2015-01-01/2018-01-01',
            dimensions=["strike_type"],
            filter=Dimension("strike_type") == escape(strike_event),
            aggregations={"count": doublesum("count")})

        strike_type_count = {}
        for entry in group.result:
            key = entry["event"]["strike_type"]
            value = int(entry["event"]["count"])
            strike_type_count[key] = value

        return strike_type_count
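
The loop above depends on the standard shape of pydruid groupby results: each row is a dict carrying 'version', 'timestamp', and an 'event' dict keyed by the grouped dimensions and aggregator names. An illustrative row with made-up values:

# Illustrative only: one row of a pydruid groupby result.
row = {
    "version": "v1",
    "timestamp": "2016-03-01T00:00:00.000Z",
    "event": {"strike_type": "airstrike", "count": 42.0},
}
# Hence entry["event"]["strike_type"] and entry["event"]["count"] above.
strike_type = row["event"]["strike_type"]
count = int(row["event"]["count"])
print(strike_type, count)  # airstrike 42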
Example No. 14
    def test_client_allows_passing_default_parameters(self):
        # given
        client = AsyncPyDruid("http://localhost:%s" % (self.get_http_port(), ),
                              "druid/v2/return_results",
                              defaults=dict(request_timeout=120))
        top = yield client.topn(datasource="testdatasource",
                                granularity="all",
                                intervals="2015-12-29/pt1h",
                                aggregations={"count": doublesum("count")},
                                dimension="user_name",
                                metric="count",
                                filter=Dimension("user_lang") == "en",
                                threshold=1,
                                context={"timeout": 1000})

        # then
        self.assertIsNotNone(top)
        self.assertEqual(len(top.result), 1)
        self.assertEqual(len(top.result[0]['result']), 1)
Example No. 15
 def _add_filter_from_pre_query_data(self, df, dimensions, dim_filter):
     ret = dim_filter
     if df is not None and not df.empty:
         new_filters = []
         for unused, row in df.iterrows():
             fields = []
             for dim in dimensions:
                 f = Dimension(dim) == row[dim]
                 fields.append(f)
             if len(fields) > 1:
                 term = Filter(type='and', fields=fields)
                 new_filters.append(term)
             elif fields:
                 new_filters.append(fields[0])
         if new_filters:
             ff = Filter(type='or', fields=new_filters)
             if not dim_filter:
                 ret = ff
             else:
                 ret = Filter(type='and', fields=[ff, dim_filter])
     return ret
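
A toy illustration of what the helper above produces: for each row of the pre-query dataframe it ANDs one selector per dimension, then ORs the per-row filters together (the dataframe contents here are made up):

import pandas as pd
from pydruid.utils.filters import Dimension, Filter

df = pd.DataFrame([
    {"country": "KE", "device": "mobile"},
    {"country": "UG", "device": "desktop"},
])
dimensions = ["country", "device"]

new_filters = []
for _, row in df.iterrows():
    fields = [Dimension(dim) == row[dim] for dim in dimensions]
    new_filters.append(Filter(type="and", fields=fields))

ff = Filter(type="or", fields=new_filters)
print(Filter.build_filter(ff))  # {'type': 'or', 'fields': [...]}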
Example No. 16
def _construct_authorization_filter(user_identity):
    '''Converts the query permissions that the user holds into druid filters and returns the
    logical OR of the computed constituent druid filters.
    '''
    query_needs = enumerate_query_needs(user_identity)

    output_filter = EmptyFilter()
    query_need_added = False

    for query_need in query_needs:
        _filter = _construct_druid_filter_from_query_need(query_need)
        if not isinstance(_filter, EmptyFilter):
            output_filter |= _filter
            query_need_added = True

    if not query_need_added:
        # If the user has no query policies, ensure that any dimensions for which authorization is
        # enabled are unqueryable.
        for dimension_name in current_app.zen_config.filters.AUTHORIZABLE_DIMENSIONS:
            output_filter &= Dimension(dimension_name) == ''

    return output_filter
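
EmptyFilter and enumerate_query_needs are application-level helpers, not part of pydruid; the |= and &= updates above rely on pydruid filters being combinable with the &, | and ~ operators. A minimal standalone sketch of that combination, with made-up dimension names and values:

from pydruid.utils.filters import Dimension, Filter

# One filter per query permission, ORed together.
need_a = (Dimension("RegionName") == "Region A") & (Dimension("source") == "dhis2")
need_b = Dimension("RegionName") == "Region B"
allowed = need_a | need_b
print(Filter.build_filter(allowed))

# Fallback used above when the user holds no query policies: restrict each
# authorizable dimension to the empty value so rows carrying real values
# are filtered out.
deny_region = Dimension("RegionName") == ""
print(Filter.build_filter(deny_region))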
Example No. 17
    def load_ranges_from_druid(self):
        """Return a dictionary mapping data source name to a
        dictionary (minTime, maxTime) of datetime objects.
        """
        date_ranges = {}
        LOG.info('Querying time ranges of data from Druid...')
        aggregations = {
            MIN_TIME_FIELD: {
                'type': 'longMin',
                'fieldName': '__time'
            },
            MAX_TIME_FIELD: {
                'type': 'longMax',
                'fieldName': '__time'
            },
        }
        calculation = BaseCalculation(aggregations=aggregations)
        query = GroupByQueryBuilder(
            datasource=self.datasource.name,
            granularity='all',
            grouping_fields=[SOURCE_FIELD],
            intervals=INTERVAL,
            calculation=calculation,
        )
        query.query_filter &= Dimension(SOURCE_FIELD) != None

        query_result = self.query_client.run_query(query)
        for row in query_result.result:
            event = row['event']
            # making {data_source: (minTime, maxTime)}
            date_ranges[event[SOURCE_FIELD]] = {
                MIN_TIME_FIELD:
                self.date_from_timestamp(event[MIN_TIME_FIELD]),
                MAX_TIME_FIELD:
                self.date_from_timestamp(event[MAX_TIME_FIELD]),
            }

        LOG.info('Done querying date ranges of data')
        return date_ranges
Example No. 18
def _construct_druid_filter_from_query_need(query_need):
    '''Constructs a druid filter from an individual query need.
    '''

    output_filter = EmptyFilter()
    filtered_dimensions = set()

    # Go through the individual dimension filters in the `QueryNeed` and construct the appropriate
    # filters.
    for dimension_filter in query_need.dimension_filters:
        dimension = dimension_filter.dimension_name
        include_values = dimension_filter.include_values
        exclude_values = dimension_filter.exclude_values
        all_values = dimension_filter.all_values
        dimension_filter = None
        filtered_dimensions.add(dimension)

        if all_values and not exclude_values:
            continue
        elif exclude_values:
            dimension_filter = ~Filter(type=IN_FILTER_SYMBOL,
                                       dimension=dimension,
                                       values=list(exclude_values))
        else:
            dimension_filter = Filter(type=IN_FILTER_SYMBOL,
                                      dimension=dimension,
                                      values=list(include_values))

        output_filter &= dimension_filter

    # If there are any authorizable dimensions for which permissions were not explicitly defined,
    # ensure that they are completely filtered out.
    for dimension_name in current_app.zen_config.filters.AUTHORIZABLE_DIMENSIONS:
        if dimension_name not in filtered_dimensions:
            no_dimension_values_filter = Dimension(dimension_name) == ''
            output_filter &= no_dimension_values_filter

    return output_filter
Example No. 19
    def bake_query(self):
        """
        Doing a 2 phase query where we limit the number of series.
        """
        client = utils.get_pydruid_client()
        qry = self.query_obj()
        orig_filter = qry['filter'] if 'filter' in qry else ''
        qry['granularity'] = "all"
        client.groupby(**qry)
        df = client.export_pandas()
        if df is not None:
            dims = qry['dimensions']
            filters = []
            for index, row in df.iterrows():
                fields = []
                for dim in dims:
                    f = Filter.build_filter(Dimension(dim) == row[dim])
                    fields.append(f)
                if len(fields) > 1:
                    filters.append(
                        Filter.build_filter(Filter(type="and", fields=fields)))
                elif fields:
                    filters.append(fields[0])

            qry = self.query_obj()
            if filters:
                ff = Filter(type="or", fields=filters)
                if not orig_filter:
                    qry['filter'] = ff
                else:
                    qry['filter'] = Filter(type="and",
                                           fields=[
                                               Filter.build_filter(ff),
                                               Filter.build_filter(orig_filter)
                                           ])
            del qry['limit_spec']
            client.groupby(**qry)
        return client.export_pandas()
Example No. 20
    def run_query(  # noqa / druid
            self,
            groupby,
            metrics,
            granularity,
            from_dttm,
            to_dttm,
            filter=None,  # noqa
            is_timeseries=True,
            timeseries_limit=None,
            timeseries_limit_metric=None,
            row_limit=None,
            inner_from_dttm=None,
            inner_to_dttm=None,
            orderby=None,
            extras=None,  # noqa
            select=None,  # noqa
            columns=None,
            phase=2,
            client=None,
            form_data=None):
        """Runs a query against Druid and returns a dataframe.
        """
        # TODO refactor into using a TBD Query object
        client = client or self.cluster.get_pydruid_client()

        if not is_timeseries:
            granularity = 'all'
        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=DRUID_TZ)
        to_dttm = to_dttm.replace(tzinfo=DRUID_TZ)
        timezone = from_dttm.tzname()

        query_str = ""
        metrics_dict = {m.metric_name: m for m in self.metrics}

        columns_dict = {c.column_name: c for c in self.columns}

        all_metrics, post_aggs = self._metrics_and_post_aggs(
            metrics, metrics_dict)

        aggregations = OrderedDict()
        for m in self.metrics:
            if m.metric_name in all_metrics:
                aggregations[m.metric_name] = m.json_obj

        rejected_metrics = [
            m.metric_name for m in self.metrics
            if m.is_restricted and m.metric_name in aggregations.keys()
            and not sm.has_access('metric_access', m.perm)
        ]

        if rejected_metrics:
            raise MetricPermException("Access to the metrics denied: " +
                                      ', '.join(rejected_metrics))

        # the dimensions list with dimensionSpecs expanded
        dimensions = []
        groupby = [gb for gb in groupby if gb in columns_dict]
        for column_name in groupby:
            col = columns_dict.get(column_name)
            dim_spec = col.dimension_spec
            if dim_spec:
                dimensions.append(dim_spec)
            else:
                dimensions.append(column_name)
        qry = dict(
            datasource=self.datasource_name,
            dimensions=dimensions,
            aggregations=aggregations,
            granularity=DruidDatasource.granularity(
                granularity,
                timezone=timezone,
                origin=extras.get('druid_time_origin'),
            ),
            post_aggregations=post_aggs,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )

        filters = self.get_filters(filter)
        if filters:
            qry['filter'] = filters

        having_filters = self.get_having_filters(extras.get('having_druid'))
        if having_filters:
            qry['having'] = having_filters

        orig_filters = filters
        if len(groupby) == 0 and not having_filters:
            del qry['dimensions']
            client.timeseries(**qry)
        if not having_filters and len(groupby) == 1:
            qry['threshold'] = timeseries_limit or 1000
            if row_limit and granularity == 'all':
                qry['threshold'] = row_limit
            qry['dimension'] = list(qry.get('dimensions'))[0]
            del qry['dimensions']
            qry['metric'] = list(qry['aggregations'].keys())[0]
            client.topn(**qry)
        elif len(groupby) > 1 or having_filters:
            # If grouping on multiple fields or using a having filter
            # we have to force a groupby query
            if timeseries_limit and is_timeseries:
                order_by = metrics[0] if metrics else self.metrics[0]
                if timeseries_limit_metric:
                    order_by = timeseries_limit_metric
                # Limit on the number of timeseries, doing a two-phases query
                pre_qry = deepcopy(qry)
                pre_qry['granularity'] = "all"
                pre_qry['limit_spec'] = {
                    "type":
                    "default",
                    "limit":
                    timeseries_limit,
                    'intervals': (inner_from_dttm.isoformat() + '/' +
                                  inner_to_dttm.isoformat()),
                    "columns": [{
                        "dimension": order_by,
                        "direction": "descending",
                    }],
                }
                client.groupby(**pre_qry)
                query_str += "// Two phase query\n// Phase 1\n"
                query_str += json.dumps(
                    client.query_builder.last_query.query_dict, indent=2)
                query_str += "\n"
                if phase == 1:
                    return query_str
                query_str += (
                    "//\nPhase 2 (built based on phase one's results)\n")
                df = client.export_pandas()
                if df is not None and not df.empty:
                    dims = qry['dimensions']
                    filters = []
                    for unused, row in df.iterrows():
                        fields = []
                        for dim in dims:
                            f = Dimension(dim) == row[dim]
                            fields.append(f)
                        if len(fields) > 1:
                            filt = Filter(type="and", fields=fields)
                            filters.append(filt)
                        elif fields:
                            filters.append(fields[0])

                    if filters:
                        ff = Filter(type="or", fields=filters)
                        if not orig_filters:
                            qry['filter'] = ff
                        else:
                            qry['filter'] = Filter(type="and",
                                                   fields=[ff, orig_filters])
                    qry['limit_spec'] = None
            if row_limit:
                qry['limit_spec'] = {
                    "type":
                    "default",
                    "limit":
                    row_limit,
                    "columns": [{
                        "dimension":
                        (metrics[0] if metrics else self.metrics[0]),
                        "direction":
                        "descending",
                    }],
                }
            client.groupby(**qry)
        query_str += json.dumps(client.query_builder.last_query.query_dict,
                                indent=2)
        return query_str
Example No. 21
    def __init__(
        self,
        datasource,
        granularity,
        grouping_fields,
        intervals,
        calculation,
        dimension_filter=None,
        optimize=True,
        subtotal_dimensions=None,
        subtotal_result_label='TOTAL',
    ):
        super(GroupByQueryBuilder, self).__init__(datasource, granularity, intervals)
        self.dimensions = grouping_fields
        self.subtotals = (
            SubtotalConfig(self.dimensions, subtotal_dimensions, subtotal_result_label)
            if subtotal_dimensions
            else None
        )

        # Build a copy of the input calculation with the fully built
        # aggregations and post aggregations.
        self.calculation = BaseCalculation()

        # Copy the calculations aggregations into the query. Call the
        # handlers of any aggregations that require information about
        # the current query to be built.
        self.query_modifier = None
        for key, aggregation in calculation.aggregations.items():
            # NOTE(stephen): Very special case where an aggregation can
            # modify the query before it is issued.
            if isinstance(aggregation, QueryModifyingAggregation):
                if not self.query_modifier:
                    self.query_modifier = aggregation
                else:
                    # If a query modifier has already been set, we should merge
                    # this query modifier into that one so that both are called.
                    self.query_modifier = self.query_modifier.merge_compatible_aggregation(
                        aggregation
                    )
                continue

            new_aggregation = aggregation
            # QueryDependentAggregations rely on certain query-time information
            # to be able to build their full filter and value sets. For example,
            # some aggregations should only be computed during the final time
            # interval of a query and not for the entire query duration.
            if isinstance(aggregation, QueryDependentAggregation):
                new_aggregation = aggregation.build_full_aggregation(
                    dimensions=self.dimensions,
                    granularity=self.granularity,
                    intervals=self.intervals,
                )

            self.calculation.add_aggregation(key, new_aggregation)

        self.calculation.add_post_aggregations(calculation.post_aggregations)

        # Build query filter from the selected data fields and dimensions.
        # Store dimension filters separate from aggregation filters so that
        # QueryModifyingAggregation can easily distinguish the filter types.
        # NOTE(stephen): Doing this *before* count fields are added so that
        # we don't duplicate the aggregation filters. Duplicating the filters,
        # while seemingly not a big deal, caused certain simple queries to take
        # 8x longer to run.
        self.aggregation_filter = build_query_filter_from_aggregations(
            self.calculation.aggregations
        )
        self.dimension_filter = dimension_filter or EmptyFilter()

        # To workaround druid's default value of 0 for filtered aggregations,
        # we track the count of all fields that should have a null check
        # applied. If those fields have a count == 0, then in the parse step
        # after the query is run, their value will be replaced with None.
        strict_null_fields = calculation.strict_null_fields
        self.calculation.set_strict_null_fields(strict_null_fields)
        self.calculation.add_count_for_fields(strict_null_fields)

        # Store the aggregations/post aggregations at the top level of the query
        # dict since pydruid needs them in a specific place.
        # NOTE(stephen): This is kinda weird since we can become out of sync
        # with the calculation.
        self.aggregations = self.calculation.aggregations
        self.post_aggregations = self.calculation.post_aggregations

        # Combine the aggregation filters and the dimension filters in to the
        # full query filter to use.
        self.query_filter = self.aggregation_filter & self.dimension_filter

        # Remove RegionName = 'Nation' from national-level queries in the ET
        # database when nation is selected and no dimension filters are set.
        # TODO(attila): We shouldn't have a region named 'Nation' in the first place ... ?
        # The national value could be computed as a post aggregation or in a dataframe.
        if (
            not self.dimensions
            and isinstance(self.dimension_filter, EmptyFilter)
            and datasource.startswith('et')
        ):
            self.query_filter &= Dimension('RegionName') != 'Nation'

        # HACK(stephen): There appears to be a bug in how Druid produces
        # subtotals. Events produced by the first GroupBy pass inside Druid
        # are *reevaluated* against the original query filter. If the events
        # do not pass the original filter (and most of the time they do not for
        # us because we use filtered aggregations), then the event is *dropped*
        # from the final result. This happens even if the subtotals being
        # computed match the input dimensions exactly. To overcome this, we add
        # an extra filter that will only be valid on the computed events and
        # won't include any extra rows in the intermediate result (inside
        # Druid). This provides a filter that all events will pass while
        # subtotals are computed and will also ensure the non-subtotal events
        # remain accurate.
        # NOTE(stephen): This is fixed (Druid issue #7820) and can be removed
        # when the release containing the fix is live.
        if self.subtotals:
            # Use the first aggregation as the dimension to filter on.
            extra_filter = BoundFilter(list(self.aggregations.keys())[0], 0, None)
            self.query_filter |= extra_filter
        self.optimize = optimize
Example No. 22
 def __init__(self, dim):
     PydDimension.__init__(self, dim=dim)
Example No. 23
    def get_query_str(  # noqa / druid
            self,
            client,
            qry_start_dttm,
            groupby,
            metrics,
            granularity,
            from_dttm,
            to_dttm,
            filter=None,  # noqa
            is_timeseries=True,
            timeseries_limit=None,
            timeseries_limit_metric=None,
            row_limit=None,
            inner_from_dttm=None,
            inner_to_dttm=None,
            orderby=None,
            extras=None,  # noqa
            select=None,  # noqa
            columns=None,
            phase=2):
        """Runs a query against Druid and returns a dataframe.

        This query interface is common to SqlAlchemy and Druid
        """
        # TODO refactor into using a TBD Query object
        if not is_timeseries:
            granularity = 'all'
        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=DRUID_TZ)
        to_dttm = to_dttm.replace(tzinfo=DRUID_TZ)
        timezone = from_dttm.tzname()

        query_str = ""
        metrics_dict = {m.metric_name: m for m in self.metrics}
        all_metrics = []
        post_aggs = {}

        columns_dict = {c.column_name: c for c in self.columns}

        def recursive_get_fields(_conf):
            _fields = _conf.get('fields', [])
            field_names = []
            for _f in _fields:
                _type = _f.get('type')
                if _type in ['fieldAccess', 'hyperUniqueCardinality']:
                    field_names.append(_f.get('fieldName'))
                elif _type == 'arithmetic':
                    field_names += recursive_get_fields(_f)
            return list(set(field_names))

        for metric_name in metrics:
            metric = metrics_dict[metric_name]
            if metric.metric_type != 'postagg':
                all_metrics.append(metric_name)
            else:
                mconf = metric.json_obj
                all_metrics += recursive_get_fields(mconf)
                all_metrics += mconf.get('fieldNames', [])
                if mconf.get('type') == 'javascript':
                    post_aggs[metric_name] = JavascriptPostAggregator(
                        name=mconf.get('name', ''),
                        field_names=mconf.get('fieldNames', []),
                        function=mconf.get('function', ''))
                elif mconf.get('type') == 'quantile':
                    post_aggs[metric_name] = Quantile(
                        mconf.get('name', ''),
                        mconf.get('probability', ''),
                    )
                elif mconf.get('type') == 'quantiles':
                    post_aggs[metric_name] = Quantiles(
                        mconf.get('name', ''),
                        mconf.get('probabilities', ''),
                    )
                elif mconf.get('type') == 'fieldAccess':
                    post_aggs[metric_name] = Field(mconf.get('name'), '')
                elif mconf.get('type') == 'constant':
                    post_aggs[metric_name] = Const(mconf.get('value'),
                                                   output_name=mconf.get(
                                                       'name', ''))
                elif mconf.get('type') == 'hyperUniqueCardinality':
                    post_aggs[metric_name] = HyperUniqueCardinality(
                        mconf.get('name'), '')
                else:
                    post_aggs[metric_name] = Postaggregator(
                        mconf.get('fn', "/"), mconf.get('fields', []),
                        mconf.get('name', ''))

        aggregations = OrderedDict()
        for m in self.metrics:
            if m.metric_name in all_metrics:
                aggregations[m.metric_name] = m.json_obj

        rejected_metrics = [
            m.metric_name for m in self.metrics
            if m.is_restricted and m.metric_name in aggregations.keys()
            and not sm.has_access('metric_access', m.perm)
        ]

        if rejected_metrics:
            raise MetricPermException("Access to the metrics denied: " +
                                      ', '.join(rejected_metrics))

        # the dimensions list with dimensionSpecs expanded
        dimensions = []
        groupby = [gb for gb in groupby if gb in columns_dict]
        for column_name in groupby:
            col = columns_dict.get(column_name)
            dim_spec = col.dimension_spec
            if dim_spec:
                dimensions.append(dim_spec)
            else:
                dimensions.append(column_name)
        qry = dict(
            datasource=self.datasource_name,
            dimensions=dimensions,
            aggregations=aggregations,
            granularity=DruidDatasource.granularity(
                granularity,
                timezone=timezone,
                origin=extras.get('druid_time_origin'),
            ),
            post_aggregations=post_aggs,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )

        filters = self.get_filters(filter)
        if filters:
            qry['filter'] = filters

        having_filters = self.get_having_filters(extras.get('having_druid'))
        if having_filters:
            qry['having'] = having_filters

        orig_filters = filters
        if len(groupby) == 0:
            del qry['dimensions']
            client.timeseries(**qry)
        if not having_filters and len(groupby) == 1:
            qry['threshold'] = timeseries_limit or 1000
            if row_limit and granularity == 'all':
                qry['threshold'] = row_limit
            qry['dimension'] = list(qry.get('dimensions'))[0]
            del qry['dimensions']
            qry['metric'] = list(qry['aggregations'].keys())[0]
            client.topn(**qry)
        elif len(groupby) > 1 or having_filters:
            # If grouping on multiple fields or using a having filter
            # we have to force a groupby query
            if timeseries_limit and is_timeseries:
                order_by = metrics[0] if metrics else self.metrics[0]
                if timeseries_limit_metric:
                    order_by = timeseries_limit_metric
                # Limit on the number of timeseries, doing a two-phases query
                pre_qry = deepcopy(qry)
                pre_qry['granularity'] = "all"
                pre_qry['limit_spec'] = {
                    "type":
                    "default",
                    "limit":
                    timeseries_limit,
                    'intervals': (inner_from_dttm.isoformat() + '/' +
                                  inner_to_dttm.isoformat()),
                    "columns": [{
                        "dimension": order_by,
                        "direction": "descending",
                    }],
                }
                client.groupby(**pre_qry)
                query_str += "// Two phase query\n// Phase 1\n"
                query_str += json.dumps(
                    client.query_builder.last_query.query_dict, indent=2)
                query_str += "\n"
                if phase == 1:
                    return query_str
                query_str += (
                    "//\nPhase 2 (built based on phase one's results)\n")
                df = client.export_pandas()
                if df is not None and not df.empty:
                    dims = qry['dimensions']
                    filters = []
                    for unused, row in df.iterrows():
                        fields = []
                        for dim in dims:
                            f = Dimension(dim) == row[dim]
                            fields.append(f)
                        if len(fields) > 1:
                            filt = Filter(type="and", fields=fields)
                            filters.append(filt)
                        elif fields:
                            filters.append(fields[0])

                    if filters:
                        ff = Filter(type="or", fields=filters)
                        if not orig_filters:
                            qry['filter'] = ff
                        else:
                            qry['filter'] = Filter(type="and",
                                                   fields=[ff, orig_filters])
                    qry['limit_spec'] = None
            if row_limit:
                qry['limit_spec'] = {
                    "type":
                    "default",
                    "limit":
                    row_limit,
                    "columns": [{
                        "dimension":
                        (metrics[0] if metrics else self.metrics[0]),
                        "direction":
                        "descending",
                    }],
                }
            client.groupby(**qry)
        query_str += json.dumps(client.query_builder.last_query.query_dict,
                                indent=2)
        return query_str
Example No. 24
from pydruid.utils.filters import Dimension, Filter
from pydruid.utils.aggregators import count, doublesum

from config.aggregation_rules import get_granularity_for_interval
from config.database import DATASOURCE
from config.system import STANDARD_DATA_DATE_FORMAT

# Import query tools.
from db.druid.query_client import DruidQueryClient
from db.druid.util import unpack_time_interval

TODAY_DATE_STR = datetime.strftime(datetime.now(), STANDARD_DATA_DATE_FORMAT)
START_DATE_STR = '2014-01-01'
DEFAULT_DATE_COLUMN = 'timestamp'
DEFAULT_FIELD = 'field'
DEFAULT_FILTER = Dimension('nation') == ''

QUERY = DruidQueryBuilder()


class PydruidQuery(object):
    ''' Class to query all data from the Zen druid db.
    Returned dataframe dimensions are: rows are the time dimension, columns are the indicators.
    Example usage (query all indicators summed at the national level):
        >> import os
        >> os.environ['ZEN_ENV'] = 'et'
        >> from pydruid_query import PydruidQuery
        >> conn = PydruidQuery()
        >> result_df = conn.fetch_data(include_count=True)'''

    def __init__(self, query_client=None):
Example No. 25
    def query(
        self,
        groupby,
        metrics,
        granularity,
        from_dttm,
        to_dttm,
        filter=None,  # noqa
        is_timeseries=True,
        timeseries_limit=None,
        row_limit=None,
        inner_from_dttm=None,
        inner_to_dttm=None,
        extras=None,  # noqa
        select=None,
    ):  # noqa
        """Runs a query against Druid and returns a dataframe.

        This query interface is common to SqlAlchemy and Druid
        """
        # TODO refactor into using a TBD Query object
        qry_start_dttm = datetime.now()

        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
        to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

        query_str = ""
        aggregations = {
            m.metric_name: m.json_obj
            for m in self.metrics if m.metric_name in metrics
        }
        granularity = granularity or "all"
        if granularity != "all":
            granularity = utils.parse_human_timedelta(
                granularity).total_seconds() * 1000
        if not isinstance(granularity, string_types):
            granularity = {"type": "duration", "duration": granularity}

        qry = dict(
            datasource=self.datasource_name,
            dimensions=groupby,
            aggregations=aggregations,
            granularity=granularity,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )
        filters = None
        for col, op, eq in filter:
            cond = None
            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = ~(Dimension(col) == eq)
            elif op in ('in', 'not in'):
                fields = []
                splitted = eq.split(',')
                if len(splitted) > 1:
                    for s in eq.split(','):
                        s = s.strip()
                        fields.append(Filter.build_filter(Dimension(col) == s))
                    cond = Filter(type="or", fields=fields)
                else:
                    cond = Dimension(col) == eq
                if op == 'not in':
                    cond = ~cond
            if filters:
                filters = Filter(type="and",
                                 fields=[
                                     Filter.build_filter(cond),
                                     Filter.build_filter(filters)
                                 ])
            else:
                filters = cond

        if filters:
            qry['filter'] = filters

        client = self.cluster.get_pydruid_client()
        orig_filters = filters
        if timeseries_limit and is_timeseries:
            # Limit on the number of timeseries, doing a two-phases query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = "all"
            pre_qry['limit_spec'] = {
                "type":
                "default",
                "limit":
                timeseries_limit,
                'intervals': (inner_from_dttm.isoformat() + '/' +
                              inner_to_dttm.isoformat()),
                "columns": [{
                    "dimension":
                    metrics[0] if metrics else self.metrics[0],
                    "direction":
                    "descending",
                }],
            }
            client.groupby(**pre_qry)
            query_str += "// Two phase query\n// Phase 1\n"
            query_str += json.dumps(client.query_dict, indent=2) + "\n"
            query_str += "//\nPhase 2 (built based on phase one's results)\n"
            df = client.export_pandas()
            if df is not None and not df.empty:
                dims = qry['dimensions']
                filters = []
                for _, row in df.iterrows():
                    fields = []
                    for dim in dims:
                        f = Filter.build_filter(Dimension(dim) == row[dim])
                        fields.append(f)
                    if len(fields) > 1:
                        filt = Filter(type="and", fields=fields)
                        filters.append(Filter.build_filter(filt))
                    elif fields:
                        filters.append(fields[0])

                if filters:
                    ff = Filter(type="or", fields=filters)
                    if not orig_filters:
                        qry['filter'] = ff
                    else:
                        qry['filter'] = Filter(
                            type="and",
                            fields=[
                                Filter.build_filter(ff),
                                Filter.build_filter(orig_filters)
                            ])
                qry['limit_spec'] = None
        if row_limit:
            qry['limit_spec'] = {
                "type":
                "default",
                "limit":
                row_limit,
                "columns": [{
                    "dimension":
                    metrics[0] if metrics else self.metrics[0],
                    "direction":
                    "descending",
                }],
            }
        client.groupby(**qry)
        query_str += json.dumps(client.query_dict, indent=2)
        df = client.export_pandas()
        if df is None or df.size == 0:
            raise Exception("No data was returned.")

        if (not is_timeseries and granularity == "all"
                and 'timestamp' in df.columns):
            del df['timestamp']

        # Reordering columns
        cols = []
        if 'timestamp' in df.columns:
            cols += ['timestamp']
        cols += [col for col in groupby if col in df.columns]
        cols += [col for col in metrics if col in df.columns]
        cols += [col for col in df.columns if col not in cols]
        df = df[cols]
        return QueryResult(df=df,
                           query=query_str,
                           duration=datetime.now() - qry_start_dttm)
Example No. 26
    def run_query(self):
        '''
        Constructs and runs the Druid request for this query. The query is
        blocking.
        '''

        LOG.info('Running query...')

        # Filter the dimensions using the location filters passed in
        dimension_filter = GroupByQueryBuilder.build_dimension_filter(
            self.location_filters
        )

        # AND the selected locations with the non-location filters requested
        dimension_filter &= self.non_hierarchical_filter

        # Slice by selected granularity + all fields less specific than it. For
        # example, if user makes a Woreda query, we also want to slice by Zone
        # and Region.
        if self.geo_field:
            # Restrict query to non-null for the given geo
            dimension_filter &= Dimension(self.geo_field) != ''

            # Set the appropriate dimensions for this query
            self.druid_slice_dimensions = self.get_slice_dimensions()
            if self.latitude_field and self.longitude_field:
                self.druid_geo_dimensions = [self.latitude_field, self.longitude_field]

        grouping_fields = self.druid_slice_dimensions + self.druid_geo_dimensions

        batches = []
        overall_interval = build_time_interval(self.start_date, self.end_date)
        for selected_granularity in self.selected_granularities:
            granularity = selected_granularity
            # Druid expects time intervals as a list
            intervals = [overall_interval]
            granularity = current_app.zen_config.aggregation_rules.get_granularity_for_interval(
                selected_granularity, self.start_date, self.end_date
            )

            query = GroupByQueryBuilder(
                datasource=current_app.druid_context.current_datasource.name,
                granularity=granularity,
                grouping_fields=grouping_fields,
                intervals=intervals,
                calculation=self.calculation,
                dimension_filter=dimension_filter,
            )

            batch = QueryBatch(
                query,
                selected_granularity,
                self.geo_field,
                self.latitude_field,
                self.longitude_field,
                self.ordered_fields,
                self.denom,
                self.druid_slice_dimensions,
                self.query_client,
            )
            batches.append(batch)

        num_granularities = len(self.selected_granularities)
        if USE_THREAD_POOL and num_granularities > 1:
            pool = ThreadPool(num_granularities)
            pool.map(QueryBatch.run, batches)
            pool.close()
            pool.join()
        else:
            _ = [batch.run() for batch in batches]

        self.batches = batches
        return True
Example No. 27
    def get_filters(raw_filters, num_cols):  # noqa
        filters = None
        for flt in raw_filters:
            if not all(f in flt for f in ['col', 'op', 'val']):
                continue

            col = flt['col']
            op = flt['op']
            eq = flt['val']
            cond = None
            if op in ('in', 'not in'):
                eq = [
                    types.replace('"', '').strip() if isinstance(
                        types, string_types) else types for types in eq
                ]
            elif not isinstance(flt['val'], string_types):
                eq = eq[0] if eq and len(eq) > 0 else ''

            is_numeric_col = col in num_cols
            if is_numeric_col:
                if op in ('in', 'not in'):
                    eq = [utils.string_to_num(v) for v in eq]
                else:
                    eq = utils.string_to_num(eq)

            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = Dimension(col) != eq
            elif op in ('in', 'not in'):
                fields = []

                # ignore the filter if it has no value
                if not len(eq):
                    continue
                elif len(eq) == 1:
                    cond = Dimension(col) == eq[0]
                else:
                    for s in eq:
                        fields.append(Dimension(col) == s)
                    cond = Filter(type='or', fields=fields)

                if op == 'not in':
                    cond = ~cond

            elif op == 'regex':
                cond = Filter(type='regex', pattern=eq, dimension=col)
            elif op == '>=':
                cond = Bound(col, eq, None, alphaNumeric=is_numeric_col)
            elif op == '<=':
                cond = Bound(col, None, eq, alphaNumeric=is_numeric_col)
            elif op == '>':
                cond = Bound(
                    col,
                    eq,
                    None,
                    lowerStrict=True,
                    alphaNumeric=is_numeric_col,
                )
            elif op == '<':
                cond = Bound(
                    col,
                    None,
                    eq,
                    upperStrict=True,
                    alphaNumeric=is_numeric_col,
                )

            if filters:
                filters = Filter(type='and', fields=[
                    cond,
                    filters,
                ])
            else:
                filters = cond

        return filters
Exemplo n.º 28
0
    def query(
            self, groupby, metrics,
            granularity,
            from_dttm, to_dttm,
            limit_spec=None,
            filter=None,
            is_timeseries=True,
            timeseries_limit=None,
            row_limit=None,
            inner_from_dttm=None, inner_to_dttm=None,
            extras=None):
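        """Runs a Druid groupby query and returns a QueryResult.

        When a timeseries limit is set, a two-phase query is issued: phase one
        finds the top series, phase two fetches data for those series only.
        """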
        qry_start_dttm = datetime.now()

        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add the configured timezone to the naive datetimes
        from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
        to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

        query_str = ""
        aggregations = {
            m.metric_name: m.json_obj
            for m in self.metrics if m.metric_name in metrics
        }
        if not isinstance(granularity, string_types):
            granularity = {"type": "duration", "duration": granularity}

        qry = dict(
            datasource=self.datasource_name,
            dimensions=groupby,
            aggregations=aggregations,
            granularity=granularity,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )
        filters = None
        for col, op, eq in filter:
            cond = None
            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = ~(Dimension(col) == eq)
            elif op in ('in', 'not in'):
                fields = []
                splitted = eq.split(',')
                if len(splitted) > 1:
                    for s in splitted:
                        s = s.strip()
                        fields.append(Filter.build_filter(Dimension(col) == s))
                    cond = Filter(type="or", fields=fields)
                else:
                    cond = Dimension(col) == eq
                if op == 'not in':
                    cond = ~cond
            if filters:
                filters = Filter(type="and", fields=[
                    Filter.build_filter(cond),
                    Filter.build_filter(filters)
                ])
            else:
                filters = cond

        if filters:
            qry['filter'] = filters

        client = self.cluster.get_pydruid_client()
        orig_filters = filters
        if timeseries_limit and is_timeseries:
            # Limit the number of timeseries by doing a two-phase query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = "all"
            # Restrict the pre-query to the inner time range.
            pre_qry['intervals'] = (
                inner_from_dttm.isoformat() + '/' +
                inner_to_dttm.isoformat())
            pre_qry['limit_spec'] = {
                "type": "default",
                "limit": timeseries_limit,
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
            client.groupby(**pre_qry)
            query_str += "// Two phase query\n// Phase 1\n"
            query_str += json.dumps(client.query_dict, indent=2) + "\n"
            query_str += "//\nPhase 2 (built based on phase one's results)\n"
            df = client.export_pandas()
            if df is not None and not df.empty:
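                # OR together the dimension values returned by phase one so
                # that phase two only scans the top series.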
                dims = qry['dimensions']
                filters = []
                for _, row in df.iterrows():
                    fields = []
                    for dim in dims:
                        f = Filter.build_filter(Dimension(dim) == row[dim])
                        fields.append(f)
                    if len(fields) > 1:
                        filt = Filter(type="and", fields=fields)
                        filters.append(Filter.build_filter(filt))
                    elif fields:
                        filters.append(fields[0])

                if filters:
                    ff = Filter(type="or", fields=filters)
                    if not orig_filters:
                        qry['filter'] = ff
                    else:
                        qry['filter'] = Filter(type="and", fields=[
                            Filter.build_filter(ff),
                            Filter.build_filter(orig_filters)])
                qry['limit_spec'] = None
        if row_limit:
            qry['limit_spec'] = {
                "type": "default",
                "limit": row_limit,
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
        client.groupby(**qry)
        query_str += json.dumps(client.query_dict, indent=2)
        df = client.export_pandas()
        return QueryResult(
            df=df,
            query=query_str,
            duration=datetime.now() - qry_start_dttm)
Exemplo n.º 29
0
    def query(  # druid
            self, groupby, metrics,
            granularity,
            from_dttm, to_dttm,
            filter=None,  # noqa
            is_timeseries=True,
            timeseries_limit=None,
            row_limit=None,
            inner_from_dttm=None, inner_to_dttm=None,
            extras=None,  # noqa
            select=None,):  # noqa
        """Runs a query against Druid and returns a dataframe.

        This query interface is common to SqlAlchemy and Druid
        """
        # TODO refactor into using a TBD Query object
        qry_start_dttm = datetime.now()

        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add the configured timezone to the naive datetimes
        from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
        to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

        query_str = ""
        metrics_dict = {m.metric_name: m for m in self.metrics}
        all_metrics = []
        post_aggs = {}

        def recursive_get_fields(_conf):
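            # Recursively collect the names of the base metrics referenced by
            # a post-aggregator so their aggregations are included in the
            # query.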
            _fields = _conf.get('fields', [])
            field_names = []
            for _f in _fields:
                _type = _f.get('type')
                if _type in ['fieldAccess', 'hyperUniqueCardinality']:
                    field_names.append(_f.get('fieldName'))
                elif _type == 'arithmetic':
                    field_names += recursive_get_fields(_f)

            return list(set(field_names))

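        # Split the requested metrics into plain aggregations and
        # post-aggregators; post-aggregators also pull in the base metrics
        # they depend on.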
        for metric_name in metrics:
            metric = metrics_dict[metric_name]
            if metric.metric_type != 'postagg':
                all_metrics.append(metric_name)
            else:
                conf = metric.json_obj
                all_metrics += recursive_get_fields(conf)
                all_metrics += conf.get('fieldNames', [])
                if conf.get('type') == 'javascript':
                    post_aggs[metric_name] = JavascriptPostAggregator(
                        name=conf.get('name'),
                        field_names=conf.get('fieldNames'),
                        function=conf.get('function'))
                else:
                    post_aggs[metric_name] = Postaggregator(
                        conf.get('fn', "/"),
                        conf.get('fields', []),
                        conf.get('name', ''))

        aggregations = {
            m.metric_name: m.json_obj
            for m in self.metrics
            if m.metric_name in all_metrics
        }

        rejected_metrics = [
            m.metric_name for m in self.metrics
            if m.is_restricted and
            m.metric_name in aggregations and
            not sm.has_access('metric_access', m.perm)
        ]

        if rejected_metrics:
            raise MetricPermException(
                "Access to the metrics denied: " + ', '.join(rejected_metrics)
            )

        granularity = granularity or "all"
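        # Non-"all" granularities arrive as human-readable durations (e.g.
        # "1 day"); convert them to millisecond duration granularities,
        # optionally anchored at a configured origin timestamp.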
        if granularity != "all":
            granularity = utils.parse_human_timedelta(
                granularity).total_seconds() * 1000
        if not isinstance(granularity, string_types):
            granularity = {"type": "duration", "duration": granularity}
            origin = extras.get('druid_time_origin')
            if origin:
                dttm = utils.parse_human_datetime(origin)
                granularity['origin'] = dttm.isoformat()

        qry = dict(
            datasource=self.datasource_name,
            dimensions=groupby,
            aggregations=aggregations,
            granularity=granularity,
            post_aggregations=post_aggs,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )
        filters = None
        for col, op, eq in filter:
            cond = None
            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = ~(Dimension(col) == eq)
            elif op in ('in', 'not in'):
                fields = []
                splitted = eq.split(',')
                if len(splitted) > 1:
                    for s in splitted:
                        s = s.strip()
                        fields.append(Dimension(col) == s)
                    cond = Filter(type="or", fields=fields)
                else:
                    cond = Dimension(col) == eq
                if op == 'not in':
                    cond = ~cond
            elif op == 'regex':
                cond = Filter(type="regex", pattern=eq, dimension=col)
            if filters:
                filters = Filter(type="and", fields=[
                    cond,
                    filters
                ])
            else:
                filters = cond

        if filters:
            qry['filter'] = filters

        client = self.cluster.get_pydruid_client()
        orig_filters = filters
        if timeseries_limit and is_timeseries:
            # Limit the number of timeseries by doing a two-phase query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = "all"
            # Restrict the pre-query to the inner time range.
            pre_qry['intervals'] = (
                inner_from_dttm.isoformat() + '/' +
                inner_to_dttm.isoformat())
            pre_qry['limit_spec'] = {
                "type": "default",
                "limit": timeseries_limit,
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
            client.groupby(**pre_qry)
            query_str += "// Two phase query\n// Phase 1\n"
            query_str += json.dumps(
                client.query_builder.last_query.query_dict, indent=2) + "\n"
            query_str += "//\nPhase 2 (built based on phase one's results)\n"
            df = client.export_pandas()
            if df is not None and not df.empty:
                dims = qry['dimensions']
                filters = []
                for _, row in df.iterrows():
                    fields = []
                    for dim in dims:
                        f = Dimension(dim) == row[dim]
                        fields.append(f)
                    if len(fields) > 1:
                        filt = Filter(type="and", fields=fields)
                        filters.append(filt)
                    elif fields:
                        filters.append(fields[0])

                if filters:
                    ff = Filter(type="or", fields=filters)
                    if not orig_filters:
                        qry['filter'] = ff
                    else:
                        qry['filter'] = Filter(type="and", fields=[
                            ff,
                            orig_filters])
                qry['limit_spec'] = None
        if row_limit:
            qry['limit_spec'] = {
                "type": "default",
                "limit": row_limit,
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
        client.groupby(**qry)
        query_str += json.dumps(
            client.query_builder.last_query.query_dict, indent=2)
        df = client.export_pandas()
        if df is None or df.size == 0:
            raise Exception(_("No data was returned."))

        if (
                not is_timeseries and
                granularity == "all" and
                'timestamp' in df.columns):
            del df['timestamp']

        # Reordering columns
        cols = []
        if 'timestamp' in df.columns:
            cols += ['timestamp']
        cols += [col for col in groupby if col in df.columns]
        cols += [col for col in metrics if col in df.columns]
        df = df[cols]
        return QueryResult(
            df=df,
            query=query_str,
            duration=datetime.now() - qry_start_dttm)
Exemplo n.º 30
0
    def parse_arguments(self):
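        """Parse and validate the request payload, populating filters, fields,
        granularities and dates on the instance.

        Returns False (and sets an error response) when validation fails.
        """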
        self.request_is_demo = self.request_data.get('demo')

        # Parse overall modifiers.
        self.use_randomized_data = USE_RANDOMIZED_DATA or self.request_is_demo

        # Location filters are OR'd together by default.
        # TODO(stephen, ian): When needed, allow more complex filtering
        filters = self.request_data.get('filters', [])
        for f in filters:
            if not f:
                # Skip empty filters.
                continue

            # HACK(stephen): Handle both hierarchical dimension filters (which
            # should be OR'd together) and non-hierarchical dimension filters
            # (which should all be AND'd together with the location filters).
            first_key = list(f.keys())[0]
            if len(f) == 1 and first_key not in self.all_geo_dimensions:
                self.non_hierarchical_filter &= (
                    Dimension(first_key) == list(f.values())[0]
                )
                continue

            location_filter = {}
            # Validate that the dimensions being filtered on actually exist
            for key, value in list(f.items()):
                # NOTE(stephen): This should never happen
                if key not in self.all_geo_dimensions:
                    LOG.warn(
                        'A location filter contains non-location columns to '
                        'filter by. Filter: %s',
                        f,
                    )
                location_filter[key] = value
            if location_filter:
                self.location_filters.append(location_filter)

        geo_granularity = self.request_data.get('granularity')
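        # Resolve the geo granularity into the geo dimension to group by and,
        # when available, its latitude/longitude fields.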
        if geo_granularity != NATION_GEO_FIELD:
            latlng_fields = current_app.zen_config.aggregation.GEO_TO_LATLNG_FIELD.get(
                geo_granularity
            )
            if latlng_fields:
                self.latitude_field = latlng_fields[0]
                self.longitude_field = latlng_fields[1]
            self.geo_field = geo_granularity

        # Capture requested fields
        request_fields = self.request_data.get('fields', [])

        # Parse denominator
        denom = self.request_data.get('denominator')
        if denom:
            if denom in current_app.zen_config.indicators.VALID_FIELDS:
                self.denom = denom
                request_fields.append(denom)
            else:
                error_msg = 'Invalid denominator specified: %s' % denom
                self.response = Error(error_msg)
                return False

        # Deduplicate field list while maintaining the user's selected order
        # since the frontend has implicit requirements around field ordering
        for field in request_fields:
            self.data_fields.add(field)

            # TODO(stephen): Is this even necessary? Can the frontend send
            # duplicate fields? Also, would love an ordered set here instead
            # of searching the list.
            if field not in self.ordered_fields:
                self.ordered_fields.append(field)

        bad_fields = self.data_fields - current_app.zen_config.indicators.VALID_FIELDS
        if bad_fields:
            error_msg = 'Invalid fields specified: %s' % ', '.join(bad_fields)
            self.response = Error(error_msg)
            return False

        self.selected_granularities = self.request_data.get(
            'granularities', DEFAULT_GRANULARITIES
        )

        self.calculation = current_app.zen_config.aggregation_rules.get_calculation_for_fields(
            self.data_fields
        )
        self.calculation.set_strict_null_fields(self.data_fields)

        # Get dates
        # TODO(stephen, ian): Validate these
        self.start_date = datetime.strptime(
            self.request_data.get('start_date'), STANDARD_DATA_DATE_FORMAT
        ).date()
        self.end_date = datetime.strptime(
            self.request_data.get('end_date'), STANDARD_DATA_DATE_FORMAT
        ).date()
        self.time_bucket = self.request_data.get('time_bucket', DEFAULT_TIME_BUCKET)
        return True