Example #1
 def query_filters(self):
     args = self.form_data
     # Building filters
     filters = None
     for i in range(1, 10):
         col = args.get("flt_col_" + str(i))
         op = args.get("flt_op_" + str(i))
         eq = args.get("flt_eq_" + str(i))
         if col and op and eq:
             cond = None
             if op == '==':
                 cond = Dimension(col) == eq
             elif op == '!=':
                 cond = ~(Dimension(col) == eq)
             elif op in ('in', 'not in'):
                 fields = []
                 splitted = eq.split(',')
                 if len(splitted) > 1:
                     for s in splitted:
                         s = s.strip()
                         fields.append(Filter.build_filter(Dimension(col)==s))
                     cond = Filter(type="or", fields=fields)
                 else:
                     cond = Dimension(col) == eq
                 if op == 'not in':
                     cond = ~cond
             if filters:
                 filters = Filter(type="and", fields=[
                     Filter.build_filter(cond),
                     Filter.build_filter(filters)
                 ])
             else:
                 filters = cond
     return filters
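The loop above reads up to nine (flt_col_i, flt_op_i, flt_eq_i) triplets from the form data and folds them into a single chained AND filter. Below is a minimal sketch of the same pattern, assuming pydruid's Filter/Dimension API (pydruid.utils.filters); the column names and values are made up for illustration:

    from pydruid.utils.filters import Dimension, Filter

    # country == 'BR' (a "selector" filter)
    cond1 = Dimension('country') == 'BR'
    # city in ('Rio', 'Sao Paulo') (an "or" over serialized selector filters)
    cond2 = Filter(type="or", fields=[
        Filter.build_filter(Dimension('city') == 'Rio'),
        Filter.build_filter(Dimension('city') == 'Sao Paulo'),
    ])
    # Chain them the way the loop does: an AND over the serialized conditions.
    combined = Filter(type="and", fields=[
        Filter.build_filter(cond2),
        Filter.build_filter(cond1),
    ])
    # Filter.build_filter() turns a Filter object into the JSON-ready dict that
    # Druid expects, e.g. {'type': 'selector', 'dimension': 'country',
    # 'value': 'BR'} for cond1.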
Example #2
    def bake_query(self):
        """
        Run a two-phase query that limits the number of series.
        """
        client = utils.get_pydruid_client()
        qry = self.query_obj()
        orig_filter = qry.get('filter', '')
        qry['granularity'] = "all"
        client.groupby(**qry)
        df = client.export_pandas()
        if df is not None:
            dims = qry['dimensions']
            filters = []
            for index, row in df.iterrows():
                fields = []
                for dim in dims:
                    f = Filter.build_filter(Dimension(dim) == row[dim])
                    fields.append(f)
                if len(fields) > 1:
                    filters.append(Filter.build_filter(Filter(type="and", fields=fields)))
                elif fields:
                    filters.append(fields[0])

            qry = self.query_obj()
            if filters:
                ff = Filter(type="or", fields=filters)
                if not orig_filter:
                    qry['filter'] = ff
                else:
                    qry['filter'] = Filter(type="and", fields=[
                        Filter.build_filter(ff),
                        Filter.build_filter(orig_filter)])
            del qry['limit_spec']
            client.groupby(**qry)
        return client.export_pandas()
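The two phases work as follows: phase one collapses time (granularity "all") while keeping whatever limit query_obj() set, which yields the top series; phase two re-runs the original query with the limit removed but filtered down to those series. Each phase-one row becomes an AND of selector filters, and the rows are OR'ed together. For two hypothetical rows over the dimensions country and city, the serialized phase-two filter would look like:

    {'type': 'or', 'fields': [
        {'type': 'and', 'fields': [
            {'type': 'selector', 'dimension': 'country', 'value': 'BR'},
            {'type': 'selector', 'dimension': 'city', 'value': 'Rio'}]},
        {'type': 'and', 'fields': [
            {'type': 'selector', 'dimension': 'country', 'value': 'PT'},
            {'type': 'selector', 'dimension': 'city', 'value': 'Porto'}]}]}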
Example #3
    def timeseries(self, datasource, granularity, descending, intervals,
                   aggregations, context, filter):
        f = Filter.build_filter(filter)
        if f['type'] == 'and' and f['fields'][0]['type'] == 'selector' and \
           f['fields'][0]['dimension'] == 'agent_id' and \
           f['fields'][1]['type'] == 'selector' and \
           f['fields'][1]['dimension'] == 'process_name':
            # agent_id = f['fields'][0]['value']
            # process_name = f['fields'][1]['value']

            interval_start, interval_end = parsers.interval(intervals)
            if interval_end is None:
                interval_end = datetime.now()

            if granularity in DruidAccessLayer.timeseries_granularities:
                query_granularity = self.__granularity_to_timedelta__(
                    granularity)
            else:
                query_granularity = parsers.duration(granularity['period'])

            body = []
            curr_time = interval_start

            while curr_time < interval_end:
                timestamp = curr_time.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
                body.append({'timestamp': timestamp,
                             'result': {'cpu': random.uniform(0, 1),
                                        'mem': random.randint(1, 10000000)}})
                curr_time += query_granularity
        else:
            body = []
        return PyDruidResultMock(body)
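The mock fabricates one data point per granularity step in the requested interval, so the body it returns has the shape [{'timestamp': '2017-01-01T00:00:00.000000Z', 'result': {'cpu': 0.42, 'mem': 1234}}, ...] with random cpu and mem values.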
Example #4
    def build_query(self, query_type, args):
        """
        Build query based on given query type and arguments.

        :param string query_type: a type of query
        :param dict args: the dict of args to be sent
        :return: the resulting query
        :rtype: Query
        """
        query_dict = {'queryType': query_type}

        for key, val in six.iteritems(args):
            if key == 'aggregations':
                query_dict[key] = build_aggregators(val)
            elif key == 'post_aggregations':
                query_dict['postAggregations'] = Postaggregator.build_post_aggregators(val)
            elif key == 'datasource':
                query_dict['dataSource'] = val
            elif key == 'paging_spec':
                query_dict['pagingSpec'] = val
            elif key == 'limit_spec':
                query_dict['limitSpec'] = val
            elif key == "filter":
                query_dict[key] = Filter.build_filter(val)
            elif key == "having":
                query_dict[key] = Having.build_having(val)
            elif key == 'dimension':
                query_dict[key] = build_dimension(val)
            elif key == 'dimensions':
                query_dict[key] = [build_dimension(v) for v in val]
            else:
                query_dict[key] = val

        self.last_query = Query(query_dict, query_type)
        return self.last_query
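A hedged usage sketch for the builder above; QueryBuilder is assumed to be the class this method belongs to (as in pydruid.query), and the datasource, dimension, and aggregator names are illustrative:

    from pydruid.query import QueryBuilder
    from pydruid.utils.aggregators import doublesum
    from pydruid.utils.filters import Dimension

    builder = QueryBuilder()
    query = builder.build_query('groupBy', {
        'datasource': 'events',                   # emitted as 'dataSource'
        'granularity': 'all',
        'dimensions': ['country'],
        'aggregations': {'total': doublesum('amount')},
        'filter': Dimension('country') == 'BR',   # run through build_filter
        'intervals': '2017-01-01/2017-02-01',
    })
    # query.query_dict holds the JSON-ready dict, with the snake_case inputs
    # mapped to Druid's camelCase keys.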
Example #5
    def bake_query(self):
        """
        Run a two-phase query that limits the number of series.
        """
        client = utils.get_pydruid_client()
        qry = self.query_obj()
        orig_filter = qry.get('filter', '')
        qry['granularity'] = "all"
        client.groupby(**qry)
        df = client.export_pandas()
        if df is not None:
            dims = qry['dimensions']
            filters = []
            for index, row in df.iterrows():
                fields = []
                for dim in dims:
                    f = Filter.build_filter(Dimension(dim) == row[dim])
                    fields.append(f)
                if len(fields) > 1:
                    filters.append(
                        Filter.build_filter(Filter(type="and", fields=fields)))
                elif fields:
                    filters.append(fields[0])

            qry = self.query_obj()
            if filters:
                ff = Filter(type="or", fields=filters)
                if not orig_filter:
                    qry['filter'] = ff
                else:
                    qry['filter'] = Filter(type="and",
                                           fields=[
                                               Filter.build_filter(ff),
                                               Filter.build_filter(orig_filter)
                                           ])
            del qry['limit_spec']
            client.groupby(**qry)
        return client.export_pandas()
Example #6
 def test_query_filter_having(self):
     f1 = Filter(type="selector", dimension="foo", value="bar")
     query_filter = Filter.build_filter(f1)
     h1 = Having(type="filter", filter=query_filter)
     actual = Having.build_having(h1)
     expected = {
         "type": "filter",
         "filter": {
             "type": "selector",
             "dimension": "foo",
             "value": "bar"
         },
     }
     assert actual == expected
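The same dict can be produced with the Dimension equality shorthand used throughout the other examples; a companion check in the same style, assuming pydruid's operator overloading on Dimension:

    from pydruid.utils.filters import Dimension, Filter

    def test_query_filter_dimension_shorthand():
        f1 = Dimension("foo") == "bar"
        actual = Filter.build_filter(f1)
        expected = {
            "type": "selector",
            "dimension": "foo",
            "value": "bar",
        }
        assert actual == expected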
Example #7
 def groupby(self, datasource, granularity, intervals, dimensions,
             filter, aggregations):
     f = Filter.build_filter(filter)
     if f['type'] == 'selector' and \
        f['dimension'] == 'agent_id' and 'value' in f:
         try:
             filename = 'groupby{0}.json'.format(f['value'])
             filepath = os.path.join(resources, filename)
             with open(filepath, 'rb') as fh:
                 body = fh.read().decode('utf-8')
         except (IOError, OSError):
             body = '[]'
     else:
         body = '[]'
     return PyDruidResultMock(body)
Example #8
def _build_filter_workaround(filter_obj):
    # make a copy so we don't overwrite the original object's stored fields
    raw_filter = filter_obj.filter['filter'].copy()
    filter_type = raw_filter.get('type')

    if not filter_type:
        return None
    if filter_type in ['and', 'or']:
        raw_filter['fields'] = [
            _build_filter_workaround(f) for f in raw_filter['fields']
        ]
    elif filter_type in ['not']:
        raw_filter['field'] = Filter.build_filter(raw_filter['field'])
    return raw_filter
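A hedged usage sketch: the workaround targets a pydruid version whose Filter.build_filter leaves the fields of an and/or filter as unserialized Filter objects, so the helper recurses over them itself. The filter below is illustrative:

    nested = Filter(type="and", fields=[
        Filter(type="selector", dimension="agent_id", value="42"),
        ~(Dimension("process_name") == "httpd"),
    ])
    # Returns a plain dict tree, safe to embed directly in a query body.
    query_filter = _build_filter_workaround(nested)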
Example #9
 def query_filters(self):
     args = self.form_data
     # Building filters
     filters = None
     for i in range(1, 10):
         col = args.get("flt_col_" + str(i))
         op = args.get("flt_op_" + str(i))
         eq = args.get("flt_eq_" + str(i))
         if col and op and eq:
             cond = None
             if op == '==':
                 cond = Dimension(col) == eq
             elif op == '!=':
                 cond = ~(Dimension(col) == eq)
             elif op in ('in', 'not in'):
                 fields = []
                 splitted = eq.split(',')
                 if len(splitted) > 1:
                     for s in splitted:
                         s = s.strip()
                         fields.append(
                             Filter.build_filter(Dimension(col) == s))
                     cond = Filter(type="or", fields=fields)
                 else:
                     cond = Dimension(col) == eq
                 if op == 'not in':
                     cond = ~cond
             if filters:
                 filters = Filter(type="and",
                                  fields=[
                                      Filter.build_filter(cond),
                                      Filter.build_filter(filters)
                                  ])
             else:
                 filters = cond
     return filters
Example #10
    def build_query(self, query_type, args):
        """
        Build query based on given query type and arguments.

        :param string query_type: a type of query
        :param dict args: the dict of args to be sent
        :return: the resulting query
        :rtype: Query
        """
        query_dict = {"queryType": query_type}

        for key, val in six.iteritems(args):
            if key == "aggregations":
                query_dict[key] = build_aggregators(val)
            elif key == "post_aggregations":
                query_dict["postAggregations"] = \
                    Postaggregator.build_post_aggregators(val)
            elif key == "context":
                query_dict["context"] = val
            elif key == "datasource":
                query_dict["dataSource"] = self.parse_datasource(
                    val, query_type)
            elif key == "paging_spec":
                query_dict["pagingSpec"] = val
            elif key == "limit_spec":
                query_dict["limitSpec"] = val
            elif key == "filter" and val is not None:
                query_dict[key] = Filter.build_filter(val)
            elif key == "having" and val is not None:
                query_dict[key] = Having.build_having(val)
            elif key == "dimension" and val is not None:
                query_dict[key] = build_dimension(val)
            elif key == "dimensions":
                query_dict[key] = [build_dimension(v) for v in val]
            else:
                query_dict[key] = val

        self.last_query = Query(query_dict, query_type)
        return self.last_query
Example #11
    def build_query_dict(self, query_type, args):
        query_dict = {"queryType": query_type}

        for key, val in six.iteritems(args):
            if key == "aggregations":
                query_dict[key] = build_aggregators(val)
            elif key == "post_aggregations":
                query_dict["postAggregations"] = \
                    Postaggregator.build_post_aggregators(val)
            elif key == "context":
                query_dict["context"] = val
            elif key == "datasource":
                query_dict["dataSource"] = self.parse_datasource(
                    val, query_type)
            elif key == "paging_spec":
                query_dict["pagingSpec"] = val
            elif key == "limit_spec":
                query_dict["limitSpec"] = val
            elif key == "filter" and val is not None:
                query_dict[key] = Filter.build_filter(val)
            elif key == "having" and val is not None:
                query_dict[key] = Having.build_having(val)
            elif key == "dimension" and val is not None:
                query_dict[key] = build_dimension(val)
            elif key == "dimensions":
                query_dict[key] = [build_dimension(v) for v in val]
            elif key == "virtualColumns":
                query_dict[key] = [
                    VirtualColumn.build_virtual_column(v) for v in val
                ]
            elif key == "sub_query" and val is not None:
                query_dict["dataSource"] = {
                    "type": "query",
                    "query": self.build_query_dict(query_type, val)
                }
            elif val is not None:
                query_dict[key] = val
        return query_dict
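The sub_query branch nests one query as the dataSource of another. An illustrative call, assuming builder is an instance of the class above and reusing the hypothetical names from the earlier sketch:

    inner_args = {
        'datasource': 'events',
        'granularity': 'all',
        'dimensions': ['country'],
        'aggregations': {'total': doublesum('amount')},
    }
    outer = builder.build_query_dict('groupBy', {
        'granularity': 'all',
        'dimensions': ['country'],
        'sub_query': inner_args,
    })
    # outer['dataSource'] == {'type': 'query',
    #                         'query': {'queryType': 'groupBy', ...}}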
Example #12
    def query(
            self, groupby, metrics,
            granularity,
            from_dttm, to_dttm,
            limit_spec=None,
            filter=None,
            is_timeseries=True,
            timeseries_limit=None,
            row_limit=None,
            inner_from_dttm=None, inner_to_dttm=None,
            extras=None):
        qry_start_dttm = datetime.now()

        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
        to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

        query_str = ""
        aggregations = {
            m.metric_name: m.json_obj
            for m in self.metrics if m.metric_name in metrics
        }
        if granularity != "all":
            granularity = utils.parse_human_timedelta(
                granularity).total_seconds() * 1000
        if not isinstance(granularity, basestring):
            granularity = {"type": "duration", "duration": granularity}

        qry = dict(
            datasource=self.datasource_name,
            dimensions=groupby,
            aggregations=aggregations,
            granularity=granularity,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )
        filters = None
        for col, op, eq in filter:
            cond = None
            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = ~(Dimension(col) == eq)
            elif op in ('in', 'not in'):
                fields = []
                splitted = eq.split(',')
                if len(splitted) > 1:
                    for s in splitted:
                        s = s.strip()
                        fields.append(Filter.build_filter(Dimension(col) == s))
                    cond = Filter(type="or", fields=fields)
                else:
                    cond = Dimension(col) == eq
                if op == 'not in':
                    cond = ~cond
            if filters:
                filters = Filter(type="and", fields=[
                    Filter.build_filter(cond),
                    Filter.build_filter(filters)
                ])
            else:
                filters = cond

        if filters:
            qry['filter'] = filters

        client = self.cluster.get_pydruid_client()
        orig_filters = filters
        if timeseries_limit and is_timeseries:
            # Limit on the number of timeseries, doing a two-phases query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = "all"
            pre_qry['limit_spec'] = {
                "type": "default",
                "limit": timeseries_limit,
                'intervals': (inner_from_dttm.isoformat() + '/' +
                              inner_to_dttm.isoformat()),
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
            client.groupby(**pre_qry)
            query_str += "// Two phase query\n// Phase 1\n"
            query_str += json.dumps(client.query_dict, indent=2) + "\n"
            query_str += "//\nPhase 2 (built based on phase one's results)\n"
            df = client.export_pandas()
            if df is not None and not df.empty:
                dims = qry['dimensions']
                filters = []
                for index, row in df.iterrows():
                    fields = []
                    for dim in dims:
                        f = Filter.build_filter(Dimension(dim) == row[dim])
                        fields.append(f)
                    if len(fields) > 1:
                        filt = Filter(type="and", fields=fields)
                        filters.append(Filter.build_filter(filt))
                    elif fields:
                        filters.append(fields[0])

                if filters:
                    ff = Filter(type="or", fields=filters)
                    if not orig_filters:
                        qry['filter'] = ff
                    else:
                        qry['filter'] = Filter(type="and", fields=[
                            Filter.build_filter(ff),
                            Filter.build_filter(orig_filters)])
                qry['limit_spec'] = None
        if row_limit:
            qry['limit_spec'] = {
                "type": "default",
                "limit": row_limit,
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
        client.groupby(**qry)
        query_str += json.dumps(client.query_dict, indent=2)
        df = client.export_pandas()
        return QueryResult(
            df=df,
            query=query_str,
            duration=datetime.now() - qry_start_dttm)
Example #13
    def query(
            self, groupby, metrics,
            granularity,
            from_dttm, to_dttm,
            limit_spec=None,
            filter=None,
            is_timeseries=True,
            timeseries_limit=None,
            row_limit=None,
            inner_from_dttm=None, inner_to_dttm=None,
            extras=None):
        qry_start_dttm = datetime.now()

        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
        to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

        query_str = ""
        aggregations = {
            m.metric_name: m.json_obj
            for m in self.metrics if m.metric_name in metrics
        }
        if not isinstance(granularity, basestring):
            granularity = {"type": "duration", "duration": granularity}

        qry = dict(
            datasource=self.datasource_name,
            dimensions=groupby,
            aggregations=aggregations,
            granularity=granularity,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )
        filters = None
        for col, op, eq in filter:
            cond = None
            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = ~(Dimension(col) == eq)
            elif op in ('in', 'not in'):
                fields = []
                splitted = eq.split(',')
                if len(splitted) > 1:
                    for s in splitted:
                        s = s.strip()
                        fields.append(Filter.build_filter(Dimension(col) == s))
                    cond = Filter(type="or", fields=fields)
                else:
                    cond = Dimension(col) == eq
                if op == 'not in':
                    cond = ~cond
            if filters:
                filters = Filter(type="and", fields=[
                    Filter.build_filter(cond),
                    Filter.build_filter(filters)
                ])
            else:
                filters = cond

        if filters:
            qry['filter'] = filters

        client = self.cluster.get_pydruid_client()
        orig_filters = filters
        if timeseries_limit and is_timeseries:
            # Limit on the number of timeseries, doing a two-phases query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = "all"
            pre_qry['limit_spec'] = {
                "type": "default",
                "limit": timeseries_limit,
                'intervals': (inner_from_dttm.isoformat() + '/' +
                              inner_to_dttm.isoformat()),
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
            client.groupby(**pre_qry)
            query_str += "// Two phase query\n// Phase 1\n"
            query_str += json.dumps(client.query_dict, indent=2) + "\n"
            query_str += "//\nPhase 2 (built based on phase one's results)\n"
            df = client.export_pandas()
            if df is not None and not df.empty:
                dims = qry['dimensions']
                filters = []
                for index, row in df.iterrows():
                    fields = []
                    for dim in dims:
                        f = Filter.build_filter(Dimension(dim) == row[dim])
                        fields.append(f)
                    if len(fields) > 1:
                        filt = Filter(type="and", fields=fields)
                        filters.append(Filter.build_filter(filt))
                    elif fields:
                        filters.append(fields[0])

                if filters:
                    ff = Filter(type="or", fields=filters)
                    if not orig_filters:
                        qry['filter'] = ff
                    else:
                        qry['filter'] = Filter(type="and", fields=[
                            Filter.build_filter(ff),
                            Filter.build_filter(orig_filters)])
                qry['limit_spec'] = None
        if row_limit:
            qry['limit_spec'] = {
                "type": "default",
                "limit": row_limit,
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
        client.groupby(**qry)
        query_str += json.dumps(client.query_dict, indent=2)
        df = client.export_pandas()
        return QueryResult(
            df=df,
            query=query_str,
            duration=datetime.now() - qry_start_dttm)
Example #14
 def select(self, datasource, granularity, intervals, descending,
            dimensions, metrics, filter, paging_spec):
     f = Filter.build_filter(filter)
     print('f: %s' % f)
     body = []
     return PyDruidResultMock(body)
Example #15
    def query(
        self,
        groupby,
        metrics,
        granularity,
        from_dttm,
        to_dttm,
        filter=None,  # noqa
        is_timeseries=True,
        timeseries_limit=None,
        row_limit=None,
        inner_from_dttm=None,
        inner_to_dttm=None,
        extras=None,  # noqa
        select=None,
    ):  # noqa
        """Runs a query against Druid and returns a dataframe.

        This query interface is common to SqlAlchemy and Druid
        """
        # TODO refactor into using a TBD Query object
        qry_start_dttm = datetime.now()

        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
        to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

        query_str = ""
        aggregations = {
            m.metric_name: m.json_obj
            for m in self.metrics if m.metric_name in metrics
        }
        granularity = granularity or "all"
        if granularity != "all":
            granularity = utils.parse_human_timedelta(
                granularity).total_seconds() * 1000
        if not isinstance(granularity, string_types):
            granularity = {"type": "duration", "duration": granularity}

        qry = dict(
            datasource=self.datasource_name,
            dimensions=groupby,
            aggregations=aggregations,
            granularity=granularity,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )
        filters = None
        for col, op, eq in filter:
            cond = None
            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = ~(Dimension(col) == eq)
            elif op in ('in', 'not in'):
                fields = []
                splitted = eq.split(',')
                if len(splitted) > 1:
                    for s in splitted:
                        s = s.strip()
                        fields.append(Filter.build_filter(Dimension(col) == s))
                    cond = Filter(type="or", fields=fields)
                else:
                    cond = Dimension(col) == eq
                if op == 'not in':
                    cond = ~cond
            if filters:
                filters = Filter(type="and",
                                 fields=[
                                     Filter.build_filter(cond),
                                     Filter.build_filter(filters)
                                 ])
            else:
                filters = cond

        if filters:
            qry['filter'] = filters

        client = self.cluster.get_pydruid_client()
        orig_filters = filters
        if timeseries_limit and is_timeseries:
            # Limit on the number of timeseries, doing a two-phases query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = "all"
            pre_qry['limit_spec'] = {
                "type": "default",
                "limit": timeseries_limit,
                'intervals': (inner_from_dttm.isoformat() + '/' +
                              inner_to_dttm.isoformat()),
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
            client.groupby(**pre_qry)
            query_str += "// Two phase query\n// Phase 1\n"
            query_str += json.dumps(client.query_dict, indent=2) + "\n"
            query_str += "//\nPhase 2 (built based on phase one's results)\n"
            df = client.export_pandas()
            if df is not None and not df.empty:
                dims = qry['dimensions']
                filters = []
                for _, row in df.iterrows():
                    fields = []
                    for dim in dims:
                        f = Filter.build_filter(Dimension(dim) == row[dim])
                        fields.append(f)
                    if len(fields) > 1:
                        filt = Filter(type="and", fields=fields)
                        filters.append(Filter.build_filter(filt))
                    elif fields:
                        filters.append(fields[0])

                if filters:
                    ff = Filter(type="or", fields=filters)
                    if not orig_filters:
                        qry['filter'] = ff
                    else:
                        qry['filter'] = Filter(
                            type="and",
                            fields=[
                                Filter.build_filter(ff),
                                Filter.build_filter(orig_filters)
                            ])
                qry['limit_spec'] = None
        if row_limit:
            qry['limit_spec'] = {
                "type": "default",
                "limit": row_limit,
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
        client.groupby(**qry)
        query_str += json.dumps(client.query_dict, indent=2)
        df = client.export_pandas()
        if df is None or df.size == 0:
            raise Exception("No data was returned.")

        if (not is_timeseries and granularity == "all"
                and 'timestamp' in df.columns):
            del df['timestamp']

        # Reordering columns
        cols = []
        if 'timestamp' in df.columns:
            cols += ['timestamp']
        cols += [col for col in groupby if col in df.columns]
        cols += [col for col in metrics if col in df.columns]
        cols += [col for col in df.columns if col not in cols]
        df = df[cols]
        return QueryResult(df=df,
                           query=query_str,
                           duration=datetime.now() - qry_start_dttm)
Example #16
    def query(  # druid
            self, groupby, metrics,
            granularity,
            from_dttm, to_dttm,
            filter=None,  # noqa
            is_timeseries=True,
            timeseries_limit=None,
            row_limit=None,
            inner_from_dttm=None, inner_to_dttm=None,
            extras=None,  # noqa
            select=None,):  # noqa
        """Runs a query against Druid and returns a dataframe.

        This query interface is common to SqlAlchemy and Druid
        """
        # TODO refactor into using a TBD Query object
        qry_start_dttm = datetime.now()

        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
        to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

        query_str = ""
        metrics_dict = {m.metric_name: m for m in self.metrics}
        all_metrics = []
        post_aggs = {}
        for metric_name in metrics:
            metric = metrics_dict[metric_name]
            if metric.metric_type != 'postagg':
                all_metrics.append(metric_name)
            else:
                conf = metric.json_obj
                fields = conf.get('fields', [])
                all_metrics += [
                    f.get('fieldName') for f in fields
                    if f.get('type') == 'fieldAccess']
                all_metrics += conf.get('fieldNames', [])
                if conf.get('type') == 'javascript':
                    post_aggs[metric_name] = JavascriptPostAggregator(
                        name=conf.get('name'),
                        field_names=conf.get('fieldNames'),
                        function=conf.get('function'))
                else:
                    post_aggs[metric_name] = Postaggregator(
                        conf.get('fn', "/"),
                        conf.get('fields', []),
                        conf.get('name', ''))
        aggregations = {
            m.metric_name: m.json_obj
            for m in self.metrics
            if m.metric_name in all_metrics
        }
        granularity = granularity or "all"
        if granularity != "all":
            granularity = utils.parse_human_timedelta(
                granularity).total_seconds() * 1000
        if not isinstance(granularity, string_types):
            granularity = {"type": "duration", "duration": granularity}
            origin = extras.get('druid_time_origin')
            if origin:
                dttm = utils.parse_human_datetime(origin)
                granularity['origin'] = dttm.isoformat()

        qry = dict(
            datasource=self.datasource_name,
            dimensions=groupby,
            aggregations=aggregations,
            granularity=granularity,
            post_aggregations=post_aggs,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )
        filters = None
        for col, op, eq in filter:
            cond = None
            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = ~(Dimension(col) == eq)
            elif op in ('in', 'not in'):
                fields = []
                splitted = eq.split(',')
                if len(splitted) > 1:
                    for s in splitted:
                        s = s.strip()
                        fields.append(Filter.build_filter(Dimension(col) == s))
                    cond = Filter(type="or", fields=fields)
                else:
                    cond = Dimension(col) == eq
                if op == 'not in':
                    cond = ~cond
            if filters:
                filters = Filter(type="and", fields=[
                    Filter.build_filter(cond),
                    Filter.build_filter(filters)
                ])
            else:
                filters = cond

        if filters:
            qry['filter'] = filters

        client = self.cluster.get_pydruid_client()
        orig_filters = filters
        if timeseries_limit and is_timeseries:
            # Limit on the number of timeseries, doing a two-phases query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = "all"
            pre_qry['limit_spec'] = {
                "type": "default",
                "limit": timeseries_limit,
                'intervals': (
                    inner_from_dttm.isoformat() + '/' +
                    inner_to_dttm.isoformat()),
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
            client.groupby(**pre_qry)
            query_str += "// Two phase query\n// Phase 1\n"
            query_str += json.dumps(client.query_dict, indent=2) + "\n"
            query_str += "//\nPhase 2 (built based on phase one's results)\n"
            df = client.export_pandas()
            if df is not None and not df.empty:
                dims = qry['dimensions']
                filters = []
                for unused, row in df.iterrows():
                    fields = []
                    for dim in dims:
                        f = Filter.build_filter(Dimension(dim) == row[dim])
                        fields.append(f)
                    if len(fields) > 1:
                        filt = Filter(type="and", fields=fields)
                        filters.append(Filter.build_filter(filt))
                    elif fields:
                        filters.append(fields[0])

                if filters:
                    ff = Filter(type="or", fields=filters)
                    if not orig_filters:
                        qry['filter'] = ff
                    else:
                        qry['filter'] = Filter(type="and", fields=[
                            Filter.build_filter(ff),
                            Filter.build_filter(orig_filters)])
                qry['limit_spec'] = None
        if row_limit:
            qry['limit_spec'] = {
                "type": "default",
                "limit": row_limit,
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
        client.groupby(**qry)
        query_str += json.dumps(client.query_dict, indent=2)
        df = client.export_pandas()
        if df is None or df.size == 0:
            raise Exception(_("No data was returned."))

        if (
                not is_timeseries and
                granularity == "all" and
                'timestamp' in df.columns):
            del df['timestamp']

        # Reordering columns
        cols = []
        if 'timestamp' in df.columns:
            cols += ['timestamp']
        cols += [col for col in groupby if col in df.columns]
        cols += [col for col in metrics if col in df.columns]
        df = df[cols]
        return QueryResult(
            df=df,
            query=query_str,
            duration=datetime.now() - qry_start_dttm)
Example #17
    def query(  # druid
        self,
        groupby,
        metrics,
        granularity,
        from_dttm,
        to_dttm,
        filter=None,  # noqa
        is_timeseries=True,
        timeseries_limit=None,
        row_limit=None,
        inner_from_dttm=None,
        inner_to_dttm=None,
        extras=None,  # noqa
        select=None,
    ):  # noqa
        """Runs a query against Druid and returns a dataframe.

        This query interface is common to SqlAlchemy and Druid
        """
        # TODO refactor into using a TBD Query object
        qry_start_dttm = datetime.now()

        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
        to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

        query_str = ""
        metrics_dict = {m.metric_name: m for m in self.metrics}
        all_metrics = []
        post_aggs = {}

        def recursive_get_fields(_conf):
            _fields = _conf.get('fields', [])
            field_names = []
            for _f in _fields:
                _type = _f.get('type')
                if _type in ['fieldAccess', 'hyperUniqueCardinality']:
                    field_names.append(_f.get('fieldName'))
                elif _type == 'arithmetic':
                    field_names += recursive_get_fields(_f)

            return list(set(field_names))

        for metric_name in metrics:
            metric = metrics_dict[metric_name]
            if metric.metric_type != 'postagg':
                all_metrics.append(metric_name)
            else:
                conf = metric.json_obj
                all_metrics += recursive_get_fields(conf)
                all_metrics += conf.get('fieldNames', [])
                if conf.get('type') == 'javascript':
                    post_aggs[metric_name] = JavascriptPostAggregator(
                        name=conf.get('name'),
                        field_names=conf.get('fieldNames'),
                        function=conf.get('function'))
                else:
                    post_aggs[metric_name] = Postaggregator(
                        conf.get('fn', "/"), conf.get('fields', []),
                        conf.get('name', ''))
        aggregations = {
            m.metric_name: m.json_obj
            for m in self.metrics if m.metric_name in all_metrics
        }
        granularity = granularity or "all"
        if granularity != "all":
            granularity = utils.parse_human_timedelta(
                granularity).total_seconds() * 1000
        if not isinstance(granularity, string_types):
            granularity = {"type": "duration", "duration": granularity}
            origin = extras.get('druid_time_origin')
            if origin:
                dttm = utils.parse_human_datetime(origin)
                granularity['origin'] = dttm.isoformat()

        qry = dict(
            datasource=self.datasource_name,
            dimensions=groupby,
            aggregations=aggregations,
            granularity=granularity,
            post_aggregations=post_aggs,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )
        filters = None
        for col, op, eq in filter:
            cond = None
            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = ~(Dimension(col) == eq)
            elif op in ('in', 'not in'):
                fields = []
                splitted = eq.split(',')
                if len(splitted) > 1:
                    for s in splitted:
                        s = s.strip()
                        fields.append(Filter.build_filter(Dimension(col) == s))
                    cond = Filter(type="or", fields=fields)
                else:
                    cond = Dimension(col) == eq
                if op == 'not in':
                    cond = ~cond
            elif op == 'regex':
                cond = Filter(type="regex", pattern=eq, dimension=col)
            if filters:
                filters = Filter(type="and",
                                 fields=[
                                     Filter.build_filter(cond),
                                     Filter.build_filter(filters)
                                 ])
            else:
                filters = cond

        if filters:
            qry['filter'] = filters

        client = self.cluster.get_pydruid_client()
        orig_filters = filters
        if timeseries_limit and is_timeseries:
            # Limit on the number of timeseries, doing a two-phases query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = "all"
            pre_qry['limit_spec'] = {
                "type": "default",
                "limit": timeseries_limit,
                'intervals': (inner_from_dttm.isoformat() + '/' +
                              inner_to_dttm.isoformat()),
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
            client.groupby(**pre_qry)
            query_str += "// Two phase query\n// Phase 1\n"
            query_str += json.dumps(client.query_builder.last_query.query_dict,
                                    indent=2) + "\n"
            query_str += "//\nPhase 2 (built based on phase one's results)\n"
            df = client.export_pandas()
            if df is not None and not df.empty:
                dims = qry['dimensions']
                filters = []
                for unused, row in df.iterrows():
                    fields = []
                    for dim in dims:
                        f = Filter.build_filter(Dimension(dim) == row[dim])
                        fields.append(f)
                    if len(fields) > 1:
                        filt = Filter(type="and", fields=fields)
                        filters.append(Filter.build_filter(filt))
                    elif fields:
                        filters.append(fields[0])

                if filters:
                    ff = Filter(type="or", fields=filters)
                    if not orig_filters:
                        qry['filter'] = ff
                    else:
                        qry['filter'] = Filter(
                            type="and",
                            fields=[
                                Filter.build_filter(ff),
                                Filter.build_filter(orig_filters)
                            ])
                qry['limit_spec'] = None
        if row_limit:
            qry['limit_spec'] = {
                "type": "default",
                "limit": row_limit,
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
        client.groupby(**qry)
        query_str += json.dumps(client.query_builder.last_query.query_dict,
                                indent=2)
        df = client.export_pandas()
        if df is None or df.size == 0:
            raise Exception(_("No data was returned."))

        if (not is_timeseries and granularity == "all"
                and 'timestamp' in df.columns):
            del df['timestamp']

        # Reordering columns
        cols = []
        if 'timestamp' in df.columns:
            cols += ['timestamp']
        cols += [col for col in groupby if col in df.columns]
        cols += [col for col in metrics if col in df.columns]
        df = df[cols]
        return QueryResult(df=df,
                           query=query_str,
                           duration=datetime.now() - qry_start_dttm)
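Unlike the earlier variants, this last example also understands a 'regex' operator: the condition is built as Filter(type="regex", pattern=eq, dimension=col), which (assuming pydruid's regex filter support) serializes to {'type': 'regex', 'pattern': eq, 'dimension': col}.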