Example #1
 def check_restricted_metrics(self, aggregations):
     rejected_metrics = [
         m.metric_name for m in self.metrics
         if m.is_restricted and m.metric_name in aggregations.keys()
         and not sm.has_access('metric_access', m.perm)
     ]
     if rejected_metrics:
         raise MetricPermException(
             'Access to the metrics denied: ' + ', '.join(rejected_metrics))
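
A minimal usage sketch of the check above; the `datasource` object, the sample aggregations dict, and the exception handling are illustrative assumptions, not part of the example:

    # Hypothetical call site: validate an aggregations dict before querying.
    aggregations = {'count': {'type': 'count', 'name': 'count'}}
    try:
        datasource.check_restricted_metrics(aggregations)
    except MetricPermException as e:
        # A single rejected metric blocks the whole query.
        print('rejected: {}'.format(e))
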
Example #2
File: models.py, Project: johnsonc/caravel
 def check_restricted_metrics(self, aggregations):
     rejected_metrics = [
         m.metric_name for m in self.metrics
         if m.is_restricted and
         m.metric_name in aggregations.keys() and
         not sm.has_access('metric_access', m.perm)
     ]
     if rejected_metrics:
         raise MetricPermException(
             'Access to the metrics denied: ' + ', '.join(rejected_metrics),
         )
Example #3
 def get_dim_acl_where(self):
     # Dimension-level access control: build WHERE conditions restricting
     # each filterable column to the values the user may access.
     cols = {col.column_name: col
             for col in self.columns if self.has_col_access(col)}
     dim_acslist = security.get_permission_view_by_permission("dim_access")
     dim_acl_map = {}
     acl_where_clause_and = []
     for ac in dim_acslist:
         parts = ac.split('_')
         if (sm.has_access('dim_access', ac) and len(parts) > 1
                 and parts[0] in self.filterable_column_names):
             # Permission names are '<column>_<value>'; group values by column.
             dim_acl_map.setdefault(parts[0], []).append(parts[1])
     acl_filter = []
     for dim in dim_acl_map:
         acl_filter.append({
             'col': dim,
             'op': 'in',
             'val': dim_acl_map[dim],
         })
     for flt in acl_filter:
         if not all([flt.get(s) for s in ['col', 'op', 'val']]):
             continue
         col = flt['col']
         op = flt['op']
         eq = flt['val']
         if col not in cols:
             raise Exception(
                 "Column '{}' does not exist, or you do not have "
                 "permission to access it".format(col))
         col_obj = cols.get(col)
         if col_obj:
             if op in ('in', 'not in'):
                 values = []
                 for v in eq:
                     # For backwards compatibility and edge cases
                     # where a column data type might have changed
                     if isinstance(v, basestring):
                         v = v.strip("'").strip('"')
                         if col_obj.is_num:
                             v = utils.string_to_num(v)
                     # Removing empty strings and non-numeric values
                     # targeting numeric columns
                     if v is not None:
                         values.append(v)
                 cond = col_obj.sqla_col.in_(values)
                 acl_where_clause_and.append(cond)
     return acl_where_clause_and
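
A sketch of how the returned condition list might be applied; the `datasource` and `qry` objects are assumptions, and `and_` is the standard SQLAlchemy conjunction:

    from sqlalchemy import and_

    # Hypothetical: AND every dimension-ACL condition into an existing
    # SQLAlchemy select, so each filterable column is restricted to the
    # values the user is permitted to see.
    acl_conditions = datasource.get_dim_acl_where()
    if acl_conditions:
        qry = qry.where(and_(*acl_conditions))
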
Example #4
    def run_query(  # noqa / druid
            self,
            groupby,
            metrics,
            granularity,
            from_dttm,
            to_dttm,
            filter=None,  # noqa
            is_timeseries=True,
            timeseries_limit=None,
            timeseries_limit_metric=None,
            row_limit=None,
            inner_from_dttm=None,
            inner_to_dttm=None,
            orderby=None,
            extras=None,  # noqa
            select=None,  # noqa
            columns=None,
            phase=2,
            client=None,
            form_data=None,
            order_desc=True):
        """Runs a query against Druid and returns a dataframe.
        """
        # TODO refactor into using a TBD Query object
        client = client or self.cluster.get_pydruid_client()

        if not is_timeseries:
            granularity = 'all'
        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=DRUID_TZ)
        to_dttm = to_dttm.replace(tzinfo=DRUID_TZ)
        timezone = from_dttm.tzname()

        query_str = ''
        metrics_dict = {m.metric_name: m for m in self.metrics}

        columns_dict = {c.column_name: c for c in self.columns}

        all_metrics, post_aggs = self._metrics_and_post_aggs(
            metrics, metrics_dict)

        aggregations = OrderedDict()
        for m in self.metrics:
            if m.metric_name in all_metrics:
                aggregations[m.metric_name] = m.json_obj

        rejected_metrics = [
            m.metric_name for m in self.metrics
            if m.is_restricted and m.metric_name in aggregations.keys()
            and not sm.has_access('metric_access', m.perm)
        ]

        if rejected_metrics:
            raise MetricPermException(
                'Access to the metrics denied: ' + ', '.join(rejected_metrics))

        # the dimensions list with dimensionSpecs expanded
        dimensions = []
        groupby = [gb for gb in groupby if gb in columns_dict]
        for column_name in groupby:
            col = columns_dict.get(column_name)
            dim_spec = col.dimension_spec
            if dim_spec:
                dimensions.append(dim_spec)
            else:
                dimensions.append(column_name)
        qry = dict(
            datasource=self.datasource_name,
            dimensions=dimensions,
            aggregations=aggregations,
            granularity=DruidDatasource.granularity(
                granularity,
                timezone=timezone,
                origin=extras.get('druid_time_origin'),
            ),
            post_aggregations=post_aggs,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )

        filters = DruidDatasource.get_filters(filter, self.num_cols)
        if filters:
            qry['filter'] = filters

        having_filters = self.get_having_filters(extras.get('having_druid'))
        if having_filters:
            qry['having'] = having_filters
        order_direction = 'descending' if order_desc else 'ascending'
        if len(groupby) == 0 and not having_filters:
            del qry['dimensions']
            client.timeseries(**qry)

        if (not having_filters and len(groupby) == 1 and order_desc
                and not isinstance(list(qry.get('dimensions'))[0], dict)):
            dim = list(qry.get('dimensions'))[0]
            if timeseries_limit_metric:
                order_by = timeseries_limit_metric
            else:
                order_by = list(qry['aggregations'].keys())[0]
            # Limit on the number of timeseries, doing a two-phases query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = 'all'
            pre_qry['threshold'] = min(row_limit, timeseries_limit or row_limit)
            pre_qry['metric'] = order_by
            pre_qry['dimension'] = dim
            del pre_qry['dimensions']
            client.topn(**pre_qry)
            query_str += '// Two phase query\n// Phase 1\n'
            query_str += json.dumps(client.query_builder.last_query.query_dict,
                                    indent=2)
            query_str += '\n'
            if phase == 1:
                return query_str
            query_str += ("// Phase 2 (built based on phase one's results)\n")
            df = client.export_pandas()
            qry['filter'] = self._add_filter_from_pre_query_data(
                df, qry['dimensions'], filters)
            qry['threshold'] = timeseries_limit or 1000
            if row_limit and granularity == 'all':
                qry['threshold'] = row_limit
            qry['dimension'] = dim
            del qry['dimensions']
            qry['metric'] = list(qry['aggregations'].keys())[0]
            client.topn(**qry)
        elif len(groupby) > 1 or having_filters or not order_desc:
            # If grouping on multiple fields or using a having filter
            # we have to force a groupby query
            if timeseries_limit and is_timeseries:
                order_by = metrics[0] if metrics else self.metrics[0]
                if timeseries_limit_metric:
                    order_by = timeseries_limit_metric
                # Limit on the number of timeseries, doing a two-phases query
                pre_qry = deepcopy(qry)
                pre_qry['granularity'] = 'all'
                pre_qry['limit_spec'] = {
                    'type': 'default',
                    'limit': min(timeseries_limit, row_limit),
                    'intervals': (inner_from_dttm.isoformat() + '/' +
                                  inner_to_dttm.isoformat()),
                    'columns': [{
                        'dimension': order_by,
                        'direction': order_direction,
                    }],
                }
                client.groupby(**pre_qry)
                query_str += '// Two phase query\n// Phase 1\n'
                query_str += json.dumps(
                    client.query_builder.last_query.query_dict, indent=2)
                query_str += '\n'
                if phase == 1:
                    return query_str
                query_str += (
                    "// Phase 2 (built based on phase one's results)\n")
                df = client.export_pandas()
                qry['filter'] = self._add_filter_from_pre_query_data(
                    df,
                    qry['dimensions'],
                    filters,
                )
                qry['limit_spec'] = None
            if row_limit:
                qry['limit_spec'] = {
                    'type': 'default',
                    'limit': row_limit,
                    'columns': [{
                        'dimension': (metrics[0] if metrics else self.metrics[0]),
                        'direction': order_direction,
                    }],
                }
            client.groupby(**qry)
        query_str += json.dumps(client.query_builder.last_query.query_dict,
                                indent=2)
        return query_str
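
The two-phase topN flow above is easier to see in isolation. A condensed sketch using the same pydruid constructs (Dimension, Filter) that appear in the later examples; `k`, `dim`, and `order_by` stand in for values computed in the method:

    # Phase 1: a coarse topN over the whole range ('all' granularity) finds
    # the top-k values of the grouping dimension, ranked by order_by.
    pre_qry = deepcopy(qry)
    pre_qry['granularity'] = 'all'
    pre_qry['threshold'] = k
    pre_qry['metric'] = order_by
    pre_qry['dimension'] = dim
    del pre_qry['dimensions']
    client.topn(**pre_qry)

    # Phase 2: restrict the real, time-bucketed query to those values by
    # OR-ing one equality filter per surviving dimension value.
    df = client.export_pandas()
    fields = [Dimension(dim) == v for v in df[dim]]
    qry['filter'] = Filter(type='or', fields=fields)
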
Example #5
    def run_query(  # noqa / druid
            self,
            groupby,
            metrics,
            granularity,
            from_dttm,
            to_dttm,
            filter=None,  # noqa
            is_timeseries=True,
            timeseries_limit=None,
            timeseries_limit_metric=None,
            row_limit=None,
            inner_from_dttm=None,
            inner_to_dttm=None,
            orderby=None,
            extras=None,  # noqa
            select=None,  # noqa
            columns=None,
            phase=2,
            client=None,
            form_data=None):
        """Runs a query against Druid and returns a dataframe.
        """
        # TODO refactor into using a TBD Query object
        client = client or self.cluster.get_pydruid_client()

        if not is_timeseries:
            granularity = 'all'
        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=DRUID_TZ)
        to_dttm = to_dttm.replace(tzinfo=DRUID_TZ)
        timezone = from_dttm.tzname()

        query_str = ""
        metrics_dict = {m.metric_name: m for m in self.metrics}

        columns_dict = {c.column_name: c for c in self.columns}

        all_metrics, post_aggs = self._metrics_and_post_aggs(
            metrics, metrics_dict)

        aggregations = OrderedDict()
        for m in self.metrics:
            if m.metric_name in all_metrics:
                aggregations[m.metric_name] = m.json_obj

        rejected_metrics = [
            m.metric_name for m in self.metrics
            if m.is_restricted and m.metric_name in aggregations.keys()
            and not sm.has_access('metric_access', m.perm)
        ]

        if rejected_metrics:
            raise MetricPermException("Access to the metrics denied: " +
                                      ', '.join(rejected_metrics))

        # the dimensions list with dimensionSpecs expanded
        dimensions = []
        groupby = [gb for gb in groupby if gb in columns_dict]
        for column_name in groupby:
            col = columns_dict.get(column_name)
            dim_spec = col.dimension_spec
            if dim_spec:
                dimensions.append(dim_spec)
            else:
                dimensions.append(column_name)
        qry = dict(
            datasource=self.datasource_name,
            dimensions=dimensions,
            aggregations=aggregations,
            granularity=DruidDatasource.granularity(
                granularity,
                timezone=timezone,
                origin=extras.get('druid_time_origin'),
            ),
            post_aggregations=post_aggs,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )

        filters = self.get_filters(filter)
        if filters:
            qry['filter'] = filters

        having_filters = self.get_having_filters(extras.get('having_druid'))
        if having_filters:
            qry['having'] = having_filters

        orig_filters = filters
        if len(groupby) == 0 and not having_filters:
            del qry['dimensions']
            client.timeseries(**qry)
        if not having_filters and len(groupby) == 1:
            qry['threshold'] = timeseries_limit or 1000
            if row_limit and granularity == 'all':
                qry['threshold'] = row_limit
            qry['dimension'] = list(qry.get('dimensions'))[0]
            del qry['dimensions']
            qry['metric'] = list(qry['aggregations'].keys())[0]
            client.topn(**qry)
        elif len(groupby) > 1 or having_filters:
            # If grouping on multiple fields or using a having filter
            # we have to force a groupby query
            if timeseries_limit and is_timeseries:
                order_by = metrics[0] if metrics else self.metrics[0]
                if timeseries_limit_metric:
                    order_by = timeseries_limit_metric
                # Limit on the number of timeseries, doing a two-phases query
                pre_qry = deepcopy(qry)
                pre_qry['granularity'] = "all"
                pre_qry['limit_spec'] = {
                    "type": "default",
                    "limit": timeseries_limit,
                    'intervals': (inner_from_dttm.isoformat() + '/' +
                                  inner_to_dttm.isoformat()),
                    "columns": [{
                        "dimension": order_by,
                        "direction": "descending",
                    }],
                }
                client.groupby(**pre_qry)
                query_str += "// Two phase query\n// Phase 1\n"
                query_str += json.dumps(
                    client.query_builder.last_query.query_dict, indent=2)
                query_str += "\n"
                if phase == 1:
                    return query_str
                query_str += (
                    "// Phase 2 (built based on phase one's results)\n")
                df = client.export_pandas()
                if df is not None and not df.empty:
                    dims = qry['dimensions']
                    filters = []
                    for unused, row in df.iterrows():
                        fields = []
                        for dim in dims:
                            f = Dimension(dim) == row[dim]
                            fields.append(f)
                        if len(fields) > 1:
                            filt = Filter(type="and", fields=fields)
                            filters.append(filt)
                        elif fields:
                            filters.append(fields[0])

                    if filters:
                        ff = Filter(type="or", fields=filters)
                        if not orig_filters:
                            qry['filter'] = ff
                        else:
                            qry['filter'] = Filter(type="and",
                                                   fields=[ff, orig_filters])
                    qry['limit_spec'] = None
            if row_limit:
                qry['limit_spec'] = {
                    "type": "default",
                    "limit": row_limit,
                    "columns": [{
                        "dimension": (metrics[0] if metrics else self.metrics[0]),
                        "direction": "descending",
                    }],
                }
            client.groupby(**qry)
        query_str += json.dumps(client.query_builder.last_query.query_dict,
                                indent=2)
        return query_str
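
For reference, the `limit_spec` dicts built above map one-to-one onto Druid's groupBy limitSpec JSON; a sketch of the shape with made-up values:

    # Illustrative shape only; field values are invented.
    limit_spec = {
        'type': 'default',          # Druid's default limitSpec
        'limit': 5000,              # cap on the number of result rows
        'columns': [{               # ordering applied before the cap
            'dimension': 'count',
            'direction': 'descending',
        }],
    }
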
Example #6
    def run_query(  # noqa / druid
            self,
            groupby, metrics,
            granularity,
            from_dttm, to_dttm,
            filter=None,  # noqa
            is_timeseries=True,
            timeseries_limit=None,
            timeseries_limit_metric=None,
            row_limit=None,
            inner_from_dttm=None, inner_to_dttm=None,
            orderby=None,
            extras=None,  # noqa
            select=None,  # noqa
            columns=None, phase=2, client=None, form_data=None,
            order_desc=True):
        """Runs a query against Druid and returns a dataframe.
        """
        # TODO refactor into using a TBD Query object
        client = client or self.cluster.get_pydruid_client()

        if not is_timeseries:
            granularity = 'all'
        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=DRUID_TZ)
        to_dttm = to_dttm.replace(tzinfo=DRUID_TZ)
        timezone = from_dttm.tzname()

        query_str = ""
        metrics_dict = {m.metric_name: m for m in self.metrics}

        columns_dict = {c.column_name: c for c in self.columns}

        all_metrics, post_aggs = self._metrics_and_post_aggs(
            metrics, metrics_dict)

        aggregations = OrderedDict()
        for m in self.metrics:
            if m.metric_name in all_metrics:
                aggregations[m.metric_name] = m.json_obj

        rejected_metrics = [
            m.metric_name for m in self.metrics
            if m.is_restricted and
            m.metric_name in aggregations.keys() and
            not sm.has_access('metric_access', m.perm)
        ]

        if rejected_metrics:
            raise MetricPermException(
                "Access to the metrics denied: " + ', '.join(rejected_metrics)
            )

        # the dimensions list with dimensionSpecs expanded
        dimensions = []
        groupby = [gb for gb in groupby if gb in columns_dict]
        for column_name in groupby:
            col = columns_dict.get(column_name)
            dim_spec = col.dimension_spec
            if dim_spec:
                dimensions.append(dim_spec)
            else:
                dimensions.append(column_name)
        qry = dict(
            datasource=self.datasource_name,
            dimensions=dimensions,
            aggregations=aggregations,
            granularity=DruidDatasource.granularity(
                granularity,
                timezone=timezone,
                origin=extras.get('druid_time_origin'),
            ),
            post_aggregations=post_aggs,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )

        filters = self.get_filters(filter)
        if filters:
            qry['filter'] = filters

        having_filters = self.get_having_filters(extras.get('having_druid'))
        if having_filters:
            qry['having'] = having_filters
        order_direction = "descending" if order_desc else "ascending"
        if len(groupby) == 0 and not having_filters:
            del qry['dimensions']
            client.timeseries(**qry)
        if not having_filters and len(groupby) == 1 and order_desc:
            dim = list(qry.get('dimensions'))[0]
            if timeseries_limit_metric:
                order_by = timeseries_limit_metric
            else:
                order_by = list(qry['aggregations'].keys())[0]
            # Limit on the number of timeseries, doing a two-phases query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = "all"
            pre_qry['threshold'] = min(row_limit,
                                       timeseries_limit or row_limit)
            pre_qry['metric'] = order_by
            pre_qry['dimension'] = dim
            del pre_qry['dimensions']
            client.topn(**pre_qry)
            query_str += "// Two phase query\n// Phase 1\n"
            query_str += json.dumps(
                client.query_builder.last_query.query_dict, indent=2)
            query_str += "\n"
            if phase == 1:
                return query_str
            query_str += (
                "// Phase 2 (built based on phase one's results)\n")
            df = client.export_pandas()
            qry['filter'] = self._add_filter_from_pre_query_data(
                df, qry['dimensions'], filters)
            qry['threshold'] = timeseries_limit or 1000
            if row_limit and granularity == 'all':
                qry['threshold'] = row_limit
            qry['dimension'] = dim
            del qry['dimensions']
            qry['metric'] = list(qry['aggregations'].keys())[0]
            client.topn(**qry)
        elif len(groupby) > 1 or having_filters or not order_desc:
            # If grouping on multiple fields or using a having filter
            # we have to force a groupby query
            if timeseries_limit and is_timeseries:
                order_by = metrics[0] if metrics else self.metrics[0]
                if timeseries_limit_metric:
                    order_by = timeseries_limit_metric
                # Limit on the number of timeseries, doing a two-phases query
                pre_qry = deepcopy(qry)
                pre_qry['granularity'] = "all"
                pre_qry['limit_spec'] = {
                    "type": "default",
                    "limit": min(timeseries_limit, row_limit),
                    'intervals': (
                        inner_from_dttm.isoformat() + '/' +
                        inner_to_dttm.isoformat()),
                    "columns": [{
                        "dimension": order_by,
                        "direction": order_direction,
                    }],
                }
                client.groupby(**pre_qry)
                query_str += "// Two phase query\n// Phase 1\n"
                query_str += json.dumps(
                    client.query_builder.last_query.query_dict, indent=2)
                query_str += "\n"
                if phase == 1:
                    return query_str
                query_str += (
                    "// Phase 2 (built based on phase one's results)\n")
                df = client.export_pandas()
                qry['filter'] = self._add_filter_from_pre_query_data(
                    df, qry['dimensions'], filters)
                qry['limit_spec'] = None
            if row_limit:
                qry['limit_spec'] = {
                    "type": "default",
                    "limit": row_limit,
                    "columns": [{
                        "dimension": (
                            metrics[0] if metrics else self.metrics[0]),
                        "direction": order_direction,
                    }],
                }
            client.groupby(**qry)
        query_str += json.dumps(
            client.query_builder.last_query.query_dict, indent=2)
        return query_str
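
Examples #4 and #6 call `_add_filter_from_pre_query_data`, which is not shown here. Based on the inline logic in Examples #5 and #7, a plausible reconstruction looks like this (the exact upstream signature and edge-case handling are assumptions):

    def _add_filter_from_pre_query_data(self, df, dimensions, orig_filters):
        # Each pre-query result row becomes an AND of per-dimension equality
        # filters; the rows are then OR-ed together.
        if df is None or df.empty:
            return orig_filters
        row_filters = []
        for _, row in df.iterrows():
            fields = [Dimension(dim) == row[dim] for dim in dimensions]
            if len(fields) > 1:
                row_filters.append(Filter(type='and', fields=fields))
            elif fields:
                row_filters.append(fields[0])
        if not row_filters:
            return orig_filters
        ff = Filter(type='or', fields=row_filters)
        # Preserve any pre-existing filters by AND-ing them in.
        if orig_filters:
            return Filter(type='and', fields=[ff, orig_filters])
        return ff
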
Example #7
    def run_query(  # noqa / druid
            self,
            groupby, metrics,
            granularity,
            from_dttm, to_dttm,
            filter=None,  # noqa
            is_timeseries=True,
            timeseries_limit=None,
            timeseries_limit_metric=None,
            row_limit=None,
            inner_from_dttm=None, inner_to_dttm=None,
            orderby=None,
            extras=None,  # noqa
            select=None,  # noqa
            columns=None, phase=2, client=None, form_data=None):
        """Runs a query against Druid and returns a dataframe.
        """
        # TODO refactor into using a TBD Query object
        client = client or self.cluster.get_pydruid_client()
        if not is_timeseries:
            granularity = 'all'
        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=DRUID_TZ)
        to_dttm = to_dttm.replace(tzinfo=DRUID_TZ)
        timezone = from_dttm.tzname()

        query_str = ""
        metrics_dict = {m.metric_name: m for m in self.metrics}
        all_metrics = []
        post_aggs = {}

        columns_dict = {c.column_name: c for c in self.columns}

        def recursive_get_fields(_conf):
            _fields = _conf.get('fields', [])
            field_names = []
            for _f in _fields:
                _type = _f.get('type')
                if _type in ['fieldAccess', 'hyperUniqueCardinality']:
                    field_names.append(_f.get('fieldName'))
                elif _type == 'arithmetic':
                    field_names += recursive_get_fields(_f)
            return list(set(field_names))

        for metric_name in metrics:
            metric = metrics_dict[metric_name]
            if metric.metric_type != 'postagg':
                all_metrics.append(metric_name)
            else:
                mconf = metric.json_obj
                all_metrics += recursive_get_fields(mconf)
                all_metrics += mconf.get('fieldNames', [])
                if mconf.get('type') == 'javascript':
                    post_aggs[metric_name] = JavascriptPostAggregator(
                        name=mconf.get('name', ''),
                        field_names=mconf.get('fieldNames', []),
                        function=mconf.get('function', ''))
                elif mconf.get('type') == 'quantile':
                    post_aggs[metric_name] = Quantile(
                        mconf.get('name', ''),
                        mconf.get('probability', ''),
                    )
                elif mconf.get('type') == 'quantiles':
                    post_aggs[metric_name] = Quantiles(
                        mconf.get('name', ''),
                        mconf.get('probabilities', ''),
                    )
                elif mconf.get('type') == 'fieldAccess':
                    post_aggs[metric_name] = Field(mconf.get('name'))
                elif mconf.get('type') == 'constant':
                    post_aggs[metric_name] = Const(
                        mconf.get('value'),
                        output_name=mconf.get('name', '')
                    )
                elif mconf.get('type') == 'hyperUniqueCardinality':
                    post_aggs[metric_name] = HyperUniqueCardinality(
                        mconf.get('name')
                    )
                else:
                    post_aggs[metric_name] = Postaggregator(
                        mconf.get('fn', "/"),
                        mconf.get('fields', []),
                        mconf.get('name', ''))

        aggregations = OrderedDict()
        for m in self.metrics:
            if m.metric_name in all_metrics:
                aggregations[m.metric_name] = m.json_obj

        rejected_metrics = [
            m.metric_name for m in self.metrics
            if m.is_restricted and
            m.metric_name in aggregations.keys() and
            not sm.has_access('metric_access', m.perm)
        ]

        if rejected_metrics:
            raise MetricPermException(
                "Access to the metrics denied: " + ', '.join(rejected_metrics)
            )

        # the dimensions list with dimensionSpecs expanded
        dimensions = []
        groupby = [gb for gb in groupby if gb in columns_dict]
        for column_name in groupby:
            col = columns_dict.get(column_name)
            dim_spec = col.dimension_spec
            if dim_spec:
                dimensions.append(dim_spec)
            else:
                dimensions.append(column_name)
        qry = dict(
            datasource=self.datasource_name,
            dimensions=dimensions,
            aggregations=aggregations,
            granularity=DruidDatasource.granularity(
                granularity,
                timezone=timezone,
                origin=extras.get('druid_time_origin'),
            ),
            post_aggregations=post_aggs,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )

        filters = self.get_filters(filter)
        if filters:
            qry['filter'] = filters

        having_filters = self.get_having_filters(extras.get('having_druid'))
        if having_filters:
            qry['having'] = having_filters

        orig_filters = filters
        if len(groupby) == 0 and not having_filters:
            del qry['dimensions']
            client.timeseries(**qry)
        if not having_filters and len(groupby) == 1:
            qry['threshold'] = timeseries_limit or 1000
            if row_limit and granularity == 'all':
                qry['threshold'] = row_limit
            qry['dimension'] = list(qry.get('dimensions'))[0]
            del qry['dimensions']
            qry['metric'] = list(qry['aggregations'].keys())[0]
            client.topn(**qry)
        elif len(groupby) > 1 or having_filters:
            # If grouping on multiple fields or using a having filter
            # we have to force a groupby query
            if timeseries_limit and is_timeseries:
                order_by = metrics[0] if metrics else self.metrics[0]
                if timeseries_limit_metric:
                    order_by = timeseries_limit_metric
                # Limit on the number of timeseries, doing a two-phases query
                pre_qry = deepcopy(qry)
                pre_qry['granularity'] = "all"
                pre_qry['limit_spec'] = {
                    "type": "default",
                    "limit": timeseries_limit,
                    'intervals': (
                        inner_from_dttm.isoformat() + '/' +
                        inner_to_dttm.isoformat()),
                    "columns": [{
                        "dimension": order_by,
                        "direction": "descending",
                    }],
                }
                client.groupby(**pre_qry)
                query_str += "// Two phase query\n// Phase 1\n"
                query_str += json.dumps(
                    client.query_builder.last_query.query_dict, indent=2)
                query_str += "\n"
                if phase == 1:
                    return query_str
                query_str += (
                    "// Phase 2 (built based on phase one's results)\n")
                df = client.export_pandas()
                if df is not None and not df.empty:
                    dims = qry['dimensions']
                    filters = []
                    for unused, row in df.iterrows():
                        fields = []
                        for dim in dims:
                            f = Dimension(dim) == row[dim]
                            fields.append(f)
                        if len(fields) > 1:
                            filt = Filter(type="and", fields=fields)
                            filters.append(filt)
                        elif fields:
                            filters.append(fields[0])

                    if filters:
                        ff = Filter(type="or", fields=filters)
                        if not orig_filters:
                            qry['filter'] = ff
                        else:
                            qry['filter'] = Filter(type="and", fields=[
                                ff,
                                orig_filters])
                    qry['limit_spec'] = None
            if row_limit:
                qry['limit_spec'] = {
                    "type": "default",
                    "limit": row_limit,
                    "columns": [{
                        "dimension": (
                            metrics[0] if metrics else self.metrics[0]),
                        "direction": "descending",
                    }],
                }
            client.groupby(**qry)
        query_str += json.dumps(
            client.query_builder.last_query.query_dict, indent=2)
        return query_str
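
A small sketch of what `recursive_get_fields` collects from a nested 'arithmetic' post-aggregation config; the config below is made up for illustration:

    # Hypothetical post-aggregation config: a ratio built from leaf fields.
    mconf = {
        'type': 'arithmetic',
        'fn': '/',
        'fields': [
            {'type': 'fieldAccess', 'fieldName': 'clicks'},
            {'type': 'arithmetic', 'fn': '+', 'fields': [
                {'type': 'fieldAccess', 'fieldName': 'impressions'},
                {'type': 'hyperUniqueCardinality', 'fieldName': 'users'},
            ]},
        ],
    }
    # recursive_get_fields(mconf) walks the tree and returns the leaf
    # fieldNames: ['clicks', 'impressions', 'users'] (order not guaranteed,
    # since duplicates are removed via a set).
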
Example #8
File: models.py, Project: yiday/superset
    def get_query_str(  # noqa / druid
            self,
            client,
            qry_start_dttm,
            groupby,
            metrics,
            granularity,
            from_dttm,
            to_dttm,
            filter=None,  # noqa
            is_timeseries=True,
            timeseries_limit=None,
            timeseries_limit_metric=None,
            row_limit=None,
            inner_from_dttm=None,
            inner_to_dttm=None,
            orderby=None,
            extras=None,  # noqa
            select=None,  # noqa
            columns=None,
            phase=2):
        """Runs a query against Druid and returns a dataframe.

        This query interface is common to SqlAlchemy and Druid
        """
        # TODO refactor into using a TBD Query object
        if not is_timeseries:
            granularity = 'all'
        inner_from_dttm = inner_from_dttm or from_dttm
        inner_to_dttm = inner_to_dttm or to_dttm

        # add tzinfo to native datetime with config
        from_dttm = from_dttm.replace(tzinfo=DRUID_TZ)
        to_dttm = to_dttm.replace(tzinfo=DRUID_TZ)
        timezone = from_dttm.tzname()

        query_str = ""
        metrics_dict = {m.metric_name: m for m in self.metrics}
        all_metrics = []
        post_aggs = {}

        columns_dict = {c.column_name: c for c in self.columns}

        def recursive_get_fields(_conf):
            _fields = _conf.get('fields', [])
            field_names = []
            for _f in _fields:
                _type = _f.get('type')
                if _type in ['fieldAccess', 'hyperUniqueCardinality']:
                    field_names.append(_f.get('fieldName'))
                elif _type == 'arithmetic':
                    field_names += recursive_get_fields(_f)
            return list(set(field_names))

        for metric_name in metrics:
            metric = metrics_dict[metric_name]
            if metric.metric_type != 'postagg':
                all_metrics.append(metric_name)
            else:
                mconf = metric.json_obj
                all_metrics += recursive_get_fields(mconf)
                all_metrics += mconf.get('fieldNames', [])
                if mconf.get('type') == 'javascript':
                    post_aggs[metric_name] = JavascriptPostAggregator(
                        name=mconf.get('name', ''),
                        field_names=mconf.get('fieldNames', []),
                        function=mconf.get('function', ''))
                elif mconf.get('type') == 'quantile':
                    post_aggs[metric_name] = Quantile(
                        mconf.get('name', ''),
                        mconf.get('probability', ''),
                    )
                elif mconf.get('type') == 'quantiles':
                    post_aggs[metric_name] = Quantiles(
                        mconf.get('name', ''),
                        mconf.get('probabilities', ''),
                    )
                elif mconf.get('type') == 'fieldAccess':
                    post_aggs[metric_name] = Field(mconf.get('name'))
                elif mconf.get('type') == 'constant':
                    post_aggs[metric_name] = Const(
                        mconf.get('value'), output_name=mconf.get('name', ''))
                elif mconf.get('type') == 'hyperUniqueCardinality':
                    post_aggs[metric_name] = HyperUniqueCardinality(
                        mconf.get('name'))
                else:
                    post_aggs[metric_name] = Postaggregator(
                        mconf.get('fn', "/"), mconf.get('fields', []),
                        mconf.get('name', ''))

        aggregations = OrderedDict()
        for m in self.metrics:
            if m.metric_name in all_metrics:
                aggregations[m.metric_name] = m.json_obj

        rejected_metrics = [
            m.metric_name for m in self.metrics
            if m.is_restricted and m.metric_name in aggregations.keys()
            and not sm.has_access('metric_access', m.perm)
        ]

        if rejected_metrics:
            raise MetricPermException("Access to the metrics denied: " +
                                      ', '.join(rejected_metrics))

        # the dimensions list with dimensionSpecs expanded
        dimensions = []
        groupby = [gb for gb in groupby if gb in columns_dict]
        for column_name in groupby:
            col = columns_dict.get(column_name)
            dim_spec = col.dimension_spec
            if dim_spec:
                dimensions.append(dim_spec)
            else:
                dimensions.append(column_name)
        qry = dict(
            datasource=self.datasource_name,
            dimensions=dimensions,
            aggregations=aggregations,
            granularity=DruidDatasource.granularity(
                granularity,
                timezone=timezone,
                origin=extras.get('druid_time_origin'),
            ),
            post_aggregations=post_aggs,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )

        filters = self.get_filters(filter)
        if filters:
            qry['filter'] = filters

        having_filters = self.get_having_filters(extras.get('having_druid'))
        if having_filters:
            qry['having'] = having_filters

        orig_filters = filters
        if len(groupby) == 0:
            del qry['dimensions']
            client.timeseries(**qry)
        if not having_filters and len(groupby) == 1:
            qry['threshold'] = timeseries_limit or 1000
            if row_limit and granularity == 'all':
                qry['threshold'] = row_limit
            qry['dimension'] = list(qry.get('dimensions'))[0]
            del qry['dimensions']
            qry['metric'] = list(qry['aggregations'].keys())[0]
            client.topn(**qry)
        elif len(groupby) > 1 or having_filters:
            # If grouping on multiple fields or using a having filter
            # we have to force a groupby query
            if timeseries_limit and is_timeseries:
                order_by = metrics[0] if metrics else self.metrics[0]
                if timeseries_limit_metric:
                    order_by = timeseries_limit_metric
                # Limit on the number of timeseries, doing a two-phases query
                pre_qry = deepcopy(qry)
                pre_qry['granularity'] = "all"
                pre_qry['limit_spec'] = {
                    "type": "default",
                    "limit": timeseries_limit,
                    'intervals': (inner_from_dttm.isoformat() + '/' +
                                  inner_to_dttm.isoformat()),
                    "columns": [{
                        "dimension": order_by,
                        "direction": "descending",
                    }],
                }
                client.groupby(**pre_qry)
                query_str += "// Two phase query\n// Phase 1\n"
                query_str += json.dumps(
                    client.query_builder.last_query.query_dict, indent=2)
                query_str += "\n"
                if phase == 1:
                    return query_str
                query_str += (
                    "// Phase 2 (built based on phase one's results)\n")
                df = client.export_pandas()
                if df is not None and not df.empty:
                    dims = qry['dimensions']
                    filters = []
                    for unused, row in df.iterrows():
                        fields = []
                        for dim in dims:
                            f = Dimension(dim) == row[dim]
                            fields.append(f)
                        if len(fields) > 1:
                            filt = Filter(type="and", fields=fields)
                            filters.append(filt)
                        elif fields:
                            filters.append(fields[0])

                    if filters:
                        ff = Filter(type="or", fields=filters)
                        if not orig_filters:
                            qry['filter'] = ff
                        else:
                            qry['filter'] = Filter(type="and",
                                                   fields=[ff, orig_filters])
                    qry['limit_spec'] = None
            if row_limit:
                qry['limit_spec'] = {
                    "type": "default",
                    "limit": row_limit,
                    "columns": [{
                        "dimension": (metrics[0] if metrics else self.metrics[0]),
                        "direction": "descending",
                    }],
                }
            client.groupby(**qry)
        query_str += json.dumps(client.query_builder.last_query.query_dict,
                                indent=2)
        return query_str
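
A sketch of the config-to-post-aggregator mapping for the 'quantile' branch above, mirroring the call as written in the example; the metric JSON is a made-up illustration:

    # Hypothetical json_obj for a postagg metric of type 'quantile'.
    mconf = {'type': 'quantile', 'name': 'p95_latency', 'probability': 0.95}
    post_aggs['p95_latency'] = Quantile(
        mconf.get('name', ''),
        mconf.get('probability', ''),
    )
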
Example #9
 def has_met_access(m):
     # Accessible when the metric is unrestricted, or when the user holds
     # the corresponding metric_access permission.
     return not m.is_restricted or sm.has_access('metric_access', m.perm)
Example #10
 def has_col_access(c):
     # Accessible when the column is unrestricted, or when the user holds
     # the corresponding columns_access permission.
     return not c.is_restricted or sm.has_access('columns_access', c.perm)
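
A sketch of how the two helpers might be combined to pre-filter what a user can see; the `datasource` object is an assumption:

    # Hypothetical: build the lists of metric and column names the current
    # user is allowed to access.
    accessible_metrics = [
        m.metric_name for m in datasource.metrics if has_met_access(m)]
    accessible_columns = [
        c.column_name for c in datasource.columns if has_col_access(c)]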