def get_post_agg(mconf):
    """
    For a metric specified as `postagg`, returns the matching
    pydruid post-aggregation object.
    """
    if mconf.get('type') == 'javascript':
        return JavascriptPostAggregator(
            name=mconf.get('name', ''),
            field_names=mconf.get('fieldNames', []),
            function=mconf.get('function', ''))
    elif mconf.get('type') == 'quantile':
        return Quantile(
            mconf.get('name', ''),
            mconf.get('probability', ''),
        )
    elif mconf.get('type') == 'quantiles':
        return Quantiles(
            mconf.get('name', ''),
            mconf.get('probabilities', ''),
        )
    elif mconf.get('type') == 'fieldAccess':
        return Field(mconf.get('name'))
    elif mconf.get('type') == 'constant':
        return Const(
            mconf.get('value'),
            output_name=mconf.get('name', ''),
        )
    elif mconf.get('type') == 'hyperUniqueCardinality':
        return HyperUniqueCardinality(mconf.get('name'))
    elif mconf.get('type') == 'arithmetic':
        return Postaggregator(
            mconf.get('fn', '/'),
            mconf.get('fields', []),
            mconf.get('name', ''))
    else:
        return CustomPostAggregator(mconf.get('name', ''), mconf)
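
# Illustrative sketch (not from the original module; the config values are
# made up): a stored `postagg` metric of type 'arithmetic' resolves to a
# pydruid Postaggregator dividing one field access by another.
#
#     example_conf = {
#         'type': 'arithmetic',
#         'name': 'avg_price',
#         'fn': '/',
#         'fields': [
#             {'type': 'fieldAccess', 'fieldName': 'price_sum'},
#             {'type': 'fieldAccess', 'fieldName': 'price_count'},
#         ],
#     }
#     get_post_agg(example_conf)  # -> Postaggregator('/', fields, 'avg_price')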
def _build_from_node(node):
    node_type = type(node).__name__
    # Variables encountered are considered to be druid data fields
    if node_type == 'Name':
        return Field(node.id)
    # `Num` is the pre-Python-3.8 AST node for numeric literals
    if node_type == 'Num':
        return Const(node.n)
    if node_type == 'BinOp':
        node_op_type = type(node.op).__name__
        if node_op_type not in ARITHMETIC_NODE_MAP:
            raise UnsupportedNodeError(node_op_type)
        fn = ARITHMETIC_NODE_MAP[node_op_type]
        # Recurse to get the fields being operated on
        left = _build_from_node(node.left).post_aggregator
        right = _build_from_node(node.right).post_aggregator
        fields = [left, right]
        # Collapse equivalent calculations into a single calculation,
        # e.g. (a + b) + c becomes one '+' over [a, b, c]. Druid folds the
        # field list left-to-right, so collapsing the left child is always
        # safe, but the right child may only be collapsed for associative
        # operators: a - (b - c) is not (a - b) - c.
        if left.get('fn') == fn:
            fields = left['fields'] + [right]
        elif right.get('fn') == fn and fn in ('+', '*'):
            fields = [left] + right['fields']
        return Postaggregator(fn, fields, 'bin_op')
    raise UnsupportedNodeError(node_type)
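
# Illustrative sketch (hypothetical driver, not in the original module):
# `_build_from_node` expects a Python `ast` expression node, so assuming
# ARITHMETIC_NODE_MAP maps AST op names to Druid operators (e.g.
# 'Add' -> '+', 'Div' -> '/'), a caller could look like this. Note that on
# Python 3.8+ numeric literals parse as `Constant` rather than `Num`.
#
#     import ast
#     tree = ast.parse('price_sum / price_count', mode='eval')
#     postagg = _build_from_node(tree.body)
#     # -> Postaggregator('/', [fieldAccess(price_sum),
#     #                         fieldAccess(price_count)], 'bin_op')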
def _metrics_and_post_aggs(metrics, metrics_dict):
    """
    Returns the raw aggregation metrics a query needs, along with a dict
    of post-aggregations keyed by metric name.
    """
    all_metrics = []
    post_aggs = {}

    def recursive_get_fields(_conf):
        _type = _conf.get('type')
        _field = _conf.get('field')
        _fields = _conf.get('fields')

        field_names = []
        if _type in ['fieldAccess', 'hyperUniqueCardinality',
                     'quantile', 'quantiles']:
            field_names.append(_conf.get('fieldName', ''))

        if _field:
            field_names += recursive_get_fields(_field)

        if _fields:
            for _f in _fields:
                field_names += recursive_get_fields(_f)

        return list(set(field_names))

    for metric_name in metrics:
        metric = metrics_dict[metric_name]
        if metric.metric_type != 'postagg':
            all_metrics.append(metric_name)
        else:
            mconf = metric.json_obj
            all_metrics += recursive_get_fields(mconf)
            all_metrics += mconf.get('fieldNames', [])
            # get_post_agg above covers the full per-type dispatch
            post_aggs[metric_name] = get_post_agg(mconf)

    return all_metrics, post_aggs
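
# Illustrative sketch (names are made up): `metrics_dict` would normally be
# built from a datasource's metric objects. `all_metrics` then lists every
# raw aggregation the query must compute (including the fields referenced
# by post-aggregations), while `post_aggs` maps metric names to pydruid
# post-aggregation objects.
#
#     metrics_dict = {m.metric_name: m for m in datasource.metrics}
#     all_metrics, post_aggs = _metrics_and_post_aggs(
#         ['count', 'avg_price'], metrics_dict)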
def run_query(  # noqa / druid
        self,
        groupby, metrics,
        granularity,
        from_dttm, to_dttm,
        filter=None,  # noqa
        is_timeseries=True,
        timeseries_limit=None,
        timeseries_limit_metric=None,
        row_limit=None,
        inner_from_dttm=None, inner_to_dttm=None,
        orderby=None,
        extras=None,  # noqa
        select=None,  # noqa
        columns=None, phase=2, client=None, form_data=None):
    """Runs a query against Druid and returns the query string."""
    # TODO refactor into using a TBD Query object
    client = client or self.cluster.get_pydruid_client()
    if not is_timeseries:
        granularity = 'all'
    inner_from_dttm = inner_from_dttm or from_dttm
    inner_to_dttm = inner_to_dttm or to_dttm

    # add tzinfo to native datetime with config
    from_dttm = from_dttm.replace(tzinfo=DRUID_TZ)
    to_dttm = to_dttm.replace(tzinfo=DRUID_TZ)
    timezone = from_dttm.tzname()

    query_str = ''
    metrics_dict = {m.metric_name: m for m in self.metrics}
    all_metrics = []
    post_aggs = {}

    columns_dict = {c.column_name: c for c in self.columns}

    def recursive_get_fields(_conf):
        _fields = _conf.get('fields', [])
        field_names = []
        for _f in _fields:
            _type = _f.get('type')
            if _type in ['fieldAccess', 'hyperUniqueCardinality']:
                field_names.append(_f.get('fieldName'))
            elif _type == 'arithmetic':
                field_names += recursive_get_fields(_f)
        return list(set(field_names))

    for metric_name in metrics:
        metric = metrics_dict[metric_name]
        if metric.metric_type != 'postagg':
            all_metrics.append(metric_name)
        else:
            mconf = metric.json_obj
            all_metrics += recursive_get_fields(mconf)
            all_metrics += mconf.get('fieldNames', [])
            # NOTE: near-duplicate of the dispatch in get_post_agg, except
            # that unknown types fall through to an arithmetic
            # Postaggregator here rather than CustomPostAggregator
            if mconf.get('type') == 'javascript':
                post_aggs[metric_name] = JavascriptPostAggregator(
                    name=mconf.get('name', ''),
                    field_names=mconf.get('fieldNames', []),
                    function=mconf.get('function', ''))
            elif mconf.get('type') == 'quantile':
                post_aggs[metric_name] = Quantile(
                    mconf.get('name', ''),
                    mconf.get('probability', ''),
                )
            elif mconf.get('type') == 'quantiles':
                post_aggs[metric_name] = Quantiles(
                    mconf.get('name', ''),
                    mconf.get('probabilities', ''),
                )
            elif mconf.get('type') == 'fieldAccess':
                post_aggs[metric_name] = Field(mconf.get('name'))
            elif mconf.get('type') == 'constant':
                post_aggs[metric_name] = Const(
                    mconf.get('value'),
                    output_name=mconf.get('name', ''))
            elif mconf.get('type') == 'hyperUniqueCardinality':
                post_aggs[metric_name] = HyperUniqueCardinality(
                    mconf.get('name'))
            else:
                post_aggs[metric_name] = Postaggregator(
                    mconf.get('fn', '/'),
                    mconf.get('fields', []),
                    mconf.get('name', ''))

    aggregations = OrderedDict()
    for m in self.metrics:
        if m.metric_name in all_metrics:
            aggregations[m.metric_name] = m.json_obj

    rejected_metrics = [
        m.metric_name for m in self.metrics
        if m.is_restricted and
        m.metric_name in aggregations.keys() and
        not sm.has_access('metric_access', m.perm)
    ]
    if rejected_metrics:
        raise MetricPermException(
            'Access to the metrics denied: ' + ', '.join(rejected_metrics))

    # the dimensions list with dimensionSpecs expanded
    dimensions = []
    groupby = [gb for gb in groupby if gb in columns_dict]
    for column_name in groupby:
        col = columns_dict.get(column_name)
        dim_spec = col.dimension_spec
        if dim_spec:
            dimensions.append(dim_spec)
        else:
            dimensions.append(column_name)

    qry = dict(
        datasource=self.datasource_name,
        dimensions=dimensions,
        aggregations=aggregations,
        granularity=DruidDatasource.granularity(
            granularity,
            timezone=timezone,
            origin=extras.get('druid_time_origin'),
        ),
        post_aggregations=post_aggs,
        intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
    )

    filters = self.get_filters(filter)
    if filters:
        qry['filter'] = filters
    having_filters = self.get_having_filters(extras.get('having_druid'))
    if having_filters:
        qry['having'] = having_filters

    orig_filters = filters
    if len(groupby) == 0 and not having_filters:
        del qry['dimensions']
        client.timeseries(**qry)
    if not having_filters and len(groupby) == 1:
        qry['threshold'] = timeseries_limit or 1000
        if row_limit and granularity == 'all':
            qry['threshold'] = row_limit
        qry['dimension'] = list(qry.get('dimensions'))[0]
        del qry['dimensions']
        qry['metric'] = list(qry['aggregations'].keys())[0]
        client.topn(**qry)
    elif len(groupby) > 1 or having_filters:
        # If grouping on multiple fields or using a having filter
        # we have to force a groupby query
        if timeseries_limit and is_timeseries:
            order_by = metrics[0] if metrics else self.metrics[0]
            if timeseries_limit_metric:
                order_by = timeseries_limit_metric
            # Limit the number of series: phase one finds the top
            # `timeseries_limit` groups over the inner time range
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = 'all'
            # `intervals` is a top-level query property, not a limitSpec
            # field
            pre_qry['intervals'] = (
                inner_from_dttm.isoformat() + '/' +
                inner_to_dttm.isoformat())
            pre_qry['limit_spec'] = {
                'type': 'default',
                'limit': timeseries_limit,
                'columns': [{
                    'dimension': order_by,
                    'direction': 'descending',
                }],
            }
            client.groupby(**pre_qry)
            query_str += '// Two phase query\n// Phase 1\n'
            query_str += json.dumps(
                client.query_builder.last_query.query_dict, indent=2)
            query_str += '\n'
            if phase == 1:
                return query_str
            query_str += (
                "// Phase 2 (built based on phase one's results)\n")
            df = client.export_pandas()
            if df is not None and not df.empty:
                dims = qry['dimensions']
                filters = []
                for unused, row in df.iterrows():
                    fields = []
                    for dim in dims:
                        f = Dimension(dim) == row[dim]
                        fields.append(f)
                    if len(fields) > 1:
                        filt = Filter(type='and', fields=fields)
                        filters.append(filt)
                    elif fields:
                        filters.append(fields[0])
                if filters:
                    ff = Filter(type='or', fields=filters)
                    if not orig_filters:
                        qry['filter'] = ff
                    else:
                        qry['filter'] = Filter(
                            type='and',
                            fields=[ff, orig_filters])
                qry['limit_spec'] = None
        if row_limit:
            qry['limit_spec'] = {
                'type': 'default',
                'limit': row_limit,
                'columns': [{
                    'dimension': (
                        metrics[0] if metrics else self.metrics[0]),
                    'direction': 'descending',
                }],
            }
        client.groupby(**qry)
        query_str += json.dumps(
            client.query_builder.last_query.query_dict, indent=2)
    return query_str
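
# Illustrative sketch (hypothetical datasource and values, not from the
# original module): a grouped query limited to the top 25 series. `extras`
# must be a dict since the method reads `druid_time_origin` and
# `having_druid` from it, and `filter` an iterable of filter configs.
#
#     from datetime import datetime
#     query_str = datasource.run_query(
#         groupby=['country'],
#         metrics=['count'],
#         granularity='one day',
#         from_dttm=datetime(2017, 1, 1),
#         to_dttm=datetime(2017, 2, 1),
#         filter=[],
#         timeseries_limit=25,
#         extras={'druid_time_origin': None, 'having_druid': []},
#     )
#     # Returns the issued Druid query (both phases, in the two-phase case)
#     # serialized as JSON; result rows are read via client.export_pandas().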