def get_data(self): df = self.get_df() form_data = self.form_data df.columns = ["timestamp", "metric"] timestamps = {str(obj["timestamp"].value / 10**9): obj.get("metric") for obj in df.to_dict("records")} start = utils.parse_human_datetime(form_data.get("since")) end = utils.parse_human_datetime(form_data.get("until")) domain = form_data.get("domain_granularity") diff_delta = rdelta.relativedelta(end, start) diff_secs = (end - start).total_seconds() if domain == "year": range_ = diff_delta.years + 1 elif domain == "month": range_ = diff_delta.years * 12 + diff_delta.months + 1 elif domain == "week": range_ = diff_delta.years * 53 + diff_delta.weeks + 1 elif domain == "day": range_ = diff_secs // (24*60*60) + 1 else: range_ = diff_secs // (60*60) + 1 return { "timestamps": timestamps, "start": start, "domain": domain, "subdomain": form_data.get("subdomain_granularity"), "range": range_, }
def get_data(self): df = self.get_df() form_data = self.form_data df.columns = ["timestamp", "metric"] timestamps = { str(obj["timestamp"].value / 10**9): obj.get("metric") for obj in df.to_dict("records") } start = utils.parse_human_datetime(form_data.get("since")) end = utils.parse_human_datetime(form_data.get("until")) domain = form_data.get("domain_granularity") diff_delta = rdelta.relativedelta(end, start) diff_secs = (end - start).total_seconds() if domain == "year": range_ = diff_delta.years + 1 elif domain == "month": range_ = diff_delta.years * 12 + diff_delta.months + 1 elif domain == "week": range_ = diff_delta.years * 53 + diff_delta.weeks + 1 elif domain == "day": range_ = diff_secs // (24 * 60 * 60) + 1 else: range_ = diff_secs // (60 * 60) + 1 return { "timestamps": timestamps, "start": start, "domain": domain, "subdomain": form_data.get("subdomain_granularity"), "range": range_, }
def query_obj(self): """Building a query object""" form_data = self.form_data groupby = form_data.get("groupby") or [] metrics = form_data.get("metrics") or ['count'] granularity = \ form_data.get("granularity") or form_data.get("granularity_sqla") limit = int(form_data.get("limit", 0)) row_limit = int( form_data.get("row_limit", config.get("ROW_LIMIT"))) since = form_data.get("since", "1 year ago") from_dttm = utils.parse_human_datetime(since) if from_dttm > datetime.now(): from_dttm = datetime.now() - (from_dttm-datetime.now()) until = form_data.get("until", "now") to_dttm = utils.parse_human_datetime(until) if from_dttm > to_dttm: flasher("The date range doesn't seem right.", "danger") from_dttm = to_dttm # Making them identical to not raise # extras are used to query elements specific to a datasource type # for instance the extra where clause that applies only to Tables extras = { 'where': form_data.get("where", ''), 'having': form_data.get("having", ''), 'time_grain_sqla': form_data.get("time_grain_sqla", ''), 'druid_time_origin': form_data.get("druid_time_origin", ''), } d = { 'granularity': granularity, 'from_dttm': from_dttm, 'to_dttm': to_dttm, 'is_timeseries': self.is_timeseries, 'groupby': groupby, 'metrics': metrics, 'row_limit': row_limit, 'filter': self.query_filters(), 'timeseries_limit': limit, 'extras': extras, } return d
def query( # druid self, groupby, metrics, granularity, from_dttm, to_dttm, filter=None, # noqa is_timeseries=True, timeseries_limit=None, row_limit=None, inner_from_dttm=None, inner_to_dttm=None, extras=None, # noqa select=None,): # noqa """Runs a query against Druid and returns a dataframe. This query interface is common to SqlAlchemy and Druid """ # TODO refactor into using a TBD Query object qry_start_dttm = datetime.now() inner_from_dttm = inner_from_dttm or from_dttm inner_to_dttm = inner_to_dttm or to_dttm # add tzinfo to native datetime with config from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ")) to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ")) query_str = "" metrics_dict = {m.metric_name: m for m in self.metrics} all_metrics = [] post_aggs = {} for metric_name in metrics: metric = metrics_dict[metric_name] if metric.metric_type != 'postagg': all_metrics.append(metric_name) else: conf = metric.json_obj fields = conf.get('fields', []) all_metrics += [ f.get('fieldName') for f in fields if f.get('type') == 'fieldAccess'] all_metrics += conf.get('fieldNames', []) if conf.get('type') == 'javascript': post_aggs[metric_name] = JavascriptPostAggregator( name=conf.get('name'), field_names=conf.get('fieldNames'), function=conf.get('function')) else: post_aggs[metric_name] = Postaggregator( conf.get('fn', "/"), conf.get('fields', []), conf.get('name', '')) aggregations = { m.metric_name: m.json_obj for m in self.metrics if m.metric_name in all_metrics } granularity = granularity or "all" if granularity != "all": granularity = utils.parse_human_timedelta( granularity).total_seconds() * 1000 if not isinstance(granularity, string_types): granularity = {"type": "duration", "duration": granularity} origin = extras.get('druid_time_origin') if origin: dttm = utils.parse_human_datetime(origin) granularity['origin'] = dttm.isoformat() qry = dict( datasource=self.datasource_name, dimensions=groupby, aggregations=aggregations, granularity=granularity, post_aggregations=post_aggs, intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(), ) filters = None for col, op, eq in filter: cond = None if op == '==': cond = Dimension(col) == eq elif op == '!=': cond = ~(Dimension(col) == eq) elif op in ('in', 'not in'): fields = [] splitted = eq.split(',') if len(splitted) > 1: for s in eq.split(','): s = s.strip() fields.append(Filter.build_filter(Dimension(col) == s)) cond = Filter(type="or", fields=fields) else: cond = Dimension(col) == eq if op == 'not in': cond = ~cond if filters: filters = Filter(type="and", fields=[ Filter.build_filter(cond), Filter.build_filter(filters) ]) else: filters = cond if filters: qry['filter'] = filters client = self.cluster.get_pydruid_client() orig_filters = filters if timeseries_limit and is_timeseries: # Limit on the number of timeseries, doing a two-phases query pre_qry = deepcopy(qry) pre_qry['granularity'] = "all" pre_qry['limit_spec'] = { "type": "default", "limit": timeseries_limit, 'intervals': ( inner_from_dttm.isoformat() + '/' + inner_to_dttm.isoformat()), "columns": [{ "dimension": metrics[0] if metrics else self.metrics[0], "direction": "descending", }], } client.groupby(**pre_qry) query_str += "// Two phase query\n// Phase 1\n" query_str += json.dumps(client.query_dict, indent=2) + "\n" query_str += "//\nPhase 2 (built based on phase one's results)\n" df = client.export_pandas() if df is not None and not df.empty: dims = qry['dimensions'] filters = [] for unused, row in df.iterrows(): fields = [] for dim in dims: f = Filter.build_filter(Dimension(dim) == row[dim]) fields.append(f) if len(fields) > 1: filt = Filter(type="and", fields=fields) filters.append(Filter.build_filter(filt)) elif fields: filters.append(fields[0]) if filters: ff = Filter(type="or", fields=filters) if not orig_filters: qry['filter'] = ff else: qry['filter'] = Filter(type="and", fields=[ Filter.build_filter(ff), Filter.build_filter(orig_filters)]) qry['limit_spec'] = None if row_limit: qry['limit_spec'] = { "type": "default", "limit": row_limit, "columns": [{ "dimension": metrics[0] if metrics else self.metrics[0], "direction": "descending", }], } client.groupby(**qry) query_str += json.dumps(client.query_dict, indent=2) df = client.export_pandas() if df is None or df.size == 0: raise Exception(_("No data was returned.")) if ( not is_timeseries and granularity == "all" and 'timestamp' in df.columns): del df['timestamp'] # Reordering columns cols = [] if 'timestamp' in df.columns: cols += ['timestamp'] cols += [col for col in groupby if col in df.columns] cols += [col for col in metrics if col in df.columns] df = df[cols] return QueryResult( df=df, query=query_str, duration=datetime.now() - qry_start_dttm)
def query( # druid self, groupby, metrics, granularity, from_dttm, to_dttm, filter=None, # noqa is_timeseries=True, timeseries_limit=None, row_limit=None, inner_from_dttm=None, inner_to_dttm=None, extras=None, # noqa select=None,): # noqa """Runs a query against Druid and returns a dataframe. This query interface is common to SqlAlchemy and Druid """ # TODO refactor into using a TBD Query object qry_start_dttm = datetime.now() inner_from_dttm = inner_from_dttm or from_dttm inner_to_dttm = inner_to_dttm or to_dttm # add tzinfo to native datetime with config from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ")) to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ")) query_str = "" metrics_dict = {m.metric_name: m for m in self.metrics} all_metrics = [] post_aggs = {} def recursive_get_fields(_conf): _fields = _conf.get('fields', []) field_names = [] for _f in _fields: _type = _f.get('type') if _type in ['fieldAccess', 'hyperUniqueCardinality']: field_names.append(_f.get('fieldName')) elif _type == 'arithmetic': field_names += recursive_get_fields(_f) return list(set(field_names)) for metric_name in metrics: metric = metrics_dict[metric_name] if metric.metric_type != 'postagg': all_metrics.append(metric_name) else: conf = metric.json_obj all_metrics += recursive_get_fields(conf) all_metrics += conf.get('fieldNames', []) if conf.get('type') == 'javascript': post_aggs[metric_name] = JavascriptPostAggregator( name=conf.get('name'), field_names=conf.get('fieldNames'), function=conf.get('function')) else: post_aggs[metric_name] = Postaggregator( conf.get('fn', "/"), conf.get('fields', []), conf.get('name', '')) aggregations = { m.metric_name: m.json_obj for m in self.metrics if m.metric_name in all_metrics } rejected_metrics = [ m.metric_name for m in self.metrics if m.is_restricted and m.metric_name in aggregations.keys() and not sm.has_access('metric_access', m.perm) ] if rejected_metrics: raise MetricPermException( "Access to the metrics denied: " + ', '.join(rejected_metrics) ) granularity = granularity or "all" if granularity != "all": granularity = utils.parse_human_timedelta( granularity).total_seconds() * 1000 if not isinstance(granularity, string_types): granularity = {"type": "duration", "duration": granularity} origin = extras.get('druid_time_origin') if origin: dttm = utils.parse_human_datetime(origin) granularity['origin'] = dttm.isoformat() qry = dict( datasource=self.datasource_name, dimensions=groupby, aggregations=aggregations, granularity=granularity, post_aggregations=post_aggs, intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(), ) filters = None for col, op, eq in filter: cond = None if op == '==': cond = Dimension(col) == eq elif op == '!=': cond = ~(Dimension(col) == eq) elif op in ('in', 'not in'): fields = [] splitted = eq.split(',') if len(splitted) > 1: for s in eq.split(','): s = s.strip() fields.append(Dimension(col) == s) cond = Filter(type="or", fields=fields) else: cond = Dimension(col) == eq if op == 'not in': cond = ~cond elif op == 'regex': cond = Filter(type="regex", pattern=eq, dimension=col) if filters: filters = Filter(type="and", fields=[ cond, filters ]) else: filters = cond if filters: qry['filter'] = filters client = self.cluster.get_pydruid_client() orig_filters = filters if timeseries_limit and is_timeseries: # Limit on the number of timeseries, doing a two-phases query pre_qry = deepcopy(qry) pre_qry['granularity'] = "all" pre_qry['limit_spec'] = { "type": "default", "limit": timeseries_limit, 'intervals': ( inner_from_dttm.isoformat() + '/' + inner_to_dttm.isoformat()), "columns": [{ "dimension": metrics[0] if metrics else self.metrics[0], "direction": "descending", }], } client.groupby(**pre_qry) query_str += "// Two phase query\n// Phase 1\n" query_str += json.dumps( client.query_builder.last_query.query_dict, indent=2) + "\n" query_str += "//\nPhase 2 (built based on phase one's results)\n" df = client.export_pandas() if df is not None and not df.empty: dims = qry['dimensions'] filters = [] for unused, row in df.iterrows(): fields = [] for dim in dims: f = Dimension(dim) == row[dim] fields.append(f) if len(fields) > 1: filt = Filter(type="and", fields=fields) filters.append(filt) elif fields: filters.append(fields[0]) if filters: ff = Filter(type="or", fields=filters) if not orig_filters: qry['filter'] = ff else: qry['filter'] = Filter(type="and", fields=[ ff, orig_filters]) qry['limit_spec'] = None if row_limit: qry['limit_spec'] = { "type": "default", "limit": row_limit, "columns": [{ "dimension": metrics[0] if metrics else self.metrics[0], "direction": "descending", }], } client.groupby(**qry) query_str += json.dumps( client.query_builder.last_query.query_dict, indent=2) df = client.export_pandas() if df is None or df.size == 0: raise Exception(_("No data was returned.")) if ( not is_timeseries and granularity == "all" and 'timestamp' in df.columns): del df['timestamp'] # Reordering columns cols = [] if 'timestamp' in df.columns: cols += ['timestamp'] cols += [col for col in groupby if col in df.columns] cols += [col for col in metrics if col in df.columns] df = df[cols] return QueryResult( df=df, query=query_str, duration=datetime.now() - qry_start_dttm)
def query( self, groupby, metrics, granularity, from_dttm, to_dttm, filter=None, # noqa is_timeseries=True, timeseries_limit=None, row_limit=None, inner_from_dttm=None, inner_to_dttm=None, extras=None, # noqa select=None,): # noqa """Runs a query against Druid and returns a dataframe. This query interface is common to SqlAlchemy and Druid """ # TODO refactor into using a TBD Query object qry_start_dttm = datetime.now() inner_from_dttm = inner_from_dttm or from_dttm inner_to_dttm = inner_to_dttm or to_dttm # add tzinfo to native datetime with config from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ")) to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ")) query_str = "" aggregations = { m.metric_name: m.json_obj for m in self.metrics if m.metric_name in metrics } granularity = granularity or "all" if granularity != "all": granularity = utils.parse_human_timedelta( granularity).total_seconds() * 1000 if not isinstance(granularity, string_types): granularity = {"type": "duration", "duration": granularity} origin = extras.get('druid_time_origin') if origin: dttm = utils.parse_human_datetime(origin) granularity['origin'] = dttm.isoformat() qry = dict( datasource=self.datasource_name, dimensions=groupby, aggregations=aggregations, granularity=granularity, intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(), ) filters = None for col, op, eq in filter: cond = None if op == '==': cond = Dimension(col) == eq elif op == '!=': cond = ~(Dimension(col) == eq) elif op in ('in', 'not in'): fields = [] splitted = eq.split(',') if len(splitted) > 1: for s in eq.split(','): s = s.strip() fields.append(Filter.build_filter(Dimension(col) == s)) cond = Filter(type="or", fields=fields) else: cond = Dimension(col) == eq if op == 'not in': cond = ~cond if filters: filters = Filter(type="and", fields=[ Filter.build_filter(cond), Filter.build_filter(filters) ]) else: filters = cond if filters: qry['filter'] = filters client = self.cluster.get_pydruid_client() orig_filters = filters if timeseries_limit and is_timeseries: # Limit on the number of timeseries, doing a two-phases query pre_qry = deepcopy(qry) pre_qry['granularity'] = "all" pre_qry['limit_spec'] = { "type": "default", "limit": timeseries_limit, 'intervals': ( inner_from_dttm.isoformat() + '/' + inner_to_dttm.isoformat()), "columns": [{ "dimension": metrics[0] if metrics else self.metrics[0], "direction": "descending", }], } client.groupby(**pre_qry) query_str += "// Two phase query\n// Phase 1\n" query_str += json.dumps(client.query_dict, indent=2) + "\n" query_str += "//\nPhase 2 (built based on phase one's results)\n" df = client.export_pandas() if df is not None and not df.empty: dims = qry['dimensions'] filters = [] for _, row in df.iterrows(): fields = [] for dim in dims: f = Filter.build_filter(Dimension(dim) == row[dim]) fields.append(f) if len(fields) > 1: filt = Filter(type="and", fields=fields) filters.append(Filter.build_filter(filt)) elif fields: filters.append(fields[0]) if filters: ff = Filter(type="or", fields=filters) if not orig_filters: qry['filter'] = ff else: qry['filter'] = Filter(type="and", fields=[ Filter.build_filter(ff), Filter.build_filter(orig_filters)]) qry['limit_spec'] = None if row_limit: qry['limit_spec'] = { "type": "default", "limit": row_limit, "columns": [{ "dimension": metrics[0] if metrics else self.metrics[0], "direction": "descending", }], } client.groupby(**qry) query_str += json.dumps(client.query_dict, indent=2) df = client.export_pandas() if df is None or df.size == 0: raise Exception("No data was returned.") if ( not is_timeseries and granularity == "all" and 'timestamp' in df.columns): del df['timestamp'] # Reordering columns cols = [] if 'timestamp' in df.columns: cols += ['timestamp'] cols += [col for col in groupby if col in df.columns] cols += [col for col in metrics if col in df.columns] cols += [col for col in df.columns if col not in cols] df = df[cols] return QueryResult( df=df, query=query_str, duration=datetime.now() - qry_start_dttm)