def all_referenced_columns(query: Query):
    """
    Return the set of all columns that are used by a query.
    """
    expressions: MutableSequence[Any] = []

    # These clauses contribute their expressions directly (each is
    # normalized to a list first).
    for clause in (
        query.get_arrayjoin(),
        query.get_groupby(),
        query.get_orderby(),
        query.get_selected_columns(),
    ):
        if clause:
            expressions.extend(to_list(clause))

    # Conditions need flattening as they can be nested as AND/OR.
    conditions = query.get_conditions()
    if conditions:
        for cond in conditions:
            flattened = [cond] if is_condition(cond) else cond
            expressions.extend(c[0] for c in flattened)

    aggregations = query.get_aggregations()
    if aggregations:
        expressions.extend(aggregation[1] for aggregation in aggregations)

    # Return the set of all columns referenced in any expression.
    return set(chain.from_iterable(columns_in_expr(ex) for ex in expressions))
def get_all_referenced_columns(self) -> Sequence[Any]:
    """
    Return the set of every column referenced by this query.

    TODO: This is not yet exhaustive; some corner cases are still left
    out, e.g. functions expressed in the form f(column) inside
    aggregations. Both will be fixed when a better column abstraction
    is added. Keep replace_column consistent with this method: any
    change here must be reflected there.
    """
    referenced: MutableSequence[Any] = []

    # Clauses whose expressions can be collected as-is.
    for clause in (
        self.get_arrayjoin(),
        self.get_groupby(),
        self.get_orderby(),
        self.get_selected_columns(),
    ):
        if clause:
            referenced.extend(to_list(clause))

    # Conditions need flattening as they can be nested as AND/OR.
    for condition_set in (
        self.get_conditions(),
        self.get_having(),
        self.get_prewhere(),
    ):
        self.__add_flat_conditions(referenced, condition_set)

    aggregations = self.get_aggregations()
    if aggregations:
        referenced.extend(aggregation[1] for aggregation in aggregations)

    # Return the set of all columns referenced in any expression.
    return self.__get_referenced_columns(referenced)
def wrapper(dataset, request: Request, *args, **kwargs):
    """
    Split-aware query entry point: dispatch to the column split or time
    split strategy when the query qualifies, otherwise execute it
    directly through query_func.
    """
    (use_split,) = state.get_configs([("use_split", 0)])
    raw_limit = request.query.get_limit()
    limit = 0 if raw_limit is None else raw_limit
    offset = request.query.get_offset()
    orderby = util.to_list(request.query.get_orderby())

    # Splitting only applies to limited, non-grouped queries when the
    # use_split runtime config is on.
    if use_split and limit and not request.query.get_groupby():
        total_col_count = len(request.query.get_all_referenced_columns())
        column_split_spec = dataset.get_split_query_spec()
        min_col_count = None
        if column_split_spec:
            # Count the columns a minimal first-pass query would reference.
            minimal_query = copy.deepcopy(request.query)
            minimal_query.set_selected_columns(
                column_split_spec.get_min_columns())
            min_col_count = len(minimal_query.get_all_referenced_columns())

        col_split_applies = (
            column_split_spec
            and request.query.get_selected_columns()
            and not request.query.get_aggregations()
            and total_col_count > min_col_count
        )
        if col_split_applies:
            return col_split(dataset, request, column_split_spec, *args,
                             **kwargs)
        if orderby[:1] == ["-timestamp"] and offset < 1000:
            return time_split(dataset, request, *args, **kwargs)

    return query_func(dataset, request, *args, **kwargs)
def wrapper(dataset, request: Request, *args, **kwargs):
    """
    Dispatch the query to a split strategy when it qualifies.

    Uses col_split when the query selects more columns than the minimal
    set (MIN_COLS), time_split for recent "-timestamp" ordered queries,
    and otherwise runs query_func directly.
    """
    # BUG FIX: state.get_configs returns a sequence of values, so the
    # single value must be unpacked. Assigning the whole list made
    # use_split a one-element list — always truthy — so the use_split
    # config flag was effectively ignored.
    (use_split,) = state.get_configs([
        ('use_split', 0),
    ])
    limit = request.query.get_limit()
    remaining_offset = request.query.get_offset()
    orderby = util.to_list(request.query.get_orderby())

    # Splitting only applies to limited, non-grouped queries.
    common_conditions = use_split and limit and not request.query.get_groupby()

    if common_conditions:
        # TODO: Move all_referenced_columns into query and remove this dependency.
        # In order to do this we need to break a circular dependency first
        total_col_count = len(
            util.all_referenced_columns(request.query.get_body()))
        min_col_count = len(
            util.all_referenced_columns({
                **request.query.get_body(),
                'selected_columns': MIN_COLS
            }))
        if (request.query.get_selected_columns()
                and not request.query.get_aggregations()
                and total_col_count > min_col_count):
            return col_split(dataset, request, *args, **kwargs)
        elif orderby[:1] == ['-timestamp'] and remaining_offset < 1000:
            return time_split(dataset, request, *args, **kwargs)

    return query_func(dataset, request, *args, **kwargs)
def wrapper(dataset, request: Request, *args, **kwargs):
    """
    Dispatch the query to a split strategy when it qualifies.

    Uses col_split when a column split spec exists and the query
    references more columns than the spec's minimal set, time_split for
    recent "-timestamp" ordered queries, and otherwise runs query_func
    directly.
    """
    # BUG FIX: state.get_configs returns a sequence of values, so the
    # single value must be unpacked. Assigning the whole list made
    # use_split a one-element list — always truthy — so the use_split
    # config flag was effectively ignored.
    (use_split,) = state.get_configs([
        ('use_split', 0),
    ])
    query_limit = request.query.get_limit()
    limit = query_limit if query_limit is not None else 0
    remaining_offset = request.query.get_offset()
    orderby = util.to_list(request.query.get_orderby())

    # Splitting only applies to limited, non-grouped queries.
    common_conditions = use_split and limit and not request.query.get_groupby()

    if common_conditions:
        # TODO: Move all_referenced_columns into query and remove this dependency.
        # In order to do this we need to break a circular dependency first
        total_col_count = len(all_referenced_columns(request.query))
        column_split_spec = dataset.get_split_query_spec()
        if column_split_spec:
            # Count the columns a minimal first-pass query would reference.
            copied_query = copy.deepcopy(request.query)
            copied_query.set_selected_columns(
                column_split_spec.get_min_columns())
            min_col_count = len(all_referenced_columns(copied_query))
        else:
            min_col_count = None

        if (
            column_split_spec
            and request.query.get_selected_columns()
            and not request.query.get_aggregations()
            and total_col_count > min_col_count
        ):
            return col_split(dataset, request, column_split_spec, *args,
                             **kwargs)
        elif orderby[:1] == ['-timestamp'] and remaining_offset < 1000:
            return time_split(dataset, request, *args, **kwargs)

    return query_func(dataset, request, *args, **kwargs)
def __replace_col_in_list(
    self,
    expressions: Any,
    old_column: str,
    new_column: str,
) -> Sequence[Any]:
    """
    Apply the single-expression column replacement to every entry of
    `expressions` (normalized to a list first).
    """
    replaced = []
    for expression in to_list(expressions):
        replaced.append(
            self.__replace_col_in_expression(expression, old_column,
                                             new_column))
    return replaced
def process_query(
    self,
    query: Query,
    extension_data: ExtensionData,
    request_settings: RequestSettings,
) -> None:
    """
    Apply the project extension: restrict the query to the requested
    projects, register the per-project rate limit, then run the
    extension's post processing.
    """
    project_ids = util.to_list(extension_data['project'])

    if project_ids:
        project_condition = ('project_id', 'IN', project_ids)
        query.add_conditions([project_condition])

    rate_limit = self._get_rate_limit_params(project_ids)
    request_settings.add_rate_limit(rate_limit)
    self.do_post_processing(project_ids, query, request_settings)
def process_query(
    self,
    query: Query,
    extension_data: ExtensionData,
    request_settings: RequestSettings,
) -> None:
    """
    Apply the project extension on the AST query: add an
    IN(project_column, ids) condition and register the per-project
    rate limit.
    """
    project_ids = util.to_list(extension_data["project"])

    if project_ids:
        project_column = Column(None, None, self.__project_column)
        id_literals = [Literal(None, pid) for pid in project_ids]
        query.add_condition_to_ast(in_condition(project_column, id_literals))

    request_settings.add_rate_limit(self._get_rate_limit_params(project_ids))
def _parse_query_impl(body: MutableMapping[str, Any], entity: Entity) -> Query:
    """
    Parse a legacy dict-based query body into a Query AST object.

    Builds the select clause (groupby + aggregations + selected columns),
    detects arrayJoin columns, parses conditions/having into expressions
    and parses the order by clause. Raises ParsingException on malformed
    input.
    """

    def build_selected_expressions(
        raw_expressions: Sequence[Any],
    ) -> List[SelectedExpression]:
        # Parse each raw entry into a SelectedExpression, preserving the
        # original string as the name when the entry is a plain string.
        output = []
        for raw_expression in raw_expressions:
            exp = parse_expression(tuplify(raw_expression),
                                   entity.get_data_model(), set())
            output.append(
                SelectedExpression(
                    # An expression in the query can be a string or a
                    # complex list with an alias. In the second case
                    # we trust the parser to find the alias.
                    name=raw_expression
                    if isinstance(raw_expression, str) else exp.alias,
                    expression=exp,
                ))
        return output

    aggregations = []
    for aggregation in body.get("aggregations", []):
        if not isinstance(aggregation, Sequence):
            raise ParsingException((
                f"Invalid aggregation structure {aggregation}. "
                "It must be a sequence containing expression, column and alias."
            ))
        aggregation_function = aggregation[0]
        column_expr = aggregation[1]
        # Normalize falsy column expressions / aliases to their canonical
        # empty representations.
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None

        aggregations.append(
            SelectedExpression(
                name=alias,
                expression=parse_aggregation(
                    aggregation_function,
                    column_expr,
                    alias,
                    entity.get_data_model(),
                    set(),
                ),
            ))

    groupby_clause = build_selected_expressions(
        to_list(body.get("groupby", [])))

    # SELECT order: group by expressions, then aggregations, then plain
    # selected columns.
    select_clause = (
        groupby_clause + aggregations +
        build_selected_expressions(body.get("selected_columns", [])))

    array_join_cols = set()
    arrayjoin = body.get("arrayjoin")
    # TODO: Properly detect all array join columns in all clauses of the query.
    # This is missing an arrayJoin in condition with an alias that is then
    # used in the select.
    if arrayjoin:
        array_join_cols.add(arrayjoin)
        array_join_expr: Optional[Expression] = parse_expression(
            body["arrayjoin"], entity.get_data_model(), {arrayjoin})
    else:
        array_join_expr = None
        # No explicit arrayjoin clause: scan the select clause for
        # arrayJoin(...) calls to collect the joined columns.
        for select_expr in select_clause:
            if isinstance(select_expr.expression, FunctionCall):
                if select_expr.expression.function_name == "arrayJoin":
                    parameters = select_expr.expression.parameters
                    if len(parameters) != 1:
                        raise ParsingException(
                            "arrayJoin(...) only accepts a single parameter.")
                    if isinstance(parameters[0], Column):
                        array_join_cols.add(parameters[0].column_name)
                    else:
                        # We only accepts columns or functions that do not
                        # reference columns. We could not say whether we are
                        # actually arrayjoining on the values of the column
                        # if it is nested in an arbitrary function. But
                        # functions of literals are fine.
                        # NOTE(review): this iterates the expression tree —
                        # presumably AST expressions are iterable over their
                        # nested nodes; confirm against the Expression class.
                        for e in parameters[0]:
                            if isinstance(e, Column):
                                raise ParsingException(
                                    "arrayJoin(...) cannot contain columns nested in functions."
                                )

    where_expr = parse_conditions_to_expr(body.get("conditions", []), entity,
                                          array_join_cols)
    having_expr = parse_conditions_to_expr(body.get("having", []), entity,
                                           array_join_cols)

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        # Each order by entry is either "[-]column" or a function whose
        # name carries the optional leading "-" (descending) marker.
        if isinstance(orderby, str):
            match = NEGATE_RE.match(orderby)
            if match is None:
                raise ParsingException((
                    f"Invalid Order By clause {orderby}. If the Order By is a string, "
                    "it must respect the format `[-]column`"))
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            match = NEGATE_RE.match(orderby[0])
            if match is None:
                raise ParsingException((
                    f"Invalid Order By clause {orderby}. If the Order By is an expression, "
                    "the function name must respect the format `[-]func_name`"
                ))
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ParsingException(
                (f"Invalid Order By clause {orderby}. The Clause was neither "
                 "a string nor a function call."))
        orderby_parsed = parse_expression(tuplify(orderby),
                                          entity.get_data_model(), set())
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC
                if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            ))

    return Query(
        body,
        None,
        selected_columns=select_clause,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=[g.expression for g in groupby_clause],
        having=having_expr,
        order_by=orderby_exprs,
    )
def _parse_query_impl(body: MutableMapping[str, Any],
                      dataset: Dataset) -> Query:
    """
    Parse a legacy dict-based query body into a Query AST object for the
    given dataset.

    Builds the selected columns (groupby + aggregations + selected),
    parses conditions/having into expressions, parses the order by
    clause and attaches the dataset's read schema as the data source.
    """
    aggregate_exprs = []
    for aggregation in body.get("aggregations", []):
        # NOTE(review): assert is stripped under `python -O`; a real
        # validation error would be safer for malformed request bodies.
        assert isinstance(aggregation, (list, tuple))
        aggregation_function = aggregation[0]
        column_expr = aggregation[1]
        # Normalize falsy column expressions / aliases to their canonical
        # empty representations.
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None

        aggregate_exprs.append(
            parse_aggregation(aggregation_function, column_expr, alias))

    groupby_exprs = [
        parse_expression(tuplify(group_by))
        for group_by in to_list(body.get("groupby", []))
    ]
    select_exprs = [
        parse_expression(tuplify(select))
        for select in body.get("selected_columns", [])
    ]
    # SELECT order: group by expressions, then aggregations, then plain
    # selected columns.
    selected_cols = groupby_exprs + aggregate_exprs + select_exprs

    arrayjoin = body.get("arrayjoin")
    if arrayjoin:
        array_join_expr: Optional[Expression] = parse_expression(
            body["arrayjoin"])
    else:
        array_join_expr = None

    where_expr = parse_conditions_to_expr(body.get("conditions", []), dataset,
                                          arrayjoin)
    having_expr = parse_conditions_to_expr(body.get("having", []), dataset,
                                           arrayjoin)

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        # Each order by entry is either "[-]column" or a function whose
        # name carries the optional leading "-" (descending) marker.
        if isinstance(orderby, str):
            match = NEGATE_RE.match(orderby)
            assert match is not None, f"Invalid Order By clause {orderby}"
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            match = NEGATE_RE.match(orderby[0])
            assert match is not None, f"Invalid Order By clause {orderby}"
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ValueError(f"Invalid Order By clause {orderby}")
        orderby_parsed = parse_expression(tuplify(orderby))
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC
                if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            ))

    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    return Query(
        body,
        source,
        selected_columns=selected_cols,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=groupby_exprs,
        having=having_expr,
        order_by=orderby_exprs,
    )
def __init__(
    self,
    dataset: Dataset,
    query: Query,
    settings: RequestSettings,
) -> None:
    """
    Format a Query object into a ClickHouse SQL string, clause by clause,
    storing the result in self.__formatted_query.
    """
    parsing_context = ParsingContext()

    # SELECT: group by expressions first, then aggregations, then plain
    # selected columns.
    aggregate_exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        column_expr(dataset, gb, query, parsing_context) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        column_expr(dataset, util.tuplify(colname), query, parsing_context)
        for colname in column_names
    ]
    select_clause = u"SELECT {}".format(
        ", ".join(group_exprs + aggregate_exprs + selected_cols))

    from_clause = u"FROM {}".format(query.get_data_source().format_from())
    if query.get_final():
        from_clause = u"{} FINAL".format(from_clause)

    # Sampling only applies when the data source supports it; an explicit
    # sample rate takes precedence over the turbo sample rate.
    if not query.get_data_source().supports_sample():
        sample_rate = None
    else:
        if query.get_sample():
            sample_rate = query.get_sample()
        elif settings.get_turbo():
            sample_rate = snuba_settings.TURBO_SAMPLE_RATE
        else:
            sample_rate = None
    if sample_rate:
        from_clause = u"{} SAMPLE {}".format(from_clause, sample_rate)

    join_clause = ""
    if query.get_arrayjoin():
        join_clause = u"ARRAY JOIN {}".format(query.get_arrayjoin())

    where_clause = ""
    if query.get_conditions():
        where_clause = u"WHERE {}".format(
            conditions_expr(dataset, query.get_conditions(), query,
                            parsing_context))

    prewhere_clause = ""
    if query.get_prewhere():
        prewhere_clause = u"PREWHERE {}".format(
            conditions_expr(dataset, query.get_prewhere(), query,
                            parsing_context))

    group_clause = ""
    if groupby:
        group_clause = "GROUP BY ({})".format(", ".join(
            column_expr(dataset, gb, query, parsing_context)
            for gb in groupby))
        if query.has_totals():
            group_clause = "{} WITH TOTALS".format(group_clause)

    having_clause = ""
    having_conditions = query.get_having()
    if having_conditions:
        # HAVING is only meaningful together with a GROUP BY.
        assert groupby, "found HAVING clause with no GROUP BY"
        having_clause = u"HAVING {}".format(
            conditions_expr(dataset, having_conditions, query,
                            parsing_context))

    order_clause = ""
    if query.get_orderby():
        orderby = [
            column_expr(dataset, util.tuplify(ob), query, parsing_context)
            for ob in util.to_list(query.get_orderby())
        ]
        # A leading "-" on the formatted expression means descending order.
        orderby = [
            u"{} {}".format(ob.lstrip("-"),
                            "DESC" if ob.startswith("-") else "ASC")
            for ob in orderby
        ]
        order_clause = u"ORDER BY {}".format(", ".join(orderby))

    limitby_clause = ""
    if query.get_limitby() is not None:
        limitby_clause = "LIMIT {} BY {}".format(*query.get_limitby())

    limit_clause = ""
    if query.get_limit() is not None:
        limit_clause = "LIMIT {}, {}".format(query.get_offset(),
                                             query.get_limit())

    # Assemble only the non-empty clauses, in SQL clause order.
    self.__formatted_query = " ".join([
        c for c in [
            select_clause,
            from_clause,
            join_clause,
            prewhere_clause,
            where_clause,
            group_clause,
            having_clause,
            order_clause,
            limitby_clause,
            limit_clause,
        ] if c
    ])
def __init__(
    self,
    dataset: Dataset,
    query: Query,
    settings: RequestSettings,
    prewhere_conditions: Sequence[str],
) -> None:
    """
    Format a Query object into a ClickHouse SQL string, clause by clause,
    storing the result in self.__formatted_query. PREWHERE conditions are
    supplied by the caller rather than read from the query.
    """
    parsing_context = ParsingContext()

    # SELECT: group by expressions first, then aggregations, then plain
    # selected columns.
    aggregate_exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        column_expr(dataset, gb, query, parsing_context) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        column_expr(dataset, util.tuplify(colname), query, parsing_context)
        for colname in column_names
    ]
    select_clause = u'SELECT {}'.format(
        ', '.join(group_exprs + aggregate_exprs + selected_cols))

    from_clause = u'FROM {}'.format(query.get_data_source().format_from())
    if query.get_final():
        from_clause = u'{} FINAL'.format(from_clause)

    # An explicit sample rate takes precedence over the turbo sample rate.
    if query.get_sample():
        sample_rate = query.get_sample()
    elif settings.get_turbo():
        sample_rate = snuba_settings.TURBO_SAMPLE_RATE
    else:
        sample_rate = None
    if sample_rate:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample_rate)

    join_clause = ''
    if query.get_arrayjoin():
        join_clause = u'ARRAY JOIN {}'.format(query.get_arrayjoin())

    where_clause = ''
    if query.get_conditions():
        where_clause = u'WHERE {}'.format(
            conditions_expr(dataset, query.get_conditions(), query,
                            parsing_context))

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(
            conditions_expr(dataset, prewhere_conditions, query,
                            parsing_context))

    group_clause = ''
    if groupby:
        group_clause = 'GROUP BY ({})'.format(', '.join(
            column_expr(dataset, gb, query, parsing_context)
            for gb in groupby))
        if query.has_totals():
            group_clause = '{} WITH TOTALS'.format(group_clause)

    having_clause = ''
    having_conditions = query.get_having()
    if having_conditions:
        # HAVING is only meaningful together with a GROUP BY.
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(
            conditions_expr(dataset, having_conditions, query,
                            parsing_context))

    order_clause = ''
    if query.get_orderby():
        orderby = [
            column_expr(dataset, util.tuplify(ob), query, parsing_context)
            for ob in util.to_list(query.get_orderby())
        ]
        # A leading "-" on the formatted expression means descending order.
        orderby = [
            u'{} {}'.format(ob.lstrip('-'),
                            'DESC' if ob.startswith('-') else 'ASC')
            for ob in orderby
        ]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if query.get_limitby() is not None:
        limitby_clause = 'LIMIT {} BY {}'.format(*query.get_limitby())

    limit_clause = ''
    if query.get_limit() is not None:
        limit_clause = 'LIMIT {}, {}'.format(query.get_offset(),
                                             query.get_limit())

    # Assemble only the non-empty clauses, in SQL clause order.
    self.__formatted_query = ' '.join([
        c for c in [
            select_clause, from_clause, join_clause, prewhere_clause,
            where_clause, group_clause, having_clause, order_clause,
            limitby_clause, limit_clause
        ] if c
    ])
def format(self) -> str:
    """
    Generate a SQL string from the parameters.

    Reads both the raw request body and the parsed query object, builds
    each SQL clause and joins the non-empty ones in clause order.
    """
    body = self.__request.body
    query = self.__request.query
    source = self.__dataset \
        .get_dataset_schemas() \
        .get_read_schema() \
        .get_data_source()

    # SELECT: group by expressions first, then aggregations, then plain
    # selected columns.
    aggregate_exprs = [
        util.column_expr(self.__dataset, col, body, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        util.column_expr(self.__dataset, gb, body) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        util.column_expr(self.__dataset, util.tuplify(colname), body)
        for colname in column_names
    ]
    select_clause = u'SELECT {}'.format(
        ', '.join(group_exprs + aggregate_exprs + selected_cols))

    from_clause = u'FROM {}'.format(source)
    if self.__final:
        from_clause = u'{} FINAL'.format(from_clause)
    if query.get_sample():
        from_clause = u'{} SAMPLE {}'.format(from_clause, query.get_sample())

    join_clause = ''
    if 'arrayjoin' in body:
        join_clause = u'ARRAY JOIN {}'.format(body['arrayjoin'])

    where_clause = ''
    if query.get_conditions():
        where_clause = u'WHERE {}'.format(
            util.conditions_expr(self.__dataset, query.get_conditions(),
                                 body))

    prewhere_clause = ''
    if self.__prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(
            util.conditions_expr(self.__dataset, self.__prewhere_conditions,
                                 body))

    group_clause = ''
    if groupby:
        group_clause = 'GROUP BY ({})'.format(', '.join(
            util.column_expr(self.__dataset, gb, body) for gb in groupby))
        if body.get('totals', False):
            group_clause = '{} WITH TOTALS'.format(group_clause)

    having_clause = ''
    having_conditions = body.get('having', [])
    if having_conditions:
        # HAVING is only meaningful together with a GROUP BY.
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(
            util.conditions_expr(self.__dataset, having_conditions, body))

    order_clause = ''
    if query.get_orderby():
        orderby = [
            util.column_expr(self.__dataset, util.tuplify(ob), body)
            for ob in util.to_list(query.get_orderby())
        ]
        # A leading "-" on the formatted expression means descending order.
        orderby = [
            u'{} {}'.format(ob.lstrip('-'),
                            'DESC' if ob.startswith('-') else 'ASC')
            for ob in orderby
        ]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if 'limitby' in body:
        limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

    limit_clause = ''
    if 'limit' in body:
        limit_clause = 'LIMIT {}, {}'.format(query.get_offset(),
                                             body['limit'])

    # Assemble only the non-empty clauses, in SQL clause order.
    return ' '.join([
        c for c in [
            select_clause, from_clause, join_clause, prewhere_clause,
            where_clause, group_clause, having_clause, order_clause,
            limitby_clause, limit_clause
        ] if c
    ])
def replace_column(self, old_column: str, new_column: str) -> None:
    """
    Replaces a column in all fields of the query. The Query object is
    mutated in place while the internal fields are replaced.

    This behaves consistently with get_all_referenced_columns (which does
    not really behave correctly since it is missing a few fields that can
    contain columns). Will fix both when adding a better column
    abstraction.

    In the current implementation we can only replace a column identified
    by a string with another column identified by a string. This does not
    support replacing a column with a complex expression.
    Columns represented as strings include expressions like "tags[a]"
    or "f(column)".
    This method will replace them as well if requested, but that would not
    be a good idea since such columns are processed by column_expr later
    in the flow.
    """
    # Each clause is rewritten through the private replace helpers and
    # stored back via its setter; empty/absent clauses are skipped.
    if self.get_selected_columns():
        self.set_selected_columns(
            self.__replace_col_in_list(
                self.get_selected_columns(),
                old_column,
                new_column,
            ))
    if self.get_arrayjoin():
        self.set_arrayjoin(
            self.__replace_col_in_expression(self.get_arrayjoin(),
                                             old_column, new_column))
    if self.get_groupby():
        self.set_groupby(
            self.__replace_col_in_list(
                self.get_groupby(),
                old_column,
                new_column,
            ))
    if self.get_orderby():
        self.set_orderby(
            self.__replace_col_in_list(
                self.get_orderby(),
                old_column,
                new_column,
            ))
    if self.get_aggregations():
        # Aggregations are [function, column(s), alias] triples; only the
        # column(s) element is rewritten.
        self.set_aggregations([
            [
                aggr[0],
                self.__replace_col_in_expression(aggr[1], old_column,
                                                 new_column)
                if not isinstance(aggr[1], (list, tuple))
                # This can be an expression or a list of expressions
                else self.__replace_col_in_list(aggr[1], old_column,
                                                new_column),
                aggr[2],
            ] for aggr in to_list(self.get_aggregations())
        ])
    if self.get_conditions():
        self.set_conditions(
            self.__replace_col_in_condition(
                to_list(self.get_conditions()),
                old_column,
                new_column,
            ))
def parse_and_run_query(validated_body, timer):
    """
    Build a ClickHouse SQL string from a validated legacy query body and
    execute it through util.raw_query.

    Handles date-range clamping, mandatory timestamp/deleted/project
    conditions, FINAL / group-exclusion handling for replacements,
    sampling, PREWHERE promotion and all remaining SQL clauses.
    """
    # Work on a copy so the validated body passed to raw_query below is
    # not mutated.
    body = deepcopy(validated_body)
    turbo = body.get('turbo', False)
    max_days, table, date_align, config_sample, force_final, max_group_ids_exclude = state.get_configs([
        ('max_days', None),
        ('clickhouse_table', settings.CLICKHOUSE_TABLE),
        ('date_align_seconds', 1),
        ('sample', 1),
        # 1: always use FINAL, 0: never use final, undefined/None: use project setting.
        ('force_final', 0 if turbo else None),
        ('max_group_ids_exclude',
         settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE),
    ])
    stats = {}
    to_date = util.parse_datetime(body['to_date'], date_align)
    from_date = util.parse_datetime(body['from_date'], date_align)
    assert from_date <= to_date

    # Clamp the date range to at most max_days ending at to_date.
    if max_days is not None and (to_date - from_date).days > max_days:
        from_date = to_date - timedelta(days=max_days)

    where_conditions = body.get('conditions', [])
    where_conditions.extend([
        ('timestamp', '>=', from_date),
        ('timestamp', '<', to_date),
        ('deleted', '=', 0),
    ])
    # NOTE: we rely entirely on the schema to make sure that regular snuba
    # queries are required to send a project_id filter. Some other special
    # internal query types do not require a project_id filter.
    project_ids = util.to_list(body['project'])
    if project_ids:
        where_conditions.append(('project_id', 'IN', project_ids))

    having_conditions = body.get('having', [])

    # SELECT: group by expressions first, then aggregations, then plain
    # selected columns.
    aggregate_exprs = [
        util.column_expr(col, body, alias, agg)
        for (agg, col, alias) in body['aggregations']
    ]
    groupby = util.to_list(body['groupby'])
    group_exprs = [util.column_expr(gb, body) for gb in groupby]

    selected_cols = [
        util.column_expr(util.tuplify(colname), body)
        for colname in body.get('selected_columns', [])
    ]

    select_exprs = group_exprs + aggregate_exprs + selected_cols
    select_clause = u'SELECT {}'.format(', '.join(select_exprs))

    from_clause = u'FROM {}'.format(table)

    # For now, we only need FINAL if:
    # 1. The project has been marked as needing FINAL (in redis) because of recent
    #    replacements (and it affects too many groups for us just to exclude
    #    those groups from the query)
    # OR
    # 2. the force_final setting = 1
    needs_final, exclude_group_ids = get_projects_query_flags(project_ids)
    if len(exclude_group_ids) > max_group_ids_exclude:
        # Cap the number of groups to exclude by query and flip to using
        # FINAL if necessary
        needs_final = True
        exclude_group_ids = []

    used_final = False
    if force_final == 1 or (force_final is None and needs_final):
        from_clause = u'{} FINAL'.format(from_clause)
        used_final = True
    elif exclude_group_ids:
        # Cheaper than FINAL: just filter out the replaced groups.
        where_conditions.append(('group_id', 'NOT IN', exclude_group_ids))

    sample = body.get('sample',
                      settings.TURBO_SAMPLE_RATE if turbo else config_sample)
    if sample != 1:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample)

    joins = []
    if 'arrayjoin' in body:
        joins.append(u'ARRAY JOIN {}'.format(body['arrayjoin']))
    join_clause = ' '.join(joins)

    where_clause = ''
    if where_conditions:
        # Deduplicate conditions; tuplify makes them hashable. NOTE: this
        # makes their final order arbitrary.
        where_conditions = list(set(util.tuplify(where_conditions)))
        where_clause = u'WHERE {}'.format(
            util.conditions_expr(where_conditions, body))

    prewhere_conditions = []
    if settings.PREWHERE_KEYS:
        # Add any condition to PREWHERE if:
        # - It is a single top-level condition (not OR-nested), and
        # - Any of its referenced columns are in PREWHERE_KEYS
        prewhere_candidates = [
            (util.columns_in_expr(cond[0]), cond)
            for cond in where_conditions if util.is_condition(cond) and any(
                col in settings.PREWHERE_KEYS
                for col in util.columns_in_expr(cond[0]))
        ]
        # Use the condition that has the highest priority (based on the
        # position of its columns in the PREWHERE_KEYS list)
        prewhere_candidates = sorted([
            (min(settings.PREWHERE_KEYS.index(col) for col in cols
                 if col in settings.PREWHERE_KEYS), cond)
            for cols, cond in prewhere_candidates
        ])
        if prewhere_candidates:
            prewhere_conditions = [
                cond for _, cond in prewhere_candidates
            ][:settings.MAX_PREWHERE_CONDITIONS]

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(
            util.conditions_expr(prewhere_conditions, body))

    having_clause = ''
    if having_conditions:
        # HAVING is only meaningful together with a GROUP BY.
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(
            util.conditions_expr(having_conditions, body))

    group_clause = ', '.join(util.column_expr(gb, body) for gb in groupby)
    if group_clause:
        if body.get('totals', False):
            group_clause = 'GROUP BY ({}) WITH TOTALS'.format(group_clause)
        else:
            group_clause = 'GROUP BY ({})'.format(group_clause)

    order_clause = ''
    if body.get('orderby'):
        orderby = [
            util.column_expr(util.tuplify(ob), body)
            for ob in util.to_list(body['orderby'])
        ]
        # A leading "-" on the formatted expression means descending order.
        orderby = [
            u'{} {}'.format(ob.lstrip('-'),
                            'DESC' if ob.startswith('-') else 'ASC')
            for ob in orderby
        ]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if 'limitby' in body:
        limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

    limit_clause = ''
    if 'limit' in body:
        limit_clause = 'LIMIT {}, {}'.format(body.get('offset', 0),
                                             body['limit'])

    # Assemble only the non-empty clauses, in SQL clause order.
    sql = ' '.join([
        c for c in [
            select_clause, from_clause, join_clause, prewhere_clause,
            where_clause, group_clause, having_clause, order_clause,
            limitby_clause, limit_clause
        ] if c
    ])

    timer.mark('prepare_query')

    stats.update({
        'clickhouse_table': table,
        'final': used_final,
        # NOTE(review): `request` here is presumably the flask/framework
        # request global imported at module level — confirm.
        'referrer': request.referrer,
        'num_days': (to_date - from_date).days,
        'num_projects': len(project_ids),
        'sample': sample,
    })

    return util.raw_query(validated_body, sql, clickhouse_ro, timer, stats)