def validate(self, exp: Expression, dataset: Dataset) -> None:
    if not isinstance(exp, FunctionCall):
        return

    dataset_validators = dataset.get_function_call_validators()
    common_function_validators = (
        dataset_validators.keys() & default_validators.keys()
    )
    if common_function_validators:
        logger.warning(
            "Dataset validators are overlapping with default ones. Dataset: %s. Overlap %r",
            dataset,
            common_function_validators,
            exc_info=True,
        )

    validators = ChainMap(default_validators, dataset_validators)
    try:
        validator = validators.get(exp.function_name)
        if validator is not None:
            validator.validate(exp.parameters, dataset.get_abstract_columnset())
    except InvalidFunctionCall as exception:
        raise InvalidExpressionException(
            exp,
            f"Illegal call to function {exp.function_name}: {str(exception)}",
        ) from exception

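# A minimal sketch of the ChainMap precedence used above: lookups hit the
# first mapping that contains the key, so default_validators win over
# dataset-specific ones on overlap. The validator values are placeholders:
from collections import ChainMap

defaults = {"count": "default_validator"}
dataset_specific = {"count": "dataset_validator", "uniq": "dataset_validator"}
merged = ChainMap(defaults, dataset_specific)
assert merged["count"] == "default_validator"  # first mapping wins
assert merged["uniq"] == "dataset_validator"   # falls through to the second
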
def run(conn: Client, dataset: Dataset) -> None:
    schemas: MutableSequence[Schema] = []
    writable_storage = dataset.get_writable_storage()
    if writable_storage:
        writer = writable_storage.get_table_writer()
        schemas.append(writer.get_schema())
    for storage in dataset.get_all_storages():
        schemas.append(storage.get_schemas().get_read_schema())

    for schema in schemas:
        _run_schema(conn, schema)

def parse_query(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    """
    Parses the query body, generating the AST. This only takes into
    account the initial query body. Extensions are parsed by extension
    processors and are supposed to update the AST.

    Parsing includes two phases. The first transforms the json body into
    a minimal query Object, resolving expressions, conditions, etc.
    The second phase performs some query processing to provide a sane
    query to the dataset specific section.
    - It prevents alias shadowing.
    - It transforms columns from the tags[asd] form into
      SubscriptableReference.
    - Applies aliases to all columns that do not have one and that do not
      represent a reference to an existing alias.
      During query processing a column can be transformed into a different
      expression. It is essential to preserve the original column name so
      that the result set still has a column with the name provided by the
      user no matter which transformations we applied.
      By applying aliases at this stage every processor just needs to
      preserve them to guarantee the correctness of the query.
    - Expands all the references to aliases by inlining the expression
      to make aliasing transparent to all query processing phases.
      References to aliases are reintroduced at the end of query
      processing.
    """
    # TODO: Parse the entity out of the query body and select the correct one from the dataset
    entity = dataset.get_default_entity()

    query = _parse_query_impl(body, entity)
    # TODO: These should support composite queries.
    _validate_empty_table_names(query)
    _validate_aliases(query)
    _parse_subscriptables(query)
    _apply_column_aliases(query)
    _expand_aliases(query)
    # WARNING: These steps above assume table resolution did not happen
    # yet. If it is put earlier than here (unlikely), we need to adapt them.
    _deescape_aliases(query)
    _mangle_aliases(query)
    _validate_arrayjoin(query)

    # XXX: Select the entity to be used for the query. This step is temporary. Eventually
    # entity selection will be moved to Sentry and specified for all SnQL queries.
    selected_entity = dataset.select_entity(query)
    query_entity = QueryEntity(
        selected_entity, get_entity(selected_entity).get_data_model()
    )
    query.set_from_clause(query_entity)

    validate_query(query)
    return query

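# A minimal usage sketch. The body keys follow the legacy JSON format
# consumed by the parsing pipeline; the "events" dataset name and the
# factory import are assumptions for illustration, not part of this module:
from snuba.datasets.factory import get_dataset

def _example_parse() -> Query:
    body = {
        "selected_columns": ["event_id"],
        "conditions": [["project_id", "=", 1]],
        "orderby": ["-timestamp"],
    }
    return parse_query(body, get_dataset("events"))
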
def enforce_table_writer(dataset: Dataset) -> TableWriter:
    writable_storage = dataset.get_writable_storage()
    assert (
        writable_storage is not None
    ), f"Dataset {dataset} does not have a writable storage."
    return writable_storage.get_table_writer()

def _run_query_pipeline(
    dataset: Dataset,
    request: Request,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
) -> QueryResult:
    """
    Runs the query processing and execution pipeline for a Snuba Query.
    This means it takes a Dataset and a Request and returns the results
    of the query.

    This process includes:
    - Applying dataset specific syntax extensions (QueryExtension)
    - Applying dataset query processors on the abstract Snuba query.
    - Using the dataset provided ClickhouseQueryPlanBuilder to build a
      ClickhouseQueryPlan. This step transforms the Snuba Query into the
      Storage Query (that is contextual to the storage/s). From this
      point on nothing should depend on the dataset.
    - Executing the plan specific query processors.
    - Providing the newly built Query, the processors to be run for each
      DB query, and a QueryRunner to the QueryExecutionStrategy to
      actually run the DB Query.
    """
    if not request.settings.get_turbo() and SampleClauseFinder().visit(
        request.query.get_from_clause()
    ):
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    query_runner = partial(
        _run_and_apply_column_names,
        timer,
        query_metadata,
        request.referrer,
    )

    return (
        dataset.get_query_pipeline_builder()
        .build_execution_pipeline(request, query_runner)
        .execute()
    )

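# functools.partial pre-binds the leading arguments, so the execution
# pipeline only has to supply the trailing query argument when it invokes
# the runner. A self-contained sketch (the names are illustrative, not the
# real runner signature):
from functools import partial

def _demo_runner(timer: str, metadata: str, referrer: str, query: str) -> str:
    return f"{referrer}:{query}"

bound = partial(_demo_runner, "timer", "metadata", "web")
assert bound("my_query") == "web:my_query"
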
def drop(*, dataset: Dataset):
    for statement in dataset.get_dataset_schemas().get_drop_statements():
        clickhouse_rw.execute(statement.statement)

    ensure_table_exists(dataset, force=True)
    redis_client.flushdb()
    return ("ok", 200, {"Content-Type": "text/plain"})

def eventstream(*, dataset: Dataset):
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % version)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from snuba.consumer import ConsumerWorker

        worker = ConsumerWorker(storage, metrics=metrics)
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, metrics=metrics)

    processed = worker.process_message(message)
    if processed is not None:
        batch = [processed]
        worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})

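# The wire format, inferred from the reads above: a JSON array whose first
# element is the protocol version (must be 2) and whose second is the
# message type ("insert" selects the ConsumerWorker, anything else the
# ReplacerWorker). The event payload is a hypothetical placeholder:
example_record = [2, "insert", {"event_id": "abc123", "project_id": 1}]
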
def build_request(
    self, dataset: Dataset, timestamp: datetime, offset: Optional[int], timer: Timer
) -> Request:
    """
    Returns a Request that can be used to run a query via
    `parse_and_run_query`.

    :param dataset: The Dataset to build the request for
    :param timestamp: Date that the query should run up until
    :param offset: Maximum offset we should query for
    """
    schema = RequestSchema.build_with_extensions(
        dataset.get_extensions(),
        SubscriptionRequestSettings,
    )
    extra_conditions: Sequence[Condition] = []
    if offset is not None:
        extra_conditions = [[["ifnull", ["offset", 0]], "<=", offset]]

    return validate_request_content(
        {
            "project": self.project_id,
            "conditions": [*self.conditions, *extra_conditions],
            "aggregations": self.aggregations,
            "from_date": (timestamp - self.time_window).isoformat(),
            "to_date": timestamp.isoformat(),
        },
        schema,
        timer,
        dataset,
        SUBSCRIPTION_REFERRER,
    )

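# The extra condition above uses the legacy [lhs, operator, literal] shape,
# where the lhs `["ifnull", ["offset", 0]]` is a nested function call, i.e.
# ifnull(offset, 0) <= offset. A sketch with a hypothetical offset value:
example_offset = 100
example_extra_condition = [["ifnull", ["offset", 0]], "<=", example_offset]
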
def validate_request_content(
    body, schema: RequestSchema, timer, dataset: Dataset
) -> Request:
    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    try:
        request = schema.validate(body, source)
    except jsonschema.ValidationError as error:
        raise BadRequest(str(error)) from error

    timer.mark("validate_schema")
    return request

def delete_subscription(
    *, dataset: Dataset, partition: int, key: str, entity: Entity
) -> RespTuple:
    if entity not in dataset.get_all_entities():
        raise InvalidSubscriptionError(
            "Invalid subscription dataset and entity combination"
        )
    entity_key = ENTITY_NAME_LOOKUP[entity]
    SubscriptionDeleter(entity_key, PartitionId(partition)).delete(UUID(key))
    metrics.increment("subscription_deleted", tags={"entity": entity_key.value})
    return "ok", 202, {"Content-Type": "text/plain"}

def eventstream(*, dataset: Dataset) -> RespTuple:
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % version)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_default_entity().get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from arroyo.processing.strategies.streaming import (
            KafkaConsumerStrategyFactory,
        )

        from snuba.consumers.consumer import build_batch_writer, process_message

        table_writer = storage.get_table_writer()
        stream_loader = table_writer.get_stream_loader()
        strategy = KafkaConsumerStrategyFactory(
            stream_loader.get_pre_filter(),
            functools.partial(
                process_message, stream_loader.get_processor(), "consumer_group"
            ),
            build_batch_writer(table_writer, metrics=metrics),
            max_batch_size=1,
            max_batch_time=1.0,
            processes=None,
            input_block_size=None,
            output_block_size=None,
        ).create(lambda offsets: None)
        strategy.submit(message)
        strategy.close()
        strategy.join()
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, "consumer_group", metrics=metrics)
        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})

def __init__(
    self, clickhouse: ClickhousePool, dataset: Dataset, metrics: MetricsBackend
) -> None:
    self.clickhouse = clickhouse
    self.dataset = dataset
    self.metrics = metrics

    self.__all_column_names = [
        col.escaped
        for col in enforce_table_writer(dataset).get_schema().get_columns()
    ]
    self.__required_columns = [
        col.escaped for col in dataset.get_required_columns()
    ]

def dataset_query(dataset: Dataset, body, timer: Timer) -> Response:
    assert http_request.method == "POST"

    with sentry_sdk.start_span(description="build_schema", op="validate"):
        schema = RequestSchema.build_with_extensions(
            dataset.get_extensions(), HTTPRequestSettings
        )

    request = build_request(body, schema, timer, dataset, http_request.referrer)

    try:
        result = parse_and_run_query(dataset, request, timer)
    except QueryException as exception:
        status = 500
        details: Mapping[str, Any]

        cause = exception.__cause__
        if isinstance(cause, RateLimitExceeded):
            status = 429
            details = {
                "type": "rate-limited",
                "message": "rate limit exceeded",
            }
        elif isinstance(cause, ClickhouseError):
            details = {
                "type": "clickhouse",
                "message": str(cause),
                "code": cause.code,
            }
        elif isinstance(cause, Exception):
            details = {
                "type": "unknown",
                "message": str(cause),
            }
        else:
            raise  # exception should have been chained

        return Response(
            json.dumps(
                {"error": details, "timing": timer.for_json(), **exception.extra}
            ),
            status,
            {"Content-Type": "application/json"},
        )

    payload: MutableMapping[str, Any] = {**result.result, "timing": timer.for_json()}

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        payload.update(result.extra)

    return Response(json.dumps(payload), 200, {"Content-Type": "application/json"})

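# For illustration, a rate-limited query produces a 429 response whose JSON
# body has roughly this shape (the timing value is whatever timer.for_json()
# returns, and any exception.extra keys are merged in; both are placeholders
# here):
example_error_body = {
    "error": {"type": "rate-limited", "message": "rate limit exceeded"},
    "timing": {},  # timer.for_json() output
}
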
def truncate_dataset(dataset: Dataset) -> None:
    for storage in dataset.get_all_storages():
        cluster = storage.get_cluster()
        clickhouse = cluster.get_query_connection(ClickhouseClientSettings.MIGRATE)
        database = cluster.get_database()

        schema = storage.get_schema()
        if not isinstance(schema, TableSchema):
            continue  # skip storages without a concrete table schema
        table = schema.get_local_table_name()
        clickhouse.execute(f"TRUNCATE TABLE IF EXISTS {database}.{table}")

def dataset_query_view(*, dataset: Dataset, timer: Timer):
    if http_request.method == "GET":
        schema = RequestSchema.build_with_extensions(
            dataset.get_extensions(), HTTPRequestSettings
        )
        return render_template(
            "query.html",
            query_template=json.dumps(schema.generate_template(), indent=4),
        )
    elif http_request.method == "POST":
        body = parse_request_body(http_request)
        return dataset_query(dataset, body, timer)
    else:
        assert False, "unexpected fallthrough"

def test_no_schema_diffs(dataset: Dataset) -> None:
    from snuba.migrations.parse_schema import get_local_schema

    writable_storage = dataset.get_writable_storage()
    if not writable_storage:
        pytest.skip(f"{dataset!r} has no writable storage")

    clickhouse = writable_storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.MIGRATE
    )
    table_writer = writable_storage.get_table_writer()
    dataset_schema = table_writer.get_schema()
    local_table_name = dataset_schema.get_local_table_name()
    local_schema = get_local_schema(clickhouse, local_table_name)

    assert not dataset_schema.get_column_differences(local_schema)

def create_subscription(*, dataset: Dataset, timer: Timer, entity: Entity) -> RespTuple:
    if entity not in dataset.get_all_entities():
        raise InvalidSubscriptionError(
            "Invalid subscription dataset and entity combination"
        )
    entity_key = ENTITY_NAME_LOOKUP[entity]
    subscription = SubscriptionDataCodec(entity_key).decode(http_request.data)
    identifier = SubscriptionCreator(dataset, entity_key).create(subscription, timer)
    metrics.increment("subscription_created", tags={"entity": entity_key.value})
    return (
        json.dumps({"subscription_id": str(identifier)}),
        202,
        {"Content-Type": "application/json"},
    )

def dataset_query(dataset: Dataset, body, timer: Timer) -> Response:
    assert http_request.method == "POST"
    ensure_table_exists(dataset)
    return format_result(
        run_query(
            dataset,
            validate_request_content(
                body,
                RequestSchema.build_with_extensions(
                    dataset.get_extensions(), HTTPRequestSettings
                ),
                timer,
                dataset,
                http_request.referrer,
            ),
            timer,
        )
    )

def ensure_table_exists(dataset: Dataset, force: bool = False) -> None:
    if not force and _ensured.get(dataset, False):
        return

    assert local_dataset_mode(), "Cannot create table in distributed mode"

    from snuba import migrate

    # We cannot build distributed tables this way. So this only works in
    # local mode.
    for statement in dataset.get_dataset_schemas().get_create_statements():
        clickhouse_rw.execute(statement.statement)

    migrate.run(clickhouse_rw, dataset)

    _ensured[dataset] = True

def dataset_query_view(*, dataset: Dataset, timer: Timer) -> Union[Response, str]:
    if http_request.method == "GET":
        schema = RequestSchema.build_with_extensions(
            dataset.get_default_entity().get_extensions(),
            HTTPRequestSettings,
            Language.LEGACY,
        )
        return render_template(
            "query.html",
            query_template=json.dumps(schema.generate_template(), indent=4),
        )
    elif http_request.method == "POST":
        body = parse_request_body(http_request)
        _trace_transaction(dataset)
        return dataset_query(dataset, body, timer, Language.LEGACY)
    else:
        assert False, "unexpected fallthrough"

def truncate_dataset(dataset: Dataset) -> None:
    for entity in dataset.get_all_entities():
        for storage in entity.get_all_storages():
            cluster = storage.get_cluster()
            nodes = [*cluster.get_local_nodes(), *cluster.get_distributed_nodes()]
            for node in nodes:
                clickhouse = cluster.get_node_connection(
                    ClickhouseClientSettings.MIGRATE, node
                )
                database = cluster.get_database()

                schema = storage.get_schema()
                if not isinstance(schema, TableSchema):
                    continue  # skip storages without a concrete table schema
                table = schema.get_local_table_name()
                clickhouse.execute(f"TRUNCATE TABLE IF EXISTS {database}.{table}")

def parse_query(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    """
    Parses the query body, generating the AST. This only takes into
    account the initial query body. Extensions are parsed by extension
    processors and are supposed to update the AST.
    """
    try:
        return _parse_query_impl(body, dataset)
    except Exception as e:
        # During development there is no need to fail Snuba queries if the
        # parser has an issue; the production query is still run based on
        # the old query representation.
        # Once we are actually using the AST to build the Clickhouse query,
        # this try/except block will disappear.
        enforce_validity = state.get_config("query_parsing_enforce_validity", 0)
        if enforce_validity:
            raise e
        else:
            logger.exception("Failed to parse query")
            source = (
                dataset.get_dataset_schemas().get_read_schema().get_data_source()
            )
            return Query(body, source)

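# The parser kill switch above is a runtime config flag. A sketch of
# flipping it, assuming the set_config counterpart to state.get_config:
from snuba import state

state.set_config("query_parsing_enforce_validity", 1)  # fail hard on parse errors
assert state.get_config("query_parsing_enforce_validity", 0) == 1
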
def _run_query_pipeline(
    dataset: Dataset,
    request: Request,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
) -> RawQueryResult:
    """
    Runs the query processing and execution pipeline for a Snuba Query.
    This means it takes a Dataset and a Request and returns the results
    of the query.

    This process includes:
    - Applying dataset specific syntax extensions (QueryExtension)
    - Applying dataset query processors on the abstract Snuba query.
    - Using the dataset provided StorageQueryPlanBuilder to build a
      StorageQueryPlan. This step transforms the Snuba Query into the
      Storage Query (that is contextual to the storage/s). From this
      point on nothing should depend on the dataset.
    - Executing the storage specific query processors.
    - Providing the newly built Query and a QueryRunner to the
      QueryExecutionStrategy to actually run the DB Query.
    """
    # TODO: this will work perfectly with datasets that are not time series. Remove it.
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"]
    )

    if (
        request.query.get_sample() is not None and request.query.get_sample() != 1.0
    ) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    extensions = dataset.get_extensions()
    for name, extension in extensions.items():
        extension.get_processor().process_query(
            request.query, request.extensions[name], request.settings
        )

    # TODO: Fit this in a query processor. All query transformations should be driven by
    # datasets/storages and never hardcoded.
    if request.settings.get_turbo():
        request.query.set_final(False)

    for processor in dataset.get_query_processors():
        processor.process_query(request.query, request.settings)

    storage_query_plan = dataset.get_query_plan_builder().build_plan(request)

    # TODO: This below should be a storage specific query processor.
    relational_source = request.query.get_data_source()
    request.query.add_conditions(relational_source.get_mandatory_conditions())

    for processor in storage_query_plan.query_processors:
        processor.process_query(request.query, request.settings)

    query_runner = partial(
        _format_storage_query_and_run,
        dataset,
        timer,
        query_metadata,
        from_date,
        to_date,
    )

    return storage_query_plan.execution_strategy.execute(request, query_runner)

def parse_and_run_query(
    dataset: Dataset, request: Request, timer: Timer
) -> ClickhouseQueryResult:
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"]
    )

    if (
        request.query.get_sample() is not None and request.query.get_sample() != 1.0
    ) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    extensions = dataset.get_extensions()
    for name, extension in extensions.items():
        extension.get_processor().process_query(
            request.query, request.extensions[name], request.settings
        )

    request.query.add_conditions(dataset.default_conditions())

    if request.settings.get_turbo():
        request.query.set_final(False)

    for processor in dataset.get_query_processors():
        processor.process_query(request.query, request.settings)

    relational_source = request.query.get_data_source()
    request.query.add_conditions(relational_source.get_mandatory_conditions())

    source = relational_source.format_from()
    with sentry_sdk.start_span(description="create_query", op="db"):
        # TODO: consider moving the performance logic and the pre_where generation into
        # ClickhouseQuery since they are Clickhouse specific
        query = DictClickhouseQuery(dataset, request.query, request.settings)
    timer.mark("prepare_query")

    num_days = (to_date - from_date).days
    stats = {
        "clickhouse_table": source,
        "final": request.query.get_final(),
        "referrer": request.referrer,
        "num_days": num_days,
        "sample": request.query.get_sample(),
    }

    with sentry_sdk.configure_scope() as scope:
        if scope.span:
            scope.span.set_tag("dataset", type(dataset).__name__)
            scope.span.set_tag("referrer", http_request.referrer)
            scope.span.set_tag("timeframe_days", num_days)

    with sentry_sdk.start_span(description=query.format_sql(), op="db") as span:
        span.set_tag("dataset", type(dataset).__name__)
        span.set_tag("table", source)
        try:
            span.set_tag(
                "ast_query",
                AstClickhouseQuery(request.query, request.settings).format_sql(),
            )
        except Exception:
            logger.exception("Failed to format ast query")
        result = raw_query(
            request, query, NativeDriverReader(clickhouse_ro), timer, stats
        )

    with sentry_sdk.configure_scope() as scope:
        if scope.span:
            if "max_threads" in stats:
                scope.span.set_tag("max_threads", stats["max_threads"])

    return result

def _parse_query_impl(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    aggregate_exprs = []
    for aggregation in body.get("aggregations", []):
        assert isinstance(aggregation, (list, tuple))
        aggregation_function = aggregation[0]
        column_expr = aggregation[1]
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None

        aggregate_exprs.append(
            parse_aggregation(aggregation_function, column_expr, alias)
        )

    groupby_exprs = [
        parse_expression(tuplify(group_by))
        for group_by in to_list(body.get("groupby", []))
    ]
    select_exprs = [
        parse_expression(tuplify(select))
        for select in body.get("selected_columns", [])
    ]

    selected_cols = groupby_exprs + aggregate_exprs + select_exprs

    arrayjoin = body.get("arrayjoin")
    if arrayjoin:
        array_join_expr: Optional[Expression] = parse_expression(body["arrayjoin"])
    else:
        array_join_expr = None

    where_expr = parse_conditions_to_expr(
        body.get("conditions", []), dataset, arrayjoin
    )
    having_expr = parse_conditions_to_expr(body.get("having", []), dataset, arrayjoin)

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        if isinstance(orderby, str):
            match = NEGATE_RE.match(orderby)
            assert match is not None, f"Invalid Order By clause {orderby}"
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            match = NEGATE_RE.match(orderby[0])
            assert match is not None, f"Invalid Order By clause {orderby}"
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ValueError(f"Invalid Order By clause {orderby}")
        orderby_parsed = parse_expression(tuplify(orderby))
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            )
        )

    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    return Query(
        body,
        source,
        selected_columns=selected_cols,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=groupby_exprs,
        having=having_expr,
        order_by=orderby_exprs,
    )

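# Each legacy aggregation is a [function, column, alias] triple where the
# column and alias may be empty. Hypothetical examples:
example_aggregations = [
    ["count()", "", "times_seen"],            # no column argument
    ["uniq", "tags[user]", "unique_users"],   # aggregate over one column
]
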
def write(*, dataset: Dataset) -> RespTuple:
    return _write_to_entity(entity=dataset.get_default_entity())

def parse_conditions(
    operand_builder: Callable[[Any], TExpression],
    and_builder: Callable[[Sequence[TExpression]], Optional[TExpression]],
    or_builder: Callable[[Sequence[TExpression]], Optional[TExpression]],
    unpack_array_condition_builder: Callable[[TExpression, str, Any], TExpression],
    simple_condition_builder: Callable[[TExpression, str, Any], TExpression],
    dataset: Dataset,
    conditions: Any,
    array_join: Optional[str],
    depth: int = 0,
) -> Optional[TExpression]:
    """
    Return a boolean expression suitable for putting in the WHERE clause of
    the query. The expression is constructed by ANDing groups of OR
    expressions. Expansion of columns is handled, as is replacement of
    columns with aliases, if the column has already been expanded and
    aliased elsewhere.

    operand_builder: Builds the TExpression representing the left hand side
      of a simple condition. This can be as nested as the user wants.
    and_builder / or_builder: Combine a list of expressions in AND/OR.
    unpack_array_condition_builder: Deals with a special case where we
      unpack conditions on array columns. More details in the code.
    simple_condition_builder: Generates a simple condition made of an
      expression on the left hand side, an operator, and a literal on the
      right hand side.
    """
    from snuba.clickhouse.columns import Array

    if not conditions:
        return None

    if depth == 0:
        # dedupe conditions at the top level, but keep them in order
        sub = OrderedDict(
            (
                parse_conditions(
                    operand_builder,
                    and_builder,
                    or_builder,
                    unpack_array_condition_builder,
                    simple_condition_builder,
                    dataset,
                    cond,
                    array_join,
                    depth + 1,
                ),
                None,
            )
            for cond in conditions
        )
        return and_builder([s for s in sub.keys() if s])
    elif is_condition(conditions):
        lhs, op, lit = dataset.process_condition(conditions)

        # facilitate deduping IN conditions by sorting them.
        if op in ("IN", "NOT IN") and isinstance(lit, tuple):
            lit = tuple(sorted(lit))

        # If the LHS is a simple column name that refers to an array column
        # (and we are not arrayJoining on that column, which would make it
        # scalar again) and the RHS is a scalar value, we assume that the user
        # actually means to check if any (or all) items in the array match the
        # predicate, so we return an `any(x == value for x in array_column)`
        # type expression. We assume that operators looking for a specific value
        # (IN, =, LIKE) are looking for rows where any array value matches, and
        # exclusionary operators (NOT IN, NOT LIKE, !=) are looking for rows
        # where all elements match (eg. all NOT LIKE 'foo').
        columns = dataset.get_dataset_schemas().get_read_schema().get_columns()
        if (
            isinstance(lhs, str)
            and lhs in columns
            and isinstance(columns[lhs].type, Array)
            and columns[lhs].base_name != array_join
            and not isinstance(lit, (list, tuple))
        ):
            return unpack_array_condition_builder(operand_builder(lhs), op, lit)
        else:
            return simple_condition_builder(operand_builder(lhs), op, lit)
    elif depth == 1:
        sub_expression = (
            parse_conditions(
                operand_builder,
                and_builder,
                or_builder,
                unpack_array_condition_builder,
                simple_condition_builder,
                dataset,
                cond,
                array_join,
                depth + 1,
            )
            for cond in conditions
        )
        return or_builder([s for s in sub_expression if s])
    else:
        raise InvalidConditionException(str(conditions))

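# Condition nesting, for illustration: entries at the top level (depth 0)
# are ANDed together, while a nested list of conditions (depth 1) is ORed.
# The column names are hypothetical:
example_conditions = [
    ["project_id", "=", 1],                  # ANDed with the group below
    [
        ["environment", "=", "production"],  # ORed with each other
        ["environment", "=", "staging"],
    ],
]
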
def enforce_table_writer(dataset: Dataset) -> TableWriter:
    table_writer = dataset.get_table_writer()
    assert table_writer is not None, f"Dataset {dataset} is not writable"
    return table_writer

def dataset_query(
    dataset: Dataset, body: MutableMapping[str, Any], timer: Timer, language: Language
) -> Response:
    assert http_request.method == "POST"
    referrer = http_request.referrer or "<unknown>"  # mypy

    if language == Language.SNQL:
        metrics.increment("snql.query.incoming", tags={"referrer": referrer})
        parser: Callable[
            [RequestParts, RequestSettings, Dataset],
            Union[Query, CompositeQuery[Entity]],
        ] = partial(parse_snql_query, [])
    else:
        parser = parse_legacy_query

    with sentry_sdk.start_span(description="build_schema", op="validate"):
        schema = RequestSchema.build_with_extensions(
            dataset.get_default_entity().get_extensions(),
            HTTPRequestSettings,
            language,
        )

    request = build_request(
        body, parser, HTTPRequestSettings, schema, dataset, timer, referrer
    )

    try:
        result = parse_and_run_query(dataset, request, timer)

        # Some metrics to track the adoption of SnQL
        query_type = "simple"
        if language == Language.SNQL:
            if isinstance(request.query, CompositeQuery):
                if isinstance(request.query.get_from_clause(), JoinClause):
                    query_type = "join"
                else:
                    query_type = "subquery"

            metrics.increment(
                "snql.query.success", tags={"referrer": referrer, "type": query_type}
            )
    except QueryException as exception:
        status = 500
        details: Mapping[str, Any]

        cause = exception.__cause__
        if isinstance(cause, RateLimitExceeded):
            status = 429
            details = {
                "type": "rate-limited",
                "message": "rate limit exceeded",
            }
        elif isinstance(cause, ClickhouseError):
            details = {
                "type": "clickhouse",
                "message": str(cause),
                "code": cause.code,
            }
        elif isinstance(cause, Exception):
            details = {
                "type": "unknown",
                "message": str(cause),
            }
        else:
            raise  # exception should have been chained

        if language == Language.SNQL:
            metrics.increment(
                "snql.query.failed",
                tags={"referrer": referrer, "status": f"{status}"},
            )

        return Response(
            json.dumps(
                {"error": details, "timing": timer.for_json(), **exception.extra}
            ),
            status,
            {"Content-Type": "application/json"},
        )

    payload: MutableMapping[str, Any] = {**result.result, "timing": timer.for_json()}

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        payload.update(result.extra)

    return Response(json.dumps(payload), 200, {"Content-Type": "application/json"})

def _parse_query_impl(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    def build_selected_expressions(
        raw_expressions: Sequence[Any],
    ) -> List[SelectedExpression]:
        output = []
        for raw_expression in raw_expressions:
            exp = parse_expression(
                tuplify(raw_expression), dataset.get_abstract_columnset(), set()
            )
            output.append(
                SelectedExpression(
                    # An expression in the query can be a string or a
                    # complex list with an alias. In the second case
                    # we trust the parser to find the alias.
                    name=raw_expression
                    if isinstance(raw_expression, str)
                    else exp.alias,
                    expression=exp,
                )
            )
        return output

    aggregations = []
    for aggregation in body.get("aggregations", []):
        if not isinstance(aggregation, Sequence):
            raise ParsingException(
                (
                    f"Invalid aggregation structure {aggregation}. "
                    "It must be a sequence containing expression, column and alias."
                )
            )
        aggregation_function = aggregation[0]
        column_expr = aggregation[1]
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None

        aggregations.append(
            SelectedExpression(
                name=alias,
                expression=parse_aggregation(
                    aggregation_function,
                    column_expr,
                    alias,
                    dataset.get_abstract_columnset(),
                    set(),
                ),
            )
        )

    groupby_clause = build_selected_expressions(to_list(body.get("groupby", [])))

    select_clause = (
        groupby_clause
        + aggregations
        + build_selected_expressions(body.get("selected_columns", []))
    )

    array_join_cols = set()
    arrayjoin = body.get("arrayjoin")
    # TODO: Properly detect all array join columns in all clauses of the query.
    # This is missing an arrayJoin in condition with an alias that is then
    # used in the select.
    if arrayjoin:
        array_join_cols.add(arrayjoin)
        array_join_expr: Optional[Expression] = parse_expression(
            body["arrayjoin"], dataset.get_abstract_columnset(), {arrayjoin}
        )
    else:
        array_join_expr = None

    for select_expr in select_clause:
        if isinstance(select_expr.expression, FunctionCall):
            if select_expr.expression.function_name == "arrayJoin":
                parameters = select_expr.expression.parameters
                if len(parameters) != 1:
                    raise ParsingException(
                        "arrayJoin(...) only accepts a single parameter."
                    )
                if isinstance(parameters[0], Column):
                    array_join_cols.add(parameters[0].column_name)
                else:
                    # We only accept columns or functions that do not
                    # reference columns. We could not say whether we are
                    # actually arrayjoining on the values of the column
                    # if it is nested in an arbitrary function. But
                    # functions of literals are fine.
                    for e in parameters[0]:
                        if isinstance(e, Column):
                            raise ParsingException(
                                "arrayJoin(...) cannot contain columns nested in functions."
                            )

    where_expr = parse_conditions_to_expr(
        body.get("conditions", []), dataset, array_join_cols
    )
    having_expr = parse_conditions_to_expr(
        body.get("having", []), dataset, array_join_cols
    )

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        if isinstance(orderby, str):
            match = NEGATE_RE.match(orderby)
            if match is None:
                raise ParsingException(
                    (
                        f"Invalid Order By clause {orderby}. If the Order By is a string, "
                        "it must respect the format `[-]column`"
                    )
                )
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            match = NEGATE_RE.match(orderby[0])
            if match is None:
                raise ParsingException(
                    (
                        f"Invalid Order By clause {orderby}. If the Order By is an expression, "
                        "the function name must respect the format `[-]func_name`"
                    )
                )
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ParsingException(
                (
                    f"Invalid Order By clause {orderby}. The Clause was neither "
                    "a string nor a function call."
                )
            )
        orderby_parsed = parse_expression(
            tuplify(orderby), dataset.get_abstract_columnset(), set()
        )
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            )
        )

    return Query(
        body,
        None,
        selected_columns=select_clause,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=[g.expression for g in groupby_clause],
        having=having_expr,
        order_by=orderby_exprs,
    )

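# Order By parsing, for illustration: a leading "-" selects descending
# order. Assuming NEGATE_RE has the shape r"^(-?)(.*)$" (its module-level
# definition is not shown here):
import re

DEMO_NEGATE_RE = re.compile(r"^(-?)(.*)$")
direction, col = DEMO_NEGATE_RE.match("-timestamp").groups()
assert (direction, col) == ("-", "timestamp")  # "-" maps to OrderByDirection.DESC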