def execute(
    self,
    query: FormattedQuery,
    # TODO: move Clickhouse specific arguments into clickhouse.query.Query
    settings: Optional[Mapping[str, str]] = None,
    with_totals: bool = False,
    robust: bool = False,
    capture_trace: bool = False,
) -> Result:
    settings = {**settings} if settings is not None else {}

    query_id = None
    if "query_id" in settings:
        query_id = settings.pop("query_id")

    execute_func = (
        self.__client.execute_robust if robust is True else self.__client.execute
    )

    return self.__transform_result(
        execute_func(
            query.get_sql(),
            with_column_types=True,
            query_id=query_id,
            settings=settings,
            capture_trace=capture_trace,
        ),
        with_totals=with_totals,
    )
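# Minimal standalone sketch (split_query_id is a hypothetical helper, not
# part of the Reader API) of the settings handling above: the mapping is
# copied before mutation so the caller's settings stay untouched, and
# "query_id" is peeled off to travel as its own argument.
from typing import Mapping, MutableMapping, Optional, Tuple


def split_query_id(
    settings: Optional[Mapping[str, str]]
) -> Tuple[Optional[str], MutableMapping[str, str]]:
    remaining: MutableMapping[str, str] = {**settings} if settings is not None else {}
    return remaining.pop("query_id", None), remaining


query_id, remaining = split_query_id({"query_id": "abc123", "max_threads": "4"})
assert query_id == "abc123"
assert remaining == {"max_threads": "4"}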
def test_mock_consumer() -> None:
    storage = get_writable_storage(StorageKey.ERRORS)

    strategy = KafkaConsumerStrategyFactory(
        None,
        lambda message: None,
        build_mock_batch_writer(storage, True, TestingMetricsBackend(), 100, 50),
        max_batch_size=1,
        max_batch_time=1,
        processes=None,
        input_block_size=None,
        output_block_size=None,
        initialize_parallel_transform=None,
    ).create(lambda message: None)

    strategy.submit(
        Message(
            Partition(Topic("events"), 0),
            1,
            KafkaPayload(None, b"INVALID MESSAGE", []),
            datetime.now(),
        )
    )
    strategy.close()
    strategy.join()

    # If the mock was not applied correctly we would have data in Clickhouse
    reader = storage.get_cluster().get_reader()
    result = reader.execute(
        FormattedQuery([StringNode("SELECT count() as c from errors_local")])
    )
    assert result["data"] == [{"c": 0}]
def format_query(query: FormattableQuery) -> FormattedQuery:
    """
    Formats a Clickhouse Query from the AST representation into an
    intermediate structure that can either be serialized into a string
    (for clickhouse) or extracted as a sequence (for logging and tracing).

    This is the entry point for any type of query, whether simple or
    composite.
    """
    return FormattedQuery(_format_query_content(query, ClickhouseExpressionFormatter))
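# Hedged usage sketch for format_query's output type, reusing the
# FormattedQuery/StringNode names from the surrounding snippets and
# assuming the space-joining behavior exercised by test_composite_query
# below: the same object can be rendered as one SQL string or walked as
# a nested sequence.
formatted = FormattedQuery(
    [StringNode("SELECT count() AS c"), StringNode("FROM errors_local")]
)
assert formatted.get_sql() == "SELECT count() AS c FROM errors_local"
assert formatted.structured() == ["SELECT count() AS c", "FROM errors_local"]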
def execute(
    self,
    query: FormattedQuery,
    # TODO: move Clickhouse specific arguments into clickhouse.query.Query
    settings: Optional[Mapping[str, str]] = None,
    with_totals: bool = False,
) -> Result:
    settings = {**settings} if settings is not None else {}

    kwargs = {}
    if "query_id" in settings:
        kwargs["query_id"] = settings.pop("query_id")

    return self.__transform_result(
        self.__client.execute(
            query.get_sql(), with_column_types=True, settings=settings, **kwargs
        ),
        with_totals=with_totals,
    )
def format_query(query: FormattableQuery, settings: RequestSettings) -> FormattedQuery:
    """
    Formats a Clickhouse Query from the AST representation into an
    intermediate structure that can either be serialized into a string
    (for clickhouse) or extracted as a sequence (for logging and tracing).

    This is the entry point for any type of query, whether simple or
    composite.

    TODO: Remove this method entirely and move the sampling logic into
    a query processor.
    """
    if isinstance(query, Query):
        if settings.get_turbo() and not query.get_from_clause().sampling_rate:
            query.set_from_clause(
                replace(
                    query.get_from_clause(),
                    sampling_rate=snuba_settings.TURBO_SAMPLE_RATE,
                )
            )
    return FormattedQuery(_format_query_content(query))
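# Minimal standalone sketch of the dataclasses.replace pattern used in the
# turbo branch above. TableSource and the sample-rate value are hypothetical
# stand-ins for the real FROM clause and snuba.settings.TURBO_SAMPLE_RATE.
from dataclasses import dataclass, replace
from typing import Optional

TURBO_SAMPLE_RATE = 0.1  # assumed value


@dataclass(frozen=True)
class TableSource:
    table_name: str
    sampling_rate: Optional[float] = None


source = TableSource("errors_local")
if not source.sampling_rate:
    # Copy-with-one-field-changed; the original frozen object stays intact.
    source = replace(source, sampling_rate=TURBO_SAMPLE_RATE)
assert source == TableSource("errors_local", 0.1)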
def raw_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    stats: MutableMapping[str, Any],
    trace_id: Optional[str] = None,
) -> QueryResult:
    """
    Submits a raw SQL query to the DB and does some post-processing on it
    to fix some of the formatting issues in the result JSON.
    This function is not supposed to depend on anything higher level than
    the clickhouse query. If this function ends up depending on the
    dataset, something is wrong.
    """
    all_confs = state.get_all_configs()
    query_settings: MutableMapping[str, Any] = {
        k.split("/", 1)[1]: v
        for k, v in all_confs.items()
        if k.startswith("query_settings/")
    }

    timer.mark("get_configs")

    sql = formatted_query.get_sql()

    update_with_status = partial(
        update_query_metadata_and_stats,
        clickhouse_query,
        sql,
        timer,
        stats,
        query_metadata,
        query_settings,
        trace_id,
    )

    execute_query_strategy = (
        execute_query_with_readthrough_caching
        if state.get_config("use_readthrough_query_cache", 1)
        else execute_query_with_caching
    )

    try:
        result = execute_query_strategy(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            stats,
            query_settings,
        )
    except Exception as cause:
        if isinstance(cause, RateLimitExceeded):
            stats = update_with_status(QueryStatus.RATE_LIMITED)
        else:
            with configure_scope() as scope:
                if isinstance(cause, ClickhouseError):
                    scope.fingerprint = ["{{default}}", str(cause.code)]
                logger.exception("Error running query: %s\n%s", sql, cause)
            stats = update_with_status(QueryStatus.ERROR)
        raise QueryException({"stats": stats, "sql": sql}) from cause
    else:
        stats = update_with_status(QueryStatus.SUCCESS)
        return QueryResult(result, {"stats": stats, "sql": sql})
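# Reduced sketch of the functools.partial pattern behind update_with_status
# above: everything except the final status is bound up front, so each exit
# path (rate limited / error / success) supplies only the status. `record`
# is a hypothetical stand-in for update_query_metadata_and_stats.
from functools import partial
from typing import Any, Dict


def record(sql: str, status: str) -> Dict[str, Any]:
    return {"sql": sql, "status": status}


update_with_status = partial(record, "SELECT count() FROM errors_local")
assert update_with_status("success") == {
    "sql": "SELECT count() FROM errors_local",
    "status": "success",
}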
def get_query_cache_key(formatted_query: FormattedQuery) -> str:
    return md5(force_bytes(formatted_query.get_sql())).hexdigest()
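# Self-contained illustration of the same idea: the cache key is a digest of
# the formatted SQL text alone, so queries that format to identical SQL share
# a cache entry. (force_bytes amounts to a UTF-8 encode for plain strings.)
from hashlib import md5


def cache_key_for(sql: str) -> str:
    return md5(sql.encode("utf-8")).hexdigest()


assert cache_key_for("SELECT 1") == cache_key_for("SELECT 1")
assert cache_key_for("SELECT 1") != cache_key_for("SELECT 2")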
def format_query_anonymized(query: FormattableQuery) -> FormattedQuery:
    return FormattedQuery(
        _format_query_content(query, ClickHouseExpressionFormatterAnonymized)
    )
def format_snql_anonymized(
    query: Union[LogicalQuery, CompositeQuery[Entity]]
) -> FormattedQuery:
    return FormattedQuery(_format_query_content(query, ExpressionFormatterAnonymized))
def test_composite_query() -> None:
    query = FormattedQuery(
        [
            StringNode("SELECT avg(a)"),
            PaddingNode(
                "FROM",
                FormattedSubQuery(
                    [
                        StringNode("SELECT t_a.a, t_b.b"),
                        PaddingNode(
                            "FROM",
                            SequenceNode(
                                [
                                    PaddingNode(
                                        None,
                                        FormattedSubQuery(
                                            [
                                                StringNode("SELECT a, b"),
                                                StringNode("FROM somewhere"),
                                            ]
                                        ),
                                        "t_a",
                                    ),
                                    StringNode("INNER SEMI JOIN"),
                                    PaddingNode(
                                        None,
                                        FormattedSubQuery(
                                            [
                                                StringNode("SELECT a, b"),
                                                StringNode("FROM somewhere_else"),
                                            ]
                                        ),
                                        "t_b",
                                    ),
                                    StringNode("ON t_a.a = t_b.b"),
                                ],
                            ),
                        ),
                    ],
                ),
            ),
            StringNode("WHERE something something"),
        ],
    )

    assert query.get_sql(format="JSON") == (
        "SELECT avg(a) FROM "
        "(SELECT t_a.a, t_b.b FROM "
        "(SELECT a, b FROM somewhere) t_a "
        "INNER SEMI JOIN "
        "(SELECT a, b FROM somewhere_else) t_b "
        "ON t_a.a = t_b.b) "
        "WHERE something something "
        "FORMAT JSON"
    )

    assert query.structured() == [
        "SELECT avg(a)",
        [
            "FROM",
            [
                "SELECT t_a.a, t_b.b",
                [
                    "FROM",
                    [
                        [["SELECT a, b", "FROM somewhere"], "t_a"],
                        "INNER SEMI JOIN",
                        [["SELECT a, b", "FROM somewhere_else"], "t_b"],
                        "ON t_a.a = t_b.b",
                    ],
                ],
            ],
        ],
        "WHERE something something",
    ]
def raw_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    stats: MutableMapping[str, Any],
    trace_id: Optional[str] = None,
    robust: bool = False,
) -> QueryResult:
    """
    Submits a raw SQL query to the DB and does some post-processing on it
    to fix some of the formatting issues in the result JSON.
    This function is not supposed to depend on anything higher level than
    the clickhouse query. If this function ends up depending on the
    dataset, something is wrong.
    """
    all_confs = state.get_all_configs()
    clickhouse_query_settings: MutableMapping[str, Any] = {
        k.split("/", 1)[1]: v
        for k, v in all_confs.items()
        if k.startswith("query_settings/")
    }

    timer.mark("get_configs")

    sql = formatted_query.get_sql()

    update_with_status = partial(
        update_query_metadata_and_stats,
        clickhouse_query,
        sql,
        timer,
        stats,
        query_metadata,
        clickhouse_query_settings,
        trace_id,
    )

    execute_query_strategy = (
        execute_query_with_readthrough_caching
        if state.get_config("use_readthrough_query_cache", 1)
        else execute_query_with_caching
    )

    try:
        result = execute_query_strategy(
            clickhouse_query,
            query_settings,
            formatted_query,
            reader,
            timer,
            stats,
            clickhouse_query_settings,
            robust=robust,
        )
    except Exception as cause:
        if isinstance(cause, RateLimitExceeded):
            stats = update_with_status(QueryStatus.RATE_LIMITED)
        else:
            error_code = None
            with configure_scope() as scope:
                if isinstance(cause, ClickhouseError):
                    error_code = cause.code
                    scope.fingerprint = ["{{default}}", str(cause.code)]
                    if scope.span:
                        if cause.code == errors.ErrorCodes.TOO_SLOW:
                            sentry_sdk.set_tag("timeout", "predicted")
                        elif cause.code == errors.ErrorCodes.TIMEOUT_EXCEEDED:
                            sentry_sdk.set_tag("timeout", "query_timeout")
                        elif cause.code in (
                            errors.ErrorCodes.SOCKET_TIMEOUT,
                            errors.ErrorCodes.NETWORK_ERROR,
                        ):
                            sentry_sdk.set_tag("timeout", "network")
                elif isinstance(
                    cause,
                    (TimeoutError, ExecutionTimeoutError, TigerExecutionTimeoutError),
                ):
                    if scope.span:
                        sentry_sdk.set_tag("timeout", "cache_timeout")
                logger.exception("Error running query: %s\n%s", sql, cause)
                stats = update_with_status(QueryStatus.ERROR, error_code=error_code)
        raise QueryException(
            {
                "stats": stats,
                "sql": sql,
                "experiments": clickhouse_query.get_experiments(),
            }
        ) from cause
    else:
        stats = update_with_status(QueryStatus.SUCCESS, result["profile"])
        return QueryResult(
            result,
            {
                "stats": stats,
                "sql": sql,
                "experiments": clickhouse_query.get_experiments(),
            },
        )
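# Reduced, self-contained sketch of the code-to-tag mapping above. The
# numeric values are assumptions standing in for errors.ErrorCodes members;
# only the branching shape mirrors the real handler.
from typing import Optional

TOO_SLOW, TIMEOUT_EXCEEDED, SOCKET_TIMEOUT, NETWORK_ERROR = 160, 159, 209, 210


def timeout_tag(code: int) -> Optional[str]:
    if code == TOO_SLOW:
        return "predicted"
    if code == TIMEOUT_EXCEEDED:
        return "query_timeout"
    if code in (SOCKET_TIMEOUT, NETWORK_ERROR):
        return "network"
    return None


assert timeout_tag(TOO_SLOW) == "predicted"
assert timeout_tag(SOCKET_TIMEOUT) == "network"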