def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Collapse an unnecessary join into a simple single-table query.

    If the query's data source is a ``JoinClause`` but every referenced
    column is qualified with one single table alias, the join is not
    actually needed: replace the data source with that one table.

    Fix: corrected the typo "otpimize" -> "optimize" in the assertion
    message.
    """
    from_clause = query.get_data_source()
    if not isinstance(from_clause, JoinClause):
        # Nothing to optimize: the query is not a join.
        return

    referenced_columns = query.get_all_referenced_columns()
    referenced_aliases = set()
    for qualified_column in referenced_columns:
        # This will be much better when we will represent columns
        # with a more structured data type than strings.
        match = QUALIFIED_COLUMN_REGEX.match(qualified_column)
        if match:
            # match[1] is the first parenthesized group in the regex, thus
            # the table alias.
            referenced_aliases.add(match[1])

    assert (
        len(referenced_aliases) > 0
    ), "Trying to optimize a join query without aliases"
    if len(referenced_aliases) > 1:
        # Columns from more than one table are referenced: the join is
        # genuinely needed, leave the query untouched.
        return

    from_tables = from_clause.get_tables()
    table = from_tables[referenced_aliases.pop()]
    query.set_data_source(table)
def execute_query_with_caching(
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    """
    Run the query through the result cache when it is enabled.

    On a cache hit the stored result is returned without hitting
    ClickHouse at all; on a miss the query is executed (through the
    rate limiter) and the result is stored under the formatted query's
    cache key. When caching is disabled — either by config or because
    the query references too many columns — the query is executed
    directly.
    """
    # XXX: ``uncompressed_cache_max_cols`` is used to control both the result
    # cache, as well as the uncompressed cache. These should be independent.
    use_cache, uc_max = state.get_configs(
        [("use_cache", settings.USE_RESULT_CACHE), ("uncompressed_cache_max_cols", 5)]
    )
    column_count = len(clickhouse_query.get_all_referenced_columns())
    if column_count > uc_max:
        use_cache = False

    execute = partial(
        execute_query_with_rate_limits,
        clickhouse_query,
        request_settings,
        formatted_query,
        reader,
        timer,
        stats,
        query_settings,
    )

    with sentry_sdk.start_span(description="execute", op="db") as span:
        if not use_cache:
            # Caching disabled: execute directly (still inside the span).
            return execute()

        key = get_query_cache_key(formatted_query)
        result = cache.get(key)
        timer.mark("cache_get")
        stats["cache_hit"] = result is not None
        if result is not None:
            span.set_tag("cache", "hit")
            return result

        span.set_tag("cache", "miss")
        result = execute()
        cache.set(key, result)
        timer.mark("cache_set")
        return result
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    """
    Execute a query and return a result.
    """
    # Experiment: if we are going to grab more than X columns worth of
    # data, don't use the uncompressed cache in ClickHouse.
    max_cols = state.get_config("uncompressed_cache_max_cols", 5)
    referenced = clickhouse_query.get_all_referenced_columns()
    if len(referenced) > max_cols:
        query_settings["use_uncompressed_cache"] = 0

    # Consistent reads are pinned to the first shard replica, which
    # should have synchronously received any cluster writes before this
    # query is run.
    consistent = request_settings.get_consistent()
    stats["consistent"] = consistent
    if consistent:
        query_settings.update({"load_balancing": "in_order", "max_threads": 1})

    result = reader.execute(
        formatted_query,
        query_settings,
        with_totals=clickhouse_query.has_totals(),
    )
    timer.mark("execute")

    stats["result_rows"] = len(result["data"])
    stats["result_cols"] = len(result["meta"])
    return result