def select_storage(
    self, query: Query, request_settings: RequestSettings
) -> StorageAndMappers:
    # Serve from the read-only events table when the runtime flag is on and the
    # caller has not asked for a consistent (read-your-writes) query.
    use_readonly_storage = (
        state.get_config("enable_events_readonly_table", False)
        and not request_settings.get_consistent()
    )
    storage = self.__events_ro_table if use_readonly_storage else self.__events_table
    return StorageAndMappers(storage, event_translator)
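# Hedged usage sketch (not part of the snippet above): select_storage is gated
# on a runtime config flag. Flipping it might look like this, assuming the
# state module also exposes a set_config counterpart to the get_config call
# used above (an assumption; only get_config appears in the snippet).
from snuba import state  # assumption: the same state module used by select_storage

state.set_config("enable_events_readonly_table", 1)
# With the flag set and request_settings.get_consistent() returning False,
# select_storage returns the read-only events storage instead of the writable one.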
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    """
    Execute a query and return a result.
    """
    # Experiment: if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in ClickHouse.
    uc_max = state.get_config("uncompressed_cache_max_cols", 5)
    if (
        len(
            set(
                # Skip aliases when counting columns
                (c.table_name, c.column_name)
                for c in clickhouse_query.get_all_ast_referenced_columns()
            )
        )
        > uc_max
    ):
        query_settings["use_uncompressed_cache"] = 0

    # Force query to use the first shard replica, which
    # should have synchronously received any cluster writes
    # before this query is run.
    consistent = request_settings.get_consistent()
    stats["consistent"] = consistent
    if consistent:
        query_settings["load_balancing"] = "in_order"
        query_settings["max_threads"] = 1

    result = reader.execute(
        formatted_query,
        query_settings,
        with_totals=clickhouse_query.has_totals(),
    )

    timer.mark("execute")
    stats.update(
        {"result_rows": len(result["data"]), "result_cols": len(result["meta"])}
    )

    return result
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    """
    Execute a query and return a result.
    """
    # Experiment: if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in ClickHouse.
    uc_max = state.get_config("uncompressed_cache_max_cols", 5)
    assert isinstance(uc_max, int)
    column_counter = ReferencedColumnsCounter()
    column_counter.visit(clickhouse_query.get_from_clause())
    if column_counter.count_columns() > uc_max:
        query_settings["use_uncompressed_cache"] = 0

    # Force query to use the first shard replica, which
    # should have synchronously received any cluster writes
    # before this query is run.
    consistent = request_settings.get_consistent()
    stats["consistent"] = consistent
    if consistent:
        query_settings["load_balancing"] = "in_order"
        query_settings["max_threads"] = 1

    result = reader.execute(
        formatted_query,
        query_settings,
        with_totals=clickhouse_query.has_totals(),
        robust=robust,
    )

    timer.mark("execute")
    stats.update(
        {"result_rows": len(result["data"]), "result_cols": len(result["meta"])}
    )

    return result
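# Hedged, self-contained illustration of the uncompressed-cache experiment in
# execute_query above, isolated from the reader/timer machinery so the
# threshold behaviour is easy to exercise on its own. disable_uncompressed_cache
# and referenced_columns are hypothetical names, not part of the snippet.
from typing import Any, MutableMapping


def disable_uncompressed_cache(
    query_settings: MutableMapping[str, Any],
    referenced_columns: int,
    uc_max: int = 5,
) -> None:
    # Queries touching more than uc_max distinct columns skip ClickHouse's
    # uncompressed block cache, mirroring the gate in execute_query.
    if referenced_columns > uc_max:
        query_settings["use_uncompressed_cache"] = 0


_settings: MutableMapping[str, Any] = {}
disable_uncompressed_cache(_settings, referenced_columns=8)
assert _settings == {"use_uncompressed_cache": 0}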
def select_storage(
    self, query: Query, request_settings: RequestSettings
) -> StorageAndMappers:
    table = detect_table(
        query,
        self.__abstract_events_columns,
        self.__abstract_transactions_columns,
        True,
    )

    if table == TRANSACTIONS:
        return StorageAndMappers(
            self.__transactions_table, self.__transaction_translator
        )
    else:
        use_readonly_storage = (
            state.get_config("enable_events_readonly_table", False)
            and not request_settings.get_consistent()
        )
        return (
            StorageAndMappers(self.__events_ro_table, self.__event_translator)
            if use_readonly_storage
            else StorageAndMappers(self.__events_table, self.__event_translator)
        )
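# Design note on select_storage above: both events branches return the same
# self.__event_translator; only the physical table differs. That keeps the
# read-only replica transparent to query translation, and a consistent read
# (or the config flag being off) always falls back to the writable events table.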
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    # Only rewrite the query when the read-only table is enabled at runtime and
    # the caller has not requested a consistent (read-your-writes) query.
    readonly_enabled = state.get_config("enable_events_readonly_table", False)
    if not readonly_enabled:
        return

    if request_settings.get_consistent():
        return

    data_source = query.get_data_source()
    if data_source.format_from() != self.__table_to_replace:
        return

    # Swap the data source for an identical one pointing at the read-only table.
    new_source = TableSource(
        table_name=self.__read_only_table,
        columns=data_source.get_columns(),
        mandatory_conditions=data_source.get_mandatory_conditions(),
        prewhere_candidates=data_source.get_prewhere_candidates(),
    )
    query.set_data_source(new_source)
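# Hedged, self-contained illustration of the swap decision in process_query
# above. choose_table and the table names below are local stand-ins used for
# illustration, not the snippet's API.
def choose_table(
    readonly_enabled: bool,
    consistent: bool,
    current_table: str,
    table_to_replace: str,
    read_only_table: str,
) -> str:
    # Same three early-return guards as process_query, collapsed into one check.
    if readonly_enabled and not consistent and current_table == table_to_replace:
        return read_only_table
    return current_table


assert choose_table(True, False, "events_local", "events_local", "events_ro") == "events_ro"
assert choose_table(True, True, "events_local", "events_local", "events_ro") == "events_local"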
def callback_func(
    storage: str,
    query: Query,
    request_settings: RequestSettings,
    referrer: str,
    results: List[Result[QueryResult]],
) -> None:
    cache_hit = False
    is_duplicate = False
    # Captures whether any of the queries involved was a cache hit or duplicate, as
    # cache hits may be a cause of inconsistency between results.
    # Doesn't attempt to distinguish between all of the specific scenarios (one or
    # both queries, or splits of those queries, could have hit the cache).
    if any(result.result.extra["stats"].get("cache_hit", 0) for result in results):
        cache_hit = True
    elif any(
        result.result.extra["stats"].get("is_duplicate", 0) for result in results
    ):
        is_duplicate = True

    consistent = request_settings.get_consistent()

    if not results:
        metrics.increment(
            "query_result",
            tags={"storage": storage, "match": "empty", "referrer": referrer},
        )
        return

    primary_result = results.pop(0)
    primary_result_data = primary_result.result.result["data"]

    for result in results:
        result_data = result.result.result["data"]

        metrics.timing(
            "diff_ms",
            round((result.execution_time - primary_result.execution_time) * 1000),
            tags={
                "referrer": referrer,
                "cache_hit": str(cache_hit),
                "is_duplicate": str(is_duplicate),
                "consistent": str(consistent),
            },
        )

        # Do not bother diffing the actual results of sampled queries
        if request_settings.get_turbo() or query.get_sample() not in [None, 1.0]:
            return

        if result_data == primary_result_data:
            metrics.increment(
                "query_result",
                tags={
                    "storage": storage,
                    "match": "true",
                    "referrer": referrer,
                    "cache_hit": str(cache_hit),
                    "is_duplicate": str(is_duplicate),
                    "consistent": str(consistent),
                },
            )
        else:
            # Do not log cache hits to Sentry as it creates too much noise
            if cache_hit:
                continue

            reason = assign_reason_category(result_data, primary_result_data, referrer)
            metrics.increment(
                "query_result",
                tags={
                    "storage": storage,
                    "match": "false",
                    "referrer": referrer,
                    "reason": reason,
                    "cache_hit": str(cache_hit),
                    "is_duplicate": str(is_duplicate),
                    "consistent": str(consistent),
                },
            )

            if len(result_data) != len(primary_result_data):
                sentry_sdk.capture_message(
                    f"Non matching {storage} result - different length",
                    level="warning",
                    tags={
                        "referrer": referrer,
                        "storage": storage,
                        "reason": reason,
                        "cache_hit": str(cache_hit),
                        "is_duplicate": str(is_duplicate),
                        "consistent": str(consistent),
                    },
                    extras={
                        "query": format_query(query),
                        "primary_result": len(primary_result_data),
                        "other_result": len(result_data),
                    },
                )
                break

            # Avoid sending too much data to Sentry - just one row for now
            for idx in range(len(result_data)):
                if result_data[idx] != primary_result_data[idx]:
                    sentry_sdk.capture_message(
                        "Non matching result - different result",
                        level="warning",
                        tags={
                            "referrer": referrer,
                            "storage": storage,
                            "reason": reason,
                            "cache_hit": str(cache_hit),
                            "is_duplicate": str(is_duplicate),
                            "consistent": str(consistent),
                        },
                        extras={
                            "query": format_query(query),
                            "primary_result": primary_result_data[idx],
                            "other_result": result_data[idx],
                        },
                    )
                    break
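# Hedged usage note for callback_func above: it is shaped like a completion
# callback for an executor that runs the same query against a primary and a
# secondary storage and reports whether the results match. One plausible way to
# bind the static arguments up front (the "errors" storage name is an
# assumption for illustration, not taken from the snippet):
from functools import partial

errors_callback = partial(callback_func, "errors")
# errors_callback(query, request_settings, referrer, results) would then be
# invoked once both runs complete, with results[0] treated as the primary.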