def test_time_split_ast() -> None:
    """
    Test that the time split transforms the query properly on the AST
    representation.
    """
    found_timestamps = []

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert from_date_ast is not None and isinstance(from_date_ast, datetime)
        assert to_date_ast is not None and isinstance(to_date_ast, datetime)

        found_timestamps.append((from_date_ast.isoformat(), to_date_ast.isoformat()))

        return QueryResult({"data": []}, {})

    body = {
        "selected_columns": [
            "event_id",
            "level",
            "logger",
            "server_name",
            "transaction",
            "timestamp",
            "project_id",
        ],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            ("project_id", "IN", [1]),
        ],
        "limit": 10,
        "orderby": ["-timestamp"],
    }

    query = parse_query(body, get_dataset("events"))
    entity = get_entity(query.get_from_clause().key)
    settings = HTTPRequestSettings()
    for p in entity.get_query_processors():
        p.process_query(query, settings)

    clickhouse_query = identity_translate(query)

    splitter = TimeSplitQueryStrategy("timestamp")
    splitter.execute(clickhouse_query, settings, do_query)

    assert found_timestamps == [
        ("2019-09-19T11:00:00", "2019-09-19T12:00:00"),
        ("2019-09-19T01:00:00", "2019-09-19T11:00:00"),
        ("2019-09-18T10:00:00", "2019-09-19T01:00:00"),
    ]
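# The expected windows asserted above encode the splitter's schedule: it walks
# backwards from the end of the range, starting with a one-hour window and
# widening whenever a window under-fills the limit. A minimal sketch of that
# schedule, assuming a 10x growth factor (GROWTH_RATE and expected_windows are
# illustrative names, not the library's API):
from datetime import datetime, timedelta
from typing import List, Tuple

GROWTH_RATE = 10  # assumed growth factor, matching the 1h -> 10h windows above

def expected_windows(
    from_date: datetime, to_date: datetime, split_step: int = 3600
) -> List[Tuple[str, str]]:
    windows = []
    split_end = to_date
    while split_end > from_date:
        split_start = max(split_end - timedelta(seconds=split_step), from_date)
        # Windows are emitted newest-first, mirroring the -timestamp ordering.
        windows.append((split_start.isoformat(), split_end.isoformat()))
        split_end = split_start
        split_step *= GROWTH_RATE
    return windows

assert expected_windows(
    datetime(2019, 9, 18, 10), datetime(2019, 9, 19, 12)
) == [
    ("2019-09-19T11:00:00", "2019-09-19T12:00:00"),
    ("2019-09-19T01:00:00", "2019-09-19T11:00:00"),
    ("2019-09-18T10:00:00", "2019-09-19T01:00:00"),
]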
def test_no_split(
    dataset_name: str, id_column: str, project_column: str, timestamp_column: str
) -> None:
    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        events.get_default_entity()
        .get_all_storages()[0]
        .get_schema()
        .get_data_source(),
    )

    def do_query(
        split_query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader,
    ) -> QueryResult:
        # Neither splitter applies here, so the runner must receive the
        # query unchanged. (The parameter is named split_query to avoid
        # shadowing the outer query, which made the original assertion a
        # tautology.)
        assert split_query == query
        return QueryResult({}, {})

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(
                id_column=id_column,
                project_column=project_column,
                timestamp_column=timestamp_column,
            ),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPRequestSettings(), do_query)
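# test_no_split takes its arguments from pytest parametrization that this
# excerpt does not show. A plausible (hypothetical) decorator, assuming the
# events and transactions datasets are the ones exercised; the body stub
# stands in for the definition above:
import pytest

@pytest.mark.parametrize(
    "dataset_name, id_column, project_column, timestamp_column",
    [
        ("events", "event_id", "project_id", "timestamp"),
        ("transactions", "event_id", "project_id", "finish_ts"),
    ],
)
def test_no_split(
    dataset_name: str, id_column: str, project_column: str, timestamp_column: str
) -> None: ...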
def test_time_split_ast() -> None:
    """
    Test that the time split transforms the query properly on the AST
    representation.
    """
    found_timestamps = []

    def do_query(
        query: ClickhouseQuery,
        query_settings: QuerySettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert from_date_ast is not None and isinstance(from_date_ast, datetime)
        assert to_date_ast is not None and isinstance(to_date_ast, datetime)

        found_timestamps.append((from_date_ast.isoformat(), to_date_ast.isoformat()))

        return QueryResult({"data": []}, {})

    body = """
        MATCH (events)
        SELECT event_id, level, logger, server_name, transaction, timestamp, project_id
        WHERE timestamp >= toDateTime('2019-09-18T10:00:00')
        AND timestamp < toDateTime('2019-09-19T12:00:00')
        AND project_id IN tuple(1)
        ORDER BY timestamp DESC
        LIMIT 10
        """

    query, _ = parse_snql_query(body, get_dataset("events"))
    entity = get_entity(query.get_from_clause().key)
    settings = HTTPQuerySettings()
    for p in entity.get_query_processors():
        p.process_query(query, settings)

    clickhouse_query = identity_translate(query)

    splitter = TimeSplitQueryStrategy("timestamp")
    splitter.execute(clickhouse_query, settings, do_query)

    assert found_timestamps == [
        ("2019-09-19T11:00:00", "2019-09-19T12:00:00"),
        ("2019-09-19T01:00:00", "2019-09-19T11:00:00"),
        ("2019-09-18T10:00:00", "2019-09-19T01:00:00"),
    ]
def test_col_split(
    dataset_name: str,
    id_column: str,
    project_column: str,
    timestamp_column: str,
    first_query_data: Sequence[MutableMapping[str, Any]],
    second_query_data: Sequence[MutableMapping[str, Any]],
) -> None:
    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader[SqlQuery],
    ) -> QueryResult:
        selected_cols = query.get_selected_columns()
        assert selected_cols == [
            c.expression.column_name
            for c in query.get_selected_columns_from_ast() or []
            if isinstance(c.expression, Column)
        ]
        if selected_cols == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, {})
        elif selected_cols == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, {})
        else:
            raise ValueError(f"Unexpected selected columns: {selected_cols}")

    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        LogicalQuery(
            {
                "selected_columns": list(second_query_data[0].keys()),
                "conditions": [""],
                "orderby": "events.event_id",
                "sample": 10,
                "limit": 100,
                "offset": 50,
            },
            events.get_all_storages()[0].get_schema().get_data_source(),
            selected_columns=[
                SelectedExpression(
                    name=col_name, expression=Column(None, None, col_name)
                )
                for col_name in second_query_data[0].keys()
            ],
        )
    )

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(id_column, project_column, timestamp_column),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPRequestSettings(), do_query)
def test_col_split(
    dataset_name: str,
    id_column: str,
    project_column: str,
    timestamp_column: str,
    first_query_data: Sequence[MutableMapping[str, Any]],
    second_query_data: Sequence[MutableMapping[str, Any]],
) -> None:
    def do_query(
        query: ClickhouseQuery,
        query_settings: QuerySettings,
        reader: Reader,
    ) -> QueryResult:
        selected_col_names = [
            c.expression.column_name
            for c in query.get_selected_columns() or []
            if isinstance(c.expression, Column)
        ]
        if selected_col_names == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, {})
        elif selected_col_names == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, {})
        else:
            raise ValueError(f"Unexpected selected columns: {selected_col_names}")

    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        events.get_default_entity()
        .get_all_storages()[0]
        .get_schema()
        .get_data_source(),
        selected_columns=[
            SelectedExpression(
                name=col_name, expression=Column(None, None, col_name)
            )
            for col_name in second_query_data[0].keys()
        ],
    )

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(id_column, project_column, timestamp_column),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPQuerySettings(), do_query)
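# The column split performs two passes: a narrow query over only the
# (id, project, timestamp) columns, then a full query restricted to the rows
# the first pass returned. Hypothetical fixture values showing the shapes
# first_query_data/second_query_data need for do_query above to match
# (column names follow the events parametrization; the values are made up):
first_query_data = [
    {
        "event_id": "e" * 32,
        "project_id": 2,
        "timestamp": "2019-10-01T22:33:42",
    }
]
second_query_data = [
    {
        **first_query_data[0],
        # Extra columns that only the second, full query selects.
        "level": "error",
        "logger": "javascript",
    }
]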
def test_no_split(
    dataset_name: str, id_column: str, project_column: str, timestamp_column: str
) -> None:
    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        LogicalQuery(
            {
                "selected_columns": ["event_id"],
                "conditions": [""],
                "orderby": "event_id",
                "sample": 10,
                "limit": 100,
                "offset": 50,
            },
            events.get_all_storages()[0].get_schema().get_data_source(),
        )
    )

    def do_query(
        split_query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader[SqlQuery],
    ) -> QueryResult:
        # Neither splitter applies here, so the runner must receive the
        # query unchanged. (Renamed from query to avoid shadowing the outer
        # variable, which made the original assertion a tautology.)
        assert split_query == query
        return QueryResult({}, {})

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(
                id_column=id_column,
                project_column=project_column,
                timestamp_column=timestamp_column,
            ),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPRequestSettings(), do_query)
    # during create statement
    # (https://github.com/ClickHouse/ClickHouse/issues/12586), so the
    # materialization is added with a migration.
    skipped_cols_on_creation={"_tags_hash_map"},
)

storage = WritableTableStorage(
    storage_key=StorageKey.TRANSACTIONS,
    storage_set_key=StorageSetKey.TRANSACTIONS,
    schema=schema,
    query_processors=[
        NestedFieldConditionOptimizer(
            "contexts",
            "_contexts_flattened",
            {"start_ts", "finish_ts"},
            BEGINNING_OF_TIME,
        ),
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        TransactionColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        ArrayJoinKeyValueOptimizer("measurements"),
        PrewhereProcessor(),
    ],
    stream_loader=KafkaStreamLoader(
        processor=TransactionsMessageProcessor(),
        default_topic="events",
    ),
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")],
    writer_options={"insert_allow_materialized_columns": 1},
)
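# MappingOptimizer("tags", "_tags_hash_map", ...) above rewrites tag lookups
# of the form tags.value[indexOf(tags.key, 'k')] = 'v' into membership checks
# against the materialized hash column. A minimal Python analogue of the idea
# (illustrative only: the real column stores ClickHouse cityHash64 hashes of
# "key=value" strings, and the rewrite happens on the query AST):
def tags_hash_map(tags: dict) -> set:
    # What the materialized _tags_hash_map column conceptually contains.
    return {hash(f"{k}={v}") for k, v in tags.items()}

def tag_equals(tags_hash: set, key: str, value: str) -> bool:
    # Analogue of has(_tags_hash_map, cityHash64('key=value')).
    return hash(f"{key}={value}") in tags_hash

assert tag_equals(tags_hash_map({"environment": "prod"}), "environment", "prod")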
"sentry:dist": "dist", "sentry:user": "******", }, "contexts": {"trace.trace_id": "trace_id"}, } ), MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"), ArrayJoinKeyValueOptimizer("tags"), UUIDColumnProcessor(set(["event_id", "trace_id"])), EventsBooleanContextsProcessor(), PrewhereProcessor( [ "event_id", "release", "message", "transaction_name", "environment", "project_id", ] ), ], query_splitters=[ ColumnSplitQueryStrategy( id_column="event_id", project_column="project_id", timestamp_column="timestamp", ), TimeSplitQueryStrategy(timestamp_col="timestamp"), ], )
    ]
)

schema = TableSchema(
    columns=columns,
    local_table_name="discover_local",
    dist_table_name="discover_dist",
    storage_set_key=StorageSetKey.DISCOVER,
    mandatory_conditions=mandatory_conditions,
    prewhere_candidates=[
        "event_id",
        "release",
        "message",
        "transaction_name",
        "environment",
        "project_id",
    ],
)

storage = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema,
    query_processors=[
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        EventIdColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        PrewhereProcessor(),
    ],
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="timestamp")],
)
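# Downstream, the plan builder reads these splitters off the storage and
# hands them to the execution strategy, mirroring the wiring in the tests
# above. Hypothetical helper (build_strategy is not a library function, and
# get_query_splitters is assumed to be the storage accessor):
def build_strategy(storage, cluster):
    return SimpleQueryPlanExecutionStrategy(
        cluster,
        [],  # no extra per-query processors in this sketch
        list(storage.get_query_splitters()),
    )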
def test_time_split_ast() -> None:
    """
    Test that the time split transforms the query properly both on the old
    representation and on the AST representation.
    """
    found_timestamps = []

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert from_date_ast is not None and isinstance(from_date_ast, datetime)
        assert to_date_ast is not None and isinstance(to_date_ast, datetime)

        conditions = query.get_conditions() or []
        from_date_str = next(
            (
                condition[2]
                for condition in conditions
                if condition[0] == "timestamp" and condition[1] == ">="
            ),
            None,
        )
        to_date_str = next(
            (
                condition[2]
                for condition in conditions
                if condition[0] == "timestamp" and condition[1] == "<"
            ),
            None,
        )
        assert from_date_str == from_date_ast.isoformat()
        assert to_date_str == to_date_ast.isoformat()

        found_timestamps.append((from_date_ast.isoformat(), to_date_ast.isoformat()))

        return QueryResult({"data": []}, {})

    body = {
        "selected_columns": [
            "event_id",
            "level",
            "logger",
            "server_name",
            "transaction",
            "timestamp",
            "project_id",
        ],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            ("project_id", "IN", [1]),
        ],
        "limit": 10,
        "orderby": ["-timestamp"],
    }

    events = get_dataset("events")
    query = parse_query(body, events)

    splitter = TimeSplitQueryStrategy("timestamp")
    splitter.execute(ClickhouseQuery(query), HTTPRequestSettings(), do_query)

    assert found_timestamps == [
        ("2019-09-19T11:00:00", "2019-09-19T12:00:00"),
        ("2019-09-19T01:00:00", "2019-09-19T11:00:00"),
        ("2019-09-18T10:00:00", "2019-09-19T01:00:00"),
    ]