def test_failures(
    query_body: MutableMapping[str, Any],
    expected_exception: Type[InvalidQueryException],
) -> None:
    with pytest.raises(expected_exception):
        events = get_dataset("events")
        parse_query(query_body, events)

def test_shadowing() -> None:
    with pytest.raises(AliasShadowingException):
        parse_query(
            {
                "selected_columns": [
                    ["f1", ["column1", "column2"], "f1_alias"],
                    ["f2", [], "f2_alias"],
                ],
                "aggregations": [
                    ["testF", ["platform", "field2"], "f1_alias"]  # Shadowing!
                ],
            },
            get_dataset("events"),
        )

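# Note: test_failures (above) receives its query_body / expected_exception arguments
# from a pytest parametrize decorator that is not part of this excerpt. Below is a
# minimal, hypothetical sketch of how such a case could be wired up, reusing the
# alias-shadowing body from test_shadowing; it assumes the same module context as the
# tests above (parse_query, get_dataset, AliasShadowingException), and the real
# fixtures in the test suite may differ.
import pytest


@pytest.mark.parametrize(
    "query_body, expected_exception",
    [
        (
            # Two expressions claim the alias "f1_alias", which the parser rejects.
            {
                "selected_columns": [["f1", ["column1"], "f1_alias"]],
                "aggregations": [["testF", ["platform"], "f1_alias"]],
            },
            AliasShadowingException,
        ),
    ],
)
def test_failures_sketch(query_body, expected_exception) -> None:
    with pytest.raises(expected_exception):
        parse_query(query_body, get_dataset("events"))
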
def test_find_projects(
    query_body: MutableMapping[str, Any], expected_projects: Set[int]
) -> None:
    events = get_dataset("events")
    query = identity_translate(parse_query(query_body, events))
    project_ids_ast = get_project_ids_in_query_ast(query, "project_id")
    assert project_ids_ast == expected_projects

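# test_find_projects (above) is likewise parametrized. A hedged sketch of plausible
# cases, assuming the same module context as the test above (parse_query, get_dataset,
# identity_translate, get_project_ids_in_query_ast); the actual fixtures may differ.
@pytest.mark.parametrize(
    "query_body, expected_projects",
    [
        # A single equality condition should yield exactly that project id.
        ({"conditions": [["project_id", "=", 1]]}, {1}),
        # An IN condition should yield every listed project id.
        ({"conditions": [["project_id", "IN", [1, 2, 3]]]}, {1, 2, 3}),
    ],
)
def test_find_projects_sketch(query_body, expected_projects) -> None:
    query = identity_translate(parse_query(query_body, get_dataset("events")))
    assert get_project_ids_in_query_ast(query, "project_id") == expected_projects
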
def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = {
        "selected_columns": ["event_id"],
        "conditions": [
            # Cannot test complex conditions based on explicit calls to the
            # `and` and `or` functions, because they would not be parsed as
            # datetime by the old parser.
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", ">=", "2000-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            [("timestamp", "<", "2019-09-18T12:00:00"), ("project_id", "IN", [1])],
            ("project_id", "IN", [1]),
        ],
    }

    events = get_dataset("events")
    query = parse_query(body, events)
    processors = events.get_default_entity().get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPRequestSettings())

    from_date_ast, to_date_ast = get_time_range(identity_translate(query), "timestamp")
    assert (
        from_date_ast is not None
        and isinstance(from_date_ast, datetime)
        and from_date_ast.isoformat() == "2019-09-18T10:00:00"
    )
    assert (
        to_date_ast is not None
        and isinstance(to_date_ast, datetime)
        and to_date_ast.isoformat() == "2019-09-19T12:00:00"
    )

def test_col_split_conditions(
    id_column: str, project_column: str, timestamp_column: str, query, expected_result
) -> None:
    dataset = get_dataset("events")
    query = parse_query(query, dataset)
    splitter = ColumnSplitQueryStrategy(id_column, project_column, timestamp_column)
    request = Request("a", query, HTTPRequestSettings(), {}, "r")
    entity = get_entity(query.get_from_clause().key)
    plan = entity.get_query_plan_builder().build_plan(request)

    def do_query(
        query: ClickhouseQuery, request_settings: RequestSettings,
    ) -> QueryResult:
        return QueryResult(
            {
                "data": [
                    {
                        id_column: "asd123",
                        project_column: 123,
                        timestamp_column: "2019-10-01 22:33:42",
                    }
                ]
            },
            {},
        )

    assert (
        splitter.execute(plan.query, HTTPRequestSettings(), do_query) is not None
    ) == expected_result

def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = {
        "selected_columns": ["event_id"],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", ">=", "2000-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            [("timestamp", "<", "2019-09-18T12:00:00"), ("project_id", "IN", [1])],
            ("project_id", "IN", [1]),
        ],
    }

    events = get_dataset("events")
    query = parse_query(body, events)
    processors = events.get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPRequestSettings())

    from_date_ast, to_date_ast = get_time_range(ClickhouseQuery(query), "timestamp")
    assert (
        from_date_ast is not None
        and isinstance(from_date_ast, datetime)
        and from_date_ast.isoformat() == "2019-09-18T10:00:00"
    )
    assert (
        to_date_ast is not None
        and isinstance(to_date_ast, datetime)
        and to_date_ast.isoformat() == "2019-09-19T12:00:00"
    )

def validate(self, value, dataset: Dataset, referrer: str) -> Request:
    value = validate_jsonschema(value, self.__composite_schema)

    query_body = {
        key: value.pop(key)
        for key in self.__query_schema["properties"].keys()
        if key in value
    }
    settings = {
        key: value.pop(key)
        for key in self.__settings_schema["properties"].keys()
        if key in value
    }

    extensions = {}
    for extension_name, extension_schema in self.__extension_schemas.items():
        extensions[extension_name] = {
            key: value.pop(key)
            for key in extension_schema["properties"].keys()
            if key in value
        }

    query = parse_query(query_body, dataset)

    return Request(query, self.__setting_class(**settings), extensions, referrer)

def test() -> None:
    cv = threading.Condition()
    query_result = QueryResult({}, {"stats": {}, "sql": ""})
    mock_query_runner = Mock(return_value=query_result)

    def callback_func(args: List[Tuple[str, QueryResult]]) -> None:
        with cv:
            cv.notify()

    mock_callback = Mock(side_effect=callback_func)

    query_body = {
        "selected_columns": ["type", "project_id"],
    }

    events = get_dataset("events")
    query = parse_query(query_body, events)

    events_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.EVENTS)
        ),
    )
    errors_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.ERRORS)
        ),
    )
    delegator = PipelineDelegator(
        query_pipeline_builders={
            "events": events_pipeline,
            "errors": errors_pipeline,
        },
        selector_func=lambda query, referrer: ("events", ["errors"]),
        callback_func=mock_callback,
    )

    with cv:
        request_settings = HTTPRequestSettings()
        delegator.build_execution_pipeline(
            Request("", query_body, query, request_settings, "ref"),
            mock_query_runner,
        ).execute()
        cv.wait(timeout=5)

    assert mock_query_runner.call_count == 2

    assert mock_callback.call_args == call(
        query,
        request_settings,
        "ref",
        [Result("events", query_result, ANY), Result("errors", query_result, ANY)],
    )

def validate(self, value, dataset: Dataset, referrer: str) -> Request:
    try:
        value = validate_jsonschema(value, self.__composite_schema)
    except jsonschema.ValidationError as error:
        raise JsonSchemaValidationException(str(error)) from error

    query_body = {
        key: value.pop(key)
        for key in self.__query_schema["properties"].keys()
        if key in value
    }
    settings = {
        key: value.pop(key)
        for key in self.__settings_schema["properties"].keys()
        if key in value
    }

    extensions = {}
    for extension_name, extension_schema in self.__extension_schemas.items():
        extensions[extension_name] = {
            key: value.pop(key)
            for key in extension_schema["properties"].keys()
            if key in value
        }

    query = parse_query(query_body, dataset)
    request_id = uuid.uuid4().hex

    return Request(
        request_id, query, self.__setting_class(**settings), extensions, referrer
    )

def test_data_source(
    query_body: MutableMapping[str, Any], expected_entity: EntityKey,
) -> None:
    dataset = get_dataset("discover")
    query = parse_query(query_body, dataset)

    assert query.get_from_clause().key == expected_entity

def test_format_expressions(
    query_body: MutableMapping[str, Any], expected_query: Query
) -> None:
    state.set_config("query_parsing_expand_aliases", 1)
    events = get_dataset("events")
    query = parse_query(query_body, events)

    eq, reason = query.equals(expected_query)
    assert eq, reason

def parse_legacy_query(
    request_parts: RequestParts,
    settings: RequestSettings,
    dataset: Dataset,
) -> Union[Query, CompositeQuery[Entity]]:
    query = parse_query(request_parts.query, dataset)
    apply_query_extensions(query, request_parts.extensions, settings)
    return query

def test_circular_aliases() -> None:
    with pytest.raises(CyclicAliasException):
        parse_query(
            {
                "selected_columns": [
                    ["f1", ["column1", "f2"], "f1"],
                    ["f2", ["f1"], "f2"],
                ],
            },
            get_dataset("events"),
        )

    with pytest.raises(CyclicAliasException):
        parse_query(
            {"selected_columns": [["f1", [["f2", ["c"], "f2"]], "c"]]},
            get_dataset("events"),
        )

def test_alias_validation(
    query_body: MutableMapping[str, Any], expected_result: bool
) -> None:
    events = get_dataset("events")
    query = parse_query(query_body, events)
    query_plan = events.get_query_plan_builder().build_plan(
        Request("", query, HTTPRequestSettings(), {}, "")
    )

    assert query_plan.query.validate_aliases() == expected_result

def test_tags_processor(query_body, expected_query) -> None:
    state.set_config("ast_tag_processor_enabled", 1)
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request_settings = HTTPRequestSettings()

    assert (
        DictClickhouseQuery(dataset, query, request_settings).format_sql()
        == expected_query
    )

def test_sessions_processing() -> None:
    query_body = {
        "selected_columns": ["duration_quantiles", "sessions", "users"],
        "conditions": [
            ["org_id", "=", 1],
            ["project_id", "=", 1],
            ["started", ">", "2020-01-01 12:00:00"],
        ],
    }

    sessions = get_dataset("sessions")
    query = parse_query(query_body, sessions)
    request = Request("", query_body, query, HTTPRequestSettings(), "")

    def query_runner(
        query: Query, settings: RequestSettings, reader: Reader
    ) -> QueryResult:
        quantiles = tuple(
            Literal(None, quant) for quant in [0.5, 0.75, 0.9, 0.95, 0.99, 1]
        )
        assert query.get_selected_columns() == [
            SelectedExpression(
                "duration_quantiles",
                CurriedFunctionCall(
                    "_snuba_duration_quantiles",
                    FunctionCall(None, "quantilesIfMerge", quantiles),
                    (Column(None, None, "duration_quantiles"),),
                ),
            ),
            SelectedExpression(
                "sessions",
                FunctionCall(
                    "_snuba_sessions",
                    "plus",
                    (
                        FunctionCall(
                            None, "countIfMerge", (Column(None, None, "sessions"),)
                        ),
                        FunctionCall(
                            None,
                            "sumIfMerge",
                            (Column(None, None, "sessions_preaggr"),),
                        ),
                    ),
                ),
            ),
            SelectedExpression(
                "users",
                FunctionCall(
                    "_snuba_users", "uniqIfMerge", (Column(None, None, "users"),)
                ),
            ),
        ]
        return QueryResult({}, {})

    sessions.get_default_entity().get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner
    ).execute()

def test_time_split_ast() -> None:
    """
    Test that the time split transforms the query properly both on the old
    representation and on the AST representation.
    """
    found_timestamps = []

    def do_query(
        query: ClickhouseQuery, request_settings: RequestSettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert from_date_ast is not None and isinstance(from_date_ast, datetime)
        assert to_date_ast is not None and isinstance(to_date_ast, datetime)

        found_timestamps.append((from_date_ast.isoformat(), to_date_ast.isoformat()))

        return QueryResult({"data": []}, {})

    body = {
        "selected_columns": [
            "event_id",
            "level",
            "logger",
            "server_name",
            "transaction",
            "timestamp",
            "project_id",
        ],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            ("project_id", "IN", [1]),
        ],
        "limit": 10,
        "orderby": ["-timestamp"],
    }

    query = parse_query(body, get_dataset("events"))
    entity = get_entity(query.get_from_clause().key)
    settings = HTTPRequestSettings()
    for p in entity.get_query_processors():
        p.process_query(query, settings)

    clickhouse_query = identity_translate(query)
    splitter = TimeSplitQueryStrategy("timestamp")
    splitter.execute(clickhouse_query, settings, do_query)

    assert found_timestamps == [
        ("2019-09-19T11:00:00", "2019-09-19T12:00:00"),
        ("2019-09-19T01:00:00", "2019-09-19T11:00:00"),
        ("2019-09-18T10:00:00", "2019-09-19T01:00:00"),
    ]

def test_find_projects(
    query_body: MutableMapping[str, Any], expected_projects: Set[int]
) -> None:
    events = get_dataset("events")
    query = parse_query(query_body, events)
    query = ClickhouseQuery(query)

    project_ids = get_project_ids_in_query(query, "project_id")
    assert project_ids == expected_projects

    project_ids_ast = get_project_ids_in_query_ast(query, "project_id")
    assert project_ids_ast == expected_projects

def parse_and_process(query_body: MutableMapping[str, Any]) -> ClickhouseQuery:
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request = Request("a", query, HTTPRequestSettings(), {}, "r")

    for p in dataset.get_query_processors():
        p.process_query(query, request.settings)

    plan = dataset.get_query_plan_builder().build_plan(request)
    ArrayJoinKeyValueOptimizer("tags").process_query(plan.query, request.settings)
    return plan.query

def test_alias_validation(
    query_body: MutableMapping[str, Any], expected_result: bool
) -> None:
    events = get_dataset("events")
    query = parse_query(query_body, events)
    settings = HTTPRequestSettings()
    query_plan = (
        events.get_default_entity()
        .get_query_pipeline_builder()
        .build_planner(query, settings)
    ).build_best_plan()
    execute_all_clickhouse_processors(query_plan, settings)

    assert query_plan.query.validate_aliases() == expected_result

def test_nested_optimizer(query_body, expected_condition) -> None:
    query = parse_query(query_body, get_dataset("transactions"))
    request_settings = HTTPRequestSettings()
    processor = NestedFieldConditionOptimizer(
        nested_col="tags",
        flattened_col="tags_map",
        timestamp_cols={"start_ts", "finish_ts"},
        beginning_of_time=datetime(2019, 12, 11, 0, 0, 0),
    )
    processor.process_query(query, request_settings)

    assert query.get_conditions() == expected_condition

def validate(
    self, value: MutableMapping[str, Any], dataset: Dataset, referrer: str
) -> Request:
    try:
        value = validate_jsonschema(value, self.__composite_schema)
    except jsonschema.ValidationError as error:
        raise JsonSchemaValidationException(str(error)) from error

    query_body = {
        key: value.pop(key)
        for key in self.__query_schema["properties"].keys()
        if key in value
    }
    settings = {
        key: value.pop(key)
        for key in self.__settings_schema["properties"].keys()
        if key in value
    }

    class_name = self.__setting_class
    if isinstance(class_name, type(HTTPRequestSettings)):
        settings_obj: Union[
            HTTPRequestSettings, SubscriptionRequestSettings
        ] = class_name(**settings)
    elif isinstance(class_name, type(SubscriptionRequestSettings)):
        settings_obj = class_name()

    extensions = {}
    for extension_name, extension_schema in self.__extension_schemas.items():
        extensions[extension_name] = {
            key: value.pop(key)
            for key in extension_schema["properties"].keys()
            if key in value
        }

    if self.__language == Language.SNQL:
        query = parse_snql_query(query_body["query"], dataset)
    else:
        query = parse_query(query_body, dataset)
        apply_query_extensions(query, extensions, settings_obj)

    request_id = uuid.uuid4().hex
    return Request(
        request_id,
        # TODO: Replace this with the actual query raw body.
        # This can have an impact on subscriptions so we need
        # to be careful with the change.
        ChainMap(query_body, *extensions.values()),
        query,
        settings_obj,
        referrer,
    )

def test_select_storage(
    query_body: MutableMapping[str, Any], expected_table: str
) -> None:
    sessions = get_dataset("sessions")
    query = parse_query(query_body, sessions)
    request = Request("", query_body, query, HTTPRequestSettings(), "")

    def query_runner(
        query: Query, settings: RequestSettings, reader: Reader
    ) -> QueryResult:
        assert query.get_from_clause().table_name == expected_table
        return QueryResult({}, {})

    sessions.get_default_entity().get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner
    ).execute()

def test_events_processing() -> None:
    query_body = {
        "selected_columns": ["tags[transaction]", "contexts[browser.name]"]
    }

    events_dataset = get_dataset("events")
    events_entity = events_dataset.get_default_entity()
    events_storage = events_entity.get_writable_storage()

    query = parse_query(query_body, events_dataset)
    request = Request("", query_body, query, HTTPRequestSettings(), "")

    def query_runner(
        query: Query, settings: RequestSettings, reader: Reader
    ) -> QueryResult:
        if events_storage.get_storage_key() == StorageKey.EVENTS:
            transaction_col_name = "transaction"
        else:
            transaction_col_name = "transaction_name"

        assert query.get_selected_columns_from_ast() == [
            SelectedExpression(
                "tags[transaction]",
                Column("_snuba_tags[transaction]", None, transaction_col_name),
            ),
            SelectedExpression(
                "contexts[browser.name]",
                FunctionCall(
                    "_snuba_contexts[browser.name]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "browser.name"),
                            ),
                        ),
                    ),
                ),
            ),
        ]
        return QueryResult({}, {})

    events_entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner
    ).execute()

def parse_and_process(query_body: MutableMapping[str, Any]) -> ClickhouseQuery:
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request = Request("a", query_body, query, HTTPRequestSettings(), "r")
    entity = get_entity(query.get_from_clause().key)

    for p in entity.get_query_processors():
        p.process_query(query, request.settings)

    ArrayJoinKeyValueOptimizer("tags").process_query(query, request.settings)
    query_plan = SingleStorageQueryPlanBuilder(
        storage=entity.get_writable_storage(), mappers=transaction_translator,
    ).build_and_rank_plans(query, request.settings)[0]

    return query_plan.query

def test_prewhere(
    query_body: MutableMapping[str, Any],
    keys: Sequence[str],
    new_ast_condition: Optional[Expression],
    new_prewhere_ast_condition: Optional[Expression],
) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    events = get_dataset("events")
    query = parse_query(query_body, events)
    query.set_data_source(TableSource("my_table", ColumnSet([]), None, keys))

    request_settings = HTTPRequestSettings()
    processor = PrewhereProcessor()
    processor.process_query(Query(query), request_settings)

    assert query.get_condition_from_ast() == new_ast_condition
    assert query.get_prewhere_ast() == new_prewhere_ast_condition

def test_nested_optimizer(query_body, expected_condition) -> None:
    transactions = get_dataset("transactions")
    query = parse_query(query_body, transactions)
    request_settings = HTTPRequestSettings()
    request = Request("", query, request_settings, {}, "")
    query_plan = transactions.get_query_plan_builder().build_plan(request)

    processor = NestedFieldConditionOptimizer(
        nested_col="tags",
        flattened_col="tags_map",
        timestamp_cols={"start_ts", "finish_ts"},
        beginning_of_time=datetime(2019, 12, 11, 0, 0, 0),
    )
    clickhouse_query = query_plan.query
    processor.process_query(clickhouse_query, request_settings)

    assert clickhouse_query.get_conditions() == expected_condition

def test_format_expressions(
    query_body: MutableMapping[str, Any], expected_query: Query
) -> None:
    events = get_dataset("events")
    query = parse_query(query_body, events)

    # We cannot just run == on the query objects. The content of the two
    # objects is different: one is the AST and the other is the AST + raw body.
    assert (
        query.get_selected_columns_from_ast()
        == expected_query.get_selected_columns_from_ast()
    )
    assert query.get_groupby_from_ast() == expected_query.get_groupby_from_ast()
    assert query.get_condition_from_ast() == expected_query.get_condition_from_ast()
    assert query.get_arrayjoin_from_ast() == expected_query.get_arrayjoin_from_ast()
    assert query.get_having_from_ast() == expected_query.get_having_from_ast()
    assert query.get_orderby_from_ast() == expected_query.get_orderby_from_ast()

def test_data_source(
    query_body: MutableMapping[str, Any], expected_table: str,
) -> None:
    request_settings = HTTPRequestSettings()
    dataset = get_dataset("discover")
    query = parse_query(query_body, dataset)
    request = Request("a", query, request_settings, {}, "r")

    for processor in get_dataset("discover").get_query_processors():
        processor.process_query(request.query, request.settings)

    plan = dataset.get_query_plan_builder().build_plan(request)
    for physical_processor in plan.plan_processors:
        physical_processor.process_query(plan.query, request.settings)

    assert plan.query.get_data_source().format_from() == expected_table, json.dumps(
        query_body
    )

def test_sessions_processing() -> None:
    query_body = {
        "selected_columns": ["duration_quantiles", "sessions", "users"]
    }

    sessions = get_dataset("sessions")
    query = parse_query(query_body, sessions)
    request = Request("", query, HTTPRequestSettings(), {}, "")

    query_plan = (
        sessions.get_default_entity().get_query_plan_builder().build_plan(request)
    )
    for clickhouse_processor in query_plan.plan_processors:
        clickhouse_processor.process_query(query_plan.query, request.settings)

    def query_runner(
        query: Query, settings: RequestSettings, reader: Reader[SqlQuery]
    ) -> QueryResult:
        assert query.get_selected_columns_from_ast() == [
            SelectedExpression(
                "duration_quantiles",
                CurriedFunctionCall(
                    "duration_quantiles",
                    FunctionCall(
                        None,
                        "quantilesIfMerge",
                        (Literal(None, 0.5), Literal(None, 0.9)),
                    ),
                    (Column(None, None, "duration_quantiles"),),
                ),
            ),
            SelectedExpression(
                "sessions",
                FunctionCall(
                    "sessions", "countIfMerge", (Column(None, None, "sessions"),)
                ),
            ),
            SelectedExpression(
                "users",
                FunctionCall("users", "uniqIfMerge", (Column(None, None, "users"),)),
            ),
        ]
        return QueryResult({}, {})

    query_plan.execution_strategy.execute(
        query_plan.query, request.settings, query_runner
    )