def get_all_entities(self) -> Sequence[Entity]:
    return (
        get_entity(EntityKey.METRICS_COUNTERS),
        get_entity(EntityKey.METRICS_DISTRIBUTIONS),
        get_entity(EntityKey.METRICS_SETS),
        get_entity(EntityKey.ORG_METRICS_COUNTERS),
    )
def test_entity_column_validation(
    query_body: str, expected_query: LogicalQuery, set_configs: Any
) -> None:
    state.set_config("query_parsing_expand_aliases", 1)
    events = get_dataset("events")

    # TODO: Potentially remove this once entities have actual join relationships
    mapping = {
        "contains": (EntityKey.TRANSACTIONS, "event_id"),
        "connected": (EntityKey.SPANS, "trace_id"),
    }

    def events_mock(relationship: str) -> JoinRelationship:
        entity_key, rhs_column = mapping[relationship]
        return JoinRelationship(
            rhs_entity=entity_key,
            join_type=JoinType.INNER,
            columns=[("event_id", rhs_column)],
            equivalences=[],
        )

    events_entity = get_entity(EntityKey.EVENTS)
    old_get_join = events_entity.get_join_relationship

    try:
        setattr(events_entity, "get_join_relationship", events_mock)
        query = parse_snql_query(query_body, [], events)
        eq, reason = query.equals(expected_query)
        assert eq, reason
    finally:
        setattr(events_entity, "get_join_relationship", old_get_join)
def setup_method(self, test_method: Any) -> None:
    super().setup_method(test_method)

    # values for test data
    self.metric_id = 1002
    self.org_id = 103
    self.project_ids = [1, 2]  # 2 projects
    self.seconds = 180 * 60

    self.default_tags = {
        TAG_1_KEY: TAG_1_VALUE_1,
        TAG_2_KEY: TAG_2_VALUE_1,
        TAG_3_KEY: TAG_3_VALUE_1,
        TAG_4_KEY: TAG_4_VALUE_1,
    }
    self.skew = timedelta(seconds=self.seconds)

    self.base_time = utc_yesterday_12_15() - timedelta(minutes=self.seconds)
    self.storage = cast(
        WritableTableStorage,
        get_entity(EntityKey.METRICS_SETS).get_writable_storage(),
    )
    self.unique_set_values = 100
    self.generate_sets()
def test_failures(query_body: str, message: str) -> None:
    state.set_config("query_parsing_expand_aliases", 1)
    # TODO: Potentially remove this once entities have actual join relationships
    mapping = {
        "contains": (EntityKey.TRANSACTIONS, "event_id"),
        "assigned": (EntityKey.GROUPASSIGNEE, "group_id"),
        "bookmark": (EntityKey.GROUPEDMESSAGES, "first_release_id"),
        "activity": (EntityKey.SESSIONS, "org_id"),
    }

    def events_mock(relationship: str) -> Optional[JoinRelationship]:
        if relationship not in mapping:
            return None
        entity_key, rhs_column = mapping[relationship]
        return JoinRelationship(
            rhs_entity=entity_key,
            join_type=JoinType.INNER,
            columns=[("event_id", rhs_column)],
            equivalences=[],
        )

    events = get_dataset("events")
    events_entity = get_entity(EntityKey.EVENTS)
    setattr(events_entity, "get_join_relationship", events_mock)

    with pytest.raises(ParsingException, match=re.escape(message)):
        parse_snql_query(query_body, [], events)
def visit_relationship_match(
    self,
    node: Node,
    visited_children: Tuple[
        Any,
        IndividualNode[QueryEntity],
        Any,
        Node,
        Any,
        IndividualNode[QueryEntity],
    ],
) -> RelationshipTuple:
    _, lhs, _, relationship, _, rhs = visited_children
    assert isinstance(lhs.data_source, QueryEntity)
    assert isinstance(rhs.data_source, QueryEntity)
    lhs_entity = get_entity(lhs.data_source.key)
    data = lhs_entity.get_join_relationship(relationship)
    if data is None:
        raise ParsingException(
            f"{lhs.data_source.key.value} does not have a join relationship -[{relationship}]->"
        )
    elif data.rhs_entity != rhs.data_source.key:
        raise ParsingException(
            f"-[{relationship}]-> cannot be used to join {lhs.data_source.key.value} to {rhs.data_source.key.value}"
        )

    return RelationshipTuple(lhs, relationship, rhs, data)
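# A hedged illustration (not taken from this file) of the SnQL join syntax this visitor
# resolves: the -[relationship]-> arrow names the join that get_join_relationship() is
# asked to look up on the left-hand entity. The entities, columns and the "contains"
# relationship below are assumptions for the example only, mirroring the mapping mocked
# in the parser tests above.
example_join_snql = """
    MATCH (e: events) -[contains]-> (t: transactions)
    SELECT e.event_id, t.trace_id
    WHERE e.project_id = 1
"""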
def column_expr(
    self,
    column_name: str,
    query: Query,
    parsing_context: ParsingContext,
    table_alias: str = "",
) -> Union[None, Any]:
    detected_entity = detect_table(
        query, self.__events_columns, self.__transactions_columns, False,
    )
    if detected_entity == TRANSACTIONS:
        if column_name == "group_id":
            # TODO: We return 0 here instead of NULL so conditions like group_id
            # in (1, 2, 3) will work, since Clickhouse won't run a query like:
            # SELECT (NULL AS group_id) FROM transactions WHERE group_id IN (1, 2, 3)
            # When we have the query AST, we should solve this by transforming the
            # nonsensical conditions instead.
            return "0"
        if self.__events_columns.get(column_name):
            return "NULL"
    else:
        if column_name == "release":
            column_name = "tags[sentry:release]"
        if column_name == "dist":
            column_name = "tags[sentry:dist]"
        if column_name == "user":
            column_name = "tags[sentry:user]"
        if self.__transactions_columns.get(column_name):
            return "NULL"

    return get_entity(detected_entity).column_expr(
        column_name, query, parsing_context
    )
def test_format_expressions(query_body: str, expected_query: LogicalQuery) -> None:
    state.set_config("query_parsing_expand_aliases", 1)
    events = get_dataset("events")
    # TODO: Potentially remove this once entities have actual join relationships
    mapping = {
        "contains": (EntityKey.TRANSACTIONS, "event_id"),
        "assigned": (EntityKey.GROUPASSIGNEE, "group_id"),
        "bookmark": (EntityKey.GROUPEDMESSAGES, "first_release_id"),
        "activity": (EntityKey.SESSIONS, "org_id"),
    }

    def events_mock(relationship: str) -> JoinRelationship:
        entity_key, rhs_column = mapping[relationship]
        return JoinRelationship(
            rhs_entity=entity_key,
            join_type=JoinType.INNER,
            columns=[("event_id", rhs_column)],
            equivalences=[],
        )

    events_entity = get_entity(EntityKey.EVENTS)
    setattr(events_entity, "get_join_relationship", events_mock)

    query = parse_snql_query(query_body, events)
    eq, reason = query.equals(expected_query)
    assert eq, reason
def test_functions(
    default_validators: Mapping[str, FunctionCallValidator],
    entity_validators: Mapping[str, FunctionCallValidator],
    exception: Optional[Type[InvalidExpressionException]],
) -> None:
    fn_cached = functions.default_validators
    functions.default_validators = default_validators

    entity_return = MagicMock()
    entity_return.return_value = entity_validators
    events_entity = get_entity(EntityKey.EVENTS)
    cached = events_entity.get_function_call_validators
    setattr(events_entity, "get_function_call_validators", entity_return)
    data_source = QueryEntity(EntityKey.EVENTS, ColumnSet([]))

    expression = FunctionCall(
        None, "f", (Column(alias=None, table_name=None, column_name="col"),)
    )

    if exception is None:
        FunctionCallsValidator().validate(expression, data_source)
    else:
        with pytest.raises(exception):
            FunctionCallsValidator().validate(expression, data_source)

    # TODO: This should use a fixture to do this
    setattr(events_entity, "get_function_call_validators", cached)
    functions.default_validators = fn_cached
def test_outcomes_columns_validation(key: EntityKey) -> None:
    entity = get_entity(key)
    query_entity = QueryEntity(key, entity.get_data_model())

    bad_query = LogicalQuery(
        query_entity,
        selected_columns=[
            SelectedExpression("asdf", Column("_snuba_asdf", None, "asdf")),
        ],
    )

    good_query = LogicalQuery(
        query_entity,
        selected_columns=[
            SelectedExpression(
                column.name, Column(f"_snuba_{column.name}", None, column.name)
            )
            for column in entity.get_data_model().columns
        ],
    )

    validator = EntityContainsColumnsValidator(
        entity.get_data_model(), validation_mode=ColumnValidationMode.ERROR
    )

    with pytest.raises(InvalidQueryException):
        validator.validate(bad_query)

    validator.validate(good_query)
def test_col_split_conditions(
    id_column: str, project_column: str, timestamp_column: str, query, expected_result
) -> None:
    dataset = get_dataset("events")
    query = parse_query(query, dataset)
    splitter = ColumnSplitQueryStrategy(id_column, project_column, timestamp_column)
    request = Request("a", query, HTTPRequestSettings(), {}, "r")
    entity = get_entity(query.get_from_clause().key)
    plan = entity.get_query_plan_builder().build_plan(request)

    def do_query(
        query: ClickhouseQuery, request_settings: RequestSettings,
    ) -> QueryResult:
        return QueryResult(
            {
                "data": [
                    {
                        id_column: "asd123",
                        project_column: 123,
                        timestamp_column: "2019-10-01 22:33:42",
                    }
                ]
            },
            {},
        )

    assert (
        splitter.execute(plan.query, HTTPRequestSettings(), do_query) is not None
    ) == expected_result
def setup_method(self, test_method: Any) -> None:
    super().setup_method(test_method)

    # values for test data
    self.metric_id = 1010
    self.org_id = 103
    self.project_ids = [1, 2]  # 2 projects
    self.seconds = 180 * 60
    self.d_range_min, self.d_range_max = (0, 100)

    self.default_tags = {
        TAG_1_KEY: TAG_1_VALUE_1,
        TAG_2_KEY: TAG_2_VALUE_1,
        TAG_3_KEY: TAG_3_VALUE_1,
        TAG_4_KEY: TAG_4_VALUE_1,
    }
    self.skew = timedelta(seconds=self.seconds)

    self.base_time = utc_yesterday_12_15() - timedelta(seconds=self.seconds)
    self.storage = cast(
        WritableTableStorage,
        get_entity(EntityKey.METRICS_DISTRIBUTIONS).get_writable_storage(),
    )
    self.generate_uniform_distributions()
def describe(entity_name: str) -> None:
    try:
        entity = get_entity(EntityKey(entity_name))
        click.echo(f"Entity {entity_name}")
        entity.describe().accept(CLIDescriber())
    except InvalidEntityError:
        click.echo(f"Entity {entity_name} does not exist or is not registered.")
def test_complex_conditions_expr(actual, expected) -> None:
    entity = get_entity(EntityKey.EVENTS)
    assert (
        parse_function_to_expr(
            actual, entity.get_data_model(), {"sdk_integrations", "tags.key"}
        )
        == expected
    ), actual
def __init__(self, dataset: Dataset, entity_key: EntityKey):
    self.dataset = dataset
    self.entity_key = entity_key
    entity = get_entity(entity_key)
    self.__partitioner = TopicSubscriptionDataPartitioner(
        enforce_table_writer(entity).get_stream_loader().get_default_topic_spec()
    )
def _align_max_days_date_align(
    key: EntityKey,
    old_top_level: Sequence[Expression],
    max_days: Optional[int],
    date_align: int,
    alias: Optional[str] = None,
) -> Sequence[Expression]:
    entity = get_entity(key)
    if not entity.required_time_column:
        return old_top_level

    # If there is an = or IN condition on time, we don't need to do any of this
    match = build_match(
        entity.required_time_column, [ConditionFunctions.EQ], datetime, alias
    )
    if any(match.match(cond) for cond in old_top_level):
        return old_top_level

    lower, upper = get_time_range_expressions(
        old_top_level, entity.required_time_column, alias
    )
    if not lower:
        raise ParsingException(
            f"missing >= condition on column {entity.required_time_column} for entity {key.value}"
        )
    elif not upper:
        raise ParsingException(
            f"missing < condition on column {entity.required_time_column} for entity {key.value}"
        )

    from_date, from_exp = lower
    to_date, to_exp = upper

    from_date = from_date - timedelta(
        seconds=(from_date - from_date.min).seconds % date_align
    )
    to_date = to_date - timedelta(seconds=(to_date - to_date.min).seconds % date_align)
    if from_date > to_date:
        raise ParsingException(f"invalid time conditions on entity {key.value}")

    if max_days is not None and (to_date - from_date).days > max_days:
        from_date = to_date - timedelta(days=max_days)

    def replace_cond(exp: Expression) -> Expression:
        if not isinstance(exp, FunctionCall):
            return exp
        elif exp == from_exp:
            return replace(
                exp, parameters=(from_exp.parameters[0], Literal(None, from_date)),
            )
        elif exp == to_exp:
            return replace(
                exp, parameters=(to_exp.parameters[0], Literal(None, to_date))
            )
        return exp

    return list(map(replace_cond, old_top_level))
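# Worked example (illustrative, not from the source): with date_align=3600 and a parsed
# range of 10:17:45 .. 12:05:30, both bounds are snapped down to the hour, giving
# 10:00:00 .. 12:00:00. If max_days=1 and the aligned range spans three days, from_date
# is clamped to to_date - timedelta(days=1) before the conditions are rewritten.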
def test_invalid_conditions() -> None:
    entity = get_entity(EntityKey.EVENTS)
    is_null = [["group_id", "IS NULL", "I am not valid"]]
    with pytest.raises(Exception):
        parse_conditions_to_expr(is_null, entity, set())

    binary = [["group_id", "=", None]]
    with pytest.raises(Exception):
        parse_conditions_to_expr(binary, entity, set())
def _get_entity_watermark_mode(
    self, entity_key: EntityKey
) -> SchedulingWatermarkMode:
    storage = get_entity(entity_key).get_writable_storage()
    assert storage is not None, "Entity does not have a writable storage"
    stream_loader = storage.get_table_writer().get_stream_loader()
    mode = stream_loader.get_subscription_scheduler_mode()
    assert mode is not None, "Entity is not subscriptable"
    return mode
def test_time_split_ast() -> None:
    """
    Test that the time split transforms the query properly both on the old
    representation and on the AST representation.
    """
    found_timestamps = []

    def do_query(
        query: ClickhouseQuery, request_settings: RequestSettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert from_date_ast is not None and isinstance(from_date_ast, datetime)
        assert to_date_ast is not None and isinstance(to_date_ast, datetime)
        found_timestamps.append((from_date_ast.isoformat(), to_date_ast.isoformat()))
        return QueryResult({"data": []}, {})

    body = {
        "selected_columns": [
            "event_id",
            "level",
            "logger",
            "server_name",
            "transaction",
            "timestamp",
            "project_id",
        ],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            ("project_id", "IN", [1]),
        ],
        "limit": 10,
        "orderby": ["-timestamp"],
    }

    query = parse_query(body, get_dataset("events"))
    entity = get_entity(query.get_from_clause().key)
    settings = HTTPRequestSettings()
    for p in entity.get_query_processors():
        p.process_query(query, settings)

    clickhouse_query = identity_translate(query)
    splitter = TimeSplitQueryStrategy("timestamp")
    splitter.execute(clickhouse_query, settings, do_query)

    assert found_timestamps == [
        ("2019-09-19T11:00:00", "2019-09-19T12:00:00"),
        ("2019-09-19T01:00:00", "2019-09-19T11:00:00"),
        ("2019-09-18T10:00:00", "2019-09-19T01:00:00"),
    ]
def build_execution_pipeline(
    self, request: Request, runner: QueryRunner
) -> QueryExecutionPipeline:
    if isinstance(request.query, Query):
        entity = get_entity(request.query.get_from_clause().key)
        return entity.get_query_pipeline_builder().build_execution_pipeline(
            request, runner
        )
    else:
        return CompositeExecutionPipeline(
            request.query, request.query_settings, runner
        )
def setup_method(self, test_method):
    super().setup_method(test_method)
    self.app.post = partial(self.app.post, headers={"referer": "test"})
    self.event = get_raw_event()
    self.project_id = self.event["project_id"]
    self.base_time = datetime.utcnow().replace(
        second=0, microsecond=0, tzinfo=pytz.utc
    ) - timedelta(minutes=90)
    self.next_time = self.base_time + timedelta(minutes=95)

    self.events_storage = get_entity(EntityKey.EVENTS).get_writable_storage()
    write_unprocessed_events(self.events_storage, [self.event])

    groups = [
        {
            "offset": 0,
            "project_id": self.project_id,
            "id": self.event["group_id"],
            "record_deleted": 0,
            "status": 0,
        }
    ]

    groups_storage = get_entity(EntityKey.GROUPEDMESSAGES).get_writable_storage()
    groups_storage.get_table_writer().get_batch_writer(
        metrics=DummyMetricsBackend(strict=True)
    ).write([json.dumps(group).encode("utf-8") for group in groups])

    assignees = [
        {
            "offset": 0,
            "project_id": self.project_id,
            "group_id": self.event["group_id"],
            "record_deleted": 0,
            "user_id": 100,
        }
    ]

    assignees_storage = get_entity(EntityKey.GROUPASSIGNEE).get_writable_storage()
    assignees_storage.get_table_writer().get_batch_writer(
        metrics=DummyMetricsBackend(strict=True)
    ).write([json.dumps(assignee).encode("utf-8") for assignee in assignees])
def visit_entity_single(
    self,
    node: Node,
    visited_children: Tuple[
        Any, Any, EntityKey, Union[Optional[float], Node], Any, Any
    ],
) -> QueryEntity:
    _, _, name, sample, _, _ = visited_children
    if isinstance(sample, Node):
        sample = None

    return QueryEntity(name, get_entity(name).get_data_model(), sample)
def _visit_simple_query(
    self, data_source: ProcessableQuery[Entity]
) -> CompositeDataSourcePlan:
    assert isinstance(
        data_source, LogicalQuery
    ), f"Only subqueries are allowed at query planning stage. {type(data_source)} found."

    return CompositeDataSourcePlan.from_simple_query_plan(
        get_entity(data_source.get_from_clause().key)
        .get_query_pipeline_builder()
        .build_planner(data_source, self.__settings)
        .build_best_plan()
    )
def test_aggregation_parsing(aggregation: Any, expected_function: FunctionCall) -> None:
    entity = get_entity(EntityKey.EVENTS)
    function = parse_aggregation(
        aggregation[0],
        aggregation[1],
        aggregation[2],
        entity.get_data_model(),
        set(),
    )
    assert function == expected_function, expected_function
def execute_entity_processors(query: LogicalQuery, settings: RequestSettings) -> None:
    """
    Executes the entity query processors for the query. These are taken
    from the entity.
    """
    entity = get_entity(query.get_from_clause().key)

    for processor in entity.get_query_processors():
        with sentry_sdk.start_span(
            description=type(processor).__name__, op="processor"
        ):
            processor.process_query(query, settings)
def parse_and_process(query_body: MutableMapping[str, Any]) -> ClickhouseQuery:
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request = Request("a", query, HTTPRequestSettings(), {}, "r")
    entity = get_entity(query.get_from_clause().key)

    for p in entity.get_query_processors():
        p.process_query(query, request.settings)

    plan = entity.get_query_plan_builder().build_plan(request)
    ArrayJoinKeyValueOptimizer("tags").process_query(plan.query, request.settings)
    return plan.query
def visit_individual_node(
    self, node: IndividualNode[Entity]
) -> Mapping[str, Sequence[ClickhouseQueryPlan]]:
    assert isinstance(
        node.data_source, LogicalQuery
    ), "Invalid composite query. All nodes must be subqueries."

    plans = (
        get_entity(node.data_source.get_from_clause().key)
        .get_query_pipeline_builder()
        .build_planner(node.data_source, self.__settings)
        .build_and_rank_plans()
    )

    return {node.alias: plans}
def parse_query(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    """
    Parses the query body generating the AST. This only takes into
    account the initial query body. Extensions are parsed by extension
    processors and are supposed to update the AST.

    Parsing includes two phases. The first transforms the JSON body into
    a minimal query object, resolving expressions, conditions, etc.
    The second phase performs some query processing to provide a sane
    query to the dataset specific section.
    - It prevents alias shadowing.
    - It transforms columns from the tags[asd] form into
      SubscriptableReference.
    - It applies aliases to all columns that do not have one and that do
      not represent a reference to an existing alias.
      During query processing a column can be transformed into a different
      expression. It is essential to preserve the original column name so
      that the result set still has a column with the name provided by the
      user, no matter which transformation was applied.
      By applying aliases at this stage every processor just needs to
      preserve them to guarantee the correctness of the query.
    - It expands all the references to aliases by inlining the expression,
      to make aliasing transparent to all query processing phases.
      References to aliases are reintroduced at the end of query
      processing.
    """
    # TODO: Parse the entity out of the query body and select the correct one from the dataset
    entity = dataset.get_default_entity()

    query = _parse_query_impl(body, entity)
    # TODO: These should support composite queries.
    _validate_empty_table_names(query)
    _validate_aliases(query)
    _parse_subscriptables(query)
    _apply_column_aliases(query)
    _expand_aliases(query)
    # WARNING: These steps above assume table resolution did not happen
    # yet. If it is put earlier than here (unlikely), we need to adapt them.
    _deescape_aliases(query)
    _mangle_aliases(query)
    _validate_arrayjoin(query)

    # XXX: Select the entity to be used for the query. This step is temporary. Eventually
    # entity selection will be moved to Sentry and specified for all SnQL queries.
    selected_entity = dataset.select_entity(query)
    query_entity = QueryEntity(
        selected_entity, get_entity(selected_entity).get_data_model()
    )
    query.set_from_clause(query_entity)

    validate_query(query)

    return query
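# A minimal usage sketch, assuming the imports used by the tests in this collection
# (get_dataset, QueryEntity). The body below is illustrative only; parse_query turns a
# legacy JSON query into a logical Query whose FROM clause has been resolved to a
# QueryEntity through get_entity().
body = {
    "selected_columns": ["event_id", "project_id"],
    "conditions": [
        ("timestamp", ">=", "2019-09-18T10:00:00"),
        ("timestamp", "<", "2019-09-19T12:00:00"),
        ("project_id", "IN", [1]),
    ],
}
query = parse_query(body, get_dataset("events"))
assert isinstance(query.get_from_clause(), QueryEntity)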
def _validate_required_conditions(
    query: Union[CompositeQuery[QueryEntity], LogicalQuery],
) -> None:
    if isinstance(query, LogicalQuery):
        entity = get_entity(query.get_from_clause().key)
        if not entity.validate_required_conditions(query):
            raise ParsingException(
                f"{query.get_from_clause().key} is missing required conditions"
            )
    else:
        from_clause = query.get_from_clause()
        if isinstance(from_clause, (LogicalQuery, CompositeQuery)):
            return _validate_required_conditions(from_clause)
        assert isinstance(from_clause, JoinClause)  # mypy
        alias_map = from_clause.get_alias_node_map()
        for alias, node in alias_map.items():
            assert isinstance(node.data_source, QueryEntity)  # mypy
            entity = get_entity(node.data_source.key)
            if not entity.validate_required_conditions(query, alias):
                raise ParsingException(
                    f"{node.data_source.key} is missing required conditions"
                )
def add_conditions(
    self,
    timestamp: datetime,
    offset: Optional[int],
    query: Union[CompositeQuery[Entity], Query],
) -> None:
    # TODO: Support composite queries with multiple entities.
    from_clause = query.get_from_clause()
    if not isinstance(from_clause, Entity):
        raise InvalidSubscriptionError("Only simple queries are supported")
    entity = get_entity(from_clause.key)

    required_timestamp_column = entity.required_time_column
    if required_timestamp_column is None:
        raise InvalidSubscriptionError(
            "Entity must have a timestamp column for subscriptions"
        )

    conditions_to_add: List[Expression] = [
        binary_condition(
            ConditionFunctions.EQ,
            Column(None, None, "project_id"),
            Literal(None, self.project_id),
        ),
        binary_condition(
            ConditionFunctions.GTE,
            Column(None, None, required_timestamp_column),
            Literal(None, (timestamp - self.time_window)),
        ),
        binary_condition(
            ConditionFunctions.LT,
            Column(None, None, required_timestamp_column),
            Literal(None, timestamp),
        ),
    ]

    if offset is not None:
        conditions_to_add.append(
            binary_condition(
                ConditionFunctions.LTE,
                FunctionCall(
                    None, "ifnull", (Column(None, None, "offset"), Literal(None, 0))
                ),
                Literal(None, offset),
            )
        )

    new_condition = combine_and_conditions(conditions_to_add)
    condition = query.get_condition()
    if condition:
        new_condition = binary_condition(
            BooleanFunctions.AND, condition, new_condition
        )

    query.set_ast_condition(new_condition)
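# Roughly, for a subscription with project_id=1, time_window=60s and offset=5, the
# conditions appended above are equivalent to (illustrative rendering, not generated
# output):
#   project_id = 1
#   AND timestamp >= {timestamp - 60s} AND timestamp < {timestamp}
#   AND ifnull(offset, 0) <= 5
# all ANDed onto any condition the query already had.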
def validate_subscription(
    self, query: Union[CompositeQuery[Entity], Query]
) -> None:
    # TODO: Support composite queries with multiple entities.
    from_clause = query.get_from_clause()
    if not isinstance(from_clause, Entity):
        raise InvalidSubscriptionError("Only simple queries are supported")
    entity = get_entity(from_clause.key)

    SubscriptionAllowedClausesValidator().validate(query)

    if entity.required_time_column:
        NoTimeBasedConditionValidator(entity.required_time_column).validate(query)