def visit_join_clause(self, node: JoinClause[Table]) -> JoinClause[Table]: # TODO: Skip this optimization if the left side of the query # runs aggregations that imply the cardinality of the right # side is important. join_cols = set() for condition in node.keys: if condition.left.table_alias == node.right_node.alias: join_cols.add(condition.left.column) elif condition.right.table_alias == node.right_node.alias: join_cols.add(condition.right.column) for c in self.__referenced_columns: if c.table_name == node.right_node.alias and c.column_name not in join_cols: return node return JoinClause( left_node=node.left_node.accept(self), right_node=node.right_node, keys=node.keys, join_type=node.join_type, join_modifier=JoinModifier.ANY if node.join_type == JoinType.INNER else JoinModifier.SEMI, )
def visit_join_clause(self, node: JoinClause[Entity]) -> JoinDataSourcePlan: left_node = node.left_node.accept(self) right_node = self.visit_individual_node(node.right_node) # TODO: Actually return multiple plans for each subquery (one per storage # set) and rank them picking a combination that fits in a single storage # set or valid storage set combination. assert is_valid_storage_set_combination( left_node.storage_set_key, right_node.storage_set_key ), f"Incompatible storage sets found in plan: {left_node.storage_set_key} {right_node.storage_set_key}" # mypy does not know that the method above only produces individual nodes. assert isinstance(right_node.translated_source, IndividualNode) return JoinDataSourcePlan( translated_source=JoinClause( left_node=left_node.translated_source, right_node=right_node.translated_source, keys=node.keys, join_type=node.join_type, join_modifier=node.join_modifier, ), processors={ **left_node.processors, **right_node.processors }, storage_set_key=left_node.storage_set_key, )
def visit_join_clause(self, node: JoinClause[Entity]) -> JoinClause[Entity]: """ This tweaks the names of the columns in the ON clause as they cannot reference the entity fields directly (as in the original query) but they have to reference the mangled aliases generated by SubqueriesInitializer. This is needed because Clickhouse does not know about the names of the SelectedExpression nodes in the AST. It is only aware of aliases as a way to reference expressions from the external query to the subqueries. """ left = node.left_node.accept(self) right = self.visit_individual_node(node.right_node) keys = [ JoinCondition( left=JoinConditionExpression(k.left.table_alias, aliasify_column(k.left.column)), right=JoinConditionExpression(k.right.table_alias, aliasify_column(k.right.column)), ) for k in node.keys ] return JoinClause(left, right, keys, node.join_type, node.join_modifier)
def test_complex_joins() -> None: e = Entity(key=EntityKey.EVENTS, schema=ERRORS_SCHEMA) node_err = IndividualNode(alias="err", data_source=e) g = Entity(key=EntityKey.GROUPEDMESSAGES, schema=GROUPS_SCHEMA) node_group = IndividualNode(alias="groups", data_source=g) a = Entity(key=EntityKey.GROUPASSIGNEE, schema=GROUPS_ASSIGNEE) query = Query( from_clause=a, selected_columns=[ SelectedExpression("id", Column("id", None, "id")), SelectedExpression("assigned_user", Column("assigned_user", None, "user")), ], ) node_query = IndividualNode(alias="assignee", data_source=query) join = JoinClause( left_node=JoinClause( left_node=node_err, right_node=node_group, keys=[ JoinCondition( left=JoinConditionExpression("err", "group_id"), right=JoinConditionExpression("groups", "id"), ) ], join_type=JoinType.INNER, ), right_node=node_query, keys=[ JoinCondition( left=JoinConditionExpression("err", "group_id"), right=JoinConditionExpression("assignee", "id"), ) ], join_type=JoinType.INNER, ) assert join.get_column_sets() == { "err": ERRORS_SCHEMA, "assignee": ColumnSet([("id", Any()), ("assigned_user", Any())]), "groups": GROUPS_SCHEMA, }
def test_simple_join() -> None: e = Entity(key=EntityKey.EVENTS, schema=ERRORS_SCHEMA) node_err = IndividualNode(alias="err", data_source=e) g = Entity(key=EntityKey.GROUPEDMESSAGES, schema=GROUPS_SCHEMA) node_group = IndividualNode(alias="groups", data_source=g) join = JoinClause( left_node=node_err, right_node=node_group, keys=[ JoinCondition( left=JoinConditionExpression("err", "group_id"), right=JoinConditionExpression("groups", "id"), ) ], join_type=JoinType.INNER, join_modifier=JoinModifier.SEMI, ) assert join.get_column_sets() == { "err": ERRORS_SCHEMA, "groups": GROUPS_SCHEMA } joined_cols = join.get_columns() assert "err.group_id" in joined_cols assert "err.event_id" in joined_cols assert "groups.id" in joined_cols assert "groups.message" in joined_cols with pytest.raises(AssertionError): JoinClause( left_node=node_err, right_node=node_group, keys=[ JoinCondition( left=JoinConditionExpression("err", "missing_col"), right=JoinConditionExpression("groups", "another_missing_col"), ) ], join_type=JoinType.INNER, )
def events_groups_join( left: IndividualNode[TNode], right: IndividualNode[TNode], ) -> JoinClause[TNode]: return JoinClause( left_node=left, right_node=right, keys=[ JoinCondition( left=JoinConditionExpression("ev", "_snuba_group_id"), right=JoinConditionExpression("gr", "_snuba_id"), ) ], join_type=JoinType.INNER, )
def join_clause(lhs_alias: str, lhs: Union[str, JoinClause[QueryEntity]], rhs: str) -> JoinClause[QueryEntity]: rhs_alias, rhs = rhs.split(":", 1) return JoinClause( left_node=node(lhs_alias, lhs) if isinstance(lhs, str) else lhs, right_node=node(rhs_alias, rhs), keys=[ JoinCondition( left=JoinConditionExpression(lhs_alias, "event_id"), right=JoinConditionExpression(rhs_alias, "event_id"), ) ], join_type=JoinType.INNER, )
def build_join_clause_loop( node_list: Node, lhs: Optional[Union[IndividualNode[QueryEntity], JoinClause[QueryEntity]]], ) -> Union[IndividualNode[QueryEntity], JoinClause[QueryEntity]]: rhs = node_list.entity_data if lhs is None: lhs = rhs else: assert node_list.relationship is not None # mypy lhs = JoinClause( left_node=lhs, right_node=rhs, keys=node_list.join_conditions, join_type=node_list.relationship.join_type, ) if node_list.child is None: return lhs return build_join_clause_loop(node_list.child, lhs)
def test_join_query() -> None: events_query = LogicalQuery( Entity( EntityKey.EVENTS, ColumnSet([("event_id", String()), ("group_id", UInt(32))]), ), selected_columns=[ SelectedExpression("group_id", Column("group_id", None, "group_id")), SelectedExpression("string_evt_id", Column("string_evt_id", None, "event_id")), ], ) groups_query = LogicalQuery( Entity( EntityKey.GROUPEDMESSAGES, ColumnSet([("id", UInt(32)), ("message", String())]), ), selected_columns=[ SelectedExpression("group_id", Column("group_id", None, "id")) ], ) join_query = CompositeQuery(from_clause=JoinClause( left_node=IndividualNode("e", events_query), right_node=IndividualNode("g", groups_query), keys=[ JoinCondition( left=JoinConditionExpression("e", "group_id"), right=JoinConditionExpression("g", "group_id"), ) ], join_type=JoinType.INNER, )) data_source = join_query.get_from_clause() assert "e.string_evt_id" in data_source.get_columns() assert "g.group_id" in data_source.get_columns()
), ], ), id="Query with a subquery", ), pytest.param( CompositeQuery( from_clause=JoinClause( left_node=IndividualNode( alias="err", data_source=events_ent, ), right_node=IndividualNode( alias="groups", data_source=groups_ent, ), keys=[ JoinCondition( left=JoinConditionExpression("err", "group_id"), right=JoinConditionExpression("groups", "id"), ) ], join_type=JoinType.INNER, ), selected_columns=[ SelectedExpression( "f_release", FunctionCall( "f_release", "f", (Column(None, "err", "release"), ),
AND t.project_id=1 AND t.finish_ts>=toDateTime('2021-01-01T00:30:00') AND t.finish_ts<toDateTime('2021-01-07T00:30:00')""", CompositeQuery( from_clause=JoinClause( left_node=IndividualNode( "e", QueryEntity( EntityKey.EVENTS, get_entity(EntityKey.EVENTS).get_data_model(), ), ), right_node=IndividualNode( "t", QueryEntity( EntityKey.TRANSACTIONS, get_entity(EntityKey.TRANSACTIONS).get_data_model(), ), ), keys=[ JoinCondition( JoinConditionExpression("e", "event_id"), JoinConditionExpression("t", "event_id"), ) ], join_type=JoinType.INNER, ), selected_columns=[ SelectedExpression( "4-5", FunctionCall(None, "minus",
"SELECT column1, (avg(column2) AS sub_average), column3 " "FROM my_table " "WHERE eq((column3 AS al), 'blabla') " "GROUP BY column2" ") " "GROUP BY alias"), id="Composite query", ), pytest.param( CompositeQuery( from_clause=JoinClause( left_node=node_err, right_node=node_group, keys=[ JoinCondition( left=JoinConditionExpression("err", "group_id"), right=JoinConditionExpression("groups", "id"), ) ], join_type=JoinType.INNER, ), selected_columns=[ SelectedExpression("error_id", Column("error_id", "err", "event_id")), SelectedExpression("message", Column("message", "groups", "message")), ], condition=binary_condition("eq", Column(None, "groups", "id"), Literal(None, 1)), ), [
GROUPS_SCHEMA, Events, GroupAssignee, GroupedMessage, ) TEST_CASES = [ pytest.param( JoinClause( IndividualNode("ev", EntitySource(EntityKey.EVENTS, EVENTS_SCHEMA, None)), IndividualNode( "gr", EntitySource(EntityKey.GROUPEDMESSAGES, GROUPS_SCHEMA, None)), [ JoinCondition( JoinConditionExpression("ev", "group_id"), JoinConditionExpression("gr", "id"), ) ], JoinType.INNER, None, ), { QualifiedCol(EntityKey.EVENTS, "group_id"): { QualifiedCol(EntityKey.GROUPEDMESSAGES, "id"), }, QualifiedCol(EntityKey.GROUPEDMESSAGES, "id"): { QualifiedCol(EntityKey.EVENTS, "group_id"), }, QualifiedCol(EntityKey.EVENTS, "project_id"): {
) assert condition.transform( partial(_replace_col, "ev", "project_id", "gr", "project_id")) == binary_condition( ConditionFunctions.EQ, Column(None, "gr", "project_id"), Literal(None, 1)) ENTITY_GROUP_JOIN = JoinClause( IndividualNode("ev", EntitySource(EntityKey.EVENTS, EVENTS_SCHEMA, None)), IndividualNode( "gr", EntitySource(EntityKey.GROUPEDMESSAGES, GROUPS_SCHEMA, None)), [ JoinCondition( JoinConditionExpression("ev", "group_id"), JoinConditionExpression("gr", "id"), ) ], JoinType.INNER, None, ) TEST_REPLACEMENT = [ pytest.param( binary_condition(ConditionFunctions.EQ, Column(None, "ev", "event_id"), Literal(None, 1)), ENTITY_GROUP_JOIN, binary_condition(ConditionFunctions.EQ, Column(None, "ev", "event_id"), Literal(None, 1)), id="No condition to add",
from snuba.query.formatters.tracing import TExpression, format_query from snuba.query.logical import Query as LogicalQuery from tests.query.joins.equivalence_schema import ( EVENTS_SCHEMA, GROUPS_SCHEMA, ) BASIC_JOIN = JoinClause( left_node=IndividualNode( alias="ev", data_source=Entity(EntityKey.EVENTS, EVENTS_SCHEMA, None), ), right_node=IndividualNode( alias="gr", data_source=Entity(EntityKey.GROUPEDMESSAGES, GROUPS_SCHEMA, None), ), keys=[ JoinCondition( left=JoinConditionExpression("ev", "group_id"), right=JoinConditionExpression("gr", "id"), ) ], join_type=JoinType.INNER, ) LOGICAL_QUERY = LogicalQuery( from_clause=Entity(EntityKey.EVENTS, EVENTS_SCHEMA, 0.5), selected_columns=[ SelectedExpression("c1", Column("_snuba_c1", "t", "c")), SelectedExpression( "f1", FunctionCall("_snuba_f1", "f", (Column(None, "t", "c2"), ))),
) from tests.query.joins.join_structures import ( events_groups_join, events_node, groups_node, ) BASIC_JOIN = JoinClause( left_node=IndividualNode( alias="ev", data_source=Entity(EntityKey.EVENTS, EVENTS_SCHEMA, None), ), right_node=IndividualNode( alias="gr", data_source=Entity(EntityKey.GROUPEDMESSAGES, GROUPS_SCHEMA, None), ), keys=[ JoinCondition( left=JoinConditionExpression("ev", "group_id"), right=JoinConditionExpression("gr", "id"), ) ], join_type=JoinType.INNER, ) TEST_CASES = [ pytest.param( CompositeQuery( from_clause=BASIC_JOIN, selected_columns=[], ),
), 3, {"errors_local"}, True, None, id="Nested query. Count the inner query", ), pytest.param( CompositeQuery( from_clause=JoinClause( left_node=IndividualNode(alias="err", data_source=SIMPLE_QUERY), right_node=IndividualNode( alias="groups", data_source=Table("groups_local", GROUPS_SCHEMA) ), keys=[ JoinCondition( left=JoinConditionExpression("err", "group_id"), right=JoinConditionExpression("groups", "id"), ) ], join_type=JoinType.INNER, ), selected_columns=[ SelectedExpression( "event_id", FunctionCall("alias", "something", (Column(None, "err", "alias"),)), ), SelectedExpression( "group_id", Column("group_id", "groups", "group_id"), ),
from_clause=JoinClause( left_node=events_groups_join( clickhouse_events_node([ SelectedExpression( "_snuba_group_id", Column("_snuba_group_id", None, "group_id"), ), ]), clickhouse_groups_node([ SelectedExpression("_snuba_id", Column("_snuba_id", None, "id")), SelectedExpression( "_snuba_col1", Column("_snuba_col1", None, "something")), ]), ), right_node=clickhouse_assignees_node([ SelectedExpression( "_snuba_group_id", Column("_snuba_group_id", None, "group_id"), ), SelectedExpression( "_snuba_col1", Column("_snuba_col1", None, "something")), ]), keys=[ JoinCondition( left=JoinConditionExpression("ev", "_snuba_group_id"), right=JoinConditionExpression("as", "_snuba_group_id"), ) ], join_type=JoinType.INNER, ),
id="Special array join functions", ), pytest.param( "MATCH (e: events) -[contains]-> (t: transactions) SELECT 4-5, e.c", CompositeQuery( from_clause=JoinClause( left_node=IndividualNode( "e", QueryEntity( EntityKey.EVENTS, get_entity(EntityKey.EVENTS).get_data_model(), ), ), right_node=IndividualNode( "t", QueryEntity( EntityKey.TRANSACTIONS, get_entity(EntityKey.TRANSACTIONS).get_data_model(), ), ), keys=[ JoinCondition( JoinConditionExpression("e", "event_id"), JoinConditionExpression("t", "event_id"), ) ], join_type=JoinType.INNER, ), selected_columns=[ SelectedExpression( "4-5", FunctionCall(None, "minus", (Literal(None, 4), Literal(None, 5))), ),