def test_nested_aggregate_legacy_format(self, dataset):
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    priority = [
        "toUInt64(plus(multiply(log(times_seen), 600), last_seen))",
        "",
        "priority",
    ]
    assert (
        column_expr(
            dataset,
            "",
            Query({"aggregations": [priority]}, source),
            ParsingContext(),
            priority[2],
            priority[0],
        )
        == "(toUInt64(plus(multiply(log(times_seen), 600), last_seen)) AS priority)"
    )

    top_k = ["topK(3)", "logger", "top_3"]
    assert (
        column_expr(
            dataset,
            top_k[1],
            Query({"aggregations": [top_k]}, source),
            ParsingContext(),
            top_k[2],
            top_k[0],
        )
        == "(topK(3)(logger) AS top_3)"
    )
def test_conditions_expr():
    dataset = get_dataset("groups")
    state.set_config('use_escape_alias', 1)
    conditions = [['events.a', '=', 1]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == '(events.a AS `events.a`) = 1'

    conditions = [[['events.a', '=', 1], ['groups.b', '=', 2]], [['events.c', '=', 3], ['groups.d', '=', 4]]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == ('((events.a AS `events.a`) = 1 OR (groups.b AS `groups.b`) = 2)'
            ' AND ((events.c AS `events.c`) = 3 OR (groups.d AS `groups.d`) = 4)')

    # Test column expansion
    conditions = [[['events.tags[foo]', '=', 1], ['groups.b', '=', 2]]]
    expanded = column_expr(dataset, 'events.tags[foo]', Query({}), ParsingContext())
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == '({} = 1 OR (groups.b AS `groups.b`) = 2)'.format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_query = Query({})
    parsing_context = ParsingContext()
    conditions = [[['events.tags[foo]', '=', 1], ['groups.b', '=', 2]]]
    column_expr(dataset, 'events.tags[foo]', reuse_query, parsing_context)  # Expand it once so the next time is aliased
    assert conditions_expr(dataset, conditions, reuse_query, parsing_context) \
        == '(`events.tags[foo]` = 1 OR (groups.b AS `groups.b`) = 2)'

    # Test special output format of LIKE
    conditions = [['events.primary_hash', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == '(events.primary_hash AS `events.primary_hash`) LIKE \'%foo%\''

    conditions = tuplify([[['notEmpty', ['arrayElement', ['events.exception_stacks.type', 1]]], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty(arrayElement((events.exception_stacks.type AS `events.exception_stacks.type`), 1)) = 1'

    conditions = tuplify([[['notEmpty', ['events.tags[sentry:user]']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty(`events.tags[sentry:user]`) = 1'

    conditions = tuplify([[['notEmpty', ['events.tags_key']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty((arrayJoin(events.tags.key) AS `events.tags_key`)) = 1'

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [['events.exception_frames.filename', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'arrayExists(x -> assumeNotNull(x LIKE \'%foo%\'), (events.exception_frames.filename AS `events.exception_frames.filename`))'
def test_aliases() -> None:
    # No context
    col1 = Column("al1", "column1", "table1")
    col2 = Column("al1", "column1", "table1")
    assert col1.accept(ClickhouseExpressionFormatter()) == "(table1.column1 AS al1)"
    assert col2.accept(ClickhouseExpressionFormatter()) == "(table1.column1 AS al1)"

    # With Context
    pc = ParsingContext()
    assert col1.accept(ClickhouseExpressionFormatter(pc)) == "(table1.column1 AS al1)"
    assert col2.accept(ClickhouseExpressionFormatter(pc)) == "al1"

    # Hierarchical expression inherits parsing context and applies aliases
    f = FunctionCall(
        None,
        "f1",
        (
            FunctionCall("tag[something]", "tag", (Column(None, "column1", "table1"),)),
            FunctionCall("tag[something]", "tag", (Column(None, "column1", "table1"),)),
            FunctionCall("tag[something]", "tag", (Column(None, "column1", "table1"),)),
        ),
    )
    expected = "f1((tag(table1.column1) AS `tag[something]`), `tag[something]`, `tag[something]`)"
    assert f.accept(ClickhouseExpressionFormatter()) == expected
def _format_query_content(
    query: FormattableQuery,
    expression_formatter_type: Type[ExpressionFormatterBase],
) -> Sequence[FormattedNode]:
    """
    Produces the content of the formatted query.

    It works for both the composite query and the simple one as the
    only difference is the presence of the prewhere condition.
    Should we have more differences going on we should break this
    method into smaller ones.
    """
    parsing_context = ParsingContext()
    formatter = expression_formatter_type(parsing_context)

    return [
        v
        for v in [
            _format_select(query, formatter),
            PaddingNode(
                "FROM",
                DataSourceFormatter(expression_formatter_type).visit(
                    query.get_from_clause()
                ),
            ),
            _format_arrayjoin(query, formatter),
            _build_optional_string_node("PREWHERE", query.get_prewhere_ast(), formatter)
            if isinstance(query, Query)
            else None,
            _build_optional_string_node("WHERE", query.get_condition(), formatter),
            _format_groupby(query, formatter),
            _build_optional_string_node("HAVING", query.get_having(), formatter),
            _format_orderby(query, formatter),
            _format_limitby(query, formatter),
            _format_limit(query, formatter),
        ]
        if v is not None
    ]
def test_nested_aggregate_legacy_format(self, dataset):
    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    priority = [
        'toUInt64(plus(multiply(log(times_seen), 600), last_seen))',
        '',
        'priority',
    ]
    assert column_expr(
        dataset,
        '',
        Query({'aggregations': [priority]}, source),
        ParsingContext(),
        priority[2],
        priority[0],
    ) == '(toUInt64(plus(multiply(log(times_seen), 600), last_seen)) AS priority)'

    top_k = ['topK(3)', 'logger', 'top_3']
    assert column_expr(
        dataset,
        top_k[1],
        Query({'aggregations': [top_k]}, source),
        ParsingContext(),
        top_k[2],
        top_k[0],
    ) == '(topK(3)(logger) AS top_3)'
def test_order_by():
    dataset = get_dataset("groups")
    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    body = {}
    query = Query(body, source)
    assert (
        column_expr(dataset, "-events.event_id", deepcopy(query), ParsingContext())
        == "-(events.event_id AS `events.event_id`)"
    )

    context = ParsingContext()
    context.add_alias("`events.event_id`")
    assert (
        column_expr(dataset, "-events.event_id", deepcopy(query), context)
        == "-`events.event_id`"
    )
def alias_expr(expr: str, alias: str, parsing_context: ParsingContext) -> str:
    """
    Return the correct expression to use in the final SQL. Keeps a cache
    of the previously created expressions and aliases, so it knows when
    it can subsequently replace a redundant expression with an alias.

    1. If the expression and alias are equal, just return that.
    2. Otherwise, if the expression is new, add its alias to the cache
       so it can be reused later, and return `expr AS alias`.
    3. If the expression has been aliased before, return the alias.
    """
    if expr == alias:
        return expr
    elif parsing_context.is_alias_present(alias):
        return alias
    else:
        parsing_context.add_alias(alias)
        return "({} AS {})".format(expr, alias)
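# A minimal usage sketch (not part of the codebase) illustrating the three
# branches of alias_expr above; it assumes only the ParsingContext methods
# already used by that function (is_alias_present / add_alias).
def _alias_expr_demo() -> None:
    ctx = ParsingContext()
    # 1. Expression and alias are equal: returned unchanged.
    assert alias_expr("col1", "col1", ctx) == "col1"
    # 2. First occurrence: the alias is cached and `expr AS alias` is emitted.
    assert alias_expr("f(col1)", "al", ctx) == "(f(col1) AS al)"
    # 3. Subsequent occurrences: the cached alias is substituted.
    assert alias_expr("f(col1)", "al", ctx) == "al"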
def test_apdex_expression(self, dataset):
    body = {"aggregations": [["apdex(duration, 300)", "", "apdex_score"]]}
    parsing_context = ParsingContext()
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    exprs = [
        column_expr(dataset, col, Query(body, source), parsing_context, alias, agg)
        for (agg, col, alias) in body["aggregations"]
    ]
    assert exprs == [
        "((countIf(duration <= 300) + (countIf((duration > 300) AND (duration <= 1200)) / 2)) / count() AS apdex_score)"
    ]
def test_impact_expression(self, dataset):
    body = {
        "aggregations": [["impact(duration, 300, user)", "", "impact_score"]]
    }
    parsing_context = ParsingContext()
    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    exprs = [
        column_expr(dataset, col, Query(body, source), parsing_context, alias, agg)
        for (agg, col, alias) in body["aggregations"]
    ]
    assert exprs == [
        "((1 - (countIf(duration <= 300) + (countIf((duration > 300) AND (duration <= 1200)) / 2)) / count()) + ((1 - (1 / sqrt(uniq(user)))) * 3) AS impact_score)"
    ]
def test_duplicate_expression_alias(self, dataset):
    body = {
        'aggregations': [
            ['top3', 'logger', 'dupe_alias'],
            ['uniq', 'environment', 'dupe_alias'],
        ]
    }
    parsing_context = ParsingContext()
    # In the case where 2 different expressions are aliased
    # to the same thing, one ends up overwriting the other.
    # This may not be ideal as it may mask bugs in query conditions
    exprs = [
        column_expr(dataset, col, Query(body), parsing_context, alias, agg)
        for (agg, col, alias) in body['aggregations']
    ]
    assert exprs == ['(topK(3)(logger) AS dupe_alias)', 'dupe_alias']
def test_complex_conditions_expr(self, dataset):
    query = Query({})

    assert complex_column_expr(dataset, tuplify(['count', []]), deepcopy(query), ParsingContext()) == 'count()'
    assert complex_column_expr(dataset, tuplify(['notEmpty', ['foo']]), deepcopy(query), ParsingContext()) == 'notEmpty(foo)'
    assert complex_column_expr(dataset, tuplify(['notEmpty', ['arrayElement', ['foo', 1]]]), deepcopy(query), ParsingContext()) == 'notEmpty(arrayElement(foo, 1))'
    assert complex_column_expr(dataset, tuplify(['foo', ['bar', ['qux'], 'baz']]), deepcopy(query), ParsingContext()) == 'foo(bar(qux), baz)'
    assert complex_column_expr(dataset, tuplify(['foo', [], 'a']), deepcopy(query), ParsingContext()) == '(foo() AS a)'
    assert complex_column_expr(dataset, tuplify(['foo', ['b', 'c'], 'd']), deepcopy(query), ParsingContext()) == '(foo(b, c) AS d)'
    assert complex_column_expr(dataset, tuplify(['foo', ['b', 'c', ['d']]]), deepcopy(query), ParsingContext()) == 'foo(b, c(d))'

    assert complex_column_expr(dataset, tuplify(['top3', ['project_id']]), deepcopy(query), ParsingContext()) == 'topK(3)(project_id)'
    assert complex_column_expr(dataset, tuplify(['top10', ['project_id'], 'baz']), deepcopy(query), ParsingContext()) == '(topK(10)(project_id) AS baz)'

    assert complex_column_expr(dataset, tuplify(['emptyIfNull', ['project_id']]), deepcopy(query), ParsingContext()) == 'ifNull(project_id, \'\')'
    assert complex_column_expr(dataset, tuplify(['emptyIfNull', ['project_id'], 'foo']), deepcopy(query), ParsingContext()) == '(ifNull(project_id, \'\') AS foo)'

    assert complex_column_expr(dataset, tuplify(['or', ['a', 'b']]), deepcopy(query), ParsingContext()) == 'or(a, b)'
    assert complex_column_expr(dataset, tuplify(['and', ['a', 'b']]), deepcopy(query), ParsingContext()) == 'and(a, b)'
    assert complex_column_expr(dataset, tuplify(['or', [['or', ['a', 'b']], 'c']]), deepcopy(query), ParsingContext()) == 'or(or(a, b), c)'
    assert complex_column_expr(dataset, tuplify(['and', [['and', ['a', 'b']], 'c']]), deepcopy(query), ParsingContext()) == 'and(and(a, b), c)'
    # (A OR B) AND C
    assert complex_column_expr(dataset, tuplify(['and', [['or', ['a', 'b']], 'c']]), deepcopy(query), ParsingContext()) == 'and(or(a, b), c)'
    # (A AND B) OR C
    assert complex_column_expr(dataset, tuplify(['or', [['and', ['a', 'b']], 'c']]), deepcopy(query), ParsingContext()) == 'or(and(a, b), c)'
    # A OR B OR C OR D
    assert complex_column_expr(dataset, tuplify(['or', [['or', [['or', ['c', 'd']], 'b']], 'a']]), deepcopy(query), ParsingContext()) == 'or(or(or(c, d), b), a)'

    assert complex_column_expr(
        dataset,
        tuplify(['if', [['in', ['release', 'tuple', ["'foo'"]]], 'release', "'other'"], 'release']),
        deepcopy(query),
        ParsingContext(),
    ) == "(if(in(release, tuple('foo')), release, 'other') AS release)"
    assert complex_column_expr(
        dataset,
        tuplify(['if', ['in', ['release', 'tuple', ["'foo'"]], 'release', "'other'"], 'release']),
        deepcopy(query),
        ParsingContext(),
    ) == "(if(in(release, tuple('foo')), release, 'other') AS release)"

    # TODO once search_message is filled in everywhere, this can be just 'message' again.
    message_expr = '(coalesce(search_message, message) AS message)'
    assert complex_column_expr(dataset, tuplify(['positionCaseInsensitive', ['message', "'lol 'single' quotes'"]]), deepcopy(query), ParsingContext()) \
        == "positionCaseInsensitive({message_expr}, 'lol \\'single\\' quotes')".format(**locals())

    # dangerous characters are allowed but escaped in literals and column names
    assert complex_column_expr(dataset, tuplify(['safe', ['fo`o', "'ba'r'"]]), deepcopy(query), ParsingContext()) == r"safe(`fo\`o`, 'ba\'r')"

    # Dangerous characters not allowed in functions
    with pytest.raises(AssertionError):
        complex_column_expr(dataset, tuplify([r"dang'erous", ['message', '`']]), deepcopy(query), ParsingContext())

    # Or nested functions
    with pytest.raises(AssertionError):
        complex_column_expr(dataset, tuplify([r"safe", ['dang`erous', ['message']]]), deepcopy(query), ParsingContext())
def test_alias_in_alias(self):
    source = self.dataset.get_dataset_schemas().get_read_schema().get_data_source()
    query = Query({"groupby": ["tags_key", "tags_value"]}, source)
    context = ParsingContext()
    assert column_expr(self.dataset, "tags_key", query, context) == (
        "(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) "
        "AS all_tags))[1] AS tags_key)"
    )

    # If we want to use `tags_key` again, make sure we use the
    # already-created alias verbatim
    assert column_expr(self.dataset, "tags_key", query, context) == "tags_key"

    # If we also want to use `tags_value`, make sure that we use
    # the `all_tags` alias instead of re-expanding the tags arrayJoin
    assert (
        column_expr(self.dataset, "tags_value", query, context)
        == "((all_tags)[2] AS tags_value)"
    )
def test_duplicate_expression_alias(self, dataset):
    body = {
        "aggregations": [
            ["top3", "logger", "dupe_alias"],
            ["uniq", "environment", "dupe_alias"],
        ]
    }
    parsing_context = ParsingContext()
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    # In the case where 2 different expressions are aliased
    # to the same thing, one ends up overwriting the other.
    # This may not be ideal as it may mask bugs in query conditions
    exprs = [
        column_expr(dataset, col, Query(body, source), parsing_context, alias, agg)
        for (agg, col, alias) in body["aggregations"]
    ]
    assert exprs == ["(topK(3)(logger) AS dupe_alias)", "dupe_alias"]
def test_alias_in_alias():
    state.set_config('use_escape_alias', 1)
    dataset = get_dataset("groups")
    body = {'groupby': ['events.tags_key', 'events.tags_value']}
    query = Query(body)
    parsing_context = ParsingContext()
    assert column_expr(dataset, 'events.tags_key', query, parsing_context) == (
        '(((arrayJoin(arrayMap((x,y) -> [x,y], events.tags.key, events.tags.value)) '
        'AS all_tags))[1] AS `events.tags_key`)'
    )

    # If we want to use `tags_key` again, make sure we use the
    # already-created alias verbatim
    assert column_expr(dataset, 'events.tags_key', query, parsing_context) == '`events.tags_key`'

    # If we also want to use `tags_value`, make sure that we use
    # the `all_tags` alias instead of re-expanding the tags arrayJoin
    assert column_expr(dataset, 'events.tags_value', query, parsing_context) == (
        '((all_tags)[2] AS `events.tags_value`)'
    )
def test_duplicate_expression_alias():
    dataset = get_dataset("groups")
    state.set_config('use_escape_alias', 1)
    body = {
        'aggregations': [
            ['top3', 'events.logger', 'dupe_alias'],
            ['uniq', 'events.environment', 'dupe_alias'],
        ]
    }
    query = Query(body)
    # In the case where 2 different expressions are aliased
    # to the same thing, one ends up overwriting the other.
    # This may not be ideal as it may mask bugs in query conditions
    parsing_context = ParsingContext()
    exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in body['aggregations']
    ]
    assert exprs == ['(topK(3)(events.logger) AS dupe_alias)', 'dupe_alias']
def test_duplicate_expression_alias():
    dataset = get_dataset("groups")
    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    state.set_config("use_escape_alias", 1)
    body = {
        "aggregations": [
            ["top3", "events.logger", "dupe_alias"],
            ["uniq", "events.environment", "dupe_alias"],
        ]
    }
    query = Query(body, source)
    # In the case where 2 different expressions are aliased
    # to the same thing, one ends up overwriting the other.
    # This may not be ideal as it may mask bugs in query conditions
    parsing_context = ParsingContext()
    exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in body["aggregations"]
    ]
    assert exprs == ["(topK(3)(events.logger) AS dupe_alias)", "dupe_alias"]
def test_alias_in_alias():
    state.set_config("use_escape_alias", 1)
    dataset = get_dataset("groups")
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    body = {"groupby": ["events.tags_key", "events.tags_value"]}
    query = Query(body, source)
    parsing_context = ParsingContext()
    assert column_expr(dataset, "events.tags_key", query, parsing_context) == (
        "(((arrayJoin(arrayMap((x,y) -> [x,y], events.tags.key, events.tags.value)) "
        "AS all_tags))[1] AS `events.tags_key`)"
    )

    # If we want to use `tags_key` again, make sure we use the
    # already-created alias verbatim
    assert (
        column_expr(dataset, "events.tags_key", query, parsing_context)
        == "`events.tags_key`"
    )

    # If we also want to use `tags_value`, make sure that we use
    # the `all_tags` alias instead of re-expanding the tags arrayJoin
    assert (
        column_expr(dataset, "events.tags_value", query, parsing_context)
        == "((all_tags)[2] AS `events.tags_value`)"
    )
def test_complex_conditions_expr(self, dataset):
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    query = Query({}, source)

    assert (
        complex_column_expr(dataset, tuplify(["count", []]), deepcopy(query), ParsingContext())
        == "count()"
    )
    assert (
        complex_column_expr(dataset, tuplify(["notEmpty", ["foo"]]), deepcopy(query), ParsingContext())
        == "notEmpty(foo)"
    )
    assert (
        complex_column_expr(dataset, tuplify(["notEmpty", ["arrayElement", ["foo", 1]]]), deepcopy(query), ParsingContext())
        == "notEmpty(arrayElement(foo, 1))"
    )
    assert (
        complex_column_expr(dataset, tuplify(["foo", ["bar", ["qux"], "baz"]]), deepcopy(query), ParsingContext())
        == "foo(bar(qux), baz)"
    )
    assert (
        complex_column_expr(dataset, tuplify(["foo", [], "a"]), deepcopy(query), ParsingContext())
        == "(foo() AS a)"
    )
    assert (
        complex_column_expr(dataset, tuplify(["foo", ["b", "c"], "d"]), deepcopy(query), ParsingContext())
        == "(foo(b, c) AS d)"
    )
    assert (
        complex_column_expr(dataset, tuplify(["foo", ["b", "c", ["d"]]]), deepcopy(query), ParsingContext())
        == "foo(b, c(d))"
    )
    assert (
        complex_column_expr(dataset, tuplify(["top3", ["project_id"]]), deepcopy(query), ParsingContext())
        == "topK(3)(project_id)"
    )
    assert (
        complex_column_expr(dataset, tuplify(["top10", ["project_id"], "baz"]), deepcopy(query), ParsingContext())
        == "(topK(10)(project_id) AS baz)"
    )
    assert (
        complex_column_expr(dataset, tuplify(["emptyIfNull", ["project_id"]]), deepcopy(query), ParsingContext())
        == "ifNull(project_id, '')"
    )
    assert (
        complex_column_expr(dataset, tuplify(["emptyIfNull", ["project_id"], "foo"]), deepcopy(query), ParsingContext())
        == "(ifNull(project_id, '') AS foo)"
    )
    assert (
        complex_column_expr(dataset, tuplify(["or", ["a", "b"]]), deepcopy(query), ParsingContext())
        == "or(a, b)"
    )
    assert (
        complex_column_expr(dataset, tuplify(["and", ["a", "b"]]), deepcopy(query), ParsingContext())
        == "and(a, b)"
    )
    assert (
        complex_column_expr(dataset, tuplify(["or", [["or", ["a", "b"]], "c"]]), deepcopy(query), ParsingContext())
        == "or(or(a, b), c)"
    )
    assert (
        complex_column_expr(dataset, tuplify(["and", [["and", ["a", "b"]], "c"]]), deepcopy(query), ParsingContext())
        == "and(and(a, b), c)"
    )
    # (A OR B) AND C
    assert (
        complex_column_expr(dataset, tuplify(["and", [["or", ["a", "b"]], "c"]]), deepcopy(query), ParsingContext())
        == "and(or(a, b), c)"
    )
    # (A AND B) OR C
    assert (
        complex_column_expr(dataset, tuplify(["or", [["and", ["a", "b"]], "c"]]), deepcopy(query), ParsingContext())
        == "or(and(a, b), c)"
    )
    # A OR B OR C OR D
    assert (
        complex_column_expr(dataset, tuplify(["or", [["or", [["or", ["c", "d"]], "b"]], "a"]]), deepcopy(query), ParsingContext())
        == "or(or(or(c, d), b), a)"
    )
    assert (
        complex_column_expr(
            dataset,
            tuplify(
                [
                    "if",
                    [["in", ["release", "tuple", ["'foo'"]]], "release", "'other'"],
                    "release",
                ]
            ),
            deepcopy(query),
            ParsingContext(),
        )
        == "(if(in(release, tuple('foo')), release, 'other') AS release)"
    )
    assert (
        complex_column_expr(
            dataset,
            tuplify(
                [
                    "if",
                    ["in", ["release", "tuple", ["'foo'"]], "release", "'other'"],
                    "release",
                ]
            ),
            deepcopy(query),
            ParsingContext(),
        )
        == "(if(in(release, tuple('foo')), release, 'other') AS release)"
    )

    # TODO once search_message is filled in everywhere, this can be just 'message' again.
    message_expr = "(coalesce(search_message, message) AS message)"
    assert complex_column_expr(
        dataset,
        tuplify(["positionCaseInsensitive", ["message", "'lol 'single' quotes'"]]),
        deepcopy(query),
        ParsingContext(),
    ) == "positionCaseInsensitive({message_expr}, 'lol \\'single\\' quotes')".format(**locals())

    # dangerous characters are allowed but escaped in literals and column names
    assert (
        complex_column_expr(dataset, tuplify(["safe", ["fo`o", "'ba'r'"]]), deepcopy(query), ParsingContext())
        == r"safe(`fo\`o`, 'ba\'r')"
    )

    # Dangerous characters not allowed in functions
    with pytest.raises(AssertionError):
        complex_column_expr(
            dataset,
            tuplify([r"dang'erous", ["message", "`"]]),
            deepcopy(query),
            ParsingContext(),
        )

    # Or nested functions
    with pytest.raises(AssertionError):
        complex_column_expr(
            dataset,
            tuplify([r"safe", ["dang`erous", ["message"]]]),
            deepcopy(query),
            ParsingContext(),
        )
def test_conditions_expr(self, dataset):
    state.set_config("use_escape_alias", 1)
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    conditions = [["a", "=", 1]]
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "a = 1"

    conditions = []
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == ""

    conditions = [[[]], []]
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == ""

    conditions = [[["a", "=", 1]]]
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "a = 1"

    conditions = [["a", "=", 1], ["b", "=", 2]]
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "a = 1 AND b = 2"

    conditions = [[["a", "=", 1], ["b", "=", 2]]]
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "(a = 1 OR b = 2)"

    conditions = [[["a", "=", 1], ["b", "=", 2]], ["c", "=", 3]]
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "(a = 1 OR b = 2) AND c = 3"

    conditions = [[["a", "=", 1], ["b", "=", 2]], [["c", "=", 3], ["d", "=", 4]]]
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "(a = 1 OR b = 2) AND (c = 3 OR d = 4)"

    # Malformed condition input
    conditions = [[["a", "=", 1], []]]
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "a = 1"

    # Test column expansion
    conditions = [[["tags[foo]", "=", 1], ["b", "=", 2]]]
    expanded = column_expr(dataset, "tags[foo]", Query({}, source), ParsingContext())
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "({} = 1 OR b = 2)".format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_query = Query({}, source)
    parsing_context = ParsingContext()
    conditions = [[["tags[foo]", "=", 1], ["b", "=", 2]]]
    column_expr(dataset, "tags[foo]", reuse_query, parsing_context)  # Expand it once so the next time is aliased
    assert conditions_expr(dataset, conditions, reuse_query, parsing_context) == "(`tags[foo]` = 1 OR b = 2)"

    # Test special output format of LIKE
    conditions = [["primary_hash", "LIKE", "%foo%"]]
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "primary_hash LIKE '%foo%'"

    conditions = tuplify([[["notEmpty", ["arrayElement", ["exception_stacks.type", 1]]], "=", 1]])
    assert (
        conditions_expr(dataset, conditions, Query({}, source), ParsingContext())
        == "notEmpty(arrayElement((exception_stacks.type AS `exception_stacks.type`), 1)) = 1"
    )

    conditions = tuplify([[["notEmpty", ["tags[sentry:user]"]], "=", 1]])
    assert (
        conditions_expr(dataset, conditions, Query({}, source), ParsingContext())
        == "notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 1"
    )

    conditions = tuplify([[["notEmpty", ["tags_key"]], "=", 1]])
    assert (
        conditions_expr(
            dataset,
            conditions,
            Query({"conditions": [[["notEmpty", ["tags_key"]], "=", 1]]}, source),
            ParsingContext(),
        )
        == "notEmpty((arrayJoin(tags.key) AS tags_key)) = 1"
    )

    conditions = tuplify(
        [
            [
                [["notEmpty", ["tags[sentry:environment]"]], "=", "dev"],
                [["notEmpty", ["tags[sentry:environment]"]], "=", "prod"],
            ],
            [
                [["notEmpty", ["tags[sentry:user]"]], "=", "joe"],
                [["notEmpty", ["tags[sentry:user]"]], "=", "bob"],
            ],
        ]
    )
    assert (
        conditions_expr(dataset, conditions, Query({}, source), ParsingContext())
        == """(notEmpty((tags.value[indexOf(tags.key, 'sentry:environment')] AS `tags[sentry:environment]`)) = 'dev' OR notEmpty(`tags[sentry:environment]`) = 'prod') AND (notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 'joe' OR notEmpty(`tags[sentry:user]`) = 'bob')"""
    )

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [["exception_frames.filename", "LIKE", "%foo%"]]
    assert (
        conditions_expr(dataset, conditions, Query({}, source), ParsingContext())
        == "arrayExists(x -> assumeNotNull(x LIKE '%foo%'), (exception_frames.filename AS `exception_frames.filename`))"
    )

    # Test negative scalar condition on array column is expanded as an all() type iterator.
    conditions = [["exception_frames.filename", "NOT LIKE", "%foo%"]]
    assert (
        conditions_expr(dataset, conditions, Query({}, source), ParsingContext())
        == "arrayAll(x -> assumeNotNull(x NOT LIKE '%foo%'), (exception_frames.filename AS `exception_frames.filename`))"
    )

    # Test that a duplicate IN condition is deduplicated even if
    # the lists are in different orders.
    conditions = tuplify([["platform", "IN", ["a", "b", "c"]], ["platform", "IN", ["c", "b", "a"]]])
    assert conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "platform IN ('a', 'b', 'c')"
def test_simple_column_expr():
    dataset = get_dataset("groups")
    state.set_config('use_escape_alias', 1)
    body = {'granularity': 86400}
    query = Query(body)
    assert column_expr(dataset, "events.event_id", deepcopy(query), ParsingContext()) \
        == "(events.event_id AS `events.event_id`)"
    assert column_expr(dataset, "groups.id", deepcopy(query), ParsingContext()) \
        == "(groups.id AS `groups.id`)"
    assert column_expr(dataset, "events.event_id", deepcopy(query), ParsingContext(), "MyVerboseAlias") \
        == "(events.event_id AS MyVerboseAlias)"

    # Single tag expression
    assert column_expr(dataset, 'events.tags[foo]', deepcopy(query), ParsingContext()) \
        == "(events.tags.value[indexOf(events.tags.key, 'foo')] AS `events.tags[foo]`)"

    # Promoted tag expression / no translation
    assert column_expr(dataset, 'events.tags[server_name]', deepcopy(query), ParsingContext()) \
        == "(events.server_name AS `events.tags[server_name]`)"

    # All tag keys expression
    assert column_expr(dataset, 'events.tags_key', deepcopy(query), ParsingContext()) \
        == '(arrayJoin(events.tags.key) AS `events.tags_key`)'

    # If we are going to use both tags_key and tags_value, expand both
    tag_group_body = {'groupby': ['events.tags_key', 'events.tags_value']}
    parsing_context = ParsingContext()
    assert column_expr(dataset, 'events.tags_key', Query(tag_group_body), parsing_context) == (
        '(((arrayJoin(arrayMap((x,y) -> [x,y], events.tags.key, events.tags.value)) '
        'AS all_tags))[1] AS `events.tags_key`)'
    )

    assert column_expr(dataset, 'events.time', deepcopy(query), ParsingContext()) \
        == "(toDate(events.timestamp) AS `events.time`)"

    assert column_expr(dataset, 'events.col', deepcopy(query), ParsingContext(), aggregate='sum') \
        == "(sum(events.col) AS `events.col`)"
    assert column_expr(dataset, 'events.col', deepcopy(query), ParsingContext(), alias='summation', aggregate='sum') \
        == "(sum(events.col) AS summation)"
    assert column_expr(dataset, '', deepcopy(query), ParsingContext(), alias='aggregate', aggregate='count()') \
        == "(count() AS aggregate)"

    # Columns that need escaping
    assert column_expr(dataset, 'events.sentry:release', deepcopy(query), ParsingContext()) == '`events.sentry:release`'

    # A 'column' that is actually a string literal
    assert column_expr(dataset, '\'hello world\'', deepcopy(query), ParsingContext()) == '\'hello world\''

    # Complex expressions (function calls) involving both string and column arguments
    assert column_expr(dataset, tuplify(['concat', ['a', '\':\'', 'b']]), deepcopy(query), ParsingContext()) == 'concat(a, \':\', b)'

    group_id_body = deepcopy(query)
    assert column_expr(dataset, 'events.issue', group_id_body, ParsingContext()) \
        == '(nullIf(events.group_id, 0) AS `events.issue`)'

    # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where a number was expected.
    assert column_expr(
        dataset,
        'events.tags[environment]',
        deepcopy(query),
        ParsingContext(),
        alias='unique_envs',
        aggregate='uniq',
    ) == "(ifNull(uniq(events.environment), 0) AS unique_envs)"
def __init__(
    self,
    dataset: Dataset,
    query: Query,
    settings: RequestSettings,
) -> None:
    parsing_context = ParsingContext()

    aggregate_exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        column_expr(dataset, gb, query, parsing_context) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        column_expr(dataset, util.tuplify(colname), query, parsing_context)
        for colname in column_names
    ]
    select_clause = u"SELECT {}".format(
        ", ".join(group_exprs + aggregate_exprs + selected_cols)
    )

    from_clause = u"FROM {}".format(query.get_data_source().format_from())
    if query.get_final():
        from_clause = u"{} FINAL".format(from_clause)

    if not query.get_data_source().supports_sample():
        sample_rate = None
    else:
        if query.get_sample():
            sample_rate = query.get_sample()
        elif settings.get_turbo():
            sample_rate = snuba_settings.TURBO_SAMPLE_RATE
        else:
            sample_rate = None
    if sample_rate:
        from_clause = u"{} SAMPLE {}".format(from_clause, sample_rate)

    join_clause = ""
    if query.get_arrayjoin():
        join_clause = u"ARRAY JOIN {}".format(query.get_arrayjoin())

    where_clause = ""
    if query.get_conditions():
        where_clause = u"WHERE {}".format(
            conditions_expr(dataset, query.get_conditions(), query, parsing_context)
        )

    prewhere_clause = ""
    if query.get_prewhere():
        prewhere_clause = u"PREWHERE {}".format(
            conditions_expr(dataset, query.get_prewhere(), query, parsing_context)
        )

    group_clause = ""
    if groupby:
        group_clause = "GROUP BY ({})".format(
            ", ".join(
                column_expr(dataset, gb, query, parsing_context) for gb in groupby
            )
        )
        if query.has_totals():
            group_clause = "{} WITH TOTALS".format(group_clause)

    having_clause = ""
    having_conditions = query.get_having()
    if having_conditions:
        assert groupby, "found HAVING clause with no GROUP BY"
        having_clause = u"HAVING {}".format(
            conditions_expr(dataset, having_conditions, query, parsing_context)
        )

    order_clause = ""
    if query.get_orderby():
        orderby = [
            column_expr(dataset, util.tuplify(ob), query, parsing_context)
            for ob in util.to_list(query.get_orderby())
        ]
        orderby = [
            u"{} {}".format(ob.lstrip("-"), "DESC" if ob.startswith("-") else "ASC")
            for ob in orderby
        ]
        order_clause = u"ORDER BY {}".format(", ".join(orderby))

    limitby_clause = ""
    if query.get_limitby() is not None:
        limitby_clause = "LIMIT {} BY {}".format(*query.get_limitby())

    limit_clause = ""
    if query.get_limit() is not None:
        limit_clause = "LIMIT {}, {}".format(query.get_offset(), query.get_limit())

    self.__formatted_query = " ".join(
        [
            c
            for c in [
                select_clause,
                from_clause,
                join_clause,
                prewhere_clause,
                where_clause,
                group_clause,
                having_clause,
                order_clause,
                limitby_clause,
                limit_clause,
            ]
            if c
        ]
    )
def test_column_expr(self):
    source = (
        self.dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    query = Query({"granularity": 86400}, source)

    # Single tag expression
    assert (
        column_expr(self.dataset, "tags[foo]", deepcopy(query), ParsingContext())
        == "(tags.value[indexOf(tags.key, 'foo')] AS `tags[foo]`)"
    )

    # Promoted tag expression / no translation
    assert (
        column_expr(self.dataset, "tags[server_name]", deepcopy(query), ParsingContext())
        == "(server_name AS `tags[server_name]`)"
    )

    # Promoted tag expression / with translation
    assert (
        column_expr(self.dataset, "tags[app.device]", deepcopy(query), ParsingContext())
        == "(app_device AS `tags[app.device]`)"
    )

    # Promoted context expression / with translation
    assert (
        column_expr(
            self.dataset,
            "contexts[device.battery_level]",
            deepcopy(query),
            ParsingContext(),
        )
        == "(toString(device_battery_level) AS `contexts[device.battery_level]`)"
    )

    # All tag keys expression
    q = Query({"granularity": 86400, "selected_columns": ["tags_key"]}, source)
    assert column_expr(self.dataset, "tags_key", q, ParsingContext()) == (
        "(arrayJoin(tags.key) AS tags_key)"
    )

    # If we are going to use both tags_key and tags_value, expand both
    tag_group_body = {"groupby": ["tags_key", "tags_value"]}
    assert column_expr(
        self.dataset, "tags_key", Query(tag_group_body, source), ParsingContext()
    ) == (
        "(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) "
        "AS all_tags))[1] AS tags_key)"
    )

    assert (
        column_expr(self.dataset, "time", deepcopy(query), ParsingContext())
        == "(toDate(timestamp) AS time)"
    )
    assert (
        column_expr(self.dataset, "rtime", deepcopy(query), ParsingContext())
        == "(toDate(received) AS rtime)"
    )
    assert (
        column_expr(self.dataset, "col", deepcopy(query), ParsingContext(), aggregate="sum")
        == "(sum(col) AS col)"
    )
    assert (
        column_expr(
            self.dataset,
            "col",
            deepcopy(query),
            ParsingContext(),
            alias="summation",
            aggregate="sum",
        )
        == "(sum(col) AS summation)"
    )

    # Special cases where count() doesn't need a column
    assert (
        column_expr(
            self.dataset,
            "",
            deepcopy(query),
            ParsingContext(),
            alias="count",
            aggregate="count()",
        )
        == "(count() AS count)"
    )
    assert (
        column_expr(
            self.dataset,
            "",
            deepcopy(query),
            ParsingContext(),
            alias="aggregate",
            aggregate="count()",
        )
        == "(count() AS aggregate)"
    )

    # Columns that need escaping
    assert (
        column_expr(self.dataset, "sentry:release", deepcopy(query), ParsingContext())
        == "`sentry:release`"
    )

    # A 'column' that is actually a string literal
    assert (
        column_expr(self.dataset, "'hello world'", deepcopy(query), ParsingContext())
        == "'hello world'"
    )

    # Complex expressions (function calls) involving both string and column arguments
    assert (
        column_expr(
            self.dataset,
            tuplify(["concat", ["a", "':'", "b"]]),
            deepcopy(query),
            ParsingContext(),
        )
        == "concat(a, ':', b)"
    )

    group_id_query = deepcopy(query)
    assert (
        column_expr(self.dataset, "group_id", group_id_query, ParsingContext())
        == "(nullIf(group_id, 0) AS group_id)"
    )

    # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where a number was expected.
    assert (
        column_expr(
            self.dataset,
            "tags[environment]",
            deepcopy(query),
            ParsingContext(),
            alias="unique_envs",
            aggregate="uniq",
        )
        == "(ifNull(uniq(environment), 0) AS unique_envs)"
    )
def test_conditions_expr():
    dataset = get_dataset("groups")
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    state.set_config("use_escape_alias", 1)
    conditions = [["events.a", "=", 1]]
    query = Query({}, source)
    assert (
        conditions_expr(dataset, conditions, deepcopy(query), ParsingContext())
        == "(events.a AS `events.a`) = 1"
    )

    conditions = [
        [["events.a", "=", 1], ["groups.b", "=", 2]],
        [["events.c", "=", 3], ["groups.d", "=", 4]],
    ]
    assert conditions_expr(dataset, conditions, deepcopy(query), ParsingContext()) == (
        "((events.a AS `events.a`) = 1 OR (groups.b AS `groups.b`) = 2)"
        " AND ((events.c AS `events.c`) = 3 OR (groups.d AS `groups.d`) = 4)"
    )

    # Test column expansion
    conditions = [[["events.tags[foo]", "=", 1], ["groups.b", "=", 2]]]
    expanded = column_expr(dataset, "events.tags[foo]", deepcopy(query), ParsingContext())
    assert conditions_expr(
        dataset, conditions, deepcopy(query), ParsingContext()
    ) == "({} = 1 OR (groups.b AS `groups.b`) = 2)".format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_query = deepcopy(query)
    parsing_context = ParsingContext()
    conditions = [[["events.tags[foo]", "=", 1], ["groups.b", "=", 2]]]
    column_expr(dataset, "events.tags[foo]", reuse_query, parsing_context)  # Expand it once so the next time is aliased
    assert (
        conditions_expr(dataset, conditions, reuse_query, parsing_context)
        == "(`events.tags[foo]` = 1 OR (groups.b AS `groups.b`) = 2)"
    )

    # Test special output format of LIKE
    conditions = [["events.primary_hash", "LIKE", "%foo%"]]
    assert (
        conditions_expr(dataset, conditions, deepcopy(query), ParsingContext())
        == "(events.primary_hash AS `events.primary_hash`) LIKE '%foo%'"
    )

    conditions = tuplify(
        [[["notEmpty", ["arrayElement", ["events.exception_stacks.type", 1]]], "=", 1]]
    )
    assert (
        conditions_expr(dataset, conditions, deepcopy(query), ParsingContext())
        == "notEmpty(arrayElement((events.exception_stacks.type AS `events.exception_stacks.type`), 1)) = 1"
    )

    conditions = tuplify([[["notEmpty", ["events.tags[sentry:user]"]], "=", 1]])
    assert (
        conditions_expr(dataset, conditions, deepcopy(query), ParsingContext())
        == "notEmpty(`events.tags[sentry:user]`) = 1"
    )

    conditions = tuplify([[["notEmpty", ["events.tags_key"]], "=", 1]])
    q = Query({"selected_columns": ["events.tags_key"]}, source)
    assert (
        conditions_expr(dataset, conditions, q, ParsingContext())
        == "notEmpty((arrayJoin(events.tags.key) AS `events.tags_key`)) = 1"
    )

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [["events.exception_frames.filename", "LIKE", "%foo%"]]
    assert (
        conditions_expr(dataset, conditions, deepcopy(query), ParsingContext())
        == "arrayExists(x -> assumeNotNull(x LIKE '%foo%'), (events.exception_frames.filename AS `events.exception_frames.filename`))"
    )
def test_simple_column_expr():
    dataset = get_dataset("groups")
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    body = {"granularity": 86400}
    query = Query(body, source)

    assert (
        column_expr(dataset, "events.event_id", deepcopy(query), ParsingContext())
        == "(events.event_id AS `events.event_id`)"
    )
    assert (
        column_expr(dataset, "groups.id", deepcopy(query), ParsingContext())
        == "(groups.id AS `groups.id`)"
    )
    assert (
        column_expr(
            dataset,
            "events.event_id",
            deepcopy(query),
            ParsingContext(),
            "MyVerboseAlias",
        )
        == "(events.event_id AS MyVerboseAlias)"
    )

    # Single tag expression
    assert (
        column_expr(dataset, "events.tags[foo]", deepcopy(query), ParsingContext())
        == "(events.tags.value[indexOf(events.tags.key, 'foo')] AS `events.tags[foo]`)"
    )

    # Promoted tag expression / no translation
    assert (
        column_expr(dataset, "events.tags[server_name]", deepcopy(query), ParsingContext())
        == "(events.server_name AS `events.tags[server_name]`)"
    )

    # All tag keys expression
    q = Query({"selected_columns": ["events.tags_key"]}, source)
    assert column_expr(dataset, "events.tags_key", q, ParsingContext()) == (
        "(arrayJoin(events.tags.key) AS `events.tags_key`)"
    )

    # If we are going to use both tags_key and tags_value, expand both
    tag_group_body = {"groupby": ["events.tags_key", "events.tags_value"]}
    parsing_context = ParsingContext()
    assert column_expr(
        dataset, "events.tags_key", Query(tag_group_body, source), parsing_context
    ) == (
        "(((arrayJoin(arrayMap((x,y) -> [x,y], events.tags.key, events.tags.value)) "
        "AS all_tags))[1] AS `events.tags_key`)"
    )

    assert (
        column_expr(dataset, "events.time", deepcopy(query), ParsingContext())
        == "(toDate(events.timestamp) AS `events.time`)"
    )
    assert (
        column_expr(dataset, "events.col", deepcopy(query), ParsingContext(), aggregate="sum")
        == "(sum(events.col) AS `events.col`)"
    )
    assert (
        column_expr(
            dataset,
            "events.col",
            deepcopy(query),
            ParsingContext(),
            alias="summation",
            aggregate="sum",
        )
        == "(sum(events.col) AS summation)"
    )
    assert (
        column_expr(
            dataset,
            "",
            deepcopy(query),
            ParsingContext(),
            alias="aggregate",
            aggregate="count()",
        )
        == "(count() AS aggregate)"
    )

    # Columns that need escaping
    assert (
        column_expr(dataset, "events.sentry:release", deepcopy(query), ParsingContext())
        == "`events.sentry:release`"
    )

    # A 'column' that is actually a string literal
    assert (
        column_expr(dataset, "'hello world'", deepcopy(query), ParsingContext())
        == "'hello world'"
    )

    # Complex expressions (function calls) involving both string and column arguments
    assert (
        column_expr(
            dataset,
            tuplify(["concat", ["a", "':'", "b"]]),
            deepcopy(query),
            ParsingContext(),
        )
        == "concat(a, ':', b)"
    )

    group_id_body = deepcopy(query)
    assert (
        column_expr(dataset, "events.group_id", group_id_body, ParsingContext())
        == "(nullIf(events.group_id, 0) AS `events.group_id`)"
    )

    # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where a number was expected.
    assert (
        column_expr(
            dataset,
            "events.tags[environment]",
            deepcopy(query),
            ParsingContext(),
            alias="unique_envs",
            aggregate="uniq",
        )
        == "(ifNull(uniq(events.environment), 0) AS unique_envs)"
    )
def test_order_by(self):
    """
    Order by in Snuba is represented as -COL_NAME when ordering DESC.
    Since the column is provided with the `-` character in front when
    reaching the column_expr call, this can introduce a ton of corner
    cases depending on whether the column is aliased, whether it gets
    processed into something else, or whether it is escaped.
    This test is supposed to cover those cases.
    """
    source = (
        self.dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    query = Query({}, source)

    # Columns that start with a negative sign (used in orderby to signify
    # sort order) retain the '-' sign outside the escaping backticks (if any)
    assert (
        column_expr(self.dataset, "-timestamp", deepcopy(query), ParsingContext())
        == "-timestamp"
    )
    assert (
        column_expr(self.dataset, "-sentry:release", deepcopy(query), ParsingContext())
        == "-`sentry:release`"
    )

    context = ParsingContext()
    context.add_alias("al1")
    assert (
        column_expr(self.dataset, "-timestamp", deepcopy(query), context, "al1")
        == "-al1"
    )
    assert (
        column_expr(self.dataset, "-timestamp", deepcopy(query), ParsingContext(), "al1")
        == "-(timestamp AS al1)"
    )

    assert (
        column_expr(self.dataset, "-exception_stacks.type", deepcopy(query), ParsingContext())
        == "-(exception_stacks.type AS `exception_stacks.type`)"
    )
    context = ParsingContext()
    context.add_alias("`exception_stacks.type`")
    assert (
        column_expr(self.dataset, "-exception_stacks.type", deepcopy(query), context)
        == "-`exception_stacks.type`"
    )
def test_column_expr(self):
    source = self.dataset.get_dataset_schemas().get_read_schema().get_data_source()
    query = Query({'granularity': 86400}, source)

    # Single tag expression
    assert column_expr(self.dataset, 'tags[foo]', deepcopy(query), ParsingContext()) \
        == "(tags.value[indexOf(tags.key, 'foo')] AS `tags[foo]`)"

    # Promoted tag expression / no translation
    assert column_expr(self.dataset, 'tags[server_name]', deepcopy(query), ParsingContext()) \
        == "(server_name AS `tags[server_name]`)"

    # Promoted tag expression / with translation
    assert column_expr(self.dataset, 'tags[app.device]', deepcopy(query), ParsingContext()) \
        == "(app_device AS `tags[app.device]`)"

    # All tag keys expression
    assert column_expr(self.dataset, 'tags_key', deepcopy(query), ParsingContext()) \
        == '(arrayJoin(tags.key) AS tags_key)'

    # If we are going to use both tags_key and tags_value, expand both
    tag_group_body = {'groupby': ['tags_key', 'tags_value']}
    assert column_expr(self.dataset, 'tags_key', Query(tag_group_body, source), ParsingContext()) == (
        '(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) '
        'AS all_tags))[1] AS tags_key)'
    )

    assert column_expr(self.dataset, 'time', deepcopy(query), ParsingContext()) \
        == "(toDate(timestamp) AS time)"
    assert column_expr(self.dataset, 'rtime', deepcopy(query), ParsingContext()) \
        == "(toDate(received) AS rtime)"
    assert column_expr(self.dataset, 'col', deepcopy(query), ParsingContext(), aggregate='sum') \
        == "(sum(col) AS col)"
    assert column_expr(self.dataset, 'col', deepcopy(query), ParsingContext(), alias='summation', aggregate='sum') \
        == "(sum(col) AS summation)"

    # Special cases where count() doesn't need a column
    assert column_expr(self.dataset, '', deepcopy(query), ParsingContext(), alias='count', aggregate='count()') \
        == "(count() AS count)"
    assert column_expr(self.dataset, '', deepcopy(query), ParsingContext(), alias='aggregate', aggregate='count()') \
        == "(count() AS aggregate)"

    # Columns that need escaping
    assert column_expr(self.dataset, 'sentry:release', deepcopy(query), ParsingContext()) == '`sentry:release`'

    # Columns that start with a negative sign (used in orderby to signify
    # sort order) retain the '-' sign outside the escaping backticks (if any)
    assert column_expr(self.dataset, '-timestamp', deepcopy(query), ParsingContext()) == '-timestamp'
    assert column_expr(self.dataset, '-sentry:release', deepcopy(query), ParsingContext()) == '-`sentry:release`'

    # A 'column' that is actually a string literal
    assert column_expr(self.dataset, '\'hello world\'', deepcopy(query), ParsingContext()) == '\'hello world\''

    # Complex expressions (function calls) involving both string and column arguments
    assert column_expr(self.dataset, tuplify(['concat', ['a', '\':\'', 'b']]), deepcopy(query), ParsingContext()) == 'concat(a, \':\', b)'

    group_id_query = deepcopy(query)
    assert column_expr(self.dataset, 'issue', group_id_query, ParsingContext()) \
        == '(nullIf(group_id, 0) AS issue)'
    assert column_expr(self.dataset, 'group_id', group_id_query, ParsingContext()) \
        == '(nullIf(group_id, 0) AS group_id)'

    # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where a number was expected.
    assert column_expr(
        self.dataset,
        'tags[environment]',
        deepcopy(query),
        ParsingContext(),
        alias='unique_envs',
        aggregate='uniq',
    ) == "(ifNull(uniq(environment), 0) AS unique_envs)"
def _sql_data_list(self) -> Sequence[Tuple[str, str]]:
    if self.__sql_data_list:
        return self.__sql_data_list

    parsing_context = ParsingContext()
    formatter = ClickhouseExpressionFormatter(parsing_context)

    selected_cols = [
        e.expression.accept(formatter) for e in self.__selected_columns
    ]
    select_clause = f"SELECT {', '.join(selected_cols)}"

    # TODO: The visitor approach will be used for the FROM clause as well.
    from_clause = f"FROM {self.__data_source.format_from()}"
    if self.__final:
        from_clause = f"{from_clause} FINAL"

    # TODO: Sampling rate will become one step of Clickhouse query processing
    if not self.__data_source.supports_sample():
        sample_rate = None
    else:
        if self.__sample:
            sample_rate = self.__sample
        elif self.__settings.get_turbo():
            sample_rate = settings.TURBO_SAMPLE_RATE
        else:
            sample_rate = None
    if sample_rate:
        from_clause = f"{from_clause} SAMPLE {sample_rate}"

    array_join_clause = ""
    if self.__arrayjoin:
        formatted_array_join = self.__arrayjoin.accept(formatter)
        array_join_clause = f"ARRAY JOIN {formatted_array_join}"

    prewhere_clause = ""
    if self.__prewhere:
        formatted_prewhere = self.__prewhere.accept(formatter)
        prewhere_clause = f"PREWHERE {formatted_prewhere}"

    where_clause = ""
    if self.__condition:
        where_clause = f"WHERE {self.__condition.accept(formatter)}"

    group_clause = ""
    if self.__groupby:
        # reformat to use aliases generated during the select clause formatting.
        groupby_expressions = [e.accept(formatter) for e in self.__groupby]
        group_clause = f"GROUP BY ({', '.join(groupby_expressions)})"
        if self.__hastotals:
            group_clause = f"{group_clause} WITH TOTALS"

    having_clause = ""
    if self.__having:
        having_clause = f"HAVING {self.__having.accept(formatter)}"

    order_clause = ""
    if self.__orderby:
        orderby = [
            f"{e.expression.accept(formatter)} {e.direction.value}"
            for e in self.__orderby
        ]
        order_clause = f"ORDER BY {', '.join(orderby)}"

    limitby_clause = ""
    if self.__limitby is not None:
        limitby_clause = "LIMIT {} BY {}".format(*self.__limitby)

    limit_clause = ""
    if self.__limit is not None:
        limit_clause = f"LIMIT {self.__limit} OFFSET {self.__offset}"

    self.__sql_data_list = [
        (k, v)
        for k, v in [
            ("select", select_clause),
            ("from", from_clause),
            ("array_join", array_join_clause),
            ("prewhere", prewhere_clause),
            ("where", where_clause),
            ("group", group_clause),
            ("having", having_clause),
            ("order", order_clause),
            ("limitby", limitby_clause),
            ("limit", limit_clause),
        ]
        if v
    ]
    return self.__sql_data_list
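# Hedged companion sketch: a full SQL statement can be recovered by joining the
# clause values produced by _sql_data_list above. `format_sql` is a hypothetical
# helper for illustration, not a method from the codebase; it relies only on the
# facts that clauses are emitted in SQL order and empty ones are filtered out.
def format_sql(sql_data: Sequence[Tuple[str, str]]) -> str:
    # Drop the clause labels and join the non-empty clause bodies with spaces.
    return " ".join(v for _, v in sql_data)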
def __init__(
    self,
    dataset: Dataset,
    query: Query,
    settings: RequestSettings,
    prewhere_conditions: Sequence[str],
) -> None:
    parsing_context = ParsingContext()

    aggregate_exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        column_expr(dataset, gb, query, parsing_context) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        column_expr(dataset, util.tuplify(colname), query, parsing_context)
        for colname in column_names
    ]
    select_clause = u'SELECT {}'.format(
        ', '.join(group_exprs + aggregate_exprs + selected_cols)
    )

    from_clause = u'FROM {}'.format(query.get_data_source().format_from())
    if query.get_final():
        from_clause = u'{} FINAL'.format(from_clause)

    if query.get_sample():
        sample_rate = query.get_sample()
    elif settings.get_turbo():
        sample_rate = snuba_settings.TURBO_SAMPLE_RATE
    else:
        sample_rate = None
    if sample_rate:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample_rate)

    join_clause = ''
    if query.get_arrayjoin():
        join_clause = u'ARRAY JOIN {}'.format(query.get_arrayjoin())

    where_clause = ''
    if query.get_conditions():
        where_clause = u'WHERE {}'.format(
            conditions_expr(dataset, query.get_conditions(), query, parsing_context)
        )

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(
            conditions_expr(dataset, prewhere_conditions, query, parsing_context)
        )

    group_clause = ''
    if groupby:
        group_clause = 'GROUP BY ({})'.format(
            ', '.join(
                column_expr(dataset, gb, query, parsing_context) for gb in groupby
            )
        )
        if query.has_totals():
            group_clause = '{} WITH TOTALS'.format(group_clause)

    having_clause = ''
    having_conditions = query.get_having()
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(
            conditions_expr(dataset, having_conditions, query, parsing_context)
        )

    order_clause = ''
    if query.get_orderby():
        orderby = [
            column_expr(dataset, util.tuplify(ob), query, parsing_context)
            for ob in util.to_list(query.get_orderby())
        ]
        orderby = [
            u'{} {}'.format(ob.lstrip('-'), 'DESC' if ob.startswith('-') else 'ASC')
            for ob in orderby
        ]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if query.get_limitby() is not None:
        limitby_clause = 'LIMIT {} BY {}'.format(*query.get_limitby())

    limit_clause = ''
    if query.get_limit() is not None:
        limit_clause = 'LIMIT {}, {}'.format(query.get_offset(), query.get_limit())

    self.__formatted_query = ' '.join([
        c for c in [
            select_clause,
            from_clause,
            join_clause,
            prewhere_clause,
            where_clause,
            group_clause,
            having_clause,
            order_clause,
            limitby_clause,
            limit_clause,
        ] if c
    ])
def test_conditions_expr(self, dataset):
    state.set_config('use_escape_alias', 1)
    conditions = [['a', '=', 1]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'a = 1'

    conditions = [[['a', '=', 1]]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'a = 1'

    conditions = [['a', '=', 1], ['b', '=', 2]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'a = 1 AND b = 2'

    conditions = [[['a', '=', 1], ['b', '=', 2]]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == '(a = 1 OR b = 2)'

    conditions = [[['a', '=', 1], ['b', '=', 2]], ['c', '=', 3]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == '(a = 1 OR b = 2) AND c = 3'

    conditions = [[['a', '=', 1], ['b', '=', 2]], [['c', '=', 3], ['d', '=', 4]]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == '(a = 1 OR b = 2) AND (c = 3 OR d = 4)'

    # Malformed condition input
    conditions = [[['a', '=', 1], []]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'a = 1'

    # Test column expansion
    conditions = [[['tags[foo]', '=', 1], ['b', '=', 2]]]
    expanded = column_expr(dataset, 'tags[foo]', Query({}), ParsingContext())
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == '({} = 1 OR b = 2)'.format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_query = Query({})
    parsing_context = ParsingContext()
    conditions = [[['tags[foo]', '=', 1], ['b', '=', 2]]]
    column_expr(dataset, 'tags[foo]', reuse_query, parsing_context)  # Expand it once so the next time is aliased
    assert conditions_expr(dataset, conditions, reuse_query, parsing_context) == '(`tags[foo]` = 1 OR b = 2)'

    # Test special output format of LIKE
    conditions = [['primary_hash', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'primary_hash LIKE \'%foo%\''

    conditions = tuplify([[['notEmpty', ['arrayElement', ['exception_stacks.type', 1]]], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'notEmpty(arrayElement((exception_stacks.type AS `exception_stacks.type`), 1)) = 1'

    conditions = tuplify([[['notEmpty', ['tags[sentry:user]']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 1'

    conditions = tuplify([[['notEmpty', ['tags_key']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'notEmpty((arrayJoin(tags.key) AS tags_key)) = 1'

    conditions = tuplify([
        [
            [['notEmpty', ['tags[sentry:environment]']], '=', 'dev'],
            [['notEmpty', ['tags[sentry:environment]']], '=', 'prod'],
        ],
        [
            [['notEmpty', ['tags[sentry:user]']], '=', 'joe'],
            [['notEmpty', ['tags[sentry:user]']], '=', 'bob'],
        ],
    ])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == \
        """(notEmpty((tags.value[indexOf(tags.key, 'sentry:environment')] AS `tags[sentry:environment]`)) = 'dev' OR notEmpty(`tags[sentry:environment]`) = 'prod') AND (notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 'joe' OR notEmpty(`tags[sentry:user]`) = 'bob')"""

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [['exception_frames.filename', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'arrayExists(x -> assumeNotNull(x LIKE \'%foo%\'), (exception_frames.filename AS `exception_frames.filename`))'

    # Test negative scalar condition on array column is expanded as an all() type iterator.
    conditions = [['exception_frames.filename', 'NOT LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'arrayAll(x -> assumeNotNull(x NOT LIKE \'%foo%\'), (exception_frames.filename AS `exception_frames.filename`))'

    # Test that a duplicate IN condition is deduplicated even if
    # the lists are in different orders.
    conditions = tuplify([
        ['platform', 'IN', ['a', 'b', 'c']],
        ['platform', 'IN', ['c', 'b', 'a']],
    ])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == "platform IN ('a', 'b', 'c')"
def __init__(self, parsing_context: Optional[ParsingContext] = None) -> None:
    self._parsing_context = (
        parsing_context if parsing_context is not None else ParsingContext()
    )
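# Hedged usage sketch: the aliasing cache lives in the ParsingContext, not in
# the formatter, so sharing one context across formatter instances collapses a
# repeated aliased expression to its bare alias (mirroring test_aliases above).
# Names follow the tests in this document; nothing new is assumed.
pc = ParsingContext()
col = Column("al1", "column1", "table1")
assert col.accept(ClickhouseExpressionFormatter(pc)) == "(table1.column1 AS al1)"
assert col.accept(ClickhouseExpressionFormatter(pc)) == "al1"  # alias reused via shared pc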