def test_conditions_expr():
    dataset = get_dataset("groups")
    state.set_config('use_escape_alias', 1)
    conditions = [['events.a', '=', 1]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == '(events.a AS `events.a`) = 1'

    conditions = [[['events.a', '=', 1], ['groups.b', '=', 2]], [['events.c', '=', 3], ['groups.d', '=', 4]]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == ('((events.a AS `events.a`) = 1 OR (groups.b AS `groups.b`) = 2)'
            ' AND ((events.c AS `events.c`) = 3 OR (groups.d AS `groups.d`) = 4)')

    # Test column expansion
    conditions = [[['events.tags[foo]', '=', 1], ['groups.b', '=', 2]]]
    expanded = column_expr(dataset, 'events.tags[foo]', Query({}), ParsingContext())
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == '({} = 1 OR (groups.b AS `groups.b`) = 2)'.format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_query = Query({})
    parsing_context = ParsingContext()
    conditions = [[['events.tags[foo]', '=', 1], ['groups.b', '=', 2]]]
    column_expr(dataset, 'events.tags[foo]', reuse_query, parsing_context)  # Expand it once so the next time is aliased
    assert conditions_expr(dataset, conditions, reuse_query, parsing_context) \
        == '(`events.tags[foo]` = 1 OR (groups.b AS `groups.b`) = 2)'

    # Test special output format of LIKE
    conditions = [['events.primary_hash', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == '(events.primary_hash AS `events.primary_hash`) LIKE \'%foo%\''

    conditions = tuplify(
        [[['notEmpty', ['arrayElement', ['events.exception_stacks.type', 1]]], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty(arrayElement((events.exception_stacks.type AS `events.exception_stacks.type`), 1)) = 1'

    conditions = tuplify([[['notEmpty', ['events.tags[sentry:user]']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty(`events.tags[sentry:user]`) = 1'

    conditions = tuplify([[['notEmpty', ['events.tags_key']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty((arrayJoin(events.tags.key) AS `events.tags_key`)) = 1'

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [['events.exception_frames.filename', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'arrayExists(x -> assumeNotNull(x LIKE \'%foo%\'), (events.exception_frames.filename AS `events.exception_frames.filename`))'
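# The tests above pass nested condition lists through tuplify before handing
# them to conditions_expr. A minimal sketch of such a helper, assuming it only
# needs to recursively convert lists to tuples; the real snuba.util.tuplify
# may differ in details:
def tuplify(value):
    # Recursively turn lists/tuples into tuples so nested condition
    # structures become hashable and order-stable.
    if isinstance(value, (list, tuple)):
        return tuple(tuplify(v) for v in value)
    return value

# e.g. tuplify([['a', '=', 1]]) == (('a', '=', 1),)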
def __init__(
    self,
    dataset: Dataset,
    query: Query,
    settings: RequestSettings,
) -> None:
    parsing_context = ParsingContext()

    aggregate_exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        column_expr(dataset, gb, query, parsing_context) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        column_expr(dataset, util.tuplify(colname), query, parsing_context)
        for colname in column_names
    ]

    select_clause = u"SELECT {}".format(
        ", ".join(group_exprs + aggregate_exprs + selected_cols))

    from_clause = u"FROM {}".format(query.get_data_source().format_from())
    if query.get_final():
        from_clause = u"{} FINAL".format(from_clause)

    if not query.get_data_source().supports_sample():
        sample_rate = None
    else:
        if query.get_sample():
            sample_rate = query.get_sample()
        elif settings.get_turbo():
            sample_rate = snuba_settings.TURBO_SAMPLE_RATE
        else:
            sample_rate = None
    if sample_rate:
        from_clause = u"{} SAMPLE {}".format(from_clause, sample_rate)

    join_clause = ""
    if query.get_arrayjoin():
        join_clause = u"ARRAY JOIN {}".format(query.get_arrayjoin())

    where_clause = ""
    if query.get_conditions():
        where_clause = u"WHERE {}".format(
            conditions_expr(dataset, query.get_conditions(), query, parsing_context))

    prewhere_clause = ""
    if query.get_prewhere():
        prewhere_clause = u"PREWHERE {}".format(
            conditions_expr(dataset, query.get_prewhere(), query, parsing_context))

    group_clause = ""
    if groupby:
        group_clause = "GROUP BY ({})".format(", ".join(
            column_expr(dataset, gb, query, parsing_context) for gb in groupby))
        if query.has_totals():
            group_clause = "{} WITH TOTALS".format(group_clause)

    having_clause = ""
    having_conditions = query.get_having()
    if having_conditions:
        assert groupby, "found HAVING clause with no GROUP BY"
        having_clause = u"HAVING {}".format(
            conditions_expr(dataset, having_conditions, query, parsing_context))

    order_clause = ""
    if query.get_orderby():
        orderby = [
            column_expr(dataset, util.tuplify(ob), query, parsing_context)
            for ob in util.to_list(query.get_orderby())
        ]
        orderby = [
            u"{} {}".format(ob.lstrip("-"), "DESC" if ob.startswith("-") else "ASC")
            for ob in orderby
        ]
        order_clause = u"ORDER BY {}".format(", ".join(orderby))

    limitby_clause = ""
    if query.get_limitby() is not None:
        limitby_clause = "LIMIT {} BY {}".format(*query.get_limitby())

    limit_clause = ""
    if query.get_limit() is not None:
        limit_clause = "LIMIT {}, {}".format(query.get_offset(), query.get_limit())

    self.__formatted_query = " ".join([
        c for c in [
            select_clause, from_clause, join_clause, prewhere_clause,
            where_clause, group_clause, having_clause, order_clause,
            limitby_clause, limit_clause,
        ] if c
    ])
def __init__(
    self,
    dataset: Dataset,
    query: Query,
    settings: RequestSettings,
    prewhere_conditions: Sequence[str],
) -> None:
    parsing_context = ParsingContext()

    aggregate_exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        column_expr(dataset, gb, query, parsing_context) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        column_expr(dataset, util.tuplify(colname), query, parsing_context)
        for colname in column_names
    ]

    select_clause = u'SELECT {}'.format(
        ', '.join(group_exprs + aggregate_exprs + selected_cols))

    from_clause = u'FROM {}'.format(query.get_data_source().format_from())
    if query.get_final():
        from_clause = u'{} FINAL'.format(from_clause)

    if query.get_sample():
        sample_rate = query.get_sample()
    elif settings.get_turbo():
        sample_rate = snuba_settings.TURBO_SAMPLE_RATE
    else:
        sample_rate = None
    if sample_rate:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample_rate)

    join_clause = ''
    if query.get_arrayjoin():
        join_clause = u'ARRAY JOIN {}'.format(query.get_arrayjoin())

    where_clause = ''
    if query.get_conditions():
        where_clause = u'WHERE {}'.format(
            conditions_expr(dataset, query.get_conditions(), query, parsing_context))

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(
            conditions_expr(dataset, prewhere_conditions, query, parsing_context))

    group_clause = ''
    if groupby:
        group_clause = 'GROUP BY ({})'.format(', '.join(
            column_expr(dataset, gb, query, parsing_context) for gb in groupby))
        if query.has_totals():
            group_clause = '{} WITH TOTALS'.format(group_clause)

    having_clause = ''
    having_conditions = query.get_having()
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(
            conditions_expr(dataset, having_conditions, query, parsing_context))

    order_clause = ''
    if query.get_orderby():
        orderby = [
            column_expr(dataset, util.tuplify(ob), query, parsing_context)
            for ob in util.to_list(query.get_orderby())
        ]
        orderby = [
            u'{} {}'.format(ob.lstrip('-'), 'DESC' if ob.startswith('-') else 'ASC')
            for ob in orderby
        ]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if query.get_limitby() is not None:
        limitby_clause = 'LIMIT {} BY {}'.format(*query.get_limitby())

    limit_clause = ''
    if query.get_limit() is not None:
        limit_clause = 'LIMIT {}, {}'.format(query.get_offset(), query.get_limit())

    self.__formatted_query = ' '.join([
        c for c in [
            select_clause, from_clause, join_clause, prewhere_clause,
            where_clause, group_clause, having_clause, order_clause,
            limitby_clause, limit_clause
        ] if c
    ])
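# Both formatter versions above end the same way: every clause is rendered to
# a string (empty when the clause is absent), and the non-empty strings are
# joined with single spaces. The pattern in isolation, as a self-contained
# illustration with hand-written clause strings:
clauses = [
    "SELECT count()",
    "FROM events",
    "",  # no ARRAY JOIN
    "",  # no PREWHERE
    "WHERE project_id = 1",
    "",  # no GROUP BY / HAVING / ORDER BY / LIMIT
]
assert " ".join(c for c in clauses if c) == "SELECT count() FROM events WHERE project_id = 1"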
                    ),
                ),
                Literal(None, 1),
            ),
        ),
    ),
    # Test array columns in boolean functions are expanded as an iterator.
    (
        [["tags.key", "=", "key"]],
        FunctionCall(
            None,
            ConditionFunctions.EQ,
            (Column(None, None, "tags.key"), Literal(None, "key")),
        ),
    ),
    # Array columns not expanded because in arrayjoin
    (
        tuplify(
            [["platform", "IN", ["a", "b", "c"]], ["platform", "IN", ["c", "b", "a"]]]
        ),
        FunctionCall(
            None,
            ConditionFunctions.IN,
            (
                Column(None, None, "platform"),
                FunctionCall(
                    None,
                    "tuple",
                    (Literal(None, "a"), Literal(None, "b"), Literal(None, "c")),
                ),
            ),
        ),
    ),
    # Test that a duplicate IN condition is de-duplicated even if the lists
    # are in different orders.
    (
def test_complex_conditions_expr(self, dataset):
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    query = Query({}, source)

    assert complex_column_expr(
        dataset, tuplify(["count", []]), deepcopy(query), ParsingContext()
    ) == "count()"
    assert complex_column_expr(
        dataset, tuplify(["notEmpty", ["foo"]]), deepcopy(query), ParsingContext()
    ) == "notEmpty(foo)"
    assert complex_column_expr(
        dataset,
        tuplify(["notEmpty", ["arrayElement", ["foo", 1]]]),
        deepcopy(query),
        ParsingContext(),
    ) == "notEmpty(arrayElement(foo, 1))"
    assert complex_column_expr(
        dataset,
        tuplify(["foo", ["bar", ["qux"], "baz"]]),
        deepcopy(query),
        ParsingContext(),
    ) == "foo(bar(qux), baz)"
    assert complex_column_expr(
        dataset, tuplify(["foo", [], "a"]), deepcopy(query), ParsingContext()
    ) == "(foo() AS a)"
    assert complex_column_expr(
        dataset, tuplify(["foo", ["b", "c"], "d"]), deepcopy(query), ParsingContext()
    ) == "(foo(b, c) AS d)"
    assert complex_column_expr(
        dataset, tuplify(["foo", ["b", "c", ["d"]]]), deepcopy(query), ParsingContext()
    ) == "foo(b, c(d))"
    assert complex_column_expr(
        dataset, tuplify(["top3", ["project_id"]]), deepcopy(query), ParsingContext()
    ) == "topK(3)(project_id)"
    assert complex_column_expr(
        dataset,
        tuplify(["top10", ["project_id"], "baz"]),
        deepcopy(query),
        ParsingContext(),
    ) == "(topK(10)(project_id) AS baz)"
    assert complex_column_expr(
        dataset,
        tuplify(["emptyIfNull", ["project_id"]]),
        deepcopy(query),
        ParsingContext(),
    ) == "ifNull(project_id, '')"
    assert complex_column_expr(
        dataset,
        tuplify(["emptyIfNull", ["project_id"], "foo"]),
        deepcopy(query),
        ParsingContext(),
    ) == "(ifNull(project_id, '') AS foo)"
    assert complex_column_expr(
        dataset, tuplify(["or", ["a", "b"]]), deepcopy(query), ParsingContext()
    ) == "or(a, b)"
    assert complex_column_expr(
        dataset, tuplify(["and", ["a", "b"]]), deepcopy(query), ParsingContext()
    ) == "and(a, b)"
    assert complex_column_expr(
        dataset, tuplify(["or", [["or", ["a", "b"]], "c"]]), deepcopy(query), ParsingContext()
    ) == "or(or(a, b), c)"
    assert complex_column_expr(
        dataset, tuplify(["and", [["and", ["a", "b"]], "c"]]), deepcopy(query), ParsingContext()
    ) == "and(and(a, b), c)"
    # (A OR B) AND C
    assert complex_column_expr(
        dataset, tuplify(["and", [["or", ["a", "b"]], "c"]]), deepcopy(query), ParsingContext()
    ) == "and(or(a, b), c)"
    # (A AND B) OR C
    assert complex_column_expr(
        dataset, tuplify(["or", [["and", ["a", "b"]], "c"]]), deepcopy(query), ParsingContext()
    ) == "or(and(a, b), c)"
    # A OR B OR C OR D
    assert complex_column_expr(
        dataset,
        tuplify(["or", [["or", [["or", ["c", "d"]], "b"]], "a"]]),
        deepcopy(query),
        ParsingContext(),
    ) == "or(or(or(c, d), b), a)"
    assert complex_column_expr(
        dataset,
        tuplify([
            "if",
            [["in", ["release", "tuple", ["'foo'"]]], "release", "'other'"],
            "release",
        ]),
        deepcopy(query),
        ParsingContext(),
    ) == "(if(in(release, tuple('foo')), release, 'other') AS release)"
    assert complex_column_expr(
        dataset,
        tuplify([
            "if",
            ["in", ["release", "tuple", ["'foo'"]], "release", "'other'"],
            "release",
        ]),
        deepcopy(query),
        ParsingContext(),
    ) == "(if(in(release, tuple('foo')), release, 'other') AS release)"

    # TODO once search_message is filled in everywhere, this can be just 'message' again.
    message_expr = "(coalesce(search_message, message) AS message)"
    assert complex_column_expr(
        dataset,
        tuplify(["positionCaseInsensitive", ["message", "'lol 'single' quotes'"]]),
        deepcopy(query),
        ParsingContext(),
    ) == "positionCaseInsensitive({message_expr}, 'lol \\'single\\' quotes')".format(**locals())

    # dangerous characters are allowed but escaped in literals and column names
    assert complex_column_expr(
        dataset, tuplify(["safe", ["fo`o", "'ba'r'"]]), deepcopy(query), ParsingContext()
    ) == r"safe(`fo\`o`, 'ba\'r')"

    # Dangerous characters not allowed in functions
    with pytest.raises(AssertionError):
        assert complex_column_expr(
            dataset,
            tuplify([r"dang'erous", ["message", "`"]]),
            deepcopy(query),
            ParsingContext(),
        )

    # Or nested functions
    with pytest.raises(AssertionError):
        assert complex_column_expr(
            dataset,
            tuplify([r"safe", ["dang`erous", ["message"]]]),
            deepcopy(query),
            ParsingContext(),
        )
def _parse_query_impl(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    aggregate_exprs = []
    for aggregation in body.get("aggregations", []):
        assert isinstance(aggregation, (list, tuple))
        aggregation_function = aggregation[0]
        column_expr = aggregation[1]
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None
        aggregate_exprs.append(
            parse_aggregation(aggregation_function, column_expr, alias))

    groupby_exprs = [
        parse_expression(tuplify(group_by))
        for group_by in to_list(body.get("groupby", []))
    ]
    select_exprs = [
        parse_expression(tuplify(select))
        for select in body.get("selected_columns", [])
    ]
    selected_cols = groupby_exprs + aggregate_exprs + select_exprs

    arrayjoin = body.get("arrayjoin")
    if arrayjoin:
        array_join_expr: Optional[Expression] = parse_expression(body["arrayjoin"])
    else:
        array_join_expr = None

    where_expr = parse_conditions_to_expr(body.get("conditions", []), dataset, arrayjoin)
    having_expr = parse_conditions_to_expr(body.get("having", []), dataset, arrayjoin)

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        if isinstance(orderby, str):
            match = NEGATE_RE.match(orderby)
            assert match is not None, f"Invalid Order By clause {orderby}"
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            match = NEGATE_RE.match(orderby[0])
            assert match is not None, f"Invalid Order By clause {orderby}"
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ValueError(f"Invalid Order By clause {orderby}")
        orderby_parsed = parse_expression(tuplify(orderby))
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            ))

    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    return Query(
        body,
        source,
        selected_columns=selected_cols,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=groupby_exprs,
        having=having_expr,
        order_by=orderby_exprs,
    )
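# The orderby loop above relies on NEGATE_RE to split an optional leading "-"
# (descending) from the column or function name. A plausible definition,
# assumed here for illustration rather than taken from the Snuba source:
import re

NEGATE_RE = re.compile(r"^(-?)(.*)$")  # assumption: optional minus, then the rest

match = NEGATE_RE.match("-timestamp")
assert match is not None
direction, col = match.groups()
assert (direction, col) == ("-", "timestamp")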
def test_complex_conditions_expr(self, dataset):
    query = Query({})
    assert complex_column_expr(dataset, tuplify(['count', []]), deepcopy(query), ParsingContext()) == 'count()'
    assert complex_column_expr(dataset, tuplify(['notEmpty', ['foo']]), deepcopy(query), ParsingContext()) == 'notEmpty(foo)'
    assert complex_column_expr(dataset, tuplify(['notEmpty', ['arrayElement', ['foo', 1]]]), deepcopy(query), ParsingContext()) == 'notEmpty(arrayElement(foo, 1))'
    assert complex_column_expr(dataset, tuplify(['foo', ['bar', ['qux'], 'baz']]), deepcopy(query), ParsingContext()) == 'foo(bar(qux), baz)'
    assert complex_column_expr(dataset, tuplify(['foo', [], 'a']), deepcopy(query), ParsingContext()) == '(foo() AS a)'
    assert complex_column_expr(dataset, tuplify(['foo', ['b', 'c'], 'd']), deepcopy(query), ParsingContext()) == '(foo(b, c) AS d)'
    assert complex_column_expr(dataset, tuplify(['foo', ['b', 'c', ['d']]]), deepcopy(query), ParsingContext()) == 'foo(b, c(d))'
    assert complex_column_expr(dataset, tuplify(['top3', ['project_id']]), deepcopy(query), ParsingContext()) == 'topK(3)(project_id)'
    assert complex_column_expr(dataset, tuplify(['top10', ['project_id'], 'baz']), deepcopy(query), ParsingContext()) == '(topK(10)(project_id) AS baz)'
    assert complex_column_expr(dataset, tuplify(['emptyIfNull', ['project_id']]), deepcopy(query), ParsingContext()) == "ifNull(project_id, '')"
    assert complex_column_expr(dataset, tuplify(['emptyIfNull', ['project_id'], 'foo']), deepcopy(query), ParsingContext()) == "(ifNull(project_id, '') AS foo)"
    assert complex_column_expr(dataset, tuplify(['or', ['a', 'b']]), deepcopy(query), ParsingContext()) == 'or(a, b)'
    assert complex_column_expr(dataset, tuplify(['and', ['a', 'b']]), deepcopy(query), ParsingContext()) == 'and(a, b)'
    assert complex_column_expr(dataset, tuplify(['or', [['or', ['a', 'b']], 'c']]), deepcopy(query), ParsingContext()) == 'or(or(a, b), c)'
    assert complex_column_expr(dataset, tuplify(['and', [['and', ['a', 'b']], 'c']]), deepcopy(query), ParsingContext()) == 'and(and(a, b), c)'
    # (A OR B) AND C
    assert complex_column_expr(dataset, tuplify(['and', [['or', ['a', 'b']], 'c']]), deepcopy(query), ParsingContext()) == 'and(or(a, b), c)'
    # (A AND B) OR C
    assert complex_column_expr(dataset, tuplify(['or', [['and', ['a', 'b']], 'c']]), deepcopy(query), ParsingContext()) == 'or(and(a, b), c)'
    # A OR B OR C OR D
    assert complex_column_expr(dataset, tuplify(['or', [['or', [['or', ['c', 'd']], 'b']], 'a']]), deepcopy(query), ParsingContext()) == 'or(or(or(c, d), b), a)'
    assert complex_column_expr(dataset, tuplify(['if', [['in', ['release', 'tuple', ["'foo'"]]], 'release', "'other'"], 'release']), deepcopy(query), ParsingContext()) == "(if(in(release, tuple('foo')), release, 'other') AS release)"
    assert complex_column_expr(dataset, tuplify(['if', ['in', ['release', 'tuple', ["'foo'"]], 'release', "'other'"], 'release']), deepcopy(query), ParsingContext()) == "(if(in(release, tuple('foo')), release, 'other') AS release)"

    # TODO once search_message is filled in everywhere, this can be just 'message' again.
    message_expr = '(coalesce(search_message, message) AS message)'
    assert complex_column_expr(dataset, tuplify(['positionCaseInsensitive', ['message', "'lol 'single' quotes'"]]), deepcopy(query), ParsingContext()) \
        == "positionCaseInsensitive({message_expr}, 'lol \\'single\\' quotes')".format(**locals())

    # dangerous characters are allowed but escaped in literals and column names
    assert complex_column_expr(dataset, tuplify(['safe', ['fo`o', "'ba'r'"]]), deepcopy(query), ParsingContext()) == r"safe(`fo\`o`, 'ba\'r')"

    # Dangerous characters not allowed in functions
    with pytest.raises(AssertionError):
        assert complex_column_expr(dataset, tuplify([r"dang'erous", ['message', '`']]), deepcopy(query), ParsingContext())

    # Or nested functions
    with pytest.raises(AssertionError):
        assert complex_column_expr(dataset, tuplify([r"safe", ['dang`erous', ['message']]]), deepcopy(query), ParsingContext())
def test_conditions_expr(self):
    conditions = [['a', '=', 1]]
    assert conditions_expr(conditions, {}) == 'a = 1'

    conditions = [[['a', '=', 1]]]
    assert conditions_expr(conditions, {}) == 'a = 1'

    conditions = [['a', '=', 1], ['b', '=', 2]]
    assert conditions_expr(conditions, {}) == 'a = 1 AND b = 2'

    conditions = [[['a', '=', 1], ['b', '=', 2]]]
    assert conditions_expr(conditions, {}) == '(a = 1 OR b = 2)'

    conditions = [[['a', '=', 1], ['b', '=', 2]], ['c', '=', 3]]
    assert conditions_expr(conditions, {}) == '(a = 1 OR b = 2) AND c = 3'

    conditions = [[['a', '=', 1], ['b', '=', 2]], [['c', '=', 3], ['d', '=', 4]]]
    assert conditions_expr(conditions, {}) == '(a = 1 OR b = 2) AND (c = 3 OR d = 4)'

    # Malformed condition input
    conditions = [[['a', '=', 1], []]]
    assert conditions_expr(conditions, {}) == 'a = 1'

    # Test column expansion
    conditions = [[['tags[foo]', '=', 1], ['b', '=', 2]]]
    expanded = column_expr('tags[foo]', {})
    assert conditions_expr(conditions, {}) == '({} = 1 OR b = 2)'.format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_body = {}
    conditions = [[['tags[foo]', '=', 1], ['b', '=', 2]]]
    column_expr('tags[foo]', reuse_body)  # Expand it once so the next time is aliased
    assert conditions_expr(conditions, reuse_body) == '(`tags[foo]` = 1 OR b = 2)'

    # Test special output format of LIKE
    conditions = [['primary_hash', 'LIKE', '%foo%']]
    assert conditions_expr(conditions, {}) == 'primary_hash LIKE \'%foo%\''

    conditions = tuplify(
        [[['notEmpty', ['arrayElement', ['exception_stacks.type', 1]]], '=', 1]])
    assert conditions_expr(conditions, {}) == 'notEmpty(arrayElement(exception_stacks.type, 1)) = 1'

    conditions = tuplify([[['notEmpty', ['tags[sentry:user]']], '=', 1]])
    assert conditions_expr(conditions, {}) == 'notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 1'

    conditions = tuplify([[['notEmpty', ['tags_key']], '=', 1]])
    assert conditions_expr(conditions, {}) == 'notEmpty((arrayJoin(tags.key) AS tags_key)) = 1'

    conditions = tuplify([
        [[['notEmpty', ['tags[sentry:environment]']], '=', 'dev'],
         [['notEmpty', ['tags[sentry:environment]']], '=', 'prod']],
        [[['notEmpty', ['tags[sentry:user]']], '=', 'joe'],
         [['notEmpty', ['tags[sentry:user]']], '=', 'bob']],
    ])
    assert conditions_expr(conditions, {}) == \
        """(notEmpty((tags.value[indexOf(tags.key, 'sentry:environment')] AS `tags[sentry:environment]`)) = 'dev' OR notEmpty(`tags[sentry:environment]`) = 'prod') AND (notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 'joe' OR notEmpty(`tags[sentry:user]`) = 'bob')"""

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [['exception_frames.filename', 'LIKE', '%foo%']]
    assert conditions_expr(conditions, {}) == \
        'arrayExists(x -> assumeNotNull(x LIKE \'%foo%\'), exception_frames.filename)'

    # Test negative scalar condition on array column is expanded as an all() type iterator.
    conditions = [['exception_frames.filename', 'NOT LIKE', '%foo%']]
    assert conditions_expr(conditions, {}) == \
        'arrayAll(x -> assumeNotNull(x NOT LIKE \'%foo%\'), exception_frames.filename)'
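# The last two asserts above show the rewrite rule for scalar conditions on
# array columns: positive operators become arrayExists over a lambda, negated
# ones become arrayAll. A sketch of that string rewrite, reverse-engineered
# from the expected outputs (not the actual Snuba implementation):
def wrap_array_condition(column, op, literal):
    # A negated operator must hold for every element; a positive one
    # only needs a single matching element.
    fn = "arrayAll" if op.startswith("NOT ") or op == "!=" else "arrayExists"
    return "{}(x -> assumeNotNull(x {} {}), {})".format(fn, op, literal, column)

assert wrap_array_condition("exception_frames.filename", "LIKE", "'%foo%'") \
    == "arrayExists(x -> assumeNotNull(x LIKE '%foo%'), exception_frames.filename)"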
def test_complex_conditions_expr() -> None:
    assert parse_function_to_expr(tuplify(["count", []])) == FunctionCall(
        None, "count", ()
    )
    assert parse_function_to_expr(tuplify(["notEmpty", ["foo"]])) == FunctionCall(
        None, "notEmpty", (Column(None, "foo", None),)
    )
    assert parse_function_to_expr(
        tuplify(["notEmpty", ["arrayElement", ["foo", 1]]])
    ) == FunctionCall(
        None,
        "notEmpty",
        (
            FunctionCall(
                None, "arrayElement", (Column(None, "foo", None), Literal(None, 1))
            ),
        ),
    )
    assert parse_function_to_expr(
        tuplify(["foo", ["bar", ["qux"], "baz"]])
    ) == FunctionCall(
        None,
        "foo",
        (
            FunctionCall(None, "bar", (Column(None, "qux", None),)),
            Column(None, "baz", None),
        ),
    )
    assert parse_function_to_expr(tuplify(["foo", [], "a"])) == FunctionCall(
        "a", "foo", ()
    )
    assert parse_function_to_expr(tuplify(["foo", ["b", "c"], "d"])) == FunctionCall(
        "d", "foo", (Column(None, "b", None), Column(None, "c", None))
    )
    assert parse_function_to_expr(tuplify(["foo", ["b", "c", ["d"]]])) == FunctionCall(
        None,
        "foo",
        (Column(None, "b", None), FunctionCall(None, "c", (Column(None, "d", None),))),
    )
    assert parse_function_to_expr(
        tuplify(["emptyIfNull", ["project_id"]])
    ) == FunctionCall(None, "emptyIfNull", (Column(None, "project_id", None),))
    assert parse_function_to_expr(
        tuplify(["or", [["or", ["a", "b"]], "c"]])
    ) == binary_condition(
        None,
        BooleanFunctions.OR,
        binary_condition(
            None, BooleanFunctions.OR, Column(None, "a", None), Column(None, "b", None)
        ),
        Column(None, "c", None),
    )
    assert parse_function_to_expr(
        tuplify(["and", [["and", ["a", "b"]], "c"]])
    ) == binary_condition(
        None,
        BooleanFunctions.AND,
        binary_condition(
            None, BooleanFunctions.AND, Column(None, "a", None), Column(None, "b", None)
        ),
        Column(None, "c", None),
    )
    # (A OR B) AND C
    assert parse_function_to_expr(
        tuplify(["and", [["or", ["a", "b"]], "c"]])
    ) == binary_condition(
        None,
        BooleanFunctions.AND,
        binary_condition(
            None, BooleanFunctions.OR, Column(None, "a", None), Column(None, "b", None)
        ),
        Column(None, "c", None),
    )
    # A OR B OR C OR D
    assert parse_function_to_expr(
        tuplify(["or", [["or", [["or", ["c", "d"]], "b"]], "a"]])
    ) == binary_condition(
        None,
        BooleanFunctions.OR,
        binary_condition(
            None,
            BooleanFunctions.OR,
            binary_condition(
                None,
                BooleanFunctions.OR,
                Column(None, "c", None),
                Column(None, "d", None),
            ),
            Column(None, "b", None),
        ),
        Column(None, "a", None),
    )
    assert parse_function_to_expr(
        tuplify(
            [
                "if",
                [["in", ["release", "tuple", ["'foo'"]]], "release", "'other'"],
                "release",
            ]
        )
    ) == FunctionCall(
        "release",
        "if",
        (
            FunctionCall(
                None,
                "in",
                (
                    Column(None, "release", None),
                    FunctionCall(None, "tuple", (Literal(None, "foo"),)),
                ),
            ),
            Column(None, "release", None),
            Literal(None, "other"),
        ),
    )
    # TODO once search_message is filled in everywhere, this can be just 'message' again.
    assert parse_function_to_expr(
        tuplify(["positionCaseInsensitive", ["message", "'lol 'single' quotes'"]])
    ) == FunctionCall(
        None,
        "positionCaseInsensitive",
        (Column(None, "message", None), Literal(None, "lol 'single' quotes")),
    )
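# binary_condition, used in the expected values above, is a thin constructor
# over FunctionCall. A minimal sketch consistent with how the assertions use
# it (two operands wrapped into a single AST node):
def binary_condition(alias, function_name, lhs, rhs):
    # e.g. binary_condition(None, BooleanFunctions.AND, a, b) models and(a, b)
    return FunctionCall(alias, function_name, (lhs, rhs))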
def test_column_expr(self):
    body = {'granularity': 86400}
    # Single tag expression
    assert column_expr(self.dataset, 'tags[foo]', body.copy()) == \
        "(tags.value[indexOf(tags.key, 'foo')] AS `tags[foo]`)"

    # Promoted tag expression / no translation
    assert column_expr(self.dataset, 'tags[server_name]', body.copy()) == \
        "(server_name AS `tags[server_name]`)"

    # Promoted tag expression / with translation
    assert column_expr(self.dataset, 'tags[app.device]', body.copy()) == \
        "(app_device AS `tags[app.device]`)"

    # All tag keys expression
    assert column_expr(self.dataset, 'tags_key', body.copy()) == \
        '(arrayJoin(tags.key) AS tags_key)'

    # If we are going to use both tags_key and tags_value, expand both
    tag_group_body = {'groupby': ['tags_key', 'tags_value']}
    assert column_expr(self.dataset, 'tags_key', tag_group_body) == (
        '(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) '
        'AS all_tags))[1] AS tags_key)')

    assert column_expr(self.dataset, 'time', body.copy()) == \
        "(toDate(timestamp) AS time)"
    assert column_expr(self.dataset, 'rtime', body.copy()) == \
        "(toDate(received) AS rtime)"
    assert column_expr(self.dataset, 'col', body.copy(), aggregate='sum') == \
        "(sum(col) AS col)"
    assert column_expr(self.dataset, 'col', body.copy(), alias='summation', aggregate='sum') == \
        "(sum(col) AS summation)"

    # Special cases where count() doesn't need a column
    assert column_expr(self.dataset, '', body.copy(), alias='count', aggregate='count()') == \
        "(count() AS count)"
    assert column_expr(self.dataset, '', body.copy(), alias='aggregate', aggregate='count()') == \
        "(count() AS aggregate)"

    # Columns that need escaping
    assert column_expr(self.dataset, 'sentry:release', body.copy()) == '`sentry:release`'

    # Columns that start with a negative sign (used in orderby to signify
    # sort order) retain the '-' sign outside the escaping backticks (if any)
    assert column_expr(self.dataset, '-timestamp', body.copy()) == '-timestamp'
    assert column_expr(self.dataset, '-sentry:release', body.copy()) == '-`sentry:release`'

    # A 'column' that is actually a string literal
    assert column_expr(self.dataset, "'hello world'", body.copy()) == "'hello world'"

    # Complex expressions (function calls) involving both string and column arguments
    assert column_expr(self.dataset, tuplify(['concat', ['a', "':'", 'b']]), body.copy()) == \
        "concat(a, ':', b)"

    group_id_body = body.copy()
    assert column_expr(self.dataset, 'issue', group_id_body) == '(nullIf(group_id, 0) AS issue)'
    assert column_expr(self.dataset, 'group_id', group_id_body) == '(nullIf(group_id, 0) AS group_id)'

    # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where a number was expected.
    assert column_expr(
        self.dataset, 'tags[environment]', body.copy(),
        alias='unique_envs', aggregate='uniq'
    ) == "(ifNull(uniq(environment), 0) AS unique_envs)"
def format(self) -> str:
    """Generate a SQL string from the parameters."""
    body = self.__request.body
    query = self.__request.query
    source = self.__dataset \
        .get_dataset_schemas() \
        .get_read_schema() \
        .get_data_source()

    aggregate_exprs = [
        util.column_expr(self.__dataset, col, body, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        util.column_expr(self.__dataset, gb, body) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        util.column_expr(self.__dataset, util.tuplify(colname), body)
        for colname in column_names
    ]

    select_clause = u'SELECT {}'.format(
        ', '.join(group_exprs + aggregate_exprs + selected_cols))

    from_clause = u'FROM {}'.format(source)
    if self.__final:
        from_clause = u'{} FINAL'.format(from_clause)
    if query.get_sample():
        from_clause = u'{} SAMPLE {}'.format(from_clause, query.get_sample())

    join_clause = ''
    if 'arrayjoin' in body:
        join_clause = u'ARRAY JOIN {}'.format(body['arrayjoin'])

    where_clause = ''
    if query.get_conditions():
        where_clause = u'WHERE {}'.format(
            util.conditions_expr(self.__dataset, query.get_conditions(), body))

    prewhere_clause = ''
    if self.__prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(
            util.conditions_expr(self.__dataset, self.__prewhere_conditions, body))

    group_clause = ''
    if groupby:
        group_clause = 'GROUP BY ({})'.format(', '.join(
            util.column_expr(self.__dataset, gb, body) for gb in groupby))
        if body.get('totals', False):
            group_clause = '{} WITH TOTALS'.format(group_clause)

    having_clause = ''
    having_conditions = body.get('having', [])
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(
            util.conditions_expr(self.__dataset, having_conditions, body))

    order_clause = ''
    if query.get_orderby():
        orderby = [
            util.column_expr(self.__dataset, util.tuplify(ob), body)
            for ob in util.to_list(query.get_orderby())
        ]
        orderby = [
            u'{} {}'.format(ob.lstrip('-'), 'DESC' if ob.startswith('-') else 'ASC')
            for ob in orderby
        ]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if 'limitby' in body:
        limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

    limit_clause = ''
    if 'limit' in body:
        limit_clause = 'LIMIT {}, {}'.format(query.get_offset(), body['limit'])

    return ' '.join([
        c for c in [
            select_clause, from_clause, join_clause, prewhere_clause,
            where_clause, group_clause, having_clause, order_clause,
            limitby_clause, limit_clause
        ] if c
    ])
def test_column_expr(self):
    source = (
        self.dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    query = Query({"granularity": 86400}, source)
    # Single tag expression
    assert (
        column_expr(self.dataset, "tags[foo]", deepcopy(query), ParsingContext())
        == "(tags.value[indexOf(tags.key, 'foo')] AS `tags[foo]`)"
    )
    # Promoted tag expression / no translation
    assert (
        column_expr(
            self.dataset, "tags[server_name]", deepcopy(query), ParsingContext()
        )
        == "(server_name AS `tags[server_name]`)"
    )
    # Promoted tag expression / with translation
    assert (
        column_expr(
            self.dataset, "tags[app.device]", deepcopy(query), ParsingContext()
        )
        == "(app_device AS `tags[app.device]`)"
    )
    # Promoted context expression / with translation
    assert (
        column_expr(
            self.dataset,
            "contexts[device.battery_level]",
            deepcopy(query),
            ParsingContext(),
        )
        == "(toString(device_battery_level) AS `contexts[device.battery_level]`)"
    )
    # All tag keys expression
    q = Query({"granularity": 86400, "selected_columns": ["tags_key"]}, source)
    assert column_expr(self.dataset, "tags_key", q, ParsingContext()) == (
        "(arrayJoin(tags.key) AS tags_key)"
    )
    # If we are going to use both tags_key and tags_value, expand both
    tag_group_body = {"groupby": ["tags_key", "tags_value"]}
    assert column_expr(
        self.dataset, "tags_key", Query(tag_group_body, source), ParsingContext()
    ) == (
        "(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) "
        "AS all_tags))[1] AS tags_key)"
    )
    assert (
        column_expr(self.dataset, "time", deepcopy(query), ParsingContext())
        == "(toDate(timestamp) AS time)"
    )
    assert (
        column_expr(self.dataset, "rtime", deepcopy(query), ParsingContext())
        == "(toDate(received) AS rtime)"
    )
    assert (
        column_expr(
            self.dataset, "col", deepcopy(query), ParsingContext(), aggregate="sum"
        )
        == "(sum(col) AS col)"
    )
    assert (
        column_expr(
            self.dataset,
            "col",
            deepcopy(query),
            ParsingContext(),
            alias="summation",
            aggregate="sum",
        )
        == "(sum(col) AS summation)"
    )
    # Special cases where count() doesn't need a column
    assert (
        column_expr(
            self.dataset,
            "",
            deepcopy(query),
            ParsingContext(),
            alias="count",
            aggregate="count()",
        )
        == "(count() AS count)"
    )
    assert (
        column_expr(
            self.dataset,
            "",
            deepcopy(query),
            ParsingContext(),
            alias="aggregate",
            aggregate="count()",
        )
        == "(count() AS aggregate)"
    )
    # Columns that need escaping
    assert (
        column_expr(
            self.dataset, "sentry:release", deepcopy(query), ParsingContext()
        )
        == "`sentry:release`"
    )
    # A 'column' that is actually a string literal
    assert (
        column_expr(
            self.dataset, "'hello world'", deepcopy(query), ParsingContext()
        )
        == "'hello world'"
    )
    # Complex expressions (function calls) involving both string and column arguments
    assert (
        column_expr(
            self.dataset,
            tuplify(["concat", ["a", "':'", "b"]]),
            deepcopy(query),
            ParsingContext(),
        )
        == "concat(a, ':', b)"
    )
    group_id_query = deepcopy(query)
    assert (
        column_expr(self.dataset, "group_id", group_id_query, ParsingContext())
        == "(nullIf(group_id, 0) AS group_id)"
    )
    # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where a number was expected.
    assert (
        column_expr(
            self.dataset,
            "tags[environment]",
            deepcopy(query),
            ParsingContext(),
            alias="unique_envs",
            aggregate="uniq",
        )
        == "(ifNull(uniq(environment), 0) AS unique_envs)"
    )
def test_conditions_expr(self, dataset): state.set_config("use_escape_alias", 1) conditions = [["a", "=", 1]] source = (dataset.get_all_storages() [0].get_schemas().get_read_schema().get_data_source()) assert (conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "a = 1") conditions = [] assert (conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "") conditions = [[[]], []] assert (conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "") conditions = [[["a", "=", 1]]] assert (conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "a = 1") conditions = [["a", "=", 1], ["b", "=", 2]] assert (conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "a = 1 AND b = 2") conditions = [[["a", "=", 1], ["b", "=", 2]]] assert (conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "(a = 1 OR b = 2)") conditions = [[["a", "=", 1], ["b", "=", 2]], ["c", "=", 3]] assert (conditions_expr(dataset, conditions, Query( {}, source), ParsingContext()) == "(a = 1 OR b = 2) AND c = 3") conditions = [[["a", "=", 1], ["b", "=", 2]], [["c", "=", 3], ["d", "=", 4]]] assert (conditions_expr( dataset, conditions, Query({}, source), ParsingContext()) == "(a = 1 OR b = 2) AND (c = 3 OR d = 4)") # Malformed condition input conditions = [[["a", "=", 1], []]] assert (conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "a = 1") # Test column expansion conditions = [[["tags[foo]", "=", 1], ["b", "=", 2]]] expanded = column_expr(dataset, "tags[foo]", Query({}, source), ParsingContext()) assert conditions_expr( dataset, conditions, Query({}, source), ParsingContext()) == "({} = 1 OR b = 2)".format(expanded) # Test using alias if column has already been expanded in SELECT clause reuse_query = Query({}, source) parsing_context = ParsingContext() conditions = [[["tags[foo]", "=", 1], ["b", "=", 2]]] column_expr( dataset, "tags[foo]", reuse_query, parsing_context) # Expand it once so the next time is aliased assert (conditions_expr( dataset, conditions, reuse_query, parsing_context) == "(`tags[foo]` = 1 OR b = 2)") # Test special output format of LIKE conditions = [["primary_hash", "LIKE", "%foo%"]] assert (conditions_expr(dataset, conditions, Query( {}, source), ParsingContext()) == "primary_hash LIKE '%foo%'") conditions = tuplify( [[["notEmpty", ["arrayElement", ["exception_stacks.type", 1]]], "=", 1]]) assert ( conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "notEmpty(arrayElement((exception_stacks.type AS `exception_stacks.type`), 1)) = 1" ) conditions = tuplify([[["notEmpty", ["tags[sentry:user]"]], "=", 1]]) assert (conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 1") conditions = tuplify([[["notEmpty", ["tags_key"]], "=", 1]]) assert (conditions_expr( dataset, conditions, Query({"conditions": [[["notEmpty", ["tags_key"]], "=", 1]]}, source), ParsingContext(), ) == "notEmpty((arrayJoin(tags.key) AS tags_key)) = 1") conditions = tuplify([ [ [["notEmpty", ["tags[sentry:environment]"]], "=", "dev"], [["notEmpty", ["tags[sentry:environment]"]], "=", "prod"], ], [ [["notEmpty", ["tags[sentry:user]"]], "=", "joe"], [["notEmpty", ["tags[sentry:user]"]], "=", "bob"], ], ]) assert ( conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == """(notEmpty((tags.value[indexOf(tags.key, 'sentry:environment')] AS `tags[sentry:environment]`)) = 'dev' OR 
notEmpty(`tags[sentry:environment]`) = 'prod') AND (notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 'joe' OR notEmpty(`tags[sentry:user]`) = 'bob')""" ) # Test scalar condition on array column is expanded as an iterator. conditions = [["exception_frames.filename", "LIKE", "%foo%"]] assert ( conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "arrayExists(x -> assumeNotNull(x LIKE '%foo%'), (exception_frames.filename AS `exception_frames.filename`))" ) # Test negative scalar condition on array column is expanded as an all() type iterator. conditions = [["exception_frames.filename", "NOT LIKE", "%foo%"]] assert ( conditions_expr(dataset, conditions, Query({}, source), ParsingContext()) == "arrayAll(x -> assumeNotNull(x NOT LIKE '%foo%'), (exception_frames.filename AS `exception_frames.filename`))" ) # Test that a duplicate IN condition is deduplicated even if # the lists are in different orders.[ conditions = tuplify([["platform", "IN", ["a", "b", "c"]], ["platform", "IN", ["c", "b", "a"]]]) assert (conditions_expr(dataset, conditions, Query( {}, source), ParsingContext()) == "platform IN ('a', 'b', 'c')")
def test_simple_column_expr(): dataset = get_dataset("groups") state.set_config('use_escape_alias', 1) body = {'granularity': 86400} query = Query(body) assert column_expr(dataset, "events.event_id", deepcopy(query), ParsingContext()) \ == "(events.event_id AS `events.event_id`)" assert column_expr(dataset, "groups.id", deepcopy(query), ParsingContext()) \ == "(groups.id AS `groups.id`)" assert column_expr(dataset, "events.event_id", deepcopy(query), ParsingContext(), "MyVerboseAlias") \ == "(events.event_id AS MyVerboseAlias)" # Single tag expression assert column_expr(dataset, 'events.tags[foo]', deepcopy(query), ParsingContext()) ==\ "(events.tags.value[indexOf(events.tags.key, \'foo\')] AS `events.tags[foo]`)" # Promoted tag expression / no translation assert column_expr(dataset, 'events.tags[server_name]', deepcopy(query), ParsingContext()) ==\ "(events.server_name AS `events.tags[server_name]`)" # All tag keys expression assert column_expr(dataset, 'events.tags_key', deepcopy(query), ParsingContext()) == ( '(arrayJoin(events.tags.key) AS `events.tags_key`)') # If we are going to use both tags_key and tags_value, expand both tag_group_body = {'groupby': ['events.tags_key', 'events.tags_value']} parsing_context = ParsingContext() assert column_expr( dataset, 'events.tags_key', Query(tag_group_body), parsing_context ) == ( '(((arrayJoin(arrayMap((x,y) -> [x,y], events.tags.key, events.tags.value)) ' 'AS all_tags))[1] AS `events.tags_key`)') assert column_expr(dataset, 'events.time', deepcopy(query), ParsingContext()) ==\ "(toDate(events.timestamp) AS `events.time`)" assert column_expr(dataset, 'events.col', deepcopy(query), ParsingContext(), aggregate='sum') ==\ "(sum(events.col) AS `events.col`)" assert column_expr(dataset, 'events.col', deepcopy(query), ParsingContext(), alias='summation', aggregate='sum') ==\ "(sum(events.col) AS summation)" assert column_expr(dataset, '', deepcopy(query), ParsingContext(), alias='aggregate', aggregate='count()') ==\ "(count() AS aggregate)" # Columns that need escaping assert column_expr(dataset, 'events.sentry:release', deepcopy(query), ParsingContext()) == '`events.sentry:release`' # A 'column' that is actually a string literal assert column_expr(dataset, '\'hello world\'', deepcopy(query), ParsingContext()) == '\'hello world\'' # Complex expressions (function calls) involving both string and column arguments assert column_expr(dataset, tuplify(['concat', ['a', '\':\'', 'b']]), deepcopy(query), ParsingContext()) == 'concat(a, \':\', b)' group_id_body = deepcopy(query) assert column_expr( dataset, 'events.issue', group_id_body, ParsingContext()) == '(nullIf(events.group_id, 0) AS `events.issue`)' # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where a number was expected. assert column_expr( dataset, 'events.tags[environment]', deepcopy(query), ParsingContext(), alias='unique_envs', aggregate='uniq' ) == "(ifNull(uniq(events.environment), 0) AS unique_envs)"
def test_column_expr(self):
    body = {'granularity': 86400}
    # Single tag expression
    assert column_expr('tags[foo]', body.copy()) == \
        "(tags.value[indexOf(tags.key, 'foo')] AS `tags[foo]`)"

    # Promoted tag expression / no translation
    assert column_expr('tags[server_name]', body.copy()) == \
        "(server_name AS `tags[server_name]`)"

    # Promoted tag expression / with translation
    assert column_expr('tags[app.device]', body.copy()) == \
        "(app_device AS `tags[app.device]`)"

    # All tag keys expression
    assert column_expr('tags_key', body.copy()) == '(arrayJoin(tags.key) AS tags_key)'

    # If we are going to use both tags_key and tags_value, expand both
    tag_group_body = {'groupby': ['tags_key', 'tags_value']}
    assert column_expr('tags_key', tag_group_body) == (
        '(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) '
        'AS all_tags))[1] AS tags_key)')

    assert column_expr('time', body.copy()) == "(toDate(timestamp) AS time)"
    assert column_expr('col', body.copy(), aggregate='sum') == "(sum(col) AS col)"
    # This should probably be an error as it's an aggregate with no column
    assert column_expr(None, body.copy(), alias='sum', aggregate='sum') == "sum"
    assert column_expr('col', body.copy(), alias='summation', aggregate='sum') == \
        "(sum(col) AS summation)"

    # Special cases where count() doesn't need a column
    assert column_expr('', body.copy(), alias='count', aggregate='count()') == \
        "(count() AS count)"
    assert column_expr('', body.copy(), alias='aggregate', aggregate='count()') == \
        "(count() AS aggregate)"

    # Columns that need escaping
    assert column_expr('sentry:release', body.copy()) == '`sentry:release`'

    # Columns that start with a negative sign (used in orderby to signify
    # sort order) retain the '-' sign outside the escaping backticks (if any)
    assert column_expr('-timestamp', body.copy()) == '-timestamp'
    assert column_expr('-sentry:release', body.copy()) == '-`sentry:release`'

    # A 'column' that is actually a string literal
    assert column_expr("'hello world'", body.copy()) == "'hello world'"

    # Complex expressions (function calls) involving both string and column arguments
    assert column_expr(tuplify(['concat', ['a', "':'", 'b']]), body.copy()) == "concat(a, ':', b)"

    group_id_body = body.copy()
    assert column_expr('issue', group_id_body) == '(group_id AS issue)'
def test_complex_conditions_expr(self):
    body = {}
    assert complex_column_expr(tuplify(['count', []]), body.copy()) == 'count()'
    assert complex_column_expr(tuplify(['notEmpty', ['foo']]), body.copy()) == 'notEmpty(foo)'
    assert complex_column_expr(
        tuplify(['notEmpty', ['arrayElement', ['foo', 1]]]), body.copy()) == 'notEmpty(arrayElement(foo, 1))'
    assert complex_column_expr(tuplify(['foo', ['bar', ['qux'], 'baz']]), body.copy()) == 'foo(bar(qux), baz)'
    assert complex_column_expr(tuplify(['foo', [], 'a']), body.copy()) == '(foo() AS a)'
    assert complex_column_expr(tuplify(['foo', ['b', 'c'], 'd']), body.copy()) == '(foo(b, c) AS d)'
    assert complex_column_expr(tuplify(['foo', ['b', 'c', ['d']]]), body.copy()) == 'foo(b, c(d))'

    # we may move these to special Snuba function calls in the future
    assert complex_column_expr(tuplify(['topK', [3], ['project_id']]), body.copy()) == 'topK(3)(project_id)'
    assert complex_column_expr(
        tuplify(['topK', [3], ['project_id'], 'baz']), body.copy()) == '(topK(3)(project_id) AS baz)'

    assert complex_column_expr(tuplify(['emptyIfNull', ['project_id']]), body.copy()) == "ifNull(project_id, '')"
    assert complex_column_expr(
        tuplify(['emptyIfNull', ['project_id'], 'foo']), body.copy()) == "(ifNull(project_id, '') AS foo)"

    # TODO once search_message is filled in everywhere, this can be just 'message' again.
    message_expr = '(coalesce(search_message, message) AS message)'
    assert complex_column_expr(tuplify(['positionCaseInsensitive', ['message', "'lol 'single' quotes'"]]), body.copy()) \
        == "positionCaseInsensitive({message_expr}, 'lol \\'single\\' quotes')".format(**locals())

    # dangerous characters are allowed but escaped in literals and column names
    assert complex_column_expr(tuplify(['safe', ['fo`o', "'ba'r'"]]), body.copy()) == r"safe(`fo\`o`, 'ba\'r')"

    # Dangerous characters not allowed in functions
    with pytest.raises(AssertionError):
        assert complex_column_expr(
            tuplify([r"dang'erous", ['message', '`']]), body.copy())

    # Or nested functions
    with pytest.raises(AssertionError):
        assert complex_column_expr(
            tuplify([r"safe", ['dang`erous', ['message']]]), body.copy())
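# The escaping expectations above (backticks around unsafe column names,
# backslash-escaped quotes inside literals) can be reproduced with two small
# helpers. This is inferred from the expected strings in the tests; the
# SAFE_COL_RE pattern is an assumption, not the actual Snuba regex:
import re

SAFE_COL_RE = re.compile(r"^-?[a-zA-Z_][a-zA-Z0-9_\.]*$")

def escape_col(col):
    if not col or SAFE_COL_RE.match(col):
        return col
    # Escape embedded backticks, then backtick-quote the whole name.
    return "`{}`".format(col.replace("`", "\\`"))

def escape_literal_string(value):
    return "'{}'".format(value.replace("'", "\\'"))

assert escape_col("fo`o") == r"`fo\`o`"
assert escape_literal_string("ba'r") == r"'ba\'r'"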
def test_conditions_expr(self, dataset):
    state.set_config('use_escape_alias', 1)
    conditions = [['a', '=', 1]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'a = 1'

    conditions = [[['a', '=', 1]]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'a = 1'

    conditions = [['a', '=', 1], ['b', '=', 2]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'a = 1 AND b = 2'

    conditions = [[['a', '=', 1], ['b', '=', 2]]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == '(a = 1 OR b = 2)'

    conditions = [[['a', '=', 1], ['b', '=', 2]], ['c', '=', 3]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == '(a = 1 OR b = 2) AND c = 3'

    conditions = [[['a', '=', 1], ['b', '=', 2]], [['c', '=', 3], ['d', '=', 4]]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == '(a = 1 OR b = 2) AND (c = 3 OR d = 4)'

    # Malformed condition input
    conditions = [[['a', '=', 1], []]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'a = 1'

    # Test column expansion
    conditions = [[['tags[foo]', '=', 1], ['b', '=', 2]]]
    expanded = column_expr(dataset, 'tags[foo]', Query({}), ParsingContext())
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == '({} = 1 OR b = 2)'.format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_query = Query({})
    parsing_context = ParsingContext()
    conditions = [[['tags[foo]', '=', 1], ['b', '=', 2]]]
    column_expr(dataset, 'tags[foo]', reuse_query, parsing_context)  # Expand it once so the next time is aliased
    assert conditions_expr(dataset, conditions, reuse_query, parsing_context) == '(`tags[foo]` = 1 OR b = 2)'

    # Test special output format of LIKE
    conditions = [['primary_hash', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == 'primary_hash LIKE \'%foo%\''

    conditions = tuplify([[['notEmpty', ['arrayElement', ['exception_stacks.type', 1]]], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == \
        'notEmpty(arrayElement((exception_stacks.type AS `exception_stacks.type`), 1)) = 1'

    conditions = tuplify([[['notEmpty', ['tags[sentry:user]']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == \
        'notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 1'

    conditions = tuplify([[['notEmpty', ['tags_key']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == \
        'notEmpty((arrayJoin(tags.key) AS tags_key)) = 1'

    conditions = tuplify([
        [
            [['notEmpty', ['tags[sentry:environment]']], '=', 'dev'],
            [['notEmpty', ['tags[sentry:environment]']], '=', 'prod'],
        ],
        [
            [['notEmpty', ['tags[sentry:user]']], '=', 'joe'],
            [['notEmpty', ['tags[sentry:user]']], '=', 'bob'],
        ],
    ])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == \
        """(notEmpty((tags.value[indexOf(tags.key, 'sentry:environment')] AS `tags[sentry:environment]`)) = 'dev' OR notEmpty(`tags[sentry:environment]`) = 'prod') AND (notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 'joe' OR notEmpty(`tags[sentry:user]`) = 'bob')"""

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [['exception_frames.filename', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == \
        'arrayExists(x -> assumeNotNull(x LIKE \'%foo%\'), (exception_frames.filename AS `exception_frames.filename`))'

    # Test negative scalar condition on array column is expanded as an all() type iterator.
    conditions = [['exception_frames.filename', 'NOT LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == \
        'arrayAll(x -> assumeNotNull(x NOT LIKE \'%foo%\'), (exception_frames.filename AS `exception_frames.filename`))'

    # Test that a duplicate IN condition is deduplicated even if
    # the lists are in different orders.
    conditions = tuplify([
        ['platform', 'IN', ['a', 'b', 'c']],
        ['platform', 'IN', ['c', 'b', 'a']],
    ])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) == "platform IN ('a', 'b', 'c')"
def test_simple_column_expr(): dataset = get_dataset("groups") source = (dataset.get_all_storages() [0].get_schemas().get_read_schema().get_data_source()) body = {"granularity": 86400} query = Query(body, source) assert (column_expr( dataset, "events.event_id", deepcopy(query), ParsingContext()) == "(events.event_id AS `events.event_id`)") assert (column_expr(dataset, "groups.id", deepcopy(query), ParsingContext()) == "(groups.id AS `groups.id`)") assert (column_expr( dataset, "events.event_id", deepcopy(query), ParsingContext(), "MyVerboseAlias", ) == "(events.event_id AS MyVerboseAlias)") # Single tag expression assert ( column_expr(dataset, "events.tags[foo]", deepcopy(query), ParsingContext()) == "(events.tags.value[indexOf(events.tags.key, 'foo')] AS `events.tags[foo]`)" ) # Promoted tag expression / no translation assert (column_expr(dataset, "events.tags[server_name]", deepcopy(query), ParsingContext()) == "(events.server_name AS `events.tags[server_name]`)") # All tag keys expression q = Query({"selected_columns": ["events.tags_key"]}, source) assert column_expr(dataset, "events.tags_key", q, ParsingContext()) == ( "(arrayJoin(events.tags.key) AS `events.tags_key`)") # If we are going to use both tags_key and tags_value, expand both tag_group_body = {"groupby": ["events.tags_key", "events.tags_value"]} parsing_context = ParsingContext() assert column_expr(dataset, "events.tags_key", Query( tag_group_body, source ), parsing_context) == ( "(((arrayJoin(arrayMap((x,y) -> [x,y], events.tags.key, events.tags.value)) " "AS all_tags))[1] AS `events.tags_key`)") assert (column_expr( dataset, "events.time", deepcopy(query), ParsingContext()) == "(toDate(events.timestamp) AS `events.time`)") assert (column_expr( dataset, "events.col", deepcopy(query), ParsingContext(), aggregate="sum") == "(sum(events.col) AS `events.col`)") assert (column_expr( dataset, "events.col", deepcopy(query), ParsingContext(), alias="summation", aggregate="sum", ) == "(sum(events.col) AS summation)") assert (column_expr( dataset, "", deepcopy(query), ParsingContext(), alias="aggregate", aggregate="count()", ) == "(count() AS aggregate)") # Columns that need escaping assert (column_expr(dataset, "events.sentry:release", deepcopy(query), ParsingContext()) == "`events.sentry:release`") # A 'column' that is actually a string literal assert (column_expr(dataset, "'hello world'", deepcopy(query), ParsingContext()) == "'hello world'") # Complex expressions (function calls) involving both string and column arguments assert (column_expr( dataset, tuplify(["concat", ["a", "':'", "b"]]), deepcopy(query), ParsingContext(), ) == "concat(a, ':', b)") group_id_body = deepcopy(query) assert (column_expr(dataset, "events.group_id", group_id_body, ParsingContext()) == "(nullIf(events.group_id, 0) AS `events.group_id`)") # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where a number was expected. assert (column_expr( dataset, "events.tags[environment]", deepcopy(query), ParsingContext(), alias="unique_envs", aggregate="uniq", ) == "(ifNull(uniq(events.environment), 0) AS unique_envs)")
def _parse_query_impl(body: MutableMapping[str, Any], entity: Entity) -> Query:
    def build_selected_expressions(
        raw_expressions: Sequence[Any],
    ) -> List[SelectedExpression]:
        output = []
        for raw_expression in raw_expressions:
            exp = parse_expression(tuplify(raw_expression), entity.get_data_model(), set())
            output.append(
                SelectedExpression(
                    # An expression in the query can be a string or a
                    # complex list with an alias. In the second case
                    # we trust the parser to find the alias.
                    name=raw_expression if isinstance(raw_expression, str) else exp.alias,
                    expression=exp,
                )
            )
        return output

    aggregations = []
    for aggregation in body.get("aggregations", []):
        if not isinstance(aggregation, Sequence):
            raise ParsingException(
                f"Invalid aggregation structure {aggregation}. "
                "It must be a sequence containing expression, column and alias."
            )
        aggregation_function = aggregation[0]
        column_expr = aggregation[1]
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None
        aggregations.append(
            SelectedExpression(
                name=alias,
                expression=parse_aggregation(
                    aggregation_function,
                    column_expr,
                    alias,
                    entity.get_data_model(),
                    set(),
                ),
            )
        )

    groupby_clause = build_selected_expressions(to_list(body.get("groupby", [])))

    select_clause = (
        groupby_clause
        + aggregations
        + build_selected_expressions(body.get("selected_columns", []))
    )

    array_join_cols = set()
    arrayjoin = body.get("arrayjoin")
    # TODO: Properly detect all array join columns in all clauses of the query.
    # This is missing an arrayJoin in condition with an alias that is then
    # used in the select.
    if arrayjoin:
        array_join_cols.add(arrayjoin)
        array_join_expr: Optional[Expression] = parse_expression(
            body["arrayjoin"], entity.get_data_model(), {arrayjoin}
        )
    else:
        array_join_expr = None

    for select_expr in select_clause:
        if isinstance(select_expr.expression, FunctionCall):
            if select_expr.expression.function_name == "arrayJoin":
                parameters = select_expr.expression.parameters
                if len(parameters) != 1:
                    raise ParsingException(
                        "arrayJoin(...) only accepts a single parameter.")
                if isinstance(parameters[0], Column):
                    array_join_cols.add(parameters[0].column_name)
                else:
                    # We only accept columns or functions that do not
                    # reference columns. We could not say whether we are
                    # actually arrayjoining on the values of the column
                    # if it is nested in an arbitrary function. But
                    # functions of literals are fine.
                    for e in parameters[0]:
                        if isinstance(e, Column):
                            raise ParsingException(
                                "arrayJoin(...) cannot contain columns nested in functions."
                            )

    where_expr = parse_conditions_to_expr(body.get("conditions", []), entity, array_join_cols)
    having_expr = parse_conditions_to_expr(body.get("having", []), entity, array_join_cols)

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        if isinstance(orderby, str):
            match = NEGATE_RE.match(orderby)
            if match is None:
                raise ParsingException(
                    f"Invalid Order By clause {orderby}. If the Order By is a string, "
                    "it must respect the format `[-]column`"
                )
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            match = NEGATE_RE.match(orderby[0])
            if match is None:
                raise ParsingException(
                    f"Invalid Order By clause {orderby}. If the Order By is an expression, "
                    "the function name must respect the format `[-]func_name`"
                )
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ParsingException(
                f"Invalid Order By clause {orderby}. The clause was neither "
                "a string nor a function call."
            )
        orderby_parsed = parse_expression(tuplify(orderby), entity.get_data_model(), set())
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            )
        )

    return Query(
        body,
        None,
        selected_columns=select_clause,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=[g.expression for g in groupby_clause],
        having=having_expr,
        order_by=orderby_exprs,
    )
def test_conditions_expr():
    dataset = get_dataset("groups")
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    state.set_config("use_escape_alias", 1)

    conditions = [["events.a", "=", 1]]
    query = Query({}, source)
    assert (
        conditions_expr(dataset, conditions, deepcopy(query), ParsingContext())
        == "(events.a AS `events.a`) = 1"
    )

    conditions = [
        [["events.a", "=", 1], ["groups.b", "=", 2]],
        [["events.c", "=", 3], ["groups.d", "=", 4]],
    ]
    assert conditions_expr(
        dataset, conditions, deepcopy(query), ParsingContext()
    ) == (
        "((events.a AS `events.a`) = 1 OR (groups.b AS `groups.b`) = 2)"
        " AND ((events.c AS `events.c`) = 3 OR (groups.d AS `groups.d`) = 4)"
    )

    # Test column expansion
    conditions = [[["events.tags[foo]", "=", 1], ["groups.b", "=", 2]]]
    expanded = column_expr(
        dataset, "events.tags[foo]", deepcopy(query), ParsingContext()
    )
    assert conditions_expr(
        dataset, conditions, deepcopy(query), ParsingContext()
    ) == "({} = 1 OR (groups.b AS `groups.b`) = 2)".format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_query = deepcopy(query)
    parsing_context = ParsingContext()
    conditions = [[["events.tags[foo]", "=", 1], ["groups.b", "=", 2]]]
    column_expr(
        dataset, "events.tags[foo]", reuse_query, parsing_context
    )  # Expand it once so the next time is aliased
    assert (
        conditions_expr(dataset, conditions, reuse_query, parsing_context)
        == "(`events.tags[foo]` = 1 OR (groups.b AS `groups.b`) = 2)"
    )

    # Test special output format of LIKE
    conditions = [["events.primary_hash", "LIKE", "%foo%"]]
    assert (
        conditions_expr(dataset, conditions, deepcopy(query), ParsingContext())
        == "(events.primary_hash AS `events.primary_hash`) LIKE '%foo%'"
    )

    conditions = tuplify(
        [[["notEmpty", ["arrayElement", ["events.exception_stacks.type", 1]]], "=", 1]]
    )
    assert (
        conditions_expr(dataset, conditions, deepcopy(query), ParsingContext())
        == "notEmpty(arrayElement((events.exception_stacks.type AS `events.exception_stacks.type`), 1)) = 1"
    )

    conditions = tuplify([[["notEmpty", ["events.tags[sentry:user]"]], "=", 1]])
    assert (
        conditions_expr(dataset, conditions, deepcopy(query), ParsingContext())
        == "notEmpty(`events.tags[sentry:user]`) = 1"
    )

    conditions = tuplify([[["notEmpty", ["events.tags_key"]], "=", 1]])
    q = Query({"selected_columns": ["events.tags_key"]}, source)
    assert (
        conditions_expr(dataset, conditions, q, ParsingContext())
        == "notEmpty((arrayJoin(events.tags.key) AS `events.tags_key`)) = 1"
    )

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [["events.exception_frames.filename", "LIKE", "%foo%"]]
    assert (
        conditions_expr(dataset, conditions, deepcopy(query), ParsingContext())
        == "arrayExists(x -> assumeNotNull(x LIKE '%foo%'), (events.exception_frames.filename AS `events.exception_frames.filename`))"
    )
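# A minimal sketch restating the condition shape exercised above, using the
# same fixtures as the test: in the legacy format the outer list is ANDed and
# each inner list of [lhs, op, rhs] triples is ORed, i.e.
# [[c1, c2], [c3]] -> (c1 OR c2) AND c3.
def _example_condition_shape() -> str:
    dataset = get_dataset("groups")
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    conditions = [
        [["events.a", "=", 1], ["events.c", "=", 3]],  # ORed together
        [["groups.b", "=", 2]],  # ANDed with the group above
    ]
    return conditions_expr(dataset, conditions, Query({}, source), ParsingContext())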
import pytest

from snuba.datasets.entities import EntityKey
from snuba.datasets.entities.factory import get_entity
from snuba.query.conditions import (
    BooleanFunctions,
    ConditionFunctions,
    binary_condition,
)
from snuba.query.expressions import Argument, Column, FunctionCall, Lambda, Literal
from snuba.query.parser.functions import parse_function_to_expr
from snuba.util import tuplify

test_data = [
    (tuplify(["count", []]), FunctionCall(None, "count", ())),
    (
        tuplify(["notEmpty", ["foo"]]),
        FunctionCall(None, "notEmpty", (Column(None, None, "foo"),)),
    ),
    (
        tuplify(["notEmpty", ["arrayElement", ["foo", 1]]]),
        FunctionCall(
            None,
            "notEmpty",
            (
                FunctionCall(
                    None,
                    "arrayElement",
                    (Column(None, None, "foo"), Literal(None, 1)),
                ),
            ),
        ),
    ),
    (
        tuplify(["foo", ["bar", ["qux"], "baz"]]),
        FunctionCall(
            None,
            "foo",
            (
                FunctionCall(None, "bar", (Column(None, None, "qux"),)),
                Column(None, None, "baz"),
            ),
        ),
    ),
]
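# A minimal sketch of how this table is typically consumed, assuming the
# parse_function_to_expr(expr, data_model, array_join_cols) signature imported
# above and an "events" entity in the factory; the test name is illustrative.
@pytest.mark.parametrize("raw, expected", test_data)
def test_parse_function(raw, expected) -> None:
    entity = get_entity(EntityKey.EVENTS)
    assert parse_function_to_expr(raw, entity.get_data_model(), set()) == expected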
def parse_and_run_query(validated_body, timer):
    body = deepcopy(validated_body)
    turbo = body.get('turbo', False)
    max_days, table, date_align, config_sample, force_final, max_group_ids_exclude = state.get_configs([
        ('max_days', None),
        ('clickhouse_table', settings.CLICKHOUSE_TABLE),
        ('date_align_seconds', 1),
        ('sample', 1),
        # 1: always use FINAL, 0: never use FINAL, undefined/None: use project setting.
        ('force_final', 0 if turbo else None),
        ('max_group_ids_exclude', settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE),
    ])
    stats = {}
    to_date = util.parse_datetime(body['to_date'], date_align)
    from_date = util.parse_datetime(body['from_date'], date_align)
    assert from_date <= to_date

    if max_days is not None and (to_date - from_date).days > max_days:
        from_date = to_date - timedelta(days=max_days)

    where_conditions = body.get('conditions', [])
    where_conditions.extend([
        ('timestamp', '>=', from_date),
        ('timestamp', '<', to_date),
        ('deleted', '=', 0),
    ])
    # NOTE: we rely entirely on the schema to make sure that regular snuba
    # queries are required to send a project_id filter. Some other special
    # internal query types do not require a project_id filter.
    project_ids = util.to_list(body['project'])
    if project_ids:
        where_conditions.append(('project_id', 'IN', project_ids))

    having_conditions = body.get('having', [])

    aggregate_exprs = [
        util.column_expr(col, body, alias, agg)
        for (agg, col, alias) in body['aggregations']
    ]
    groupby = util.to_list(body['groupby'])
    group_exprs = [util.column_expr(gb, body) for gb in groupby]
    selected_cols = [
        util.column_expr(util.tuplify(colname), body)
        for colname in body.get('selected_columns', [])
    ]
    select_exprs = group_exprs + aggregate_exprs + selected_cols
    select_clause = u'SELECT {}'.format(', '.join(select_exprs))

    from_clause = u'FROM {}'.format(table)

    # For now, we only need FINAL if:
    #   1. The project has been marked as needing FINAL (in redis) because of
    #      recent replacements (and it affects too many groups for us just to
    #      exclude those groups from the query)
    #   OR
    #   2. the force_final setting is 1
    needs_final, exclude_group_ids = get_projects_query_flags(project_ids)
    if len(exclude_group_ids) > max_group_ids_exclude:
        # Cap the number of groups to exclude by query and flip to using FINAL if necessary
        needs_final = True
        exclude_group_ids = []

    used_final = False
    if force_final == 1 or (force_final is None and needs_final):
        from_clause = u'{} FINAL'.format(from_clause)
        used_final = True
    elif exclude_group_ids:
        where_conditions.append(('group_id', 'NOT IN', exclude_group_ids))

    sample = body.get('sample', settings.TURBO_SAMPLE_RATE if turbo else config_sample)
    if sample != 1:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample)

    joins = []
    if 'arrayjoin' in body:
        joins.append(u'ARRAY JOIN {}'.format(body['arrayjoin']))
    join_clause = ' '.join(joins)

    where_clause = ''
    if where_conditions:
        where_conditions = list(set(util.tuplify(where_conditions)))
        where_clause = u'WHERE {}'.format(util.conditions_expr(where_conditions, body))

    prewhere_conditions = []
    if settings.PREWHERE_KEYS:
        # Add any condition to PREWHERE if:
        # - It is a single top-level condition (not OR-nested), and
        # - Any of its referenced columns are in PREWHERE_KEYS
        prewhere_candidates = [
            (util.columns_in_expr(cond[0]), cond)
            for cond in where_conditions
            if util.is_condition(cond) and any(
                col in settings.PREWHERE_KEYS
                for col in util.columns_in_expr(cond[0])
            )
        ]
        # Use the condition that has the highest priority (based on the
        # position of its columns in the PREWHERE_KEYS list)
        prewhere_candidates = sorted([
            (min(settings.PREWHERE_KEYS.index(col)
                 for col in cols if col in settings.PREWHERE_KEYS), cond)
            for cols, cond in prewhere_candidates
        ])
        if prewhere_candidates:
            prewhere_conditions = [
                cond for _, cond in prewhere_candidates
            ][:settings.MAX_PREWHERE_CONDITIONS]

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(
            util.conditions_expr(prewhere_conditions, body))

    having_clause = ''
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(
            util.conditions_expr(having_conditions, body))

    group_clause = ', '.join(util.column_expr(gb, body) for gb in groupby)
    if group_clause:
        if body.get('totals', False):
            group_clause = 'GROUP BY ({}) WITH TOTALS'.format(group_clause)
        else:
            group_clause = 'GROUP BY ({})'.format(group_clause)

    order_clause = ''
    if body.get('orderby'):
        orderby = [
            util.column_expr(util.tuplify(ob), body)
            for ob in util.to_list(body['orderby'])
        ]
        orderby = [
            u'{} {}'.format(ob.lstrip('-'), 'DESC' if ob.startswith('-') else 'ASC')
            for ob in orderby
        ]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if 'limitby' in body:
        limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

    limit_clause = ''
    if 'limit' in body:
        limit_clause = 'LIMIT {}, {}'.format(body.get('offset', 0), body['limit'])

    sql = ' '.join([c for c in [
        select_clause,
        from_clause,
        join_clause,
        prewhere_clause,
        where_clause,
        group_clause,
        having_clause,
        order_clause,
        limitby_clause,
        limit_clause,
    ] if c])

    timer.mark('prepare_query')

    stats.update({
        'clickhouse_table': table,
        'final': used_final,
        'referrer': request.referrer,
        'num_days': (to_date - from_date).days,
        'num_projects': len(project_ids),
        'sample': sample,
    })

    return util.raw_query(validated_body, sql, clickhouse_ro, timer, stats)
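# A self-contained sketch of the PREWHERE selection above, simplified to flat
# (column, op, literal) conditions on a single column (the real code uses
# util.columns_in_expr to handle compound expressions). PREWHERE_KEYS and
# MAX_PREWHERE_CONDITIONS are illustrative values, not the production settings.
PREWHERE_KEYS = ['project_id', 'timestamp']
MAX_PREWHERE_CONDITIONS = 1

def pick_prewhere(where_conditions):
    # Keep conditions whose left-hand side is a PREWHERE key, tagged with the
    # key's position in PREWHERE_KEYS (lower index = higher priority).
    candidates = [
        (PREWHERE_KEYS.index(cond[0]), cond)
        for cond in where_conditions
        if isinstance(cond[0], str) and cond[0] in PREWHERE_KEYS
    ]
    candidates.sort(key=lambda pair: pair[0])
    return [cond for _, cond in candidates][:MAX_PREWHERE_CONDITIONS]

# pick_prewhere([('timestamp', '>=', '2020-01-01'), ('project_id', 'IN', [1])])
# -> [('project_id', 'IN', [1])], since project_id outranks timestamp.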