def test_edit_query():
    query = Query({
        "selected_columns": ["c1", "c2", "c3"],
        "conditions": [["c1", "=", "a"]],
        "groupby": ["project_id"],
        "aggregations": [["count()", "", "count"]],
        "orderby": "event_id",
        "sample": 10,
        "limit": 100,
        "offset": 50,
    })

    query.set_selected_columns(["c4"])
    assert query.get_selected_columns() == ["c4"]

    query.set_aggregations([["different_agg()", "", "something"]])
    assert query.get_aggregations() == [["different_agg()", "", "something"]]

    query.add_groupby(["more", "more2"])
    assert query.get_groupby() == ["project_id", "more", "more2"]

    query.add_conditions([["c5", "=", "9"]])
    assert query.get_conditions() == [
        ["c1", "=", "a"],
        ["c5", "=", "9"],
    ]

    query.set_conditions([["c6", "=", "10"]])
    assert query.get_conditions() == [
        ["c6", "=", "10"],
    ]

class TestProjectExtensionWithGroups(BaseTest):
    def setup_method(self, test_method):
        super().setup_method(test_method)
        raw_data = {'project': 2}
        self.extension = ProjectExtension(
            processor=ProjectWithGroupsProcessor()
        )
        self.valid_data = validate_jsonschema(raw_data, self.extension.get_schema())
        self.query = Query({
            "conditions": []
        })

    def test_with_turbo(self):
        request_settings = RequestSettings(turbo=True, consistent=False, debug=False)

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]

    def test_without_turbo_with_projects_needing_final(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
        replacer.set_project_needs_final(2)

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert self.query.get_final()

    def test_without_turbo_without_projects_needing_final(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert not self.query.get_final()

    def test_when_there_are_not_many_groups_to_exclude(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
        state.set_config('max_group_ids_exclude', 5)
        replacer.set_project_exclude_groups(2, [100, 101, 102])

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        expected = [
            ('project_id', 'IN', [2]),
            (['assumeNotNull', ['group_id']], 'NOT IN', [100, 101, 102])
        ]
        assert self.query.get_conditions() == expected
        assert not self.query.get_final()

    def test_when_there_are_too_many_groups_to_exclude(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
        state.set_config('max_group_ids_exclude', 2)
        replacer.set_project_exclude_groups(2, [100, 101, 102])

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert self.query.get_final()

def all_referenced_columns(query: Query):
    """
    Return the set of all columns that are used by a query.
    """
    col_exprs: MutableSequence[Any] = []

    if query.get_arrayjoin():
        col_exprs.extend(to_list(query.get_arrayjoin()))
    if query.get_groupby():
        col_exprs.extend(to_list(query.get_groupby()))
    if query.get_orderby():
        col_exprs.extend(to_list(query.get_orderby()))
    if query.get_selected_columns():
        col_exprs.extend(to_list(query.get_selected_columns()))

    # Conditions need flattening as they can be nested as AND/OR
    if query.get_conditions():
        flat_conditions = list(
            chain(*[[c] if is_condition(c) else c for c in query.get_conditions()])
        )
        col_exprs.extend([c[0] for c in flat_conditions])

    if query.get_aggregations():
        col_exprs.extend([a[1] for a in query.get_aggregations()])

    # Return the set of all columns referenced in any expression
    return set(chain(*[columns_in_expr(ex) for ex in col_exprs]))

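The flattening step above is the one non-obvious part: a conditions list mixes simple conditions with nested OR groups, so each element is either a single `[lhs, op, rhs]` triple or a list of such triples. A minimal, self-contained sketch of that normalization, using a stand-in `is_condition` check rather than the real `util.is_condition` helper:

from itertools import chain

# Stand-in for util.is_condition (an illustrative assumption, not the real
# helper): a simple condition is a [lhs, op, rhs] triple whose operator is a
# string; a nested OR group is a list of such triples.
def is_condition(cond) -> bool:
    return len(cond) == 3 and isinstance(cond[1], str)

conditions = [
    ["c1", "=", "a"],                      # simple condition
    [["c2", "=", "b"], ["c3", "=", "c"]],  # nested OR group
]

# Wrap simple conditions in a single-element list so chain() flattens both
# shapes into one level.
flat = list(chain(*[[c] if is_condition(c) else c for c in conditions]))
assert flat == [["c1", "=", "a"], ["c2", "=", "b"], ["c3", "=", "c"]]
assert [c[0] for c in flat] == ["c1", "c2", "c3"]
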
def detect_table(
    query: Query,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
) -> str:
    """
    Given a query, we attempt to guess whether it is better to fetch data from
    the "events" or "transactions" storage. This is going to be wrong in some
    cases.
    """
    # First check for a top level condition that matches either
    # type = error or type = transaction.
    conditions = query.get_conditions()
    if conditions:
        for condition in conditions:
            if is_condition(condition):
                if tuple(condition) == ("type", "=", "error"):
                    return EVENTS
                elif tuple(condition) == ("type", "=", "transaction"):
                    return TRANSACTIONS

    # Check for any conditions that reference a table specific field
    condition_columns = query.get_columns_referenced_in_conditions()
    if any(events_only_columns.get(col) for col in condition_columns):
        return EVENTS
    if any(transactions_only_columns.get(col) for col in condition_columns):
        return TRANSACTIONS

    # Check for any other references to a table specific field
    all_referenced_columns = query.get_all_referenced_columns()
    if any(events_only_columns.get(col) for col in all_referenced_columns):
        return EVENTS
    if any(transactions_only_columns.get(col) for col in all_referenced_columns):
        return TRANSACTIONS

    # Use events by default
    return EVENTS

def test_full_query():
    query = Query(
        {
            "selected_columns": ["c1", "c2", "c3"],
            "conditions": [["c1", "=", "a"]],
            "arrayjoin": "tags",
            "having": [["c4", "=", "c"]],
            "groupby": ["project_id"],
            "aggregations": [["count()", "", "count"]],
            "orderby": "event_id",
            "limitby": (100, "environment"),
            "sample": 10,
            "limit": 100,
            "offset": 50,
            "totals": True,
            "granularity": 60,
        },
        TableSource("my_table", ColumnSet([])),
    )

    assert query.get_selected_columns() == ["c1", "c2", "c3"]
    assert query.get_aggregations() == [["count()", "", "count"]]
    assert query.get_groupby() == ["project_id"]
    assert query.get_conditions() == [["c1", "=", "a"]]
    assert query.get_arrayjoin() == "tags"
    assert query.get_having() == [["c4", "=", "c"]]
    assert query.get_orderby() == "event_id"
    assert query.get_limitby() == (100, "environment")
    assert query.get_sample() == 10
    assert query.get_limit() == 100
    assert query.get_offset() == 50
    assert query.has_totals() is True
    assert query.get_granularity() == 60
    assert query.get_data_source().format_from() == "my_table"

def test_query_extension_processing(
    raw_data: dict,
    expected_conditions: Sequence[Condition],
    expected_granularity: int,
):
    state.set_config('max_days', 1)
    extension = TimeSeriesExtension(
        default_granularity=60,
        default_window=datetime.timedelta(days=5),
        timestamp_column='timestamp',
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query(
        {"conditions": []},
        TableSource("my_table", ColumnSet([])),
    )

    request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
    extension.get_processor().process_query(query, valid_data, request_settings)
    assert query.get_conditions() == expected_conditions
    assert query.get_granularity() == expected_granularity

def test_edit_query():
    query = Query(
        {
            "selected_columns": ["c1", "c2", "c3"],
            "conditions": [["c1", "=", "a"]],
            "arrayjoin": "tags",
            "having": [["c4", "=", "c"]],
            "groupby": ["project_id"],
            "aggregations": [["count()", "", "count"]],
            "orderby": "event_id",
            "limitby": (100, "environment"),
            "sample": 10,
            "limit": 100,
            "offset": 50,
            "totals": True,
        },
        TableSource("my_table", ColumnSet([])),
    )

    query.set_selected_columns(["c4"])
    assert query.get_selected_columns() == ["c4"]

    query.set_aggregations([["different_agg()", "", "something"]])
    assert query.get_aggregations() == [["different_agg()", "", "something"]]

    query.add_groupby(["more", "more2"])
    assert query.get_groupby() == ["project_id", "more", "more2"]

    query.add_conditions([["c5", "=", "9"]])
    assert query.get_conditions() == [
        ["c1", "=", "a"],
        ["c5", "=", "9"],
    ]

    query.set_conditions([["c6", "=", "10"]])
    assert query.get_conditions() == [
        ["c6", "=", "10"],
    ]

    query.set_arrayjoin("not_tags")
    assert query.get_arrayjoin() == "not_tags"

    query.set_granularity(7200)
    assert query.get_granularity() == 7200

    query.set_prewhere([["pc6", "=", "10"]])
    assert query.get_prewhere() == [["pc6", "=", "10"]]

def test_prewhere(query_body, keys, new_conditions, prewhere_conditions) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    query = Query(query_body, TableSource("my_table", ColumnSet([]), None, keys))
    request_settings = HTTPRequestSettings()
    processor = PrewhereProcessor()
    processor.process_query(query, request_settings)

    assert query.get_conditions() == new_conditions
    assert query.get_prewhere() == prewhere_conditions

def test_empty_query():
    query = Query({})

    assert query.get_selected_columns() is None
    assert query.get_aggregations() is None
    assert query.get_groupby() is None
    assert query.get_conditions() is None
    assert query.get_orderby() is None
    assert query.get_sample() is None
    assert query.get_limit() == 0
    assert query.get_offset() == 0

def test_project_extension_query_processing(raw_data: dict, expected_conditions: Sequence[Condition]):
    extension = ProjectExtension(
        processor=ProjectExtensionProcessor()
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({
        "conditions": []
    })

    request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
    extension.get_processor().process_query(query, valid_data, request_settings)

    assert query.get_conditions() == expected_conditions

def test_query_extension_processing(raw_data: dict, expected_conditions: Sequence[Condition]):
    state.set_config('max_days', 1)
    extension = TimeSeriesExtension(
        default_granularity=3600,
        default_window=datetime.timedelta(days=5),
        timestamp_column='timestamp',
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []})

    extension.get_processor().process_query(query, valid_data)
    assert query.get_conditions() == expected_conditions

def test_organization_extension_query_processing_happy_path():
    extension = OrganizationExtension()
    raw_data = {"organization": 2}

    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []})

    request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
    extension.get_processor().process_query(query, valid_data, request_settings)

    assert query.get_conditions() == [("org_id", "=", 2)]

def test_empty_query():
    query = Query({})

    assert query.get_selected_columns() is None
    assert query.get_aggregations() is None
    assert query.get_groupby() is None
    assert query.get_conditions() is None
    assert query.get_arrayjoin() is None
    assert query.get_having() == []
    assert query.get_orderby() is None
    assert query.get_limitby() is None
    assert query.get_sample() is None
    assert query.get_limit() is None
    assert query.get_offset() == 0
    assert query.has_totals() is False

def test_project_extension_query_processing(
    raw_data: dict,
    expected_conditions: Sequence[Condition],
    expected_ast_conditions: Expression,
):
    extension = ProjectExtension(
        processor=ProjectExtensionProcessor(project_column="project_id")
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []}, TableSource("my_table", ColumnSet([])))
    request_settings = HTTPRequestSettings()

    extension.get_processor().process_query(query, valid_data, request_settings)

    assert query.get_conditions() == expected_conditions
    assert query.get_condition_from_ast() == expected_ast_conditions

def test_empty_query():
    query = Query({}, TableSource("my_table", ColumnSet([])))

    assert query.get_selected_columns() is None
    assert query.get_aggregations() is None
    assert query.get_groupby() is None
    assert query.get_conditions() is None
    assert query.get_arrayjoin() is None
    assert query.get_having() == []
    assert query.get_orderby() is None
    assert query.get_limitby() is None
    assert query.get_sample() is None
    assert query.get_limit() is None
    assert query.get_offset() == 0
    assert query.has_totals() is False
    assert query.get_data_source().format_from() == "my_table"

def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    max_prewhere_conditions: int = (
        self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
    )
    prewhere_keys = query.get_data_source().get_prewhere_candidates()
    if not prewhere_keys:
        return
    prewhere_conditions: Sequence[Condition] = []

    # Add any condition to PREWHERE if:
    # - It is a single top-level condition (not OR-nested), and
    # - Any of its referenced columns are in prewhere_keys
    conditions = query.get_conditions()
    if not conditions:
        return
    prewhere_candidates = [
        (util.columns_in_expr(cond[0]), cond)
        for cond in conditions
        if util.is_condition(cond)
        and any(col in prewhere_keys for col in util.columns_in_expr(cond[0]))
    ]

    # Use the condition that has the highest priority (based on the
    # position of its columns in the prewhere keys list)
    prewhere_candidates = sorted(
        [
            (
                min(
                    prewhere_keys.index(col)
                    for col in cols
                    if col in prewhere_keys
                ),
                cond,
            )
            for cols, cond in prewhere_candidates
        ],
        key=lambda priority_and_col: priority_and_col[0],
    )
    if prewhere_candidates:
        prewhere_conditions = [cond for _, cond in prewhere_candidates][
            :max_prewhere_conditions
        ]
        query.set_conditions(
            list(filter(lambda cond: cond not in prewhere_conditions, conditions))
        )
    query.set_prewhere(prewhere_conditions)

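The ranking step keys each candidate condition by the smallest index any of its columns occupies in prewhere_keys, so conditions on earlier (higher-priority) keys sort first. A self-contained sketch of just that ranking, with made-up key and column lists:

# Hypothetical data: prewhere candidate keys in priority order, and the
# columns referenced by each top-level condition (condition bodies elided).
prewhere_keys = ["project_id", "event_id", "group_id"]
candidates = [
    (["group_id"], "cond_on_group"),
    (["event_id", "other_col"], "cond_on_event"),  # other_col is not a key
    (["project_id"], "cond_on_project"),
]

ranked = sorted(
    (
        (min(prewhere_keys.index(col) for col in cols if col in prewhere_keys), cond)
        for cols, cond in candidates
    ),
    key=lambda priority_and_cond: priority_and_cond[0],
)

# Highest-priority conditions come first; a MAX_PREWHERE_CONDITIONS cap of 2
# would then keep only the project_id and event_id conditions.
assert [cond for _, cond in ranked] == [
    "cond_on_project", "cond_on_event", "cond_on_group",
]
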
def test_full_query():
    query = Query({
        "selected_columns": ["c1", "c2", "c3"],
        "conditions": [["c1", "=", "a"]],
        "groupby": ["project_id"],
        "aggregations": [["count()", "", "count"]],
        "orderby": "event_id",
        "sample": 10,
        "limit": 100,
        "offset": 50,
    })

    assert query.get_selected_columns() == ["c1", "c2", "c3"]
    assert query.get_aggregations() == [["count()", "", "count"]]
    assert query.get_groupby() == ["project_id"]
    assert query.get_conditions() == [["c1", "=", "a"]]
    assert query.get_orderby() == "event_id"
    assert query.get_sample() == 10
    assert query.get_limit() == 100
    assert query.get_offset() == 50

def test_query_extension_processing(
    raw_data: dict,
    expected_conditions: Sequence[Condition],
    expected_ast_condition: Expression,
    expected_granularity: int,
):
    state.set_config("max_days", 1)
    extension = TimeSeriesExtension(
        default_granularity=60,
        default_window=timedelta(days=5),
        timestamp_column="timestamp",
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []}, TableSource("my_table", ColumnSet([])))
    request_settings = HTTPRequestSettings()

    extension.get_processor().process_query(query, valid_data, request_settings)
    assert query.get_conditions() == expected_conditions
    assert query.get_condition_from_ast() == expected_ast_condition
    assert query.get_granularity() == expected_granularity

def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    conditions = query.get_conditions()
    if not conditions:
        return

    # Enable the processor only if we have enough data in the flattened
    # columns, which have been populated since BEGINNING_OF_TIME. If the
    # query starts earlier than that, we do not apply the optimization.
    if self.__beginning_of_time:
        apply_optimization = False
        for condition in conditions:
            if (
                is_condition(condition)
                and isinstance(condition[0], str)
                and condition[0] in self.__timestamp_cols
                and condition[1] in (">=", ">")
                and isinstance(condition[2], str)
            ):
                try:
                    start_ts = parse_datetime(condition[2])
                    if (start_ts - self.__beginning_of_time).total_seconds() > 0:
                        apply_optimization = True
                except Exception:
                    # We should not get here: it means the from timestamp is
                    # malformed. Returning is just for safety.
                    logger.error(
                        "Cannot parse start date for NestedFieldOptimizer: %r",
                        condition,
                    )
                    return
        if not apply_optimization:
            return

    # Do not use flattened tags if tags are being unpacked anyway. In that case
    # using flattened tags only implies loading an additional column, thus
    # making the query heavier and slower.
    if self.__has_tags(query.get_arrayjoin_from_ast()):
        return
    if query.get_groupby_from_ast():
        for expression in query.get_groupby_from_ast():
            if self.__has_tags(expression):
                return
    if self.__has_tags(query.get_having_from_ast()):
        return
    if query.get_orderby_from_ast():
        for orderby in query.get_orderby_from_ast():
            if self.__has_tags(orderby.expression):
                return

    new_conditions = []
    positive_like_expression: List[str] = []
    negative_like_expression: List[str] = []

    for c in conditions:
        keyvalue = self.__is_optimizable(c, self.__nested_col)
        if not keyvalue:
            new_conditions.append(c)
        else:
            expression = f"{escape_field(keyvalue.nested_col_key)}={escape_field(keyvalue.value)}"
            if keyvalue.operand == Operand.EQ:
                positive_like_expression.append(expression)
            else:
                negative_like_expression.append(expression)

    if positive_like_expression:
        # Positive conditions "=" are all merged together in one LIKE expression
        positive_like_expression = sorted(positive_like_expression)
        like_formatted = f"%|{'|%|'.join(positive_like_expression)}|%"
        new_conditions.append([self.__flattened_col, "LIKE", like_formatted])

    for expression in negative_like_expression:
        # Negative conditions "!=" cannot be merged together. We can still
        # transform them into NOT LIKE statements, but each condition has to
        # be one statement.
        not_like_formatted = f"%|{expression}|%"
        new_conditions.append([self.__flattened_col, "NOT LIKE", not_like_formatted])

    query.set_conditions(new_conditions)

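A self-contained sketch of the pattern construction alone, assuming a flattened column that stores tags as `|key=value|` segments; the column name `tags_flattened` is made up for illustration and the key/value pairs are taken as already escaped:

# Hypothetical key=value pairs extracted from "=" and "!=" tag conditions.
positive = ["environment=prod", "release=1.0"]  # merged into one LIKE
negative = ["handled=yes"]                      # one NOT LIKE per condition

new_conditions = []

if positive:
    # "=" conditions share a single LIKE pattern over the flattened column
    positive = sorted(positive)
    like_formatted = f"%|{'|%|'.join(positive)}|%"
    new_conditions.append(["tags_flattened", "LIKE", like_formatted])

for expression in negative:
    # "!=" conditions each become their own NOT LIKE statement
    new_conditions.append(["tags_flattened", "NOT LIKE", f"%|{expression}|%"])

assert new_conditions == [
    ["tags_flattened", "LIKE", "%|environment=prod|%|release=1.0|%"],
    ["tags_flattened", "NOT LIKE", "%|handled=yes|%"],
]
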
def __init__(
    self,
    dataset: Dataset,
    query: Query,
    settings: RequestSettings,
) -> None:
    parsing_context = ParsingContext()

    aggregate_exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        column_expr(dataset, gb, query, parsing_context) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        column_expr(dataset, util.tuplify(colname), query, parsing_context)
        for colname in column_names
    ]
    select_clause = u"SELECT {}".format(
        ", ".join(group_exprs + aggregate_exprs + selected_cols)
    )

    from_clause = u"FROM {}".format(query.get_data_source().format_from())
    if query.get_final():
        from_clause = u"{} FINAL".format(from_clause)

    if not query.get_data_source().supports_sample():
        sample_rate = None
    else:
        if query.get_sample():
            sample_rate = query.get_sample()
        elif settings.get_turbo():
            sample_rate = snuba_settings.TURBO_SAMPLE_RATE
        else:
            sample_rate = None
    if sample_rate:
        from_clause = u"{} SAMPLE {}".format(from_clause, sample_rate)

    join_clause = ""
    if query.get_arrayjoin():
        join_clause = u"ARRAY JOIN {}".format(query.get_arrayjoin())

    where_clause = ""
    if query.get_conditions():
        where_clause = u"WHERE {}".format(
            conditions_expr(dataset, query.get_conditions(), query, parsing_context)
        )

    prewhere_clause = ""
    if query.get_prewhere():
        prewhere_clause = u"PREWHERE {}".format(
            conditions_expr(dataset, query.get_prewhere(), query, parsing_context)
        )

    group_clause = ""
    if groupby:
        group_clause = "GROUP BY ({})".format(
            ", ".join(
                column_expr(dataset, gb, query, parsing_context) for gb in groupby
            )
        )
        if query.has_totals():
            group_clause = "{} WITH TOTALS".format(group_clause)

    having_clause = ""
    having_conditions = query.get_having()
    if having_conditions:
        assert groupby, "found HAVING clause with no GROUP BY"
        having_clause = u"HAVING {}".format(
            conditions_expr(dataset, having_conditions, query, parsing_context)
        )

    order_clause = ""
    if query.get_orderby():
        orderby = [
            column_expr(dataset, util.tuplify(ob), query, parsing_context)
            for ob in util.to_list(query.get_orderby())
        ]
        orderby = [
            u"{} {}".format(ob.lstrip("-"), "DESC" if ob.startswith("-") else "ASC")
            for ob in orderby
        ]
        order_clause = u"ORDER BY {}".format(", ".join(orderby))

    limitby_clause = ""
    if query.get_limitby() is not None:
        limitby_clause = "LIMIT {} BY {}".format(*query.get_limitby())

    limit_clause = ""
    if query.get_limit() is not None:
        limit_clause = "LIMIT {}, {}".format(query.get_offset(), query.get_limit())

    self.__formatted_query = " ".join(
        [
            c
            for c in [
                select_clause,
                from_clause,
                join_clause,
                prewhere_clause,
                where_clause,
                group_clause,
                having_clause,
                order_clause,
                limitby_clause,
                limit_clause,
            ]
            if c
        ]
    )

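One detail worth calling out from the ORDER BY handling: a leading "-" on an orderby column is the descending marker, so the formatter strips it and appends DESC or ASC. A standalone sketch of that convention; `format_orderby` is a hypothetical helper wrapping the expression above:

def format_orderby(columns):
    # "-col" means descending; anything else ascends. lstrip("-") removes the
    # marker before the direction keyword is appended.
    return ", ".join(
        "{} {}".format(ob.lstrip("-"), "DESC" if ob.startswith("-") else "ASC")
        for ob in columns
    )

assert format_orderby(["-timestamp", "event_id"]) == "timestamp DESC, event_id ASC"
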
def __init__(
    self,
    dataset: Dataset,
    query: Query,
    settings: RequestSettings,
    prewhere_conditions: Sequence[str],
) -> None:
    parsing_context = ParsingContext()

    aggregate_exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        column_expr(dataset, gb, query, parsing_context) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        column_expr(dataset, util.tuplify(colname), query, parsing_context)
        for colname in column_names
    ]
    select_clause = u'SELECT {}'.format(
        ', '.join(group_exprs + aggregate_exprs + selected_cols)
    )

    from_clause = u'FROM {}'.format(query.get_data_source().format_from())
    if query.get_final():
        from_clause = u'{} FINAL'.format(from_clause)

    if query.get_sample():
        sample_rate = query.get_sample()
    elif settings.get_turbo():
        sample_rate = snuba_settings.TURBO_SAMPLE_RATE
    else:
        sample_rate = None
    if sample_rate:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample_rate)

    join_clause = ''
    if query.get_arrayjoin():
        join_clause = u'ARRAY JOIN {}'.format(query.get_arrayjoin())

    where_clause = ''
    if query.get_conditions():
        where_clause = u'WHERE {}'.format(
            conditions_expr(dataset, query.get_conditions(), query, parsing_context)
        )

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(
            conditions_expr(dataset, prewhere_conditions, query, parsing_context)
        )

    group_clause = ''
    if groupby:
        group_clause = 'GROUP BY ({})'.format(
            ', '.join(
                column_expr(dataset, gb, query, parsing_context) for gb in groupby
            )
        )
        if query.has_totals():
            group_clause = '{} WITH TOTALS'.format(group_clause)

    having_clause = ''
    having_conditions = query.get_having()
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(
            conditions_expr(dataset, having_conditions, query, parsing_context)
        )

    order_clause = ''
    if query.get_orderby():
        orderby = [
            column_expr(dataset, util.tuplify(ob), query, parsing_context)
            for ob in util.to_list(query.get_orderby())
        ]
        orderby = [
            u'{} {}'.format(ob.lstrip('-'), 'DESC' if ob.startswith('-') else 'ASC')
            for ob in orderby
        ]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if query.get_limitby() is not None:
        limitby_clause = 'LIMIT {} BY {}'.format(*query.get_limitby())

    limit_clause = ''
    if query.get_limit() is not None:
        limit_clause = 'LIMIT {}, {}'.format(query.get_offset(), query.get_limit())

    self.__formatted_query = ' '.join(
        [
            c
            for c in [
                select_clause,
                from_clause,
                join_clause,
                prewhere_clause,
                where_clause,
                group_clause,
                having_clause,
                order_clause,
                limitby_clause,
                limit_clause,
            ]
            if c
        ]
    )

class TestProjectExtensionWithGroups(BaseTest):
    def setup_method(self, test_method):
        super().setup_method(test_method)
        raw_data = {"project": 2}
        self.extension = ProjectExtension(
            processor=ProjectWithGroupsProcessor(project_column="project_id")
        )
        self.valid_data = validate_jsonschema(raw_data, self.extension.get_schema())
        self.query = Query({"conditions": []}, TableSource("my_table", ColumnSet([])))

    def test_with_turbo(self):
        request_settings = HTTPRequestSettings(turbo=True)

        self.extension.get_processor().process_query(
            self.query, self.valid_data, request_settings
        )

        assert self.query.get_conditions() == [("project_id", "IN", [2])]
        assert self.query.get_condition_from_ast() == build_in("project_id", [2])

    def test_without_turbo_with_projects_needing_final(self):
        request_settings = HTTPRequestSettings()
        replacer.set_project_needs_final(2)

        self.extension.get_processor().process_query(
            self.query, self.valid_data, request_settings
        )

        assert self.query.get_conditions() == [("project_id", "IN", [2])]
        assert self.query.get_condition_from_ast() == build_in("project_id", [2])
        assert self.query.get_final()

    def test_without_turbo_without_projects_needing_final(self):
        request_settings = HTTPRequestSettings()

        self.extension.get_processor().process_query(
            self.query, self.valid_data, request_settings
        )

        assert self.query.get_conditions() == [("project_id", "IN", [2])]
        assert self.query.get_condition_from_ast() == build_in("project_id", [2])
        assert not self.query.get_final()

    def test_when_there_are_not_many_groups_to_exclude(self):
        request_settings = HTTPRequestSettings()
        state.set_config("max_group_ids_exclude", 5)
        replacer.set_project_exclude_groups(2, [100, 101, 102])

        self.extension.get_processor().process_query(
            self.query, self.valid_data, request_settings
        )

        expected = [
            ("project_id", "IN", [2]),
            (["assumeNotNull", ["group_id"]], "NOT IN", [100, 101, 102]),
        ]
        assert self.query.get_conditions() == expected
        assert self.query.get_condition_from_ast() == FunctionCall(
            None,
            BooleanFunctions.AND,
            (
                FunctionCall(
                    None,
                    "notIn",
                    (
                        FunctionCall(
                            None, "assumeNotNull", (Column(None, "group_id", None),)
                        ),
                        FunctionCall(
                            None,
                            "tuple",
                            (
                                Literal(None, 100),
                                Literal(None, 101),
                                Literal(None, 102),
                            ),
                        ),
                    ),
                ),
                build_in("project_id", [2]),
            ),
        )
        assert not self.query.get_final()

    def test_when_there_are_too_many_groups_to_exclude(self):
        request_settings = HTTPRequestSettings()
        state.set_config("max_group_ids_exclude", 2)
        replacer.set_project_exclude_groups(2, [100, 101, 102])

        self.extension.get_processor().process_query(
            self.query, self.valid_data, request_settings
        )

        assert self.query.get_conditions() == [("project_id", "IN", [2])]
        assert self.query.get_condition_from_ast() == build_in("project_id", [2])
        assert self.query.get_final()