def test_alias_in_alias(self):
    body = {'groupby': ['tags_key', 'tags_value']}
    assert column_expr('tags_key', body) == (
        '(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) '
        'AS all_tags))[1] AS tags_key)')

    # If we want to use `tags_key` again, make sure we use the
    # already-created alias verbatim
    assert column_expr('tags_key', body) == 'tags_key'

    # If we also want to use `tags_value`, make sure that we use
    # the `all_tags` alias instead of re-expanding the tags arrayJoin
    assert column_expr('tags_value', body) == '((all_tags)[2] AS tags_value)'
def test_duplicate_expression_alias(self):
    body = {
        'aggregations': [
            ['topK(3)', 'logger', 'dupe_alias'],
            ['uniq', 'environment', 'dupe_alias'],
        ]
    }
    # In the case where 2 different expressions are aliased
    # to the same thing, one ends up overwriting the other.
    # This may not be ideal as it may mask bugs in query conditions
    exprs = [
        column_expr(col, body, alias, agg)
        for (agg, col, alias) in body['aggregations']
    ]
    assert exprs == ['(topK(3)(logger) AS dupe_alias)', 'dupe_alias']
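# A minimal standalone sketch (hypothetical helper, not the real column_expr)
# of the alias-cache behaviour the test above relies on: once an expression has
# been emitted under an alias, later references to that alias return the bare
# alias instead of re-expanding. This is what lets a second, different
# expression silently collapse into the first one's alias.
def _aliased_expr_sketch(expr, alias, seen_aliases):
    if alias in seen_aliases:
        return alias
    seen_aliases.add(alias)
    return '({} AS {})'.format(expr, alias)

seen = set()
assert _aliased_expr_sketch('topK(3)(logger)', 'dupe_alias', seen) == '(topK(3)(logger) AS dupe_alias)'
# The second, different expression collapses to the existing alias:
assert _aliased_expr_sketch('uniq(environment)', 'dupe_alias', seen) == 'dupe_alias'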
def test_column_expr(self):
    body = {'granularity': 86400}
    # Single tag expression
    assert column_expr('tags[foo]', body.copy()) ==\
        "(tags.value[indexOf(tags.key, \'foo\')] AS `tags[foo]`)"

    # Promoted tag expression / no translation
    assert column_expr('tags[server_name]', body.copy()) ==\
        "(server_name AS `tags[server_name]`)"

    # Promoted tag expression / with translation
    assert column_expr('tags[app.device]', body.copy()) ==\
        "(app_device AS `tags[app.device]`)"

    # All tag keys expression
    assert column_expr(
        'tags_key', body.copy()) == ('(arrayJoin(tags.key) AS tags_key)')

    # If we are going to use both tags_key and tags_value, expand both
    tag_group_body = {'groupby': ['tags_key', 'tags_value']}
    assert column_expr('tags_key', tag_group_body) == (
        '(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) '
        'AS all_tags))[1] AS tags_key)')

    assert column_expr('time', body.copy()) ==\
        "(toDate(timestamp) AS time)"

    assert column_expr('col', body.copy(), aggregate='sum') ==\
        "(sum(col) AS col)"

    assert column_expr(None, body.copy(), alias='sum', aggregate='sum') ==\
        "sum"  # This should probably be an error as it's an aggregate with no column

    assert column_expr('col', body.copy(), alias='summation', aggregate='sum') ==\
        "(sum(col) AS summation)"

    # Special cases where count() doesn't need a column
    assert column_expr('', body.copy(), alias='count', aggregate='count()') ==\
        "(count() AS count)"

    assert column_expr('', body.copy(), alias='aggregate', aggregate='count()') ==\
        "(count() AS aggregate)"

    # Columns that need escaping
    assert column_expr('sentry:release', body.copy()) == '`sentry:release`'

    # Columns that start with a negative sign (used in orderby to signify
    # sort order) retain the '-' sign outside the escaping backticks (if any)
    assert column_expr('-timestamp', body.copy()) == '-timestamp'
    assert column_expr('-sentry:release', body.copy()) == '-`sentry:release`'

    # A 'column' that is actually a string literal
    assert column_expr('\'hello world\'', body.copy()) == '\'hello world\''

    # Complex expressions (function calls) involving both string and column arguments
    assert column_expr(tuplify(['concat', ['a', '\':\'', 'b']]), body.copy()) == 'concat(a, \':\', b)'

    group_id_body = body.copy()
    assert column_expr('issue', group_id_body) == '(group_id AS issue)'
def test_conditions_expr(self):
    conditions = [['a', '=', 1]]
    assert conditions_expr(conditions, {}) == 'a = 1'

    conditions = [[['a', '=', 1]]]
    assert conditions_expr(conditions, {}) == 'a = 1'

    conditions = [['a', '=', 1], ['b', '=', 2]]
    assert conditions_expr(conditions, {}) == 'a = 1 AND b = 2'

    conditions = [[['a', '=', 1], ['b', '=', 2]]]
    assert conditions_expr(conditions, {}) == '(a = 1 OR b = 2)'

    conditions = [[['a', '=', 1], ['b', '=', 2]], ['c', '=', 3]]
    assert conditions_expr(conditions, {}) == '(a = 1 OR b = 2) AND c = 3'

    conditions = [[['a', '=', 1], ['b', '=', 2]], [['c', '=', 3], ['d', '=', 4]]]
    assert conditions_expr(conditions, {}) == '(a = 1 OR b = 2) AND (c = 3 OR d = 4)'

    # Malformed condition input
    conditions = [[['a', '=', 1], []]]
    assert conditions_expr(conditions, {}) == 'a = 1'

    # Test column expansion
    conditions = [[['tags[foo]', '=', 1], ['b', '=', 2]]]
    expanded = column_expr('tags[foo]', {})
    assert conditions_expr(conditions, {}) == '({} = 1 OR b = 2)'.format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_body = {}
    conditions = [[['tags[foo]', '=', 1], ['b', '=', 2]]]
    column_expr('tags[foo]', reuse_body)  # Expand it once so the next time is aliased
    assert conditions_expr(conditions, reuse_body) == '(`tags[foo]` = 1 OR b = 2)'

    # Test special output format of LIKE
    conditions = [['primary_hash', 'LIKE', '%foo%']]
    assert conditions_expr(conditions, {}) == 'primary_hash LIKE \'%foo%\''

    conditions = tuplify(
        [[['notEmpty', ['arrayElement', ['exception_stacks.type', 1]]], '=', 1]])
    assert conditions_expr(
        conditions, {}) == 'notEmpty(arrayElement(exception_stacks.type, 1)) = 1'

    conditions = tuplify([[['notEmpty', ['tags[sentry:user]']], '=', 1]])
    assert conditions_expr(
        conditions, {}) == 'notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 1'

    conditions = tuplify([[['notEmpty', ['tags_key']], '=', 1]])
    assert conditions_expr(
        conditions, {}) == 'notEmpty((arrayJoin(tags.key) AS tags_key)) = 1'

    conditions = tuplify([
        [[['notEmpty', ['tags[sentry:environment]']], '=', 'dev'],
         [['notEmpty', ['tags[sentry:environment]']], '=', 'prod']],
        [[['notEmpty', ['tags[sentry:user]']], '=', 'joe'],
         [['notEmpty', ['tags[sentry:user]']], '=', 'bob']],
    ])
    assert conditions_expr(conditions, {}) == \
        """(notEmpty((tags.value[indexOf(tags.key, 'sentry:environment')] AS `tags[sentry:environment]`)) = 'dev' OR notEmpty(`tags[sentry:environment]`) = 'prod') AND (notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 'joe' OR notEmpty(`tags[sentry:user]`) = 'bob')"""

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [['exception_frames.filename', 'LIKE', '%foo%']]
    assert conditions_expr(
        conditions, {}
    ) == 'arrayExists(x -> assumeNotNull(x LIKE \'%foo%\'), exception_frames.filename)'

    # Test negative scalar condition on array column is expanded as an all() type iterator.
    conditions = [['exception_frames.filename', 'NOT LIKE', '%foo%']]
    assert conditions_expr(
        conditions, {}
    ) == 'arrayAll(x -> assumeNotNull(x NOT LIKE \'%foo%\'), exception_frames.filename)'
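# A sketch of the array-column expansion rule the last two asserts exercise,
# using a hypothetical helper (the real logic lives inside conditions_expr):
# a scalar operator applied to an Array(T) column is wrapped in a ClickHouse
# higher-order function so it matches per element — arrayExists for positive
# operators, arrayAll for negated ones.
def _expand_array_condition_sketch(column, op, literal):
    fn = 'arrayAll' if op.startswith('NOT ') else 'arrayExists'
    return "{}(x -> assumeNotNull(x {} '{}'), {})".format(fn, op, literal, column)

assert _expand_array_condition_sketch('exception_frames.filename', 'LIKE', '%foo%') == \
    "arrayExists(x -> assumeNotNull(x LIKE '%foo%'), exception_frames.filename)"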
def test_column_expr(self):
    body = {'granularity': 86400}
    # Single tag expression
    assert column_expr(self.dataset, 'tags[foo]', body.copy()) ==\
        "(tags.value[indexOf(tags.key, \'foo\')] AS `tags[foo]`)"

    # Promoted tag expression / no translation
    assert column_expr(self.dataset, 'tags[server_name]', body.copy()) ==\
        "(server_name AS `tags[server_name]`)"

    # Promoted tag expression / with translation
    assert column_expr(self.dataset, 'tags[app.device]', body.copy()) ==\
        "(app_device AS `tags[app.device]`)"

    # All tag keys expression
    assert column_expr(
        self.dataset, 'tags_key', body.copy()) == ('(arrayJoin(tags.key) AS tags_key)')

    # If we are going to use both tags_key and tags_value, expand both
    tag_group_body = {'groupby': ['tags_key', 'tags_value']}
    assert column_expr(self.dataset, 'tags_key', tag_group_body) == (
        '(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) '
        'AS all_tags))[1] AS tags_key)')

    assert column_expr(self.dataset, 'time', body.copy()) ==\
        "(toDate(timestamp) AS time)"

    assert column_expr(self.dataset, 'rtime', body.copy()) ==\
        "(toDate(received) AS rtime)"

    assert column_expr(self.dataset, 'col', body.copy(), aggregate='sum') ==\
        "(sum(col) AS col)"

    assert column_expr(self.dataset, 'col', body.copy(), alias='summation', aggregate='sum') ==\
        "(sum(col) AS summation)"

    # Special cases where count() doesn't need a column
    assert column_expr(self.dataset, '', body.copy(), alias='count', aggregate='count()') ==\
        "(count() AS count)"

    assert column_expr(self.dataset, '', body.copy(), alias='aggregate', aggregate='count()') ==\
        "(count() AS aggregate)"

    # Columns that need escaping
    assert column_expr(self.dataset, 'sentry:release', body.copy()) == '`sentry:release`'

    # Columns that start with a negative sign (used in orderby to signify
    # sort order) retain the '-' sign outside the escaping backticks (if any)
    assert column_expr(self.dataset, '-timestamp', body.copy()) == '-timestamp'
    assert column_expr(self.dataset, '-sentry:release', body.copy()) == '-`sentry:release`'

    # A 'column' that is actually a string literal
    assert column_expr(self.dataset, '\'hello world\'', body.copy()) == '\'hello world\''

    # Complex expressions (function calls) involving both string and column arguments
    assert column_expr(self.dataset, tuplify(['concat', ['a', '\':\'', 'b']]), body.copy()) == 'concat(a, \':\', b)'

    group_id_body = body.copy()
    assert column_expr(self.dataset, 'issue', group_id_body) == '(nullIf(group_id, 0) AS issue)'
    assert column_expr(
        self.dataset, 'group_id', group_id_body) == '(nullIf(group_id, 0) AS group_id)'

    # Turn uniq() into ifNull(uniq(), 0) so it doesn't return null where a number was expected.
    assert column_expr(
        self.dataset,
        'tags[environment]',
        body.copy(),
        alias='unique_envs',
        aggregate='uniq'
    ) == "(ifNull(uniq(environment), 0) AS unique_envs)"
def format(self) -> str:
    """Generate a SQL string from the parameters."""
    body = self.__request.body
    query = self.__request.query
    source = self.__dataset \
        .get_dataset_schemas() \
        .get_read_schema() \
        .get_data_source()

    aggregate_exprs = [
        util.column_expr(self.__dataset, col, body, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        util.column_expr(self.__dataset, gb, body)
        for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        util.column_expr(self.__dataset, util.tuplify(colname), body)
        for colname in column_names
    ]

    select_clause = u'SELECT {}'.format(
        ', '.join(group_exprs + aggregate_exprs + selected_cols))

    from_clause = u'FROM {}'.format(source)
    if self.__final:
        from_clause = u'{} FINAL'.format(from_clause)
    if query.get_sample():
        from_clause = u'{} SAMPLE {}'.format(from_clause, query.get_sample())

    join_clause = ''
    if 'arrayjoin' in body:
        join_clause = u'ARRAY JOIN {}'.format(body['arrayjoin'])

    where_clause = ''
    if query.get_conditions():
        where_clause = u'WHERE {}'.format(
            util.conditions_expr(self.__dataset, query.get_conditions(), body))

    prewhere_clause = ''
    if self.__prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(
            util.conditions_expr(self.__dataset, self.__prewhere_conditions, body))

    group_clause = ''
    if groupby:
        group_clause = 'GROUP BY ({})'.format(', '.join(
            util.column_expr(self.__dataset, gb, body) for gb in groupby))
        if body.get('totals', False):
            group_clause = '{} WITH TOTALS'.format(group_clause)

    having_clause = ''
    having_conditions = body.get('having', [])
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(
            util.conditions_expr(self.__dataset, having_conditions, body))

    order_clause = ''
    if query.get_orderby():
        orderby = [
            util.column_expr(self.__dataset, util.tuplify(ob), body)
            for ob in util.to_list(query.get_orderby())
        ]
        orderby = [
            u'{} {}'.format(ob.lstrip('-'), 'DESC' if ob.startswith('-') else 'ASC')
            for ob in orderby
        ]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if 'limitby' in body:
        limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

    limit_clause = ''
    if 'limit' in body:
        limit_clause = 'LIMIT {}, {}'.format(query.get_offset(), body['limit'])

    return ' '.join([c for c in [
        select_clause,
        from_clause,
        join_clause,
        prewhere_clause,
        where_clause,
        group_clause,
        having_clause,
        order_clause,
        limitby_clause,
        limit_clause,
    ] if c])
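# A self-contained sketch (hypothetical table and clauses) of the joining step
# at the end of format(): every clause is built unconditionally as a string,
# and the final ' '.join drops the empty ones, so optional clauses need no
# special-casing when the SQL is assembled.
def _join_clauses_sketch():
    select_clause = 'SELECT count() AS aggregate'
    from_clause = 'FROM sentry_dist'  # hypothetical table name
    where_clause = 'WHERE project_id IN (1)'
    group_clause = ''  # no GROUP BY in this example; it simply disappears
    return ' '.join([c for c in [select_clause, from_clause, where_clause, group_clause] if c])

assert _join_clauses_sketch() == 'SELECT count() AS aggregate FROM sentry_dist WHERE project_id IN (1)'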
def parse_and_run_query(validated_body, timer):
    body = deepcopy(validated_body)
    turbo = body.get('turbo', False)
    max_days, table, date_align, config_sample, force_final, max_group_ids_exclude = state.get_configs([
        ('max_days', None),
        ('clickhouse_table', settings.CLICKHOUSE_TABLE),
        ('date_align_seconds', 1),
        ('sample', 1),
        # 1: always use FINAL, 0: never use final, undefined/None: use project setting.
        ('force_final', 0 if turbo else None),
        ('max_group_ids_exclude', settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE),
    ])
    stats = {}
    to_date = util.parse_datetime(body['to_date'], date_align)
    from_date = util.parse_datetime(body['from_date'], date_align)
    assert from_date <= to_date

    if max_days is not None and (to_date - from_date).days > max_days:
        from_date = to_date - timedelta(days=max_days)

    where_conditions = body.get('conditions', [])
    where_conditions.extend([
        ('timestamp', '>=', from_date),
        ('timestamp', '<', to_date),
        ('deleted', '=', 0),
    ])
    # NOTE: we rely entirely on the schema to make sure that regular snuba
    # queries are required to send a project_id filter. Some other special
    # internal query types do not require a project_id filter.
    project_ids = util.to_list(body['project'])
    if project_ids:
        where_conditions.append(('project_id', 'IN', project_ids))

    having_conditions = body.get('having', [])

    aggregate_exprs = [
        util.column_expr(col, body, alias, agg)
        for (agg, col, alias) in body['aggregations']
    ]
    groupby = util.to_list(body['groupby'])
    group_exprs = [util.column_expr(gb, body) for gb in groupby]

    selected_cols = [util.column_expr(util.tuplify(colname), body)
                     for colname in body.get('selected_columns', [])]

    select_exprs = group_exprs + aggregate_exprs + selected_cols
    select_clause = u'SELECT {}'.format(', '.join(select_exprs))

    from_clause = u'FROM {}'.format(table)

    # For now, we only need FINAL if:
    # 1. The project has been marked as needing FINAL (in redis) because of recent
    #    replacements (and it affects too many groups for us just to exclude
    #    those groups from the query)
    # OR
    # 2. the force_final setting = 1
    needs_final, exclude_group_ids = get_projects_query_flags(project_ids)
    if len(exclude_group_ids) > max_group_ids_exclude:
        # Cap the number of groups to exclude by query and flip to using FINAL if necessary
        needs_final = True
        exclude_group_ids = []

    used_final = False
    if force_final == 1 or (force_final is None and needs_final):
        from_clause = u'{} FINAL'.format(from_clause)
        used_final = True
    elif exclude_group_ids:
        where_conditions.append(('group_id', 'NOT IN', exclude_group_ids))

    sample = body.get('sample', settings.TURBO_SAMPLE_RATE if turbo else config_sample)
    if sample != 1:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample)

    joins = []
    if 'arrayjoin' in body:
        joins.append(u'ARRAY JOIN {}'.format(body['arrayjoin']))
    join_clause = ' '.join(joins)

    where_clause = ''
    if where_conditions:
        where_conditions = list(set(util.tuplify(where_conditions)))
        where_clause = u'WHERE {}'.format(util.conditions_expr(where_conditions, body))

    prewhere_conditions = []
    if settings.PREWHERE_KEYS:
        # Add any condition to PREWHERE if:
        # - It is a single top-level condition (not OR-nested), and
        # - Any of its referenced columns are in PREWHERE_KEYS
        prewhere_candidates = [
            (util.columns_in_expr(cond[0]), cond)
            for cond in where_conditions
            if util.is_condition(cond) and
            any(col in settings.PREWHERE_KEYS for col in util.columns_in_expr(cond[0]))
        ]
        # Use the condition that has the highest priority (based on the
        # position of its columns in the PREWHERE_KEYS list)
        prewhere_candidates = sorted([
            (min(settings.PREWHERE_KEYS.index(col) for col in cols if col in settings.PREWHERE_KEYS), cond)
            for cols, cond in prewhere_candidates
        ])
        if prewhere_candidates:
            prewhere_conditions = [cond for _, cond in prewhere_candidates][:settings.MAX_PREWHERE_CONDITIONS]

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(util.conditions_expr(prewhere_conditions, body))

    having_clause = ''
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(util.conditions_expr(having_conditions, body))

    group_clause = ', '.join(util.column_expr(gb, body) for gb in groupby)
    if group_clause:
        if body.get('totals', False):
            group_clause = 'GROUP BY ({}) WITH TOTALS'.format(group_clause)
        else:
            group_clause = 'GROUP BY ({})'.format(group_clause)

    order_clause = ''
    if body.get('orderby'):
        orderby = [util.column_expr(util.tuplify(ob), body)
                   for ob in util.to_list(body['orderby'])]
        orderby = [u'{} {}'.format(
            ob.lstrip('-'),
            'DESC' if ob.startswith('-') else 'ASC'
        ) for ob in orderby]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if 'limitby' in body:
        limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

    limit_clause = ''
    if 'limit' in body:
        limit_clause = 'LIMIT {}, {}'.format(body.get('offset', 0), body['limit'])

    sql = ' '.join([c for c in [
        select_clause,
        from_clause,
        join_clause,
        prewhere_clause,
        where_clause,
        group_clause,
        having_clause,
        order_clause,
        limitby_clause,
        limit_clause
    ] if c])

    timer.mark('prepare_query')

    stats.update({
        'clickhouse_table': table,
        'final': used_final,
        'referrer': request.referrer,
        'num_days': (to_date - from_date).days,
        'num_projects': len(project_ids),
        'sample': sample,
    })

    return util.raw_query(
        validated_body, sql, clickhouse_ro, timer, stats
    )
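# A standalone sketch of the PREWHERE selection above, simplified so the
# condition LHS is the column name itself (the real code resolves columns via
# util.columns_in_expr). PREWHERE_KEYS is positional: conditions whose column
# appears earlier in the list rank higher, and only the first
# MAX_PREWHERE_CONDITIONS survive.
def _pick_prewhere_sketch(where_conditions, prewhere_keys, max_conditions=1):
    candidates = [
        (prewhere_keys.index(cond[0]), cond)
        for cond in where_conditions
        if isinstance(cond[0], str) and cond[0] in prewhere_keys
    ]
    # Sort by key position, keep the highest-priority conditions only.
    return [cond for _, cond in sorted(candidates)][:max_conditions]

# 'project_id' outranks 'message' because it appears first in the hypothetical key list.
assert _pick_prewhere_sketch(
    [('message', 'LIKE', '%error%'), ('project_id', 'IN', [1])],
    ['project_id', 'message'],
) == [('project_id', 'IN', [1])]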