Exemplo n.º 1
0
def parse_and_run_query(dataset, request: Request, timer) -> QueryResult:
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions['timeseries'])

    extensions = dataset.get_extensions()

    for name, extension in extensions.items():
        extension.get_processor().process_query(request.query,
                                                request.extensions[name],
                                                request.settings)
    request.query.add_conditions(dataset.default_conditions())

    if request.settings.get_turbo():
        request.query.set_final(False)

    prewhere_conditions = []
    # Add any condition to PREWHERE if:
    # - It is a single top-level condition (not OR-nested), and
    # - Any of its referenced columns are in dataset.get_prewhere_keys()
    prewhere_candidates = [(util.columns_in_expr(cond[0]), cond)
                           for cond in request.query.get_conditions()
                           if util.is_condition(cond) and any(
                               col in dataset.get_prewhere_keys()
                               for col in util.columns_in_expr(cond[0]))]
    # Use the condition that has the highest priority (based on the
    # position of its columns in the prewhere keys list)
    prewhere_candidates = sorted(
        [(min(dataset.get_prewhere_keys().index(col)
              for col in cols if col in dataset.get_prewhere_keys()), cond)
         for cols, cond in prewhere_candidates],
        key=lambda priority_and_col: priority_and_col[0])
    if prewhere_candidates:
        prewhere_conditions = [cond for _, cond in prewhere_candidates
                               ][:settings.MAX_PREWHERE_CONDITIONS]
        request.query.set_conditions(
            list(
                filter(lambda cond: cond not in prewhere_conditions,
                       request.query.get_conditions())))

    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    # TODO: consider moving the performance logic and the pre_where generation into
    # ClickhouseQuery since they are Clickhouse specific
    query = ClickhouseQuery(dataset, request.query, request.settings,
                            prewhere_conditions)
    timer.mark('prepare_query')

    stats = {
        'clickhouse_table': source,
        'final': request.query.get_final(),
        'referrer': http_request.referrer,
        'num_days': (to_date - from_date).days,
        'sample': request.query.get_sample(),
    }

    return raw_query(request, query, clickhouse_ro, timer, stats)
Exemplo n.º 2
0
 def _get_prewhere_candidates(
     self, query: Query, prewhere_keys: Sequence[str]
 ) -> Sequence[Tuple[Iterable[str], Condition]]:
     # Add any condition to PREWHERE if:
     # - It is a single top-level condition (not OR-nested), and
     # - Any of its referenced columns are in prewhere_keys
     conditions = query.get_conditions()
     if not conditions:
         return []
     return [(util.columns_in_expr(cond[0]), cond) for cond in conditions
             if util.is_condition(cond) and cond[1] in ALLOWED_OPERATORS
             and any(col in prewhere_keys
                     for col in util.columns_in_expr(cond[0]))]
Exemplo n.º 3
0
def all_referenced_columns(query: Query):
    """
    Return the set of all columns that are used by a query.
    """
    col_exprs: MutableSequence[Any] = []

    if query.get_arrayjoin():
        col_exprs.extend(to_list(query.get_arrayjoin()))
    if query.get_groupby():
        col_exprs.extend(to_list(query.get_groupby()))
    if query.get_orderby():
        col_exprs.extend(to_list(query.get_orderby()))
    if query.get_selected_columns():
        col_exprs.extend(to_list(query.get_selected_columns()))

    # Conditions need flattening as they can be nested as AND/OR
    if query.get_conditions():
        flat_conditions = list(
            chain(*[[c] if is_condition(c) else c
                    for c in query.get_conditions()]))
        col_exprs.extend([c[0] for c in flat_conditions])

    if query.get_aggregations():
        col_exprs.extend([a[1] for a in query.get_aggregations()])

    # Return the set of all columns referenced in any expression
    return set(chain(*[columns_in_expr(ex) for ex in col_exprs]))
Exemplo n.º 4
0
    def __get_filter_tags(self, query: Query) -> List[str]:
        """
        Identifies the tag names we can apply the arrayFilter optimization on.
        Which means: if the tags_key column is in the select clause and there are
        one or more top level conditions on the tags_key column.

        We can only apply the arrayFilter optimization to tag keys conditions
        that are not in OR with other columns. To simplify the problem, we only
        consider those conditions that are included in the first level of the query:
        [['tagskey' '=' 'a'],['col' '=' 'b'],['col2' '=' 'c']]  works
        [[['tagskey' '=' 'a'], ['col2' '=' 'b']], ['tagskey' '=' 'c']] does not
        """
        if not state.get_config("ast_tag_processor_enabled", 1):
            return []

        tags_key_found = any(
            "tags_key" in columns_in_expr(expression)
            for expression in query.get_selected_columns() or []
        )

        if not tags_key_found:
            return []

        def extract_tags_from_condition(
            cond: Sequence[Condition],
        ) -> Optional[List[str]]:
            if not cond:
                return []

            ret = []
            for c in cond:
                if not is_condition(c):
                    # This is an OR
                    return None

                if c[1] == "=" and c[0] == "tags_key" and isinstance(c[2], str):
                    ret.append(str(c[2]))

                elif (
                    c[1] == "IN"
                    and c[0] == "tags_key"
                    and isinstance(c[2], (list, tuple))
                ):
                    ret.extend([str(tag) for tag in c[2]])

            return ret

        cond_tags_key = extract_tags_from_condition(query.get_conditions() or [])
        if cond_tags_key is None:
            # This means we found an OR. Cowardly we give up even though there could
            # be cases where this condition is still optimizable.
            return []
        having_tags_key = extract_tags_from_condition(query.get_having() or [])
        if having_tags_key is None:
            # Same as above
            return []

        return cond_tags_key + having_tags_key
Exemplo n.º 5
0
 def process_query(self, query: Query, request_settings: RequestSettings,) -> None:
     max_prewhere_conditions: int = (
         self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
     )
     prewhere_keys = query.get_data_source().get_prewhere_candidates()
     if not prewhere_keys:
         return
     prewhere_conditions: Sequence[Condition] = []
     # Add any condition to PREWHERE if:
     # - It is a single top-level condition (not OR-nested), and
     # - Any of its referenced columns are in prewhere_keys
     conditions = query.get_conditions()
     if not conditions:
         return
     prewhere_candidates = [
         (util.columns_in_expr(cond[0]), cond)
         for cond in conditions
         if util.is_condition(cond)
         and any(col in prewhere_keys for col in util.columns_in_expr(cond[0]))
     ]
     # Use the condition that has the highest priority (based on the
     # position of its columns in the prewhere keys list)
     prewhere_candidates = sorted(
         [
             (
                 min(
                     prewhere_keys.index(col) for col in cols if col in prewhere_keys
                 ),
                 cond,
             )
             for cols, cond in prewhere_candidates
         ],
         key=lambda priority_and_col: priority_and_col[0],
     )
     if prewhere_candidates:
         prewhere_conditions = [cond for _, cond in prewhere_candidates][
             :max_prewhere_conditions
         ]
         query.set_conditions(
             list(filter(lambda cond: cond not in prewhere_conditions, conditions))
         )
     query.set_prewhere(prewhere_conditions)
Exemplo n.º 6
0
 def __get_referenced_columns(
         self, col_exprs: MutableSequence[Any]) -> Sequence[Any]:
     return set(chain(*[columns_in_expr(ex) for ex in col_exprs]))
Exemplo n.º 7
0
def parse_and_run_query(validated_body, timer):
    body = deepcopy(validated_body)
    turbo = body.get('turbo', False)
    max_days, table, date_align, config_sample, force_final, max_group_ids_exclude = state.get_configs([
        ('max_days', None),
        ('clickhouse_table', settings.CLICKHOUSE_TABLE),
        ('date_align_seconds', 1),
        ('sample', 1),
        # 1: always use FINAL, 0: never use final, undefined/None: use project setting.
        ('force_final', 0 if turbo else None),
        ('max_group_ids_exclude', settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE),
    ])
    stats = {}
    to_date = util.parse_datetime(body['to_date'], date_align)
    from_date = util.parse_datetime(body['from_date'], date_align)
    assert from_date <= to_date

    if max_days is not None and (to_date - from_date).days > max_days:
        from_date = to_date - timedelta(days=max_days)

    where_conditions = body.get('conditions', [])
    where_conditions.extend([
        ('timestamp', '>=', from_date),
        ('timestamp', '<', to_date),
        ('deleted', '=', 0),
    ])
    # NOTE: we rely entirely on the schema to make sure that regular snuba
    # queries are required to send a project_id filter. Some other special
    # internal query types do not require a project_id filter.
    project_ids = util.to_list(body['project'])
    if project_ids:
        where_conditions.append(('project_id', 'IN', project_ids))

    having_conditions = body.get('having', [])

    aggregate_exprs = [
        util.column_expr(col, body, alias, agg)
        for (agg, col, alias) in body['aggregations']
    ]
    groupby = util.to_list(body['groupby'])
    group_exprs = [util.column_expr(gb, body) for gb in groupby]

    selected_cols = [util.column_expr(util.tuplify(colname), body)
                     for colname in body.get('selected_columns', [])]

    select_exprs = group_exprs + aggregate_exprs + selected_cols
    select_clause = u'SELECT {}'.format(', '.join(select_exprs))

    from_clause = u'FROM {}'.format(table)

    # For now, we only need FINAL if:
    #    1. The project has been marked as needing FINAL (in redis) because of recent
    #       replacements (and it affects too many groups for us just to exclude
    #       those groups from the query)
    #    OR
    #    2. the force_final setting = 1
    needs_final, exclude_group_ids = get_projects_query_flags(project_ids)
    if len(exclude_group_ids) > max_group_ids_exclude:
        # Cap the number of groups to exclude by query and flip to using FINAL if necessary
        needs_final = True
        exclude_group_ids = []

    used_final = False
    if force_final == 1 or (force_final is None and needs_final):
        from_clause = u'{} FINAL'.format(from_clause)
        used_final = True
    elif exclude_group_ids:
        where_conditions.append(('group_id', 'NOT IN', exclude_group_ids))

    sample = body.get('sample', settings.TURBO_SAMPLE_RATE if turbo else config_sample)
    if sample != 1:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample)

    joins = []

    if 'arrayjoin' in body:
        joins.append(u'ARRAY JOIN {}'.format(body['arrayjoin']))
    join_clause = ' '.join(joins)

    where_clause = ''
    if where_conditions:
        where_conditions = list(set(util.tuplify(where_conditions)))
        where_clause = u'WHERE {}'.format(util.conditions_expr(where_conditions, body))

    prewhere_conditions = []
    if settings.PREWHERE_KEYS:
        # Add any condition to PREWHERE if:
        # - It is a single top-level condition (not OR-nested), and
        # - Any of its referenced columns are in PREWHERE_KEYS
        prewhere_candidates = [
            (util.columns_in_expr(cond[0]), cond)
            for cond in where_conditions if util.is_condition(cond) and
            any(col in settings.PREWHERE_KEYS for col in util.columns_in_expr(cond[0]))
        ]
        # Use the condition that has the highest priority (based on the
        # position of its columns in the PREWHERE_KEYS list)
        prewhere_candidates = sorted([
            (min(settings.PREWHERE_KEYS.index(col) for col in cols if col in settings.PREWHERE_KEYS), cond)
            for cols, cond in prewhere_candidates
        ])
        if prewhere_candidates:
            prewhere_conditions = [cond for _, cond in prewhere_candidates][:settings.MAX_PREWHERE_CONDITIONS]

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(util.conditions_expr(prewhere_conditions, body))

    having_clause = ''
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(util.conditions_expr(having_conditions, body))

    group_clause = ', '.join(util.column_expr(gb, body) for gb in groupby)
    if group_clause:
        if body.get('totals', False):
            group_clause = 'GROUP BY ({}) WITH TOTALS'.format(group_clause)
        else:
            group_clause = 'GROUP BY ({})'.format(group_clause)

    order_clause = ''
    if body.get('orderby'):
        orderby = [util.column_expr(util.tuplify(ob), body) for ob in util.to_list(body['orderby'])]
        orderby = [u'{} {}'.format(
            ob.lstrip('-'),
            'DESC' if ob.startswith('-') else 'ASC'
        ) for ob in orderby]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if 'limitby' in body:
        limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

    limit_clause = ''
    if 'limit' in body:
        limit_clause = 'LIMIT {}, {}'.format(body.get('offset', 0), body['limit'])

    sql = ' '.join([c for c in [
        select_clause,
        from_clause,
        join_clause,
        prewhere_clause,
        where_clause,
        group_clause,
        having_clause,
        order_clause,
        limitby_clause,
        limit_clause
    ] if c])

    timer.mark('prepare_query')

    stats.update({
        'clickhouse_table': table,
        'final': used_final,
        'referrer': request.referrer,
        'num_days': (to_date - from_date).days,
        'num_projects': len(project_ids),
        'sample': sample,
    })

    return util.raw_query(
        validated_body, sql, clickhouse_ro, timer, stats
    )