Example #1
    def validate(self, exp: Expression, entity: Entity) -> None:
        if not isinstance(exp, FunctionCall):
            return

        entity_validators = entity.get_function_call_validators()
        common_function_validators = (entity_validators.keys()
                                      & default_validators.keys())
        if common_function_validators:
            logger.warning(
                "Dataset validators are overlapping with default ones. Entity: %s. Overlap %r",
                entity,
                common_function_validators,
                exc_info=True,
            )

        validators = ChainMap(default_validators, entity_validators)
        try:
            # TODO: Decide whether these validators should exist at the Dataset or Entity level
            validator = validators.get(exp.function_name)
            if validator is not None:
                validator.validate(exp.parameters, entity.get_data_model())
        except InvalidFunctionCall as exception:
            raise InvalidExpressionException(
                exp,
                f"Illegal call to function {exp.function_name}: {str(exception)}",
            ) from exception
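
A quick note on the ChainMap above: collections.ChainMap searches its maps left to right and returns the first hit, so with default_validators listed first, a default validator shadows an entity validator registered under the same function name. That is exactly what the overlap warning is flagging. A minimal sketch of the precedence (the validator values here are placeholder strings, not the real snuba validator objects):

from collections import ChainMap

default_validators = {"uniq": "default uniq validator"}
entity_validators = {"uniq": "entity uniq validator", "avg": "entity avg validator"}

validators = ChainMap(default_validators, entity_validators)
assert validators["uniq"] == "default uniq validator"  # default shadows the entity's
assert validators["avg"] == "entity avg validator"     # entity-only keys still resolve
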
Example #2
File: views.py Project: getsentry/snuba
    def _write_to_entity(*, entity: EntityType) -> RespTuple:
        from snuba.processor import InsertBatch

        rows: MutableSequence[WriterTableRow] = []
        offset_base = int(round(time.time() * 1000))
        writable_storage = entity.get_writable_storage()
        assert writable_storage is not None
        table_writer = writable_storage.get_table_writer()

        for index, message in enumerate(json.loads(http_request.data)):
            offset = offset_base + index

            processed_message = (
                table_writer.get_stream_loader()
                .get_processor()
                .process_message(
                    message,
                    KafkaMessageMetadata(
                        offset=offset, partition=0, timestamp=datetime.utcnow()
                    ),
                )
            )
            if processed_message:
                assert isinstance(processed_message, InsertBatch)
                rows.extend(processed_message.rows)

        BatchWriterEncoderWrapper(
            table_writer.get_batch_writer(metrics),
            JSONRowEncoder(),
        ).write(rows)

        return ("ok", 200, {"Content-Type": "text/plain"})
Example #3
def enforce_table_writer(entity: Entity) -> TableWriter:
    writable_storage = entity.get_writable_storage()

    assert (
        writable_storage is not None
    ), f"Entity {ENTITY_NAME_LOOKUP[entity]} does not have a writable storage."
    return writable_storage.get_table_writer()
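
This helper centralizes the writable-storage assertion that Example #2 performs inline. A minimal usage sketch; get_entity and EntityKey are assumed to come from snuba's entity factory and may differ by version:

# Hypothetical usage; the lookup helpers are assumptions for illustration.
entity = get_entity(EntityKey.EVENTS)
table_writer = enforce_table_writer(entity)  # AssertionError if the entity is read-only
writer = table_writer.get_batch_writer(metrics)
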
Example #4
def _parse_query_impl(body: MutableMapping[str, Any], entity: Entity) -> Query:
    def build_selected_expressions(
        raw_expressions: Sequence[Any],
    ) -> List[SelectedExpression]:
        output = []
        for raw_expression in raw_expressions:
            exp = parse_expression(tuplify(raw_expression),
                                   entity.get_data_model(), set())
            output.append(
                SelectedExpression(
                    # An expression in the query can be a string or a
                    # complex list with an alias. In the second case
                    # we trust the parser to find the alias.
                    name=raw_expression
                    if isinstance(raw_expression, str) else exp.alias,
                    expression=exp,
                ))
        return output

    aggregations = []
    for aggregation in body.get("aggregations", []):
        if not isinstance(aggregation, Sequence):
            raise ParsingException((
                f"Invalid aggregation structure {aggregation}. "
                "It must be a sequence containing expression, column and alias."
            ))
        aggregation_function = aggregation[0]
        column_expr = aggregation[1]
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None

        aggregations.append(
            SelectedExpression(
                name=alias,
                expression=parse_aggregation(
                    aggregation_function,
                    column_expr,
                    alias,
                    entity.get_data_model(),
                    set(),
                ),
            ))

    groupby_clause = build_selected_expressions(
        to_list(body.get("groupby", [])))

    select_clause = (
        groupby_clause + aggregations +
        build_selected_expressions(body.get("selected_columns", [])))

    array_join_cols = set()
    arrayjoin = body.get("arrayjoin")
    # TODO: Properly detect all array join columns in all clauses of the query.
    # This misses an arrayJoin in a condition whose alias is then used in the
    # select.
    if arrayjoin:
        array_join_cols.add(arrayjoin)
        array_join_expr: Optional[Expression] = parse_expression(
            body["arrayjoin"], entity.get_data_model(), {arrayjoin})
    else:
        array_join_expr = None
        for select_expr in select_clause:
            if isinstance(select_expr.expression, FunctionCall):
                if select_expr.expression.function_name == "arrayJoin":
                    parameters = select_expr.expression.parameters
                    if len(parameters) != 1:
                        raise ParsingException(
                            "arrayJoin(...) only accepts a single parameter.")
                    if isinstance(parameters[0], Column):
                        array_join_cols.add(parameters[0].column_name)
                    else:
                        # We only accept columns, or functions that do not
                        # reference columns. We cannot tell whether we are
                        # actually arrayjoining on the values of the column
                        # if it is nested in an arbitrary function. But
                        # functions of literals are fine.
                        for e in parameters[0]:
                            if isinstance(e, Column):
                                raise ParsingException(
                                    "arrayJoin(...) cannot contain columns nested in functions."
                                )

    where_expr = parse_conditions_to_expr(body.get("conditions", []), entity,
                                          array_join_cols)
    having_expr = parse_conditions_to_expr(body.get("having", []), entity,
                                           array_join_cols)

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        if isinstance(orderby, str):
            match = NEGATE_RE.match(orderby)
            if match is None:
                raise ParsingException((
                    f"Invalid Order By clause {orderby}. If the Order By is a string, "
                    "it must respect the format `[-]column`"))
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            match = NEGATE_RE.match(orderby[0])
            if match is None:
                raise ParsingException((
                    f"Invalid Order By clause {orderby}. If the Order By is an expression, "
                    "the function name must respect the format `[-]func_name`"
                ))
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ParsingException(
                (f"Invalid Order By clause {orderby}. The Clause was neither "
                 "a string nor a function call."))
        orderby_parsed = parse_expression(tuplify(orderby),
                                          entity.get_data_model(), set())
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC
                if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            ))

    return Query(
        body,
        None,
        selected_columns=select_clause,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=[g.expression for g in groupby_clause],
        having=having_expr,
        order_by=orderby_exprs,
    )
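
NEGATE_RE is not shown in this snippet; from the way its groups are unpacked into (direction, col), it must split an optional leading "-" (descending) from the column or function name. A plausible stand-in definition, consistent with the error paths above but not copied from the real module:

import re

# Assumed definition: group 1 captures "" or "-", group 2 the column/function name.
NEGATE_RE = re.compile(r"^(-?)([\w.\[\]:]+)$")

assert NEGATE_RE.match("-timestamp").groups() == ("-", "timestamp")
assert NEGATE_RE.match("project_id").groups() == ("", "project_id")
assert NEGATE_RE.match("not a column!") is None  # would raise ParsingException above
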
Example #5
def test_nullable_field_casting(entity: Entity,
                                expected_table_name: str) -> None:
    dataset_name = "discover"

    query_str = """MATCH (discover)
    SELECT
        uniq(sdk_version)
    WHERE
        timestamp >= toDateTime('2021-07-25T15:02:10') AND
        timestamp < toDateTime('2021-07-26T15:02:10') AND
        project_id IN tuple(5492900)
    """

    # ----- create the request object as if it came in through our API -----
    query_body = {
        "query": query_str,
        "debug": True,
        "dataset": dataset_name,
        "turbo": False,
        "consistent": False,
    }

    dataset = get_dataset(dataset_name)

    schema = RequestSchema.build(HTTPQuerySettings)

    request = build_request(
        query_body,
        parse_snql_query,
        HTTPQuerySettings,
        schema,
        dataset,
        Timer(name="bloop"),
        "some_referrer",
    )

    # --------------------------------------------------------------------

    def query_verifier(
        query: Union[Query, CompositeQuery[Table]],
        settings: QuerySettings,
        reader: Reader,
    ) -> QueryResult:
        # The only reason this extends StringifyVisitor is that it already
        # implements all the other visit methods.
        class NullCastingVerifier(StringifyVisitor):
            def __init__(self) -> None:
                self.sdk_version_cast_to_null = False
                super().__init__()

            def visit_function_call(self, exp: FunctionCall) -> str:
                if (exp.function_name == "cast"
                        and exp.alias == "_snuba_sdk_version"
                        and exp.parameters == (
                            Column(None, None, "sdk_version"),
                            Literal(None, "Nullable(String)"),
                        )):
                    self.sdk_version_cast_to_null = True
                return super().visit_function_call(exp)

        for select_expr in query.get_selected_columns():
            verifier = NullCastingVerifier()
            select_expr.expression.accept(verifier)
            assert verifier.sdk_version_cast_to_null

        return QueryResult(
            result={
                "meta": [],
                "data": [],
                "totals": {}
            },
            extra={
                "stats": {},
                "sql": "",
                "experiments": {}
            },
        )

    entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_verifier).execute()
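
For reference, the exact AST node the visitor above matches can be written out directly. This is reconstructed from the three checks in visit_function_call, not copied from the pipeline's actual output: uniq(sdk_version) should arrive at the storage layer with the column cast to a nullable type.

# The expression shape NullCastingVerifier looks for, per its checks above.
expected_cast = FunctionCall(
    "_snuba_sdk_version",
    "cast",
    (
        Column(None, None, "sdk_version"),
        Literal(None, "Nullable(String)"),
    ),
)
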
Example #6
def parse_conditions(
    operand_builder: Callable[[Any, ColumnSet, Set[str]], TExpression],
    and_builder: Callable[[Sequence[TExpression]], Optional[TExpression]],
    or_builder: Callable[[Sequence[TExpression]], Optional[TExpression]],
    unpack_array_condition_builder: Callable[[TExpression, str, Any],
                                             TExpression],
    simple_condition_builder: Callable[[TExpression, str, Any], TExpression],
    entity: Entity,
    conditions: Any,
    arrayjoin_cols: Set[str],
    depth: int = 0,
) -> Optional[TExpression]:
    """
    Return a boolean expression suitable for putting in the WHERE clause of the
    query.  The expression is constructed by ANDing groups of OR expressions.
    Expansion of columns is handled, as is replacement of columns with aliases,
    if the column has already been expanded and aliased elsewhere.

    operand_builder: Builds the TExpression representing the left hand side
      of a simple condition. This can be as nested as the user wants.
    and_builder / or_builder: Combine a list of expressions with AND/OR.
    unpack_array_condition_builder: Deals with a special case where we unpack
      conditions on array columns. More details in the code.
    simple_condition_builder: Generates a simple condition made of an expression
      on the left hand side, an operator, and a literal on the right hand side.
    """
    from snuba.clickhouse.columns import Array

    if not conditions:
        return None

    if depth == 0:
        # dedupe conditions at top level, but keep them in order
        sub = OrderedDict((
            parse_conditions(
                operand_builder,
                and_builder,
                or_builder,
                unpack_array_condition_builder,
                simple_condition_builder,
                entity,
                cond,
                arrayjoin_cols,
                depth + 1,
            ),
            None,
        ) for cond in conditions)
        return and_builder([s for s in sub.keys() if s])
    elif is_condition(conditions):
        try:
            lhs, op, lit = conditions
        except Exception as cause:
            raise ParsingException(f"Cannot process condition {conditions}",
                                   cause) from cause

        # facilitate deduping IN conditions by sorting them.
        if op in ("IN", "NOT IN") and isinstance(lit, tuple):
            lit = tuple(sorted(lit))

        # If the LHS is a simple column name that refers to an array column
        # (and we are not arrayJoining on that column, which would make it
        # scalar again) and the RHS is a scalar value, we assume that the user
        # actually means to check if any (or all) items in the array match the
        # predicate, so we return an `any(x == value for x in array_column)`
        # type expression. We assume that operators looking for a specific value
        # (IN, =, LIKE) are looking for rows where any array value matches, and
        # exclusionary operators (NOT IN, NOT LIKE, !=) are looking for rows
        # where all elements match (eg. all NOT LIKE 'foo').
        columns = entity.get_data_model()
        if (isinstance(lhs, str) and lhs in columns
                and isinstance(columns[lhs].type, Array)
                and columns[lhs].base_name not in arrayjoin_cols
                and columns[lhs].flattened not in arrayjoin_cols
                and not isinstance(lit, (list, tuple))):
            return unpack_array_condition_builder(
                operand_builder(lhs, entity.get_data_model(), arrayjoin_cols),
                op,
                lit,
            )
        else:
            return simple_condition_builder(
                operand_builder(lhs, entity.get_data_model(), arrayjoin_cols),
                op,
                lit,
            )

    elif depth == 1:
        sub_expression = (parse_conditions(
            operand_builder,
            and_builder,
            or_builder,
            unpack_array_condition_builder,
            simple_condition_builder,
            entity,
            cond,
            arrayjoin_cols,
            depth + 1,
        ) for cond in conditions)
        return or_builder([s for s in sub_expression if s])
    else:
        raise InvalidConditionException(str(conditions))
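
The depth handling above encodes the legacy condition grammar: the top level is a list of entries that are ANDed together, a single [lhs, op, literal] triple is a simple condition, and a nested list at depth 1 becomes an OR group. A data-only sketch of the accepted shape (column names are illustrative):

# Top-level entries are ANDed; the inner list is ORed.
conditions = [
    ["project_id", "=", 1],
    [
        ["environment", "=", "prod"],
        ["environment", "IS NULL", None],
    ],
]
# Conceptually parsed as:
#   project_id = 1 AND (environment = 'prod' OR environment IS NULL)
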
Example #7
def test_span_id_promotion(entity: Entity, expected_table_name: str) -> None:
    """In order to save space in the contexts column and provide faster query
    performance, we promote span_id to a proper column and don't store it in the
    actual contexts object in the DB.

    The client, however, still queries by `contexts[trace.span_id]` and expects
    it to be a hex string rather than a 64-bit uint (which is what we store it as).

    This test makes sure that our query pipeline does the proper column promotion
    and conversion.
    """

    dataset_name = "discover"

    # The client queries by contexts[trace.span_id] even though that's not how we store it
    query_str = f"""MATCH (discover)
    SELECT
        contexts[trace.span_id]
    WHERE
        timestamp >= toDateTime('2021-07-25T15:02:10') AND
        timestamp < toDateTime('2021-07-26T15:02:10') AND
        contexts[trace.span_id] = '{span_id_hex}' AND
        project_id IN tuple(5492900)
    """

    # ----- create the request object as if it came in through our API -----
    query_body = {
        "query": query_str,
        "debug": True,
        "dataset": dataset_name,
        "turbo": False,
        "consistent": False,
    }

    dataset = get_dataset(dataset_name)

    schema = RequestSchema.build(HTTPQuerySettings)

    request = build_request(
        query_body,
        parse_snql_query,
        HTTPQuerySettings,
        schema,
        dataset,
        Timer(name="bloop"),
        "some_referrer",
    )

    # --------------------------------------------------------------------

    def query_verifier(
        query: Union[Query, CompositeQuery[Table]],
        settings: QuerySettings,
        reader: Reader,
    ) -> QueryResult:
        assert isinstance(query, Query)
        # In local and CI there is a table name difference (errors_local vs
        # errors_dist, discover_local vs discover_dist), so we check using
        # `in` instead of `==`.
        assert expected_table_name in query.get_from_clause().table_name
        assert query.get_selected_columns() == [
            SelectedExpression(
                name="contexts[trace.span_id]",
                # the select converts the span_id into a lowercase hex string
                expression=FunctionCall(
                    "_snuba_contexts[trace.span_id]",
                    "lower",
                    (FunctionCall(None, "hex",
                                  (Column(None, None, "span_id"), )), ),
                ),
            )
        ]

        class SpanIdVerifier(NoopVisitor):
            def __init__(self) -> None:
                self.found_span_condition = False
                super().__init__()

            def visit_function_call(self, exp: FunctionCall) -> None:
                if exp.function_name == "equals" and exp.parameters[
                        0] == Column(None, None, "span_id"):
                    self.found_span_condition = True
                    # and here we can see that the hex string the client queried us with
                    # has been converted to the correct uint64
                    assert exp.parameters[1] == Literal(
                        None, span_id_as_uint64)
                return super().visit_function_call(exp)

        verifier = SpanIdVerifier()
        condition = query.get_condition()
        assert condition is not None
        condition.accept(verifier)
        assert verifier.found_span_condition

        return QueryResult(
            result={
                "meta": [],
                "data": [],
                "totals": {}
            },
            extra={
                "stats": {},
                "sql": "",
                "experiments": {}
            },
        )

    entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_verifier).execute()
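
The hex/uint64 relationship the test asserts can be stated in plain Python: the client-facing span id and the stored column value are related by base-16 parsing, and the SELECT maps back via lower(hex(span_id)). The values here are illustrative; span_id_hex and span_id_as_uint64 are fixtures defined elsewhere in the test module:

span_id_hex = "91e8b55dcf194c7a"           # what the client sends in the query
span_id_as_uint64 = int(span_id_hex, 16)   # what the WHERE clause must compare against
assert format(span_id_as_uint64, "016x") == span_id_hex  # lower(hex(...)) round-trip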