Exemplo n.º 1
0
def detect_table(query: Query, events_only_columns: ColumnSet,
                 transactions_only_columns: ColumnSet) -> str:
    """
    Guess whether a query is better served by the "events" or the
    "transactions" storage. This is a heuristic and is going to be wrong in
    some cases.

    The checks run in priority order and the first one that matches wins:

    1. A top level condition that is exactly ``type = error`` (events) or
       ``type = transaction`` (transactions).
    2. A column referenced in the conditions for which
       ``events_only_columns.get`` (checked first) or
       ``transactions_only_columns.get`` returns a truthy value.
    3. The same lookup, applied to every column referenced by the query.

    When none of the checks match, events is returned.
    """
    # An explicit filter on the event type is the strongest signal we have.
    # ``or ()`` keeps a missing/empty condition list from being iterated.
    for condition in query.get_conditions() or ():
        if not is_condition(condition):
            continue
        flattened = tuple(condition)
        if flattened == ("type", "=", "error"):
            return EVENTS
        if flattened == ("type", "=", "transaction"):
            return TRANSACTIONS

    # Table specific fields: columns used in conditions are inspected before
    # the wider set of all referenced columns. The getters are called lazily
    # so the second one only runs when the first gave no answer.
    column_getters = (
        query.get_columns_referenced_in_conditions,
        query.get_all_referenced_columns,
    )
    for get_columns in column_getters:
        columns = get_columns()
        if any(events_only_columns.get(name) for name in columns):
            return EVENTS
        if any(transactions_only_columns.get(name) for name in columns):
            return TRANSACTIONS

    # Nothing was conclusive: use events by default.
    return EVENTS
Exemplo n.º 2
0
    def process_query(
        self,
        query: Query,
        request_settings: RequestSettings,
    ) -> None:
        """
        Replace a join data source with a single table when the query only
        references columns qualified with one table alias.

        Nothing happens when the data source of the query is not a
        ``JoinClause`` or when more than one alias is referenced. Otherwise
        the table registered under the only referenced alias becomes the new
        data source of the query. ``request_settings`` is not used here.
        """
        join = query.get_data_source()
        if not isinstance(join, JoinClause):
            return

        # This will be much better when columns are represented with a more
        # structured data type than strings. Until then the alias is read
        # from the first parenthesized group of the qualified-column regex;
        # columns that do not match the regex contribute no alias.
        parsed_columns = (
            QUALIFIED_COLUMN_REGEX.match(name)
            for name in query.get_all_referenced_columns()
        )
        aliases = {parsed[1] for parsed in parsed_columns if parsed}

        assert len(aliases) > 0, "Trying to otpimize a join query without aliases"
        if len(aliases) > 1:
            # Columns from several tables are needed: the join has to stay.
            return

        # Exactly one alias is left, so pop() returns it.
        query.set_data_source(join.get_tables()[aliases.pop()])
Exemplo n.º 3
0
    def __tags_expr(self,
        column_name: str,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str="",
    ) -> str:
        """
        Build the expression that array-joins on tags so that the output has
        one row per tag.

        :param column_name: either 'tags_key' or 'tags_value'.
        :param query: used to find out whether both tags_key and tags_value
            are referenced.
        :param parsing_context: handed to alias_expr so the 'all_tags'
            expression can be referred to by alias.
        :param table_alias: alias used to qualify the column names.
        :return: the expression as a string.
        """
        assert column_name in ['tags_key', 'tags_value']
        col, k_or_v = column_name.split('_', 1)
        wants_key = k_or_v == 'key'
        nested_only = state.get_config('nested_tags_only', 1)
        nested_col = qualified_column(col, table_alias)

        # Build two parallel lists (keys and values) to arrayJoin on.
        if nested_only:
            key_list = '{}.key'.format(nested_col)
            val_list = '{}.value'.format(nested_col)
        else:
            # Promoted columns are prepended to the nested ones, using the
            # tag name from the column/tag map when there is one.
            promoted = self.__promoted_columns[col]
            tag_names = self.__column_tag_map[col]
            promoted_keys = ', '.join(
                "'{}'".format(tag_names.get(name, name)) for name in promoted
            )
            key_list = 'arrayConcat([{}], {}.key)'.format(promoted_keys, nested_col)
            promoted_values = ', '.join(
                self.__string_col(name) for name in promoted
            )
            val_list = 'arrayConcat([{}], {}.value)'.format(promoted_values, nested_col)

        key_and_value = {
            qualified_column("tags_key", table_alias),
            qualified_column("tags_value", table_alias),
        }
        cols_used = query.get_all_referenced_columns() & key_and_value

        if len(cols_used) != 2:
            # Only one of tags_key / tags_value is ever used: skip the k/v
            # tuples and the all_tags alias, we would never re-use them.
            return 'arrayJoin({})'.format(key_list if wants_key else val_list)

        # Both tags_key and tags_value are used in this query, so arrayJoin
        # on (key, value) tag tuples.
        pairs = 'arrayJoin(arrayMap((x,y) -> [x,y], {}, {}))'.format(key_list, val_list)

        # Store the all_tags expression in the alias cache: next time it can
        # be referred to by alias (eg. 'all_tags[1] AS tags_key') instead of
        # expanding the whole tags expression again.
        aliased = alias_expr(pairs, 'all_tags', parsing_context)
        return '({})[{}]'.format(aliased, 1 if wants_key else 2)
Exemplo n.º 4
0
def test_referenced_columns():
    """
    Check the columns Query reports as referenced overall, in the
    conditions and in the having clause, for flat and nested conditions,
    function expressions, other query fields and a prewhere clause.
    """
    source = (
        get_dataset("events")
        .get_dataset_schemas()
        .get_read_schema()
        .get_data_source()
    )

    # Bodies made of conditions only: every referenced column comes from the
    # conditions and nothing is referenced in having.
    conditions_only = [
        # a = 1 AND b = 1
        (
            [["a", "=", "1"], ["b", "=", "1"]],
            {"a", "b"},
        ),
        # a = 1 AND (b = 1 OR c = 1)
        (
            [["a", "=", "1"], [["b", "=", "1"], ["c", "=", "1"]]],
            {"a", "b", "c"},
        ),
        # a = 1 AND (b = 1 OR foo(c) = 1)
        (
            [["a", "=", "1"], [["b", "=", "1"], [["foo", ["c"]], "=", "1"]]],
            {"a", "b", "c"},
        ),
        # a = 1 AND (b = 1 OR foo(c, bar(d)) = 1)
        (
            [
                ["a", "=", "1"],
                [["b", "=", "1"], [["foo", ["c", ["bar", ["d"]]]], "=", "1"]],
            ],
            {"a", "b", "c", "d"},
        ),
    ]
    for conditions, expected in conditions_only:
        query = Query({"conditions": conditions}, source)
        assert query.get_all_referenced_columns() == expected
        assert query.get_columns_referenced_in_conditions() == expected
        assert query.get_columns_referenced_in_having() == set()

    # Other fields, including expressions in selected columns
    query = Query(
        {
            "arrayjoin": "tags_key",
            "groupby": ["time", "group_id"],
            "orderby": "-time",
            "selected_columns": [
                "group_id",
                "time",
                ["foo", ["c", ["bar", ["d"]]]],  # foo(c, bar(d))
            ],
            "aggregations": [["uniq", "tags_value", "values_seen"]],
        },
        source,
    )
    assert query.get_all_referenced_columns() == {
        "tags_key", "tags_value", "time", "group_id", "c", "d",
    }
    assert query.get_columns_referenced_in_conditions() == set()
    assert query.get_columns_referenced_in_having() == set()

    # Conditions, having and a prewhere set after construction
    query = Query(
        {
            "conditions": [["a", "=", "1"]],
            "having": [
                ["b", "=", "1"],
                [["c", "=", "1"], [["foo", ["d", ["bar", ["e"]]]], "=", "1"]],
            ],
        },
        source,
    )
    query.set_prewhere([["pc6", "=", "10"]])
    assert query.get_all_referenced_columns() == {
        "a", "b", "c", "d", "e", "pc6",
    }
    assert query.get_columns_referenced_in_having() == {"b", "c", "d", "e"}
Exemplo n.º 5
0
    def __tags_expr(
        self,
        parsed_col: ParsedNestedColumn,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ) -> str:
        """
        Build the expression that array-joins on tags so that the output has
        one row per tag.

        When the query yields filter tags (see ``__get_filter_tags``) the
        array is wrapped in an arrayFilter before the arrayJoin, in order to
        reduce the size of the arrayJoin.

        :param parsed_col: its ``col_name`` is split on the first underscore
            into the nested column name and a 'key' / 'value' selector.
        :param query: used to find the referenced columns and filter tags.
        :param parsing_context: handed to alias_expr so the 'all_tags'
            expression can be referred to by alias.
        :param table_alias: alias used to qualify the column names.
        :return: the expression as a string.
        """
        col, k_or_v = parsed_col.col_name.split("_", 1)
        wants_key = k_or_v == "key"
        nested_only = state.get_config("nested_tags_only", 1)
        nested_col = qualified_column(col, table_alias)

        # Build two parallel lists (keys and values) to arrayJoin on.
        if nested_only:
            key_list = f"{nested_col}.key"
            val_list = f"{nested_col}.value"
        else:
            # Promoted columns are prepended to the nested ones, using the
            # tag name from the column/tag map when there is one.
            promoted = self.__promoted_columns[col]
            tag_names = self.__column_tag_map[col]
            promoted_keys = ", ".join(
                f"'{tag_names.get(name, name)}'" for name in promoted
            )
            key_list = f"arrayConcat([{promoted_keys}], {nested_col}.key)"
            promoted_values = ", ".join(
                self.__string_col(name) for name in promoted
            )
            val_list = f"arrayConcat([{promoted_values}], {nested_col}.value)"

        key_and_value = {
            qualified_column("tags_key", table_alias),
            qualified_column("tags_value", table_alias),
        }
        cols_used = query.get_all_referenced_columns() & key_and_value

        # Comma separated, single-quoted tags to keep; empty when the query
        # provides none.
        filter_tags = ",".join(
            f"'{tag}'" for tag in self.__get_filter_tags(query)
        )

        if len(cols_used) != 2:
            # Only one of tags_key / tags_value is ever used: skip the k/v
            # tuples and the all_tags alias, we would never re-use them.
            if filter_tags:
                # NOTE(review): this branch always filters key_list, even
                # when the value column was requested - confirm intended.
                return f"arrayJoin(arrayFilter(tag -> tag IN ({filter_tags}), {key_list}))"
            single_list = key_list if wants_key else val_list
            return f"arrayJoin({single_list})"

        # Both tags_key and tags_value are used in this query, so arrayJoin
        # on (key, value) tag tuples, filtered on the key when possible.
        pairs = f"arrayMap((x,y) -> [x,y], {key_list}, {val_list})"
        if filter_tags:
            pairs = f"arrayFilter(pair -> pair[1] IN ({filter_tags}), {pairs})"

        # Store the all_tags expression in the alias cache: next time it can
        # be referred to by alias (eg. 'all_tags[1] AS tags_key') instead of
        # expanding the whole tags expression again.
        aliased = alias_expr(f"arrayJoin({pairs})", "all_tags", parsing_context)
        return f"({aliased})[{1 if wants_key else 2}]"
Exemplo n.º 6
0
def test_referenced_columns():
    """
    Check that Query.get_all_referenced_columns finds the columns used in
    flat and nested conditions, in function expressions and in the other
    fields of the query body.
    """
    source = (
        get_dataset('events')
        .get_dataset_schemas()
        .get_read_schema()
        .get_data_source()
    )

    cases = [
        # a = 1 AND b = 1
        (
            {'conditions': [['a', '=', '1'], ['b', '=', '1']]},
            {'a', 'b'},
        ),
        # a = 1 AND (b = 1 OR c = 1)
        (
            {'conditions': [
                ['a', '=', '1'],
                [['b', '=', '1'], ['c', '=', '1']],
            ]},
            {'a', 'b', 'c'},
        ),
        # a = 1 AND (b = 1 OR foo(c) = 1)
        (
            {'conditions': [
                ['a', '=', '1'],
                [['b', '=', '1'], [['foo', ['c']], '=', '1']],
            ]},
            {'a', 'b', 'c'},
        ),
        # a = 1 AND (b = 1 OR foo(c, bar(d)) = 1)
        (
            {'conditions': [
                ['a', '=', '1'],
                [['b', '=', '1'], [['foo', ['c', ['bar', ['d']]]], '=', '1']],
            ]},
            {'a', 'b', 'c', 'd'},
        ),
        # Other fields, including expressions in selected columns
        (
            {
                'arrayjoin': 'tags_key',
                'groupby': ['time', 'issue'],
                'orderby': '-time',
                'selected_columns': [
                    'issue',
                    'time',
                    ['foo', ['c', ['bar', ['d']]]],  # foo(c, bar(d))
                ],
                'aggregations': [['uniq', 'tags_value', 'values_seen']],
            },
            {'tags_key', 'tags_value', 'time', 'issue', 'c', 'd'},
        ),
    ]

    for body, expected in cases:
        query = Query(body, source)
        assert query.get_all_referenced_columns() == expected