Example #1
def test_time_split_ast() -> None:
    """
    Test that the time split transforms the query properly both on the old representation
    and on the AST representation.
    """
    found_timestamps = []

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert isinstance(from_date_ast, datetime)
        assert isinstance(to_date_ast, datetime)

        found_timestamps.append(
            (from_date_ast.isoformat(), to_date_ast.isoformat()))

        return QueryResult({"data": []}, {})

    body = {
        "selected_columns": [
            "event_id",
            "level",
            "logger",
            "server_name",
            "transaction",
            "timestamp",
            "project_id",
        ],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            ("project_id", "IN", [1]),
        ],
        "limit":
        10,
        "orderby": ["-timestamp"],
    }

    query = parse_query(body, get_dataset("events"))
    entity = get_entity(query.get_from_clause().key)
    settings = HTTPRequestSettings()
    for p in entity.get_query_processors():
        p.process_query(query, settings)

    clickhouse_query = identity_translate(query)
    splitter = TimeSplitQueryStrategy("timestamp")
    splitter.execute(clickhouse_query, settings, do_query)

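    # TimeSplitQueryStrategy walks the range newest-first, widening each
    # window (here 1 hour, then 10 hours, then the remainder) until the
    # row limit is satisfied, so the sub-queries arrive in this order.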
    assert found_timestamps == [
        ("2019-09-19T11:00:00", "2019-09-19T12:00:00"),
        ("2019-09-19T01:00:00", "2019-09-19T11:00:00"),
        ("2019-09-18T10:00:00", "2019-09-19T01:00:00"),
    ]
Example #2
def test_no_split(dataset_name: str, id_column: str, project_column: str,
                  timestamp_column: str) -> None:
    events = get_dataset(dataset_name)
    storage = events.get_default_entity().get_all_storages()[0]
    query = ClickhouseQuery(storage.get_schema().get_data_source())

    def do_query(
        clickhouse_query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader,
    ) -> QueryResult:
        # The query must reach the executor unchanged, i.e. not split.
        assert clickhouse_query == query
        return QueryResult({}, {})

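    # ClickhouseCluster positional args: host, port, user, password,
    # database, HTTP port, storage set keys, single-node flag.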
    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80,
                          set(), True),
        [],
        [
            ColumnSplitQueryStrategy(
                id_column=id_column,
                project_column=project_column,
                timestamp_column=timestamp_column,
            ),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPRequestSettings(), do_query)
Example #3
def test_time_split_ast() -> None:
    """
    Test that the time split transforms the query properly both on the old representation
    and on the AST representation.
    """
    found_timestamps = []

    def do_query(
        query: ClickhouseQuery,
        query_settings: QuerySettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert isinstance(from_date_ast, datetime)
        assert isinstance(to_date_ast, datetime)

        found_timestamps.append(
            (from_date_ast.isoformat(), to_date_ast.isoformat()))

        return QueryResult({"data": []}, {})

    body = """
        MATCH (events)
        SELECT event_id, level, logger, server_name, transaction, timestamp, project_id
        WHERE timestamp >= toDateTime('2019-09-18T10:00:00')
        AND timestamp < toDateTime('2019-09-19T12:00:00')
        AND project_id IN tuple(1)
        ORDER BY timestamp DESC
        LIMIT 10
        """

    query, _ = parse_snql_query(body, get_dataset("events"))
    entity = get_entity(query.get_from_clause().key)
    settings = HTTPQuerySettings()
    for p in entity.get_query_processors():
        p.process_query(query, settings)

    clickhouse_query = identity_translate(query)
    splitter = TimeSplitQueryStrategy("timestamp")
    splitter.execute(clickhouse_query, settings, do_query)

    assert found_timestamps == [
        ("2019-09-19T11:00:00", "2019-09-19T12:00:00"),
        ("2019-09-19T01:00:00", "2019-09-19T11:00:00"),
        ("2019-09-18T10:00:00", "2019-09-19T01:00:00"),
    ]
Example #4
def test_col_split(
    dataset_name: str,
    id_column: str,
    project_column: str,
    timestamp_column: str,
    first_query_data: Sequence[MutableMapping[str, Any]],
    second_query_data: Sequence[MutableMapping[str, Any]],
) -> None:
    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader[SqlQuery],
    ) -> QueryResult:
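        # The column split issues two queries: a first pass over the minimal
        # id/project/timestamp columns, then a second pass fetching the full
        # column set for the rows the first pass found.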
        selected_cols = query.get_selected_columns()
        assert selected_cols == [
            c.expression.column_name
            for c in query.get_selected_columns_from_ast() or []
            if isinstance(c.expression, Column)
        ]
        if selected_cols == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, {})
        elif selected_cols == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, {})
        else:
            raise ValueError(f"Unexpected selected columns: {selected_cols}")

    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        LogicalQuery(
            {
                "selected_columns": list(second_query_data[0].keys()),
                "conditions": [""],
                "orderby": "events.event_id",
                "sample": 10,
                "limit": 100,
                "offset": 50,
            },
            events.get_all_storages()[0].get_schema().get_data_source(),
            selected_columns=[
                SelectedExpression(name=col_name,
                                   expression=Column(None, None, col_name))
                for col_name in second_query_data[0].keys()
            ],
        ))

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80,
                          set(), True),
        [],
        [
            ColumnSplitQueryStrategy(id_column, project_column,
                                     timestamp_column),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPRequestSettings(), do_query)
Example #5
def test_col_split(
    dataset_name: str,
    id_column: str,
    project_column: str,
    timestamp_column: str,
    first_query_data: Sequence[MutableMapping[str, Any]],
    second_query_data: Sequence[MutableMapping[str, Any]],
) -> None:
    def do_query(
        query: ClickhouseQuery,
        query_settings: QuerySettings,
        reader: Reader,
    ) -> QueryResult:
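        # Extract plain column names from the SelectedExpression wrappers.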
        selected_col_names = [
            c.expression.column_name
            for c in query.get_selected_columns() or []
            if isinstance(c.expression, Column)
        ]
        if selected_col_names == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, {})
        elif selected_col_names == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, {})
        else:
            raise ValueError(
                f"Unexpected selected columns: {selected_col_names}")

    events = get_dataset(dataset_name)
    storage = events.get_default_entity().get_all_storages()[0]
    query = ClickhouseQuery(
        storage.get_schema().get_data_source(),
        selected_columns=[
            SelectedExpression(name=col_name,
                               expression=Column(None, None, col_name))
            for col_name in second_query_data[0].keys()
        ],
    )

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80,
                          set(), True),
        [],
        [
            ColumnSplitQueryStrategy(id_column, project_column,
                                     timestamp_column),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPQuerySettings(), do_query)
Example #6
def test_no_split(
    dataset_name: str, id_column: str, project_column: str, timestamp_column: str
) -> None:
    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        LogicalQuery(
            {
                "selected_columns": ["event_id"],
                "conditions": [""],
                "orderby": "event_id",
                "sample": 10,
                "limit": 100,
                "offset": 50,
            },
            events.get_all_storages()[0].get_schema().get_data_source(),
        )
    )

    def do_query(
        clickhouse_query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader[SqlQuery],
    ) -> QueryResult:
        # The query must reach the executor unchanged, i.e. not split.
        assert clickhouse_query == query
        return QueryResult({}, {})

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(
                id_column=id_column,
                project_column=project_column,
                timestamp_column=timestamp_column,
            ),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPRequestSettings(), do_query)
Example #7
    # during create statement
    # (https://github.com/ClickHouse/ClickHouse/issues/12586), so the
    # materialization is added with a migration.
    skipped_cols_on_creation={"_tags_hash_map"},
)


storage = WritableTableStorage(
    storage_key=StorageKey.TRANSACTIONS,
    storage_set_key=StorageSetKey.TRANSACTIONS,
    schema=schema,
    query_processors=[
        NestedFieldConditionOptimizer(
            "contexts",
            "_contexts_flattened",
            {"start_ts", "finish_ts"},
            BEGINNING_OF_TIME,
        ),
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        TransactionColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        ArrayJoinKeyValueOptimizer("measurements"),
        PrewhereProcessor(),
    ],
    stream_loader=KafkaStreamLoader(
        processor=TransactionsMessageProcessor(), default_topic="events",
    ),
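    # Split long time ranges into progressively larger windows over finish_ts.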
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")],
    writer_options={"insert_allow_materialized_columns": 1},
)
Example #8
                    "sentry:dist": "dist",
                    "sentry:user": "******",
                },
                "contexts": {"trace.trace_id": "trace_id"},
            }
        ),
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        ArrayJoinKeyValueOptimizer("tags"),
        UUIDColumnProcessor(set(["event_id", "trace_id"])),
        EventsBooleanContextsProcessor(),
        PrewhereProcessor(
            [
                "event_id",
                "release",
                "message",
                "transaction_name",
                "environment",
                "project_id",
            ]
        ),
    ],
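    # Splitters are attempted in order: the column split first, falling back
    # to the plain time split.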
    query_splitters=[
        ColumnSplitQueryStrategy(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="timestamp",
        ),
        TimeSplitQueryStrategy(timestamp_col="timestamp"),
    ],
)
Example #9
])

schema = TableSchema(
    columns=columns,
    local_table_name="discover_local",
    dist_table_name="discover_dist",
    storage_set_key=StorageSetKey.DISCOVER,
    mandatory_conditions=mandatory_conditions,
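    # Conditions on these columns may be promoted into ClickHouse's PREWHERE clause.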
    prewhere_candidates=[
        "event_id",
        "release",
        "message",
        "transaction_name",
        "environment",
        "project_id",
    ],
)

storage = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema,
    query_processors=[
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        EventIdColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        PrewhereProcessor(),
    ],
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="timestamp")],
)
Example #10
def test_time_split_ast() -> None:
    """
    Test that the time split transforms the query properly both on the old representation
    and on the AST representation.
    """
    found_timestamps = []

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert isinstance(from_date_ast, datetime)
        assert isinstance(to_date_ast, datetime)

        conditions = query.get_conditions() or []
        from_date_str = next(
            (condition[2] for condition in conditions
             if condition[0] == "timestamp" and condition[1] == ">="),
            None,
        )
        to_date_str = next(
            (condition[2] for condition in conditions
             if condition[0] == "timestamp" and condition[1] == "<"),
            None,
        )
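        # The legacy conditions must carry the same split window as the AST.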
        assert from_date_str == from_date_ast.isoformat()
        assert to_date_str == to_date_ast.isoformat()

        found_timestamps.append(
            (from_date_ast.isoformat(), to_date_ast.isoformat()))

        return QueryResult({"data": []}, {})

    body = {
        "selected_columns": [
            "event_id",
            "level",
            "logger",
            "server_name",
            "transaction",
            "timestamp",
            "project_id",
        ],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            ("project_id", "IN", [1]),
        ],
        "limit":
        10,
        "orderby": ["-timestamp"],
    }

    events = get_dataset("events")
    query = parse_query(body, events)

    splitter = TimeSplitQueryStrategy("timestamp")
    splitter.execute(ClickhouseQuery(query), HTTPRequestSettings(), do_query)

    assert found_timestamps == [
        ("2019-09-19T11:00:00", "2019-09-19T12:00:00"),
        ("2019-09-19T01:00:00", "2019-09-19T11:00:00"),
        ("2019-09-18T10:00:00", "2019-09-19T01:00:00"),
    ]