Пример #1
0
 def __forward_migrations(
         self, table_name: str) -> Sequence[operations.Operation]:
     return [
         operations.ModifyColumn(
             StorageSetKey.QUERYLOG,
             table_name,
             Column("status", LowCardinality(String())),
         ),
         operations.ModifyColumn(
             StorageSetKey.QUERYLOG,
             table_name,
             Column("clickhouse_queries.status",
                    Array(LowCardinality(String()))),
         ),
     ]
Пример #2
0
    def __init__(self):
        read_columns = ColumnSet([
            ('org_id', UInt(64)),
            ('project_id', UInt(64)),
            ('key_id', Nullable(UInt(64))),
            ('timestamp', DateTime()),
            ('outcome', UInt(8)),
            ('reason', LowCardinality(Nullable(String()))),
            ('event_id', Nullable(UUID())),
        ])

        read_schema = MergeTreeSchema(
            columns=read_columns,
            local_table_name='outcomes_raw_local',
            dist_table_name='outcomes_raw_dist',
            order_by='(org_id, project_id, timestamp)',
            partition_by='(toMonday(timestamp))',
            settings={'index_granularity': 16384})

        dataset_schemas = DatasetSchemas(read_schema=read_schema,
                                         write_schema=None,
                                         intermediary_schemas=[])

        super().__init__(dataset_schemas=dataset_schemas,
                         time_group_columns={
                             'time': 'timestamp',
                         },
                         time_parse_columns=('timestamp', ))
Пример #3
0
    def __init__(self) -> None:
        read_columns = ColumnSet([
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("key_id", Nullable(UInt(64))),
            ("timestamp", DateTime()),
            ("outcome", UInt(8)),
            ("reason", LowCardinality(Nullable(String()))),
            ("event_id", Nullable(UUID())),
        ])

        read_schema = MergeTreeSchema(
            columns=read_columns,
            local_table_name="outcomes_raw_local",
            dist_table_name="outcomes_raw_dist",
            order_by="(org_id, project_id, timestamp)",
            partition_by="(toMonday(timestamp))",
            settings={"index_granularity": 16384},
            migration_function=outcomes_raw_migrations,
        )

        dataset_schemas = DatasetSchemas(read_schema=read_schema,
                                         write_schema=None,
                                         intermediary_schemas=[])

        super().__init__(
            dataset_schemas=dataset_schemas,
            time_group_columns={"time": "timestamp"},
            time_parse_columns=("timestamp", ),
        )
Пример #4
0
 def forwards_dist(self) -> Sequence[operations.Operation]:
     return [
         operations.AddColumn(
             storage_set=StorageSetKey.TRANSACTIONS,
             table_name="transactions_dist",
             column=Column(
                 "measurements",
                 Nested([("key", LowCardinality(String())), ("value", Float(64))]),
             ),
             after="_contexts_flattened",
         ),
     ]
Пример #5
0
 def forwards_local(self) -> Sequence[operations.Operation]:
     return [
         operations.AddColumn(
             storage_set=StorageSetKey.EVENTS,
             table_name="errors_local",
             column=Column("http_method",
                           LowCardinality(Nullable(String()))),
             after="sdk_version",
         ),
         operations.AddColumn(
             storage_set=StorageSetKey.EVENTS,
             table_name="errors_local",
             column=Column("http_referer", Nullable(String())),
             after="http_method",
         ),
     ]
 def forwards_dist(self) -> Sequence[operations.Operation]:
     return [
         operations.AddColumn(
             storage_set=StorageSetKey.TRANSACTIONS,
             table_name="transactions_dist",
             column=Column("http_method",
                           LowCardinality(Nullable(String()))),
             after="sdk_version",
         ),
         operations.AddColumn(
             storage_set=StorageSetKey.TRANSACTIONS,
             table_name="transactions_dist",
             column=Column("http_referer", Nullable(String())),
             after="http_method",
         ),
     ]
Пример #7
0
 def __forward_migrations(
         self, table_name: str) -> Sequence[operations.Operation]:
     return [
         operations.AddColumn(
             storage_set=StorageSetKey.QUERYLOG,
             table_name=table_name,
             column=Column(
                 "clickhouse_queries.all_columns",
                 WithDefault(
                     Array(Array(LowCardinality(String()))),
                     "arrayResize([['']], length(clickhouse_queries.sql))",
                 ),
             ),
             after="clickhouse_queries.consistent",
         ),
         operations.AddColumn(
             storage_set=StorageSetKey.QUERYLOG,
             table_name=table_name,
             column=Column(
                 "clickhouse_queries.or_conditions",
                 WithDefault(
                     Array(UInt(8)),
                     "arrayResize([0], length(clickhouse_queries.sql))",
                 ),
             ),
             after="clickhouse_queries.all_columns",
         ),
         operations.AddColumn(
             storage_set=StorageSetKey.QUERYLOG,
             table_name=table_name,
             column=Column(
                 "clickhouse_queries.where_columns",
                 WithDefault(
                     Array(Array(LowCardinality(String()))),
                     "arrayResize([['']], length(clickhouse_queries.sql))",
                 ),
             ),
             after="clickhouse_queries.or_conditions",
         ),
         operations.AddColumn(
             storage_set=StorageSetKey.QUERYLOG,
             table_name=table_name,
             column=Column(
                 "clickhouse_queries.where_mapping_columns",
                 WithDefault(
                     Array(Array(LowCardinality(String()))),
                     "arrayResize([['']], length(clickhouse_queries.sql))",
                 ),
             ),
             after="clickhouse_queries.where_columns",
         ),
         operations.AddColumn(
             storage_set=StorageSetKey.QUERYLOG,
             table_name=table_name,
             column=Column(
                 "clickhouse_queries.groupby_columns",
                 WithDefault(
                     Array(Array(LowCardinality(String()))),
                     "arrayResize([['']], length(clickhouse_queries.sql))",
                 ),
             ),
             after="clickhouse_queries.where_mapping_columns",
         ),
         operations.AddColumn(
             storage_set=StorageSetKey.QUERYLOG,
             table_name=table_name,
             column=Column(
                 "clickhouse_queries.array_join_columns",
                 WithDefault(
                     Array(Array(LowCardinality(String()))),
                     "arrayResize([['']], length(clickhouse_queries.sql))",
                 ),
             ),
             after="clickhouse_queries.groupby_columns",
         ),
     ]
Пример #8
0
def transactions_migrations(
    clickhouse_table: str, current_schema: Mapping[str, ColumnType]
) -> Sequence[str]:
    ret = []
    duration_col = current_schema.get("duration")
    if duration_col and Materialized in duration_col.get_all_modifiers():
        ret.append("ALTER TABLE %s MODIFY COLUMN duration UInt32" % clickhouse_table)

    if "sdk_name" not in current_schema:
        ret.append(
            "ALTER TABLE %s ADD COLUMN sdk_name LowCardinality(String) DEFAULT ''"
            % clickhouse_table
        )

    if "sdk_version" not in current_schema:
        ret.append(
            "ALTER TABLE %s ADD COLUMN sdk_version LowCardinality(String) DEFAULT ''"
            % clickhouse_table
        )

    if "transaction_status" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN transaction_status UInt8 DEFAULT {UNKNOWN_SPAN_STATUS} AFTER transaction_op"
        )

    if "_tags_flattened" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _tags_flattened String DEFAULT ''"
        )

    if "_contexts_flattened" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _contexts_flattened String DEFAULT ''"
        )

    if "_start_date" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _start_date Date MATERIALIZED toDate(start_ts) AFTER start_ms"
        )

    if "_finish_date" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _finish_date Date MATERIALIZED toDate(finish_ts) AFTER finish_ms"
        )

    if "user_hash" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN user_hash UInt64 MATERIALIZED cityHash64(user) AFTER user"
        )

    low_cardinality_cols = [
        "transaction_name",
        "release",
        "dist",
        "sdk_name",
        "sdk_version",
        "environment",
    ]
    for col_name in low_cardinality_cols:
        col = current_schema.get(col_name)

        if col and LowCardinality not in col.get_all_modifiers():
            if isinstance(col, WithDefault):
                col.inner_type = LowCardinality(col.inner_type)
            else:
                col = LowCardinality(col)
            ret.append(
                f"ALTER TABLE {clickhouse_table} MODIFY COLUMN {col_name} {col.for_schema()}"
            )

    return ret
Пример #9
0
    def __init__(self) -> None:
        columns = ColumnSet(
            [
                ("project_id", UInt(64)),
                ("event_id", UUID()),
                ("trace_id", UUID()),
                ("span_id", UInt(64)),
                ("transaction_name", LowCardinality(String())),
                (
                    "transaction_hash",
                    Materialized(UInt(64), "cityHash64(transaction_name)",),
                ),
                ("transaction_op", LowCardinality(String())),
                ("transaction_status", WithDefault(UInt(8), UNKNOWN_SPAN_STATUS)),
                ("start_ts", DateTime()),
                ("start_ms", UInt(16)),
                ("_start_date", Materialized(Date(), "toDate(start_ts)"),),
                ("finish_ts", DateTime()),
                ("finish_ms", UInt(16)),
                ("_finish_date", Materialized(Date(), "toDate(finish_ts)"),),
                ("duration", UInt(32)),
                ("platform", LowCardinality(String())),
                ("environment", LowCardinality(Nullable(String()))),
                ("release", LowCardinality(Nullable(String()))),
                ("dist", LowCardinality(Nullable(String()))),
                ("ip_address_v4", Nullable(IPv4())),
                ("ip_address_v6", Nullable(IPv6())),
                ("user", WithDefault(String(), "''",)),
                ("user_hash", Materialized(UInt(64), "cityHash64(user)"),),
                ("user_id", Nullable(String())),
                ("user_name", Nullable(String())),
                ("user_email", Nullable(String())),
                ("sdk_name", WithDefault(LowCardinality(String()), "''")),
                ("sdk_version", WithDefault(LowCardinality(String()), "''")),
                ("tags", Nested([("key", String()), ("value", String())])),
                ("_tags_flattened", String()),
                ("contexts", Nested([("key", String()), ("value", String())])),
                ("_contexts_flattened", String()),
                ("partition", UInt(16)),
                ("offset", UInt(64)),
                ("retention_days", UInt(16)),
                ("deleted", UInt(8)),
            ]
        )

        schema = ReplacingMergeTreeSchema(
            columns=columns,
            local_table_name="transactions_local",
            dist_table_name="transactions_dist",
            mandatory_conditions=[],
            prewhere_candidates=["event_id", "project_id"],
            order_by="(project_id, _finish_date, transaction_name, cityHash64(span_id))",
            partition_by="(retention_days, toMonday(_finish_date))",
            version_column="deleted",
            sample_expr=None,
            migration_function=transactions_migrations,
        )

        dataset_schemas = DatasetSchemas(read_schema=schema, write_schema=schema,)

        self.__tags_processor = TagColumnProcessor(
            columns=columns,
            promoted_columns=self._get_promoted_columns(),
            column_tag_map=self._get_column_tag_map(),
        )

        super().__init__(
            dataset_schemas=dataset_schemas,
            table_writer=TransactionsTableWriter(
                write_schema=schema,
                stream_loader=KafkaStreamLoader(
                    processor=TransactionsMessageProcessor(), default_topic="events",
                ),
            ),
            time_group_columns={
                "bucketed_start": "start_ts",
                "bucketed_end": "finish_ts",
            },
            time_parse_columns=("start_ts", "finish_ts"),
        )
Пример #10
0
    def __init__(self) -> None:
        self.__common_columns = ColumnSet(
            [
                ("event_id", FixedString(32)),
                ("project_id", UInt(64)),
                ("type", Nullable(String())),
                ("timestamp", DateTime()),
                ("platform", Nullable(String())),
                ("environment", Nullable(String())),
                ("release", Nullable(String())),
                ("dist", Nullable(String())),
                ("user", Nullable(String())),
                ("transaction", Nullable(String())),
                ("message", Nullable(String())),
                ("title", Nullable(String())),
                # User
                ("user_id", Nullable(String())),
                ("username", Nullable(String())),
                ("email", Nullable(String())),
                ("ip_address", Nullable(String())),
                # SDK
                ("sdk_name", Nullable(String())),
                ("sdk_version", Nullable(String())),
                # geo location context
                ("geo_country_code", Nullable(String())),
                ("geo_region", Nullable(String())),
                ("geo_city", Nullable(String())),
                ("http_method", Nullable(String())),
                ("http_referer", Nullable(String())),
                # Other tags and context
                ("tags", Nested([("key", String()), ("value", String())])),
                ("contexts", Nested([("key", String()), ("value", String())])),
            ]
        )

        self.__events_columns = ColumnSet(
            [
                ("group_id", Nullable(UInt(64))),
                ("primary_hash", Nullable(FixedString(32))),
                # Promoted tags
                ("level", Nullable(String())),
                ("logger", Nullable(String())),
                ("server_name", Nullable(String())),
                ("site", Nullable(String())),
                ("url", Nullable(String())),
                ("search_message", Nullable(String())),
                ("location", Nullable(String())),
                ("culprit", Nullable(String())),
                ("received", Nullable(DateTime())),
                ("sdk_integrations", Nullable(Array(String()))),
                ("version", Nullable(String())),
                # exception interface
                (
                    "exception_stacks",
                    Nested(
                        [
                            ("type", Nullable(String())),
                            ("value", Nullable(String())),
                            ("mechanism_type", Nullable(String())),
                            ("mechanism_handled", Nullable(UInt(8))),
                        ]
                    ),
                ),
                (
                    "exception_frames",
                    Nested(
                        [
                            ("abs_path", Nullable(String())),
                            ("filename", Nullable(String())),
                            ("package", Nullable(String())),
                            ("module", Nullable(String())),
                            ("function", Nullable(String())),
                            ("in_app", Nullable(UInt(8))),
                            ("colno", Nullable(UInt(32))),
                            ("lineno", Nullable(UInt(32))),
                            ("stack_level", UInt(16)),
                        ]
                    ),
                ),
                ("modules", Nested([("name", String()), ("version", String())])),
            ]
        )

        self.__transactions_columns = ColumnSet(
            [
                ("trace_id", Nullable(UUID())),
                ("span_id", Nullable(UInt(64))),
                ("transaction_hash", Nullable(UInt(64))),
                ("transaction_op", Nullable(String())),
                ("transaction_status", Nullable(UInt(8))),
                ("duration", Nullable(UInt(32))),
                (
                    "measurements",
                    Nested([("key", LowCardinality(String())), ("value", Float(64))]),
                ),
            ]
        )

        events_storage = get_storage(StorageKey.EVENTS)
        events_ro_storage = get_storage(StorageKey.EVENTS_RO)
        transactions_storage = get_storage(StorageKey.TRANSACTIONS)

        self.__time_group_columns: Mapping[str, str] = {}
        self.__time_parse_columns = ("timestamp",)

        super().__init__(
            storages=[events_storage, transactions_storage],
            query_plan_builder=SelectedStorageQueryPlanBuilder(
                selector=DiscoverQueryStorageSelector(
                    events_table=events_storage,
                    events_ro_table=events_ro_storage,
                    abstract_events_columns=self.__events_columns,
                    transactions_table=transactions_storage,
                    abstract_transactions_columns=self.__transactions_columns,
                ),
            ),
            abstract_column_set=(
                self.__common_columns
                + self.__events_columns
                + self.__transactions_columns
            ),
            writable_storage=None,
        )
Пример #11
0
    def __init__(self):
        columns = ColumnSet([
            ('project_id', UInt(64)),
            ('event_id', UUID()),
            ('trace_id', UUID()),
            ('span_id', UInt(64)),
            ('transaction_name', String()),
            ('transaction_hash',
             Materialized(
                 UInt(64),
                 'cityHash64(transaction_name)',
             )),
            ('transaction_op', LowCardinality(String())),
            ('start_ts', DateTime()),
            ('start_ms', UInt(16)),
            ('finish_ts', DateTime()),
            ('finish_ms', UInt(16)),
            ('duration',
             Materialized(
                 UInt(32),
                 '((finish_ts - start_ts) * 1000) + (finish_ms - start_ms)',
             )),
            ('platform', LowCardinality(String())),
            ('environment', Nullable(String())),
            ('release', Nullable(String())),
            ('dist', Nullable(String())),
            ('ip_address_v4', Nullable(IPv4())),
            ('ip_address_v6', Nullable(IPv6())),
            ('user', WithDefault(
                String(),
                "''",
            )),
            ('user_id', Nullable(String())),
            ('user_name', Nullable(String())),
            ('user_email', Nullable(String())),
            ('tags', Nested([
                ('key', String()),
                ('value', String()),
            ])),
            ('contexts', Nested([
                ('key', String()),
                ('value', String()),
            ])),
            ('partition', UInt(16)),
            ('offset', UInt(64)),
            ('retention_days', UInt(16)),
            ('deleted', UInt(8)),
        ])

        schema = ReplacingMergeTreeSchema(
            columns=columns,
            local_table_name='transactions_local',
            dist_table_name='transactions_dist',
            order_by=
            '(project_id, toStartOfDay(start_ts), transaction_hash, start_ts, start_ms, trace_id, span_id)',
            partition_by='(retention_days, toMonday(start_ts))',
            version_column='deleted',
            sample_expr=None,
        )

        dataset_schemas = DatasetSchemas(
            read_schema=schema,
            write_schema=schema,
        )

        super().__init__(
            dataset_schemas=dataset_schemas,
            processor=TransactionsMessageProcessor(),
            default_topic="events",
            time_group_columns={
                'bucketed_start': 'start_ts',
                'bucketed_end': 'finish_ts',
            },
        )
Пример #12
0
    String,
    UInt,
    WithDefault,
)
from snuba.clusters.storage_sets import StorageSetKey
from snuba.migrations import migration, operations, table_engines


UNKNOWN_SPAN_STATUS = 2

columns = [
    Column("project_id", UInt(64)),
    Column("event_id", UUID()),
    Column("trace_id", UUID()),
    Column("span_id", UInt(64)),
    Column("transaction_name", LowCardinality(String())),
    Column("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)")),
    Column("transaction_op", LowCardinality(String())),
    Column("transaction_status", WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS))),
    Column("start_ts", DateTime()),
    Column("start_ms", UInt(16)),
    Column("finish_ts", DateTime()),
    Column("finish_ms", UInt(16)),
    Column("duration", UInt(32)),
    Column("platform", LowCardinality(String())),
    Column("environment", LowCardinality(Nullable(String()))),
    Column("release", LowCardinality(Nullable(String()))),
    Column("dist", LowCardinality(Nullable(String()))),
    Column("ip_address_v4", Nullable(IPv4())),
    Column("ip_address_v6", Nullable(IPv6())),
    Column("user", WithDefault(String(), "''",)),
Пример #13
0
  AggregateFunction("uniq", UInt(8))),
 (("AggregateFunction(countIf, UUID, UInt8)", "", "", ""),
  AggregateFunction("countIf", UUID(), UInt(8))),
 (("AggregateFunction(quantileIf(0.5, 0.9), UInt32, UInt8)", "", "", ""),
  AggregateFunction("quantileIf(0.5, 0.9)", UInt(32), UInt(8))),
 # Array
 (("Array(String)", "", "", ""), Array(String())),
 (("Array(DateTime)", "", "", ""), Array(DateTime())),
 (("Array(UInt64)", "", "", ""), Array(UInt(64))),
 (("Array(Nullable(UUID))", "", "", ""), Array(Nullable(UUID()))),
 # Nullable
 (("Nullable(String)", "", "", ""), Nullable(String())),
 (("Nullable(FixedString(8))", "", "", ""), Nullable(FixedString(8))),
 (("Nullable(Date)", "", "", ""), Nullable(Date())),
 # Low cardinality
 (("LowCardinality(String)", "", "", ""), LowCardinality(String())),
 (("LowCardinality(Nullable(String))", "", "", ""),
  LowCardinality(Nullable(String()))),
 # Materialized
 (("Date", "MATERIALIZED", "toDate(col1)", ""),
  Materialized(Date(), "toDate(col1)")),
 (("UInt64", "MATERIALIZED", "CAST(cityHash64(col1), 'UInt64')", ""),
  Materialized(UInt(64), "cityHash64(col1)")),
 # Default value
 (("LowCardinality(String)", "DEFAULT", "a", ""),
  WithDefault(LowCardinality(String()), "a")),
 (("UInt8", "DEFAULT", "2", ""), WithDefault(UInt(8), "2")),
 # With codecs
 (("UUID", "", "", "NONE"), WithCodecs(UUID(), ["NONE"])),
 (("DateTime", "", "", "DoubleDelta, LZ4"),
  WithCodecs(DateTime(), ["DoubleDelta", "LZ4"])),
Пример #14
0
    if "http_referer" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN http_referer Nullable(String) AFTER http_method"
        )

    return ret


columns = ColumnSet(
    [
        ("project_id", UInt(64)),
        ("event_id", UUID()),
        ("trace_id", UUID()),
        ("span_id", UInt(64)),
        ("transaction_name", LowCardinality(String())),
        ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)",),),
        ("transaction_op", LowCardinality(String())),
        ("transaction_status", WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS))),
        ("start_ts", DateTime()),
        ("start_ms", UInt(16)),
        ("finish_ts", DateTime()),
        ("finish_ms", UInt(16)),
        ("duration", UInt(32)),
        ("platform", LowCardinality(String())),
        ("environment", LowCardinality(Nullable(String()))),
        ("release", LowCardinality(Nullable(String()))),
        ("dist", LowCardinality(Nullable(String()))),
        ("ip_address_v4", Nullable(IPv4())),
        ("ip_address_v6", Nullable(IPv6())),
        ("user", WithDefault(String(), "''",)),
Пример #15
0
from snuba.datasets.spans_processor import UNKNOWN_SPAN_STATUS, SpansMessageProcessor
from snuba.datasets.storage import WritableTableStorage
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.tags_hash_map import TAGS_HASH_MAP_COLUMN
from snuba.datasets.table_storage import KafkaStreamLoader
from snuba.query.processors.prewhere import PrewhereProcessor
from snuba.web.split import TimeSplitQueryStrategy

columns = ColumnSet([
    ("project_id", UInt(64)),
    ("transaction_id", UUID()),
    ("trace_id", UUID()),
    ("transaction_span_id", UInt(64)),
    ("span_id", UInt(64)),
    ("parent_span_id", Nullable(UInt(64))),
    ("transaction_name", LowCardinality(String())),
    ("description", String()),  # description in span
    ("op", LowCardinality(String())),
    (
        "status",
        WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS)),
    ),
    ("start_ts", DateTime()),
    ("start_ns", UInt(32)),
    ("finish_ts", DateTime()),
    ("finish_ns", UInt(32)),
    ("duration_ms", UInt(32)),
    ("tags", Nested([("key", String()), ("value", String())])),
    ("_tags_hash_map", Materialized(Array(UInt(64)), TAGS_HASH_MAP_COLUMN)),
    ("retention_days", UInt(16)),
    ("deleted", UInt(8)),
Пример #16
0
READ_LOCAL_MV_NAME = "sessions_hourly_mv_local"
READ_DIST_MV_NAME = "sessions_hourly_mv_dist"

all_columns = ColumnSet([
    ("session_id", UUID()),
    ("distinct_id", UUID()),
    ("seq", UInt(64)),
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("retention_days", UInt(16)),
    ("duration", UInt(32)),
    ("status", UInt(8)),
    ("errors", UInt(16)),
    ("received", DateTime()),
    ("started", DateTime()),
    ("release", LowCardinality(String())),
    ("environment", LowCardinality(String())),
])

raw_schema = MergeTreeSchema(
    columns=all_columns,
    local_table_name=WRITE_LOCAL_TABLE_NAME,
    dist_table_name=WRITE_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.SESSIONS,
    order_by="(org_id, project_id, release, environment, started)",
    partition_by="(toMonday(started))",
    settings={"index_granularity": "16384"},
)

read_columns = ColumnSet([
    ("org_id", UInt(64)),
Пример #17
0
 def visit_lowcardinality(self, node: Node,
                          visited_children: Iterable[Any]) -> ColumnType:
     (_lc, _paren, _sp, inner_type, _sp, _paren) = visited_children
     return LowCardinality(inner_type)
Пример #18
0
    def __init__(self) -> None:
        write_columns = ColumnSet([
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("key_id", Nullable(UInt(64))),
            ("timestamp", DateTime()),
            ("outcome", UInt(8)),
            ("reason", LowCardinality(Nullable(String()))),
            ("event_id", Nullable(UUID())),
        ])

        write_schema = MergeTreeSchema(
            columns=write_columns,
            # TODO: change to outcomes.raw_local when we add multi DB support
            local_table_name=WRITE_LOCAL_TABLE_NAME,
            dist_table_name=WRITE_DIST_TABLE_NAME,
            order_by="(org_id, project_id, timestamp)",
            partition_by="(toMonday(timestamp))",
            settings={"index_granularity": 16384},
        )

        read_columns = ColumnSet([
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("key_id", UInt(64)),
            ("timestamp", DateTime()),
            ("outcome", UInt(8)),
            ("reason", LowCardinality(String())),
            ("times_seen", UInt(64)),
        ])

        read_schema = SummingMergeTreeSchema(
            columns=read_columns,
            local_table_name=READ_LOCAL_TABLE_NAME,
            dist_table_name=READ_DIST_TABLE_NAME,
            order_by="(org_id, project_id, key_id, outcome, reason, timestamp)",
            partition_by="(toMonday(timestamp))",
            settings={"index_granularity": 256},
        )

        materialized_view_columns = ColumnSet([
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("key_id", UInt(64)),
            ("timestamp", DateTime()),
            ("outcome", UInt(8)),
            ("reason", String()),
            ("times_seen", UInt(64)),
        ])

        # TODO: Find a better way to specify a query for a materialized view
        # The problem right now is that we have a way to define our columns in a ColumnSet abstraction but the query
        # doesn't use it.
        query = """
               SELECT
                   org_id,
                   project_id,
                   ifNull(key_id, 0) AS key_id,
                   toStartOfHour(timestamp) AS timestamp,
                   outcome,
                   ifNull(reason, 'none') AS reason,
                   count() AS times_seen
               FROM %(source_table_name)s
               GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
               """

        materialized_view = MaterializedViewSchema(
            local_materialized_view_name="outcomes_mv_hourly_local",
            dist_materialized_view_name="outcomes_mv_hourly_dist",
            prewhere_candidates=["project_id", "org_id"],
            columns=materialized_view_columns,
            query=query,
            local_source_table_name=WRITE_LOCAL_TABLE_NAME,
            local_destination_table_name=READ_LOCAL_TABLE_NAME,
            dist_source_table_name=WRITE_DIST_TABLE_NAME,
            dist_destination_table_name=READ_DIST_TABLE_NAME,
        )

        dataset_schemas = DatasetSchemas(
            read_schema=read_schema,
            write_schema=write_schema,
            intermediary_schemas=[materialized_view],
        )

        table_writer = TableWriter(
            write_schema=write_schema,
            stream_loader=KafkaStreamLoader(
                processor=OutcomesProcessor(),
                default_topic="outcomes",
            ),
        )

        super().__init__(
            dataset_schemas=dataset_schemas,
            table_writer=table_writer,
            time_group_columns={"time": "timestamp"},
            time_parse_columns=("timestamp", ),
        )
Пример #19
0
    UInt,
    WithDefault,
)
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.querylog_processor import QuerylogProcessor
from snuba.datasets.schemas.tables import WritableTableSchema
from snuba.datasets.storage import WritableTableStorage
from snuba.datasets.storages import StorageKey
from snuba.datasets.table_storage import KafkaStreamLoader

NESTED_ARRAY_DEFAULT = "arrayResize([['']], length(clickhouse_queries.sql))"

columns = ColumnSet([
    ("request_id", UUID()),
    ("request_body", String()),
    ("referrer", LowCardinality(String())),
    ("dataset", LowCardinality(String())),
    ("projects", Array(UInt(64))),
    ("organization", Nullable(UInt(64))),
    ("timestamp", DateTime()),
    ("duration_ms", UInt(32)),
    ("status", LowCardinality(String())),
    # clickhouse_queries Nested columns.
    # This is expanded into arrays instead of being expressed as a
    # Nested column because, when adding new columns to a nested field
    # we need to provide a default for the entire array (each new column
    # is an array).
    # The same schema cannot be achieved with the Nested construct (where
    # we can only provide default for individual values), so, if we
    # use the Nested construct, this schema cannot match the one generated
    # by the migration framework (or by any ALTER statement).
Пример #20
0
    LowCardinality,
    Nested,
    Nullable,
    String,
    UInt,
    UUID,
)
from snuba.clusters.storage_sets import StorageSetKey
from snuba.migrations import migration, operations, table_engines

status_type = Enum([("success", 0), ("error", 1), ("rate-limited", 2)])

columns = [
    Column("request_id", UUID()),
    Column("request_body", String()),
    Column("referrer", LowCardinality(String())),
    Column("dataset", LowCardinality(String())),
    Column("projects", Array(UInt(64))),
    Column("organization", Nullable(UInt(64))),
    Column("timestamp", DateTime()),
    Column("duration_ms", UInt(32)),
    Column("status", status_type),
    Column(
        "clickhouse_queries",
        Nested([
            Column("sql", String()),
            Column("status", status_type),
            Column("trace_id", Nullable(UUID())),
            Column("duration_ms", UInt(32)),
            Column("stats", String()),
            Column("final", UInt(8)),
Пример #21
0
from snuba.datasets.storages import StorageKey
from snuba.datasets.table_storage import KafkaStreamLoader
from snuba.query.processors.prewhere import PrewhereProcessor

WRITE_LOCAL_TABLE_NAME = "outcomes_raw_local"
WRITE_DIST_TABLE_NAME = "outcomes_raw_dist"
READ_LOCAL_TABLE_NAME = "outcomes_hourly_local"
READ_DIST_TABLE_NAME = "outcomes_hourly_dist"

write_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("key_id", Nullable(UInt(64))),
    ("timestamp", DateTime()),
    ("outcome", UInt(8)),
    ("reason", LowCardinality(Nullable(String()))),
    ("event_id", Nullable(UUID())),
])

raw_schema = MergeTreeSchema(
    columns=write_columns,
    # TODO: change to outcomes.raw_local when we add multi DB support
    local_table_name=WRITE_LOCAL_TABLE_NAME,
    dist_table_name=WRITE_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.OUTCOMES,
    order_by="(org_id, project_id, timestamp)",
    partition_by="(toMonday(timestamp))",
    settings={"index_granularity": "16384"},
)

read_columns = ColumnSet([
Пример #22
0
    return ret


all_columns = ColumnSet(
    [
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("timestamp", DateTime()),
        ("event_id", WithCodecs(UUID(), ["NONE"])),
        (
            "event_hash",
            WithCodecs(
                Materialized(UInt(64), "cityHash64(toString(event_id))",), ["NONE"],
            ),
        ),
        ("platform", LowCardinality(String())),
        ("environment", LowCardinality(Nullable(String()))),
        ("release", LowCardinality(Nullable(String()))),
        ("dist", LowCardinality(Nullable(String()))),
        ("ip_address_v4", Nullable(IPv4())),
        ("ip_address_v6", Nullable(IPv6())),
        ("user", WithDefault(String(), "''")),
        ("user_hash", Materialized(UInt(64), "cityHash64(user)"),),
        ("user_id", Nullable(String())),
        ("user_name", Nullable(String())),
        ("user_email", Nullable(String())),
        ("sdk_name", LowCardinality(Nullable(String()))),
        ("sdk_version", LowCardinality(Nullable(String()))),
        ("http_method", LowCardinality(Nullable(String()))),
        ("http_referer", Nullable(String())),
        ("tags", Nested([("key", String()), ("value", String())])),
Пример #23
0
    def __init__(self):
        write_columns = ColumnSet([
            ('org_id', UInt(64)),
            ('project_id', UInt(64)),
            ('key_id', Nullable(UInt(64))),
            ('timestamp', DateTime()),
            ('outcome', UInt(8)),
            ('reason', LowCardinality(Nullable(String()))),
            ('event_id', Nullable(UUID())),
        ])

        write_schema = MergeTreeSchema(
            columns=write_columns,
            # TODO: change to outcomes.raw_local when we add multi DB support
            local_table_name=WRITE_LOCAL_TABLE_NAME,
            dist_table_name=WRITE_DIST_TABLE_NAME,
            order_by='(org_id, project_id, timestamp)',
            partition_by='(toMonday(timestamp))',
            settings={'index_granularity': 16384})

        read_columns = ColumnSet([
            ('org_id', UInt(64)),
            ('project_id', UInt(64)),
            ('key_id', UInt(64)),
            ('timestamp', DateTime()),
            ('outcome', UInt(8)),
            ('reason', LowCardinality(String())),
            ('times_seen', UInt(64)),
        ])

        read_schema = SummingMergeTreeSchema(
            columns=read_columns,
            local_table_name=READ_LOCAL_TABLE_NAME,
            dist_table_name=READ_DIST_TABLE_NAME,
            order_by='(org_id, project_id, key_id, outcome, reason, timestamp)',
            partition_by='(toMonday(timestamp))',
            settings={'index_granularity': 256})

        materialized_view_columns = ColumnSet([
            ('org_id', UInt(64)),
            ('project_id', UInt(64)),
            ('key_id', UInt(64)),
            ('timestamp', DateTime()),
            ('outcome', UInt(8)),
            ('reason', String()),
            ('times_seen', UInt(64)),
        ])

        # TODO: Find a better way to specify a query for a materialized view
        # The problem right now is that we have a way to define our columns in a ColumnSet abstraction but the query
        # doesn't use it.
        query = """
               SELECT
                   org_id,
                   project_id,
                   ifNull(key_id, 0) AS key_id,
                   toStartOfHour(timestamp) AS timestamp,
                   outcome,
                   ifNull(reason, 'none') AS reason,
                   count() AS times_seen
               FROM %(source_table_name)s
               GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
               """

        materialized_view = MaterializedViewSchema(
            local_materialized_view_name='outcomes_mv_hourly_local',
            dist_materialized_view_name='outcomes_mv_hourly_dist',
            columns=materialized_view_columns,
            query=query,
            local_source_table_name=WRITE_LOCAL_TABLE_NAME,
            local_destination_table_name=READ_LOCAL_TABLE_NAME,
            dist_source_table_name=WRITE_DIST_TABLE_NAME,
            dist_destination_table_name=READ_DIST_TABLE_NAME)

        dataset_schemas = DatasetSchemas(
            read_schema=read_schema,
            write_schema=write_schema,
            intermediary_schemas=[materialized_view])

        super(OutcomesDataset, self).__init__(
            dataset_schemas=dataset_schemas,
            processor=OutcomesProcessor(),
            default_topic="outcomes",
        )
Пример #24
0
def transactions_migrations(
    clickhouse_table: str, current_schema: Mapping[str, ColumnType]
) -> Sequence[str]:
    ret = []
    duration_col = current_schema.get("duration")
    if duration_col and Materialized in duration_col.get_all_modifiers():
        ret.append("ALTER TABLE %s MODIFY COLUMN duration UInt32" % clickhouse_table)

    if "sdk_name" not in current_schema:
        ret.append(
            "ALTER TABLE %s ADD COLUMN sdk_name LowCardinality(String) DEFAULT ''"
            % clickhouse_table
        )

    if "sdk_version" not in current_schema:
        ret.append(
            "ALTER TABLE %s ADD COLUMN sdk_version LowCardinality(String) DEFAULT ''"
            % clickhouse_table
        )

    if "transaction_status" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN transaction_status UInt8 DEFAULT {UNKNOWN_SPAN_STATUS} AFTER transaction_op"
        )

    if "_tags_flattened" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _tags_flattened String DEFAULT ''"
        )

    if "_contexts_flattened" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _contexts_flattened String DEFAULT ''"
        )

    if "user_hash" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN user_hash UInt64 MATERIALIZED cityHash64(user) AFTER user"
        )

    low_cardinality_cols = [
        "transaction_name",
        "release",
        "dist",
        "sdk_name",
        "sdk_version",
        "environment",
    ]
    for col_name in low_cardinality_cols:
        col = current_schema.get(col_name)

        if col and LowCardinality not in col.get_all_modifiers():
            if isinstance(col, WithDefault):
                col.inner_type = LowCardinality(col.inner_type)
            else:
                col = LowCardinality(col)
            ret.append(
                f"ALTER TABLE {clickhouse_table} MODIFY COLUMN {col_name} {col.for_schema()}"
            )

    if "message_timestamp" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN message_timestamp DateTime AFTER offset"
        )

    if "_tags_hash_map" not in current_schema:
        ret.append(
            (
                f"ALTER TABLE {clickhouse_table} ADD COLUMN _tags_hash_map Array(UInt64) "
                f"MATERIALIZED {TAGS_HASH_MAP_COLUMN} AFTER _tags_flattened"
            )
        )

    # `Nested` is only syntactic sugar for table creation. Nested columns are actually arrays.
    # So current_schema does not contain any single `measurements` column. It includes
    # two separate arrays instead.
    if "measurements.key" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN measurements.key Array(LowCardinality(String)) "
            f"AFTER _contexts_flattened"
        )

    if "measurements.value" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN measurements.value Array(Float64) "
            f"AFTER measurements.key"
        )

    if "http_method" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN http_method LowCardinality(Nullable(String)) AFTER sdk_version"
        )

    if "http_referer" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN http_referer Nullable(String) AFTER http_method"
        )

    return ret
Пример #25
0
    def __init__(self) -> None:
        all_columns = ColumnSet([
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("timestamp", DateTime()),
            ("event_id", WithCodecs(UUID(), ["NONE"])),
            (
                "event_hash",
                WithCodecs(
                    Materialized(
                        UInt(64),
                        "cityHash64(toString(event_id))",
                    ),
                    ["NONE"],
                ),
            ),
            ("platform", LowCardinality(String())),
            ("environment", LowCardinality(Nullable(String()))),
            ("release", LowCardinality(Nullable(String()))),
            ("dist", LowCardinality(Nullable(String()))),
            ("ip_address_v4", Nullable(IPv4())),
            ("ip_address_v6", Nullable(IPv6())),
            ("user", WithDefault(String(), "''")),
            (
                "user_hash",
                Materialized(UInt(64), "cityHash64(user)"),
            ),
            ("user_id", Nullable(String())),
            ("user_name", Nullable(String())),
            ("user_email", Nullable(String())),
            ("sdk_name", LowCardinality(Nullable(String()))),
            ("sdk_version", LowCardinality(Nullable(String()))),
            ("tags", Nested([("key", String()), ("value", String())])),
            ("_tags_flattened", String()),
            ("contexts", Nested([("key", String()), ("value", String())])),
            ("_contexts_flattened", String()),
            ("transaction_name", WithDefault(LowCardinality(String()), "''")),
            (
                "transaction_hash",
                Materialized(UInt(64), "cityHash64(transaction_name)"),
            ),
            ("span_id", Nullable(UInt(64))),
            ("trace_id", Nullable(UUID())),
            ("partition", UInt(16)),
            ("offset", WithCodecs(UInt(64), ["DoubleDelta", "LZ4"])),
            ("retention_days", UInt(16)),
            ("deleted", UInt(8)),
            ("group_id", UInt(64)),
            ("primary_hash", FixedString(32)),
            ("primary_hash_hex", Materialized(UInt(64), "hex(primary_hash)")),
            ("event_string", WithCodecs(String(), ["NONE"])),
            ("received", DateTime()),
            ("message", String()),
            ("title", String()),
            ("culprit", String()),
            ("level", LowCardinality(String())),
            ("location", Nullable(String())),
            ("version", LowCardinality(Nullable(String()))),
            ("type", LowCardinality(String())),
            (
                "exception_stacks",
                Nested([
                    ("type", Nullable(String())),
                    ("value", Nullable(String())),
                    ("mechanism_type", Nullable(String())),
                    ("mechanism_handled", Nullable(UInt(8))),
                ]),
            ),
            (
                "exception_frames",
                Nested([
                    ("abs_path", Nullable(String())),
                    ("colno", Nullable(UInt(32))),
                    ("filename", Nullable(String())),
                    ("function", Nullable(String())),
                    ("lineno", Nullable(UInt(32))),
                    ("in_app", Nullable(UInt(8))),
                    ("package", Nullable(String())),
                    ("module", Nullable(String())),
                    ("stack_level", Nullable(UInt(16))),
                ]),
            ),
            ("sdk_integrations", Array(String())),
            ("modules", Nested([("name", String()), ("version", String())])),
        ])

        self.__promoted_tag_columns = {
            "environment": "environment",
            "sentry:release": "release",
            "sentry:dist": "dist",
            "sentry:user": "******",
            "transaction": "transaction_name",
            "level": "level",
        }

        schema = ReplacingMergeTreeSchema(
            columns=all_columns,
            local_table_name="errors_local",
            dist_table_name="errors_dist",
            mandatory_conditions=[("deleted", "=", 0)],
            prewhere_candidates=[
                "event_id",
                "group_id",
                "tags[sentry:release]",
                "message",
                "environment",
                "project_id",
            ],
            order_by=
            "(org_id, project_id, toStartOfDay(timestamp), primary_hash_hex, event_hash)",
            partition_by=
            "(toMonday(timestamp), if(retention_days = 30, 30, 90))",
            version_column="deleted",
            sample_expr="event_hash",
            ttl_expr="timestamp + toIntervalDay(retention_days)",
            settings={"index_granularity": "8192"},
        )

        dataset_schemas = DatasetSchemas(
            read_schema=schema,
            write_schema=schema,
        )

        table_writer = TableWriter(
            write_schema=schema,
            stream_loader=KafkaStreamLoader(
                processor=ErrorsProcessor(self.__promoted_tag_columns),
                default_topic="events",
            ),
        )

        super().__init__(
            dataset_schemas=dataset_schemas,
            table_writer=table_writer,
            time_group_columns={
                "time": "timestamp",
                "rtime": "received"
            },
            time_parse_columns=("timestamp", "received"),
        )

        self.__tags_processor = TagColumnProcessor(
            columns=all_columns,
            promoted_columns=self._get_promoted_columns(),
            column_tag_map=self._get_column_tag_map(),
        )