def __forward_migrations(
    self, table_name: str
) -> Sequence[operations.Operation]:
    return [
        operations.ModifyColumn(
            StorageSetKey.QUERYLOG,
            table_name,
            Column("status", LowCardinality(String())),
        ),
        operations.ModifyColumn(
            StorageSetKey.QUERYLOG,
            table_name,
            Column("clickhouse_queries.status", Array(LowCardinality(String()))),
        ),
    ]

def __init__(self):
    read_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', Nullable(UInt(64))),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', LowCardinality(Nullable(String()))),
        ('event_id', Nullable(UUID())),
    ])

    read_schema = MergeTreeSchema(
        columns=read_columns,
        local_table_name='outcomes_raw_local',
        dist_table_name='outcomes_raw_dist',
        order_by='(org_id, project_id, timestamp)',
        partition_by='(toMonday(timestamp))',
        settings={'index_granularity': 16384})

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=None,
        intermediary_schemas=[])

    super().__init__(
        dataset_schemas=dataset_schemas,
        time_group_columns={
            'time': 'timestamp',
        },
        time_parse_columns=('timestamp',))

def __init__(self) -> None:
    read_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", Nullable(UInt(64))),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(Nullable(String()))),
        ("event_id", Nullable(UUID())),
    ])

    read_schema = MergeTreeSchema(
        columns=read_columns,
        local_table_name="outcomes_raw_local",
        dist_table_name="outcomes_raw_dist",
        order_by="(org_id, project_id, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 16384},
        migration_function=outcomes_raw_migrations,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=None,
        intermediary_schemas=[],
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        time_group_columns={"time": "timestamp"},
        time_parse_columns=("timestamp",),
    )

def forwards_dist(self) -> Sequence[operations.Operation]:
    return [
        operations.AddColumn(
            storage_set=StorageSetKey.TRANSACTIONS,
            table_name="transactions_dist",
            column=Column(
                "measurements",
                Nested([("key", LowCardinality(String())), ("value", Float(64))]),
            ),
            after="_contexts_flattened",
        ),
    ]

def forwards_local(self) -> Sequence[operations.Operation]:
    return [
        operations.AddColumn(
            storage_set=StorageSetKey.EVENTS,
            table_name="errors_local",
            column=Column("http_method", LowCardinality(Nullable(String()))),
            after="sdk_version",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.EVENTS,
            table_name="errors_local",
            column=Column("http_referer", Nullable(String())),
            after="http_method",
        ),
    ]

def forwards_dist(self) -> Sequence[operations.Operation]:
    return [
        operations.AddColumn(
            storage_set=StorageSetKey.TRANSACTIONS,
            table_name="transactions_dist",
            column=Column("http_method", LowCardinality(Nullable(String()))),
            after="sdk_version",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.TRANSACTIONS,
            table_name="transactions_dist",
            column=Column("http_referer", Nullable(String())),
            after="http_method",
        ),
    ]

def __forward_migrations(
    self, table_name: str
) -> Sequence[operations.Operation]:
    return [
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.all_columns",
                WithDefault(
                    Array(Array(LowCardinality(String()))),
                    "arrayResize([['']], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.consistent",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.or_conditions",
                WithDefault(
                    Array(UInt(8)),
                    "arrayResize([0], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.all_columns",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.where_columns",
                WithDefault(
                    Array(Array(LowCardinality(String()))),
                    "arrayResize([['']], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.or_conditions",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.where_mapping_columns",
                WithDefault(
                    Array(Array(LowCardinality(String()))),
                    "arrayResize([['']], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.where_columns",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.groupby_columns",
                WithDefault(
                    Array(Array(LowCardinality(String()))),
                    "arrayResize([['']], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.where_mapping_columns",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.array_join_columns",
                WithDefault(
                    Array(Array(LowCardinality(String()))),
                    "arrayResize([['']], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.groupby_columns",
        ),
    ]

def transactions_migrations(
    clickhouse_table: str, current_schema: Mapping[str, ColumnType]
) -> Sequence[str]:
    ret = []

    duration_col = current_schema.get("duration")
    if duration_col and Materialized in duration_col.get_all_modifiers():
        ret.append("ALTER TABLE %s MODIFY COLUMN duration UInt32" % clickhouse_table)

    if "sdk_name" not in current_schema:
        ret.append(
            "ALTER TABLE %s ADD COLUMN sdk_name LowCardinality(String) DEFAULT ''"
            % clickhouse_table
        )

    if "sdk_version" not in current_schema:
        ret.append(
            "ALTER TABLE %s ADD COLUMN sdk_version LowCardinality(String) DEFAULT ''"
            % clickhouse_table
        )

    if "transaction_status" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN transaction_status UInt8 DEFAULT {UNKNOWN_SPAN_STATUS} AFTER transaction_op"
        )

    if "_tags_flattened" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _tags_flattened String DEFAULT ''"
        )

    if "_contexts_flattened" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _contexts_flattened String DEFAULT ''"
        )

    if "_start_date" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _start_date Date MATERIALIZED toDate(start_ts) AFTER start_ms"
        )

    if "_finish_date" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _finish_date Date MATERIALIZED toDate(finish_ts) AFTER finish_ms"
        )

    if "user_hash" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN user_hash UInt64 MATERIALIZED cityHash64(user) AFTER user"
        )

    low_cardinality_cols = [
        "transaction_name",
        "release",
        "dist",
        "sdk_name",
        "sdk_version",
        "environment",
    ]
    for col_name in low_cardinality_cols:
        col = current_schema.get(col_name)
        if col and LowCardinality not in col.get_all_modifiers():
            if isinstance(col, WithDefault):
                col.inner_type = LowCardinality(col.inner_type)
            else:
                col = LowCardinality(col)
            ret.append(
                f"ALTER TABLE {clickhouse_table} MODIFY COLUMN {col_name} {col.for_schema()}"
            )

    return ret

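# --- Usage sketch (illustrative only) ----------------------------------------
# transactions_migrations() above is a pure function from the current table
# schema to a list of ALTER TABLE statements, so it can be exercised directly.
# The import path below is an assumption made for this sketch; in the real
# repository the function lives wherever the snippet above is defined.
from snuba.datasets.transactions import transactions_migrations  # assumed path

# With an empty "current schema" every column is treated as missing, so one
# ALTER TABLE statement is emitted per column the function knows how to add.
for statement in transactions_migrations("transactions_local", current_schema={}):
    print(statement)
# With UNKNOWN_SPAN_STATUS = 2 (as defined elsewhere in this module), the output
# is expected to include, among others:
#   ALTER TABLE transactions_local ADD COLUMN sdk_name LowCardinality(String) DEFAULT ''
#   ALTER TABLE transactions_local ADD COLUMN transaction_status UInt8 DEFAULT 2 AFTER transaction_op
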
def __init__(self) -> None:
    columns = ColumnSet(
        [
            ("project_id", UInt(64)),
            ("event_id", UUID()),
            ("trace_id", UUID()),
            ("span_id", UInt(64)),
            ("transaction_name", LowCardinality(String())),
            ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)")),
            ("transaction_op", LowCardinality(String())),
            ("transaction_status", WithDefault(UInt(8), UNKNOWN_SPAN_STATUS)),
            ("start_ts", DateTime()),
            ("start_ms", UInt(16)),
            ("_start_date", Materialized(Date(), "toDate(start_ts)")),
            ("finish_ts", DateTime()),
            ("finish_ms", UInt(16)),
            ("_finish_date", Materialized(Date(), "toDate(finish_ts)")),
            ("duration", UInt(32)),
            ("platform", LowCardinality(String())),
            ("environment", LowCardinality(Nullable(String()))),
            ("release", LowCardinality(Nullable(String()))),
            ("dist", LowCardinality(Nullable(String()))),
            ("ip_address_v4", Nullable(IPv4())),
            ("ip_address_v6", Nullable(IPv6())),
            ("user", WithDefault(String(), "''")),
            ("user_hash", Materialized(UInt(64), "cityHash64(user)")),
            ("user_id", Nullable(String())),
            ("user_name", Nullable(String())),
            ("user_email", Nullable(String())),
            ("sdk_name", WithDefault(LowCardinality(String()), "''")),
            ("sdk_version", WithDefault(LowCardinality(String()), "''")),
            ("tags", Nested([("key", String()), ("value", String())])),
            ("_tags_flattened", String()),
            ("contexts", Nested([("key", String()), ("value", String())])),
            ("_contexts_flattened", String()),
            ("partition", UInt(16)),
            ("offset", UInt(64)),
            ("retention_days", UInt(16)),
            ("deleted", UInt(8)),
        ]
    )

    schema = ReplacingMergeTreeSchema(
        columns=columns,
        local_table_name="transactions_local",
        dist_table_name="transactions_dist",
        mandatory_conditions=[],
        prewhere_candidates=["event_id", "project_id"],
        order_by="(project_id, _finish_date, transaction_name, cityHash64(span_id))",
        partition_by="(retention_days, toMonday(_finish_date))",
        version_column="deleted",
        sample_expr=None,
        migration_function=transactions_migrations,
    )

    dataset_schemas = DatasetSchemas(read_schema=schema, write_schema=schema)

    self.__tags_processor = TagColumnProcessor(
        columns=columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=TransactionsTableWriter(
            write_schema=schema,
            stream_loader=KafkaStreamLoader(
                processor=TransactionsMessageProcessor(), default_topic="events",
            ),
        ),
        time_group_columns={
            "bucketed_start": "start_ts",
            "bucketed_end": "finish_ts",
        },
        time_parse_columns=("start_ts", "finish_ts"),
    )

def __init__(self) -> None:
    self.__common_columns = ColumnSet(
        [
            ("event_id", FixedString(32)),
            ("project_id", UInt(64)),
            ("type", Nullable(String())),
            ("timestamp", DateTime()),
            ("platform", Nullable(String())),
            ("environment", Nullable(String())),
            ("release", Nullable(String())),
            ("dist", Nullable(String())),
            ("user", Nullable(String())),
            ("transaction", Nullable(String())),
            ("message", Nullable(String())),
            ("title", Nullable(String())),
            # User
            ("user_id", Nullable(String())),
            ("username", Nullable(String())),
            ("email", Nullable(String())),
            ("ip_address", Nullable(String())),
            # SDK
            ("sdk_name", Nullable(String())),
            ("sdk_version", Nullable(String())),
            # Geo location context
            ("geo_country_code", Nullable(String())),
            ("geo_region", Nullable(String())),
            ("geo_city", Nullable(String())),
            ("http_method", Nullable(String())),
            ("http_referer", Nullable(String())),
            # Other tags and context
            ("tags", Nested([("key", String()), ("value", String())])),
            ("contexts", Nested([("key", String()), ("value", String())])),
        ]
    )

    self.__events_columns = ColumnSet(
        [
            ("group_id", Nullable(UInt(64))),
            ("primary_hash", Nullable(FixedString(32))),
            # Promoted tags
            ("level", Nullable(String())),
            ("logger", Nullable(String())),
            ("server_name", Nullable(String())),
            ("site", Nullable(String())),
            ("url", Nullable(String())),
            ("search_message", Nullable(String())),
            ("location", Nullable(String())),
            ("culprit", Nullable(String())),
            ("received", Nullable(DateTime())),
            ("sdk_integrations", Nullable(Array(String()))),
            ("version", Nullable(String())),
            # Exception interface
            (
                "exception_stacks",
                Nested(
                    [
                        ("type", Nullable(String())),
                        ("value", Nullable(String())),
                        ("mechanism_type", Nullable(String())),
                        ("mechanism_handled", Nullable(UInt(8))),
                    ]
                ),
            ),
            (
                "exception_frames",
                Nested(
                    [
                        ("abs_path", Nullable(String())),
                        ("filename", Nullable(String())),
                        ("package", Nullable(String())),
                        ("module", Nullable(String())),
                        ("function", Nullable(String())),
                        ("in_app", Nullable(UInt(8))),
                        ("colno", Nullable(UInt(32))),
                        ("lineno", Nullable(UInt(32))),
                        ("stack_level", UInt(16)),
                    ]
                ),
            ),
            ("modules", Nested([("name", String()), ("version", String())])),
        ]
    )

    self.__transactions_columns = ColumnSet(
        [
            ("trace_id", Nullable(UUID())),
            ("span_id", Nullable(UInt(64))),
            ("transaction_hash", Nullable(UInt(64))),
            ("transaction_op", Nullable(String())),
            ("transaction_status", Nullable(UInt(8))),
            ("duration", Nullable(UInt(32))),
            (
                "measurements",
                Nested([("key", LowCardinality(String())), ("value", Float(64))]),
            ),
        ]
    )

    events_storage = get_storage(StorageKey.EVENTS)
    events_ro_storage = get_storage(StorageKey.EVENTS_RO)
    transactions_storage = get_storage(StorageKey.TRANSACTIONS)

    self.__time_group_columns: Mapping[str, str] = {}
    self.__time_parse_columns = ("timestamp",)

    super().__init__(
        storages=[events_storage, transactions_storage],
        query_plan_builder=SelectedStorageQueryPlanBuilder(
            selector=DiscoverQueryStorageSelector(
                events_table=events_storage,
                events_ro_table=events_ro_storage,
                abstract_events_columns=self.__events_columns,
                transactions_table=transactions_storage,
                abstract_transactions_columns=self.__transactions_columns,
            ),
        ),
        abstract_column_set=(
            self.__common_columns
            + self.__events_columns
            + self.__transactions_columns
        ),
        writable_storage=None,
    )

def __init__(self):
    columns = ColumnSet([
        ('project_id', UInt(64)),
        ('event_id', UUID()),
        ('trace_id', UUID()),
        ('span_id', UInt(64)),
        ('transaction_name', String()),
        ('transaction_hash', Materialized(UInt(64), 'cityHash64(transaction_name)')),
        ('transaction_op', LowCardinality(String())),
        ('start_ts', DateTime()),
        ('start_ms', UInt(16)),
        ('finish_ts', DateTime()),
        ('finish_ms', UInt(16)),
        ('duration', Materialized(
            UInt(32),
            '((finish_ts - start_ts) * 1000) + (finish_ms - start_ms)',
        )),
        ('platform', LowCardinality(String())),
        ('environment', Nullable(String())),
        ('release', Nullable(String())),
        ('dist', Nullable(String())),
        ('ip_address_v4', Nullable(IPv4())),
        ('ip_address_v6', Nullable(IPv6())),
        ('user', WithDefault(String(), "''")),
        ('user_id', Nullable(String())),
        ('user_name', Nullable(String())),
        ('user_email', Nullable(String())),
        ('tags', Nested([('key', String()), ('value', String())])),
        ('contexts', Nested([('key', String()), ('value', String())])),
        ('partition', UInt(16)),
        ('offset', UInt(64)),
        ('retention_days', UInt(16)),
        ('deleted', UInt(8)),
    ])

    schema = ReplacingMergeTreeSchema(
        columns=columns,
        local_table_name='transactions_local',
        dist_table_name='transactions_dist',
        order_by='(project_id, toStartOfDay(start_ts), transaction_hash, start_ts, start_ms, trace_id, span_id)',
        partition_by='(retention_days, toMonday(start_ts))',
        version_column='deleted',
        sample_expr=None,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        processor=TransactionsMessageProcessor(),
        default_topic="events",
        time_group_columns={
            'bucketed_start': 'start_ts',
            'bucketed_end': 'finish_ts',
        },
    )

    String,
    UInt,
    WithDefault,
)
from snuba.clusters.storage_sets import StorageSetKey
from snuba.migrations import migration, operations, table_engines

UNKNOWN_SPAN_STATUS = 2

columns = [
    Column("project_id", UInt(64)),
    Column("event_id", UUID()),
    Column("trace_id", UUID()),
    Column("span_id", UInt(64)),
    Column("transaction_name", LowCardinality(String())),
    Column("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)")),
    Column("transaction_op", LowCardinality(String())),
    Column("transaction_status", WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS))),
    Column("start_ts", DateTime()),
    Column("start_ms", UInt(16)),
    Column("finish_ts", DateTime()),
    Column("finish_ms", UInt(16)),
    Column("duration", UInt(32)),
    Column("platform", LowCardinality(String())),
    Column("environment", LowCardinality(Nullable(String()))),
    Column("release", LowCardinality(Nullable(String()))),
    Column("dist", LowCardinality(Nullable(String()))),
    Column("ip_address_v4", Nullable(IPv4())),
    Column("ip_address_v6", Nullable(IPv6())),
    Column("user", WithDefault(String(), "''")),

AggregateFunction("uniq", UInt(8))), (("AggregateFunction(countIf, UUID, UInt8)", "", "", ""), AggregateFunction("countIf", UUID(), UInt(8))), (("AggregateFunction(quantileIf(0.5, 0.9), UInt32, UInt8)", "", "", ""), AggregateFunction("quantileIf(0.5, 0.9)", UInt(32), UInt(8))), # Array (("Array(String)", "", "", ""), Array(String())), (("Array(DateTime)", "", "", ""), Array(DateTime())), (("Array(UInt64)", "", "", ""), Array(UInt(64))), (("Array(Nullable(UUID))", "", "", ""), Array(Nullable(UUID()))), # Nullable (("Nullable(String)", "", "", ""), Nullable(String())), (("Nullable(FixedString(8))", "", "", ""), Nullable(FixedString(8))), (("Nullable(Date)", "", "", ""), Nullable(Date())), # Low cardinality (("LowCardinality(String)", "", "", ""), LowCardinality(String())), (("LowCardinality(Nullable(String))", "", "", ""), LowCardinality(Nullable(String()))), # Materialized (("Date", "MATERIALIZED", "toDate(col1)", ""), Materialized(Date(), "toDate(col1)")), (("UInt64", "MATERIALIZED", "CAST(cityHash64(col1), 'UInt64')", ""), Materialized(UInt(64), "cityHash64(col1)")), # Default value (("LowCardinality(String)", "DEFAULT", "a", ""), WithDefault(LowCardinality(String()), "a")), (("UInt8", "DEFAULT", "2", ""), WithDefault(UInt(8), "2")), # With codecs (("UUID", "", "", "NONE"), WithCodecs(UUID(), ["NONE"])), (("DateTime", "", "", "DoubleDelta, LZ4"), WithCodecs(DateTime(), ["DoubleDelta", "LZ4"])),
if "http_referer" not in current_schema: ret.append( f"ALTER TABLE {clickhouse_table} ADD COLUMN http_referer Nullable(String) AFTER http_method" ) return ret columns = ColumnSet( [ ("project_id", UInt(64)), ("event_id", UUID()), ("trace_id", UUID()), ("span_id", UInt(64)), ("transaction_name", LowCardinality(String())), ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)",),), ("transaction_op", LowCardinality(String())), ("transaction_status", WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS))), ("start_ts", DateTime()), ("start_ms", UInt(16)), ("finish_ts", DateTime()), ("finish_ms", UInt(16)), ("duration", UInt(32)), ("platform", LowCardinality(String())), ("environment", LowCardinality(Nullable(String()))), ("release", LowCardinality(Nullable(String()))), ("dist", LowCardinality(Nullable(String()))), ("ip_address_v4", Nullable(IPv4())), ("ip_address_v6", Nullable(IPv6())), ("user", WithDefault(String(), "''",)),
from snuba.datasets.spans_processor import UNKNOWN_SPAN_STATUS, SpansMessageProcessor
from snuba.datasets.storage import WritableTableStorage
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.tags_hash_map import TAGS_HASH_MAP_COLUMN
from snuba.datasets.table_storage import KafkaStreamLoader
from snuba.query.processors.prewhere import PrewhereProcessor
from snuba.web.split import TimeSplitQueryStrategy

columns = ColumnSet([
    ("project_id", UInt(64)),
    ("transaction_id", UUID()),
    ("trace_id", UUID()),
    ("transaction_span_id", UInt(64)),
    ("span_id", UInt(64)),
    ("parent_span_id", Nullable(UInt(64))),
    ("transaction_name", LowCardinality(String())),
    ("description", String()),  # description in span
    ("op", LowCardinality(String())),
    ("status", WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS))),
    ("start_ts", DateTime()),
    ("start_ns", UInt(32)),
    ("finish_ts", DateTime()),
    ("finish_ns", UInt(32)),
    ("duration_ms", UInt(32)),
    ("tags", Nested([("key", String()), ("value", String())])),
    ("_tags_hash_map", Materialized(Array(UInt(64)), TAGS_HASH_MAP_COLUMN)),
    ("retention_days", UInt(16)),
    ("deleted", UInt(8)),

READ_LOCAL_MV_NAME = "sessions_hourly_mv_local" READ_DIST_MV_NAME = "sessions_hourly_mv_dist" all_columns = ColumnSet([ ("session_id", UUID()), ("distinct_id", UUID()), ("seq", UInt(64)), ("org_id", UInt(64)), ("project_id", UInt(64)), ("retention_days", UInt(16)), ("duration", UInt(32)), ("status", UInt(8)), ("errors", UInt(16)), ("received", DateTime()), ("started", DateTime()), ("release", LowCardinality(String())), ("environment", LowCardinality(String())), ]) raw_schema = MergeTreeSchema( columns=all_columns, local_table_name=WRITE_LOCAL_TABLE_NAME, dist_table_name=WRITE_DIST_TABLE_NAME, storage_set_key=StorageSetKey.SESSIONS, order_by="(org_id, project_id, release, environment, started)", partition_by="(toMonday(started))", settings={"index_granularity": "16384"}, ) read_columns = ColumnSet([ ("org_id", UInt(64)),
def visit_lowcardinality(
    self, node: Node, visited_children: Iterable[Any]
) -> ColumnType:
    (_lc, _paren, _sp, inner_type, _sp, _paren) = visited_children
    return LowCardinality(inner_type)

def __init__(self) -> None:
    write_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", Nullable(UInt(64))),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(Nullable(String()))),
        ("event_id", Nullable(UUID())),
    ])

    write_schema = MergeTreeSchema(
        columns=write_columns,
        # TODO: change to outcomes.raw_local when we add multi DB support
        local_table_name=WRITE_LOCAL_TABLE_NAME,
        dist_table_name=WRITE_DIST_TABLE_NAME,
        order_by="(org_id, project_id, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 16384},
    )

    read_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", UInt(64)),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(String())),
        ("times_seen", UInt(64)),
    ])

    read_schema = SummingMergeTreeSchema(
        columns=read_columns,
        local_table_name=READ_LOCAL_TABLE_NAME,
        dist_table_name=READ_DIST_TABLE_NAME,
        order_by="(org_id, project_id, key_id, outcome, reason, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 256},
    )

    materialized_view_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", UInt(64)),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", String()),
        ("times_seen", UInt(64)),
    ])

    # TODO: Find a better way to specify a query for a materialized view.
    # The problem right now is that we have a way to define our columns in a
    # ColumnSet abstraction but the query doesn't use it.
    query = """
        SELECT
            org_id,
            project_id,
            ifNull(key_id, 0) AS key_id,
            toStartOfHour(timestamp) AS timestamp,
            outcome,
            ifNull(reason, 'none') AS reason,
            count() AS times_seen
        FROM %(source_table_name)s
        GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
    """

    materialized_view = MaterializedViewSchema(
        local_materialized_view_name="outcomes_mv_hourly_local",
        dist_materialized_view_name="outcomes_mv_hourly_dist",
        prewhere_candidates=["project_id", "org_id"],
        columns=materialized_view_columns,
        query=query,
        local_source_table_name=WRITE_LOCAL_TABLE_NAME,
        local_destination_table_name=READ_LOCAL_TABLE_NAME,
        dist_source_table_name=WRITE_DIST_TABLE_NAME,
        dist_destination_table_name=READ_DIST_TABLE_NAME,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=write_schema,
        intermediary_schemas=[materialized_view],
    )

    table_writer = TableWriter(
        write_schema=write_schema,
        stream_loader=KafkaStreamLoader(
            processor=OutcomesProcessor(), default_topic="outcomes",
        ),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={"time": "timestamp"},
        time_parse_columns=("timestamp",),
    )

    UInt,
    WithDefault,
)
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.querylog_processor import QuerylogProcessor
from snuba.datasets.schemas.tables import WritableTableSchema
from snuba.datasets.storage import WritableTableStorage
from snuba.datasets.storages import StorageKey
from snuba.datasets.table_storage import KafkaStreamLoader

NESTED_ARRAY_DEFAULT = "arrayResize([['']], length(clickhouse_queries.sql))"

columns = ColumnSet([
    ("request_id", UUID()),
    ("request_body", String()),
    ("referrer", LowCardinality(String())),
    ("dataset", LowCardinality(String())),
    ("projects", Array(UInt(64))),
    ("organization", Nullable(UInt(64))),
    ("timestamp", DateTime()),
    ("duration_ms", UInt(32)),
    ("status", LowCardinality(String())),
    # clickhouse_queries Nested columns.
    # This is expanded into arrays instead of being expressed as a
    # Nested column because, when adding new columns to a nested field,
    # we need to provide a default for the entire array (each new column
    # is an array).
    # The same schema cannot be achieved with the Nested construct (where
    # we can only provide defaults for individual values), so, if we
    # used the Nested construct, this schema could not match the one
    # generated by the migration framework (or by any ALTER statement).

    LowCardinality,
    Nested,
    Nullable,
    String,
    UInt,
    UUID,
)
from snuba.clusters.storage_sets import StorageSetKey
from snuba.migrations import migration, operations, table_engines

status_type = Enum([("success", 0), ("error", 1), ("rate-limited", 2)])

columns = [
    Column("request_id", UUID()),
    Column("request_body", String()),
    Column("referrer", LowCardinality(String())),
    Column("dataset", LowCardinality(String())),
    Column("projects", Array(UInt(64))),
    Column("organization", Nullable(UInt(64))),
    Column("timestamp", DateTime()),
    Column("duration_ms", UInt(32)),
    Column("status", status_type),
    Column(
        "clickhouse_queries",
        Nested([
            Column("sql", String()),
            Column("status", status_type),
            Column("trace_id", Nullable(UUID())),
            Column("duration_ms", UInt(32)),
            Column("stats", String()),
            Column("final", UInt(8)),

from snuba.datasets.storages import StorageKey
from snuba.datasets.table_storage import KafkaStreamLoader
from snuba.query.processors.prewhere import PrewhereProcessor

WRITE_LOCAL_TABLE_NAME = "outcomes_raw_local"
WRITE_DIST_TABLE_NAME = "outcomes_raw_dist"
READ_LOCAL_TABLE_NAME = "outcomes_hourly_local"
READ_DIST_TABLE_NAME = "outcomes_hourly_dist"

write_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("key_id", Nullable(UInt(64))),
    ("timestamp", DateTime()),
    ("outcome", UInt(8)),
    ("reason", LowCardinality(Nullable(String()))),
    ("event_id", Nullable(UUID())),
])

raw_schema = MergeTreeSchema(
    columns=write_columns,
    # TODO: change to outcomes.raw_local when we add multi DB support
    local_table_name=WRITE_LOCAL_TABLE_NAME,
    dist_table_name=WRITE_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.OUTCOMES,
    order_by="(org_id, project_id, timestamp)",
    partition_by="(toMonday(timestamp))",
    settings={"index_granularity": "16384"},
)

read_columns = ColumnSet([

    return ret


all_columns = ColumnSet(
    [
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("timestamp", DateTime()),
        ("event_id", WithCodecs(UUID(), ["NONE"])),
        (
            "event_hash",
            WithCodecs(
                Materialized(UInt(64), "cityHash64(toString(event_id))"),
                ["NONE"],
            ),
        ),
        ("platform", LowCardinality(String())),
        ("environment", LowCardinality(Nullable(String()))),
        ("release", LowCardinality(Nullable(String()))),
        ("dist", LowCardinality(Nullable(String()))),
        ("ip_address_v4", Nullable(IPv4())),
        ("ip_address_v6", Nullable(IPv6())),
        ("user", WithDefault(String(), "''")),
        ("user_hash", Materialized(UInt(64), "cityHash64(user)")),
        ("user_id", Nullable(String())),
        ("user_name", Nullable(String())),
        ("user_email", Nullable(String())),
        ("sdk_name", LowCardinality(Nullable(String()))),
        ("sdk_version", LowCardinality(Nullable(String()))),
        ("http_method", LowCardinality(Nullable(String()))),
        ("http_referer", Nullable(String())),
        ("tags", Nested([("key", String()), ("value", String())])),

def __init__(self):
    write_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', Nullable(UInt(64))),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', LowCardinality(Nullable(String()))),
        ('event_id', Nullable(UUID())),
    ])

    write_schema = MergeTreeSchema(
        columns=write_columns,
        # TODO: change to outcomes.raw_local when we add multi DB support
        local_table_name=WRITE_LOCAL_TABLE_NAME,
        dist_table_name=WRITE_DIST_TABLE_NAME,
        order_by='(org_id, project_id, timestamp)',
        partition_by='(toMonday(timestamp))',
        settings={'index_granularity': 16384})

    read_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', UInt(64)),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', LowCardinality(String())),
        ('times_seen', UInt(64)),
    ])

    read_schema = SummingMergeTreeSchema(
        columns=read_columns,
        local_table_name=READ_LOCAL_TABLE_NAME,
        dist_table_name=READ_DIST_TABLE_NAME,
        order_by='(org_id, project_id, key_id, outcome, reason, timestamp)',
        partition_by='(toMonday(timestamp))',
        settings={'index_granularity': 256})

    materialized_view_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', UInt(64)),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', String()),
        ('times_seen', UInt(64)),
    ])

    # TODO: Find a better way to specify a query for a materialized view.
    # The problem right now is that we have a way to define our columns in a
    # ColumnSet abstraction but the query doesn't use it.
    query = """
        SELECT
            org_id,
            project_id,
            ifNull(key_id, 0) AS key_id,
            toStartOfHour(timestamp) AS timestamp,
            outcome,
            ifNull(reason, 'none') AS reason,
            count() AS times_seen
        FROM %(source_table_name)s
        GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
    """

    materialized_view = MaterializedViewSchema(
        local_materialized_view_name='outcomes_mv_hourly_local',
        dist_materialized_view_name='outcomes_mv_hourly_dist',
        columns=materialized_view_columns,
        query=query,
        local_source_table_name=WRITE_LOCAL_TABLE_NAME,
        local_destination_table_name=READ_LOCAL_TABLE_NAME,
        dist_source_table_name=WRITE_DIST_TABLE_NAME,
        dist_destination_table_name=READ_DIST_TABLE_NAME)

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=write_schema,
        intermediary_schemas=[materialized_view])

    super(OutcomesDataset, self).__init__(
        dataset_schemas=dataset_schemas,
        processor=OutcomesProcessor(),
        default_topic="outcomes",
    )

def transactions_migrations(
    clickhouse_table: str, current_schema: Mapping[str, ColumnType]
) -> Sequence[str]:
    ret = []

    duration_col = current_schema.get("duration")
    if duration_col and Materialized in duration_col.get_all_modifiers():
        ret.append("ALTER TABLE %s MODIFY COLUMN duration UInt32" % clickhouse_table)

    if "sdk_name" not in current_schema:
        ret.append(
            "ALTER TABLE %s ADD COLUMN sdk_name LowCardinality(String) DEFAULT ''"
            % clickhouse_table
        )

    if "sdk_version" not in current_schema:
        ret.append(
            "ALTER TABLE %s ADD COLUMN sdk_version LowCardinality(String) DEFAULT ''"
            % clickhouse_table
        )

    if "transaction_status" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN transaction_status UInt8 DEFAULT {UNKNOWN_SPAN_STATUS} AFTER transaction_op"
        )

    if "_tags_flattened" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _tags_flattened String DEFAULT ''"
        )

    if "_contexts_flattened" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _contexts_flattened String DEFAULT ''"
        )

    if "user_hash" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN user_hash UInt64 MATERIALIZED cityHash64(user) AFTER user"
        )

    low_cardinality_cols = [
        "transaction_name",
        "release",
        "dist",
        "sdk_name",
        "sdk_version",
        "environment",
    ]
    for col_name in low_cardinality_cols:
        col = current_schema.get(col_name)
        if col and LowCardinality not in col.get_all_modifiers():
            if isinstance(col, WithDefault):
                col.inner_type = LowCardinality(col.inner_type)
            else:
                col = LowCardinality(col)
            ret.append(
                f"ALTER TABLE {clickhouse_table} MODIFY COLUMN {col_name} {col.for_schema()}"
            )

    if "message_timestamp" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN message_timestamp DateTime AFTER offset"
        )

    if "_tags_hash_map" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN _tags_hash_map Array(UInt64) "
            f"MATERIALIZED {TAGS_HASH_MAP_COLUMN} AFTER _tags_flattened"
        )

    # `Nested` is only syntactic sugar for table creation. Nested columns are
    # actually arrays, so current_schema does not contain any single
    # `measurements` column; it includes two separate arrays instead.
    if "measurements.key" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN measurements.key Array(LowCardinality(String)) "
            f"AFTER _contexts_flattened"
        )

    if "measurements.value" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN measurements.value Array(Float64) "
            f"AFTER measurements.key"
        )

    if "http_method" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN http_method LowCardinality(Nullable(String)) AFTER sdk_version"
        )

    if "http_referer" not in current_schema:
        ret.append(
            f"ALTER TABLE {clickhouse_table} ADD COLUMN http_referer Nullable(String) AFTER http_method"
        )

    return ret

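# --- Illustrative check (sketch only, import paths assumed) ------------------
# Because `Nested` is just syntactic sugar for parallel arrays, the schema
# introspection above keys on "measurements.key" / "measurements.value" rather
# than on a single "measurements" entry. A partially applied migration should
# therefore only emit the ALTER statement for the half that is still missing.
from snuba.clickhouse.columns import Array, LowCardinality, String  # assumed path
from snuba.datasets.storages.transactions import transactions_migrations  # assumed path

# Pretend the table already has measurements.key but not measurements.value.
current_schema = {"measurements.key": Array(LowCardinality(String()))}
statements = transactions_migrations("transactions_local", current_schema)
assert any("ADD COLUMN measurements.value" in s for s in statements)
assert not any("ADD COLUMN measurements.key " in s for s in statements)
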
def __init__(self) -> None:
    all_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("timestamp", DateTime()),
        ("event_id", WithCodecs(UUID(), ["NONE"])),
        (
            "event_hash",
            WithCodecs(
                Materialized(UInt(64), "cityHash64(toString(event_id))"),
                ["NONE"],
            ),
        ),
        ("platform", LowCardinality(String())),
        ("environment", LowCardinality(Nullable(String()))),
        ("release", LowCardinality(Nullable(String()))),
        ("dist", LowCardinality(Nullable(String()))),
        ("ip_address_v4", Nullable(IPv4())),
        ("ip_address_v6", Nullable(IPv6())),
        ("user", WithDefault(String(), "''")),
        ("user_hash", Materialized(UInt(64), "cityHash64(user)")),
        ("user_id", Nullable(String())),
        ("user_name", Nullable(String())),
        ("user_email", Nullable(String())),
        ("sdk_name", LowCardinality(Nullable(String()))),
        ("sdk_version", LowCardinality(Nullable(String()))),
        ("tags", Nested([("key", String()), ("value", String())])),
        ("_tags_flattened", String()),
        ("contexts", Nested([("key", String()), ("value", String())])),
        ("_contexts_flattened", String()),
        ("transaction_name", WithDefault(LowCardinality(String()), "''")),
        ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)")),
        ("span_id", Nullable(UInt(64))),
        ("trace_id", Nullable(UUID())),
        ("partition", UInt(16)),
        ("offset", WithCodecs(UInt(64), ["DoubleDelta", "LZ4"])),
        ("retention_days", UInt(16)),
        ("deleted", UInt(8)),
        ("group_id", UInt(64)),
        ("primary_hash", FixedString(32)),
        ("primary_hash_hex", Materialized(UInt(64), "hex(primary_hash)")),
        ("event_string", WithCodecs(String(), ["NONE"])),
        ("received", DateTime()),
        ("message", String()),
        ("title", String()),
        ("culprit", String()),
        ("level", LowCardinality(String())),
        ("location", Nullable(String())),
        ("version", LowCardinality(Nullable(String()))),
        ("type", LowCardinality(String())),
        (
            "exception_stacks",
            Nested([
                ("type", Nullable(String())),
                ("value", Nullable(String())),
                ("mechanism_type", Nullable(String())),
                ("mechanism_handled", Nullable(UInt(8))),
            ]),
        ),
        (
            "exception_frames",
            Nested([
                ("abs_path", Nullable(String())),
                ("colno", Nullable(UInt(32))),
                ("filename", Nullable(String())),
                ("function", Nullable(String())),
                ("lineno", Nullable(UInt(32))),
                ("in_app", Nullable(UInt(8))),
                ("package", Nullable(String())),
                ("module", Nullable(String())),
                ("stack_level", Nullable(UInt(16))),
            ]),
        ),
        ("sdk_integrations", Array(String())),
        ("modules", Nested([("name", String()), ("version", String())])),
    ])

    self.__promoted_tag_columns = {
        "environment": "environment",
        "sentry:release": "release",
        "sentry:dist": "dist",
        "sentry:user": "******",
        "transaction": "transaction_name",
        "level": "level",
    }

    schema = ReplacingMergeTreeSchema(
        columns=all_columns,
        local_table_name="errors_local",
        dist_table_name="errors_dist",
        mandatory_conditions=[("deleted", "=", 0)],
        prewhere_candidates=[
            "event_id",
            "group_id",
            "tags[sentry:release]",
            "message",
            "environment",
            "project_id",
        ],
        order_by="(org_id, project_id, toStartOfDay(timestamp), primary_hash_hex, event_hash)",
        partition_by="(toMonday(timestamp), if(retention_days = 30, 30, 90))",
        version_column="deleted",
        sample_expr="event_hash",
        ttl_expr="timestamp + toIntervalDay(retention_days)",
        settings={"index_granularity": "8192"},
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    table_writer = TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=ErrorsProcessor(self.__promoted_tag_columns),
            default_topic="events",
        ),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={
            "time": "timestamp",
            "rtime": "received",
        },
        time_parse_columns=("timestamp", "received"),
    )

    self.__tags_processor = TagColumnProcessor(
        columns=all_columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )