def __init__(
    self,
    schema: WritableTableSchema,
    required_columns: Sequence[str],
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
    state_name: ReplacerState,
    use_promoted_prewhere: bool,
) -> None:
    super().__init__(schema=schema)
    self.__required_columns = required_columns
    self.__all_columns = [
        col for col in schema.get_columns() if not col.type.has_modifier(ReadOnly)
    ]
    self.__tag_column_map = tag_column_map
    self.__promoted_tags = promoted_tags
    self.__state_name = state_name
    self.__use_promoted_prewhere = use_promoted_prewhere
    self.__schema = schema

    self.__replacement_context = ReplacementContext(
        all_columns=self.__all_columns,
        state_name=self.__state_name,
        required_columns=self.__required_columns,
        use_promoted_prewhere=self.__use_promoted_prewhere,
        schema=self.__schema,
        tag_column_map=self.__tag_column_map,
        promoted_tags=self.__promoted_tags,
    )
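# Hedged sketch, not from the source: judging by the keyword arguments passed
# above, ReplacementContext is plausibly a frozen dataclass bundling the
# replacer configuration into one value object. Field names come from the call
# site; types mirror the constructor parameters and process_delete_tag below.
from dataclasses import dataclass


@dataclass(frozen=True)
class ReplacementContext:
    all_columns: Sequence[FlattenedColumn]
    required_columns: Sequence[str]
    tag_column_map: Mapping[str, Mapping[str, str]]
    promoted_tags: Mapping[str, Sequence[str]]
    schema: WritableTableSchema
    state_name: ReplacerState
    use_promoted_prewhere: bool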
def __init__(
    self,
    schema: WritableTableSchema,
    required_columns: Sequence[str],
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
    state_name: ReplacerState,
) -> None:
    super().__init__(schema=schema)
    self.__required_columns = required_columns
    self.__all_columns = [
        col for col in schema.get_columns() if not isinstance(col.type, ReadOnly)
    ]
    self.__tag_column_map = tag_column_map
    self.__promoted_tags = promoted_tags
    self.__state_name = state_name
def __init__(
    self,
    write_schema: WritableTableSchema,
    read_schema: TableSchema,
    required_columns: Sequence[str],
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
    state_name: ReplacerState,
) -> None:
    super().__init__(write_schema=write_schema, read_schema=read_schema)
    self.__required_columns = required_columns
    self.__all_column_names = [
        col.escaped
        for col in write_schema.get_columns()
        if Materialized not in col.type.get_all_modifiers()
    ]
    self.__tag_column_map = tag_column_map
    self.__promoted_tags = promoted_tags
    self.__state_name = state_name
    all_columns,
    get_promoted_tags,
    get_tag_column_map,
    mandatory_conditions,
    prewhere_candidates,
    promoted_tag_columns,
    query_processors,
    query_splitters,
    required_columns,
)
from snuba.datasets.table_storage import KafkaStreamLoader

schema = WritableTableSchema(
    columns=all_columns,
    local_table_name="sentry_local",
    dist_table_name="sentry_dist",
    storage_set_key=StorageSetKey.EVENTS,
    mandatory_conditions=mandatory_conditions,
    prewhere_candidates=prewhere_candidates,
)

storage = WritableTableStorage(
    storage_key=StorageKey.EVENTS,
    storage_set_key=StorageSetKey.EVENTS,
    schema=schema,
    query_processors=query_processors,
    stream_loader=KafkaStreamLoader(
        processor=EventsProcessor(promoted_tag_columns),
        default_topic="events",
        replacement_topic="event-replacements",
        commit_log_topic="snuba-commit-log",
    ),
("offset", UInt(64)), ("message_timestamp", DateTime()), ("retention_days", UInt(16)), ("deleted", UInt(8)), ("type", String(Modifiers(readonly=True))), ("message", String(Modifiers(readonly=True))), ("title", String(Modifiers(readonly=True))), ("timestamp", DateTime(Modifiers(readonly=True))), ]) schema = WritableTableSchema( columns=columns, local_table_name="transactions_local", dist_table_name="transactions_dist", storage_set_key=StorageSetKey.TRANSACTIONS, mandatory_conditions=[], prewhere_candidates=[ "event_id", "transaction_name", "transaction", "title" ], part_format=[util.PartSegment.RETENTION_DAYS, util.PartSegment.DATE], ) storage = WritableTableStorage( storage_key=StorageKey.TRANSACTIONS, storage_set_key=StorageSetKey.TRANSACTIONS, schema=schema, query_processors=[ MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"), EventIdColumnProcessor(), ArrayJoinKeyValueOptimizer("tags"), ArrayJoinKeyValueOptimizer("measurements"),
from snuba.datasets.storage import WritableTableStorage
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.errors_common import (
    all_columns,
    mandatory_conditions,
    promoted_tag_columns,
    query_processors,
    query_splitters,
    required_columns,
)
from snuba.datasets.table_storage import build_kafka_stream_loader_from_settings

schema = WritableTableSchema(
    columns=all_columns,
    local_table_name="errors_local",
    dist_table_name="errors_dist",
    storage_set_key=StorageSetKey.EVENTS,
    mandatory_conditions=mandatory_conditions,
    part_format=[util.PartSegment.RETENTION_DAYS, util.PartSegment.DATE],
)

storage = WritableTableStorage(
    storage_key=StorageKey.ERRORS,
    storage_set_key=StorageSetKey.EVENTS,
    schema=schema,
    query_processors=query_processors,
    query_splitters=query_splitters,
    stream_loader=build_kafka_stream_loader_from_settings(
        StorageKey.ERRORS,
        processor=ErrorsProcessor(promoted_tag_columns),
        default_topic_name="events",
        replacement_topic_name="event-replacements",
READ_DIST_TABLE_NAME = "outcomes_hourly_dist"

write_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("key_id", UInt(64, Modifiers(nullable=True))),
    ("timestamp", DateTime()),
    ("outcome", UInt(8)),
    ("reason", String(Modifiers(nullable=True))),
    ("event_id", UUID(Modifiers(nullable=True))),
])

raw_schema = WritableTableSchema(
    columns=write_columns,
    # TODO: change to outcomes.raw_local when we add multi DB support
    local_table_name=WRITE_LOCAL_TABLE_NAME,
    dist_table_name=WRITE_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.OUTCOMES,
)

read_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("key_id", UInt(64)),
    ("timestamp", DateTime()),
    ("outcome", UInt(8)),
    ("reason", String()),
    ("times_seen", UInt(64)),
])

read_schema = TableSchema(
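# The write/read split above is deliberate: consumers insert into the raw
# table behind the WritableTableSchema, while the hourly table behind the
# plain TableSchema is (presumably) populated by a ClickHouse materialized
# view that rolls raw rows up into times_seen counts, which is why the read
# columns drop the nullable modifiers and gain a counter. A hedged usage
# sketch, assuming TableSchema exposes get_table_name():
#
#     insert_table = raw_schema.get_table_name()   # raw outcomes table
#     query_table = read_schema.get_table_name()   # outcomes_hourly_* table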
    KafkaProducer(build_kafka_producer_configuration(Topic.DEAD_LETTER_METRICS)),
    KafkaTopic(Topic.DEAD_LETTER_METRICS.value),
)

polymorphic_bucket = WritableTableStorage(
    storage_key=StorageKey.METRICS_RAW,
    storage_set_key=StorageSetKey.METRICS,
    schema=WritableTableSchema(
        columns=ColumnSet([
            Column("use_case_id", String()),
            *PRE_VALUE_COLUMNS,
            Column("count_value", Float(64)),
            Column("set_values", Array(UInt(64))),
            Column("distribution_values", Array(Float(64))),
            *POST_VALUE_COLUMNS,
        ]),
        local_table_name="metrics_raw_v2_local",
        dist_table_name="metrics_raw_v2_dist",
        storage_set_key=StorageSetKey.METRICS,
    ),
    query_processors=[],
    stream_loader=build_kafka_stream_loader_from_settings(
        processor=PolymorphicMetricsProcessor(),
        default_topic=Topic.METRICS,
        commit_log_topic=Topic.METRICS_COMMIT_LOG,
        subscription_scheduler_mode=SchedulingWatermarkMode.GLOBAL,
        subscription_scheduled_topic=Topic.SUBSCRIPTION_SCHEDULED_METRICS,
        subscription_result_topic=Topic.SUBSCRIPTION_RESULTS_METRICS,
        dead_letter_queue_policy_creator=produce_policy_creator,
        WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS)),
    ),
    ("start_ts", DateTime()),
    ("start_ns", UInt(32)),
    ("finish_ts", DateTime()),
    ("finish_ns", UInt(32)),
    ("duration_ms", UInt(32)),
    ("tags", Nested([("key", String()), ("value", String())])),
    ("_tags_hash_map", Materialized(Array(UInt(64)), TAGS_HASH_MAP_COLUMN)),
    ("retention_days", UInt(16)),
    ("deleted", UInt(8)),
])

schema = WritableTableSchema(
    columns=columns,
    local_table_name="spans_experimental_local",
    dist_table_name="spans_experimental_dist",
    storage_set_key=StorageSetKey.TRANSACTIONS,
)

storage = WritableTableStorage(
    storage_key=StorageKey.SPANS,
    storage_set_key=StorageSetKey.TRANSACTIONS,
    schema=schema,
    query_processors=[PrewhereProcessor()],
    stream_loader=KafkaStreamLoader(
        processor=SpansMessageProcessor(),
        default_topic="events",
    ),
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")],
)
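# The Materialized _tags_hash_map column above pairs with the
# MappingOptimizer("tags", "_tags_hash_map", ...) processors used by other
# storages in this section: TAGS_HASH_MAP_COLUMN is, roughly, an expression
# that hashes each "key=value" tag pair, so equality lookups on tags[...] can
# be rewritten into membership checks against the precomputed hash array
# instead of scanning the Nested tags column. The exact hash expression is
# defined elsewhere and not shown in these excerpts.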
columns = ColumnSet([
    # columns to maintain the dataset
    # Kafka topic offset
    ("offset", UInt(64)),
    ("record_deleted", UInt(8)),
    # PG columns
    ("project_id", UInt(64)),
    ("group_id", UInt(64)),
    ("date_added", DateTime(Modifiers(nullable=True))),
    ("user_id", UInt(64, Modifiers(nullable=True))),
    ("team_id", UInt(64, Modifiers(nullable=True))),
])

schema = WritableTableSchema(
    columns=columns,
    local_table_name="groupassignee_local",
    dist_table_name="groupassignee_dist",
    storage_set_key=StorageSetKey.CDC,
)

# (sic: the upstream Postgres table name really is spelled with one "s")
POSTGRES_TABLE = "sentry_groupasignee"

storage = CdcStorage(
    storage_key=StorageKey.GROUPASSIGNEES,
    storage_set_key=StorageSetKey.CDC,
    schema=schema,
    query_processors=[
        PrewhereProcessor(["project_id"]),
        ConsistencyEnforcerProcessor(),
    ],
    stream_loader=build_kafka_stream_loader_from_settings(
        processor=GroupAssigneeProcessor(POSTGRES_TABLE),
    get_tag_column_map,
    mandatory_conditions,
    prewhere_candidates,
    promoted_tag_columns,
    query_processors,
    query_splitters,
    required_columns,
)
from snuba.datasets.table_storage import build_kafka_stream_loader_from_settings

schema = WritableTableSchema(
    columns=all_columns,
    local_table_name="sentry_local",
    dist_table_name="sentry_dist",
    storage_set_key=StorageSetKey.EVENTS,
    mandatory_conditions=mandatory_conditions,
    prewhere_candidates=prewhere_candidates,
    part_format=[util.PartSegment.DATE, util.PartSegment.RETENTION_DAYS],
)

storage = WritableTableStorage(
    storage_key=StorageKey.EVENTS,
    storage_set_key=StorageSetKey.EVENTS,
    schema=schema,
    query_processors=query_processors,
    stream_loader=build_kafka_stream_loader_from_settings(
        StorageKey.EVENTS,
        processor=EventsProcessor(promoted_tag_columns),
        default_topic_name="events",
("environment", String(Modifiers(nullable=True))), ("platform", String()), ("trace_id", UUID()), ("transaction_name", String()), ("version_name", String()), ("version_code", String()), ] ) writable_columns = readable_columns + ColumnSet( [("retention_days", UInt(16)), ("partition", UInt(16)), ("offset", UInt(64))] ) writable_schema = WritableTableSchema( columns=writable_columns, local_table_name=PROFILES_LOCAL_TABLE_NAME, dist_table_name=PROFILES_DIST_TABLE_NAME, storage_set_key=StorageSetKey.PROFILES, ) writable_storage = WritableTableStorage( storage_key=StorageKey.PROFILES, storage_set_key=StorageSetKey.PROFILES, schema=writable_schema, query_processors=processors, mandatory_condition_checkers=[ OrgIdEnforcer("organization_id"), ProjectIdEnforcer(), ], stream_loader=loader, )
def process_delete_tag(
    message: ReplacementMessage,
    all_columns: Sequence[FlattenedColumn],
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
    use_promoted_prewhere: bool,
    schema: WritableTableSchema,
) -> Optional[Replacement]:
    tag = message.data["tag"]
    if not tag:
        return None

    assert isinstance(tag, str)
    timestamp = datetime.strptime(
        message.data["datetime"], settings.PAYLOAD_DATETIME_FORMAT
    )
    tag_column_name = tag_column_map["tags"].get(tag, tag)
    is_promoted = tag in promoted_tags["tags"]

    where = """\
        WHERE project_id = %(project_id)s
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    if is_promoted and use_promoted_prewhere:
        prewhere = " PREWHERE %(tag_column)s IS NOT NULL "
    else:
        prewhere = " PREWHERE has(`tags.key`, %(tag_str)s) "

    insert_query_template = (
        """\
        INSERT INTO %(table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(table_name)s FINAL
    """
        + prewhere
        + where
    )

    select_columns = []
    for col in all_columns:
        if is_promoted and col.flattened == tag_column_name:
            # The promoted tag columns of events are non-nullable, but those of
            # errors are nullable. We check the column against the schema
            # to determine whether to write an empty string or NULL.
            column_type = schema.get_data_source().get_columns().get(tag_column_name)
            assert column_type is not None
            is_nullable = column_type.type.has_modifier(Nullable)
            if is_nullable:
                select_columns.append("NULL")
            else:
                select_columns.append("''")
        elif col.flattened == "tags.key":
            select_columns.append(
                "arrayFilter(x -> (indexOf(`tags.key`, x) != indexOf(`tags.key`, %s)), `tags.key`)"
                % escape_string(tag)
            )
        elif col.flattened == "tags.value":
            select_columns.append(
                "arrayMap(x -> arrayElement(`tags.value`, x), arrayFilter(x -> x != indexOf(`tags.key`, %s), arrayEnumerate(`tags.value`)))"
                % escape_string(tag)
            )
        else:
            select_columns.append(col.escaped)

    all_column_names = [col.escaped for col in all_columns]
    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": message.data["project_id"],
        "tag_str": escape_string(tag),
        "tag_column": escape_identifier(tag_column_name),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    count_query_template = (
        """\
        SELECT count()
        FROM %(table_name)s FINAL
    """
        + prewhere
        + where
    )

    query_time_flags = (NEEDS_FINAL, message.data["project_id"])

    return LegacyReplacement(
        count_query_template,
        insert_query_template,
        query_args,
        query_time_flags,
        replacement_type=message.action_type,
        replacement_message_metadata=message.metadata,
    )
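# Illustration only: for a non-promoted tag "browser" on project 1, with an
# invented table name and timestamp, the count template above renders roughly
# as follows (the %(table_name)s slot is not in query_args and is presumably
# filled in when the replacement is executed):
#
#     SELECT count()
#     FROM errors_local FINAL
#     PREWHERE has(`tags.key`, 'browser')
#     WHERE project_id = 1
#     AND received <= CAST('2023-06-01 00:00:00' AS DateTime)
#     AND NOT deleted
#
# The INSERT template then rewrites every surviving row in place, dropping the
# deleted tag from `tags.key`/`tags.value` via the arrayFilter/arrayMap pair.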
from snuba.datasets.storages.transactions_common import (
    columns,
    mandatory_condition_checkers,
    query_processors,
    query_splitters,
)
from snuba.datasets.table_storage import build_kafka_stream_loader_from_settings
from snuba.datasets.transactions_processor import TransactionsMessageProcessor
from snuba.query.processors.tuple_unaliaser import TupleUnaliaser
from snuba.subscriptions.utils import SchedulingWatermarkMode
from snuba.utils.streams.topics import Topic

schema = WritableTableSchema(
    columns=columns,
    local_table_name="transactions_local",
    dist_table_name="transactions_dist",
    storage_set_key=StorageSetKey.TRANSACTIONS_V2,
    mandatory_conditions=[],
    part_format=[util.PartSegment.RETENTION_DAYS, util.PartSegment.DATE],
)

v2_query_processors = [*query_processors, TupleUnaliaser()]

storage = WritableTableStorage(
    storage_key=StorageKey.TRANSACTIONS_V2,
    storage_set_key=StorageSetKey.TRANSACTIONS_V2,
    schema=schema,
    query_processors=v2_query_processors,
    stream_loader=build_kafka_stream_loader_from_settings(
        processor=TransactionsMessageProcessor(),
        pre_filter=KafkaHeaderFilter("transaction_forwarder", "0"),
POST_VALUE_COLUMNS: Sequence[Column[SchemaModifiers]] = [
    Column("materialization_version", UInt(8)),
    Column("retention_days", UInt(16)),
    Column("partition", UInt(16)),
    Column("offset", UInt(64)),
]

sets_buckets = WritableTableStorage(
    storage_key=StorageKey.METRICS_BUCKETS,
    storage_set_key=StorageSetKey.METRICS,
    schema=WritableTableSchema(
        columns=ColumnSet([
            *PRE_VALUE_COLUMNS,
            Column("set_values", Array(UInt(64))),
            *POST_VALUE_COLUMNS,
        ]),
        local_table_name="metrics_buckets_local",
        dist_table_name="metrics_buckets_dist",
        storage_set_key=StorageSetKey.METRICS,
    ),
    query_processors=[],
    stream_loader=build_kafka_stream_loader_from_settings(
        processor=SetsMetricsProcessor(),
        default_topic=Topic.METRICS,
    ),
)

counters_buckets = WritableTableStorage(
    storage_key=StorageKey.METRICS_COUNTERS_BUCKETS,
    storage_set_key=StorageSetKey.METRICS,
    schema=WritableTableSchema(
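# The bucket storages differ only in their value column between the shared
# *PRE_VALUE_COLUMNS and *POST_VALUE_COLUMNS scaffolding. A hedged helper
# sketch capturing that shared shape (the function below is not in the
# source; table prefixes other than metrics_buckets are illustrative):
def _bucket_schema(
    value_column: Column[SchemaModifiers], table_prefix: str
) -> WritableTableSchema:
    # Same layout as the sets/counters bucket schemas above.
    return WritableTableSchema(
        columns=ColumnSet([*PRE_VALUE_COLUMNS, value_column, *POST_VALUE_COLUMNS]),
        local_table_name=f"{table_prefix}_local",
        dist_table_name=f"{table_prefix}_dist",
        storage_set_key=StorageSetKey.METRICS,
    )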
("_tags_hash_map", ReadOnly(Array(UInt(64)))), ("contexts", Nested([("key", String()), ("value", String())])), ("_contexts_flattened", String()), ("measurements", Nested([("key", String()), ("value", Float(64))]),), ("partition", UInt(16)), ("offset", UInt(64)), ("message_timestamp", DateTime()), ("retention_days", UInt(16)), ("deleted", UInt(8)), ] ) schema = WritableTableSchema( columns=columns, local_table_name="transactions_local", dist_table_name="transactions_dist", storage_set_key=StorageSetKey.TRANSACTIONS, mandatory_conditions=[], prewhere_candidates=["event_id", "transaction_name", "transaction", "title"], ) storage = WritableTableStorage( storage_key=StorageKey.TRANSACTIONS, storage_set_key=StorageSetKey.TRANSACTIONS, schema=schema, query_processors=[ MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"), TransactionColumnProcessor(), ArrayJoinKeyValueOptimizer("tags"), ArrayJoinKeyValueOptimizer("measurements"), PrewhereProcessor(),
("project_id", UInt(64)), ("id", UInt(64)), ("status", UInt(8, Modifiers(nullable=True))), ("last_seen", DateTime(Modifiers(nullable=True))), ("first_seen", DateTime(Modifiers(nullable=True))), ("active_at", DateTime(Modifiers(nullable=True))), ("first_release_id", UInt(64, Modifiers(nullable=True))), ]) schema = WritableTableSchema( columns=columns, local_table_name="groupedmessage_local", dist_table_name="groupedmessage_dist", storage_set_key=StorageSetKey.EVENTS, mandatory_conditions=[ binary_condition( ConditionFunctions.EQ, Column(None, None, "record_deleted"), Literal(None, 0), ), ], prewhere_candidates=["project_id", "id"], ) POSTGRES_TABLE = "sentry_groupedmessage" storage = CdcStorage( storage_key=StorageKey.GROUPEDMESSAGES, storage_set_key=StorageSetKey.EVENTS, schema=schema, query_processors=[], stream_loader=build_kafka_stream_loader_from_settings(
("user_id", String(Modifiers(nullable=True))), ("user_name", String(Modifiers(nullable=True))), ("user_email", String(Modifiers(nullable=True))), # sdk info ("sdk_name", String()), ("sdk_version", String()), ("tags", Nested([("key", String()), ("value", String())])), # deletion info ("retention_days", UInt(16)), ("partition", UInt(16)), ("offset", UInt(64)), ]) schema = WritableTableSchema( columns=columns, local_table_name=LOCAL_TABLE_NAME, dist_table_name=DIST_TABLE_NAME, storage_set_key=StorageSetKey.REPLAYS, ) # TODO: set up deadletter queue for bad messages. storage = WritableTableStorage( storage_key=StorageKey.REPLAYS, storage_set_key=StorageSetKey.REPLAYS, schema=schema, query_processors=[TableRateLimit()], mandatory_condition_checkers=[ProjectIdEnforcer()], stream_loader=build_kafka_stream_loader_from_settings( processor=ReplaysProcessor(), default_topic=Topic.REPLAYEVENTS, ),
("retention_days", UInt(16)), ("duration", UInt(32)), ("status", UInt(8)), ("errors", UInt(16)), ("received", DateTime()), ("started", DateTime()), ("release", String()), ("environment", String()), ("user_agent", String()), ("os", String()), ] ) raw_schema = WritableTableSchema( columns=all_columns, local_table_name=WRITE_LOCAL_TABLE_NAME, dist_table_name=WRITE_DIST_TABLE_NAME, storage_set_key=StorageSetKey.SESSIONS, ) read_columns = ColumnSet( [ ("org_id", UInt(64)), ("project_id", UInt(64)), ("started", DateTime()), ("release", String()), ("environment", String()), ("user_agent", String()), ("os", String()), ( "duration_quantiles", AggregateFunction("quantilesIf(0.5, 0.9)", [UInt(32), UInt(8)]),
("clickhouse_queries.is_duplicate", Array(UInt(8))), ("clickhouse_queries.consistent", Array(UInt(8))), ("clickhouse_queries.all_columns", Array(Array(String()))), ("clickhouse_queries.or_conditions", Array(UInt(8))), ("clickhouse_queries.where_columns", Array(Array(String()))), ("clickhouse_queries.where_mapping_columns", Array(Array(String()))), ("clickhouse_queries.groupby_columns", Array(Array(String()))), ("clickhouse_queries.array_join_columns", Array(Array(String()))), ]) # Note, we are using the simplified WritableTableSchema class here instead of # the MergeTreeSchema that corresponds to the actual table engine. This is because # the querylog table isn't generated by the old migration system. schema = WritableTableSchema( columns=columns, local_table_name="querylog_local", dist_table_name="querylog_dist", storage_set_key=StorageSetKey.QUERYLOG, ) storage = WritableTableStorage( storage_key=StorageKey.QUERYLOG, storage_set_key=StorageSetKey.QUERYLOG, schema=schema, query_processors=[], stream_loader=build_kafka_stream_loader_from_settings( processor=QuerylogProcessor(), default_topic=Topic.QUERYLOG, ), )
"level": "level", } schema = WritableTableSchema( columns=all_columns, local_table_name="errors_local", dist_table_name="errors_dist", storage_set_key=StorageSetKey.EVENTS, mandatory_conditions=[ MandatoryCondition( ("deleted", "=", 0), binary_condition( None, ConditionFunctions.EQ, Column(None, None, "deleted"), Literal(None, 0), ), ) ], prewhere_candidates=[ "event_id", "group_id", "tags[sentry:release]", "message", "environment", "project_id", ], ) required_columns = [ "event_id",
("project_id", UInt(64)), ("id", UInt(64)), ("status", UInt(8, Modifiers(nullable=True))), ("last_seen", DateTime(Modifiers(nullable=True))), ("first_seen", DateTime(Modifiers(nullable=True))), ("active_at", DateTime(Modifiers(nullable=True))), ("first_release_id", UInt(64, Modifiers(nullable=True))), ]) schema = WritableTableSchema( columns=columns, local_table_name="groupedmessage_local", dist_table_name="groupedmessage_dist", storage_set_key=StorageSetKey.CDC, mandatory_conditions=[ binary_condition( ConditionFunctions.EQ, Column(None, None, "record_deleted"), Literal(None, 0), ), ], ) POSTGRES_TABLE = "sentry_groupedmessage" storage = CdcStorage( storage_key=StorageKey.GROUPEDMESSAGES, storage_set_key=StorageSetKey.CDC, schema=schema, query_processors=[ PrewhereProcessor(["project_id", "id"]),