def __init__(
    self,
    storage_key: StorageKey,
    storage_set_key: StorageSetKey,
    schema: Schema,
    query_processors: Sequence[QueryProcessor],
    stream_loader: KafkaStreamLoader,
    query_splitters: Optional[Sequence[QuerySplitStrategy]] = None,
    mandatory_condition_checkers: Optional[Sequence[ConditionChecker]] = None,
    replacer_processor: Optional[ReplacerProcessor[Any]] = None,
    writer_options: ClickhouseWriterOptions = None,
) -> None:
    super().__init__(
        storage_key,
        storage_set_key,
        schema,
        query_processors,
        query_splitters,
        mandatory_condition_checkers,
    )
    assert isinstance(schema, WritableTableSchema)
    self.__table_writer = TableWriter(
        storage_set=storage_set_key,
        write_schema=schema,
        stream_loader=stream_loader,
        replacer_processor=replacer_processor,
        writer_options=writer_options,
    )
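# A minimal sketch of how the constructor above might be invoked. Everything
# below is illustrative: the key members, `example_schema`, `example_loader`,
# and the processor list are placeholders, not names taken from this code.
storage = WritableTableStorage(
    storage_key=StorageKey.EXAMPLE,         # hypothetical StorageKey member
    storage_set_key=StorageSetKey.EXAMPLE,  # hypothetical StorageSetKey member
    schema=example_schema,                  # must be a WritableTableSchema
    query_processors=[PrewhereProcessor()],
    stream_loader=example_loader,           # a KafkaStreamLoader
)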
def build_batch_writer(
    table_writer: TableWriter,
    metrics: MetricsBackend,
    replacements_producer: Optional[ConfluentKafkaProducer] = None,
    replacements_topic: Optional[Topic] = None,
) -> Callable[[], ProcessedMessageBatchWriter]:
    # The replacements producer and topic must be provided together or not at all.
    assert not (replacements_producer is None) ^ (replacements_topic is None)
    supports_replacements = replacements_producer is not None

    writer = table_writer.get_batch_writer(
        metrics,
        {"load_balancing": "in_order", "insert_distributed_sync": 1},
    )

    def build_writer() -> ProcessedMessageBatchWriter:
        insert_batch_writer = InsertBatchWriter(
            writer, MetricsWrapper(metrics, "insertions")
        )

        replacement_batch_writer: Optional[ReplacementBatchWriter]
        if supports_replacements:
            assert replacements_producer is not None
            assert replacements_topic is not None
            replacement_batch_writer = ReplacementBatchWriter(
                replacements_producer, replacements_topic
            )
        else:
            replacement_batch_writer = None

        return ProcessedMessageBatchWriter(
            insert_batch_writer, replacement_batch_writer
        )

    return build_writer
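# A minimal sketch of wiring up the factory above, assuming a consumer-side
# `metrics_backend`, an optional Kafka `producer`, and a `replacements_topic`
# that are not defined in this snippet (all placeholder names are assumptions):
writer_factory = build_batch_writer(
    table_writer=table_writer,
    metrics=metrics_backend,
    replacements_producer=producer,          # provide both or neither
    replacements_topic=replacements_topic,
)

# A fresh ProcessedMessageBatchWriter is built for each processed batch.
batch_writer = writer_factory()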
("max_threads", UInt(8)), ("num_days", UInt(32)), ("clickhouse_table", LowCardinality(String())), ("query_id", String()), ("is_duplicate", UInt(8)), ("consistent", UInt(8)), ]), ), ]) schema = MergeTreeSchema( columns=columns, local_table_name="querylog_local", dist_table_name="querylog_dist", order_by="(toStartOfDay(timestamp), request_id)", partition_by="(toMonday(timestamp))", sample_expr="request_id", ) storage = WritableTableStorage( schemas=StorageSchemas(read_schema=schema, write_schema=schema), table_writer=TableWriter( write_schema=schema, stream_loader=KafkaStreamLoader( processor=QuerylogProcessor(), default_topic=settings.QUERIES_TOPIC, ), ), query_processors=[], )
def __init__(self) -> None:
    write_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", Nullable(UInt(64))),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(Nullable(String()))),
        ("event_id", Nullable(UUID())),
    ])

    write_schema = MergeTreeSchema(
        columns=write_columns,
        # TODO: change to outcomes.raw_local when we add multi DB support
        local_table_name=WRITE_LOCAL_TABLE_NAME,
        dist_table_name=WRITE_DIST_TABLE_NAME,
        order_by="(org_id, project_id, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 16384},
    )

    read_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", UInt(64)),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(String())),
        ("times_seen", UInt(64)),
    ])

    read_schema = SummingMergeTreeSchema(
        columns=read_columns,
        local_table_name=READ_LOCAL_TABLE_NAME,
        dist_table_name=READ_DIST_TABLE_NAME,
        order_by="(org_id, project_id, key_id, outcome, reason, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 256},
    )

    materialized_view_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", UInt(64)),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", String()),
        ("times_seen", UInt(64)),
    ])

    # TODO: Find a better way to specify a query for a materialized view
    # The problem right now is that we have a way to define our columns in a
    # ColumnSet abstraction but the query doesn't use it.
    query = """
        SELECT
            org_id,
            project_id,
            ifNull(key_id, 0) AS key_id,
            toStartOfHour(timestamp) AS timestamp,
            outcome,
            ifNull(reason, 'none') AS reason,
            count() AS times_seen
        FROM %(source_table_name)s
        GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
    """

    materialized_view = MaterializedViewSchema(
        local_materialized_view_name="outcomes_mv_hourly_local",
        dist_materialized_view_name="outcomes_mv_hourly_dist",
        prewhere_candidates=["project_id", "org_id"],
        columns=materialized_view_columns,
        query=query,
        local_source_table_name=WRITE_LOCAL_TABLE_NAME,
        local_destination_table_name=READ_LOCAL_TABLE_NAME,
        dist_source_table_name=WRITE_DIST_TABLE_NAME,
        dist_destination_table_name=READ_DIST_TABLE_NAME,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=write_schema,
        intermediary_schemas=[materialized_view],
    )

    table_writer = TableWriter(
        write_schema=write_schema,
        stream_loader=KafkaStreamLoader(
            processor=OutcomesProcessor(),
            default_topic="outcomes",
        ),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={"time": "timestamp"},
        time_parse_columns=("timestamp",),
    )
    ]
    for col in get_promoted_columns()
}

storage = WritableTableStorage(
    schemas=StorageSchemas(read_schema=schema, write_schema=schema),
    table_writer=TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=EventsProcessor(promoted_tag_columns),
            default_topic="events",
            replacement_topic="event-replacements",
            commit_log_topic="snuba-commit-log",
        ),
        replacer_processor=ErrorsReplacer(
            write_schema=schema,
            read_schema=schema,
            required_columns=[col.escaped for col in required_columns],
            tag_column_map=get_tag_column_map(),
            promoted_tags=get_promoted_tags(),
            state_name=ReplacerState.EVENTS,
        ),
    ),
    query_processors=[
        # TODO: This one should become an entirely separate storage and be
        # picked in the storage selector.
        ReadOnlyTableSelector("sentry_dist", "sentry_dist_ro"),
        EventsColumnProcessor(),
        PrewhereProcessor(),
    ],
def __init__(self):
    metadata_columns = ColumnSet([
        # optional stream related data
        ('offset', Nullable(UInt(64))),
        ('partition', Nullable(UInt(16))),
    ])

    promoted_tag_columns = ColumnSet([
        # These are the classic tags; they are saved in Snuba exactly as they
        # appear in the event body.
        ('level', Nullable(String())),
        ('logger', Nullable(String())),
        ('server_name', Nullable(String())),
        # future name: device_id?
        ('transaction', Nullable(String())),
        ('environment', Nullable(String())),
        ('sentry:release', Nullable(String())),
        ('sentry:dist', Nullable(String())),
        ('sentry:user', Nullable(String())),
        ('site', Nullable(String())),
        ('url', Nullable(String())),
    ])

    promoted_context_tag_columns = ColumnSet([
        # These are promoted tags that come in via `tags`, but are more closely
        # related to contexts. To avoid naming confusion with Clickhouse nested
        # columns, they are stored in the database with s/./_/
        # promoted tags
        ('app_device', Nullable(String())),
        ('device', Nullable(String())),
        ('device_family', Nullable(String())),
        ('runtime', Nullable(String())),
        ('runtime_name', Nullable(String())),
        ('browser', Nullable(String())),
        ('browser_name', Nullable(String())),
        ('os', Nullable(String())),
        ('os_name', Nullable(String())),
        ('os_rooted', Nullable(UInt(8))),
    ])

    promoted_context_columns = ColumnSet([
        ('os_build', Nullable(String())),
        ('os_kernel_version', Nullable(String())),
        ('device_name', Nullable(String())),
        ('device_brand', Nullable(String())),
        ('device_locale', Nullable(String())),
        ('device_uuid', Nullable(String())),
        ('device_model_id', Nullable(String())),
        ('device_arch', Nullable(String())),
        ('device_battery_level', Nullable(Float(32))),
        ('device_orientation', Nullable(String())),
        ('device_simulator', Nullable(UInt(8))),
        ('device_online', Nullable(UInt(8))),
        ('device_charging', Nullable(UInt(8))),
    ])

    required_columns = ColumnSet([
        ('event_id', FixedString(32)),
        ('project_id', UInt(64)),
        ('group_id', UInt(64)),
        ('timestamp', DateTime()),
        ('deleted', UInt(8)),
        ('retention_days', UInt(16)),
    ])

    all_columns = required_columns + [
        # required for non-deleted
        ('platform', Nullable(String())),
        ('message', Nullable(String())),
        ('primary_hash', Nullable(FixedString(32))),
        ('received', Nullable(DateTime())),
        ('search_message', Nullable(String())),
        ('title', Nullable(String())),
        ('location', Nullable(String())),

        # optional user
        ('user_id', Nullable(String())),
        ('username', Nullable(String())),
        ('email', Nullable(String())),
        ('ip_address', Nullable(String())),

        # optional geo
        ('geo_country_code', Nullable(String())),
        ('geo_region', Nullable(String())),
        ('geo_city', Nullable(String())),

        ('sdk_name', Nullable(String())),
        ('sdk_version', Nullable(String())),
        ('type', Nullable(String())),
        ('version', Nullable(String())),
    ] + metadata_columns \
        + promoted_context_columns \
        + promoted_tag_columns \
        + promoted_context_tag_columns \
        + [
        # other tags
        ('tags', Nested([
            ('key', String()),
            ('value', String()),
        ])),

        # other context
        ('contexts', Nested([
            ('key', String()),
            ('value', String()),
        ])),

        # http interface
        ('http_method', Nullable(String())),
        ('http_referer', Nullable(String())),

        # exception interface
        ('exception_stacks', Nested([
            ('type', Nullable(String())),
            ('value', Nullable(String())),
            ('mechanism_type', Nullable(String())),
            ('mechanism_handled', Nullable(UInt(8))),
        ])),
        ('exception_frames', Nested([
            ('abs_path', Nullable(String())),
            ('filename', Nullable(String())),
            ('package', Nullable(String())),
            ('module', Nullable(String())),
            ('function', Nullable(String())),
            ('in_app', Nullable(UInt(8))),
            ('colno', Nullable(UInt(32))),
            ('lineno', Nullable(UInt(32))),
            ('stack_level', UInt(16)),
        ])),

        # These are columns we added later in the life of the (current) production
        # database. They don't necessarily belong here in a logical/readability sense
        # but they are here to match the order of columns in production because
        # `insert_distributed_sync` is very sensitive to column existence and ordering.
        ('culprit', Nullable(String())),
        ('sdk_integrations', Array(String())),
        ('modules', Nested([
            ('name', String()),
            ('version', String()),
        ])),
    ]

    sample_expr = 'cityHash64(toString(event_id))'

    schema = ReplacingMergeTreeSchema(
        columns=all_columns,
        local_table_name='sentry_local',
        dist_table_name='sentry_dist',
        mandatory_conditions=[('deleted', '=', 0)],
        order_by='(project_id, toStartOfDay(timestamp), %s)' % sample_expr,
        partition_by='(toMonday(timestamp), if(equals(retention_days, 30), 30, 90))',
        version_column='deleted',
        sample_expr=sample_expr,
        migration_function=events_migrations,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    table_writer = TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=EventsProcessor(promoted_tag_columns),
            default_topic="events",
            replacement_topic="event-replacements",
            commit_log_topic="snuba-commit-log",
        ),
    )

    super(EventsDataset, self).__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={
            'time': 'timestamp',
            'rtime': 'received',
        },
        time_parse_columns=('timestamp', 'received'),
    )

    self.__metadata_columns = metadata_columns
    self.__promoted_tag_columns = promoted_tag_columns
    self.__promoted_context_tag_columns = promoted_context_tag_columns
    self.__promoted_context_columns = promoted_context_columns
    self.__required_columns = required_columns

    self.__tags_processor = TagColumnProcessor(
        columns=all_columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )
    dist_materialized_view_name="outcomes_mv_hourly_dist",
    prewhere_candidates=["project_id", "org_id"],
    columns=materialized_view_columns,
    query=query,
    local_source_table_name=WRITE_LOCAL_TABLE_NAME,
    local_destination_table_name=READ_LOCAL_TABLE_NAME,
    dist_source_table_name=WRITE_DIST_TABLE_NAME,
    dist_destination_table_name=READ_DIST_TABLE_NAME,
)

raw_storage = WritableTableStorage(
    schemas=StorageSchemas(read_schema=raw_schema, write_schema=raw_schema),
    table_writer=TableWriter(
        write_schema=raw_schema,
        stream_loader=KafkaStreamLoader(
            processor=OutcomesProcessor(),
            default_topic="outcomes",
        ),
    ),
    query_processors=[],
)

materialized_storage = ReadableTableStorage(
    schemas=StorageSchemas(
        read_schema=read_schema,
        write_schema=None,
        intermediary_schemas=[materialized_view_schema],
    ),
    query_processors=[PrewhereProcessor()],
)
"deleted", "retention_days", ] storage = WritableTableStorage( schemas=StorageSchemas(read_schema=schema, write_schema=schema), table_writer=TableWriter( write_schema=schema, stream_loader=KafkaStreamLoader( processor=ErrorsProcessor(promoted_tag_columns), default_topic="events", replacement_topic="errors-replacements", ), replacer_processor=ErrorsReplacer( write_schema=schema, read_schema=schema, required_columns=required_columns, tag_column_map={ "tags": promoted_tag_columns, "contexts": {}, }, promoted_tags={ "tags": promoted_tag_columns.keys(), "contexts": {}, }, state_name=ReplacerState.ERRORS, ), ), query_processors=[PrewhereProcessor()], )
def __init__(self) -> None:
    metadata_columns = ColumnSet([
        # optional stream related data
        ("offset", Nullable(UInt(64))),
        ("partition", Nullable(UInt(16))),
    ])

    promoted_tag_columns = ColumnSet([
        # These are the classic tags; they are saved in Snuba exactly as they
        # appear in the event body.
        ("level", Nullable(String())),
        ("logger", Nullable(String())),
        ("server_name", Nullable(String())),
        # future name: device_id?
        ("transaction", Nullable(String())),
        ("environment", Nullable(String())),
        ("sentry:release", Nullable(String())),
        ("sentry:dist", Nullable(String())),
        ("sentry:user", Nullable(String())),
        ("site", Nullable(String())),
        ("url", Nullable(String())),
    ])

    promoted_context_tag_columns = ColumnSet([
        # These are promoted tags that come in via `tags`, but are more closely
        # related to contexts. To avoid naming confusion with Clickhouse nested
        # columns, they are stored in the database with s/./_/
        # promoted tags
        ("app_device", Nullable(String())),
        ("device", Nullable(String())),
        ("device_family", Nullable(String())),
        ("runtime", Nullable(String())),
        ("runtime_name", Nullable(String())),
        ("browser", Nullable(String())),
        ("browser_name", Nullable(String())),
        ("os", Nullable(String())),
        ("os_name", Nullable(String())),
        ("os_rooted", Nullable(UInt(8))),
    ])

    promoted_context_columns = ColumnSet([
        ("os_build", Nullable(String())),
        ("os_kernel_version", Nullable(String())),
        ("device_name", Nullable(String())),
        ("device_brand", Nullable(String())),
        ("device_locale", Nullable(String())),
        ("device_uuid", Nullable(String())),
        ("device_model_id", Nullable(String())),
        ("device_arch", Nullable(String())),
        ("device_battery_level", Nullable(Float(32))),
        ("device_orientation", Nullable(String())),
        ("device_simulator", Nullable(UInt(8))),
        ("device_online", Nullable(UInt(8))),
        ("device_charging", Nullable(UInt(8))),
    ])

    required_columns = ColumnSet([
        ("event_id", FixedString(32)),
        ("project_id", UInt(64)),
        ("group_id", UInt(64)),
        ("timestamp", DateTime()),
        ("deleted", UInt(8)),
        ("retention_days", UInt(16)),
    ])

    all_columns = (
        required_columns
        + [
            # required for non-deleted
            ("platform", Nullable(String())),
            ("message", Nullable(String())),
            ("primary_hash", Nullable(FixedString(32))),
            ("received", Nullable(DateTime())),
            ("search_message", Nullable(String())),
            ("title", Nullable(String())),
            ("location", Nullable(String())),
            # optional user
            ("user_id", Nullable(String())),
            ("username", Nullable(String())),
            ("email", Nullable(String())),
            ("ip_address", Nullable(String())),
            # optional geo
            ("geo_country_code", Nullable(String())),
            ("geo_region", Nullable(String())),
            ("geo_city", Nullable(String())),
            ("sdk_name", Nullable(String())),
            ("sdk_version", Nullable(String())),
            ("type", Nullable(String())),
            ("version", Nullable(String())),
        ]
        + metadata_columns
        + promoted_context_columns
        + promoted_tag_columns
        + promoted_context_tag_columns
        + [
            # other tags
            ("tags", Nested([("key", String()), ("value", String())])),
            ("_tags_flattened", String()),
            # other context
            ("contexts", Nested([("key", String()), ("value", String())])),
            # http interface
            ("http_method", Nullable(String())),
            ("http_referer", Nullable(String())),
            # exception interface
            (
                "exception_stacks",
                Nested([
                    ("type", Nullable(String())),
                    ("value", Nullable(String())),
                    ("mechanism_type", Nullable(String())),
                    ("mechanism_handled", Nullable(UInt(8))),
                ]),
            ),
            (
                "exception_frames",
                Nested([
                    ("abs_path", Nullable(String())),
                    ("filename", Nullable(String())),
                    ("package", Nullable(String())),
                    ("module", Nullable(String())),
                    ("function", Nullable(String())),
                    ("in_app", Nullable(UInt(8))),
                    ("colno", Nullable(UInt(32))),
                    ("lineno", Nullable(UInt(32))),
                    ("stack_level", UInt(16)),
                ]),
            ),
            # These are columns we added later in the life of the (current) production
            # database. They don't necessarily belong here in a logical/readability sense
            # but they are here to match the order of columns in production because
            # `insert_distributed_sync` is very sensitive to column existence and ordering.
            ("culprit", Nullable(String())),
            ("sdk_integrations", Array(String())),
            ("modules", Nested([("name", String()), ("version", String())])),
        ]
    )

    sample_expr = "cityHash64(toString(event_id))"

    schema = ReplacingMergeTreeSchema(
        columns=all_columns,
        local_table_name="sentry_local",
        dist_table_name="sentry_dist",
        mandatory_conditions=[("deleted", "=", 0)],
        prewhere_candidates=[
            "event_id",
            "group_id",
            "tags[sentry:release]",
            "message",
            "environment",
            "project_id",
        ],
        order_by="(project_id, toStartOfDay(timestamp), %s)" % sample_expr,
        partition_by="(toMonday(timestamp), if(equals(retention_days, 30), 30, 90))",
        version_column="deleted",
        sample_expr=sample_expr,
        migration_function=events_migrations,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    table_writer = TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=EventsProcessor(promoted_tag_columns),
            default_topic="events",
            replacement_topic="event-replacements",
            commit_log_topic="snuba-commit-log",
        ),
    )

    super(EventsDataset, self).__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={"time": "timestamp", "rtime": "received"},
        time_parse_columns=("timestamp", "received"),
    )

    self.__metadata_columns = metadata_columns
    self.__promoted_tag_columns = promoted_tag_columns
    self.__promoted_context_tag_columns = promoted_context_tag_columns
    self.__promoted_context_columns = promoted_context_columns
    self.__required_columns = required_columns

    self.__tags_processor = TagColumnProcessor(
        columns=all_columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )
def __init__(self) -> None:
    all_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("timestamp", DateTime()),
        ("event_id", WithCodecs(UUID(), ["NONE"])),
        (
            "event_hash",
            WithCodecs(
                Materialized(UInt(64), "cityHash64(toString(event_id))"),
                ["NONE"],
            ),
        ),
        ("platform", LowCardinality(String())),
        ("environment", LowCardinality(Nullable(String()))),
        ("release", LowCardinality(Nullable(String()))),
        ("dist", LowCardinality(Nullable(String()))),
        ("ip_address_v4", Nullable(IPv4())),
        ("ip_address_v6", Nullable(IPv6())),
        ("user", WithDefault(String(), "''")),
        ("user_hash", Materialized(UInt(64), "cityHash64(user)")),
        ("user_id", Nullable(String())),
        ("user_name", Nullable(String())),
        ("user_email", Nullable(String())),
        ("sdk_name", LowCardinality(Nullable(String()))),
        ("sdk_version", LowCardinality(Nullable(String()))),
        ("tags", Nested([("key", String()), ("value", String())])),
        ("_tags_flattened", String()),
        ("contexts", Nested([("key", String()), ("value", String())])),
        ("_contexts_flattened", String()),
        ("transaction_name", WithDefault(LowCardinality(String()), "''")),
        ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)")),
        ("span_id", Nullable(UInt(64))),
        ("trace_id", Nullable(UUID())),
        ("partition", UInt(16)),
        ("offset", WithCodecs(UInt(64), ["DoubleDelta", "LZ4"])),
        ("retention_days", UInt(16)),
        ("deleted", UInt(8)),
        ("group_id", UInt(64)),
        ("primary_hash", FixedString(32)),
        ("primary_hash_hex", Materialized(UInt(64), "hex(primary_hash)")),
        ("event_string", WithCodecs(String(), ["NONE"])),
        ("received", DateTime()),
        ("message", String()),
        ("title", String()),
        ("culprit", String()),
        ("level", LowCardinality(String())),
        ("location", Nullable(String())),
        ("version", LowCardinality(Nullable(String()))),
        ("type", LowCardinality(String())),
        (
            "exception_stacks",
            Nested([
                ("type", Nullable(String())),
                ("value", Nullable(String())),
                ("mechanism_type", Nullable(String())),
                ("mechanism_handled", Nullable(UInt(8))),
            ]),
        ),
        (
            "exception_frames",
            Nested([
                ("abs_path", Nullable(String())),
                ("colno", Nullable(UInt(32))),
                ("filename", Nullable(String())),
                ("function", Nullable(String())),
                ("lineno", Nullable(UInt(32))),
                ("in_app", Nullable(UInt(8))),
                ("package", Nullable(String())),
                ("module", Nullable(String())),
                ("stack_level", Nullable(UInt(16))),
            ]),
        ),
        ("sdk_integrations", Array(String())),
        ("modules", Nested([("name", String()), ("version", String())])),
    ])

    self.__promoted_tag_columns = {
        "environment": "environment",
        "sentry:release": "release",
        "sentry:dist": "dist",
        "sentry:user": "user",
        "transaction": "transaction_name",
        "level": "level",
    }

    schema = ReplacingMergeTreeSchema(
        columns=all_columns,
        local_table_name="errors_local",
        dist_table_name="errors_dist",
        mandatory_conditions=[("deleted", "=", 0)],
        prewhere_candidates=[
            "event_id",
            "group_id",
            "tags[sentry:release]",
            "message",
            "environment",
            "project_id",
        ],
        order_by="(org_id, project_id, toStartOfDay(timestamp), primary_hash_hex, event_hash)",
        partition_by="(toMonday(timestamp), if(retention_days = 30, 30, 90))",
        version_column="deleted",
        sample_expr="event_hash",
        ttl_expr="timestamp + toIntervalDay(retention_days)",
        settings={"index_granularity": "8192"},
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    table_writer = TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=ErrorsProcessor(self.__promoted_tag_columns),
            default_topic="events",
        ),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={"time": "timestamp", "rtime": "received"},
        time_parse_columns=("timestamp", "received"),
    )

    self.__tags_processor = TagColumnProcessor(
        columns=all_columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )