def run_optimize(
    clickhouse: ClickhousePool,
    storage: ReadableTableStorage,
    database: str,
    before: Optional[datetime] = None,
    ignore_cutoff: bool = False,
    parallel: int = 1,
    clickhouse_host: Optional[str] = None,
) -> int:
    start = time.time()
    schema = storage.get_schema()
    assert isinstance(schema, TableSchema)
    table = schema.get_local_table_name()
    database = storage.get_cluster().get_database()

    parts = get_partitions_to_optimize(clickhouse, storage, database, table, before)
    optimize_partition_runner(
        clickhouse, database, table, parts, ignore_cutoff, parallel, clickhouse_host
    )

    metrics.timing(
        "optimized_all_parts",
        time.time() - start,
        tags=_get_metrics_tags(table, clickhouse_host),
    )
    return len(parts)
def run_optimize(
    clickhouse: ClickhousePool,
    storage: ReadableTableStorage,
    database: str,
    before: Optional[datetime] = None,
) -> int:
    schema = storage.get_schema()
    assert isinstance(schema, TableSchema)
    table = schema.get_local_table_name()
    database = storage.get_cluster().get_database()

    parts = get_partitions_to_optimize(clickhouse, storage, database, table, before)
    optimize_partitions(clickhouse, database, table, parts)
    return len(parts)
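# Minimal usage sketch for run_optimize above, under stated assumptions:
# `clickhouse` is an already-constructed ClickhousePool and `storage` a
# ReadableTableStorage resolved elsewhere; the database name and cutoff are
# illustrative placeholders, not values taken from these snippets.
from datetime import datetime, timedelta

cutoff = datetime.utcnow() - timedelta(days=1)
optimized_count = run_optimize(
    clickhouse,
    storage,
    database="default",  # assumed; run_optimize re-derives it from the storage's cluster
    before=cutoff,
)
print(f"ran OPTIMIZE on {optimized_count} partitions")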
storage = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema,
    query_processors=[
        MappingColumnPromoter(
            mapping_specs={
                "tags": {
                    "environment": "environment",
                    "sentry:release": "release",
                    "sentry:dist": "dist",
                    "sentry:user": "user",
                },
                "contexts": {"trace.trace_id": "trace_id"},
            }
        ),
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        ArrayJoinKeyValueOptimizer("tags"),
        UUIDColumnProcessor(set(["event_id", "trace_id"])),
        EventsBooleanContextsProcessor(),
        PrewhereProcessor(
            [
                "event_id",
                "release",
                "message",
                "transaction_name",
                "environment",
                "project_id",
            ]
        ),
    ],
    query_splitters=[
        ColumnSplitQueryStrategy(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="timestamp",
        ),
        TimeSplitQueryStrategy(timestamp_col="timestamp"),
    ],
)
])

schema = TableSchema(
    columns=columns,
    local_table_name="discover_local",
    dist_table_name="discover_dist",
    storage_set_key=StorageSetKey.DISCOVER,
    mandatory_conditions=mandatory_conditions,
    prewhere_candidates=[
        "event_id",
        "release",
        "message",
        "transaction_name",
        "environment",
        "project_id",
    ],
)

storage = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema,
    query_processors=[
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        EventIdColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        PrewhereProcessor(),
    ],
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="timestamp")],
)
    dist_table_name=READ_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.SESSIONS,
)

materialized_view_schema = TableSchema(
    local_table_name=READ_LOCAL_MV_NAME,
    dist_table_name=READ_DIST_MV_NAME,
    storage_set_key=StorageSetKey.SESSIONS,
    columns=read_columns,
)

# The raw table we write onto, and that potentially we could
# query.
raw_storage = WritableTableStorage(
    storage_key=StorageKey.SESSIONS_RAW,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=raw_schema,
    query_processors=[],
    stream_loader=build_kafka_stream_loader_from_settings(
        StorageKey.SESSIONS_RAW,
        processor=SessionsProcessor(),
        default_topic_name="ingest-sessions",
    ),
)

# The materialized view we query aggregate data from.
materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.SESSIONS_HOURLY,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[PrewhereProcessor(["project_id", "org_id"])],
)
    Column("granularity", UInt(8)),
    Column("timestamp", DateTime()),
    Column("retention_days", UInt(16)),
    Column(
        "tags",
        Nested(
            [("key", UInt(64)), ("indexed_value", UInt(64)), ("raw_value", String())]
        ),
    ),
    Column("_raw_tags_hash", Array(UInt(64), SchemaModifiers(readonly=True))),
    Column("_indexed_tags_hash", Array(UInt(64), SchemaModifiers(readonly=True))),
]

sets_storage = ReadableTableStorage(
    storage_key=StorageKey.GENERIC_METRICS_SETS,
    storage_set_key=StorageSetKey.GENERIC_METRICS_SETS,
    schema=TableSchema(
        local_table_name="generic_metrics_sets_local",
        dist_table_name="generic_metrics_sets_dist",
        storage_set_key=StorageSetKey.GENERIC_METRICS_SETS,
        columns=ColumnSet(
            [
                *aggregated_columns,
                Column("value", AggregateFunction("uniqCombined64", [UInt(64)])),
            ]
        ),
    ),
    query_processors=[
        ArrayJoinKeyValueOptimizer("tags"),
        TableRateLimit(),
    ],
)
def get_partitions_to_optimize(
    clickhouse: ClickhousePool,
    storage: ReadableTableStorage,
    database: str,
    table: str,
    before: Optional[datetime] = None,
) -> Sequence[util.Part]:
    engine = clickhouse.execute(
        """
        SELECT engine
        FROM system.tables
        WHERE (database = %(database)s) AND (name = %(table)s)
        """,
        {"database": database, "table": table},
    )

    if not engine:
        logger.warning(
            "Table %s.%s doesn't exist on %s:%s"
            % (database, table, clickhouse.host, clickhouse.port)
        )
        return []

    if engine[0][0].startswith("Replicated"):
        is_leader = clickhouse.execute(
            """
            SELECT is_leader
            FROM system.replicas
            WHERE (database = %(database)s) AND (table = %(table)s)
            """,
            {"database": database, "table": table},
        )
        # response: [(0,)] for non-leader or [(1,)] for leader
        if not (len(is_leader) == 1 and is_leader[0][0]):
            return []

    active_parts = clickhouse.execute(
        """
        SELECT partition, count() AS c
        FROM system.parts
        WHERE active
        AND database = %(database)s
        AND table = %(table)s
        GROUP BY partition
        HAVING c > 1
        ORDER BY c DESC, partition
        """,
        {"database": database, "table": table},
    )

    schema = storage.get_schema()
    assert isinstance(schema, TableSchema)
    part_format = schema.get_part_format()
    assert part_format is not None

    parts = [util.decode_part_str(part, part_format) for part, count in active_parts]

    if before:
        parts = [
            p for p in parts if (p.date + timedelta(days=6 - p.date.weekday())) < before
        ]

    return parts
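# Worked example of the cutoff filter above (illustrative assumption, not taken
# from the snippets): a part whose partition date is Wednesday 2022-03-09 has
# weekday() == 2, so timedelta(days=6 - 2) pushes it to Sunday 2022-03-13. The
# part is kept for optimization only when that end-of-week date is still before
# the `before` cutoff, i.e. the partition's week has fully elapsed.
from datetime import datetime, timedelta

part_date = datetime(2022, 3, 9)  # assumed partition date for illustration
end_of_week = part_date + timedelta(days=6 - part_date.weekday())
assert end_of_week == datetime(2022, 3, 13)
assert end_of_week < datetime(2022, 3, 14)        # cutoff past the week's end -> part is optimized
assert not (end_of_week < datetime(2022, 3, 13))  # cutoff not yet past the week -> part is skipped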
materialized_view_schema = TableSchema(
    local_table_name="outcomes_mv_hourly_local",
    dist_table_name="outcomes_mv_hourly_dist",
    storage_set_key=StorageSetKey.OUTCOMES,
    columns=materialized_view_columns,
)

raw_storage = WritableTableStorage(
    storage_key=StorageKey.OUTCOMES_RAW,
    storage_set_key=StorageSetKey.OUTCOMES,
    schema=raw_schema,
    query_processors=[TableRateLimit()],
    mandatory_condition_checkers=[OrgIdEnforcer()],
    stream_loader=build_kafka_stream_loader_from_settings(
        processor=OutcomesProcessor(),
        default_topic=Topic.OUTCOMES,
    ),
)

materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.OUTCOMES_HOURLY,
    storage_set_key=StorageSetKey.OUTCOMES,
    schema=read_schema,
    query_processors=[
        PrewhereProcessor(["project_id", "org_id"]),
        TableRateLimit(),
    ],
    mandatory_condition_checkers=[OrgIdEnforcer()],
)
    ("reason", String()),
    ("times_seen", UInt(64)),
])

materialized_view_schema = TableSchema(
    local_table_name="outcomes_mv_hourly_local",
    dist_table_name="outcomes_mv_hourly_dist",
    storage_set_key=StorageSetKey.OUTCOMES,
    prewhere_candidates=["project_id", "org_id"],
    columns=materialized_view_columns,
)

raw_storage = WritableTableStorage(
    storage_key=StorageKey.OUTCOMES_RAW,
    storage_set_key=StorageSetKey.OUTCOMES,
    schema=raw_schema,
    query_processors=[],
    stream_loader=build_kafka_stream_loader_from_settings(
        StorageKey.OUTCOMES_RAW,
        processor=OutcomesProcessor(),
        default_topic_name="outcomes",
    ),
)

materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.OUTCOMES_HOURLY,
    storage_set_key=StorageSetKey.OUTCOMES,
    schema=read_schema,
    query_processors=[PrewhereProcessor()],
)
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.schemas.tables import TableSchema
from snuba.datasets.storage import ReadableTableStorage
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.events_common import (
    all_columns,
    mandatory_conditions,
    query_processors,
    query_splitters,
)

schema = TableSchema(
    columns=all_columns,
    local_table_name="sentry_local",
    dist_table_name="sentry_dist_ro",
    storage_set_key=StorageSetKey.EVENTS_RO,
    mandatory_conditions=mandatory_conditions,
)

storage = ReadableTableStorage(
    storage_key=StorageKey.EVENTS_RO,
    storage_set_key=StorageSetKey.EVENTS_RO,
    schema=schema,
    query_processors=query_processors,
    query_splitters=query_splitters,
)
    dist_table_name="discover_dist",
    storage_set_key=StorageSetKey.DISCOVER,
    mandatory_conditions=[],
)

schema2 = TableSchema(
    columns=columns2,
    local_table_name="discover_local",
    dist_table_name="discover_dist",
    storage_set_key=StorageSetKey.DISCOVER,
    mandatory_conditions=[],
)

Storage1 = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema1,
)

Storage2 = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema2,
)

merged_columns = ColumnSet(
    [
        ("timestamp", DateTime()),
        ("mismatched1", String(Modifiers(nullable=True))),
        ("mismatched2", String(Modifiers(nullable=True))),
    ]
)
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.schemas.tables import TableSchema
from snuba.datasets.storage import ReadableTableStorage
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.transactions_common import (
    columns,
    mandatory_condition_checkers,
    query_processors,
    query_splitters,
)

schema = TableSchema(
    columns=columns,
    local_table_name="transactions_local",
    dist_table_name="transactions_dist",
    storage_set_key=StorageSetKey.TRANSACTIONS_RO,
    mandatory_conditions=[],
)

storage = ReadableTableStorage(
    storage_key=StorageKey.TRANSACTIONS_RO,
    storage_set_key=StorageSetKey.TRANSACTIONS_RO,
    schema=schema,
    query_processors=query_processors,
    query_splitters=query_splitters,
    mandatory_condition_checkers=mandatory_condition_checkers,
)
    dist_materialized_view_name="outcomes_mv_hourly_dist",
    prewhere_candidates=["project_id", "org_id"],
    columns=materialized_view_columns,
    query=query,
    local_source_table_name=WRITE_LOCAL_TABLE_NAME,
    local_destination_table_name=READ_LOCAL_TABLE_NAME,
    dist_source_table_name=WRITE_DIST_TABLE_NAME,
    dist_destination_table_name=READ_DIST_TABLE_NAME,
)

raw_storage = WritableTableStorage(
    schemas=StorageSchemas(read_schema=raw_schema, write_schema=raw_schema),
    table_writer=TableWriter(
        write_schema=raw_schema,
        stream_loader=KafkaStreamLoader(
            processor=OutcomesProcessor(),
            default_topic="outcomes",
        ),
    ),
    query_processors=[],
)

materialized_storage = ReadableTableStorage(
    schemas=StorageSchemas(
        read_schema=read_schema,
        write_schema=None,
        intermediary_schemas=[materialized_view_schema],
    ),
    query_processors=[PrewhereProcessor()],
)
    Column("project_id", UInt(64)),
    Column("metric_id", UInt(64)),
    Column("granularity", UInt(32)),
    Column("timestamp", DateTime()),
    Column("retention_days", UInt(16)),
    Column("tags", Nested([("key", UInt(64)), ("value", UInt(64))])),
    Column("_tags_hash", Array(UInt(64), SchemaModifiers(readonly=True))),
]

sets_storage = ReadableTableStorage(
    storage_key=StorageKey.METRICS_SETS,
    storage_set_key=StorageSetKey.METRICS,
    schema=TableSchema(
        local_table_name="metrics_sets_local",
        dist_table_name="metrics_sets_dist",
        storage_set_key=StorageSetKey.METRICS,
        columns=ColumnSet(
            [
                *aggregated_columns,
                Column("value", AggregateFunction("uniqCombined64", [UInt(64)])),
            ]
        ),
    ),
    query_processors=[ArrayJoinKeyValueOptimizer("tags")],
)

counters_storage = ReadableTableStorage(
    storage_key=StorageKey.METRICS_COUNTERS,
    storage_set_key=StorageSetKey.METRICS,
    schema=TableSchema(
        local_table_name="metrics_counters_local",
        dist_table_name="metrics_counters_dist",
        storage_set_key=StorageSetKey.METRICS,
        columns=ColumnSet([
)

storage = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema,
    query_processors=[
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        EventIdColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        PrewhereProcessor(
            [
                "event_id",
                "release",
                "message",
                "transaction_name",
                "environment",
                "project_id",
            ]
        ),
    ],
    query_splitters=[
        ColumnSplitQueryStrategy(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="timestamp",
        ),
        TimeSplitQueryStrategy(timestamp_col="timestamp"),
    ],
)
        CounterAggregateProcessor(),
        default_topic=Topic.METRICS,
        dead_letter_queue_policy_creator=produce_policy_creator,
    ),
    write_format=WriteFormat.VALUES,
)

org_counters_storage = ReadableTableStorage(
    storage_key=StorageKey.ORG_METRICS_COUNTERS,
    storage_set_key=StorageSetKey.METRICS,
    schema=TableSchema(
        local_table_name="metrics_counters_v2_local",
        dist_table_name="metrics_counters_v2_dist",
        storage_set_key=StorageSetKey.METRICS,
        columns=ColumnSet(
            [
                Column("org_id", UInt(64)),
                Column("project_id", UInt(64)),
                Column("metric_id", UInt(64)),
                Column("granularity", UInt(32)),
                Column("timestamp", DateTime()),
            ]
        ),
    ),
    query_processors=[TableRateLimit()],
)

distributions_storage = WritableTableStorage(
    storage_key=StorageKey.METRICS_DISTRIBUTIONS,
    storage_set_key=StorageSetKey.METRICS,
    schema=WritableTableSchema(
        local_table_name="metrics_distributions_v2_local",
        dist_table_name="metrics_distributions_v2_dist",