def __init__(self):
    read_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', Nullable(UInt(64))),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', LowCardinality(Nullable(String()))),
        ('event_id', Nullable(UUID())),
    ])

    read_schema = MergeTreeSchema(
        columns=read_columns,
        local_table_name='outcomes_raw_local',
        dist_table_name='outcomes_raw_dist',
        order_by='(org_id, project_id, timestamp)',
        partition_by='(toMonday(timestamp))',
        settings={'index_granularity': 16384})

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=None,
        intermediary_schemas=[])

    super().__init__(
        dataset_schemas=dataset_schemas,
        time_group_columns={'time': 'timestamp'},
        time_parse_columns=('timestamp',))
def __init__(self) -> None:
    read_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", Nullable(UInt(64))),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(Nullable(String()))),
        ("event_id", Nullable(UUID())),
    ])

    read_schema = MergeTreeSchema(
        columns=read_columns,
        local_table_name="outcomes_raw_local",
        dist_table_name="outcomes_raw_dist",
        order_by="(org_id, project_id, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 16384},
        migration_function=outcomes_raw_migrations,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=None,
        intermediary_schemas=[],
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        time_group_columns={"time": "timestamp"},
        time_parse_columns=("timestamp",),
    )
)
from snuba.datasets.schemas.tables import MergeTreeSchema
from snuba.datasets.schemas.join import (
    JoinConditionExpression,
    JoinCondition,
    JoinClause,
    JoinType,
    TableJoinNode,
)

table1 = MergeTreeSchema(
    columns=ColumnSet([
        ("t1c1", UInt(64)),
        ("t1c2", String()),
        ("t1c3", Nested([("t11c4", UInt(64))])),
    ]),
    local_table_name="table1",
    dist_table_name="table1",
    order_by="",
    partition_by="",
).get_data_source()

table2 = MergeTreeSchema(
    columns=ColumnSet([
        ("t2c1", UInt(64)),
        ("t2c2", String()),
        ("t2c3", Nested([("t21c4", UInt(64))])),
    ]),
    local_table_name="table2",
    dist_table_name="table2",
    order_by="",
def __init__(self):
    # Raw outcome rows: the write-side schema of the dataset.
    write_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', Nullable(UInt(64))),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', LowCardinality(Nullable(String()))),
        ('event_id', Nullable(UUID())),
    ])

    write_schema = MergeTreeSchema(
        columns=write_columns,
        # TODO: change to outcomes.raw_local when we add multi DB support
        local_table_name=WRITE_LOCAL_TABLE_NAME,
        dist_table_name=WRITE_DIST_TABLE_NAME,
        order_by='(org_id, project_id, timestamp)',
        partition_by='(toMonday(timestamp))',
        settings={'index_granularity': 16384})

    # Hourly rollup used for reads; populated by the materialized view below.
    read_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', UInt(64)),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', LowCardinality(String())),
        ('times_seen', UInt(64)),
    ])

    read_schema = SummingMergeTreeSchema(
        columns=read_columns,
        local_table_name=READ_LOCAL_TABLE_NAME,
        dist_table_name=READ_DIST_TABLE_NAME,
        order_by='(org_id, project_id, key_id, outcome, reason, timestamp)',
        partition_by='(toMonday(timestamp))',
        settings={'index_granularity': 256})

    materialized_view_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', UInt(64)),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', String()),
        ('times_seen', UInt(64)),
    ])

    # TODO: Find a better way to specify a query for a materialized view.
    # The problem right now is that we have a way to define our columns in a
    # ColumnSet abstraction but the query doesn't use it.
    query = """
        SELECT
            org_id,
            project_id,
            ifNull(key_id, 0) AS key_id,
            toStartOfHour(timestamp) AS timestamp,
            outcome,
            ifNull(reason, 'none') AS reason,
            count() AS times_seen
        FROM %(source_table_name)s
        GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
    """

    materialized_view = MaterializedViewSchema(
        local_materialized_view_name='outcomes_mv_hourly_local',
        dist_materialized_view_name='outcomes_mv_hourly_dist',
        columns=materialized_view_columns,
        query=query,
        local_source_table_name=WRITE_LOCAL_TABLE_NAME,
        local_destination_table_name=READ_LOCAL_TABLE_NAME,
        dist_source_table_name=WRITE_DIST_TABLE_NAME,
        dist_destination_table_name=READ_DIST_TABLE_NAME)

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=write_schema,
        intermediary_schemas=[materialized_view])

    super(OutcomesDataset, self).__init__(
        dataset_schemas=dataset_schemas,
        processor=OutcomesProcessor(),
        default_topic="outcomes",
    )
("project_id", UInt(64)), ("retention_days", UInt(16)), ("duration", UInt(32)), ("status", UInt(8)), ("errors", UInt(16)), ("received", DateTime()), ("started", DateTime()), ("release", LowCardinality(String())), ("environment", LowCardinality(String())), ]) raw_schema = MergeTreeSchema( columns=all_columns, local_table_name=WRITE_LOCAL_TABLE_NAME, dist_table_name=WRITE_DIST_TABLE_NAME, storage_set_key=StorageSetKey.SESSIONS, order_by="(org_id, project_id, release, environment, started)", partition_by="(toMonday(started))", settings={"index_granularity": "16384"}, ) read_columns = ColumnSet([ ("org_id", UInt(64)), ("project_id", UInt(64)), ("started", DateTime()), ("release", LowCardinality(String())), ("environment", LowCardinality(String())), ( "duration_quantiles", AggregateFunction("quantilesIf(0.5, 0.9)", UInt(32), UInt(8)), ),
("cache_hit", UInt(8)), ("sample", Float(32)), ("max_threads", UInt(8)), ("num_days", UInt(32)), ("clickhouse_table", LowCardinality(String())), ("query_id", String()), ("is_duplicate", UInt(8)), ("consistent", UInt(8)), ]), ), ]) schema = MergeTreeSchema( columns=columns, local_table_name="querylog_local", dist_table_name="querylog_dist", order_by="(toStartOfDay(timestamp), request_id)", partition_by="(toMonday(timestamp))", sample_expr="request_id", ) storage = WritableTableStorage( schemas=StorageSchemas(read_schema=schema, write_schema=schema), table_writer=TableWriter( write_schema=schema, stream_loader=KafkaStreamLoader( processor=QuerylogProcessor(), default_topic=settings.QUERIES_TOPIC, ), ), query_processors=[], )
def __init__(self) -> None:
    # Raw outcome rows: the write-side schema of the dataset.
    write_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", Nullable(UInt(64))),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(Nullable(String()))),
        ("event_id", Nullable(UUID())),
    ])

    write_schema = MergeTreeSchema(
        columns=write_columns,
        # TODO: change to outcomes.raw_local when we add multi DB support
        local_table_name=WRITE_LOCAL_TABLE_NAME,
        dist_table_name=WRITE_DIST_TABLE_NAME,
        order_by="(org_id, project_id, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 16384},
    )

    # Hourly rollup used for reads; populated by the materialized view below.
    read_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", UInt(64)),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(String())),
        ("times_seen", UInt(64)),
    ])

    read_schema = SummingMergeTreeSchema(
        columns=read_columns,
        local_table_name=READ_LOCAL_TABLE_NAME,
        dist_table_name=READ_DIST_TABLE_NAME,
        order_by="(org_id, project_id, key_id, outcome, reason, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 256},
    )

    materialized_view_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", UInt(64)),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", String()),
        ("times_seen", UInt(64)),
    ])

    # TODO: Find a better way to specify a query for a materialized view.
    # The problem right now is that we have a way to define our columns in a
    # ColumnSet abstraction but the query doesn't use it.
    query = """
        SELECT
            org_id,
            project_id,
            ifNull(key_id, 0) AS key_id,
            toStartOfHour(timestamp) AS timestamp,
            outcome,
            ifNull(reason, 'none') AS reason,
            count() AS times_seen
        FROM %(source_table_name)s
        GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
    """

    materialized_view = MaterializedViewSchema(
        local_materialized_view_name="outcomes_mv_hourly_local",
        dist_materialized_view_name="outcomes_mv_hourly_dist",
        prewhere_candidates=["project_id", "org_id"],
        columns=materialized_view_columns,
        query=query,
        local_source_table_name=WRITE_LOCAL_TABLE_NAME,
        local_destination_table_name=READ_LOCAL_TABLE_NAME,
        dist_source_table_name=WRITE_DIST_TABLE_NAME,
        dist_destination_table_name=READ_DIST_TABLE_NAME,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=write_schema,
        intermediary_schemas=[materialized_view],
    )

    table_writer = TableWriter(
        write_schema=write_schema,
        stream_loader=KafkaStreamLoader(
            processor=OutcomesProcessor(),
            default_topic="outcomes",
        ),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={"time": "timestamp"},
        time_parse_columns=("timestamp",),
    )
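# Illustrative sketch, not part of the dataset definition above: the
# %(source_table_name)s placeholder in the materialized view query is a
# standard Python %-format field, so it can be rendered with a dict. The real
# substitution is presumably performed by MaterializedViewSchema from the
# local/dist source table names passed to it; doing it by hand here is only an
# assumption for illustration, using a simplified query and the raw outcomes
# table name that appears earlier in this file set.
example_query = """
    SELECT org_id, count() AS times_seen
    FROM %(source_table_name)s
    GROUP BY org_id
"""
print(example_query % {"source_table_name": "outcomes_raw_local"})
# Prints the SELECT with outcomes_raw_local substituted into the FROM clause.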
write_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("key_id", Nullable(UInt(64))),
    ("timestamp", DateTime()),
    ("outcome", UInt(8)),
    ("reason", LowCardinality(Nullable(String()))),
    ("event_id", Nullable(UUID())),
])

raw_schema = MergeTreeSchema(
    columns=write_columns,
    # TODO: change to outcomes.raw_local when we add multi DB support
    local_table_name=WRITE_LOCAL_TABLE_NAME,
    dist_table_name=WRITE_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.OUTCOMES,
    order_by="(org_id, project_id, timestamp)",
    partition_by="(toMonday(timestamp))",
    settings={"index_granularity": "16384"},
)

read_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("key_id", UInt(64)),
    ("timestamp", DateTime()),
    ("outcome", UInt(8)),
    ("reason", LowCardinality(String())),
    ("times_seen", UInt(64)),
])
import pytest

from snuba.datasets.schemas.tables import MergeTreeSchema
from snuba.datasets.schemas.join import (
    JoinConditionExpression,
    JoinCondition,
    JoinStructure,
    SchemaJoinedSource,
    SubJoinSource,
    JoinType,
)

table1 = MergeTreeSchema(
    columns=None,
    local_table_name="table1",
    dist_table_name="table1",
    order_by="",
    partition_by="",
)

table2 = MergeTreeSchema(
    columns=None,
    local_table_name="table2",
    dist_table_name="table2",
    order_by="",
    partition_by="",
)

table3 = MergeTreeSchema(
    columns=None,
    local_table_name="table3",
from snuba.datasets.schemas.tables import MergeTreeSchema
from snuba.datasets.schemas.join import (
    JoinConditionExpression,
    JoinCondition,
    JoinClause,
    JoinType,
    TableJoinNode,
)

table1 = MergeTreeSchema(
    columns=ColumnSet([
        ("t1c1", UInt(64)),
        ("t1c2", String()),
        ("t1c3", Nested([("t11c4", UInt(64))])),
    ]),
    local_table_name="table1",
    dist_table_name="table1",
    order_by="",
    partition_by="",
)

table2 = MergeTreeSchema(
    columns=ColumnSet([
        ("t2c1", UInt(64)),
        ("t2c2", String()),
        ("t2c3", Nested([("t21c4", UInt(64))])),
    ]),
    local_table_name="table2",
    dist_table_name="table2",
    order_by="",