def setup_method(self, test_method):
    super().setup_method(test_method)
    self.app.post = partial(self.app.post, headers={"referer": "test"})
    self.trace_id = uuid.UUID("7400045b-25c4-43b8-8591-4600aa83ad04")
    self.event = get_raw_event()
    self.project_id = self.event["project_id"]
    write_unprocessed_events(get_writable_storage(StorageKey.EVENTS), [self.event])
    write_unprocessed_events(
        get_writable_storage(StorageKey.TRANSACTIONS),
        [get_raw_transaction()],
    )

def __init__(self) -> None:
    # The raw table we write onto, and that potentially we could query.
    writable_storage = get_writable_storage(StorageKey.OUTCOMES_RAW)
    # The materialized view we query aggregate data from.
    materialized_storage = get_storage(StorageKey.OUTCOMES_HOURLY)
    read_schema = materialized_storage.get_schema()

    super().__init__(
        storages=[writable_storage, materialized_storage],
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(
                # TODO: Once we are ready to expose the raw data model and
                # select whether to use the materialized storage or the raw
                # one here, replace this with a custom storage selector that
                # decides when to use the materialized data.
                storage=materialized_storage,
            ),
        ),
        abstract_column_set=read_schema.get_columns(),
        join_relationships={},
        writable_storage=writable_storage,
        validators=[EntityRequiredColumnValidator({"org_id"})],
        required_time_column="timestamp",
    )

def __init__(
    self,
    writable_storage_key: StorageKey,
    readable_storage_key: StorageKey,
    value_schema: Sequence[Column[SchemaModifiers]],
    mappers: TranslationMappers,
) -> None:
    writable_storage = get_writable_storage(writable_storage_key)
    readable_storage = get_storage(readable_storage_key)

    super().__init__(
        storages=[writable_storage, readable_storage],
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(
                readable_storage,
                mappers=TranslationMappers(
                    subscriptables=[
                        SubscriptableMapper(None, "tags", None, "tags"),
                    ],
                ).concat(mappers),
            )
        ),
        abstract_column_set=ColumnSet(
            [
                Column("org_id", UInt(64)),
                Column("project_id", UInt(64)),
                Column("metric_id", UInt(64)),
                Column("timestamp", DateTime()),
                Column("tags", Nested([("key", UInt(64)), ("value", UInt(64))])),
                *value_schema,
            ]
        ),
        join_relationships={},
        writable_storage=writable_storage,
        validators=[EntityRequiredColumnValidator({"org_id", "project_id"})],
        required_time_column="timestamp",
    )

def __init__(self, custom_mappers: Optional[TranslationMappers] = None) -> None:
    storage = get_writable_storage(StorageKey.TRANSACTIONS)
    schema = storage.get_table_writer().get_schema()

    super().__init__(
        storages=[storage],
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(
                storage=storage,
                mappers=transaction_translator
                if custom_mappers is None
                else transaction_translator.concat(custom_mappers),
            ),
        ),
        abstract_column_set=schema.get_columns(),
        join_relationships={
            "contains": JoinRelationship(
                rhs_entity=EntityKey.SPANS,
                columns=[
                    ("project_id", "project_id"),
                    ("span_id", "transaction_span_id"),
                ],
                join_type=JoinType.INNER,
                equivalences=[
                    ColumnEquivalence("event_id", "transaction_id"),
                    ColumnEquivalence("transaction_name", "transaction_name"),
                    ColumnEquivalence("trace_id", "trace_id"),
                ],
            )
        },
        writable_storage=storage,
    )

def setup_method(self, test_method):
    self.metadata = KafkaMessageMetadata(0, 0, datetime.now())
    self.event = get_raw_event()
    self.processor = (
        get_writable_storage(StorageKey.EVENTS)
        .get_table_writer()
        .get_stream_loader()
        .get_processor()
    )

def test_tags_hash_map(self) -> None:
    """
    Adds an event and ensures the tags_hash_map is properly populated,
    including escaping.
    """
    self.event = get_raw_event()
    self.event["data"]["tags"].append(["test_tag1", "value1"])
    self.event["data"]["tags"].append(["test_tag=2", "value2"])  # Requires escaping
    storage = get_writable_storage(StorageKey.ERRORS)
    schema = storage.get_schema()
    assert isinstance(schema, TableSchema)
    table_name = schema.get_table_name()
    write_unprocessed_events(storage, [self.event])

    clickhouse = storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.QUERY
    )
    hashed = clickhouse.execute(
        "SELECT cityHash64('test_tag1=value1'), cityHash64('test_tag\\\\=2=value2')"
    )
    tag1, tag2 = hashed[0]

    event = clickhouse.execute(
        (
            f"SELECT replaceAll(toString(event_id), '-', '') FROM {table_name} "
            f"WHERE has(_tags_hash_map, {tag1}) AND has(_tags_hash_map, {tag2})"
        )
    )
    assert len(event) == 1
    assert event[0][0] == self.event["data"]["id"]

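# Hedged sketch (inference from the test above, not confirmed by the source):
# each _tags_hash_map entry appears to be cityHash64("{key}={value}") where "\"
# and "=" inside the key are backslash-escaped so the key/value separator stays
# unambiguous. A minimal helper to build the pre-hash string:
def tag_hash_input(key: str, value: str) -> str:
    escaped_key = key.replace("\\", "\\\\").replace("=", "\\=")
    return f"{escaped_key}={value}"

assert tag_hash_input("test_tag1", "value1") == "test_tag1=value1"
assert tag_hash_input("test_tag=2", "value2") == "test_tag\\=2=value2"
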
def test_write_each_node(
    override_fixture: Callable[[bool], FakeClickhouseCluster],
    write_node_replacements_projects: str,
    expected_queries: Mapping[str, Sequence[str]],
    request: Any,
) -> None:
    """
    Tests the execution of replacement queries on both storage nodes and
    query nodes.
    """
    set_config("write_node_replacements_projects", write_node_replacements_projects)
    override_func = request.getfixturevalue(override_fixture)
    test_cluster = override_func(True)

    replacer = ReplacerWorker(
        get_writable_storage(StorageKey.ERRORS),
        "consumer_group",
        DummyMetricsBackend(),
    )
    replacer.flush_batch(
        [
            LegacyReplacement(
                COUNT_QUERY_TEMPLATE,
                INSERT_QUERY_TEMPLATE,
                FINAL_QUERY_TEMPLATE,
                (NEEDS_FINAL, 1),
                REPLACEMENT_TYPE,
                REPLACEMENT_MESSAGE_METADATA,
            )
        ]
    )

    queries = test_cluster.get_queries()
    assert queries == expected_queries

def test_tags_hash_map(self) -> None:
    """
    Adds an event and ensures the tags_hash_map is properly populated,
    including escaping.
    """
    self.event = get_raw_event()
    self.event["data"]["tags"].append(["test_tag1", "value1"])
    self.event["data"]["tags"].append(["test_tag=2", "value2"])  # Requires escaping
    storage = get_writable_storage(StorageKey.EVENTS)
    write_unprocessed_events(storage, [self.event])

    clickhouse = storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.QUERY
    )
    hashed = clickhouse.execute(
        "SELECT cityHash64('test_tag1=value1'), cityHash64('test_tag\\\\=2=value2')"
    )
    tag1, tag2 = hashed[0]

    event = clickhouse.execute(
        (
            f"SELECT event_id FROM sentry_local WHERE has(_tags_hash_map, {tag1}) "
            f"AND has(_tags_hash_map, {tag2})"
        )
    )
    assert len(event) == 1
    assert event[0][0] == self.event["data"]["id"]

def test_failing_query(
    override_cluster: Callable[[bool], FakeClickhouseCluster]
) -> None:
    """
    Tests the execution of replacement queries on a single node when the
    query fails.
    """
    set_config("write_node_replacements_projects", "[1]")
    override_cluster(False)

    replacer = ReplacerWorker(
        get_writable_storage(StorageKey.ERRORS),
        "consumer_group",
        DummyMetricsBackend(),
    )
    with pytest.raises(ServerExplodedException):
        replacer.flush_batch(
            [
                LegacyReplacement(
                    COUNT_QUERY_TEMPLATE,
                    INSERT_QUERY_TEMPLATE,
                    FINAL_QUERY_TEMPLATE,
                    (NEEDS_FINAL, 1),
                    REPLACEMENT_TYPE,
                    REPLACEMENT_MESSAGE_METADATA,
                )
            ]
        )

def test_mock_consumer() -> None:
    storage = get_writable_storage(StorageKey.ERRORS)

    strategy = KafkaConsumerStrategyFactory(
        None,
        lambda message: None,
        build_mock_batch_writer(storage, True, TestingMetricsBackend(), 100, 50),
        max_batch_size=1,
        max_batch_time=1,
        processes=None,
        input_block_size=None,
        output_block_size=None,
        initialize_parallel_transform=None,
    ).create(lambda message: None)

    strategy.submit(
        Message(
            Partition(Topic("events"), 0),
            1,
            KafkaPayload(None, b"INVALID MESSAGE", []),
            datetime.now(),
        )
    )
    strategy.close()
    strategy.join()

    # If the mock was not applied correctly we would have data in Clickhouse.
    reader = storage.get_cluster().get_reader()
    result = reader.execute(
        FormattedQuery([StringNode("SELECT count() as c from errors_local")])
    )
    assert result["data"] == [{"c": 0}]

def setup_method(self, test_method):
    super().setup_method(test_method)
    self.app.post = partial(self.app.post, headers={"referer": "test"})
    self.trace_id = uuid.UUID("7400045b-25c4-43b8-8591-4600aa83ad04")
    self.event = get_raw_event()
    self.project_id = self.event["project_id"]
    self.skew = timedelta(minutes=180)
    self.base_time = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0, tzinfo=pytz.utc
    ) - timedelta(minutes=180)
    write_unprocessed_events(get_writable_storage(StorageKey.EVENTS), [self.event])
    write_unprocessed_events(
        get_writable_storage(StorageKey.TRANSACTIONS),
        [get_raw_transaction()],
    )

def generate_transactions() -> None:
    from datetime import datetime

    table_writer = get_writable_storage(StorageKey.TRANSACTIONS).get_table_writer()

    rows = []
    for i in range(5):
        raw_transaction = get_raw_transaction()
        # Older versions of this table did not have measurements.
        del raw_transaction["data"]["measurements"]

        processed = (
            table_writer.get_stream_loader()
            .get_processor()
            .process_message(
                (2, "insert", raw_transaction),
                KafkaMessageMetadata(0, 0, datetime.utcnow()),
            )
        )
        rows.extend(processed.rows)

    BatchWriterEncoderWrapper(
        table_writer.get_batch_writer(metrics=DummyMetricsBackend(strict=True)),
        JSONRowEncoder(),
    ).write(rows)

def process_message_multistorage(
    message: Message[MultistorageKafkaPayload],
) -> Sequence[Tuple[StorageKey, Union[None, JSONRowInsertBatch, ReplacementBatch]]]:
    # XXX: Avoid circular import on KafkaMessageMetadata, remove when that type
    # is itself removed.
    from snuba.datasets.storages.factory import get_writable_storage

    value = rapidjson.loads(message.payload.payload.value)
    metadata = KafkaMessageMetadata(
        message.offset, message.partition.index, message.timestamp
    )

    results: MutableSequence[
        Tuple[StorageKey, Union[None, JSONRowInsertBatch, ReplacementBatch]]
    ] = []

    for storage_key in message.payload.storage_keys:
        result = (
            get_writable_storage(storage_key)
            .get_table_writer()
            .get_stream_loader()
            .get_processor()
            .process_message(value, metadata)
        )
        if isinstance(result, InsertBatch):
            results.append(
                (
                    storage_key,
                    JSONRowInsertBatch(
                        [json_row_encoder.encode(row) for row in result.rows],
                        result.origin_timestamp,
                    ),
                )
            )
        else:
            results.append((storage_key, result))

    return results

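# Hedged usage sketch (not from the source): how process_message_multistorage
# might be driven for one Kafka message fanned out to two storages. It assumes
# the surrounding module's imports, and the MultistorageKafkaPayload
# constructor shown here (storage_keys, payload) is an assumption inferred
# from the attribute accesses above. The Message constructor matches the
# (partition, offset, payload, timestamp) shape used elsewhere in this code.
from datetime import datetime

from arroyo.backends.kafka import KafkaPayload
from arroyo.types import Message, Partition, Topic

payload = MultistorageKafkaPayload(
    storage_keys=[StorageKey.EVENTS, StorageKey.ERRORS],
    payload=KafkaPayload(None, b'{"event_id": "..."}', []),
)
message = Message(Partition(Topic("events"), 0), 1, payload, datetime.now())

# Each result pairs a storage key with its processed batch (or None).
for storage_key, batch in process_message_multistorage(message):
    print(storage_key, batch)
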
def test_load_balancing(
    override_cluster: Callable[[bool], FakeClickhouseCluster]
) -> None:
    """
    Runs two replacements in a row and verifies the queries are properly
    load balanced on different nodes.
    """
    set_config("write_node_replacements_projects", "[1]")
    cluster = override_cluster(True)

    replacer = ReplacerWorker(
        get_writable_storage(StorageKey.ERRORS), DummyMetricsBackend()
    )
    replacement = LegacyReplacement(
        COUNT_QUERY_TEMPLATE,
        INSERT_QUERY_TEMPLATE,
        FINAL_QUERY_TEMPLATE,
        (NEEDS_FINAL, 1),
    )
    replacer.flush_batch([replacement, replacement])

    assert cluster.get_queries() == {
        "query_node": [
            "SELECT count() FROM errors_dist FINAL WHERE event_id = '6f0ccc03-6efb-4f7c-8005-d0c992106b31'",
            "SELECT count() FROM errors_dist FINAL WHERE event_id = '6f0ccc03-6efb-4f7c-8005-d0c992106b31'",
        ],
        "storage-0-0": [LOCAL_QUERY],
        "storage-0-1": [LOCAL_QUERY],
        "storage-1-0": [LOCAL_QUERY],
        "storage-1-1": [LOCAL_QUERY],
        "storage-2-0": [LOCAL_QUERY],
        "storage-2-1": [LOCAL_QUERY],
    }

def test_backfill_errors() -> None:
    backfill_migration_id = "0014_backfill_errors"
    runner = Runner()
    runner.run_migration(MigrationKey(MigrationGroup.SYSTEM, "0001_migrations"))
    run_prior_migrations(MigrationGroup.EVENTS, backfill_migration_id, runner)

    errors_storage = get_writable_storage(StorageKey.ERRORS)
    clickhouse = errors_storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.QUERY
    )
    errors_table_name = errors_storage.get_table_writer().get_schema().get_table_name()

    raw_events = []
    for i in range(10):
        event = get_raw_event()
        raw_events.append(event)

    events_storage = get_writable_storage(StorageKey.EVENTS)
    write_unprocessed_events(events_storage, raw_events)

    assert get_count_from_storage(errors_table_name, clickhouse) == 0

    # Run 0014_backfill_errors
    runner.run_migration(
        MigrationKey(MigrationGroup.EVENTS, backfill_migration_id), force=True
    )

    assert get_count_from_storage(errors_table_name, clickhouse) == 10

    outcome = perform_select_query(
        ["contexts.key", "contexts.value"],
        errors_table_name,
        None,
        str(1),
        clickhouse,
    )
    assert outcome[0] == (
        [
            "device.model_id",
            "geo.city",
            "geo.country_code",
            "geo.region",
            "os.kernel_version",
        ],
        ["Galaxy", "San Francisco", "US", "CA", "1.1.1"],
    )

def setup_method(self, test_method: Callable[..., Any]) -> None:
    super().setup_method(test_method)
    self.skew = timedelta(minutes=180)
    self.base_time = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0
    ) - timedelta(minutes=180)
    self.next_time = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0
    ) + timedelta(minutes=180)
    self.storage = get_writable_storage(StorageKey.SESSIONS_RAW)

def __init__(self, custom_mappers: Optional[TranslationMappers] = None) -> None:
    if settings.ERRORS_ROLLOUT_ALL:
        events_storage = get_writable_storage(StorageKey.ERRORS)
        pipeline_builder = SimplePipelineBuilder(
            query_plan_builder=SelectedStorageQueryPlanBuilder(
                selector=ErrorsQueryStorageSelector(
                    mappers=errors_translators
                    if custom_mappers is None
                    else errors_translators.concat(custom_mappers)
                )
            ),
        )
    else:
        events_storage = get_writable_storage(StorageKey.EVENTS)
        pipeline_builder = SimplePipelineBuilder(
            query_plan_builder=SelectedStorageQueryPlanBuilder(
                selector=EventsQueryStorageSelector(
                    mappers=event_translator
                    if custom_mappers is None
                    else event_translator.concat(custom_mappers)
                )
            ),
        )

    schema = events_storage.get_table_writer().get_schema()
    columns = schema.get_columns()

    super().__init__(
        storages=[events_storage],
        query_pipeline_builder=pipeline_builder,
        abstract_column_set=columns,
        join_relationships={
            "grouped": JoinRelationship(
                rhs_entity=EntityKey.GROUPEDMESSAGES,
                columns=[("project_id", "project_id"), ("group_id", "id")],
                join_type=JoinType.INNER,
                equivalences=[],
            ),
            "assigned": JoinRelationship(
                rhs_entity=EntityKey.GROUPASSIGNEE,
                columns=[("project_id", "project_id"), ("group_id", "group_id")],
                join_type=JoinType.INNER,
                equivalences=[],
            ),
        },
        writable_storage=events_storage,
        validators=[EntityRequiredColumnValidator({"project_id"})],
        required_time_column="timestamp",
    )

def get_values_row_encoder(storage_key: StorageKey) -> ValuesRowEncoder:
    from snuba.datasets.storages.factory import get_writable_storage

    if storage_key not in values_row_encoders:
        table_writer = get_writable_storage(storage_key).get_table_writer()
        values_row_encoders[storage_key] = ValuesRowEncoder(
            table_writer.get_writeable_columns()
        )

    return values_row_encoders[storage_key]

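# Hedged usage sketch (not from the source): get_values_row_encoder memoizes
# one ValuesRowEncoder per storage key in the module-level values_row_encoders
# mapping, so repeated lookups for the same key return the same instance.
encoder = get_values_row_encoder(StorageKey.ERRORS)
assert encoder is get_values_row_encoder(StorageKey.ERRORS)  # cached instance
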
def setup_method(self, test_method: Any) -> None:
    super().setup_method(test_method)
    self.skew_minutes = 180
    self.skew = timedelta(minutes=self.skew_minutes)
    self.base_time = (
        datetime.utcnow().replace(minute=0, second=0, microsecond=0) - self.skew
    )
    self.storage = get_writable_storage(StorageKey.OUTCOMES_RAW)

def test(self) -> None:
    storage = get_writable_storage(StorageKey.EVENTS)
    cluster = storage.get_cluster()
    clickhouse = cluster.get_query_connection(ClickhouseClientSettings.OPTIMIZE)
    table = storage.get_table_writer().get_schema().get_table_name()
    database = cluster.get_database()

    # no data, 0 partitions to optimize
    parts = optimize.get_partitions_to_optimize(clickhouse, database, table)
    assert parts == []

    base = datetime(1999, 12, 26)  # a sunday
    base_monday = base - timedelta(days=base.weekday())

    # 1 event, 0 unoptimized parts
    self.write_processed_messages([self.create_event_row_for_date(base)])
    parts = optimize.get_partitions_to_optimize(clickhouse, database, table)
    assert parts == []

    # 2 events in the same part, 1 unoptimized part
    self.write_processed_messages([self.create_event_row_for_date(base)])
    parts = optimize.get_partitions_to_optimize(clickhouse, database, table)
    assert parts == [(base_monday, 90)]

    # 3 events in the same part, 1 unoptimized part
    self.write_processed_messages([self.create_event_row_for_date(base)])
    parts = optimize.get_partitions_to_optimize(clickhouse, database, table)
    assert parts == [(base_monday, 90)]

    # 3 events in one part, 2 in another, 2 unoptimized parts
    a_month_earlier = base_monday - timedelta(days=31)
    a_month_earlier_monday = a_month_earlier - timedelta(
        days=a_month_earlier.weekday()
    )
    self.write_processed_messages(
        [self.create_event_row_for_date(a_month_earlier_monday)]
    )
    self.write_processed_messages(
        [self.create_event_row_for_date(a_month_earlier_monday)]
    )
    parts = optimize.get_partitions_to_optimize(clickhouse, database, table)
    assert parts == [(base_monday, 90), (a_month_earlier_monday, 90)]

    # respects before (base is properly excluded)
    assert list(
        optimize.get_partitions_to_optimize(clickhouse, database, table, before=base)
    ) == [(a_month_earlier_monday, 90)]

    optimize.optimize_partitions(clickhouse, database, table, parts)

    # all parts should be optimized
    parts = optimize.get_partitions_to_optimize(clickhouse, database, table)
    assert parts == []

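# Note (inference from the assertions above, not confirmed by the source):
# get_partitions_to_optimize appears to return (date, retention_days) tuples,
# where the date is the Monday of the partition's week and 90 is the retention
# period in days, e.g. (base_monday, 90).
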
def __init__(self) -> None: storage = get_writable_storage("querylog") columns = storage.get_table_writer().get_schema().get_columns() super().__init__( storages=[storage], query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage), abstract_column_set=columns, writable_storage=storage, )
def __init__(self) -> None:
    storage = get_writable_storage(StorageKey.SPANS)

    super().__init__(
        storages=[storage],
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(
                storage=storage,
                mappers=TranslationMappers(
                    subscriptables=[
                        SubscriptableMapper(None, "tags", None, "tags")
                    ],
                ),
            ),
        ),
        abstract_column_set=ColumnSet(
            [
                ("project_id", UInt(64)),
                ("transaction_id", UUID()),
                ("trace_id", UUID()),
                ("transaction_span_id", UInt(64)),
                ("span_id", UInt(64)),
                ("parent_span_id", UInt(64, Modifiers(nullable=True))),
                ("transaction_name", String()),
                ("op", String()),
                ("status", UInt(8)),
                ("start_ts", DateTime()),
                ("start_ns", UInt(32)),
                ("finish_ts", DateTime()),
                ("finish_ns", UInt(32)),
                ("duration_ms", UInt(32)),
                ("tags", Nested([("key", String()), ("value", String())])),
            ]
        ),
        join_relationships={
            "contained": JoinRelationship(
                rhs_entity=EntityKey.TRANSACTIONS,
                columns=[
                    ("project_id", "project_id"),
                    ("transaction_span_id", "span_id"),
                ],
                join_type=JoinType.INNER,
                equivalences=[
                    ColumnEquivalence("transaction_id", "event_id"),
                    ColumnEquivalence("transaction_name", "transaction_name"),
                    ColumnEquivalence("trace_id", "trace_id"),
                ],
            )
        },
        writable_storage=storage,
        validators=[EntityRequiredColumnValidator({"project_id"})],
        required_time_column=None,
    )

def setup_method(self, test_method: Any) -> None:
    super().setup_method(test_method)

    # values for test data
    self.minutes = 180
    self.skew = timedelta(minutes=self.minutes)
    self.started = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0, tzinfo=pytz.utc
    )
    self.storage = get_writable_storage(StorageKey.SESSIONS_RAW)

def __init__(self) -> None:
    storage = get_writable_storage(StorageKey.TRANSACTIONS)
    schema = storage.get_table_writer().get_schema()

    super().__init__(
        storages=[storage],
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=storage, mappers=transaction_translator
        ),
        abstract_column_set=schema.get_columns(),
        writable_storage=storage,
    )

def __init__(self) -> None:
    storage = get_writable_storage(StorageKey.ERRORS)
    schema = storage.get_table_writer().get_schema()
    columns = schema.get_columns()

    super().__init__(
        storages=[storage],
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=storage, mappers=errors_translators
        ),
        abstract_column_set=columns,
        writable_storage=storage,
    )

def setup_method(self, test_method):
    super().setup_method(test_method)
    self.app.post = partial(self.app.post, headers={"referer": "test"})

    # values for test data
    self.minutes = 180
    self.skew = timedelta(minutes=self.minutes)
    self.started = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0, tzinfo=pytz.utc
    )
    self.storage = get_writable_storage(StorageKey.SESSIONS_RAW)

def test_backfill_errors() -> None:
    errors_storage = get_writable_storage(StorageKey.ERRORS)
    clickhouse = errors_storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.QUERY
    )
    errors_table_name = errors_storage.get_table_writer().get_schema().get_table_name()

    def get_errors_count() -> int:
        return clickhouse.execute(f"SELECT count() from {errors_table_name}")[0][0]

    raw_events = []
    for i in range(10):
        event = get_raw_event()
        raw_events.append(event)

    events_storage = get_writable_storage(StorageKey.EVENTS)
    write_unprocessed_events(events_storage, raw_events)

    assert get_errors_count() == 0

    backfill_errors()

    assert get_errors_count() == 10

    assert clickhouse.execute(
        f"SELECT contexts.key, contexts.value from {errors_table_name} LIMIT 1;"
    )[0] == (
        (
            "device.model_id",
            "geo.city",
            "geo.country_code",
            "geo.region",
            "os.kernel_version",
        ),
        ("Galaxy", "San Francisco", "US", "CA", "1.1.1"),
    )

def __init__(self) -> None:
    storage = get_writable_storage(StorageKey.ERRORS)
    schema = storage.get_table_writer().get_schema()
    columns = schema.get_columns()

    self.__time_group_columns = {"time": "timestamp", "rtime": "received"}
    self.__time_parse_columns = ("timestamp", "received")

    super().__init__(
        storages=[storage],
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=storage, mappers=errors_translators
        ),
        abstract_column_set=columns,
        writable_storage=storage,
    )

def setup_method(self):
    from snuba.web.views import application

    assert application.testing is True

    self.app = application.test_client()
    self.app.post = partial(self.app.post, headers={"referer": "test"})

    self.storage = get_writable_storage(StorageKey.ERRORS)
    self.replacer = replacer.ReplacerWorker(
        self.storage, DummyMetricsBackend(strict=True)
    )

    self.project_id = 1
    self.event = get_raw_event()

def __init__(
    self,
    writable_storage_key: Optional[StorageKey],
    readable_storage_key: StorageKey,
    value_schema: Sequence[Column[SchemaModifiers]],
    mappers: TranslationMappers,
    abstract_column_set: Optional[ColumnSet] = None,
    validators: Optional[Sequence[QueryValidator]] = None,
) -> None:
    writable_storage = (
        get_writable_storage(writable_storage_key) if writable_storage_key else None
    )
    readable_storage = get_storage(readable_storage_key)
    storages = [readable_storage]
    if writable_storage:
        storages.append(writable_storage)

    if abstract_column_set is None:
        abstract_column_set = ColumnSet(
            [
                Column("org_id", UInt(64)),
                Column("project_id", UInt(64)),
                Column("metric_id", UInt(64)),
                Column("timestamp", DateTime()),
                Column("bucketed_time", DateTime()),
                Column("tags", Nested([("key", UInt(64)), ("value", UInt(64))])),
                *value_schema,
            ]
        )

    if validators is None:
        validators = [
            EntityRequiredColumnValidator({"org_id", "project_id"}),
            GranularityValidator(minimum=10),
        ]

    super().__init__(
        storages=storages,
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(
                readable_storage,
                mappers=TranslationMappers(
                    subscriptables=[
                        SubscriptableMapper(None, "tags", None, "tags"),
                    ],
                ).concat(mappers),
            )
        ),
        abstract_column_set=abstract_column_set,
        join_relationships={},
        writable_storage=writable_storage,
        validators=validators,
        required_time_column="timestamp",
    )

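# Hedged sketch (not from the source): how a concrete metrics entity might
# subclass the constructor above. The base class name MetricsEntity, the
# subclass name, and both StorageKey values are illustrative assumptions,
# and the value column uses UInt(64) purely as a placeholder type.
class MetricsCountersEntity(MetricsEntity):  # name of the class above assumed
    def __init__(self) -> None:
        super().__init__(
            writable_storage_key=StorageKey.METRICS_COUNTERS_BUCKETS,  # assumed key
            readable_storage_key=StorageKey.METRICS_COUNTERS,  # assumed key
            value_schema=[Column("value", UInt(64))],  # placeholder value column
            mappers=TranslationMappers(),
        )
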