def run_test(
    self,
    subscriptions: Collection[Subscription],
    start: timedelta,
    end: timedelta,
    expected: Collection[ScheduledSubscriptionTask],
    sort_key: Optional[
        Callable[[ScheduledSubscriptionTask], Tuple[datetime, uuid.UUID]]
    ] = None,
) -> None:
    """Store the given subscriptions, run the scheduler over the tick built
    from *start*/*end*, and assert its output equals *expected*.

    When *sort_key* is provided, the scheduler output is sorted with it
    before comparison so the check is order-insensitive.
    """
    tick = self.build_tick(start, end)
    data_store = RedisSubscriptionDataStore(
        redis_client,
        self.entity_key,
        self.partition_id,
    )
    for sub in subscriptions:
        data_store.create(sub.identifier.uuid, sub.data)

    scheduler = SubscriptionScheduler(
        EntityKey.EVENTS,
        data_store,
        self.partition_id,
        timedelta(minutes=1),
        DummyMetricsBackend(strict=True),
    )

    scheduled = list(scheduler.find(tick))
    if sort_key:
        scheduled.sort(key=sort_key)
    assert scheduled == expected
def test(self) -> None:
    """Create a subscription, verify it was stored, delete it, and verify
    the partition's store is empty afterwards."""
    creator = SubscriptionCreator(self.dataset, EntityKey.EVENTS)
    subscription = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count() AS count",
        time_window_sec=10 * 60,
        resolution_sec=60,
        entity_subscription=create_entity_subscription(),
    )
    identifier = creator.create(subscription, Timer("test"))

    stored = cast(
        List[Tuple[UUID, SubscriptionData]],
        RedisSubscriptionDataStore(
            redis_client,
            self.entity_key,
            identifier.partition,
        ).all(),
    )
    assert stored[0][1] == subscription

    SubscriptionDeleter(self.entity_key, identifier.partition).delete(
        identifier.uuid
    )
    remaining = RedisSubscriptionDataStore(
        redis_client,
        self.entity_key,
        identifier.partition,
    ).all()
    assert remaining == []
def run_test(
    self,
    subscriptions: Collection[Subscription],
    start: timedelta,
    end: timedelta,
    expected: Collection[ScheduledTask[Subscription]],
    sort_key=None,
) -> None:
    """Store *subscriptions*, run the scheduler over the interval built from
    *start*/*end*, and assert its output equals *expected*.

    ``sort_key``, when given, is a key function used to sort the scheduler
    output before comparison so the check is order-insensitive.
    """
    store = RedisSubscriptionDataStore(
        redis_client,
        self.dataset,
        self.partition_id,
    )
    for subscription in subscriptions:
        store.create(subscription.identifier.uuid, subscription.data)
    scheduler = SubscriptionScheduler(
        store,
        self.partition_id,
        timedelta(minutes=1),
        DummyMetricsBackend(strict=True),
    )
    result = list(scheduler.find(self.build_interval(start, end)))
    if sort_key:
        result.sort(key=sort_key)
    assert result == expected
def test(self) -> None:
    """Round-trip a legacy subscription: create, verify stored, delete,
    verify removed."""
    creator = SubscriptionCreator(self.dataset)
    subscription = LegacySubscriptionData(
        project_id=1,
        conditions=[],
        aggregations=[["count()", "", "count"]],
        time_window=timedelta(minutes=10),
        resolution=timedelta(minutes=1),
    )
    identifier = creator.create(subscription, Timer("test"))

    stored = cast(
        List[Tuple[UUID, SubscriptionData]],
        RedisSubscriptionDataStore(
            redis_client,
            self.dataset,
            identifier.partition,
        ).all(),
    )
    assert stored[0][1] == subscription

    SubscriptionDeleter(self.dataset, identifier.partition).delete(identifier.uuid)
    remaining = RedisSubscriptionDataStore(
        redis_client,
        self.dataset,
        identifier.partition,
    ).all()
    assert remaining == []
def create_subscription() -> None:
    """Insert a single dummy events subscription into partition 0's store."""
    data = SubscriptionData(
        project_id=1,
        time_window_sec=60,
        resolution_sec=60,
        query="MATCH (events) SELECT count()",
        entity_subscription=EventsSubscription(data_dict={}),
    )
    store = RedisSubscriptionDataStore(
        redis_client, EntityKey.EVENTS, PartitionId(0)
    )
    store.create(uuid.uuid4(), data)
def __init__(
    self,
    entity_key: EntityKey,
    mode: SchedulingWatermarkMode,
    schedule_ttl: int,
    stale_threshold_seconds: Optional[int],
    partitions: int,
    producer: Producer[KafkaPayload],
    scheduled_topic_spec: KafkaTopicSpec,
    metrics: MetricsBackend,
) -> None:
    """Build one SubscriptionScheduler per partition for *entity_key*,
    each backed by its own Redis subscription data store."""
    self.__mode = mode
    self.__stale_threshold_seconds = stale_threshold_seconds
    self.__partitions = partitions
    self.__producer = producer
    self.__scheduled_topic_spec = scheduled_topic_spec
    self.__metrics = metrics
    # Per-entity buffer size override, falling back to the global default.
    self.__buffer_size = settings.SUBSCRIPTIONS_ENTITY_BUFFER_SIZE.get(
        entity_key.value, settings.SUBSCRIPTIONS_DEFAULT_BUFFER_SIZE)
    # One scheduler per partition index; each scheduler caches its schedule
    # for schedule_ttl seconds.
    self.__schedulers = {
        index: SubscriptionScheduler(
            entity_key,
            RedisSubscriptionDataStore(redis_client, entity_key, PartitionId(index)),
            partition_id=PartitionId(index),
            cache_ttl=timedelta(seconds=schedule_ttl),
            metrics=self.__metrics,
        )
        for index in range(self.__partitions)
    }
def test(self) -> None:
    """Round-trip a subscription: create, verify stored, delete, verify
    removed.

    Bug fix: the two store comparisons were bare expression statements
    (``... == subscription`` / ``... == []``) whose results were discarded,
    so this test could never fail. They are now real ``assert`` statements.
    """
    creator = SubscriptionCreator(self.dataset)
    subscription = SubscriptionData(
        project_id=1,
        conditions=[],
        aggregations=[["count()", "", "count"]],
        time_window=timedelta(minutes=10),
        resolution=timedelta(minutes=1),
    )
    identifier = creator.create(subscription, Timer("test"))
    assert (
        RedisSubscriptionDataStore(
            redis_client,
            self.dataset,
            identifier.partition,
        ).all()[0][1]
        == subscription
    )
    SubscriptionDeleter(self.dataset, identifier.partition).delete(identifier.uuid)
    assert (
        RedisSubscriptionDataStore(
            redis_client,
            self.dataset,
            identifier.partition,
        ).all()
        == []
    )
def test(self, subscription: SubscriptionData) -> None:
    """Creating a subscription persists it to the partition's Redis store."""
    identifier = SubscriptionCreator(self.dataset, EntityKey.EVENTS).create(
        subscription, self.timer
    )
    stored = cast(
        List[Tuple[UUID, SubscriptionData]],
        RedisSubscriptionDataStore(
            redis_client,
            self.entity_key,
            identifier.partition,
        ).all(),
    )
    assert stored[0][1] == subscription
def test(self) -> None:
    """Creating a subscription persists it to the partition's Redis store.

    Bug fix: the final comparison was a bare expression statement
    (``... == subscription``) whose result was discarded, so this test
    could never fail. It is now a real ``assert``.
    """
    creator = SubscriptionCreator(self.dataset)
    subscription = SubscriptionData(
        project_id=123,
        conditions=[["platform", "IN", ["a"]]],
        aggregations=[["count()", "", "count"]],
        time_window=timedelta(minutes=10),
        resolution=timedelta(minutes=1),
    )
    identifier = creator.create(subscription, self.timer)
    assert (
        RedisSubscriptionDataStore(
            redis_client,
            self.dataset,
            identifier.partition,
        ).all()[0][1]
        == subscription
    )
def create(self, data: SubscriptionData, timer: Timer) -> SubscriptionIdentifier:
    """Validate and test-run *data*, persist it to Redis, and return the
    new subscription's identifier (partition + uuid)."""
    data.validate()
    # Exercise the request once before storing it (see _test_request).
    self._test_request(data, timer)

    identifier = SubscriptionIdentifier(
        self.__partitioner.build_partition_id(data), uuid1()
    )
    store = RedisSubscriptionDataStore(
        redis_client, self.entity_key, identifier.partition
    )
    store.create(identifier.uuid, data)
    return identifier
def create(self, data: SubscriptionData, timer: Timer) -> SubscriptionIdentifier:
    """Test-run *data* against the dataset, persist it to Redis, and return
    the new subscription's identifier (partition + uuid).

    Raises whatever ``parse_and_run_query`` raises if the query is invalid.
    """
    # We want to test the query out here to make sure it's valid and can run
    request = data.build_request(self.dataset, datetime.utcnow(), None, timer)
    parse_and_run_query(self.dataset, request, timer)
    identifier = SubscriptionIdentifier(
        self.__partitioner.build_partition_id(data),
        uuid1(),
    )
    RedisSubscriptionDataStore(redis_client, self.dataset, identifier.partition).create(
        identifier.uuid,
        data,
    )
    return identifier
def create(self, data: SubscriptionData, timer: Timer) -> SubscriptionIdentifier:
    """Test-run *data* (both representations for delegate subscriptions),
    persist it to Redis, and return the new subscription's identifier
    (partition + uuid)."""
    # We want to test the query out here to make sure it's valid and can run
    # If there is a delegate subscription, we need to run both the SnQL and Legacy validator
    if isinstance(data, DelegateSubscriptionData):
        self._test_request(data.to_snql(), timer)
        self._test_request(data.to_legacy(), timer)
    else:
        self._test_request(data, timer)
    identifier = SubscriptionIdentifier(
        self.__partitioner.build_partition_id(data),
        uuid1(),
    )
    RedisSubscriptionDataStore(
        redis_client, self.dataset, identifier.partition
    ).create(
        identifier.uuid,
        data,
    )
    return identifier
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
) -> None:
    """Evaluates subscribed queries for a dataset."""
    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    # Fall back to per-dataset (then global) default brokers when none given.
    if not bootstrap_servers:
        bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()

    # Tick consumer synchronized against the commit log: the first inner
    # consumer reads the data topic, the second follows the commit log of
    # the listed groups.
    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
                PassthroughCodec(),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    # Fresh group id so the commit log is always read from
                    # the earliest offset.
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
                CommitCodec(),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        )
    )

    producer = KafkaProducer(
        {
            "bootstrap.servers": ",".join(bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
        SubscriptionResultCodec(),
    )

    with closing(consumer), closing(producer):
        batching_consumer = BatchingConsumer(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            SubscriptionWorker(
                SubscriptionExecutor(
                    dataset,
                    ThreadPoolExecutor(
                        max_workers=settings.SUBSCRIPTIONS_MAX_CONCURRENT_QUERIES
                    ),
                ),
                # One scheduler per partition, each reading subscriptions
                # from its own Redis store.
                {
                    index: SubscriptionScheduler(
                        RedisSubscriptionDataStore(
                            redis_client, dataset, PartitionId(index)
                        ),
                        PartitionId(index),
                        cache_ttl=timedelta(seconds=schedule_ttl),
                    )
                    for index in range(
                        partitions
                        if partitions is not None
                        else loader.get_default_topic_spec().partitions_number
                    )
                },
                producer,
                Topic(result_topic),
            ),
            max_batch_size,
            max_batch_time_ms,
            create_metrics(
                "snuba.subscriptions",
                tags={"group": consumer_group, "dataset": dataset_name},
            ),
        )

        def handler(signum, frame) -> None:
            # Request a graceful shutdown on SIGINT/SIGTERM.
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""
    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)
    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None
    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={
            "group": consumer_group, "dataset": dataset_name
        },
    )

    # Tick consumer synchronized against the commit log; delay_seconds, when
    # given, shifts ticks backwards in time.
    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    # Fresh group id so the commit log is always read from
                    # the earliest offset.
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(commit_log_topic_spec.topic_name)
            ),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1)
            if delay_seconds is not None
            else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )
        ),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug(
        "Starting %r with %s workers...",
        executor,
        getattr(executor, "_max_workers", 0),
    )
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))
        batching_consumer = StreamProcessor(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    # One scheduler per partition, each reading subscriptions
                    # from its own Redis store.
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic)
                    if result_topic is not None
                    else Topic(result_topic_spec.topic_name),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            # Request a graceful shutdown on SIGINT/SIGTERM.
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
def delete(self, subscription_id: UUID) -> None:
    """Remove the subscription identified by *subscription_id* from this
    partition's Redis store."""
    store = RedisSubscriptionDataStore(redis_client, self.dataset, self.partition)
    store.delete(subscription_id)
def test_scheduler_consumer() -> None:
    """End-to-end check of the scheduler consumer: seed one subscription,
    feed four commit-log entries across two partitions, and verify the
    scheduler produced exactly two scheduled tasks."""
    settings.TOPIC_PARTITION_COUNTS = {"events": 2}
    # Reload so the module picks up the patched partition counts.
    importlib.reload(scheduler_consumer)

    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.COMMIT_LOG])

    metrics_backend = TestingMetricsBackend()
    entity_name = "events"
    entity = get_entity(EntityKey(entity_name))
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    commit_log_topic = Topic("snuba-commit-log")

    mock_scheduler_producer = mock.Mock()

    from snuba.redis import redis_client
    from snuba.subscriptions.data import PartitionId, SubscriptionData
    from snuba.subscriptions.entity_subscription import EventsSubscription
    from snuba.subscriptions.store import RedisSubscriptionDataStore

    entity_key = EntityKey(entity_name)
    partition_index = 0

    # Seed one subscription so the scheduler has something to schedule.
    store = RedisSubscriptionDataStore(
        redis_client, entity_key, PartitionId(partition_index)
    )
    store.create(
        uuid.uuid4(),
        SubscriptionData(
            project_id=1,
            time_window_sec=60,
            resolution_sec=60,
            query="MATCH events SELECT count()",
            entity_subscription=EventsSubscription(data_dict={}),
        ),
    )

    builder = scheduler_consumer.SchedulerBuilder(
        entity_name,
        str(uuid.uuid1().hex),
        "events",
        mock_scheduler_producer,
        "latest",
        False,
        60 * 5,
        None,
        None,
        metrics_backend,
    )
    scheduler = builder.build_consumer()
    time.sleep(2)
    scheduler._run_once()
    scheduler._run_once()
    scheduler._run_once()

    epoch = datetime(1970, 1, 1)

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
        )
    )

    # Write four commit-log entries across the two partitions with
    # increasing timestamps.
    for (partition, offset, orig_message_ts) in [
        (0, 0, epoch),
        (1, 0, epoch + timedelta(minutes=1)),
        (0, 1, epoch + timedelta(minutes=2)),
        (1, 1, epoch + timedelta(minutes=3)),
    ]:
        fut = producer.produce(
            commit_log_topic,
            payload=commit_codec.encode(
                Commit(
                    "events",
                    Partition(commit_log_topic, partition),
                    offset,
                    orig_message_ts,
                )
            ),
        )
        fut.result()

    producer.close()

    for _ in range(5):
        scheduler._run_once()

    scheduler._shutdown()

    assert mock_scheduler_producer.produce.call_count == 2

    # Restore global state for other tests.
    settings.TOPIC_PARTITION_COUNTS = {}
def build_store(self, key: str = "1") -> RedisSubscriptionDataStore:
    """Build a Redis-backed subscription store for this test's dataset.

    NOTE(review): *key* is passed as a plain string here, while sibling
    code constructs stores with ``PartitionId(...)`` — confirm the store
    accepts both forms before changing either.
    """
    return RedisSubscriptionDataStore(redis_client, self.dataset, key)
def build_store(self, key: int = 1) -> RedisSubscriptionDataStore:
    """Build a Redis-backed subscription store for partition *key*."""
    partition = PartitionId(key)
    return RedisSubscriptionDataStore(redis_client, self.entity_key, partition)
def __init__(
    self,
    dataset: Dataset,
    entity_names: Sequence[str],
    partitions: int,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    producer: Producer[KafkaPayload],
    metrics: MetricsBackend,
    stale_threshold_seconds: Optional[int],
    result_topic: str,
    schedule_ttl: int,
    scheduling_mode: Optional[SchedulingWatermarkMode] = None,
) -> None:
    """Build per-entity, per-partition schedulers plus an executor factory
    for the given dataset and entities.

    Raises AssertionError when no scheduling_mode is given and the entities
    do not all share the same watermark mode.
    """
    # TODO: self.__partitions might not be the same for each entity
    self.__partitions = partitions
    self.__entity_names = entity_names
    self.__metrics = metrics

    entity_keys = [EntityKey(entity_name) for entity_name in self.__entity_names]

    # One dict of partition-index -> scheduler per entity.
    self.__schedulers = [
        {
            index: SubscriptionScheduler(
                entity_key,
                RedisSubscriptionDataStore(
                    redis_client, entity_key, PartitionId(index)
                ),
                partition_id=PartitionId(index),
                cache_ttl=timedelta(seconds=schedule_ttl),
                metrics=self.__metrics,
            )
            for index in range(self.__partitions)
        }
        for entity_key in entity_keys
    ]

    # Just apply the max buffer size if they are configured differently
    # for each entity that is being run together
    self.__buffer_size = max(
        [
            settings.SUBSCRIPTIONS_ENTITY_BUFFER_SIZE.get(
                entity_key.value, settings.SUBSCRIPTIONS_DEFAULT_BUFFER_SIZE
            )
            for entity_key in entity_keys
        ]
    )

    self.__executor_factory = SubscriptionExecutorProcessingFactory(
        max_concurrent_queries,
        total_concurrent_queries,
        dataset,
        entity_names,
        producer,
        metrics,
        stale_threshold_seconds,
        result_topic,
    )

    if scheduling_mode is not None:
        self.__mode = scheduling_mode
    else:
        # All entities run together must agree on a single watermark mode.
        modes = {
            self._get_entity_watermark_mode(entity_key)
            for entity_key in entity_keys
        }

        mode = modes.pop()

        assert len(modes) == 0, "Entities provided do not share the same mode"

        self.__mode = mode