def test_encode(self):
    result = SubscriptionResult(
        ScheduledTask(
            datetime.now(),
            Subscription(
                SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                SubscriptionData(
                    1,
                    [],
                    [["count()", "", "count"]],
                    timedelta(minutes=1),
                    timedelta(minutes=1),
                ),
            ),
        ),
        {"data": {"count": 100}},
    )
    codec = SubscriptionResultCodec()
    message = codec.encode(result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 1
    payload = data["payload"]
    assert payload["subscription_id"] == str(result.task.task.identifier)
    assert payload["values"] == result.result
    assert payload["timestamp"] == result.task.timestamp.isoformat()

def test(self):
    executor = SubscriptionExecutor(
        self.dataset,
        ThreadPoolExecutor(),
        DummyMetricsBackend(strict=True),
    )
    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        SubscriptionData(
            project_id=self.project_id,
            conditions=[["platform", "IN", ["a"]]],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=500),
            resolution=timedelta(minutes=1),
        ),
    )
    now = datetime.utcnow()
    tick = Tick(
        offsets=Interval(1, 2),
        timestamps=Interval(now - timedelta(minutes=1), now),
    )

    result = executor.execute(ScheduledTask(now, subscription), tick).result()
    assert result["data"][0]["count"] == 10

    result = executor.execute(
        ScheduledTask(
            now + timedelta(minutes=self.minutes) + subscription.data.time_window,
            subscription,
        ),
        tick,
    ).result()
    assert result["data"][0]["count"] == 0

def __get_subscriptions(self) -> List[Subscription]:
    current_time = datetime.now()

    # Rebuild the subscription list from the store once the cache TTL expires.
    if (
        self.__last_refresh is None
        or (current_time - self.__last_refresh) > self.__cache_ttl
    ):
        self.__subscriptions = [
            Subscription(SubscriptionIdentifier(self.__partition_id, uuid), data)
            for uuid, data in self.__store.all()
        ]
        self.__last_refresh = current_time
        self.__metrics.gauge(
            "schedule.size",
            len(self.__subscriptions),
            tags={"partition": str(self.__partition_id)},
        )

    # Staleness is zero immediately after a refresh and grows until the next one.
    self.__metrics.timing(
        "schedule.staleness",
        (current_time - self.__last_refresh).total_seconds() * 1000.0,
        tags={"partition": str(self.__partition_id)},
    )

    return self.__subscriptions

def __get_subscriptions(self, current_time: datetime) -> List[Subscription]:
    # Same TTL-gated refresh as above, but the caller supplies the clock,
    # which keeps this variant deterministic under test.
    if (
        self.__last_refresh is None
        or (current_time - self.__last_refresh) > self.__cache_ttl
    ):
        self.__subscriptions = [
            Subscription(SubscriptionIdentifier(self.__partition_id, uuid), data)
            for uuid, data in self.__store.all()
        ]
        self.__last_refresh = current_time
    return self.__subscriptions

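# The scheduler variants above iterate over self.__store.all(), and the worker
# tests below call store.create(uuid, data). A minimal in-memory sketch of such
# a store, assuming a plain dict keyed by subscription UUID (the real
# DummySubscriptionDataStore used in the tests may differ in detail):
from typing import Dict, List, Tuple
from uuid import UUID


class InMemorySubscriptionDataStore:
    def __init__(self) -> None:
        # UUID -> SubscriptionData, the payload half of a Subscription.
        self.__data: Dict[UUID, SubscriptionData] = {}

    def create(self, key: UUID, data: SubscriptionData) -> None:
        self.__data[key] = data

    def delete(self, key: UUID) -> None:
        self.__data.pop(key, None)

    def all(self) -> List[Tuple[UUID, SubscriptionData]]:
        return list(self.__data.items())
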
def build_subscription(resolution: timedelta, sequence: int) -> Subscription:
    return Subscription(
        SubscriptionIdentifier(PartitionId(1), UUIDS[sequence]),
        SnQLSubscriptionData(
            project_id=1,
            time_window=timedelta(minutes=5),
            resolution=resolution,
            query="MATCH events SELECT count()",
        ),
    )

def build_subscription(self, resolution: timedelta) -> Subscription:
    return Subscription(
        SubscriptionIdentifier(self.partition_id, uuid.uuid4()),
        LegacySubscriptionData(
            project_id=1,
            conditions=[],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=1),
            resolution=resolution,
        ),
    )

def build_subscription(self, resolution: timedelta) -> Subscription:
    return Subscription(
        SubscriptionIdentifier(self.partition_id, uuid.uuid4()),
        SubscriptionData(
            project_id=1,
            query="MATCH (events) SELECT count() AS count",
            time_window_sec=60,
            resolution_sec=int(resolution.total_seconds()),
            entity_subscription=create_entity_subscription(),
        ),
    )

def build_subscription(resolution: timedelta, sequence: int) -> Subscription:
    entity_subscription = EventsSubscription(data_dict={})
    return Subscription(
        SubscriptionIdentifier(PartitionId(1), UUIDS[sequence]),
        SubscriptionData(
            project_id=1,
            time_window_sec=int(timedelta(minutes=5).total_seconds()),
            resolution_sec=int(resolution.total_seconds()),
            query="MATCH events SELECT count()",
            entity_subscription=entity_subscription,
        ),
    )

def test_subscription_worker_consistent(
    subscription_data: SubscriptionData,
) -> None:
    state.set_config("event_subscription_non_consistent_sample_rate", 1)
    broker: Broker[SubscriptionTaskResult] = Broker(
        MemoryMessageStorage(), TestingClock()
    )

    result_topic = Topic("subscription-results")

    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 1

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        subscription_data,
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = TestingMetricsBackend()

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {
            0: SubscriptionScheduler(
                store, PartitionId(0), timedelta(), DummyMetricsBackend(strict=True)
            )
        },
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    worker.process_message(Message(Partition(Topic("events"), 0), 0, tick, now))

    time.sleep(0.1)

    consistent_increments = [
        m
        for m in metrics.calls
        if isinstance(m, Increment) and m.name == "consistent"
    ]
    assert len(consistent_increments) == 1

def test_subscription_task_result_encoder() -> None:
    codec = SubscriptionTaskResultEncoder()

    timestamp = datetime.now()

    entity_subscription = EventsSubscription(data_dict={})
    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count() AS count",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=entity_subscription,
    )

    # XXX: This seems way too coupled to the dataset.
    request = subscription_data.build_request(
        get_dataset("events"), timestamp, None, Timer("timer")
    )
    result: Result = {
        "meta": [{"type": "UInt64", "name": "count"}],
        "data": [{"count": 1}],
    }

    task_result = SubscriptionTaskResult(
        ScheduledSubscriptionTask(
            timestamp,
            SubscriptionWithMetadata(
                EntityKey.EVENTS,
                Subscription(
                    SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                    subscription_data,
                ),
                5,
            ),
        ),
        (request, result),
    )

    message = codec.encode(task_result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 3
    payload = data["payload"]

    assert payload["subscription_id"] == str(
        task_result.task.task.subscription.identifier
    )
    assert payload["request"] == request.original_body
    assert payload["result"] == result
    assert payload["timestamp"] == task_result.task.timestamp.isoformat()
    assert payload["entity"] == EntityKey.EVENTS.value

def test_subscription_task_result_encoder() -> None:
    codec = SubscriptionTaskResultEncoder()

    timestamp = datetime.now()

    subscription_data = LegacySubscriptionData(
        project_id=1,
        conditions=[],
        aggregations=[["count()", "", "count"]],
        time_window=timedelta(minutes=1),
        resolution=timedelta(minutes=1),
    )

    # XXX: This seems way too coupled to the dataset.
    request = subscription_data.build_request(
        get_dataset("events"), timestamp, None, Timer("timer")
    )
    result: Result = {
        "meta": [{"type": "UInt64", "name": "count"}],
        "data": [{"count": 1}],
    }

    task_result = SubscriptionTaskResult(
        ScheduledTask(
            timestamp,
            Subscription(
                SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                subscription_data,
            ),
        ),
        (request, result),
    )

    message = codec.encode(task_result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 2
    payload = data["payload"]

    assert payload["subscription_id"] == str(task_result.task.task.identifier)
    assert payload["request"] == request.body
    assert payload["result"] == result
    assert payload["timestamp"] == task_result.task.timestamp.isoformat()

def test_subscription_task_encoder() -> None:
    encoder = SubscriptionScheduledTaskEncoder()

    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH events SELECT count()",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=EventsSubscription(data_dict={}),
    )

    subscription_id = uuid.UUID("91b46cb6224f11ecb2ddacde48001122")

    epoch = datetime(1970, 1, 1)

    tick_upper_offset = 5

    subscription_with_metadata = SubscriptionWithMetadata(
        EntityKey.EVENTS,
        Subscription(
            SubscriptionIdentifier(PartitionId(1), subscription_id),
            subscription_data,
        ),
        tick_upper_offset,
    )

    task = ScheduledSubscriptionTask(
        timestamp=epoch, task=subscription_with_metadata
    )

    encoded = encoder.encode(task)

    assert encoded.key == b"1/91b46cb6224f11ecb2ddacde48001122"

    assert encoded.value == (
        b"{"
        b'"timestamp":"1970-01-01T00:00:00",'
        b'"entity":"events",'
        b'"task":{'
        b'"data":{"project_id":1,"time_window":60,"resolution":60,"query":"MATCH events SELECT count()"}},'
        b'"tick_upper_offset":5'
        b"}"
    )

    decoded = encoder.decode(encoded)

    assert decoded == task

def generate_message(
    entity_key: EntityKey,
    subscription_identifier: Optional[SubscriptionIdentifier] = None,
) -> Iterator[Message[KafkaPayload]]:
    codec = SubscriptionScheduledTaskEncoder()
    epoch = datetime(1970, 1, 1)
    i = 0

    if subscription_identifier is None:
        subscription_identifier = SubscriptionIdentifier(PartitionId(1), uuid.uuid1())

    data_dict = {}
    if entity_key in (EntityKey.METRICS_SETS, EntityKey.METRICS_COUNTERS):
        data_dict = {"organization": 1}

    entity_subscription = ENTITY_KEY_TO_SUBSCRIPTION_MAPPER[entity_key](
        data_dict=data_dict
    )

    while True:
        payload = codec.encode(
            ScheduledSubscriptionTask(
                epoch + timedelta(minutes=i),
                SubscriptionWithMetadata(
                    entity_key,
                    Subscription(
                        subscription_identifier,
                        SubscriptionData(
                            project_id=1,
                            time_window_sec=60,
                            resolution_sec=60,
                            query=f"MATCH ({entity_key.value}) SELECT count()",
                            entity_subscription=entity_subscription,
                        ),
                    ),
                    i + 1,
                ),
            )
        )

        yield Message(Partition(Topic("test"), 0), i, payload, epoch)
        i += 1

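# Hypothetical usage of generate_message above: pull a few scheduled-task
# messages and round-trip them through the codec. islice and the variable
# names here are illustrative, not part of the original suite.
from itertools import islice

codec = SubscriptionScheduledTaskEncoder()
for message in islice(generate_message(EntityKey.EVENTS), 3):
    decoded = codec.decode(message.payload)
    # Timestamps advance one minute per message: epoch, epoch + 1m, epoch + 2m.
    print(decoded.timestamp)
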
def decode(self, value: KafkaPayload) -> ScheduledSubscriptionTask:
    payload_value = value.value

    # The subscription identifier ("<partition>/<uuid>") travels in the Kafka
    # message key; everything else is JSON in the message body.
    assert value.key is not None
    subscription_identifier = value.key.decode("utf-8")

    scheduled_subscription_dict = rapidjson.loads(payload_value.decode("utf-8"))

    entity_key = EntityKey(scheduled_subscription_dict["entity"])

    return ScheduledSubscriptionTask(
        datetime.fromisoformat(scheduled_subscription_dict["timestamp"]),
        SubscriptionWithMetadata(
            entity_key,
            Subscription(
                SubscriptionIdentifier.from_string(subscription_identifier),
                SubscriptionData.from_dict(
                    scheduled_subscription_dict["task"]["data"], entity_key
                ),
            ),
            scheduled_subscription_dict["tick_upper_offset"],
        ),
    )

def test_subscription_worker(broker: Broker[SubscriptionTaskResult]) -> None:
    result_topic = Topic("subscription-results")

    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 3

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        SubscriptionData(
            project_id=1,
            conditions=[],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=60),
            resolution=frequency,
        ),
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = DummyMetricsBackend(strict=True)

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {0: SubscriptionScheduler(store, PartitionId(0), timedelta(), metrics)},
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    result_futures = worker.process_message(
        Message(Partition(Topic("events"), 0), 0, tick, now)
    )

    assert result_futures is not None and len(result_futures) == evaluations

    # Publish the results.
    worker.flush_batch([result_futures])

    # Check to make sure the results were published.
    # NOTE: This does not cover the ``SubscriptionTaskResultCodec``!
    consumer = broker.get_consumer("group")
    consumer.subscribe([result_topic])

    for i in range(evaluations):
        timestamp = now - frequency * (evaluations - i)

        message = consumer.poll()
        assert message is not None
        assert message.partition.topic == result_topic

        task, future = result_futures[i]
        future_result = request, result = future.result()
        assert message.payload.task.timestamp == timestamp
        assert message.payload == SubscriptionTaskResult(task, future_result)

        # NOTE: The time series extension is folded back into the request
        # body, ideally this would reference the timeseries options in
        # isolation.
        assert (
            request.body.items()
            > {
                "from_date": (timestamp - subscription.data.time_window).isoformat(),
                "to_date": timestamp.isoformat(),
            }.items()
        )

        assert result == {
            "meta": [{"name": "count", "type": "UInt64"}],
            "data": [{"count": 0}],
        }

def test_metrics_subscription_task_result_encoder(
    subscription_cls: Type[EntitySubscription],
    aggregate: str,
    entity_key: EntityKey,
) -> None:
    codec = SubscriptionTaskResultEncoder()

    timestamp = datetime.now()

    entity_subscription = subscription_cls(data_dict={"organization": 1})
    subscription_data = SubscriptionData(
        project_id=1,
        query=(
            f"""
            MATCH ({entity_key.value}) SELECT {aggregate}(value) AS value
            BY project_id, tags[3]
            WHERE org_id = 1
            AND project_id IN array(1)
            AND metric_id = 7
            AND tags[3] IN array(1,2)
            """
        ),
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=entity_subscription,
    )

    # XXX: This seems way too coupled to the dataset.
    request = subscription_data.build_request(
        get_dataset("metrics"), timestamp, None, Timer("timer")
    )
    result: Result = {
        "data": [
            {"project_id": 1, "tags[3]": 13, "value": 8},
            {"project_id": 1, "tags[3]": 4, "value": 46},
        ],
        "meta": [
            {"name": "project_id", "type": "UInt64"},
            {"name": "tags[3]", "type": "UInt64"},
            {"name": "value", "type": "Float64"},
        ],
    }

    task_result = SubscriptionTaskResult(
        ScheduledSubscriptionTask(
            timestamp,
            SubscriptionWithMetadata(
                entity_key,
                Subscription(
                    SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                    subscription_data,
                ),
                5,
            ),
        ),
        (request, result),
    )

    message = codec.encode(task_result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 3
    payload = data["payload"]

    assert payload["subscription_id"] == str(
        task_result.task.task.subscription.identifier
    )
    assert payload["request"] == request.original_body
    assert payload["result"] == result
    assert payload["timestamp"] == task_result.task.timestamp.isoformat()
    assert payload["entity"] == entity_key.value

def test_produce_result() -> None:
    state.set_config("subscription_mode_events", "new")
    epoch = datetime(1970, 1, 1)
    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()
    commit = mock.Mock()

    strategy = ProduceResult(producer, result_topic.name, commit)

    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count() AS count",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=EventsSubscription(data_dict={}),
    )

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid.uuid1()), subscription_data
    )

    request = subscription_data.build_request(
        get_dataset("events"), epoch, None, Timer("timer")
    )
    result: Result = {
        "meta": [{"type": "UInt64", "name": "count"}],
        "data": [{"count": 1}],
    }

    message = Message(
        Partition(scheduled_topic, 0),
        1,
        SubscriptionTaskResult(
            ScheduledSubscriptionTask(
                epoch,
                SubscriptionWithMetadata(EntityKey.EVENTS, subscription, 1),
            ),
            (request, result),
        ),
        epoch,
    )

    strategy.submit(message)

    produced_message = broker_storage.consume(Partition(result_topic, 0), 0)
    assert produced_message is not None
    assert produced_message.payload.key == str(subscription.identifier).encode("utf-8")
    assert broker_storage.consume(Partition(result_topic, 0), 1) is None
    assert commit.call_count == 0
    strategy.poll()
    assert commit.call_count == 1

    # Commit is throttled, so if we immediately submit another message, the
    # commit count will not change.
    strategy.submit(message)
    strategy.poll()
    assert commit.call_count == 1

    # The commit count immediately increases once we call join().
    strategy.join()
    assert commit.call_count == 2

def build_subscription(self, resolution: timedelta) -> Subscription:
    return Subscription(
        SubscriptionIdentifier(self.partition_id, uuid.uuid4()),
        SubscriptionData(1, [], [], timedelta(minutes=1), resolution),
    )

def test_executor_consumer() -> None:
    """
    End to end integration test
    """
    state.set_config("subscription_mode_events", "new")
    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_SCHEDULED_EVENTS])
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_RESULTS_EVENTS])

    dataset_name = "events"
    entity_name = "events"
    entity_key = EntityKey(entity_name)
    entity = get_entity(entity_key)
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    scheduled_result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert scheduled_result_topic_spec is not None
    result_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_result_topic_spec.topic)
    )

    result_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            scheduled_result_topic_spec.topic,
            str(uuid.uuid1().hex),
            auto_offset_reset="latest",
            strict_offset_reset=False,
        )
    )
    assigned = False

    def on_partitions_assigned(partitions: Mapping[Partition, int]) -> None:
        nonlocal assigned
        assigned = True

    result_consumer.subscribe(
        [Topic(scheduled_result_topic_spec.topic_name)],
        on_assign=on_partitions_assigned,
    )

    attempts = 10
    while attempts > 0 and not assigned:
        result_consumer.poll(1.0)
        attempts -= 1

    # We need to wait for the consumer to receive partitions; otherwise, when
    # we try to consume messages, we will not find anything. Subscription is
    # an async process.
    assert assigned, "Did not receive assignment within 10 attempts"

    consumer_group = str(uuid.uuid1().hex)
    auto_offset_reset = "latest"
    strict_offset_reset = False
    executor = build_executor_consumer(
        dataset_name,
        [entity_name],
        consumer_group,
        result_producer,
        2,
        2,
        auto_offset_reset,
        strict_offset_reset,
        TestingMetricsBackend(),
        None,
    )
    for i in range(1, 5):
        # Give the executor time to subscribe.
        time.sleep(1)
        executor._run_once()

    # Produce a scheduled task to the scheduled subscriptions topic.
    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count()",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=EventsSubscription(data_dict={}),
    )

    task = ScheduledSubscriptionTask(
        timestamp=datetime(1970, 1, 1),
        task=SubscriptionWithMetadata(
            entity_key,
            Subscription(
                SubscriptionIdentifier(
                    PartitionId(1), uuid.UUID("91b46cb6224f11ecb2ddacde48001122")
                ),
                subscription_data,
            ),
            1,
        ),
    )

    encoder = SubscriptionScheduledTaskEncoder()
    encoded_task = encoder.encode(task)

    scheduled_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert scheduled_topic_spec is not None
    tasks_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_topic_spec.topic)
    )

    scheduled_topic = Topic(scheduled_topic_spec.topic_name)
    tasks_producer.produce(scheduled_topic, payload=encoded_task).result()
    tasks_producer.close()

    executor._run_once()
    executor.signal_shutdown()
    # Call run here so that the executor shuts itself down cleanly.
    executor.run()
    result = result_consumer.poll(5)
    assert result is not None, "Did not receive a result message"
    data = json.loads(result.payload.value)
    assert (
        data["payload"]["subscription_id"] == "1/91b46cb6224f11ecb2ddacde48001122"
    ), "Invalid subscription id"

    result_producer.close()

def test_subscription_worker(subscription_data: SubscriptionData) -> None:
    broker: Broker[SubscriptionTaskResult] = Broker(
        MemoryMessageStorage(), TestingClock()
    )

    result_topic = Topic("subscription-results")

    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 3

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        subscription_data,
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = DummyMetricsBackend(strict=True)

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {0: SubscriptionScheduler(store, PartitionId(0), timedelta(), metrics)},
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    result_futures = worker.process_message(
        Message(Partition(Topic("events"), 0), 0, tick, now)
    )

    assert result_futures is not None and len(result_futures) == evaluations

    # Publish the results.
    worker.flush_batch([result_futures])

    # Check to make sure the results were published.
    # NOTE: This does not cover the ``SubscriptionTaskResultCodec``!
    consumer = broker.get_consumer("group")
    consumer.subscribe([result_topic])

    for i in range(evaluations):
        timestamp = now - frequency * (evaluations - i)

        message = consumer.poll()
        assert message is not None
        assert message.partition.topic == result_topic

        task, future = result_futures[i]
        future_result = request, result = future.result()
        assert message.payload.task.timestamp == timestamp
        assert message.payload == SubscriptionTaskResult(task, future_result)

        # NOTE: The time series extension is folded back into the request
        # body, ideally this would reference the timeseries options in
        # isolation.
        from_pattern = FunctionCall(
            String(ConditionFunctions.GTE),
            (
                Column(None, String("timestamp")),
                Literal(Datetime(timestamp - subscription.data.time_window)),
            ),
        )
        to_pattern = FunctionCall(
            String(ConditionFunctions.LT),
            (Column(None, String("timestamp")), Literal(Datetime(timestamp))),
        )

        condition = request.query.get_condition()
        assert condition is not None

        conditions = get_first_level_and_conditions(condition)

        assert any([from_pattern.match(e) for e in conditions])
        assert any([to_pattern.match(e) for e in conditions])

        assert result == {
            "meta": [{"name": "count", "type": "UInt64"}],
            "data": [{"count": 0}],
        }