Пример #1
0
def describe(entity_name: str) -> None:
    """Print a description of the entity called ``entity_name``.

    The entity is looked up through ``get_entity``; a header line is echoed
    and the entity description is then rendered through a ``CLIDescriber``.
    If ``InvalidEntityError`` is raised at any point, an explanatory message
    is echoed instead of propagating the error.
    """
    not_found_message = (
        f"Entity {entity_name} does not exists or it is not registered."
    )
    try:
        resolved = get_entity(EntityKey(entity_name))
        click.echo(f"Entity {entity_name}")
        description = resolved.describe()
        description.accept(CLIDescriber())
    except InvalidEntityError:
        click.echo(not_found_message)
Пример #2
0
    def get_topic_configuration_for_entity(
        entity_name: str,
    ) -> TopicConfig:
        """Build the ``TopicConfig`` for the entity called ``entity_name``.

        The configuration bundles the partition count of the default topic
        with the commit log and subscription result topic specs, all read
        from the stream loader of the entity's writable storage. Each of the
        storage and the two specs is asserted to be present.
        """
        writable = get_entity(EntityKey(entity_name)).get_writable_storage()
        assert writable is not None
        loader = writable.get_table_writer().get_stream_loader()
        partitions = loader.get_default_topic_spec().partitions_number

        commit_log_spec = loader.get_commit_log_topic_spec()
        assert commit_log_spec is not None

        result_spec = loader.get_subscription_result_topic_spec()
        assert result_spec is not None

        return TopicConfig(partitions, commit_log_spec, result_spec)
Пример #3
0
    def __init__(
        self,
        entity_name: str,
        consumer_group: str,
        followed_consumer_group: str,
        producer: Producer[KafkaPayload],
        auto_offset_reset: str,
        strict_offset_reset: Optional[bool],
        schedule_ttl: int,
        delay_seconds: Optional[int],
        stale_threshold_seconds: Optional[int],
        metrics: MetricsBackend,
    ) -> None:
        """Resolve the topic configuration of an entity and store the settings.

        The commit log topic spec, the scheduled subscriptions topic spec, the
        scheduler mode and the partition count are all read from the stream
        loader of the entity's writable storage. The storage, both specs and
        the mode are asserted to be present. The remaining arguments are
        stored unchanged.
        """
        key = EntityKey(entity_name)
        self.__entity_key = key

        writable_storage = get_entity(key).get_writable_storage()
        assert (
            writable_storage is not None
        ), f"Entity {entity_name} does not have a writable storage by default."

        loader = writable_storage.get_table_writer().get_stream_loader()

        commit_log_spec = loader.get_commit_log_topic_spec()
        assert commit_log_spec is not None
        self.__commit_log_topic_spec = commit_log_spec

        scheduled_spec = loader.get_subscription_scheduled_topic_spec()
        assert scheduled_spec is not None
        self.__scheduled_topic_spec = scheduled_spec

        scheduler_mode = loader.get_subscription_scheduler_mode()
        assert scheduler_mode is not None
        self.__mode = scheduler_mode

        default_spec = loader.get_default_topic_spec()
        self.__partitions = default_spec.partitions_number

        # Plain pass-through settings.
        self.__consumer_group = consumer_group
        self.__followed_consumer_group = followed_consumer_group
        self.__producer = producer
        self.__auto_offset_reset = auto_offset_reset
        self.__strict_offset_reset = strict_offset_reset
        self.__schedule_ttl = schedule_ttl
        self.__delay_seconds = delay_seconds
        self.__stale_threshold_seconds = stale_threshold_seconds
        self.__metrics = metrics
Пример #4
0
    def decode(self, value: KafkaPayload) -> ScheduledSubscriptionTask:
        """Rebuild a ``ScheduledSubscriptionTask`` from a Kafka payload.

        The message key (asserted to be present) carries the subscription
        identifier as UTF-8 text. The message value is a UTF-8 JSON document
        from which the ``entity``, ``timestamp``, ``task.data`` and
        ``tick_upper_offset`` fields are read.
        """
        raw_body = value.value

        assert value.key is not None
        identifier_text = value.key.decode("utf-8")

        fields = rapidjson.loads(raw_body.decode("utf-8"))
        entity_key = EntityKey(fields["entity"])

        # Pieces are built in the same order the nested constructor call
        # would evaluate them.
        timestamp = datetime.fromisoformat(fields["timestamp"])
        identifier = SubscriptionIdentifier.from_string(identifier_text)
        subscription_data = SubscriptionData.from_dict(
            fields["task"]["data"], entity_key
        )
        subscription = Subscription(identifier, subscription_data)
        with_metadata = SubscriptionWithMetadata(
            entity_key, subscription, fields["tick_upper_offset"]
        )

        return ScheduledSubscriptionTask(timestamp, with_metadata)
Пример #5
0
    def get_topics_for_entity(
        entity_name: str,
    ) -> Tuple[KafkaTopicSpec, KafkaTopicSpec]:
        """Return the (scheduled, result) subscription topic specs of an entity.

        ``entity_name`` must be one of ``dataset_entity_names`` from the
        enclosing scope. The entity must have a writable storage whose stream
        loader defines both topic specs; each requirement is asserted.
        """
        assert (
            entity_name in dataset_entity_names
        ), f"Entity {entity_name} does not exist in dataset {dataset_name}"

        writable_storage = get_entity(EntityKey(entity_name)).get_writable_storage()
        assert (
            writable_storage is not None
        ), f"Entity {entity_name} does not have a writable storage by default."

        loader = writable_storage.get_table_writer().get_stream_loader()

        scheduled_spec = loader.get_subscription_scheduled_topic_spec()
        assert scheduled_spec is not None

        result_spec = loader.get_subscription_result_topic_spec()
        assert result_spec is not None

        return scheduled_spec, result_spec
Пример #6
0
def test_joins(
    clauses: Sequence[Tuple[str, str]], expected: JoinClause[QueryEntity]
) -> None:
    """Check that ``build_join_clause`` produces the expected join tree.

    Each clause is a pair of ``"alias:entity"`` strings. Every pair becomes
    an inner-join relationship on ``event_id`` between the two nodes.
    """
    relationships = []

    for left_spec, right_spec in clauses:
        left_alias, left_entity = left_spec.split(":", 1)
        right_alias, right_entity = right_spec.split(":", 1)

        relationship = JoinRelationship(
            rhs_entity=EntityKey(right_entity),
            join_type=JoinType.INNER,
            columns=[("event_id", "event_id")],
            equivalences=[],
        )
        left_node = node(left_alias, left_entity)
        relationship_id = uuid.uuid4().hex
        right_node = node(right_alias, right_entity)
        relationships.append(
            RelationshipTuple(left_node, relationship_id, right_node, relationship)
        )

    assert build_join_clause(relationships) == expected
Пример #7
0
def test_scheduler_consumer() -> None:
    """Run the scheduler consumer against commit log messages end to end.

    One subscription is stored for partition 0 of the ``events`` entity, four
    commits are produced to the commit log topic across two partitions, and
    the test asserts that the mocked scheduler producer was asked to produce
    exactly twice.

    The ``settings.TOPIC_PARTITION_COUNTS`` override is reset in a ``finally``
    clause so that a failing assertion does not leak it into other tests.
    """
    settings.TOPIC_PARTITION_COUNTS = {"events": 2}
    try:
        # NOTE(review): presumably reloaded so the module picks up the
        # partition count override set above - confirm.
        importlib.reload(scheduler_consumer)

        admin_client = AdminClient(get_default_kafka_configuration())
        create_topics(admin_client, [SnubaTopic.COMMIT_LOG])

        metrics_backend = TestingMetricsBackend()
        entity_name = "events"
        entity_key = EntityKey(entity_name)
        entity = get_entity(entity_key)
        storage = entity.get_writable_storage()
        assert storage is not None
        stream_loader = storage.get_table_writer().get_stream_loader()

        commit_log_topic = Topic("snuba-commit-log")

        mock_scheduler_producer = mock.Mock()

        from snuba.redis import redis_client
        from snuba.subscriptions.data import PartitionId, SubscriptionData
        from snuba.subscriptions.entity_subscription import EventsSubscription
        from snuba.subscriptions.store import RedisSubscriptionDataStore

        partition_index = 0

        # Register a single subscription so the scheduler has work to do.
        store = RedisSubscriptionDataStore(
            redis_client, entity_key, PartitionId(partition_index)
        )
        store.create(
            uuid.uuid4(),
            SubscriptionData(
                project_id=1,
                time_window_sec=60,
                resolution_sec=60,
                query="MATCH events SELECT count()",
                entity_subscription=EventsSubscription(data_dict={}),
            ),
        )

        builder = scheduler_consumer.SchedulerBuilder(
            entity_name,
            str(uuid.uuid1().hex),
            "events",
            mock_scheduler_producer,
            "latest",
            False,
            60 * 5,
            None,
            None,
            metrics_backend,
        )
        scheduler = builder.build_consumer()
        # Let the consumer get going before commits are produced.
        time.sleep(2)
        scheduler._run_once()
        scheduler._run_once()
        scheduler._run_once()

        epoch = datetime(1970, 1, 1)

        producer = KafkaProducer(
            build_kafka_producer_configuration(
                stream_loader.get_default_topic_spec().topic,
            )
        )

        # Two commits per partition, one minute apart across partitions.
        for (partition, offset, orig_message_ts) in [
            (0, 0, epoch),
            (1, 0, epoch + timedelta(minutes=1)),
            (0, 1, epoch + timedelta(minutes=2)),
            (1, 1, epoch + timedelta(minutes=3)),
        ]:
            fut = producer.produce(
                commit_log_topic,
                payload=commit_codec.encode(
                    Commit(
                        "events",
                        Partition(commit_log_topic, partition),
                        offset,
                        orig_message_ts,
                    )
                ),
            )
            fut.result()

        producer.close()

        for _ in range(5):
            scheduler._run_once()

        scheduler._shutdown()

        assert mock_scheduler_producer.produce.call_count == 2
    finally:
        # Always undo the global override, even when the test fails.
        settings.TOPIC_PARTITION_COUNTS = {}
Пример #8
0
    def __init__(
        self,
        dataset: Dataset,
        entity_names: Sequence[str],
        partitions: int,
        max_concurrent_queries: int,
        total_concurrent_queries: int,
        producer: Producer[KafkaPayload],
        metrics: MetricsBackend,
        stale_threshold_seconds: Optional[int],
        result_topic: str,
        schedule_ttl: int,
        scheduling_mode: Optional[SchedulingWatermarkMode] = None,
    ) -> None:
        """Set up schedulers, buffer size, executor factory and watermark mode.

        One ``SubscriptionScheduler`` is created per (entity, partition) pair
        and stored as a list (one item per entity) of dicts keyed by partition
        index. When ``scheduling_mode`` is not given, the mode is derived from
        the entities, which must all share the same mode.
        """
        # TODO: self.__partitions might not be the same for each entity
        self.__partitions = partitions
        self.__entity_names = entity_names
        self.__metrics = metrics

        entity_keys = [EntityKey(name) for name in entity_names]

        def make_scheduler(key: EntityKey, index: int) -> SubscriptionScheduler:
            # Scheduler for one partition of one entity, backed by its own
            # Redis subscription store.
            return SubscriptionScheduler(
                key,
                RedisSubscriptionDataStore(redis_client, key, PartitionId(index)),
                partition_id=PartitionId(index),
                cache_ttl=timedelta(seconds=schedule_ttl),
                metrics=metrics,
            )

        self.__schedulers = [
            {index: make_scheduler(key, index) for index in range(partitions)}
            for key in entity_keys
        ]

        # Just apply the max buffer size if they are configured differently
        # for each entity that is being run together
        self.__buffer_size = max(
            settings.SUBSCRIPTIONS_ENTITY_BUFFER_SIZE.get(
                key.value, settings.SUBSCRIPTIONS_DEFAULT_BUFFER_SIZE
            )
            for key in entity_keys
        )

        self.__executor_factory = SubscriptionExecutorProcessingFactory(
            max_concurrent_queries,
            total_concurrent_queries,
            dataset,
            entity_names,
            producer,
            metrics,
            stale_threshold_seconds,
            result_topic,
        )

        if scheduling_mode is None:
            # Collapse the per-entity modes into a set; exactly one distinct
            # mode must remain.
            distinct_modes = {
                self._get_entity_watermark_mode(key) for key in entity_keys
            }
            chosen_mode = distinct_modes.pop()
            assert not distinct_modes, "Entities provided do not share the same mode"
        else:
            chosen_mode = scheduling_mode

        self.__mode = chosen_mode
Пример #9
0
 def setup_method(self) -> None:
     """Prepare per-test fixtures: an hour-truncated "now", a partition id and an entity key."""
     current = datetime.utcnow()
     # Truncate to the start of the current hour.
     self.now = current.replace(minute=0, second=0, microsecond=0)
     self.partition_id = PartitionId(1)
     self.entity_key = EntityKey("events")
Пример #10
0
 def to_python(self, value: str) -> Entity:
     """Convert an entity name into the matching ``Entity`` via ``get_entity``."""
     key = EntityKey(value)
     return get_entity(key)
Пример #11
0
    def convert(data: str, entity: str) -> str:
        """Translate a legacy JSON query body into a JSON body with a SnQL query.

        ``data`` is the JSON-encoded legacy request and ``entity`` is the name
        of the entity to query. The following legacy keys are read:
        ``sample``, ``aggregations``, ``selected_columns``, ``arrayjoin``,
        ``groupby``, ``organization``, ``from_date``, ``to_date``, ``project``,
        ``conditions``, ``having``, ``orderby``, ``limitby``, ``limit``,
        ``offset``, ``granularity`` and ``totals``. The ``consistent``,
        ``debug`` and ``turbo`` settings are copied to the output body when
        they are not None.

        Returns a JSON string of the form ``{"query": "<snql>", ...}``.
        """
        legacy = json.loads(data)

        def func(value: Union[str, Sequence[Any]]) -> str:
            # Non-list values are rendered verbatim, with None mapped to NULL.
            if not isinstance(value, list):
                return f"{value}" if value is not None else "NULL"

            # Lists are treated as [function_name, arguments, optional_alias].
            children = ""
            if isinstance(value[1], list):
                children = ",".join(map(func, value[1]))
            elif value[1]:
                children = func(value[1])

            alias = f" AS {value[2]}" if len(value) > 2 else ""
            return f"{value[0]}({children}){alias}"

        def literal(value: Union[str, Sequence[Any]]) -> str:
            # Sequences become tuple(...), numbers are rendered bare, anything
            # else is single-quoted with embedded quotes escaped.
            if isinstance(value, (list, tuple)):
                return f"tuple({','.join(list(map(literal, value)))})"

            if isinstance(value, (int, float)):
                return f"{value}"
            else:
                escaped = value.replace("'", "\\'")
                return f"'{escaped}'"

        word_ops = ("NOT IN", "IN", "LIKE", "NOT LIKE", "IS NULL", "IS NOT NULL")

        def spaced(op: str) -> str:
            # Word operators need surrounding spaces; symbolic ones do not.
            return f" {op} " if op in word_ops else op

        def or_clause(group: Sequence[Sequence[Any]]) -> str:
            # Render every [lhs, op, rhs] member of the group and OR them.
            return " OR ".join(
                f"{func(member[0])}{spaced(member[1])}{literal(member[2])}"
                for member in group
            )

        sample = legacy.get("sample")
        sample_clause = f"SAMPLE {float(sample)}" if sample else ""
        match_clause = f"MATCH ({entity} {sample_clause})"

        aggregations = []
        for a in legacy.get("aggregations", []):
            # An aggregation already written as a call with no column, e.g.
            # ["count()", None, "alias"], is used verbatim.
            if a[0].endswith(")") and not a[1]:
                aggregations.append(f"{a[0]} AS {a[2]}")
            else:
                aggregations.append(func(a))

        expressions = aggregations + list(map(func, legacy.get("selected_columns", [])))
        select_clause = f"SELECT {', '.join(expressions)}" if expressions else ""

        arrayjoin = legacy.get("arrayjoin")
        if arrayjoin:
            array_join_clause = f"arrayJoin({arrayjoin}) AS {arrayjoin}"
            select_clause = (
                f"SELECT {array_join_clause}"
                if not select_clause
                else f"{select_clause}, {array_join_clause}"
            )

        groupby = legacy.get("groupby", [])
        if groupby and not isinstance(groupby, list):
            groupby = [groupby]

        groupby = ", ".join(map(func, groupby))
        phrase = "BY" if select_clause else f"SELECT {groupby} BY"
        groupby_clause = f"{phrase} {groupby}" if groupby else ""

        conditions = []

        # These conditions are ordered to match how the legacy parser would
        # add these conditions so we can compare SQL queries directly.
        organization = legacy.get("organization")
        if isinstance(organization, int):
            conditions.append(f"org_id={organization}")
        elif isinstance(organization, list):
            # Elements may be ints, so stringify before joining.
            organization = ",".join(map(str, organization))
            conditions.append(f"org_id IN tuple({organization})")

        # Hack to help keep legacy in step with the validation SnQL requires
        main_entity = get_entity(EntityKey(entity))
        if main_entity._required_time_column:
            time_cols = (("from_date", ">="), ("to_date", "<"))
            for col, op in time_cols:
                date_val = legacy.get(col)
                if date_val:
                    conditions.append(
                        f"{main_entity._required_time_column} {op} toDateTime('{date_val}')"
                    )

        project = legacy.get("project")
        if isinstance(project, int):
            conditions.append(f"project_id IN tuple({project})")
        elif isinstance(project, list):
            project = ",".join(map(str, project))
            conditions.append(f"project_id IN tuple({project})")

        for cond in legacy.get("conditions", []):
            # Anything that is not a plain [lhs, op, rhs] triple is a group of
            # triples that are ORed together.
            if len(cond) != 3 or not isinstance(cond[1], str):
                conditions.append(or_clause(cond))
            else:
                rhs = ""
                if cond[1] not in ["IS NULL", "IS NOT NULL"]:
                    rhs = literal(cond[2])

                conditions.append(f"{func(cond[0])}{spaced(cond[1])}{rhs}")

        conditions_str = " AND ".join(conditions)
        where_clause = f"WHERE {conditions_str}" if conditions_str else ""

        having = []
        for cond in legacy.get("having", []):
            if len(cond) != 3 or not isinstance(cond[1], str):
                having.append(or_clause(cond))
            else:
                having.append(f"{func(cond[0])}{spaced(cond[1])}{literal(cond[2])}")

        having_str = " AND ".join(having)
        having_clause = f"HAVING {having_str}" if having_str else ""

        order_by = legacy.get("orderby")
        order_by_str = ""
        if order_by:
            if isinstance(order_by, list):
                parts: List[str] = []
                for part in order_by:
                    order_part = ""
                    if isinstance(part, (list, tuple)):
                        order_part = func(part)
                    else:
                        order_part = part

                    # A leading "-" requests descending order.
                    sort = "ASC"
                    if order_part.startswith("-"):
                        order_part = order_part[1:]
                        sort = "DESC"

                    parts.append(f"{order_part} {sort}")
                order_by_str = ",".join(parts)
            else:
                sort = "ASC"
                if order_by.startswith("-"):
                    order_by = order_by[1:]
                    sort = "DESC"
                order_by_str = f"{order_by} {sort}"
        order_by_clause = f"ORDER BY {order_by_str}" if order_by else ""

        limit_by_clause = ""
        limit_by = legacy.get("limitby")
        if limit_by:
            limit, column = limit_by
            limit_by_clause = f"LIMIT {limit} BY {column}"

        extras = ("limit", "offset", "granularity", "totals")
        extras_clause = " ".join(
            f"{extra.upper()} {legacy.get(extra)}"
            for extra in extras
            if legacy.get(extra)
        )

        query = f"{match_clause} {select_clause} {groupby_clause} {where_clause} {having_clause} {order_by_clause} {limit_by_clause} {extras_clause}"
        body = {"query": query}

        settings_extras = ("consistent", "debug", "turbo")
        for setting in settings_extras:
            if legacy.get(setting) is not None:
                body[setting] = legacy[setting]

        return json.dumps(body)
Пример #12
0
 def visit_entity_name(self, node: Node, visited_children: Tuple[Any]) -> EntityKey:
     """Convert the text of a parsed entity-name node into an ``EntityKey``.

     Raises ``ParsingException`` when ``EntityKey`` rejects the text. The
     original error is attached as the cause so the traceback shows why the
     name was rejected.
     """
     try:
         return EntityKey(node.text)
     except Exception as err:
         raise ParsingException(f"{node.text} is not a valid entity name") from err
Пример #13
0
def test_executor_consumer() -> None:
    """
    End to end integration test

    Builds an executor consumer for the ``events`` entity, produces one
    scheduled subscription task to the scheduled topic and asserts that a
    result carrying the same subscription id is consumed from the result
    topic.
    """
    state.set_config("subscription_mode_events", "new")
    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_SCHEDULED_EVENTS])
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_RESULTS_EVENTS])

    dataset_name = "events"
    entity_name = "events"
    entity_key = EntityKey(entity_name)
    entity = get_entity(entity_key)
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    scheduled_result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert scheduled_result_topic_spec is not None
    result_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_result_topic_spec.topic)
    )

    result_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            scheduled_result_topic_spec.topic,
            str(uuid.uuid1().hex),
            auto_offset_reset="latest",
            strict_offset_reset=False,
        )
    )
    assigned = False

    def on_partitions_assigned(partitions: Mapping[Partition, int]) -> None:
        nonlocal assigned
        assigned = True

    result_consumer.subscribe(
        [Topic(scheduled_result_topic_spec.topic_name)],
        on_assign=on_partitions_assigned,
    )

    attempts = 10
    while attempts > 0 and not assigned:
        result_consumer.poll(1.0)
        attempts -= 1

    # We need to wait for the consumer to receive partitions otherwise,
    # when we try to consume messages, we will not find anything.
    # Subscription is an async process.
    assert assigned, "Did not receive assignment within 10 attempts"

    consumer_group = str(uuid.uuid1().hex)
    auto_offset_reset = "latest"
    strict_offset_reset = False
    executor = build_executor_consumer(
        dataset_name,
        [entity_name],
        consumer_group,
        result_producer,
        2,
        2,
        auto_offset_reset,
        strict_offset_reset,
        TestingMetricsBackend(),
        None,
    )
    for _ in range(1, 5):
        # Give time to the executor to subscribe
        time.sleep(1)
        executor._run_once()

    # Produce a scheduled task to the scheduled subscriptions topic
    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count()",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=EventsSubscription(data_dict={}),
    )

    task = ScheduledSubscriptionTask(
        timestamp=datetime(1970, 1, 1),
        task=SubscriptionWithMetadata(
            entity_key,
            Subscription(
                SubscriptionIdentifier(
                    PartitionId(1),
                    uuid.UUID("91b46cb6224f11ecb2ddacde48001122"),
                ),
                subscription_data,
            ),
            1,
        ),
    )

    encoder = SubscriptionScheduledTaskEncoder()
    encoded_task = encoder.encode(task)

    scheduled_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert scheduled_topic_spec is not None
    tasks_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_topic_spec.topic)
    )

    scheduled_topic = Topic(scheduled_topic_spec.topic_name)
    tasks_producer.produce(scheduled_topic, payload=encoded_task).result()
    tasks_producer.close()

    executor._run_once()
    executor.signal_shutdown()
    # Call run here so that the executor shuts down itself cleanly.
    executor.run()
    result = result_consumer.poll(5)
    assert result is not None, "Did not receive a result message"
    data = json.loads(result.payload.value)
    assert (
        data["payload"]["subscription_id"] == "1/91b46cb6224f11ecb2ddacde48001122"
    ), "Invalid subscription id"

    result_producer.close()
def subscriptions_scheduler_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    followed_consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    schedule_ttl: int,
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
    log_level: Optional[str],
    # TODO: Temporarily overrides the scheduling mode.
    # Required for single tenant since some partitions may be empty.
    # To be removed once transactions is no longer semantically partitioned.
    scheduling_mode: Optional[str],
) -> None:
    """
    Combined subscriptions scheduler and executor. Alternative to the separate scheduler and executor processes.

    Sets up logging, Sentry and metrics, builds a Kafka producer configured
    from the subscription result topic of the first entity, builds the
    combined consumer and runs it until SIGINT or SIGTERM requests a shutdown.
    The producer is closed and the querylog flushed on exit.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.scheduler_executor",
        tags={"dataset": dataset_name},
    )

    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the result topic configuration from the first entity. Later we
    # check they all have the same result topic anyway before building the consumer.
    entity_key = EntityKey(entity_names[0])

    storage = get_entity(entity_key).get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()
    # Use the result topic spec, as the comment above and the variable name
    # state; the scheduled topic spec was being read here by mistake.
    result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            result_topic_spec.topic,
            override_params={"partitioner": "consistent"},
        )
    )

    processor = build_scheduler_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        followed_consumer_group,
        producer,
        auto_offset_reset,
        # The CLI flag is negative; the builder takes the positive form.
        not no_strict_offset_reset,
        schedule_ttl,
        delay_seconds,
        stale_threshold_seconds,
        max_concurrent_queries,
        total_concurrent_queries,
        metrics,
        SchedulingWatermarkMode(scheduling_mode)
        if scheduling_mode is not None
        else None,
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    with closing(producer), flush_querylog():
        processor.run()
Пример #15
0
def subscriptions_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    log_level: Optional[str],
    stale_threshold_seconds: Optional[int],
    cooperative_rebalancing: bool,
) -> None:
    """
    The subscription's executor consumes scheduled subscriptions from the scheduled
    subscription topic for that entity, executes the queries on ClickHouse and publishes
    results on the results topic.

    The process runs until SIGINT or SIGTERM asks the consumer to shut down;
    the producer is closed and the querylog flushed on exit.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.executor",
        tags={"dataset": dataset_name},
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the result topic configuration from the first entity. Later we
    # check they all have the same result topic anyway before building the consumer.
    first_entity_key = EntityKey(entity_names[0])
    writable_storage = get_entity(first_entity_key).get_writable_storage()
    assert writable_storage is not None
    loader = writable_storage.get_table_writer().get_stream_loader()
    result_spec = loader.get_subscription_result_topic_spec()
    assert result_spec is not None

    producer_configuration = build_kafka_producer_configuration(
        result_spec.topic,
        override_params={"partitioner": "consistent"},
    )
    producer = KafkaProducer(producer_configuration)

    # TODO: Consider removing and always passing via CLI.
    # If a value provided via config, it overrides the one provided via CLI.
    # This is so we can quickly change this in an emergency.
    config_key = f"subscriptions_stale_threshold_sec_{dataset_name}"
    stale_threshold_seconds = state.get_config(config_key, stale_threshold_seconds)

    strict_offset_reset = not no_strict_offset_reset
    processor = build_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        producer,
        max_concurrent_queries,
        total_concurrent_queries,
        auto_offset_reset,
        strict_offset_reset,
        metrics,
        stale_threshold_seconds,
        cooperative_rebalancing,
    )

    def handler(signum: int, frame: Any) -> None:
        # TODO: Temporary code for debugging executor shutdown
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)

        processor.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    with closing(producer), flush_querylog():
        processor.run()
Пример #16
0
def node(alias: str, name: str) -> IndividualNode[QueryEntity]:
    """Build a join-tree leaf for the entity called ``name`` under ``alias``."""
    key = EntityKey(name)
    data_model = get_entity(EntityKey(name)).get_data_model()
    return IndividualNode(alias, QueryEntity(key, data_model))
Пример #17
0
def subscriptions_scheduler(
    *,
    entity_name: str,
    consumer_group: str,
    followed_consumer_group: str,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    schedule_ttl: int,
    log_level: Optional[str],
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
) -> None:
    """
    The subscriptions scheduler's job is to schedule subscriptions for a single entity.
    It consumes the commit log for that entity which is used as a clock and determines
    which subscriptions to run at each interval. It produces a message for each
    scheduled subscription task to the scheduled subscription topic for that entity, so
    it can be picked up and run by subscription executors.

    The subscriptions scheduler consists of a tick consumer and three processing steps.

    - The tick consumer consumes the commit log and reads the "orig_message_ts" header.
    It constructs a new `Tick` message representing the intervals between each of the
    original messages, which gets passed to the processing strategy. Note: A tick always
    corresponds to a single partition on the original topic (not the commit log topic
    as that is never partitioned).

    - The first processing step is a tick buffer. It buffers ticks where needed and
    determines when to submit them to the rest of the pipeline. The tick buffer behavior
    depends on the watermark mode specified by the entity. In PARTITION mode, ticks are
    never buffered and immediately submitted to the next step. In GLOBAL mode we wait
    (filling the buffer) until the timestamp of a tick has been reached on every
    partition before eventually submitting a tick to the next step. This guarantees that
    a subscription is never scheduled before data on every partition up to that
    timestamp is written to storage.

    - The second processing step provides the strategy for committing offsets. Ticks are
    marked with an `offset_to_commit` if processing that tick allows the committed
    offset to be advanced. Only the earliest commit log offset that has already been seen
    by the strategy will get committed. This guarantees at least once scheduling of
    subscriptions.

    - The third processing step checks the subscription store to determine which
    subscriptions need to be scheduled for each tick. Each scheduled subscription task
    is encoded and produced to the scheduled topic. Offsets are committed if the
    `should_commit` value provided by the previous strategy is true, and only once all
    prior scheduled subscriptions were successfully produced (and replicated).
    """

    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.scheduler",
        tags={"entity": entity_name},
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    writable_storage = get_entity(EntityKey(entity_name)).get_writable_storage()
    assert (
        writable_storage is not None
    ), f"Entity {entity_name} does not have a writable storage by default."

    # The ordering constraint only applies when both values were supplied.
    assert (
        stale_threshold_seconds is None
        or delay_seconds is None
        or stale_threshold_seconds > delay_seconds
    ), "stale_threshold_seconds must be greater than delay_seconds"

    loader = writable_storage.get_table_writer().get_stream_loader()
    scheduled_spec = loader.get_subscription_scheduled_topic_spec()
    assert scheduled_spec is not None

    producer_configuration = build_kafka_producer_configuration(
        scheduled_spec.topic,
        override_params={"partitioner": "consistent"},
    )
    producer = KafkaProducer(producer_configuration)

    strict_offset_reset = not no_strict_offset_reset
    processor = SchedulerBuilder(
        entity_name,
        consumer_group,
        followed_consumer_group,
        producer,
        auto_offset_reset,
        strict_offset_reset,
        schedule_ttl,
        delay_seconds,
        stale_threshold_seconds,
        metrics,
    ).build_consumer()

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    with closing(producer):
        processor.run()