Example #1
    def create(
        self, commit: Callable[[Mapping[Partition, int]], None]
    ) -> ProcessingStrategy[KafkaPayload]:
        collect = CollectStep(
            self.__build_write_step,
            commit,
            self.__max_batch_size,
            self.__max_batch_time,
        )

        transform_function = functools.partial(process_message, self.__processor)

        strategy: ProcessingStrategy[KafkaPayload]
        if self.__processes is None:
            strategy = TransformStep(transform_function, collect)
        else:
            assert self.__input_block_size is not None
            assert self.__output_block_size is not None
            strategy = ParallelTransformStep(
                transform_function,
                collect,
                self.__processes,
                max_batch_size=self.__max_batch_size,
                max_batch_time=self.__max_batch_time,
                input_block_size=self.__input_block_size,
                output_block_size=self.__output_block_size,
                metrics=MetricsWrapper(self.__metrics, "process"),
            )

        if self.__prefilter is not None:
            strategy = FilterStep(self.__should_accept, strategy)

        return strategy
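
The factory above assembles the pipeline from the inside out: the collect step is created first, then wrapped by a (possibly parallel) transform step, and finally by an optional filter step, so each stage hands its output to the stage built before it. The sketch below illustrates only that wrapping pattern with minimal stand-in classes; the class names mirror the example, but the simplified submit() interface is an assumption for illustration, not the real arroyo/snuba API.

from typing import Callable, List


class CollectStep:
    """Terminal stage: accumulates whatever reaches it (stand-in for batching/writing)."""

    def __init__(self) -> None:
        self.collected: List[object] = []

    def submit(self, message: object) -> None:
        self.collected.append(message)


class TransformStep:
    """Applies a function to each message and forwards the result to the next step."""

    def __init__(
        self, function: Callable[[object], object], next_step: CollectStep
    ) -> None:
        self.__function = function
        self.__next_step = next_step

    def submit(self, message: object) -> None:
        self.__next_step.submit(self.__function(message))


class FilterStep:
    """Drops messages failing the predicate before they reach the wrapped step."""

    def __init__(
        self, predicate: Callable[[object], bool], next_step: TransformStep
    ) -> None:
        self.__predicate = predicate
        self.__next_step = next_step

    def submit(self, message: object) -> None:
        if self.__predicate(message):
            self.__next_step.submit(message)


# Built inside-out, mirroring the factory above: collect, then transform, then filter.
collect = CollectStep()
strategy = FilterStep(lambda i: i % 2 == 0, TransformStep(lambda i: i * 10, collect))
for value in range(5):
    strategy.submit(value)
print(collect.collected)  # [0, 20, 40]
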
Example #2
def write_step() -> ProcessedMessageBatchWriter:
    return ProcessedMessageBatchWriter(
        insert_batch_writer=InsertBatchWriter(
            writer, MetricsWrapper(metrics, "insertions")
        ),
        replacement_batch_writer=ReplacementBatchWriter(
            replacements_producer, Topic("replacements")
        ),
    )
Example #3
def build_writer() -> ProcessedMessageBatchWriter:
    return ProcessedMessageBatchWriter(
        InsertBatchWriter(
            MockBatchWriter(
                storage.get_storage_key(), avg_write_latency, std_deviation
            ),
            MetricsWrapper(
                metrics,
                "mock_insertions",
                {"storage": storage.get_storage_key().value},
            ),
        ),
        MockReplacementBatchWriter() if with_replacements is not None else None,
    )
Example #4
    def build_writer() -> ProcessedMessageBatchWriter:
        insert_batch_writer = InsertBatchWriter(
            writer, MetricsWrapper(metrics, "insertions"))

        replacement_batch_writer: Optional[ReplacementBatchWriter]
        if supports_replacements:
            assert replacements_producer is not None
            assert replacements_topic is not None
            replacement_batch_writer = ReplacementBatchWriter(
                replacements_producer, replacements_topic)
        else:
            replacement_batch_writer = None

        return ProcessedMessageBatchWriter(insert_batch_writer,
                                           replacement_batch_writer)
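
Examples 2 through 4 all hand an InsertBatchWriter plus an optional ReplacementBatchWriter to ProcessedMessageBatchWriter, passing None when the storage does not support replacements. The sketch below shows that optional-collaborator pattern with hypothetical writer classes (not the snuba implementations) and why an assert guards the branch that actually uses the replacement writer.

from typing import List, Optional


class InsertWriter:
    def write(self, rows: List[dict]) -> None:
        print(f"inserted {len(rows)} rows")


class ReplacementWriter:
    def write(self, replacements: List[dict]) -> None:
        print(f"produced {len(replacements)} replacements")


class BatchWriter:
    """Writes inserts always, replacements only when a writer was provided."""

    def __init__(
        self,
        insert_writer: InsertWriter,
        replacement_writer: Optional[ReplacementWriter],
    ) -> None:
        self.__insert_writer = insert_writer
        self.__replacement_writer = replacement_writer

    def flush(self, inserts: List[dict], replacements: List[dict]) -> None:
        self.__insert_writer.write(inserts)
        if replacements:
            # Mirrors the asserts in the example: replacements only make sense
            # when the storage was configured with a replacement writer.
            assert self.__replacement_writer is not None
            self.__replacement_writer.write(replacements)


def build_writer(supports_replacements: bool) -> BatchWriter:
    replacement_writer = ReplacementWriter() if supports_replacements else None
    return BatchWriter(InsertWriter(), replacement_writer)


build_writer(supports_replacements=True).flush([{"id": 1}], [{"group_id": 7}])
build_writer(supports_replacements=False).flush([{"id": 2}], [])
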
Example #5
    def __build_batch_writer(
            self,
            storage: WritableTableStorage) -> ProcessedMessageBatchWriter:
        replacement_batch_writer: Optional[ReplacementBatchWriter]
        stream_loader = storage.get_table_writer().get_stream_loader()
        replacement_topic_spec = stream_loader.get_replacement_topic_spec()
        default_topic_spec = stream_loader.get_default_topic_spec()
        if replacement_topic_spec is not None:
            # XXX: The producer is flushed when closed on strategy teardown
            # after an assignment is revoked, but never explicitly closed.
            # XXX: This assumes that the Kafka cluster used for the input topic
            # to the storage is the same as the replacement topic.
            replacement_batch_writer = ReplacementBatchWriter(
                ConfluentKafkaProducer(
                    build_kafka_producer_configuration(
                        default_topic_spec.topic,
                        override_params={
                            "partitioner": "consistent",
                            "message.max.bytes":
                            50000000,  # 50MB, default is 1MB
                        },
                    )),
                Topic(replacement_topic_spec.topic_name),
            )
        else:
            replacement_batch_writer = None

        return ProcessedMessageBatchWriter(
            InsertBatchWriter(
                storage.get_table_writer().get_batch_writer(
                    self.__metrics,
                    {
                        "load_balancing": "in_order",
                        "insert_distributed_sync": 1
                    },
                ),
                MetricsWrapper(
                    self.__metrics,
                    "insertions",
                    {"storage": storage.get_storage_key().value},
                ),
            ),
            replacement_batch_writer,
        )
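
build_kafka_producer_configuration is called here with override_params to tune the partitioner and raise message.max.bytes. The helper itself is not shown in these snippets; the sketch below illustrates the presumed layered-merge behavior (overrides win over base settings) with a hypothetical stand-in function and base configuration.

from typing import Any, Mapping, MutableMapping, Optional


def build_producer_configuration(
    base: Mapping[str, Any],
    override_params: Optional[Mapping[str, Any]] = None,
) -> MutableMapping[str, Any]:
    """Hypothetical stand-in: start from the cluster defaults, then apply overrides."""
    configuration: MutableMapping[str, Any] = dict(base)
    if override_params is not None:
        configuration.update(override_params)
    return configuration


base_config = {"bootstrap.servers": "localhost:9092", "message.max.bytes": 1000000}
producer_config = build_producer_configuration(
    base_config,
    override_params={
        "partitioner": "consistent",
        "message.max.bytes": 50000000,  # 50MB, matching the override in the example
    },
)
print(producer_config["message.max.bytes"])  # 50000000
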
Example #6
from snuba.query.data_source.visitor import DataSourceVisitor
from snuba.querylog import record_query
from snuba.querylog.query_metadata import SnubaQueryMetadata
from snuba.reader import Reader
from snuba.request import Request
from snuba.request.request_settings import RequestSettings
from snuba.util import with_span
from snuba.utils.metrics.gauge import Gauge
from snuba.utils.metrics.timer import Timer
from snuba.utils.metrics.wrapper import MetricsWrapper
from snuba.web import QueryException, QueryResult, transform_column_names
from snuba.web.db_query import raw_query

logger = logging.getLogger("snuba.query")

metrics = MetricsWrapper(environment.metrics, "api")

MAX_QUERY_SIZE_BYTES = 256 * 1024  # 256 KiB by default


class SampleClauseFinder(
    DataSourceVisitor[bool, Entity], JoinVisitor[bool, Entity]
):
    """
    Traverses a query to find FROM clauses that have a sampling
    rate set to check if turbo is set as well.
    """
    def _visit_simple_source(self, data_source: Entity) -> bool:
        return data_source.sample is not None and data_source.sample != 1.0

    def _visit_join(self, data_source: JoinClause[Entity]) -> bool:
        return self.visit_join_clause(data_source)
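
SampleClauseFinder implements both DataSourceVisitor and JoinVisitor so one object can walk simple entities as well as join trees, returning True as soon as any FROM clause carries a sample rate other than 1.0. The toy classes below reproduce that traversal with plain isinstance dispatch; they are illustrative stand-ins, not the snuba query AST.

from dataclasses import dataclass
from typing import Optional, Union


@dataclass(frozen=True)
class SimpleSource:
    name: str
    sample: Optional[float] = None


@dataclass(frozen=True)
class Join:
    left: "DataSource"
    right: "DataSource"


DataSource = Union[SimpleSource, Join]


class SampleFinder:
    """Returns True if any source in the tree sets a sampling rate other than 1.0."""

    def visit(self, data_source: DataSource) -> bool:
        if isinstance(data_source, SimpleSource):
            return data_source.sample is not None and data_source.sample != 1.0
        # Join clause: a sample on either side is enough.
        return self.visit(data_source.left) or self.visit(data_source.right)


query_from = Join(SimpleSource("errors"), SimpleSource("transactions", sample=0.1))
print(SampleFinder().visit(query_from))  # True
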
Example #7
def subscriptions_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    log_level: Optional[str],
    stale_threshold_seconds: Optional[int],
    cooperative_rebalancing: bool,
) -> None:
    """
    The subscription's executor consumes scheduled subscriptions from the scheduled
    subscription topic for that entity, executes the queries on ClickHouse and publishes
    results on the results topic.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.executor",
        tags={"dataset": dataset_name},
    )

    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the result topic configuration from the first entity. Later we
    # check they all have the same result topic anyway before building the consumer.
    entity_key = EntityKey(entity_names[0])

    storage = get_entity(entity_key).get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()
    result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            result_topic_spec.topic,
            override_params={"partitioner": "consistent"},
        ))

    # TODO: Consider removing and always passing via CLI.
    # If a value provided via config, it overrides the one provided via CLI.
    # This is so we can quickly change this in an emergency.
    stale_threshold_seconds = state.get_config(
        f"subscriptions_stale_threshold_sec_{dataset_name}",
        stale_threshold_seconds)

    processor = build_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        producer,
        max_concurrent_queries,
        total_concurrent_queries,
        auto_offset_reset,
        not no_strict_offset_reset,
        metrics,
        stale_threshold_seconds,
        cooperative_rebalancing,
    )

    def handler(signum: int, frame: Any) -> None:
        # TODO: Temporary code for debugging executor shutdown
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)

        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    with closing(producer), flush_querylog():
        processor.run()
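
The executor registers the same handler for SIGINT and SIGTERM so that both an orchestrator stop and a Ctrl-C request a clean shutdown, and the producer is closed through contextlib.closing once the run loop exits. A condensed, self-contained version of that shutdown pattern, with dummy stand-ins for the processor and producer, looks like this:

import signal
from contextlib import closing
from types import FrameType
from typing import Optional


class DummyProducer:
    def close(self) -> None:
        print("producer flushed and closed")


class DummyProcessor:
    def __init__(self) -> None:
        self.__shutdown_requested = False

    def signal_shutdown(self) -> None:
        self.__shutdown_requested = True

    def run(self) -> None:
        # Real processors loop until signal_shutdown() is called; this one
        # returns immediately so the sketch terminates on its own.
        print("running until shutdown is requested")


producer = DummyProducer()
processor = DummyProcessor()


def handler(signum: int, frame: Optional[FrameType]) -> None:
    processor.signal_shutdown()


signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)

with closing(producer):
    processor.run()
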
Example #8
from dateutil.tz import tz

from snuba import environment, settings, state
from snuba.clickhouse.errors import ClickhouseError
from snuba.clickhouse.formatter.nodes import FormattedQuery
from snuba.reader import Reader, Result, build_result_transformer
from snuba.utils.metrics.gauge import ThreadSafeGauge
from snuba.utils.metrics.wrapper import MetricsWrapper

logger = logging.getLogger("snuba.clickhouse")
trace_logger = logging.getLogger("clickhouse_driver.log")
trace_logger.setLevel("INFO")

Params = Optional[Union[Sequence[Any], Mapping[str, Any]]]

metrics = MetricsWrapper(environment.metrics, "clickhouse.native")


class ClickhouseProfile(TypedDict):
    bytes: int
    blocks: int
    rows: int
    elapsed: float


@dataclass(frozen=True)
class ClickhouseResult:
    results: Sequence[Any] = field(default_factory=list)
    meta: Sequence[Any] | None = None
    profile: ClickhouseProfile | None = None
    trace_output: str = ""
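
ClickhouseProfile and ClickhouseResult bundle the rows returned by the native driver with optional metadata and profiling counters. Assuming the definitions above, a result could be constructed and consumed roughly as follows (the values are made up for illustration):

profile: ClickhouseProfile = {
    "bytes": 4096,
    "blocks": 1,
    "rows": 2,
    "elapsed": 0.0042,
}

result = ClickhouseResult(
    results=[(1, "error"), (2, "transaction")],
    meta=[{"name": "event_id"}, {"name": "type"}],
    profile=profile,
)

# frozen=True makes the container immutable; derived values are computed, not assigned.
assert result.profile is not None
rows_per_second = result.profile["rows"] / result.profile["elapsed"]
print(f"{len(result.results)} rows, {rows_per_second:.0f} rows/s")
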
Example #9
    flatten_nested_field,
)
from snuba.processor import (
    InsertBatch,
    MessageProcessor,
    ProcessedMessage,
    _as_dict_safe,
    _ensure_valid_date,
    _ensure_valid_ip,
    _unicodify,
)
from snuba.utils.metrics.wrapper import MetricsWrapper

logger = logging.getLogger(__name__)

metrics = MetricsWrapper(environment.metrics, "transactions.processor")

UNKNOWN_SPAN_STATUS = 2


class TransactionsMessageProcessor(MessageProcessor):
    PROMOTED_TAGS = {
        "environment",
        "sentry:release",
        "sentry:user",
        "sentry:dist",
    }

    def __extract_timestamp(self, field):
        timestamp = _ensure_valid_date(datetime.fromtimestamp(field))
        if timestamp is None:
Example #10
from typing import Any, Mapping, Optional

from sentry_relay import DataCategory

from snuba import environment, settings
from snuba.consumers.types import KafkaMessageMetadata
from snuba.processor import (
    InsertBatch,
    MessageProcessor,
    ProcessedMessage,
    _ensure_valid_date,
    _unicodify,
)
from snuba.utils.metrics.wrapper import MetricsWrapper

metrics = MetricsWrapper(environment.metrics, "outcomes.processor")


class OutcomesProcessor(MessageProcessor):
    def process_message(
            self, value: Mapping[str, Any],
            metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]:
        assert isinstance(value, dict)
        v_uuid = value.get("event_id")

        if value["outcome"] != 4:  # we dont care about abuse outcomes for these metrics
            if "category" not in value:
                metrics.increment("missing_category")
            if "quantity" not in value:
                metrics.increment("missing_quantity")
Example #11
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={
            "group": consumer_group,
            "dataset": dataset_name
        },
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            (Topic(commit_log_topic) if commit_log_topic is not None else
             Topic(commit_log_topic_spec.topic_name)),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1) if delay_seconds is not None else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor,
                 getattr(executor, "_max_workers", 0))
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))
        batching_consumer = StreamProcessor(
            consumer,
            (Topic(topic) if topic is not None else Topic(
                loader.get_default_topic_spec().topic_name)),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(redis_client, dataset,
                                                       PartitionId(index)),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic) if result_topic is not None else Topic(
                        result_topic_spec.topic_name),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
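
One detail worth calling out above: the worker count is read with getattr(executor, "_max_workers", 0) because ThreadPoolExecutor keeps that value in a private attribute, so the code degrades to 0 instead of crashing if the attribute ever changes. The same defensive read in isolation (the gauge call in the example would receive this value):

from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=4)

# _max_workers is an implementation detail of ThreadPoolExecutor, hence the
# getattr with a default instead of a direct attribute access.
worker_count = getattr(executor, "_max_workers", 0)
print(f"executor configured with {worker_count} workers")

executor.shutdown()
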
Example #12
            "contexts",
            "geo.country_code",
            nullable=True,
        ),
        ColumnToMapping(
            None, "geo_region", None, "contexts", "geo.region", nullable=True
        ),
        ColumnToMapping(None, "geo_city", None, "contexts", "geo.city", nullable=True),
    ],
    subscriptables=[
        SubscriptableMapper(None, "tags", None, "tags"),
        SubscriptableMapper(None, "contexts", None, "contexts"),
    ],
)

metrics = MetricsWrapper(environment.metrics, "snuplicator")


def callback_func(
    storage: str,
    query: Query,
    request_settings: RequestSettings,
    referrer: str,
    results: List[Result[QueryResult]],
) -> None:
    cache_hit = False
    is_duplicate = False

    # Captures if any of the queries involved was a cache hit or duplicate, as cache
    # hits may be a cause of inconsistency between results.
    # Doesn't attempt to distinguish between all of the specific scenarios (one or both
Example #13
from snuba import environment, state
from snuba.query.conditions import (
    BooleanFunctions,
    ConditionFunctions,
    binary_condition,
)
from snuba.query.expressions import Column, Literal
from snuba.query.extensions import QueryExtension
from snuba.query.logical import Query
from snuba.query.processors import ExtensionData, ExtensionQueryProcessor
from snuba.request.request_settings import RequestSettings
from snuba.util import parse_datetime
from snuba.utils.metrics.decorators import track_calls
from snuba.utils.metrics.wrapper import MetricsWrapper

timeseries_metrics = MetricsWrapper(environment.metrics,
                                    "extensions.timeseries")


def get_time_series_extension_properties(default_granularity: int,
                                         default_window: timedelta):
    return {
        "type": "object",
        "properties": {
            "from_date": {
                "type":
                "string",
                "format":
                "date-time",
                "default":
                track_calls(
                    timeseries_metrics,
Example #14
import logging
from typing import Sequence

from snuba import environment, state
from snuba.query.data_source import DataSource
from snuba.query.expressions import Expression
from snuba.query.functions import is_valid_global_function
from snuba.query.validation import FunctionCallValidator, InvalidFunctionCall
from snuba.utils.metrics.wrapper import MetricsWrapper

logger = logging.getLogger(__name__)
metrics = MetricsWrapper(environment.metrics, "validation.functions")


class AllowedFunctionValidator(FunctionCallValidator):
    """
    Validates that the function itself is allowed. This does not validate
    that the function is correctly formed.
    """
    def validate(self, func_name: str, parameters: Sequence[Expression],
                 data_source: DataSource) -> None:

        if is_valid_global_function(func_name):
            return

        if state.get_config("function-validator.enabled", False):
            raise InvalidFunctionCall(f"Invalid function name: {func_name}")
        else:
            metrics.increment("invalid_funcs", tags={"func_name": func_name})
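
AllowedFunctionValidator only raises when the function-validator.enabled config flag is set; otherwise it just counts the offending function name, which allows the check to run in report-only mode before being enforced. A stripped-down version of that gated-enforcement pattern, with an in-memory config and counter standing in for snuba's state and metrics:

from collections import Counter
from typing import Dict, Set


class InvalidFunctionCall(Exception):
    pass


ALLOWED_FUNCTIONS: Set[str] = {"count", "uniq", "quantile"}
CONFIG: Dict[str, bool] = {"function-validator.enabled": False}
invalid_function_counter: Counter = Counter()


def validate_function(func_name: str) -> None:
    if func_name in ALLOWED_FUNCTIONS:
        return
    if CONFIG["function-validator.enabled"]:
        # Enforcing mode: reject the query outright.
        raise InvalidFunctionCall(f"Invalid function name: {func_name}")
    # Report-only mode: record the offender so it shows up in metrics/dashboards.
    invalid_function_counter[func_name] += 1


validate_function("count")
validate_function("sneakyFunction")
print(invalid_function_counter)  # Counter({'sneakyFunction': 1})
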
Example #15
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    commit_log_topic: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    dead_letter_topic: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:

    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    logger.info("Consumer Starting")
    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper())
                    for name in storage_names)
    }

    topics = {
        storage.get_table_writer().get_stream_loader().get_default_topic_spec().topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    commit_log: Optional[Topic]
    if commit_log_topic:
        commit_log = Topic(commit_log_topic)
    else:
        # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
        # time. (It is less easily modified.) This also assumes the commit log
        # topic is on the same Kafka cluster as the input topic.
        commit_log_topics = {
            spec.topic_name
            for spec in (
                storage.get_table_writer().get_stream_loader().get_commit_log_topic_spec()
                for storage in storages.values()
            )
            if spec is not None
        }

        if commit_log_topics:
            commit_log = Topic(commit_log_topics.pop())
        else:
            commit_log = None

        if commit_log_topics:
            raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (
        storages[storage_keys[0]]
        .get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        strict_offset_reset=not no_strict_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    if cooperative_rebalancing is True:
        consumer_configuration["partition.assignment.strategy"] = "cooperative-sticky"

    for storage_key in storage_keys[1:]:
        other_configuration = build_kafka_consumer_configuration(
            storages[storage_key]
            .get_table_writer()
            .get_stream_loader()
            .get_default_topic_spec()
            .topic,
            consumer_group,
        )
        if (
            other_configuration["bootstrap.servers"]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError("storages cannot be located on different Kafka clusters")

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={
            "group": consumer_group,
            "storage": "_".join([storage_keys[0].value, "m"]),
        },
    )
    # Collect metrics from librdkafka if we have stats_collection_freq_ms set
    # for the consumer group, or use the default.
    stats_collection_frequency_ms = get_config(
        f"stats_collection_freq_ms_{consumer_group}",
        get_config("stats_collection_freq_ms", 0),
    )

    if stats_collection_frequency_ms and stats_collection_frequency_ms > 0:

        def stats_callback(stats_json: str) -> None:
            stats = rapidjson.loads(stats_json)
            metrics.gauge("librdkafka.total_queue_size",
                          stats.get("replyq", 0))

        consumer_configuration.update({
            "statistics.interval.ms": stats_collection_frequency_ms,
            "stats_cb": stats_callback,
        })
    if commit_log is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located on
        # the same Kafka cluster (validated above).

        commit_log_topic_spec = (
            storages[storage_keys[0]]
            .get_table_writer()
            .get_stream_loader()
            .get_commit_log_topic_spec()
        )
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic))
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log,
        )

    dead_letter_producer: Optional[KafkaProducer] = None
    dead_letter_queue: Optional[Topic] = None
    if dead_letter_topic:
        dead_letter_queue = Topic(dead_letter_topic)

        dead_letter_producer = KafkaProducer(
            build_kafka_producer_configuration(
                StreamsTopic(dead_letter_topic)))

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            parallel_collect=parallel_collect,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
            producer=dead_letter_producer,
            topic=dead_letter_queue,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)
    if dead_letter_producer:
        with closing(dead_letter_producer):
            processor.run()
    else:
        processor.run()
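
When stats collection is enabled, the consumer registers a stats_cb together with statistics.interval.ms so librdkafka periodically delivers a JSON statistics blob, from which only the reply queue size is extracted and gauged. The callback in isolation, using the standard json module in place of rapidjson and a print in place of metrics.gauge:

import json


def stats_callback(stats_json: str) -> None:
    """Parse the librdkafka statistics payload and report the reply queue size."""
    stats = json.loads(stats_json)
    # "replyq" is the number of operations waiting in librdkafka's reply queue; the
    # example forwards it via metrics.gauge("librdkafka.total_queue_size", ...).
    print("librdkafka.total_queue_size", stats.get("replyq", 0))


consumer_configuration = {
    "statistics.interval.ms": 30000,  # example value; emit stats every 30s
    "stats_cb": stats_callback,
}

# Simulate librdkafka invoking the callback with a (heavily trimmed) stats payload.
stats_callback(json.dumps({"replyq": 3, "brokers": {}}))
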
Example #16
from datetime import datetime
from typing import Any, Mapping, Optional
from uuid import UUID

from snuba import environment
from snuba.consumers.types import KafkaMessageMetadata
from snuba.processor import InsertBatch, MessageProcessor, ProcessedMessage
from snuba.utils.metrics.wrapper import MetricsWrapper

metrics = MetricsWrapper(environment.metrics, "profiles.processor")

RETENTION_DAYS_ALLOWED = frozenset([30, 90])


class ProfilesMessageProcessor(MessageProcessor):
    def process_message(
            self, message: Mapping[str, Any],
            metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]:
        try:
            retention_days = message["retention_days"]
            if retention_days not in RETENTION_DAYS_ALLOWED:
                retention_days = 30
            processed = {
                "organization_id": message["organization_id"],
                "project_id": message["project_id"],
                "transaction_id": str(UUID(message["transaction_id"])),
                "profile_id": str(UUID(message["profile_id"])),
                "received": datetime.utcfromtimestamp(message["received"]),
                "profile": message["profile"],
                "android_api_level": message.get("android_api_level"),
                "device_classification": message["device_classification"],
Example #17
    def __init__(self) -> None:
        super().__init__(default_entity=EntityKey.DISCOVER)

    # XXX: This is temporary code that will eventually need to be ported to Sentry
    # since SnQL will require an entity to always be specified by the user.
    def select_entity(self, query: Query) -> EntityKey:
        selected_entity = match_query_to_entity(query, EVENTS_COLUMNS,
                                                TRANSACTIONS_COLUMNS)

        track_bad_query(query, selected_entity, EVENTS_COLUMNS,
                        TRANSACTIONS_COLUMNS)

        return selected_entity


metrics = MetricsWrapper(environment.metrics, "api.discover")
logger = logging.getLogger(__name__)

EVENT_CONDITION = FunctionCallMatch(
    Param("function", Or([StringMatch(op) for op in BINARY_OPERATORS])),
    (
        Or([ColumnMatch(None, StringMatch("type")),
            LiteralMatch(None)]),
        Param("event_type", Or([ColumnMatch(), LiteralMatch()])),
    ),
)

TRANSACTION_FUNCTIONS = FunctionCallMatch(
    Or([StringMatch("apdex"),
        StringMatch("failure_rate")]), None)
Example #18
from snuba.request.request_settings import RequestSettings
from snuba.state.cache.abstract import Cache, ExecutionTimeoutError
from snuba.state.cache.redis.backend import RESULT_VALUE, RESULT_WAIT, RedisCache
from snuba.state.rate_limit import (
    GLOBAL_RATE_LIMIT_NAME,
    PROJECT_RATE_LIMIT_NAME,
    RateLimitAggregator,
    RateLimitExceeded,
)
from snuba.util import force_bytes, with_span
from snuba.utils.codecs import Codec
from snuba.utils.metrics.timer import Timer
from snuba.utils.metrics.wrapper import MetricsWrapper
from snuba.web import QueryException, QueryResult

metrics = MetricsWrapper(environment.metrics, "db_query")


class ResultCacheCodec(Codec[bytes, Result]):
    def encode(self, value: Result) -> bytes:
        return cast(str, rapidjson.dumps(value)).encode("utf-8")

    def decode(self, value: bytes) -> Result:
        ret = rapidjson.loads(value)
        if not isinstance(ret, Mapping) or "meta" not in ret or "data" not in ret:
            raise ValueError("Invalid value type in result cache")
        return cast(Result, ret)


cache: Cache[Result] = RedisCache(redis_client, "snuba-query-cache:",
Example #19
    String as StringMatch,
)
from snuba.query.parser.conditions import parse_conditions_to_expr
from snuba.query.parser.exceptions import (
    AliasShadowingException,
    CyclicAliasException,
    ParsingException,
)
from snuba.query.parser.expressions import parse_aggregation, parse_expression
from snuba.query.parser.validation import validate_query
from snuba.util import is_function, to_list, tuplify
from snuba.utils.metrics.wrapper import MetricsWrapper

logger = logging.getLogger(__name__)

metrics = MetricsWrapper(environment.metrics, "parser")


def parse_query(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    """
    Parses the query body generating the AST. This only takes into
    account the initial query body. Extensions are parsed by extension
    processors and are supposed to update the AST.

    Parsing includes two phases. The first transforms the json body into
    a minimal query Object resolving expressions, conditions, etc.
    The second phase performs some query processing to provide a sane
    query to the dataset specific section.
    - It prevents alias shadowing.
    - It transforms columns from the tags[asd] form into
      SubscriptableReference.
Example #20
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:

    setup_logging(log_level)
    setup_sentry()
    logger.info("Consumer Starting")
    storage_key = StorageKey(storage_name)

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={"group": consumer_group, "storage": storage_key.value},
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    def stats_callback(stats_json: str) -> None:
        stats = rapidjson.loads(stats_json)
        metrics.gauge("librdkafka.total_queue_size", stats.get("replyq", 0))

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        kafka_params=KafkaParameters(
            raw_topic=raw_events_topic,
            replacements_topic=replacements_topic,
            bootstrap_servers=bootstrap_server,
            group_id=consumer_group,
            commit_log_topic=commit_log_topic,
            auto_offset_reset=auto_offset_reset,
            strict_offset_reset=not no_strict_offset_reset,
            queued_max_messages_kbytes=queued_max_messages_kbytes,
            queued_min_messages=queued_min_messages,
        ),
        processing_params=ProcessingParameters(
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
        ),
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        metrics=metrics,
        profile_path=profile_path,
        stats_callback=stats_callback,
        parallel_collect=parallel_collect,
        cooperative_rebalancing=cooperative_rebalancing,
    )

    consumer = consumer_builder.build_base_consumer()

    def handler(signum: int, frame: Any) -> None:
        consumer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    consumer.run()
Example #21
from snuba.clickhouse.processors import QueryProcessor
from snuba.clickhouse.query import Query
from snuba.clickhouse.query_dsl.accessors import (
    get_object_ids_in_query_ast,
    get_time_range,
)
from snuba.datasets.errors_replacer import ProjectsQueryFlags
from snuba.query.conditions import not_in_condition
from snuba.query.expressions import Column, FunctionCall, Literal
from snuba.query.query_settings import QuerySettings
from snuba.replacers.replacer_processor import ReplacerState
from snuba.state import get_config
from snuba.utils.metrics.wrapper import MetricsWrapper

logger = logging.getLogger(__name__)
metrics = MetricsWrapper(environment.metrics, "processors.replaced_groups")
FINAL_METRIC = "final"


class PostReplacementConsistencyEnforcer(QueryProcessor):
    """
    This processor tweaks the query to ensure that groups that have been manipulated
    by a replacer (like after a deletion) are excluded if they need to be.

    There is a period of time between the replacement executing its query and Clickhouse
    merging the rows to achieve consistency. During this period of time we either
    have to remove those rows manually or to run the query in FINAL mode.
    """

    def __init__(
        self, project_column: str, replacer_state_name: Optional[ReplacerState]
Example #22
    ReplacementMessage,
    ReplacementMessageMetadata,
    ReplacerProcessor,
    ReplacerState,
)
from snuba.state import get_config
from snuba.utils.metrics.wrapper import MetricsWrapper

"""
Disambiguate the dataset/storage when there are multiple tables representing errors
that perform event replacements.
In theory this will be needed only during the events to errors migration.
"""

logger = logging.getLogger(__name__)
metrics = MetricsWrapper(environment.metrics, "errors.replacer")


@dataclass(frozen=True)
class NeedsFinal:
    pass


@dataclass(frozen=True)
class ExcludeGroups:
    group_ids: Sequence[int]


QueryTimeFlags = Union[NeedsFinal, ExcludeGroups]
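
NeedsFinal and ExcludeGroups are small frozen dataclasses used as a tagged union: replacement handling either forces the query into FINAL mode or excludes a specific set of group ids. Assuming the definitions above, consumers of QueryTimeFlags typically dispatch with isinstance; the condition strings below are purely illustrative:

def describe_flags(flags: QueryTimeFlags) -> str:
    if isinstance(flags, NeedsFinal):
        # Query must run with FINAL so unmerged replacements are not visible.
        return "FINAL"
    if isinstance(flags, ExcludeGroups):
        # Cheaper alternative: filter out the replaced groups explicitly.
        group_list = ", ".join(str(group_id) for group_id in flags.group_ids)
        return f"group_id NOT IN ({group_list})"
    raise TypeError(f"unexpected flags: {flags!r}")


print(describe_flags(NeedsFinal()))              # FINAL
print(describe_flags(ExcludeGroups([1, 2, 3])))  # group_id NOT IN (1, 2, 3)
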

Example #23
    ProjectReferrerRateLimiter,
    ReferrerRateLimiterProcessor,
)
from snuba.query.processors.quota_processor import ResourceQuotaProcessor
from snuba.query.processors.timeseries_processor import (
    TimeSeriesProcessor,
    extract_granularity_from_query,
)
from snuba.query.query_settings import QuerySettings, SubscriptionQuerySettings
from snuba.query.validation.validators import (
    ColumnValidationMode,
    EntityRequiredColumnValidator,
)
from snuba.utils.metrics.wrapper import MetricsWrapper

metrics = MetricsWrapper(environment.metrics, "api.sessions")


def function_column(col_name: str, function_name: str) -> ColumnToFunction:
    return ColumnToFunction(
        None,
        col_name,
        function_name,
        (Column(None, None, col_name), ),
    )


def function_call(col_name: str, function_name: str) -> FunctionCall:
    return FunctionCall(None, function_name, (Column(None, None, col_name), ))
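
function_column and function_call are small helpers that build the AST nodes for "apply this function to this column" and are reused throughout the sessions entity definition. Assuming the definitions above together with the snuba.query.expressions imports they rely on, usage looks roughly like this:

# Roughly: duration -> sum(duration); the alias (first argument) is left as None
# by the helper, exactly as in the definitions above.
duration_sum = function_call("duration", "sum")
# duration_sum is a FunctionCall with no alias, function name "sum", and a single
# Column(None, None, "duration") parameter.

# function_column produces the ColumnToFunction mapping used by the entity's
# translation rules, pointing the logical column at the same kind of expression.
sessions_mapping = function_column("sessions", "countIf")
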

Example #24
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:

    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory,
    )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
Example #25
    def __init__(
        self,
        storage_key: StorageKey,
        raw_topic: Optional[str],
        replacements_topic: Optional[str],
        max_batch_size: int,
        max_batch_time_ms: int,
        bootstrap_servers: Sequence[str],
        group_id: str,
        commit_log_topic: Optional[str],
        auto_offset_reset: str,
        queued_max_messages_kbytes: int,
        queued_min_messages: int,
        processes: Optional[int],
        input_block_size: Optional[int],
        output_block_size: Optional[int],
        commit_retry_policy: Optional[RetryPolicy] = None,
        profile_path: Optional[str] = None,
    ) -> None:
        self.storage = get_writable_storage(storage_key)
        self.bootstrap_servers = bootstrap_servers
        self.broker_config = get_default_kafka_configuration(
            storage_key, bootstrap_servers=bootstrap_servers
        )
        self.producer_broker_config = build_kafka_producer_configuration(
            storage_key,
            bootstrap_servers=bootstrap_servers,
            override_params={
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            },
        )

        stream_loader = self.storage.get_table_writer().get_stream_loader()

        self.raw_topic: Topic
        if raw_topic is not None:
            self.raw_topic = Topic(raw_topic)
        else:
            self.raw_topic = Topic(stream_loader.get_default_topic_spec().topic_name)

        self.replacements_topic: Optional[Topic]
        if replacements_topic is not None:
            self.replacements_topic = Topic(replacements_topic)
        else:
            replacement_topic_spec = stream_loader.get_replacement_topic_spec()
            if replacement_topic_spec is not None:
                self.replacements_topic = Topic(replacement_topic_spec.topic_name)
            else:
                self.replacements_topic = None

        self.commit_log_topic: Optional[Topic]
        if commit_log_topic is not None:
            self.commit_log_topic = Topic(commit_log_topic)
        else:
            commit_log_topic_spec = stream_loader.get_commit_log_topic_spec()
            if commit_log_topic_spec is not None:
                self.commit_log_topic = Topic(commit_log_topic_spec.topic_name)
            else:
                self.commit_log_topic = None

        # XXX: This can result in a producer being built in cases where it's
        # not actually required.
        self.producer = Producer(self.producer_broker_config)

        self.metrics = MetricsWrapper(
            environment.metrics,
            "consumer",
            tags={"group": group_id, "storage": storage_key.value},
        )

        self.max_batch_size = max_batch_size
        self.max_batch_time_ms = max_batch_time_ms
        self.group_id = group_id
        self.auto_offset_reset = auto_offset_reset
        self.queued_max_messages_kbytes = queued_max_messages_kbytes
        self.queued_min_messages = queued_min_messages
        self.processes = processes
        self.input_block_size = input_block_size
        self.output_block_size = output_block_size
        self.__profile_path = profile_path

        if commit_retry_policy is None:
            commit_retry_policy = BasicRetryPolicy(
                3,
                constant_delay(1),
                lambda e: isinstance(e, KafkaException)
                and e.args[0].code()
                in (
                    KafkaError.REQUEST_TIMED_OUT,
                    KafkaError.NOT_COORDINATOR,
                    KafkaError._WAIT_COORD,
                ),
            )

        self.__commit_retry_policy = commit_retry_policy
Example #26
def subscriptions_scheduler_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    followed_consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    schedule_ttl: int,
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
    log_level: Optional[str],
    # TODO: Temporarily overrides the scheduling mode.
    # Required for single tenant since some partitions may be empty.
    # To be removed once transactions is no longer semantically partitioned.
    scheduling_mode: Optional[str],
) -> None:
    """
    Combined subscriptions scheduler and executor. Alternative to the separate scheduler and executor processes.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.scheduler_executor",
        tags={"dataset": dataset_name},
    )

    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the result topic configuration from the first entity. Later we
    # check they all have the same result topic anyway before building the consumer.
    entity_key = EntityKey(entity_names[0])

    storage = get_entity(entity_key).get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()
    result_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert result_topic_spec is not None

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            result_topic_spec.topic,
            override_params={"partitioner": "consistent"},
        )
    )

    processor = build_scheduler_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        followed_consumer_group,
        producer,
        auto_offset_reset,
        not no_strict_offset_reset,
        schedule_ttl,
        delay_seconds,
        stale_threshold_seconds,
        max_concurrent_queries,
        total_concurrent_queries,
        metrics,
        SchedulingWatermarkMode(scheduling_mode)
        if scheduling_mode is not None
        else None,
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    with closing(producer), flush_querylog():
        processor.run()
Example #27
    mapping_pattern,
)
from snuba.query.conditions import (
    BooleanFunctions,
    get_first_level_and_conditions,
    get_first_level_or_conditions,
)
from snuba.query.expressions import Column, Expression
from snuba.query.expressions import FunctionCall as FunctionExpr
from snuba.query.expressions import Literal as LiteralExpr
from snuba.query.matchers import Any, FunctionCall, Literal, Or, Param, String
from snuba.request.request_settings import RequestSettings
from snuba.state import get_config
from snuba.utils.metrics.wrapper import MetricsWrapper

metrics = MetricsWrapper(environment.metrics, "processors.tags_hash_map")

ESCAPE_TRANSLATION = str.maketrans({"\\": "\\\\", "=": "\\="})


class ConditionClass(Enum):
    IRRELEVANT = 1
    OPTIMIZABLE = 2
    NOT_OPTIMIZABLE = 3


class MappingOptimizer(QueryProcessor):
    """
    Optimize tags conditions by relying on the tags_hash_map column.
    Such column is an array of hashes of `key=value` strings.
    This processor transforms tags conditions that are in the form of
Example #28
from snuba import environment
from snuba.datasets.events_format import enforce_retention, extract_extra_tags
from snuba.processor import (
    InsertBatch,
    MessageProcessor,
    ProcessedMessage,
    _as_dict_safe,
    _ensure_valid_date,
    _unicodify,
)
from snuba.utils.metrics.wrapper import MetricsWrapper

UNKNOWN_SPAN_STATUS = 2

metrics = MetricsWrapper(environment.metrics, "spans.processor")


class SpansMessageProcessor(MessageProcessor):
    def __extract_timestamp(self, field: float) -> Tuple[datetime, int]:
        timestamp = _ensure_valid_date(datetime.fromtimestamp(field))
        if timestamp is None:
            timestamp = datetime.utcnow()
        nanoseconds = int(timestamp.microsecond * 1000)
        return (timestamp, nanoseconds)

    def __init_span(self, event: Mapping[str, Any]) -> MutableMapping[str, Any]:
        """
        Initializes the fields that are the same for all spans within a transaction.
        """
Example #29
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
) -> None:

    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper())
                    for name in storage_names)
    }

    topics = {
        storage.get_table_writer().get_stream_loader().get_default_topic_spec().topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_topics = {
        spec.topic_name
        for spec in (
            storage.get_table_writer().get_stream_loader().get_commit_log_topic_spec()
            for storage in storages.values()
        )
        if spec is not None
    }

    commit_log_topic: Optional[Topic]
    if commit_log_topics:
        commit_log_topic = Topic(commit_log_topics.pop())
    else:
        commit_log_topic = None

    if commit_log_topics:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (
        storages[storage_keys[0]]
        .get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    for storage_key in storage_keys[1:]:
        other_configuration = build_kafka_consumer_configuration(
            storages[storage_key]
            .get_table_writer()
            .get_stream_loader()
            .get_default_topic_spec()
            .topic,
            consumer_group,
        )
        if (
            other_configuration["bootstrap.servers"]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError("storages cannot be located on different Kafka clusters")

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located on
        # the same Kafka cluster (validated above).

        commit_log_topic_spec = (
            storages[storage_keys[0]]
            .get_table_writer()
            .get_stream_loader()
            .get_commit_log_topic_spec()
        )
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic))
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    processor.run()
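
Both multistorage examples enforce the single-topic constraint the same way: collect every storage's default topic name into a set, pop one element, and fail if anything is left over. The same pattern in isolation, with a hypothetical storage tuple standing in for the real table writers:

from typing import List, NamedTuple


class FakeStorage(NamedTuple):
    name: str
    default_topic_name: str


storages: List[FakeStorage] = [
    FakeStorage("errors", "events"),
    FakeStorage("transactions", "events"),
]

topic_names = {storage.default_topic_name for storage in storages}

# A single shared topic is required because StreamProcessor consumes one topic.
topic_name = topic_names.pop()
if topic_names:
    raise ValueError("only one topic is supported")

print(f"consuming from {topic_name!r} for {len(storages)} storages")
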
Example #30
)

import simplejson as json
from confluent_kafka import KafkaError
from confluent_kafka import Message as KafkaMessage
from confluent_kafka import Producer

from snuba import environment, settings
from snuba.redis import redis_client
from snuba.utils.metrics.wrapper import MetricsWrapper
from snuba.utils.streams.configuration_builder import (
    build_default_kafka_producer_configuration,
)
from snuba.utils.streams.topics import Topic

metrics = MetricsWrapper(environment.metrics, "snuba.state")
logger = logging.getLogger("snuba.state")

kfk = None
rds = redis_client

ratelimit_prefix = "snuba-ratelimit:"
query_lock_prefix = "snuba-query-lock:"
config_hash = "snuba-config"
config_history_hash = "snuba-config-history"
config_changes_list = "snuba-config-changes"
config_changes_list_limit = 25
queries_list = "snuba-queries"

# Rate Limiting and Deduplication