def replacer(replacements_topic, consumer_group, bootstrap_server, clickhouse_server,
             distributed_table_name, max_batch_size, max_batch_time_ms, auto_offset_reset,
             queued_max_messages_kbytes, queued_min_messages, log_level,
             dogstatsd_host, dogstatsd_port):
    import sentry_sdk
    from snuba import util
    from snuba.clickhouse import ClickhousePool
    from batching_kafka_consumer import BatchingKafkaConsumer
    from snuba.replacer import ReplacerWorker

    sentry_sdk.init(dsn=settings.SENTRY_DSN)
    logging.basicConfig(level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s')

    metrics = util.create_metrics(
        dogstatsd_host, dogstatsd_port, 'snuba.replacer', tags=["group:%s" % consumer_group]
    )

    client_settings = {
        # Replacing existing rows requires reconstructing the entire tuple for each
        # event (via a SELECT), which is a Hard Thing (TM) for columnstores to do. With
        # the default settings it's common for ClickHouse to go over the default max_memory_usage
        # of 10GB per query. Lowering the max_block_size reduces memory usage, and increasing the
        # max_memory_usage gives the query more breathing room.
        'max_block_size': settings.REPLACER_MAX_BLOCK_SIZE,
        'max_memory_usage': settings.REPLACER_MAX_MEMORY_USAGE,
        # Don't use up production cache for the count() queries.
        'use_uncompressed_cache': 0,
    }

    clickhouse = ClickhousePool(
        host=clickhouse_server.split(':')[0],
        port=int(clickhouse_server.split(':')[1]),
        client_settings=client_settings,
    )

    replacer = BatchingKafkaConsumer(
        replacements_topic,
        worker=ReplacerWorker(clickhouse, distributed_table_name, metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        producer=None,
        commit_log_topic=None,
        auto_offset_reset=auto_offset_reset,
    )

    def handler(signum, frame):
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)

    replacer.run()
def consumer(raw_events_topic, replacements_topic, commit_log_topic, consumer_group,
             bootstrap_server, clickhouse_server, distributed_table_name, max_batch_size,
             max_batch_time_ms, auto_offset_reset, queued_max_messages_kbytes,
             queued_min_messages, log_level, dogstatsd_host, dogstatsd_port):
    import sentry_sdk
    from snuba import util
    from snuba.clickhouse import ClickhousePool
    from batching_kafka_consumer import BatchingKafkaConsumer
    from snuba.consumer import ConsumerWorker

    sentry_sdk.init(dsn=settings.SENTRY_DSN)
    logging.basicConfig(level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s')

    metrics = util.create_metrics(
        dogstatsd_host, dogstatsd_port, 'snuba.consumer', tags=["group:%s" % consumer_group]
    )

    clickhouse = ClickhousePool(
        host=clickhouse_server.split(':')[0],
        port=int(clickhouse_server.split(':')[1]),
        client_settings={
            'load_balancing': 'in_order',
            'insert_distributed_sync': True,
        },
        metrics=metrics,
    )

    producer = Producer({
        'bootstrap.servers': ','.join(bootstrap_server),
        'partitioner': 'consistent',
        'message.max.bytes': 50000000,  # 50MB, default is 1MB
    })

    consumer = BatchingKafkaConsumer(
        raw_events_topic,
        worker=ConsumerWorker(
            clickhouse, distributed_table_name,
            producer=producer, replacements_topic=replacements_topic, metrics=metrics,
        ),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        producer=producer,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
    )

    def handler(signum, frame):
        consumer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)

    consumer.run()
def __init__(self, dataset_name: str, raw_topic: str, replacements_topic: str,
             max_batch_size: int, max_batch_time_ms: int,
             bootstrap_servers: Sequence[str], group_id: str, commit_log_topic: str,
             auto_offset_reset: str, queued_max_messages_kbytes: int,
             queued_min_messages: int, dogstatsd_host: str, dogstatsd_port: int) -> None:
    self.dataset = get_dataset(dataset_name)
    self.dataset_name = dataset_name
    if not bootstrap_servers:
        self.bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name,
            settings.DEFAULT_BROKERS,
        )
    else:
        self.bootstrap_servers = bootstrap_servers

    stream_loader = enforce_table_writer(self.dataset).get_stream_loader()

    self.raw_topic = raw_topic or stream_loader.get_default_topic_spec().topic_name

    default_replacement_topic_name = stream_loader.get_replacement_topic_spec().topic_name \
        if stream_loader.get_replacement_topic_spec() \
        else None
    self.replacements_topic = replacements_topic or default_replacement_topic_name

    default_commit_log_topic_name = stream_loader.get_commit_log_topic_spec().topic_name \
        if stream_loader.get_commit_log_topic_spec() \
        else None
    self.commit_log_topic = commit_log_topic or default_commit_log_topic_name

    # XXX: This can result in a producer being built in cases where it's
    # not actually required.
    self.producer = Producer({
        'bootstrap.servers': ','.join(self.bootstrap_servers),
        'partitioner': 'consistent',
        'message.max.bytes': 50000000,  # 50MB, default is 1MB
    })

    self.metrics = util.create_metrics(
        dogstatsd_host, dogstatsd_port, 'snuba.consumer', tags={
            "group": group_id,
            "dataset": self.dataset_name,
        },
    )

    self.max_batch_size = max_batch_size
    self.max_batch_time_ms = max_batch_time_ms
    self.group_id = group_id
    self.auto_offset_reset = auto_offset_reset
    self.queued_max_messages_kbytes = queued_max_messages_kbytes
    self.queued_min_messages = queued_min_messages
def __init__(self, dataset_name: str, raw_topic: str, replacements_topic: str,
             max_batch_size: int, max_batch_time_ms: int,
             bootstrap_servers: Sequence[str], group_id: str, commit_log_topic: str,
             auto_offset_reset: str, queued_max_messages_kbytes: int,
             queued_min_messages: int, dogstatsd_host: str, dogstatsd_port: int) -> None:
    self.dataset = get_dataset(dataset_name)
    self.dataset_name = dataset_name
    if not bootstrap_servers:
        self.bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name,
            settings.DEFAULT_BROKERS,
        )
    else:
        self.bootstrap_servers = bootstrap_servers

    self.raw_topic = raw_topic or self.dataset.get_default_topic()
    self.replacements_topic = replacements_topic or self.dataset.get_default_replacement_topic()
    self.commit_log_topic = commit_log_topic or self.dataset.get_default_commit_log_topic()

    self.producer = Producer({
        'bootstrap.servers': ','.join(self.bootstrap_servers),
        'partitioner': 'consistent',
        'message.max.bytes': 50000000,  # 50MB, default is 1MB
    })

    self.metrics = util.create_metrics(
        dogstatsd_host, dogstatsd_port, 'snuba.consumer', tags=[
            "group:%s" % group_id,
            "dataset:%s" % self.dataset_name,
        ],
    )

    self.max_batch_size = max_batch_size
    self.max_batch_time_ms = max_batch_time_ms
    self.group_id = group_id
    self.auto_offset_reset = auto_offset_reset
    self.queued_max_messages_kbytes = queued_max_messages_kbytes
    self.queued_min_messages = queued_min_messages
from snuba.redis import redis_client
from snuba.request import Request
from snuba.state.cache import Cache, RedisCache
from snuba.state.rate_limit import (
    PROJECT_RATE_LIMIT_NAME,
    RateLimitAggregator,
    RateLimitExceeded,
)
from snuba.util import create_metrics, force_bytes
from snuba.utils.codecs import JSONCodec
from snuba.utils.metrics.timer import Timer
from snuba.web.split import split_query

logger = logging.getLogger("snuba.query")

metrics = create_metrics("snuba.api")

ClickhouseQueryResult = MutableMapping[str, MutableMapping[str, Any]]


class RawQueryException(Exception):
    def __init__(
        self, err_type: str, message: str, stats: Mapping[str, Any], sql: str, **meta
    ):
        self.err_type = err_type
        self.message = message
        self.stats = stats
        self.sql = sql
        self.meta = meta
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
                PassthroughCodec(),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
                CommitCodec(),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        )
    )

    producer = KafkaProducer(
        {
            "bootstrap.servers": ",".join(bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
        SubscriptionResultCodec(),
    )

    with closing(consumer), closing(producer):
        batching_consumer = BatchingConsumer(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            SubscriptionWorker(
                SubscriptionExecutor(
                    dataset,
                    ThreadPoolExecutor(
                        max_workers=settings.SUBSCRIPTIONS_MAX_CONCURRENT_QUERIES
                    ),
                ),
                {
                    index: SubscriptionScheduler(
                        RedisSubscriptionDataStore(
                            redis_client, dataset, PartitionId(index)
                        ),
                        PartitionId(index),
                        cache_ttl=timedelta(seconds=schedule_ttl),
                    )
                    for index in range(
                        partitions
                        if partitions is not None
                        else loader.get_default_topic_spec().partitions_number
                    )
                },
                producer,
                Topic(result_topic),
            ),
            max_batch_size,
            max_batch_time_ms,
            create_metrics(
                "snuba.subscriptions",
                tags={"group": consumer_group, "dataset": dataset_name},
            ),
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    from snuba import util
    from snuba.clickhouse.native import ClickhousePool
    from snuba.replacer import ReplacerWorker
    from snuba.utils.codecs import PassthroughCodec
    from snuba.utils.streams.batching import BatchingConsumer
    from snuba.utils.streams.kafka import (
        KafkaConsumer,
        KafkaPayload,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.types import Topic

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)
    stream_loader = enforce_table_writer(dataset).get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Dataset {dataset} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = util.create_metrics("snuba.replacer", tags={"group": consumer_group})

    client_settings = {
        # Replacing existing rows requires reconstructing the entire tuple for each
        # event (via a SELECT), which is a Hard Thing (TM) for columnstores to do. With
        # the default settings it's common for ClickHouse to go over the default max_memory_usage
        # of 10GB per query. Lowering the max_block_size reduces memory usage, and increasing the
        # max_memory_usage gives the query more breathing room.
        "max_block_size": settings.REPLACER_MAX_BLOCK_SIZE,
        "max_memory_usage": settings.REPLACER_MAX_MEMORY_USAGE,
        # Don't use up production cache for the count() queries.
        "use_uncompressed_cache": 0,
    }

    clickhouse = ClickhousePool(
        settings.CLICKHOUSE_HOST,
        settings.CLICKHOUSE_PORT,
        client_settings=client_settings,
    )

    codec: PassthroughCodec[KafkaPayload] = PassthroughCodec()
    replacer = BatchingConsumer(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
            codec=codec,
        ),
        Topic(replacements_topic),
        worker=ReplacerWorker(clickhouse, dataset, metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum, frame) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
    logging.basicConfig(
        level=getattr(logging, level.upper()),
        format=settings.LOG_FORMAT,
    )


def traces_sampler(sampling_context: Mapping[str, Any]) -> Any:
    return sampling_context["parent_sampled"] or False


def setup_sentry() -> None:
    sentry_sdk.init(
        dsn=settings.SENTRY_DSN,
        integrations=[
            FlaskIntegration(),
            GnuBacktraceIntegration(),
            LoggingIntegration(event_level=logging.WARNING),
            RedisIntegration(),
        ],
        release=os.getenv("SNUBA_RELEASE"),
        traces_sampler=traces_sampler,
    )


metrics = create_metrics(
    "snuba",
    tags=None,
    sample_rates=settings.DOGSTATSD_SAMPLING_RATES,
)
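# A minimal, self-contained sketch (not part of the module above) of how a
# parent-based traces_sampler like the one shown behaves: a transaction is kept
# only when the upstream caller already sampled its trace; otherwise it falls
# back to False.
from typing import Any, Mapping


def parent_based_sampler(sampling_context: Mapping[str, Any]) -> Any:
    # Same logic as traces_sampler above: inherit the parent's decision.
    return sampling_context["parent_sampled"] or False


assert parent_based_sampler({"parent_sampled": True}) is True
assert parent_based_sampler({"parent_sampled": False}) is False
assert parent_based_sampler({"parent_sampled": None}) is False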
    extract_user,
    flatten_nested_field,
)
from snuba.processor import (
    _as_dict_safe,
    MessageProcessor,
    ProcessorAction,
    ProcessedMessage,
    _ensure_valid_date,
    _ensure_valid_ip,
    _unicodify,
)
from snuba.util import create_metrics

metrics = create_metrics("snuba.transactions.processor")

UNKNOWN_SPAN_STATUS = 2


class TransactionsMessageProcessor(MessageProcessor):
    PROMOTED_TAGS = {
        "environment",
        "sentry:release",
        "sentry:user",
        "sentry:dist",
    }

    def __extract_timestamp(self, field):
        timestamp = _ensure_valid_date(datetime.fromtimestamp(field))
        if timestamp is None:
def setup_logging(level: Optional[str] = None) -> None:
    if level is None:
        level = settings.LOG_LEVEL

    logging.basicConfig(
        level=getattr(logging, level.upper()),
        format=settings.LOG_FORMAT,
    )


def setup_sentry() -> None:
    sentry_sdk.init(
        dsn=settings.SENTRY_DSN,
        integrations=[FlaskIntegration(), GnuBacktraceIntegration()],
        release=os.getenv("SNUBA_RELEASE"),
    )


clickhouse_rw = ClickhousePool(settings.CLICKHOUSE_HOST, settings.CLICKHOUSE_PORT)
clickhouse_ro = ClickhousePool(
    settings.CLICKHOUSE_HOST,
    settings.CLICKHOUSE_PORT,
    client_settings={"readonly": True},
)

metrics = create_metrics("snuba")

reader: Reader[ClickhouseQuery] = NativeDriverReader(clickhouse_ro)
from typing import Any, MutableMapping, NamedTuple

from snuba import settings, state
from snuba.clickhouse.native import ClickhousePool
from snuba.clickhouse.query import ClickhouseQuery
from snuba.query.columns import all_referenced_columns
from snuba.request import Request
from snuba.state.rate_limit import RateLimitAggregator, RateLimitExceeded, PROJECT_RATE_LIMIT_NAME
from snuba.util import (
    create_metrics,
    force_bytes,
    Timer,
)

logger = logging.getLogger('snuba.query')
metrics = create_metrics(settings.DOGSTATSD_HOST, settings.DOGSTATSD_PORT, 'snuba.api')


class QueryResult(NamedTuple):
    # TODO: Give a better abstraction to QueryResult
    result: MutableMapping[str, MutableMapping[str, Any]]
    status: int


def raw_query(
    request: Request,
    query: ClickhouseQuery,
    client: ClickhousePool,
    timer: Timer,
    stats=None,
) -> QueryResult:
def __init__(
    self,
    dataset_name: str,
    raw_topic: Optional[str],
    replacements_topic: Optional[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    bootstrap_servers: Sequence[str],
    group_id: str,
    commit_log_topic: Optional[str],
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    rapidjson_deserialize: bool,
    rapidjson_serialize: bool,
    commit_retry_policy: Optional[RetryPolicy] = None,
) -> None:
    self.dataset = get_dataset(dataset_name)
    self.dataset_name = dataset_name
    if not bootstrap_servers:
        self.bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name,
            settings.DEFAULT_BROKERS,
        )
    else:
        self.bootstrap_servers = bootstrap_servers

    stream_loader = enforce_table_writer(self.dataset).get_stream_loader()

    self.raw_topic: Topic
    if raw_topic is not None:
        self.raw_topic = Topic(raw_topic)
    else:
        self.raw_topic = Topic(stream_loader.get_default_topic_spec().topic_name)

    self.replacements_topic: Optional[Topic]
    if replacements_topic is not None:
        self.replacements_topic = Topic(replacements_topic)
    else:
        replacement_topic_spec = stream_loader.get_replacement_topic_spec()
        if replacement_topic_spec is not None:
            self.replacements_topic = Topic(replacement_topic_spec.topic_name)
        else:
            self.replacements_topic = None

    self.commit_log_topic: Optional[Topic]
    if commit_log_topic is not None:
        self.commit_log_topic = Topic(commit_log_topic)
    else:
        commit_log_topic_spec = stream_loader.get_commit_log_topic_spec()
        if commit_log_topic_spec is not None:
            self.commit_log_topic = Topic(commit_log_topic_spec.topic_name)
        else:
            self.commit_log_topic = None

    # XXX: This can result in a producer being built in cases where it's
    # not actually required.
    self.producer = Producer(
        {
            "bootstrap.servers": ",".join(self.bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        }
    )

    self.metrics = util.create_metrics(
        "snuba.consumer",
        tags={"group": group_id, "dataset": self.dataset_name},
    )

    self.max_batch_size = max_batch_size
    self.max_batch_time_ms = max_batch_time_ms
    self.group_id = group_id
    self.auto_offset_reset = auto_offset_reset
    self.queued_max_messages_kbytes = queued_max_messages_kbytes
    self.queued_min_messages = queued_min_messages

    if commit_retry_policy is None:
        commit_retry_policy = BasicRetryPolicy(
            3,
            constant_delay(1),
            lambda e: isinstance(e, KafkaException)
            and e.args[0].code()
            in (
                KafkaError.REQUEST_TIMED_OUT,
                KafkaError.NOT_COORDINATOR_FOR_GROUP,
                KafkaError._WAIT_COORD,
            ),
        )

    self.__commit_retry_policy = commit_retry_policy

    self.__rapidjson_deserialize = rapidjson_deserialize
    self.__rapidjson_serialize = rapidjson_serialize
def replacer(*, replacements_topic, consumer_group, bootstrap_server, clickhouse_host,
             clickhouse_port, dataset, max_batch_size, max_batch_time_ms,
             auto_offset_reset, queued_max_messages_kbytes, queued_min_messages,
             log_level, dogstatsd_host, dogstatsd_port):
    import sentry_sdk
    from snuba import util
    from snuba.clickhouse.native import ClickhousePool
    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams.batching import BatchingConsumer
    from snuba.utils.streams.kafka import KafkaConsumer, TransportError, build_kafka_consumer_configuration

    sentry_sdk.init(dsn=settings.SENTRY_DSN)
    dataset = get_dataset(dataset)
    logging.basicConfig(level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s')

    stream_loader = enforce_table_writer(dataset).get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert default_replacement_topic_spec is not None, f"Dataset {dataset} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = util.create_metrics(
        dogstatsd_host, dogstatsd_port, 'snuba.replacer', tags={"group": consumer_group}
    )

    client_settings = {
        # Replacing existing rows requires reconstructing the entire tuple for each
        # event (via a SELECT), which is a Hard Thing (TM) for columnstores to do. With
        # the default settings it's common for ClickHouse to go over the default max_memory_usage
        # of 10GB per query. Lowering the max_block_size reduces memory usage, and increasing the
        # max_memory_usage gives the query more breathing room.
        'max_block_size': settings.REPLACER_MAX_BLOCK_SIZE,
        'max_memory_usage': settings.REPLACER_MAX_MEMORY_USAGE,
        # Don't use up production cache for the count() queries.
        'use_uncompressed_cache': 0,
    }

    clickhouse = ClickhousePool(
        host=clickhouse_host,
        port=clickhouse_port,
        client_settings=client_settings,
    )

    replacer = BatchingConsumer(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        replacements_topic,
        worker=ReplacerWorker(clickhouse, dataset, metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum, frame):
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
import uuid

from snuba import settings
from snuba.processor import (
    _as_dict_safe,
    MessageProcessor,
    ProcessorAction,
    ProcessedMessage,
    _ensure_valid_date,
    _ensure_valid_ip,
    _unicodify,
)
from snuba.datasets.events_processor import (
    enforce_retention,
    extract_base,
    extract_extra_contexts,
    extract_extra_tags,
    extract_user,
)
from snuba.util import create_metrics

metrics = create_metrics(settings.DOGSTATSD_HOST, settings.DOGSTATSD_PORT, 'snuba.transactions.processor')


class TransactionsMessageProcessor(MessageProcessor):
    PROMOTED_TAGS = {
        "environment",
        "sentry:release",
        "sentry:user",
        "sentry:dist",
    }

    def __extract_timestamp(self, field):
        timestamp = _ensure_valid_date(datetime.fromtimestamp(field))
        if timestamp is None:
            timestamp = datetime.utcnow()
        milliseconds = int(timestamp.microsecond / 1000)
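# A minimal, self-contained illustration (using a hypothetical epoch value) of
# the millisecond handling in __extract_timestamp above: the float epoch seconds
# become a datetime, and the sub-second remainder is kept as whole milliseconds.
from datetime import datetime

event_ts = datetime.fromtimestamp(1571127722.5)
milliseconds = int(event_ts.microsecond / 1000)
assert milliseconds == 500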