def generate_transactions() -> None:
    """Generate and write a small batch of synthetic transaction rows.

    Pulls five raw transaction payloads, strips the ``measurements`` section
    (which older versions of the table did not have), runs each through the
    storage's stream processor, and writes the resulting rows in one batch.
    """
    from datetime import datetime

    table_writer = get_writable_storage(StorageKey.TRANSACTIONS).get_table_writer()

    rows = []
    for _ in range(5):
        raw_transaction = get_raw_transaction()
        # Older versions of this table did not have measurements.
        # Use pop() with a default instead of `del` so a payload that
        # already lacks the key does not raise KeyError.
        raw_transaction["data"].pop("measurements", None)
        processed = (
            table_writer.get_stream_loader()
            .get_processor()
            .process_message(
                (2, "insert", raw_transaction),
                KafkaMessageMetadata(0, 0, datetime.utcnow()),
            )
        )
        rows.extend(processed.rows)

    BatchWriterEncoderWrapper(
        table_writer.get_batch_writer(metrics=DummyMetricsBackend(strict=True)),
        JSONRowEncoder(),
    ).write(rows)
def write(*, dataset: Dataset) -> RespTuple:
    """Ingest a JSON array of raw messages from the HTTP request body.

    Each message is processed through the dataset's stream processor;
    resulting insert rows are written as a single batch. Returns a plain
    "ok" response tuple on success.
    """
    from snuba.processor import InsertBatch

    batch: MutableSequence[WriterTableRow] = []
    # Synthesize monotonically increasing offsets from the current time so
    # each message in this request gets a distinct offset.
    base_offset = int(round(time.time() * 1000))

    for idx, payload in enumerate(json.loads(http_request.data)):
        metadata = KafkaMessageMetadata(
            offset=base_offset + idx, partition=0, timestamp=datetime.utcnow()
        )
        processor = (
            enforce_table_writer(dataset).get_stream_loader().get_processor()
        )
        result = processor.process_message(payload, metadata)
        if result:
            assert isinstance(result, InsertBatch)
            batch.extend(result.rows)

    writer = BatchWriterEncoderWrapper(
        enforce_table_writer(dataset).get_batch_writer(metrics),
        JSONRowEncoder(),
    )
    writer.write(batch)
    return ("ok", 200, {"Content-Type": "text/plain"})
def write_processed_messages(
    storage: WritableStorage, messages: Sequence[ProcessedMessage]
) -> None:
    """Flush a sequence of already-processed insert batches into storage.

    Every message must be an InsertBatch; their rows are concatenated and
    written in a single batch through the storage's table writer.
    """
    all_rows: MutableSequence[WriterTableRow] = []
    for insert_batch in messages:
        assert isinstance(insert_batch, InsertBatch)
        all_rows.extend(insert_batch.rows)

    batch_writer = storage.get_table_writer().get_batch_writer(
        metrics=DummyMetricsBackend(strict=True)
    )
    BatchWriterEncoderWrapper(batch_writer, JSONRowEncoder()).write(all_rows)
def __init__(
    self,
    storage: WritableTableStorage,
    metrics: MetricsBackend,
    producer: Optional[ConfluentKafkaProducer] = None,
    replacements_topic: Optional[Topic] = None,
) -> None:
    """Set up the storage-backed batch writer and replacement plumbing.

    ``producer`` and ``replacements_topic`` are optional; they are only
    needed when replacement batches must be forwarded to Kafka.
    """
    self.__storage = storage
    self.producer = producer
    self.replacements_topic = replacements_topic
    self.metrics = metrics

    table_writer = storage.get_table_writer()
    # Writer options passed through to the underlying batch writer.
    writer_options = {"load_balancing": "in_order", "insert_distributed_sync": 1}
    self.__writer = BatchWriterEncoderWrapper(
        table_writer.get_batch_writer(metrics, writer_options),
        JSONRowEncoder(),
    )

    # Declared but intentionally unassigned: resolved lazily elsewhere.
    self.__processor: MessageProcessor
    self.__pre_filter = table_writer.get_stream_loader().get_pre_filter()
class ConsumerWorker(AbstractBatchWorker[KafkaPayload, ProcessedMessage]):
    """Batch worker that decodes Kafka payloads, writes insert batches to
    storage, and forwards replacement batches (deletions, merges, unmerges)
    to a replacements topic."""

    def __init__(
        self,
        storage: WritableTableStorage,
        metrics: MetricsBackend,
        producer: Optional[ConfluentKafkaProducer] = None,
        replacements_topic: Optional[Topic] = None,
    ) -> None:
        """``producer``/``replacements_topic`` are only required when the
        worker must forward replacement batches; see flush_batch()."""
        self.__storage = storage
        self.producer = producer
        self.replacements_topic = replacements_topic
        self.metrics = metrics
        table_writer = storage.get_table_writer()
        self.__writer = BatchWriterEncoderWrapper(
            table_writer.get_batch_writer(
                metrics,
                {"load_balancing": "in_order", "insert_distributed_sync": 1},
            ),
            JSONRowEncoder(),
        )
        # Declared but not assigned: resolved lazily in _get_processor().
        self.__processor: MessageProcessor
        self.__pre_filter = table_writer.get_stream_loader().get_pre_filter()

    def _get_processor(self) -> MessageProcessor:
        # Lazily resolve and memoize the storage's message processor
        # (EAFP: the attribute does not exist until first access).
        try:
            return self.__processor
        except AttributeError:
            self.__processor = (
                self.__storage.get_table_writer()
                .get_stream_loader()
                .get_processor()
            )
            return self.__processor

    def process_message(
        self, message: Message[KafkaPayload]
    ) -> Optional[ProcessedMessage]:
        """Decode and process one Kafka message; returns None when the
        pre-filter drops it."""
        # Run the cheap pre-filter before paying for JSON decoding.
        if self.__pre_filter and self.__pre_filter.should_drop(message):
            return None

        return self._get_processor().process_message(
            rapidjson.loads(message.payload.value),
            KafkaMessageMetadata(
                offset=message.offset,
                partition=message.partition.index,
                timestamp=message.timestamp,
            ),
        )

    def delivery_callback(self, error, message):
        if error is not None:
            # errors are KafkaError objects and inherit from BaseException
            raise error

    def flush_batch(self, batch: Sequence[ProcessedMessage]):
        """First write out all new INSERTs as a single batch, then reproduce any
        event replacements such as deletions, merges and unmerges."""
        inserts: MutableSequence[WriterTableRow] = []
        replacements: MutableSequence[ReplacementBatch] = []

        for item in batch:
            if isinstance(item, InsertBatch):
                inserts.extend(item.rows)
            elif isinstance(item, ReplacementBatch):
                replacements.append(item)
            else:
                raise TypeError(f"unexpected type: {type(item)!r}")

        if inserts:
            self.__writer.write(inserts)
            self.metrics.timing("inserts", len(inserts))

        if replacements:
            # Both are Optional constructor arguments: fail with a clear
            # message rather than an opaque AttributeError when the worker
            # was built without replacement plumbing.
            assert (
                self.producer is not None and self.replacements_topic is not None
            ), "producer and replacements_topic are required to flush replacements"
            for replacement in replacements:
                key = replacement.key.encode("utf-8")
                for value in replacement.values:
                    self.producer.produce(
                        self.replacements_topic.name,
                        key=key,
                        value=rapidjson.dumps(value).encode("utf-8"),
                        on_delivery=self.delivery_callback,
                    )
            self.producer.flush()
def generate_transactions(count: int) -> None:
    """
    Generate a deterministic set of events across a time range.

    Builds ``count`` synthetic transaction payloads, one per minute ending
    near the top of the current hour, processes each through the storage's
    stream processor, and writes all resulting rows as one batch.
    """
    import calendar
    import pytz
    import uuid
    from datetime import datetime, timedelta

    table_writer = get_writable_storage(StorageKey.TRANSACTIONS).get_table_writer()

    rows = []

    # Anchor at the top of the current hour, then back off `count` minutes so
    # event `tick` lands at base_time + tick minutes.
    base_time = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0, tzinfo=pytz.utc
    ) - timedelta(minutes=count)
    for tick in range(count):
        # Fixed trace/span ids keep the generated data deterministic.
        trace_id = "7400045b25c443b885914600aa83ad04"
        span_id = "8841662216cc598b"
        processed = (
            table_writer.get_stream_loader().get_processor().process_message(
                (
                    2,
                    "insert",
                    {
                        "project_id": 1,
                        "event_id": uuid.uuid4().hex,
                        "deleted": 0,
                        "datetime": (base_time + timedelta(minutes=tick)).isoformat(),
                        "platform": "javascript",
                        "data": {
                            # Project N sends every Nth (mod len(hashes)) hash (and platform)
                            "received": calendar.timegm(
                                (base_time + timedelta(minutes=tick)).timetuple()
                            ),
                            "type": "transaction",
                            "transaction": f"/api/do_things/{count}",
                            # XXX(dcramer): would be nice to document why these have to be naive
                            "start_timestamp": datetime.timestamp(
                                (base_time + timedelta(minutes=tick)).replace(
                                    tzinfo=None
                                )
                            ),
                            # End timestamp is one second after the start.
                            "timestamp": datetime.timestamp(
                                (base_time + timedelta(minutes=tick, seconds=1)).replace(
                                    tzinfo=None
                                )
                            ),
                            "contexts": {
                                "trace": {
                                    "trace_id": trace_id,
                                    "span_id": span_id,
                                    "op": "http",
                                    "status": "0",
                                },
                            },
                            "request": {
                                "url": "http://127.0.0.1:/query",
                                "headers": [
                                    ["Accept-Encoding", "identity"],
                                    ["Content-Length", "398"],
                                    ["Host", "127.0.0.1:"],
                                    ["Referer", "tagstore.something"],
                                    ["Trace", "8fa73032d-1"],
                                ],
                                "data": "",
                                "method": "POST",
                                "env": {
                                    "SERVER_PORT": "1010",
                                    "SERVER_NAME": "snuba"
                                },
                            },
                            "spans": [{
                                "op": "db",
                                "trace_id": trace_id,
                                "span_id": span_id + "1",
                                "parent_span_id": None,
                                "same_process_as_parent": True,
                                "description": "SELECT * FROM users",
                                "data": {},
                                "timestamp": calendar.timegm(
                                    (base_time + timedelta(minutes=tick)).timetuple()
                                ),
                            }],
                        },
                    },
                ),
                # NOTE(review): metadata timestamp is base_time for every tick,
                # not the per-tick event time — presumably intentional; confirm.
                KafkaMessageMetadata(0, 0, base_time),
            )
        )
        # assumes the processor returns an InsertBatch-like result with .rows
        # for these payloads — TODO confirm it can never be None here.
        rows.extend(processed.rows)

    BatchWriterEncoderWrapper(
        table_writer.get_batch_writer(metrics=DummyMetricsBackend(strict=True)),
        JSONRowEncoder(),
    ).write(rows)
def write_rows(self, rows: Sequence[WriterTableRow]) -> None:
    """Encode the given rows as JSON and write them through the dataset's
    batch writer in a single call."""
    batch_writer = enforce_table_writer(self.dataset).get_batch_writer(
        metrics=DummyMetricsBackend(strict=True)
    )
    wrapped_writer = BatchWriterEncoderWrapper(batch_writer, JSONRowEncoder())
    wrapped_writer.write(rows)