def generate_transactions() -> None:
    from datetime import datetime

    table_writer = get_writable_storage(StorageKey.TRANSACTIONS).get_table_writer()

    rows = []

    for i in range(5):
        raw_transaction = get_raw_transaction()
        # Older versions of this table did not have measurements
        del raw_transaction["data"]["measurements"]

        processed = (
            table_writer.get_stream_loader()
            .get_processor()
            .process_message(
                (2, "insert", raw_transaction),
                KafkaMessageMetadata(0, 0, datetime.utcnow()),
            )
        )
        rows.extend(processed.rows)

    BatchWriterEncoderWrapper(
        table_writer.get_batch_writer(metrics=DummyMetricsBackend(strict=True)),
        JSONRowEncoder(),
    ).write(rows)
def write(*, dataset: Dataset) -> RespTuple:
    from snuba.processor import InsertBatch

    rows: MutableSequence[WriterTableRow] = []
    offset_base = int(round(time.time() * 1000))
    for index, message in enumerate(json.loads(http_request.data)):
        offset = offset_base + index
        processed_message = (
            enforce_table_writer(dataset)
            .get_stream_loader()
            .get_processor()
            .process_message(
                message,
                KafkaMessageMetadata(
                    offset=offset, partition=0, timestamp=datetime.utcnow()
                ),
            )
        )
        if processed_message:
            assert isinstance(processed_message, InsertBatch)
            rows.extend(processed_message.rows)

    BatchWriterEncoderWrapper(
        enforce_table_writer(dataset).get_batch_writer(metrics),
        JSONRowEncoder(),
    ).write(rows)
    return ("ok", 200, {"Content-Type": "text/plain"})
def write_processed_messages(
    storage: WritableStorage, messages: Sequence[ProcessedMessage]
) -> None:
    rows: MutableSequence[WriterTableRow] = []
    for message in messages:
        assert isinstance(message, InsertBatch)
        rows.extend(message.rows)

    BatchWriterEncoderWrapper(
        storage.get_table_writer().get_batch_writer(
            metrics=DummyMetricsBackend(strict=True)
        ),
        JSONRowEncoder(),
    ).write(rows)
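The caller of write_processed_messages is not shown above, so the following is a hypothetical usage sketch. The storage key, the raw payload helper and the Kafka metadata values mirror the first example; imports are omitted, as in the surrounding examples, and the helper name write_sample_transaction is made up for illustration.

# Hypothetical usage sketch: feed one processed message into write_processed_messages.
def write_sample_transaction() -> None:
    # Assumed storage key; any writable storage with a stream loader works the same way.
    storage = get_writable_storage(StorageKey.TRANSACTIONS)
    processor = storage.get_table_writer().get_stream_loader().get_processor()

    processed = processor.process_message(
        (2, "insert", get_raw_transaction()),  # payload helper borrowed from the first example
        KafkaMessageMetadata(0, 0, datetime.utcnow()),
    )

    # write_processed_messages asserts each message is an InsertBatch before writing its rows.
    write_processed_messages(storage, [processed])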
def bulk_load(
    *,
    storage_name: str,
    dest_table: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for storage %s, from source %s",
        storage_name,
        source,
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        dest_table,
        storage.get_row_processor(),
    )

    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        table_writer.get_batch_writer(
            environment.metrics,
            table_name=dest_table,
            chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
        ),
        settings.BULK_CLICKHOUSE_BUFFER,
        JSONRowEncoder(),
    )

    loader.load(writer)
def __init__(
    self,
    storage: WritableTableStorage,
    metrics: MetricsBackend,
    producer: Optional[ConfluentKafkaProducer] = None,
    replacements_topic: Optional[Topic] = None,
) -> None:
    self.__storage = storage
    self.producer = producer
    self.replacements_topic = replacements_topic
    self.metrics = metrics

    table_writer = storage.get_table_writer()
    self.__writer = BatchWriterEncoderWrapper(
        table_writer.get_batch_writer(
            metrics,
            # ClickHouse settings for the insert: pick replicas in configured order and
            # wait for distributed inserts to reach all shards before acknowledging.
            {"load_balancing": "in_order", "insert_distributed_sync": 1},
        ),
        JSONRowEncoder(),
    )

    self.__processor: MessageProcessor
    self.__pre_filter = table_writer.get_stream_loader().get_pre_filter()
        if self.__replacement_batch_writer is not None:
            self.__replacement_batch_writer.terminate()

    def join(self, timeout: Optional[float] = None) -> None:
        start = time.time()
        self.__insert_batch_writer.join(timeout)

        if self.__replacement_batch_writer is not None:
            if timeout is not None:
                timeout = max(timeout - (time.time() - start), 0)
            self.__replacement_batch_writer.join(timeout)


json_row_encoder = JSONRowEncoder()


def process_message(
    processor: MessageProcessor, message: Message[KafkaPayload]
) -> Union[None, JSONRowInsertBatch, ReplacementBatch]:
    result = processor.process_message(
        rapidjson.loads(message.payload.value),
        KafkaMessageMetadata(
            message.offset, message.partition.index, message.timestamp
        ),
    )

    if isinstance(result, InsertBatch):
        return JSONRowInsertBatch(
            [json_row_encoder.encode(row) for row in result.rows],
def bulk_load(
    *,
    storage_name: str,
    dest_table: Optional[str],
    source: str,
    ignore_existing_data: bool,
    pre_processed: bool,
    show_progress: bool,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for storage %s, from source %s", storage_name, source
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        storage.get_row_processor(),
        dest_table,
    )

    # TODO: see whether we need to pass options to the writer
    def progress_callback(bar: progressbar.ProgressBar, progress: int) -> None:
        bar.update(progress)

    if show_progress:
        progress = progressbar.ProgressBar(
            max_value=snapshot_source.get_table_file_size(storage.get_postgres_table())
        )
        progress_func: Optional[ProgressCallback] = partial(progress_callback, progress)
    else:
        progress_func = None

    table_descriptor = snapshot_source.get_descriptor().get_table(
        storage.get_postgres_table()
    )
    if pre_processed:
        writer = table_writer.get_bulk_writer(
            metrics=environment.metrics,
            encoding="gzip" if table_descriptor.zip else None,
            column_names=[c.name for c in table_descriptor.columns or []],
            table_name=dest_table,
        )
        loader.load_preprocessed(
            writer, ignore_existing_data, progress_callback=progress_func
        )
    else:
        buffer_writer = BufferedWriterWrapper(
            table_writer.get_batch_writer(
                environment.metrics,
                table_name=dest_table,
                chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
            ),
            settings.BULK_CLICKHOUSE_BUFFER,
            JSONRowEncoder(),
        )
        loader.load(
            buffer_writer, ignore_existing_data, progress_callback=progress_func
        )
def generate_transactions(count: int) -> None:
    """
    Generate a deterministic set of events across a time range.
    """
    import calendar
    import pytz
    import uuid
    from datetime import datetime, timedelta

    table_writer = get_writable_storage(StorageKey.TRANSACTIONS).get_table_writer()

    rows = []

    base_time = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0, tzinfo=pytz.utc
    ) - timedelta(minutes=count)

    for tick in range(count):
        trace_id = "7400045b25c443b885914600aa83ad04"
        span_id = "8841662216cc598b"
        processed = (
            table_writer.get_stream_loader()
            .get_processor()
            .process_message(
                (
                    2,
                    "insert",
                    {
                        "project_id": 1,
                        "event_id": uuid.uuid4().hex,
                        "deleted": 0,
                        "datetime": (base_time + timedelta(minutes=tick)).isoformat(),
                        "platform": "javascript",
                        "data": {
                            # Project N sends every Nth (mod len(hashes)) hash (and platform)
                            "received": calendar.timegm(
                                (base_time + timedelta(minutes=tick)).timetuple()
                            ),
                            "type": "transaction",
                            "transaction": f"/api/do_things/{count}",
                            # XXX(dcramer): would be nice to document why these have to be naive
                            "start_timestamp": datetime.timestamp(
                                (base_time + timedelta(minutes=tick)).replace(tzinfo=None)
                            ),
                            "timestamp": datetime.timestamp(
                                (base_time + timedelta(minutes=tick, seconds=1)).replace(
                                    tzinfo=None
                                )
                            ),
                            "contexts": {
                                "trace": {
                                    "trace_id": trace_id,
                                    "span_id": span_id,
                                    "op": "http",
                                    "status": "0",
                                },
                            },
                            "request": {
                                "url": "http://127.0.0.1:/query",
                                "headers": [
                                    ["Accept-Encoding", "identity"],
                                    ["Content-Length", "398"],
                                    ["Host", "127.0.0.1:"],
                                    ["Referer", "tagstore.something"],
                                    ["Trace", "8fa73032d-1"],
                                ],
                                "data": "",
                                "method": "POST",
                                "env": {"SERVER_PORT": "1010", "SERVER_NAME": "snuba"},
                            },
                            "spans": [
                                {
                                    "op": "db",
                                    "trace_id": trace_id,
                                    "span_id": span_id + "1",
                                    "parent_span_id": None,
                                    "same_process_as_parent": True,
                                    "description": "SELECT * FROM users",
                                    "data": {},
                                    "timestamp": calendar.timegm(
                                        (base_time + timedelta(minutes=tick)).timetuple()
                                    ),
                                }
                            ],
                        },
                    },
                ),
                KafkaMessageMetadata(0, 0, base_time),
            )
        )
        rows.extend(processed.rows)

    BatchWriterEncoderWrapper(
        table_writer.get_batch_writer(metrics=DummyMetricsBackend(strict=True)),
        JSONRowEncoder(),
    ).write(rows)
def write_rows(self, rows: Sequence[WriterTableRow]) -> None:
    BatchWriterEncoderWrapper(
        enforce_table_writer(self.dataset).get_batch_writer(
            metrics=DummyMetricsBackend(strict=True)
        ),
        JSONRowEncoder(),
    ).write(rows)
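Unlike the other examples, write_rows above skips the stream processor and hands pre-built row dicts straight to the batch writer, so a usage sketch is just a call with rows shaped like the destination table. The column names below are placeholders for illustration only; real rows come out of a storage's message processor as in the examples above.

# Hypothetical call: "self" is whatever test helper defines write_rows above, and the
# row dicts are illustrative, not the real table schema.
self.write_rows(
    [
        {"project_id": 1, "event_id": "a" * 32},
        {"project_id": 2, "event_id": "b" * 32},
    ]
)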