def ensure_table_exists(dataset, force=False):
    if not force and _ensured.get(dataset, False):
        return

    assert local_dataset_mode(), "Cannot create table in distributed mode"

    from snuba import migrate

    # We cannot build distributed tables this way. So this only works in local
    # mode.
    for statement in dataset.get_dataset_schemas().get_create_statements():
        clickhouse_rw.execute(statement)

    migrate.run(clickhouse_rw, dataset)

    _ensured[dataset] = True
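# ensure_table_exists() above relies on module-level state that is not shown in
# this snippet. The sketch below is only an illustration of what it assumes: the
# settings names come from the calls elsewhere in this file, while the client
# type and the cache definition are assumptions, not the actual definitions.

from typing import Any, Dict

from clickhouse_driver import Client

from snuba import settings

# Shared read-write ClickHouse connection used for DDL statements.
clickhouse_rw = Client(
    host=settings.CLICKHOUSE_HOST,
    port=settings.CLICKHOUSE_PORT,
)

# Per-dataset memo so table creation and migrations run at most once per process.
_ensured: Dict[Any, bool] = {}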
def migrate(log_level):
    from snuba.migrate import logger, run

    # TODO: this only supports one dataset so far. More work is needed for the others.
    dataset = get_dataset('events')
    logging.basicConfig(level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s')

    if not local_dataset_mode():
        logger.error("The migration tool can only work on local dataset mode.")
        sys.exit(1)

    clickhouse = Client(
        host=settings.CLICKHOUSE_HOST,
        port=settings.CLICKHOUSE_PORT,
    )

    run(clickhouse, dataset)
def migrate(log_level):
    from snuba.migrate import logger, run

    logging.basicConfig(level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s')

    if settings.CLICKHOUSE_TABLE != 'dev':
        logger.error(
            "The migration tool is only intended for the local development environment."
        )
        sys.exit(1)

    host, port = settings.CLICKHOUSE_SERVER.split(':')
    clickhouse = Client(
        host=host,
        port=port,
    )

    run(clickhouse, settings.CLICKHOUSE_TABLE)
def migrate(*, log_level: Optional[str] = None, dataset_name: Optional[str] = None) -> None:
    from snuba.migrate import logger, run

    setup_logging(log_level)

    if not local_dataset_mode():
        logger.error("The migration tool can only work on local dataset mode.")
        sys.exit(1)

    dataset_names = [dataset_name] if dataset_name else DATASET_NAMES
    for name in dataset_names:
        dataset = get_dataset(name)
        logger.info("Migrating dataset %s", name)
        clickhouse = Client(
            host=settings.CLICKHOUSE_HOST,
            port=settings.CLICKHOUSE_PORT,
        )
        run(clickhouse, dataset)
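# The keyword-only signature above suggests this version of migrate() is wired
# up as a CLI command (bootstrap() below raises click.ClickException, so Click
# is in use). A sketch of how it might be exposed, purely as an illustration:
# the command and option names here are assumptions, not the actual CLI
# definition.

import click


@click.command()
@click.option("--log-level", "log_level", default=None, help="Logging level.")
@click.option("--dataset", "dataset_name", default=None, help="Migrate only this dataset.")
def migrate_command(*, log_level=None, dataset_name=None):
    # Delegate to the migrate() function defined above.
    migrate(log_level=log_level, dataset_name=dataset_name)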
def bootstrap(
    *,
    bootstrap_server: Sequence[str],
    kafka: bool,
    force: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Warning: Not intended to be used in production yet.
    """
    if not force:
        raise click.ClickException("Must use --force to run")

    setup_logging(log_level)

    logger = logging.getLogger("snuba.bootstrap")

    import time

    if kafka:
        logger.debug("Using Kafka with %r", bootstrap_server)
        from confluent_kafka.admin import AdminClient, NewTopic

        attempts = 0
        while True:
            try:
                logger.debug("Attempting to connect to Kafka (attempt %d)", attempts)
                client = AdminClient(
                    {
                        "bootstrap.servers": ",".join(bootstrap_server),
                        "socket.timeout.ms": 1000,
                    }
                )
                client.list_topics(timeout=1)
                break
            except Exception as e:
                logger.error(
                    "Connection to Kafka failed (attempt %d)", attempts, exc_info=e
                )
                attempts += 1
                if attempts == 60:
                    raise
                time.sleep(1)

        topics = {}
        for name in DATASET_NAMES:
            dataset = get_dataset(name)
            table_writer = dataset.get_table_writer()
            if table_writer:
                stream_loader = table_writer.get_stream_loader()
                for topic_spec in stream_loader.get_all_topic_specs():
                    if topic_spec.topic_name in topics:
                        continue
                    logger.debug(
                        "Adding topic %s to creation list", topic_spec.topic_name
                    )
                    topics[topic_spec.topic_name] = NewTopic(
                        topic_spec.topic_name,
                        num_partitions=topic_spec.partitions_number,
                        replication_factor=topic_spec.replication_factor,
                    )

        logger.debug("Initiating topic creation")
        for topic, future in client.create_topics(
            list(topics.values()), operation_timeout=1
        ).items():
            try:
                future.result()
                logger.info("Topic %s created", topic)
            except Exception as e:
                logger.error("Failed to create topic %s", topic, exc_info=e)

    attempts = 0
    while True:
        try:
            logger.debug("Attempting to connect to Clickhouse (attempt %d)", attempts)
            clickhouse_rw.execute("SELECT 1")
            break
        except Exception as e:
            logger.error(
                "Connection to Clickhouse failed (attempt %d)", attempts, exc_info=e
            )
            attempts += 1
            if attempts == 60:
                raise
            time.sleep(1)

    # Need to better figure out if we are configured to use replicated
    # tables or distributed tables, etc.

    # Create the tables for every dataset.
    existing_tables = {row[0] for row in clickhouse_rw.execute("show tables")}
    for name in DATASET_NAMES:
        dataset = get_dataset(name)
        logger.debug("Creating tables for dataset %s", name)
        run_migrations = False
        for statement in dataset.get_dataset_schemas().get_create_statements():
            if statement.table_name not in existing_tables:
                # This is a hack to deal with updates to Materialized views.
                # It seems that ClickHouse would parse the SELECT statement that defines a
                # materialized view even if the view already exists and the CREATE statement
                # includes the IF NOT EXISTS clause.
                # When we add a column to a matview, though, we will be in a state where, by
                # running bootstrap, ClickHouse will parse the SQL statement to try to create
                # the view and fail because the column does not exist yet on the underlying
                # table, since the migration on the underlying table has not run yet.
                # Migrations are per dataset so they can only run after the bootstrap of an
                # entire dataset has run. So we would have bootstrap depending on migration
                # and migration depending on bootstrap.
                # In order to break this dependency we skip bootstrap DDL calls here if the
                # table/view already exists, so it is always safe to run bootstrap first.
                logger.debug("Executing:\n%s", statement.statement)
                clickhouse_rw.execute(statement.statement)
            else:
                logger.debug("Skipping existing table %s", statement.table_name)
                run_migrations = True
        if run_migrations:
            logger.debug("Running missing migrations for dataset %s", name)
            run(clickhouse_rw, dataset)
        logger.info("Tables for dataset %s created.", name)
def ensure_table_exists():
    from snuba.clickhouse import get_table_definition, get_test_engine

    clickhouse_rw.execute(
        get_table_definition(
            name=settings.CLICKHOUSE_TABLE,
            engine=get_test_engine(),
        )
    )


ensure_table_exists()

if settings.CLICKHOUSE_TABLE == 'dev':
    from snuba import migrate
    migrate.run(clickhouse_rw, settings.CLICKHOUSE_TABLE)


@application.route('/tests/insert', methods=['POST'])
def write():
    from snuba.processor import process_message
    from snuba.writer import row_from_processed_event, write_rows

    body = json.loads(request.data)

    rows = []
    for event in body:
        _, processed = process_message(event)
        row = row_from_processed_event(processed)
        rows.append(row)

    ensure_table_exists()