def test_reconnect(FakeClient: Client) -> None: # If the connection NetworkErrors a first time, make sure we call it a second time. FakeClient.return_value.execute.side_effect = [ errors.NetworkError, '{"data": "to my face"}', ] cp = ClickhousePool("0:0:0:0", 9000, "default", "", "default") cp.execute("SHOW TABLES") assert FakeClient.return_value.execute.mock_calls == [ call( "SHOW TABLES", params=None, with_column_types=False, query_id=None, settings=None, types_check=False, columnar=False, ), call( "SHOW TABLES", params=None, with_column_types=False, query_id=None, settings=None, types_check=False, columnar=False, ), ]
def test_add_node() -> None: host_name = os.environ.get("CLICKHOUSE_HOST", "localhost") port = int(os.environ.get("CLICKHOUSE_PORT", 9000)) user = "******" password = "" database = os.environ.get("CLICKHOUSE_DATABASE", "default") client = ClickhousePool( host_name, port, user, password, database, ) assert set(client.execute("SHOW TABLES")) == set() runner.Runner.add_node( node_type=cluster.ClickhouseNodeType.LOCAL, storage_sets=[StorageSetKey.OUTCOMES], host_name=host_name, port=port, user=user, password=password, database=database, ) assert set(client.execute("SHOW TABLES")) == { ("outcomes_raw_local", ), ("outcomes_hourly_local", ), ("outcomes_mv_hourly_local", ), }
def drop_partitions( clickhouse: ClickhousePool, database: str, table: str, parts: Sequence[Tuple[datetime, int]], dry_run: bool = True, ) -> None: query_template = """\ ALTER TABLE %(database)s.%(table)s DROP PARTITION ('%(date_str)s', %(retention_days)s) """ for part_date, retention_days in parts: args = { "database": database, "table": table, "date_str": part_date.strftime("%Y-%m-%d"), "retention_days": retention_days, } query = (query_template % args).strip() if dry_run: logger.info("Dry run: " + query) else: logger.info("Dropping partition: " + query) clickhouse.execute(query)
def load(self, writer: BufferedWriterWrapper) -> None: logger = logging.getLogger('snuba.bulk-loader') clickhouse_ro = ClickhousePool(client_settings={ 'readonly': True, }) clickhouse_tables = clickhouse_ro.execute('show tables') if (self.__dest_table, ) not in clickhouse_tables: raise ValueError("Destination table %s does not exists" % self.__dest_table) table_content = clickhouse_ro.execute("select count(*) from %s" % self.__dest_table) if table_content != [(0, )]: raise ValueError("Destination Table is not empty") descriptor = self.__source.get_descriptor() logger.info("Loading snapshot %s", descriptor.id) with self.__source.get_table_file(self.__source_table) as table: logger.info("Loading table %s from file", self.__source_table) row_count = 0 with writer as buffer_writer: for row in table: clickhouse_data = self.__row_processor(row) buffer_writer.write(clickhouse_data) row_count += 1 logger.info("Load complete %d records loaded", row_count)
def run_query( connection: ClickhousePool, query: str, records_count: int, metrics: MetricsBackend, ) -> None: connection.execute_robust(query) queries[connection.host].append(query)
def test_reconnect(self, FakeClient): # If the connection NetworkErrors a first time, make sure we call it a second time. FakeClient.return_value.execute.side_effect = [ errors.NetworkError, '{"data": "to my face"}' ] cp = ClickhousePool() cp.execute("SHOW TABLES") assert FakeClient.return_value.execute.mock_calls == [ call("SHOW TABLES"), call("SHOW TABLES") ]
def test_get_fallback_host() -> None: FALLBACK_HOSTS_CONFIG_VAL = "host1:100,host2:100,host3:100" FALLBACK_HOSTS = [("host1", 100), ("host2", 100), ("host3", 100)] state.set_config(f"fallback_hosts:{CLUSTER_HOST}:{CLUSTER_PORT}", FALLBACK_HOSTS_CONFIG_VAL) pool = ClickhousePool(CLUSTER_HOST, CLUSTER_PORT, "test", "test", TEST_DB_NAME) assert pool.get_fallback_host() in FALLBACK_HOSTS
def test_reconnect(FakeClient) -> None: # If the connection NetworkErrors a first time, make sure we call it a second time. FakeClient.return_value.execute.side_effect = [ errors.NetworkError, '{"data": "to my face"}', ] cp = ClickhousePool("0:0:0:0", 9000, "default", "", "default") cp.execute("SHOW TABLES") assert FakeClient.return_value.execute.mock_calls == [ call("SHOW TABLES"), call("SHOW TABLES"), ]
def optimize( *, clickhouse_host: str, clickhouse_port: int, database: str, dataset_name: str, timeout: int, log_level: Optional[str] = None, ) -> None: from datetime import datetime from snuba.clickhouse.native import ClickhousePool from snuba.optimize import run_optimize, logger setup_logging(log_level) dataset = get_dataset(dataset_name) table = enforce_table_writer(dataset).get_schema().get_local_table_name() today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0) clickhouse = ClickhousePool(clickhouse_host, clickhouse_port, send_receive_timeout=timeout) num_dropped = run_optimize(clickhouse, database, table, before=today) logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))
def get_active_partitions(clickhouse: ClickhousePool, storage: WritableTableStorage, database: str, table: str) -> Sequence[util.Part]: response = clickhouse.execute( """ SELECT DISTINCT partition FROM system.parts WHERE database = %(database)s AND table = %(table)s AND active = 1 """, { "database": database, "table": table }, ) schema = storage.get_schema() assert isinstance(schema, TableSchema) part_format = schema.get_part_format() assert part_format is not None return [ util.decode_part_str(part, part_format) for part, in response.results ]
def get_node_connection( self, client_settings: ClickhouseClientSettings, node: ClickhouseNode, ) -> ClickhousePool: """ Get a Clickhouse connection using the client settings provided. Reuse any connection to the same node with the same settings otherwise establish a new connection. """ settings, timeout = client_settings.value cache_key = (node, client_settings) if cache_key not in self.__connection_cache: self.__connection_cache[cache_key] = ClickhousePool( node.host_name, node.port, self.__user, self.__password, self.__database, client_settings=settings, send_receive_timeout=timeout, ) return self.__connection_cache[cache_key]
def perform_select_query( columns: Sequence[str], table: str, where: Optional[Dict[str, str]], limit: Optional[str], connection: ClickhousePool, ) -> Sequence[Any]: """ Performs a SELECT query, with optional WHERE and LIMIT clauses Arguments: columns -- a list of columns to be SELECTed table -- the name of the table, upon which query is being run where -- a dict of WHERE conditions, (str, str) key-value pairs limit -- LIMIT argument, passed in as str connection -- ClickHouse connection object for query execution """ select_clause = "SELECT " + (", ".join(columns)) from_clause = " FROM " + table where_clause = "" if where: where_elems = [(key + " = " + "'" + where[key] + "'") for key in where] where_clause = " WHERE " + (" AND ".join(where_elems)) limit_clause = (" LIMIT " + limit) if limit else "" full_query = select_clause + from_clause + where_clause + limit_clause return connection.execute(full_query)
def optimize_partitions( clickhouse: ClickhousePool, database: str, table: str, parts: Sequence[util.Part], ignore_cutoff: bool, clickhouse_host: Optional[str] = None, ) -> None: query_template = """\ OPTIMIZE TABLE %(database)s.%(table)s PARTITION %(partition)s FINAL """ # Adding 10 minutes to the current time before finding the midnight time # to ensure this keeps working even if the system clock of the host that # starts the pod is slightly ahead of the system clock of the host running # the job. This prevents us from getting the wrong midnight. last_midnight = (datetime.now() + timedelta(minutes=10)).replace( hour=0, minute=0, second=0, microsecond=0) if not ignore_cutoff: cutoff_time: Optional[datetime] = (last_midnight + settings.OPTIMIZE_JOB_CUTOFF_TIME) logger.info("Cutoff time: %s", str(cutoff_time)) else: cutoff_time = None logger.info("Ignoring cutoff time") for part in parts: if cutoff_time is not None and datetime.now() > cutoff_time: raise JobTimeoutException( "Optimize job is running past the cutoff time. Abandoning.") args = { "database": database, "table": table, "partition": part.name, } query = (query_template % args).strip() logger.info(f"Optimizing partition: {part.name}") start = time.time() clickhouse.execute(query) metrics.timing( "optimized_part", time.time() - start, tags=_get_metrics_tags(table, clickhouse_host), )
def add_node( self, node_type: ClickhouseNodeType, storage_sets: Sequence[StorageSetKey], host_name: str, port: int, user: str, password: str, database: str, ) -> None: client_settings = ClickhouseClientSettings.MIGRATE.value clickhouse = ClickhousePool( host_name, port, user, password, database, client_settings=client_settings.settings, send_receive_timeout=client_settings.timeout, ) migrations: List[Migration] = [] for group in get_active_migration_groups(): group_loader = get_group_loader(group) for migration_id in group_loader.get_migrations(): migration = group_loader.load_migration(migration_id) migrations.append(migration) for migration in migrations: if isinstance(migration, ClickhouseNodeMigration): operations = ( migration.forwards_local() if node_type == ClickhouseNodeType.LOCAL else migration.forwards_dist() ) for sql_op in operations: if isinstance(sql_op, SqlOperation): if sql_op._storage_set in storage_sets: sql = sql_op.format_sql() print(f"Executing {sql}") clickhouse.execute(sql) elif isinstance(migration, CodeMigration): for python_op in migration.forwards_global(): python_op.execute_new_node(storage_sets)
def test_messages(self): processor = GroupAssigneeProcessor('sentry_groupasignee') metadata = KafkaMessageMetadata( offset=42, partition=0, ) begin_msg = json.loads(self.BEGIN_MSG) ret = processor.process_message(begin_msg, metadata) assert ret is None commit_msg = json.loads(self.COMMIT_MSG) ret = processor.process_message(commit_msg, metadata) assert ret is None insert_msg = json.loads(self.INSERT_MSG) ret = processor.process_message(insert_msg, metadata) assert ret.data == [self.PROCESSED] self.write_processed_records(ret.data) cp = ClickhousePool() ret = cp.execute("SELECT * FROM test_groupassignee_local;") assert ret[0] == ( 42, # offset 0, # deleted 2, # project_id 1359, # group_id datetime(2019, 9, 19, 0, 17, 55), 1, # user_id None, # team_id ) update_msg = json.loads(self.UPDATE_MSG_NO_KEY_CHANGE) ret = processor.process_message(update_msg, metadata) assert ret.data == [self.PROCESSED] # Tests an update with key change which becomes a two inserts: # one deletion and the insertion of the new row. update_msg = json.loads(self.UPDATE_MSG_WITH_KEY_CHANGE) ret = processor.process_message(update_msg, metadata) assert ret.data == [self.DELETED, self.PROCESSED_UPDATE] delete_msg = json.loads(self.DELETE_MSG) ret = processor.process_message(delete_msg, metadata) assert ret.data == [self.DELETED]
def optimize( *, clickhouse_host: Optional[str], clickhouse_port: Optional[int], storage_name: str, parallel: int, log_level: Optional[str] = None, ) -> None: from datetime import datetime from snuba.clickhouse.native import ClickhousePool from snuba.optimize import logger, run_optimize setup_logging(log_level) setup_sentry() storage: ReadableTableStorage storage_key = StorageKey(storage_name) storage = get_storage(storage_key) (clickhouse_user, clickhouse_password) = storage.get_cluster().get_credentials() today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0) database = storage.get_cluster().get_database() # TODO: In distributed mode, optimize currently must be run once for each node # with the host and port of that node provided via the CLI. In the future, # passing this information won't be necessary, and running this command once # will ensure that optimize is performed on all of the individual nodes for # that cluster. if clickhouse_host and clickhouse_port: connection = ClickhousePool( clickhouse_host, clickhouse_port, clickhouse_user, clickhouse_password, database, send_receive_timeout=ClickhouseClientSettings.OPTIMIZE.value.timeout, ) elif not storage.get_cluster().is_single_node(): raise click.ClickException("Provide Clickhouse host and port for optimize") else: connection = storage.get_cluster().get_query_connection( ClickhouseClientSettings.OPTIMIZE ) num_dropped = run_optimize( connection, storage, database, before=today, parallel=parallel, clickhouse_host=clickhouse_host, ) logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))
def setup_method(self, test_method, dataset_name=None): assert settings.TESTING, "settings.TESTING is False, try `SNUBA_SETTINGS=test` or `make test`" self.database = 'default' self.dataset_name = dataset_name if self.dataset_name: self.dataset = get_dataset(self.dataset_name) self.clickhouse = ClickhousePool() for statement in self.dataset.get_dataset_schemas( ).get_drop_statements(): self.clickhouse.execute(statement) for statement in self.dataset.get_dataset_schemas( ).get_create_statements(): self.clickhouse.execute(statement) redis_client.flushdb()
def check_clickhouse(clickhouse: ClickhousePool) -> None: ver = clickhouse.execute("SELECT version()").results[0][0] # The newer versions of altinity on arm add this to the version # and it breaks this check ver = ver.replace(".testingarm", "") ver = ver.replace(".altinitystable", "") if version.parse(ver) < version.parse(CLICKHOUSE_SERVER_MIN_VERSION): raise InvalidClickhouseVersion( f"Snuba requires Clickhouse version {CLICKHOUSE_SERVER_MIN_VERSION} ({clickhouse.host}:{clickhouse.port} - {ver})" )
def test_bulk_load(self): row = GroupAssigneeRow.from_bulk({ 'project_id': '2', 'group_id': '1359', 'date_added': '2019-09-19 00:17:55+00', 'user_id': '1', 'team_id': '', }) self.write_processed_records(row.to_clickhouse()) cp = ClickhousePool() ret = cp.execute("SELECT * FROM test_groupassignee_local;") assert ret[0] == ( 0, # offset 0, # deleted 2, # project_id 1359, # group_id datetime(2019, 9, 19, 0, 17, 55), 1, # user_id None, # team_id )
def test_messages(self): processor = GroupedMessageProcessor('sentry_groupedmessage') metadata = KafkaMessageMetadata( offset=42, partition=0, ) begin_msg = json.loads(self.BEGIN_MSG) ret = processor.process_message(begin_msg, metadata) assert ret is None commit_msg = json.loads(self.COMMIT_MSG) ret = processor.process_message(commit_msg, metadata) assert ret is None insert_msg = json.loads(self.INSERT_MSG) ret = processor.process_message(insert_msg, metadata) assert ret[1] == self.PROCESSED self.write_processed_records(ret[1]) cp = ClickhousePool() ret = cp.execute("SELECT * FROM test_groupedmessage_local;") assert ret[0] == ( 42, # offset 0, # deleted 2, # project_id 74, # id 0, # status datetime(2019, 6, 19, 6, 46, 28), datetime(2019, 6, 19, 6, 45, 32), datetime(2019, 6, 19, 6, 45, 32), None, ) update_msg = json.loads(self.UPDATE_MSG) ret = processor.process_message(update_msg, metadata) assert ret[1] == self.PROCESSED delete_msg = json.loads(self.DELETE_MSG) ret = processor.process_message(delete_msg, metadata) assert ret[1] == self.DELETED
def get_ro_node_connection( clickhouse_host: str, clickhouse_port: int, storage_name: str, client_settings: ClickhouseClientSettings, ) -> ClickhousePool: storage_key = None try: storage_key = StorageKey(storage_name) except ValueError: raise InvalidStorageError( f"storage {storage_name} is not a valid storage name", extra_data={"storage_name": storage_name}, ) key = f"{storage_key}-{clickhouse_host}" if key in NODE_CONNECTIONS: return NODE_CONNECTIONS[key] storage = get_storage(storage_key) cluster = storage.get_cluster() if not is_valid_node(clickhouse_host, clickhouse_port, cluster): raise InvalidNodeError( f"host {clickhouse_host} and port {clickhouse_port} are not valid", extra_data={"host": clickhouse_host, "port": clickhouse_port}, ) database = cluster.get_database() assert client_settings in { ClickhouseClientSettings.QUERY, ClickhouseClientSettings.TRACING, }, "admin can only use QUERY or TRACING ClickhouseClientSettings" if client_settings == ClickhouseClientSettings.QUERY: username = settings.CLICKHOUSE_READONLY_USER password = settings.CLICKHOUSE_READONLY_PASSWORD else: username = settings.CLICKHOUSE_TRACE_USER password = settings.CLICKHOUSE_TRACE_PASSWORD connection = ClickhousePool( clickhouse_host, clickhouse_port, username, password, database, max_pool_size=2, client_settings=client_settings.value.settings, ) NODE_CONNECTIONS[key] = connection return connection
def test_fallback_logic() -> None: state.set_config("use_fallback_host_in_native_connection_pool", 1) network_failure_connection = mock.Mock() network_failure_connection.execute.side_effect = EOFError() verification_connection = mock.Mock() verification_connection.execute.return_value = [] pool = ClickhousePool(CLUSTER_HOST, CLUSTER_PORT, "test", "test", TEST_DB_NAME) # The execute method will try to reuse a single slot in the connection # pool but reestablish new connections with _create_conn if a connection # fails with a network-related error. It may be cleaner to move connection # negotation/establishment into another class for separation of concerns. with mock.patch.object(pool, "_create_conn", lambda x, y=False: network_failure_connection): pool.pool = queue.LifoQueue(1) pool.pool.put(network_failure_connection, block=False) pool.fallback_pool = queue.LifoQueue(1) pool.fallback_pool.put(verification_connection, block=False) pool.execute("SELECT something") assert (network_failure_connection.execute.call_count == 3 ), "Expected three (failed) attempts with main connection pool" assert ( verification_connection.execute.call_count == 1 ), "Expected one (successful) attempt with fallback connection pool"
def optimize_partitions( clickhouse: ClickhousePool, database: str, table: str, parts: Sequence[util.Part], ) -> None: query_template = """\ OPTIMIZE TABLE %(database)s.%(table)s PARTITION %(partition)s FINAL """ for part in parts: args = { "database": database, "table": table, "partition": part.name, } query = (query_template % args).strip() logger.info(f"Optimizing partition: {part.name}") clickhouse.execute(query)
def test_bulk_load(self): row = GroupedMessageRow.from_bulk({ 'project_id': '2', 'id': '10', 'status': '0', 'last_seen': '2019-06-28 17:57:32+00', 'first_seen': '2019-06-28 06:40:17+00', 'active_at': '2019-06-28 06:40:17+00', 'first_release_id': '26', }) self.write_processed_records(row.to_clickhouse()) cp = ClickhousePool() ret = cp.execute("SELECT * FROM test_groupedmessage_local;") assert ret[0] == ( 0, # offset 0, # deleted 2, # project_id 10, # id 0, # status datetime(2019, 6, 28, 17, 57, 32), datetime(2019, 6, 28, 6, 40, 17), datetime(2019, 6, 28, 6, 40, 17), 26, )
def drop_partitions( clickhouse: ClickhousePool, database: str, table: str, parts: Sequence[util.Part], dry_run: bool = True, ) -> None: query_template = """\ ALTER TABLE %(database)s.%(table)s DROP PARTITION %(partition)s """ for part in parts: args = { "database": database, "table": table, "partition": part.name, } query = (query_template % args).strip() if dry_run: logger.info("Dry run: " + query) else: logger.info("Dropping partition: " + query) clickhouse.execute(query)
def run_query( connection: ClickhousePool, query: str, records_count: int, metrics: MetricsBackend, ) -> None: t = time.time() logger.debug("Executing replace query: %s" % query) connection.execute_robust(query) duration = int((time.time() - t) * 1000) logger.info("Replacing %s rows took %sms" % (records_count, duration)) metrics.timing( "replacements.count", records_count, tags={"host": connection.host}, ) metrics.timing( "replacements.duration", duration, tags={"host": connection.host}, )
def get_active_partitions( clickhouse: ClickhousePool, database: str, table: str ) -> Sequence[util.Part]: response = clickhouse.execute( """ SELECT DISTINCT partition FROM system.parts WHERE database = %(database)s AND table = %(table)s AND active = 1 """, {"database": database, "table": table}, ) return [util.decode_part_str(part) for part, in response]
def test_cache_partition() -> None: pool = ClickhousePool("localhost", 9000, "", "", "") reader1 = NativeDriverReader(None, pool) reader2 = NativeDriverReader(None, pool) default_cache = _get_cache_partition(reader1) another_default_cache = _get_cache_partition(reader2) assert id(default_cache) == id(another_default_cache) reader3 = NativeDriverReader("non_default", pool) reader4 = NativeDriverReader("non_default", pool) nondefault_cache = _get_cache_partition(reader3) another_nondefault_cache = _get_cache_partition(reader4) assert id(nondefault_cache) == id(another_nondefault_cache) assert id(default_cache) != id(nondefault_cache)
def cleanup( *, clickhouse_host: Optional[str], clickhouse_port: Optional[int], dry_run: bool, database: str, storage_name: str, log_level: Optional[str] = None, ) -> None: """ Deletes stale partitions for ClickHouse tables """ setup_logging(log_level) from snuba.cleanup import run_cleanup, logger from snuba.clickhouse.native import ClickhousePool writable_storage = get_writable_storage(StorageKey(storage_name)) ( clickhouse_user, clickhouse_password, ) = writable_storage.get_cluster().get_credentials() table = writable_storage.get_table_writer().get_schema( ).get_local_table_name() if clickhouse_host and clickhouse_port and database: connection = ClickhousePool( clickhouse_host, clickhouse_port, clickhouse_user, clickhouse_password, database, ) elif not writable_storage.get_cluster().is_single_node(): raise click.ClickException( "Provide ClickHouse host and port for cleanup") else: connection = writable_storage.get_cluster().get_query_connection( ClickhouseClientSettings.CLEANUP) num_dropped = run_cleanup(connection, database, table, dry_run=dry_run) logger.info("Dropped %s partitions on %s" % (num_dropped, clickhouse_host))
def cleanup(clickhouse_host, clickhouse_port, dry_run, database, dataset, log_level): """ Deletes stale partitions for ClickHouse tables """ from snuba.cleanup import run_cleanup, logger from snuba.clickhouse.native import ClickhousePool dataset = get_dataset(dataset) table = enforce_table_writer(dataset).get_schema().get_local_table_name() logging.basicConfig(level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s') clickhouse = ClickhousePool(clickhouse_host, clickhouse_port) num_dropped = run_cleanup(clickhouse, database, table, dry_run=dry_run) logger.info("Dropped %s partitions on %s" % (num_dropped, clickhouse_host))