def _get_engine_type(self, cluster: ClickhouseCluster, table_name: str) -> str: database_name = cluster.get_database() if cluster.is_single_node(): return f"ReplacingMergeTree({self.__version_column})" elif self._unsharded is True: return f"ReplicatedReplacingMergeTree('/clickhouse/tables/{self._storage_set_value}/all/{database_name}/{table_name}', '{{replica}}', {self.__version_column})" else: return f"ReplicatedReplacingMergeTree('/clickhouse/tables/{self._storage_set_value}/{{shard}}/{database_name}/{table_name}', '{{replica}}', {self.__version_column})"
def get_sql(self, cluster: ClickhouseCluster, table_name: str) -> str:
    """Render the Distributed engine clause for this table.

    Only valid on a multi-node cluster with a configured cluster name;
    both preconditions are asserted. ``table_name`` is accepted for
    interface compatibility but the engine points at the configured
    local table instead.
    """
    name = cluster.get_clickhouse_cluster_name()
    # A Distributed engine only makes sense when fanning out to shards.
    assert not cluster.is_single_node()
    assert name is not None
    db = cluster.get_database()
    # The sharding key is optional; omit the argument entirely when unset.
    key_suffix = f", {self.__sharding_key}" if self.__sharding_key else ""
    return f"Distributed({name}, {db}, {self.__local_table_name}{key_suffix})"
def test_no_split(dataset_name: str, id_column: str, project_column: str, timestamp_column: str) -> None:
    """Run a plain query through the split strategies; no splitting is expected."""
    dataset = get_dataset(dataset_name)
    storage = dataset.get_default_entity().get_all_storages()[0]
    plan_query = ClickhouseQuery(
        storage.get_schema().get_data_source(),
    )

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader,
    ) -> QueryResult:
        # Placeholder check: the runner only needs a QueryResult back.
        assert query == query
        return QueryResult({}, {})

    splitters = [
        ColumnSplitQueryStrategy(
            id_column=id_column,
            project_column=project_column,
            timestamp_column=timestamp_column,
        ),
        TimeSplitQueryStrategy(timestamp_col=timestamp_column),
    ]
    cluster = ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True)
    strategy = SimpleQueryPlanExecutionStrategy(cluster, [], splitters)

    strategy.execute(plan_query, HTTPRequestSettings(), do_query)
def _get_engine_type(self, cluster: ClickhouseCluster, table_name: str) -> str: if cluster.is_single_node(): return "MergeTree()" else: zoo_path = self._get_zookeeper_path(cluster, table_name) return f"ReplicatedMergeTree({zoo_path}, '{{replica}}')"
def _get_engine_type(self, cluster: ClickhouseCluster, table_name: str) -> str: if cluster.is_single_node(): return "MergeTree()" elif self._unsharded is True: return f"ReplicatedMergeTree('/clickhouse/tables/{self._storage_set_value}/all/{table_name}', '{{replica}}')" else: return f"ReplicatedMergeTree('/clickhouse/tables/{self._storage_set_value}/{{shard}}/{table_name}', '{{replica}}')"
def test_col_split(
    dataset_name: str,
    id_column: str,
    project_column: str,
    timestamp_column: str,
    first_query_data: Sequence[MutableMapping[str, Any]],
    second_query_data: Sequence[MutableMapping[str, Any]],
) -> None:
    """Run a wide query through the splitters and answer both sub-queries."""

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader[SqlQuery],
    ) -> QueryResult:
        selected_cols = query.get_selected_columns()
        # The legacy column list must agree with the AST representation.
        ast_names = [
            c.expression.column_name
            for c in query.get_selected_columns_from_ast() or []
            if isinstance(c.expression, Column)
        ]
        assert selected_cols == ast_names
        if selected_cols == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, {})
        if selected_cols == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, {})
        raise ValueError(f"Unexpected selected columns: {selected_cols}")

    events = get_dataset(dataset_name)
    all_columns = list(second_query_data[0].keys())
    query = ClickhouseQuery(
        LogicalQuery(
            {
                "selected_columns": all_columns,
                "conditions": [""],
                "orderby": "events.event_id",
                "sample": 10,
                "limit": 100,
                "offset": 50,
            },
            events.get_all_storages()[0].get_schema().get_data_source(),
            selected_columns=[
                SelectedExpression(name=name, expression=Column(None, None, name))
                for name in all_columns
            ],
        )
    )

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(id_column, project_column, timestamp_column),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )
    strategy.execute(query, HTTPRequestSettings(), do_query)
def _get_engine_type(self, cluster: ClickhouseCluster, table_name: str) -> str: if cluster.is_single_node(): if self.__version_column: return f"ReplacingMergeTree({self.__version_column})" return "ReplacingMergeTree()" else: zoo_path = self._get_zookeeper_path(cluster, table_name) if self.__version_column: return f"ReplicatedReplacingMergeTree({zoo_path}, '{{replica}}', {self.__version_column})" return f"ReplicatedReplacingMergeTree({zoo_path}, '{{replica}}')"
def _get_zookeeper_path(self, cluster: ClickhouseCluster, table_name: str) -> str:
    """Build the single-quoted ZooKeeper replication path for this table.

    Honors settings.CLICKHOUSE_ZOOKEEPER_OVERRIDE, which may remap the
    default path for specific deployments.
    """
    database_name = cluster.get_database()
    # Unsharded storages share one "all" path; sharded ones keep the
    # {shard} macro to be expanded per node.
    segment = "all" if self._unsharded is True else "{shard}"
    default_path = (
        f"/clickhouse/tables/{self._storage_set_value}/"
        f"{segment}/{database_name}/{table_name}"
    )
    final_path = settings.CLICKHOUSE_ZOOKEEPER_OVERRIDE.get(default_path, default_path)
    return f"'{final_path}'"
def pytest_configure() -> None:
    """
    Set up the Sentry SDK to avoid errors hidden by configuration.
    Ensure the snuba_test database exists
    """
    assert (
        settings.TESTING
    ), "settings.TESTING is False, try `SNUBA_SETTINGS=test` or `make test`"
    setup_sentry()

    # Recreate the test database on every configured cluster so each
    # test session starts from a clean slate.
    for cluster in settings.CLUSTERS:
        clickhouse_cluster = ClickhouseCluster(
            host=cluster["host"],
            port=cluster["port"],
            user="******",
            password="",
            database="default",
            http_port=cluster["http_port"],
            storage_sets=cluster["storage_sets"],
            single_node=cluster["single_node"],
            # Optional keys: only multi-node configurations define them.
            cluster_name=cluster["cluster_name"] if "cluster_name" in cluster else None,
            distributed_cluster_name=cluster["distributed_cluster_name"]
            if "distributed_cluster_name" in cluster
            else None,
        )

        database_name = cluster["database"]
        all_nodes = [
            *clickhouse_cluster.get_local_nodes(),
            *clickhouse_cluster.get_distributed_nodes(),
        ]
        for node in all_nodes:
            connection = clickhouse_cluster.get_node_connection(
                ClickhouseClientSettings.MIGRATE, node
            )
            connection.execute(f"DROP DATABASE IF EXISTS {database_name};")
            connection.execute(f"CREATE DATABASE {database_name};")
def test_col_split(
    dataset_name: str,
    id_column: str,
    project_column: str,
    timestamp_column: str,
    first_query_data: Sequence[MutableMapping[str, Any]],
    second_query_data: Sequence[MutableMapping[str, Any]],
) -> None:
    """Run a wide query through the splitters and answer both sub-queries."""

    def do_query(
        query: ClickhouseQuery,
        query_settings: QuerySettings,
        reader: Reader,
    ) -> QueryResult:
        column_names = [
            c.expression.column_name
            for c in query.get_selected_columns() or []
            if isinstance(c.expression, Column)
        ]
        if column_names == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, {})
        if column_names == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, {})
        raise ValueError(f"Unexpected selected columns: {column_names}")

    events = get_dataset(dataset_name)
    data_source = (
        events.get_default_entity().get_all_storages()[0].get_schema().get_data_source()
    )
    query = ClickhouseQuery(
        data_source,
        selected_columns=[
            SelectedExpression(name=name, expression=Column(None, None, name))
            for name in second_query_data[0].keys()
        ],
    )

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(id_column, project_column, timestamp_column),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )
    strategy.execute(query, HTTPQuerySettings(), do_query)
def test_no_split(
    dataset_name: str, id_column: str, project_column: str, timestamp_column: str
) -> None:
    """Run a simple query through the split strategies; no splitting is expected."""
    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        LogicalQuery(
            {
                "selected_columns": ["event_id"],
                "conditions": [""],
                "orderby": "event_id",
                "sample": 10,
                "limit": 100,
                "offset": 50,
            },
            events.get_all_storages()[0].get_schema().get_data_source(),
        )
    )

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader[SqlQuery],
    ) -> QueryResult:
        # Placeholder check: the runner only needs a QueryResult back.
        assert query == query
        return QueryResult({}, {})

    splitters = [
        ColumnSplitQueryStrategy(
            id_column=id_column,
            project_column=project_column,
            timestamp_column=timestamp_column,
        ),
        TimeSplitQueryStrategy(timestamp_col=timestamp_column),
    ]
    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        splitters,
    )

    strategy.execute(query, HTTPRequestSettings(), do_query)
import pytest from snuba.clusters.cluster import ClickhouseCluster from snuba.clusters.storage_sets import StorageSetKey from snuba.migrations import table_engines single_node_cluster = ClickhouseCluster( host="host_1", port=9000, user="******", password="", database="default", http_port=8123, storage_sets={"events"}, single_node=True, ) multi_node_cluster = ClickhouseCluster( host="host_2", port=9000, user="******", password="", database="default", http_port=8123, storage_sets={"events"}, single_node=False, cluster_name="cluster_1", distributed_cluster_name="dist_hosts", ) merge_test_cases = [
def is_valid_node(host: str, port: int, cluster: ClickhouseCluster) -> bool:
    """Return True if (host, port) matches a local node or the query node."""
    candidates = list(cluster.get_local_nodes())
    candidates.append(cluster.get_query_node())
    for candidate in candidates:
        if candidate.host_name == host and candidate.port == port:
            return True
    return False