def _clear_redis_and_force_merge(self) -> None:
    redis_client.flushdb()
    cluster = self.storage.get_cluster()
    clickhouse = cluster.get_query_connection(ClickhouseClientSettings.OPTIMIZE)
    run_optimize(clickhouse, self.storage, cluster.get_database(), ignore_cutoff=True)
def optimize(
    *,
    clickhouse_host: str,
    clickhouse_port: int,
    database: str,
    dataset_name: str,
    timeout: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import run_optimize, logger

    setup_logging(log_level)
    dataset = get_dataset(dataset_name)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    clickhouse = ClickhousePool(
        clickhouse_host, clickhouse_port, send_receive_timeout=timeout
    )
    num_dropped = run_optimize(clickhouse, database, table, before=today)
    logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))
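# Illustrative sketch (not part of the functions above or below): every optimize
# variant in this section computes its cutoff the same way, by truncating the
# current UTC time to midnight, so only partitions from previous days are
# eligible for a forced merge. The printed value is hypothetical.
from datetime import datetime

today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
print(today)  # e.g. 2021-06-01 00:00:00 for any time on 2021-06-01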
def optimize(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    storage_name: str,
    parallel: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import logger, run_optimize

    setup_logging(log_level)
    setup_sentry()

    storage: ReadableTableStorage

    storage_key = StorageKey(storage_name)
    storage = get_storage(storage_key)

    (clickhouse_user, clickhouse_password) = storage.get_cluster().get_credentials()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    database = storage.get_cluster().get_database()

    # TODO: In distributed mode, optimize currently must be run once for each node
    # with the host and port of that node provided via the CLI. In the future,
    # passing this information won't be necessary, and running this command once
    # will ensure that optimize is performed on all of the individual nodes for
    # that cluster.
    if clickhouse_host and clickhouse_port:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
            send_receive_timeout=ClickhouseClientSettings.OPTIMIZE.value.timeout,
        )
    elif not storage.get_cluster().is_single_node():
        raise click.ClickException("Provide Clickhouse host and port for optimize")
    else:
        connection = storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.OPTIMIZE
        )

    num_dropped = run_optimize(
        connection,
        storage,
        database,
        before=today,
        parallel=parallel,
        clickhouse_host=clickhouse_host,
    )
    logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))
def flush_batch(self, batch: Sequence[Replacement]) -> None:
    need_optimize = False

    for replacement in batch:
        query_args = {
            **replacement.query_args,
            "table_name": self.__replacer_processor.get_schema().get_table_name(),
        }

        # Skip the replacement entirely when the count query reports that
        # there are no matching rows to replace.
        if replacement.count_query_template is not None:
            count = self.clickhouse.execute_robust(
                replacement.count_query_template % query_args
            )[0][0]
            if count == 0:
                continue
        else:
            count = 0

        need_optimize = (
            self.__replacer_processor.pre_replacement(replacement, count)
            or need_optimize
        )

        if replacement.insert_query_template is not None:
            t = time.time()
            query = replacement.insert_query_template % query_args
            logger.debug("Executing replace query: %s" % query)
            self.clickhouse.execute_robust(query)
            duration = int((time.time() - t) * 1000)

            logger.info("Replacing %s rows took %sms" % (count, duration))
            self.metrics.timing("replacements.count", count)
            self.metrics.timing("replacements.duration", duration)
        else:
            count = duration = 0

        self.__replacer_processor.post_replacement(replacement, duration, count)

    # Once the whole batch has been flushed, force a merge of the affected
    # partitions if any processor asked for it.
    if need_optimize:
        from snuba.optimize import run_optimize

        today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
        num_dropped = run_optimize(
            self.clickhouse,
            self.__storage,
            self.__database_name,
            before=today,
        )
        logger.info(
            "Optimized %s partitions on %s" % (num_dropped, self.clickhouse.host)
        )
def flush_batch(self, batch: Sequence[Replacement]) -> None:
    need_optimize = False
    clickhouse_read = self.__storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.REPLACE
    )

    for replacement in batch:
        start_time = time.time()

        table_name = self.__replacer_processor.get_schema().get_table_name()
        count_query = replacement.get_count_query(table_name)

        # Skip the replacement entirely when the count query reports that
        # there are no matching rows to replace.
        if count_query is not None:
            count = clickhouse_read.execute_robust(count_query).results[0][0]
            if count == 0:
                continue
        else:
            count = 0

        need_optimize = (
            self.__replacer_processor.pre_replacement(replacement, count)
            or need_optimize
        )

        query_executor = self.__get_insert_executor(replacement)
        with self.__rate_limiter as state:
            self.metrics.increment("insert_state", tags={"state": state[0].value})
            count = query_executor.execute(replacement, count)

        self.__replacer_processor.post_replacement(replacement, count)
        self._check_timing_and_write_to_redis(replacement, start_time)

    if need_optimize:
        from snuba.optimize import run_optimize

        today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
        num_dropped = run_optimize(
            clickhouse_read, self.__storage, self.__database_name, before=today
        )
        logger.info(
            "Optimized %s partitions on %s" % (num_dropped, clickhouse_read.host)
        )
def optimize(clickhouse_server, database, table, timeout, log_level):
    from datetime import datetime
    from snuba.clickhouse import ClickhousePool
    from snuba.optimize import run_optimize, logger

    logging.basicConfig(
        level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s'
    )

    if not clickhouse_server:
        logger.error("Must provide at least one Clickhouse server.")
        sys.exit(1)

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    for server in clickhouse_server:
        clickhouse = ClickhousePool(
            server.split(':')[0],
            port=int(server.split(':')[1]),
            send_receive_timeout=timeout,
        )
        num_dropped = run_optimize(clickhouse, database, table, before=today)
        logger.info("Optimized %s partitions on %s" % (num_dropped, server))
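# Illustrative sketch (hypothetical hostnames): the multi-server variant above
# expects each clickhouse_server entry as a "host:port" string and splits it
# into a hostname and an integer port before building a ClickhousePool.
servers = ["clickhouse-1:9000", "clickhouse-2:9000"]
for server in servers:
    host, port = server.split(':')[0], int(server.split(':')[1])
    print(host, port)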
def optimize(clickhouse_host, clickhouse_port, database, dataset, timeout, log_level):
    from datetime import datetime
    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import run_optimize, logger

    logging.basicConfig(
        level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s'
    )

    dataset = get_dataset(dataset)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    clickhouse = ClickhousePool(
        clickhouse_host, clickhouse_port, send_receive_timeout=timeout
    )
    num_dropped = run_optimize(clickhouse, database, table, before=today)
    logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))