def execute(self, replacement: Replacement, records_count: int) -> int:
    try:
        query = replacement.get_insert_query(self.__local_table_name)
        if query is None:
            return 0

        result_futures = []
        for nodes in self.__connection_pool.get_connections().values():
            result_futures.append(
                self.__thread_pool.submit(
                    partial(
                        self.__run_multiple_replicas,
                        nodes=nodes,
                        query=query,
                        records_count=records_count,
                        metrics=self.__metrics,
                    )
                )
            )

        for result in as_completed(result_futures):
            e = result.exception()
            if e is not None:
                raise e

        return records_count
    except Exception as e:
        count = self.__backup_executor.execute(replacement, records_count)
        logger.warning(
            "Replacement processing failed on the main connection",
            exc_info=e,
        )
        return count
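
# Hedged, self-contained sketch (not Snuba code) of the fan-out pattern used in
# execute() above: submit one task per replica group to a thread pool, wait for
# all of them with as_completed, and re-raise the first failure so the caller
# can fall back to the backup executor. All names below are illustrative.
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial


def _run_on_replicas(nodes, query):
    # Stand-in for __run_multiple_replicas: pretend the query ran on each node.
    return ["%s ran %s" % (node, query) for node in nodes]


def fan_out(replica_groups, query):
    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [
            pool.submit(partial(_run_on_replicas, nodes=nodes, query=query))
            for nodes in replica_groups.values()
        ]
        for future in as_completed(futures):
            exc = future.exception()
            if exc is not None:
                raise exc


fan_out({"shard-1": ["node-a", "node-b"], "shard-2": ["node-c"]}, "INSERT ...")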
def process_merge(
    message: Mapping[str, Any], all_column_names: Sequence[str]
) -> Optional[Replacement]:
    # HACK: We were sending duplicates of the `end_merge` message from Sentry,
    # this is only for performance of the backlog.
    txn = message.get("transaction_id")
    if txn:
        if txn in SEEN_MERGE_TXN_CACHE:
            return None
        else:
            SEEN_MERGE_TXN_CACHE.append(txn)

    previous_group_ids = message["previous_group_ids"]
    if not previous_group_ids:
        return None

    assert all(isinstance(gid, int) for gid in previous_group_ids)

    timestamp = datetime.strptime(message["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    select_columns = map(
        lambda i: i if i != "group_id" else str(message["new_group_id"]),
        all_column_names,
    )

    where = """\
        PREWHERE group_id IN (%(previous_group_ids)s)
        WHERE project_id = %(project_id)s
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    count_query_template = (
        """\
        SELECT count()
        FROM %(dist_read_table_name)s FINAL
    """
        + where
    )

    insert_query_template = (
        """\
        INSERT INTO %(dist_write_table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(dist_read_table_name)s FINAL
    """
        + where
    )

    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
        "previous_group_ids": ", ".join(str(gid) for gid in previous_group_ids),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    query_time_flags = (EXCLUDE_GROUPS, message["project_id"], previous_group_ids)

    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )
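
# Minimal sketch, assuming SEEN_MERGE_TXN_CACHE is a bounded structure such as
# a deque with a maxlen (the real type and size are not shown here): remember
# recently seen `end_merge` transaction ids so duplicate messages are dropped
# without the cache growing unbounded. Names below are illustrative.
from collections import deque

_SEEN_TXN_CACHE: deque = deque(maxlen=100)  # maxlen is an assumption


def is_duplicate_txn(txn: str) -> bool:
    if txn in _SEEN_TXN_CACHE:
        return True
    _SEEN_TXN_CACHE.append(txn)
    return False


assert is_duplicate_txn("txn-1") is False
assert is_duplicate_txn("txn-1") is True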
def _build_event_tombstone_replacement(
    message: Mapping[str, Any],
    required_columns: Sequence[str],
    where: str,
    query_args: Mapping[str, str],
    query_time_flags: Tuple[Any, ...],
) -> Replacement:
    select_columns = map(lambda i: i if i != "deleted" else "1", required_columns)
    count_query_template = (
        """\
        SELECT count()
        FROM %(table_name)s FINAL
    """
        + where
    )
    insert_query_template = (
        """\
        INSERT INTO %(table_name)s (%(required_columns)s)
        SELECT %(select_columns)s
        FROM %(table_name)s FINAL
    """
        + where
    )

    final_query_args = {
        "required_columns": ", ".join(required_columns),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
    }
    final_query_args.update(query_args)

    return Replacement(
        count_query_template, insert_query_template, final_query_args, query_time_flags
    )
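
# Small illustration of the select-column rewrite above: every required column
# is selected as-is except `deleted`, which is forced to "1" so the rows
# inserted by the replacement act as tombstones for the originals. The column
# list is an example, not the real schema.
required_columns = ["event_id", "project_id", "timestamp", "deleted"]
select_columns = [c if c != "deleted" else "1" for c in required_columns]
assert ", ".join(select_columns) == "event_id, project_id, timestamp, 1"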
def process_unmerge(
    message: Mapping[str, Any],
    all_columns: Sequence[FlattenedColumn],
    state_name: ReplacerState,
) -> Optional[Replacement]:
    hashes = message["hashes"]
    if not hashes:
        return None

    assert all(isinstance(h, str) for h in hashes)

    timestamp = datetime.strptime(message["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    all_column_names = [c.escaped for c in all_columns]
    select_columns = map(
        lambda i: i if i != "group_id" else str(message["new_group_id"]),
        all_column_names,
    )

    where = """\
        PREWHERE group_id = %(previous_group_id)s
        WHERE project_id = %(project_id)s
        AND primary_hash IN (%(hashes)s)
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    count_query_template = (
        """\
        SELECT count()
        FROM %(table_name)s FINAL
    """
        + where
    )

    insert_query_template = (
        """\
        INSERT INTO %(table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(table_name)s FINAL
    """
        + where
    )

    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "previous_group_id": message["previous_group_id"],
        "project_id": message["project_id"],
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    if state_name == ReplacerState.ERRORS:
        query_args["hashes"] = ", ".join(
            ["'%s'" % str(uuid.UUID(_hashify(h))) for h in hashes]
        )
    else:
        query_args["hashes"] = ", ".join("'%s'" % _hashify(h) for h in hashes)

    query_time_flags = (NEEDS_FINAL, message["project_id"])

    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )
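
# Hypothetical illustration of the two hash-quoting branches above: the errors
# storage stores primary_hash as a UUID, while the other storages store it as a
# plain hex string. _hashify is assumed to return a 32-character hex digest;
# the value below is an arbitrary example.
import uuid

hex_hash = "d41d8cd98f00b204e9800998ecf8427e"
errors_literal = "'%s'" % str(uuid.UUID(hex_hash))  # quoted with hyphens
events_literal = "'%s'" % hex_hash                  # quoted as raw hex
assert errors_literal == "'d41d8cd9-8f00-b204-e980-0998ecf8427e'"
assert events_literal == "'d41d8cd98f00b204e9800998ecf8427e'"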
def _build_group_replacement(
    txn: Optional[str],
    project_id: int,
    new_group_id: str,
    where: str,
    query_args: Mapping[str, str],
    query_time_flags: Tuple[Any, ...],
    all_columns: Sequence[FlattenedColumn],
) -> Optional[Replacement]:
    # HACK: We were sending duplicates of the `end_merge` message from Sentry,
    # this is only for performance of the backlog.
    if txn:
        if txn in SEEN_MERGE_TXN_CACHE:
            return None
        else:
            SEEN_MERGE_TXN_CACHE.append(txn)

    all_column_names = [c.escaped for c in all_columns]
    select_columns = map(
        lambda i: i if i != "group_id" else str(new_group_id),
        all_column_names,
    )

    count_query_template = (
        """\
        SELECT count()
        FROM %(table_name)s FINAL
    """
        + where
    )
    insert_query_template = (
        """\
        INSERT INTO %(table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(table_name)s FINAL
    """
        + where
    )

    final_query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": project_id,
    }
    final_query_args.update(query_args)

    return Replacement(
        count_query_template, insert_query_template, final_query_args, query_time_flags
    )
def process_delete_groups(
    message: Mapping[str, Any], required_columns: Sequence[str]
) -> Optional[Replacement]:
    group_ids = message["group_ids"]
    if not group_ids:
        return None

    assert all(isinstance(gid, int) for gid in group_ids)

    timestamp = datetime.strptime(message["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    select_columns = map(lambda i: i if i != "deleted" else "1", required_columns)

    where = """\
        PREWHERE group_id IN (%(group_ids)s)
        WHERE project_id = %(project_id)s
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    count_query_template = (
        """\
        SELECT count()
        FROM %(dist_read_table_name)s FINAL
    """
        + where
    )

    insert_query_template = (
        """\
        INSERT INTO %(dist_write_table_name)s (%(required_columns)s)
        SELECT %(select_columns)s
        FROM %(dist_read_table_name)s FINAL
    """
        + where
    )

    query_args = {
        "required_columns": ", ".join(required_columns),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
        "group_ids": ", ".join(str(gid) for gid in group_ids),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    query_time_flags = (EXCLUDE_GROUPS, message["project_id"], group_ids)

    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )
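
# Hedged sketch of how the %-style templates above turn into SQL: query_args
# fills the data placeholders, and the table-name placeholders are assumed to
# be filled in a second pass by the replacer. The table name and values below
# are examples only.
count_query_template = (
    "SELECT count() FROM %(dist_read_table_name)s FINAL"
    " PREWHERE group_id IN (%(group_ids)s)"
    " WHERE project_id = %(project_id)s"
    " AND received <= CAST('%(timestamp)s' AS DateTime)"
    " AND NOT deleted"
)
example_args = {
    "dist_read_table_name": "sentry_dist",  # assumed table name
    "group_ids": "101, 102",
    "project_id": 42,
    "timestamp": "2021-01-01 00:00:00",
}
print(count_query_template % example_args)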
def _check_timing_and_write_to_redis(
    self, replacement: Replacement, start_time: float
) -> None:
    """
    Write the offset just processed to Redis if execution took longer than
    the threshold. Also store the offset locally to avoid read calls to
    Redis.

    If the consumer dies while the insert query for the message was being
    executed, the most recently executed offset would be present in Redis.
    """
    if (time.time() - start_time) < settings.REPLACER_PROCESSING_TIMEOUT_THRESHOLD:
        return
    message_metadata = replacement.get_message_metadata()
    key = self._build_topic_group_index_key(message_metadata)
    redis_client.set(
        key,
        message_metadata.offset,
        ex=settings.REPLACER_PROCESSING_TIMEOUT_THRESHOLD_KEY_TTL,
    )
    self.__last_offset_processed_per_partition[key] = message_metadata.offset
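
# Minimal sketch, with a fake in-memory Redis and assumed threshold values, of
# the behaviour documented above: only record the offset when the replacement
# took longer than the threshold, and keep a local copy to avoid reads.
import time

THRESHOLD_SECONDS = 0.1  # stand-in for REPLACER_PROCESSING_TIMEOUT_THRESHOLD
KEY_TTL_SECONDS = 300    # stand-in for the key TTL setting


class FakeRedis:
    def __init__(self) -> None:
        self.store = {}

    def set(self, key, value, ex=None):
        self.store[key] = value  # TTL ignored in this sketch


fake_redis = FakeRedis()
last_offset_processed = {}


def record_if_slow(key: str, offset: int, start_time: float) -> None:
    if (time.time() - start_time) < THRESHOLD_SECONDS:
        return
    fake_redis.set(key, offset, ex=KEY_TTL_SECONDS)
    last_offset_processed[key] = offset


start = time.time()
time.sleep(0.2)  # simulate a slow insert query
record_if_slow("events.[2]", 1234, start)
assert fake_redis.store == {"events.[2]": 1234}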
def process_exclude_groups(message: Mapping[str, Any]) -> Optional[Replacement]:
    """
    Exclude a group ID from being searched.

    This, together with process_tombstone_events and process_merge_events,
    is used by reprocessing to split up a group into multiple groups, event
    by event. Assuming a group with n events:

    1. Insert the m events that have been selected for reprocessing (with
       the same event IDs).
    2. Run process_merge_events for the n - m events that have not been
       selected, i.e. move them into a new group ID.
    3. Exclude the old group ID from search queries. This group ID must not
       receive new events.

    See docstring in `sentry.reprocessing2` for more information.
    """
    group_ids = message["group_ids"]
    if not group_ids:
        return None

    query_time_flags = (EXCLUDE_GROUPS, message["project_id"], group_ids)

    return Replacement(None, None, {}, query_time_flags)
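
# Hypothetical example of the message shape process_exclude_groups expects and
# the query-time flags it produces. No SQL is generated for this replacement:
# both query templates are None and only the flags matter at query time.
example_message = {
    "project_id": 42,
    "group_ids": [101, 102],
}
# The resulting Replacement would carry:
#   query_time_flags == (EXCLUDE_GROUPS, 42, [101, 102])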
def process_delete_tag(
    message: Mapping[str, Any],
    schema: TableSchema,
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
) -> Optional[Replacement]:
    tag = message["tag"]
    if not tag:
        return None

    assert isinstance(tag, str)

    timestamp = datetime.strptime(message["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    tag_column_name = tag_column_map["tags"].get(tag, tag)
    is_promoted = tag in promoted_tags["tags"]

    where = """\
        WHERE project_id = %(project_id)s
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    if is_promoted:
        prewhere = " PREWHERE %(tag_column)s IS NOT NULL "
    else:
        prewhere = " PREWHERE has(`tags.key`, %(tag_str)s) "

    insert_query_template = (
        """\
        INSERT INTO %(dist_write_table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(dist_read_table_name)s FINAL
    """
        + prewhere
        + where
    )

    all_columns = [
        col
        for col in schema.get_columns()
        if Materialized not in col.type.get_all_modifiers()
    ]
    select_columns = []
    for col in all_columns:
        if is_promoted and col.flattened == tag_column_name:
            select_columns.append("NULL")
        elif col.flattened == "tags.key":
            select_columns.append(
                "arrayFilter(x -> (indexOf(`tags.key`, x) != indexOf(`tags.key`, %s)), `tags.key`)"
                % escape_string(tag)
            )
        elif col.flattened == "tags.value":
            select_columns.append(
                "arrayMap(x -> arrayElement(`tags.value`, x), arrayFilter(x -> x != indexOf(`tags.key`, %s), arrayEnumerate(`tags.value`)))"
                % escape_string(tag)
            )
        elif col.flattened == "_tags_flattened":
            select_columns.append(FLATTENED_COLUMN_TEMPLATE % escape_string(tag))
        else:
            select_columns.append(col.escaped)

    all_column_names = [col.escaped for col in all_columns]
    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
        "tag_str": escape_string(tag),
        "tag_column": escape_identifier(tag_column_name),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    count_query_template = (
        """\
        SELECT count()
        FROM %(dist_read_table_name)s FINAL
    """
        + prewhere
        + where
    )

    query_time_flags = (NEEDS_FINAL, message["project_id"])

    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )
def __get_insert_executor(self, replacement: Replacement) -> InsertExecutor:
    """
    Some replacements need to be executed on each storage node of the
    cluster instead of through a query node on distributed tables. This
    happens when the number of shards changes and specific rows resolve to
    a different node than they did before.

    Example: writing the tombstone for an event id. When we change the
    number of shards, the tombstone may end up on a different shard than
    the original row.

    This returns the InsertExecutor that implements the appropriate policy
    for the replacement provided. So it can return either a basic
    QueryNodeExecutor or a ShardedExecutor that writes on each storage
    node.
    """

    def run_query(
        connection: ClickhousePool,
        query: str,
        records_count: int,
        metrics: MetricsBackend,
    ) -> None:
        t = time.time()
        logger.debug("Executing replace query: %s" % query)
        connection.execute_robust(query)
        duration = int((time.time() - t) * 1000)

        logger.info("Replacing %s rows took %sms" % (records_count, duration))
        metrics.timing(
            "replacements.count",
            records_count,
            tags={"host": connection.host},
        )
        metrics.timing(
            "replacements.duration",
            duration,
            tags={"host": connection.host},
        )

    query_table_name = self.__replacer_processor.get_schema().get_table_name()
    local_table_name = self.__replacer_processor.get_schema().get_local_table_name()
    cluster = self.__storage.get_cluster()

    query_connection = cluster.get_query_connection(ClickhouseClientSettings.REPLACE)
    write_every_node = replacement.should_write_every_node()
    query_node_executor = QueryNodeExecutor(
        runner=run_query,
        connection=query_connection,
        table=query_table_name,
        metrics=self.metrics,
    )
    if not write_every_node or cluster.is_single_node():
        return query_node_executor

    return ShardedExecutor(
        runner=run_query,
        cluster=cluster,
        thread_pool=executor,
        main_connection_pool=self.__sharded_pool,
        local_table_name=local_table_name,
        backup_executor=query_node_executor,
        metrics=self.metrics,
    )
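
# Hedged sketch of the selection policy described in the docstring above:
# default to the query-node executor unless the replacement must be written on
# every storage node and the cluster actually has more than one node. The
# function and return values below are illustrative, not Snuba APIs.
def choose_executor(write_every_node: bool, is_single_node: bool) -> str:
    if not write_every_node or is_single_node:
        return "QueryNodeExecutor"
    return "ShardedExecutor"


assert choose_executor(write_every_node=False, is_single_node=False) == "QueryNodeExecutor"
assert choose_executor(write_every_node=True, is_single_node=True) == "QueryNodeExecutor"
assert choose_executor(write_every_node=True, is_single_node=False) == "ShardedExecutor"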
def execute(self, replacement: Replacement, records_count: int) -> int:
    query = replacement.get_insert_query(self.__table)
    if query is None:
        return 0
    self.__runner(self.__connection, query, records_count, self.__metrics)
    return records_count
def process_delete_tag(
    message: Mapping[str, Any],
    all_columns: Sequence[FlattenedColumn],
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
    use_promoted_prewhere: bool,
    schema: WritableTableSchema,
) -> Optional[Replacement]:
    tag = message["tag"]
    if not tag:
        return None

    assert isinstance(tag, str)

    timestamp = datetime.strptime(message["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    tag_column_name = tag_column_map["tags"].get(tag, tag)
    is_promoted = tag in promoted_tags["tags"]

    where = """\
        WHERE project_id = %(project_id)s
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    if is_promoted and use_promoted_prewhere:
        prewhere = " PREWHERE %(tag_column)s IS NOT NULL "
    else:
        prewhere = " PREWHERE has(`tags.key`, %(tag_str)s) "

    insert_query_template = (
        """\
        INSERT INTO %(table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(table_name)s FINAL
    """
        + prewhere
        + where
    )

    select_columns = []
    for col in all_columns:
        if is_promoted and col.flattened == tag_column_name:
            # The promoted tag columns of events are non nullable, but those
            # of errors are nullable. We check the column against the schema
            # to determine whether to write an empty string or NULL.
            column_type = schema.get_data_source().get_columns().get(tag_column_name)
            assert column_type is not None
            is_nullable = column_type.type.has_modifier(Nullable)
            if is_nullable:
                select_columns.append("NULL")
            else:
                select_columns.append("''")
        elif col.flattened == "tags.key":
            select_columns.append(
                "arrayFilter(x -> (indexOf(`tags.key`, x) != indexOf(`tags.key`, %s)), `tags.key`)"
                % escape_string(tag)
            )
        elif col.flattened == "tags.value":
            select_columns.append(
                "arrayMap(x -> arrayElement(`tags.value`, x), arrayFilter(x -> x != indexOf(`tags.key`, %s), arrayEnumerate(`tags.value`)))"
                % escape_string(tag)
            )
        else:
            select_columns.append(col.escaped)

    all_column_names = [col.escaped for col in all_columns]
    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
        "tag_str": escape_string(tag),
        "tag_column": escape_identifier(tag_column_name),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    count_query_template = (
        """\
        SELECT count()
        FROM %(table_name)s FINAL
    """
        + prewhere
        + where
    )

    query_time_flags = (NEEDS_FINAL, message["project_id"])

    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )
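
# Pure-Python simulation (not ClickHouse) of what the arrayFilter/arrayMap
# expressions above do to the parallel tags.key / tags.value arrays when a
# single tag, here "browser", is deleted. The tag data is an example.
keys = ["browser", "os", "release"]
values = ["firefox", "linux", "1.0"]
drop = keys.index("browser")
new_keys = [k for i, k in enumerate(keys) if i != drop]
new_values = [v for i, v in enumerate(values) if i != drop]
assert new_keys == ["os", "release"]
assert new_values == ["linux", "1.0"]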