Example #1
    def execute(self, replacement: Replacement, records_count: int) -> int:
        try:
            query = replacement.get_insert_query(self.__local_table_name)
            if query is None:
                return 0
            result_futures = []
            for nodes in self.__connection_pool.get_connections().values():
                result_futures.append(
                    self.__thread_pool.submit(
                        partial(
                            self.__run_multiple_replicas,
                            nodes=nodes,
                            query=query,
                            records_count=records_count,
                            metrics=self.__metrics,
                        )))
            for result in as_completed(result_futures):
                e = result.exception()
                if e is not None:
                    raise e
            return records_count

        except Exception as e:
            count = self.__backup_executor.execute(replacement, records_count)
            logger.warning(
                "Replacement processing failed on the main connection",
                exc_info=e,
            )
            return count
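
The method above fans one insert out per group of shard replicas and re-raises the first failure so the backup executor (a query-node write) can take over. Below is a minimal, self-contained sketch of the same fan-out-and-first-exception pattern; the shard runner is hypothetical, not the actual __run_multiple_replicas.

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_on_shard(shard: int) -> None:
    # Hypothetical per-shard work; shard 2 fails to demonstrate the fallback path.
    if shard == 2:
        raise RuntimeError("shard 2 unreachable")

with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(run_on_shard, shard) for shard in range(4)]
    try:
        # Surface the first exception, exactly as the executor above does.
        for future in as_completed(futures):
            exc = future.exception()
            if exc is not None:
                raise exc
        print("all shards succeeded")
    except Exception as exc:
        print(f"falling back to backup executor: {exc}")
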
Example #2
def process_merge(
    message: Mapping[str, Any], all_column_names: Sequence[str]
) -> Optional[Replacement]:
    # HACK: We were sending duplicates of the `end_merge` message from Sentry,
    # this is only for performance of the backlog.
    txn = message.get("transaction_id")
    if txn:
        if txn in SEEN_MERGE_TXN_CACHE:
            return None
        else:
            SEEN_MERGE_TXN_CACHE.append(txn)

    previous_group_ids = message["previous_group_ids"]
    if not previous_group_ids:
        return None

    assert all(isinstance(gid, int) for gid in previous_group_ids)
    timestamp = datetime.strptime(message["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    select_columns = map(
        lambda i: i if i != "group_id" else str(message["new_group_id"]),
        all_column_names,
    )

    where = """\
        PREWHERE group_id IN (%(previous_group_ids)s)
        WHERE project_id = %(project_id)s
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    count_query_template = (
        """\
        SELECT count()
        FROM %(dist_read_table_name)s FINAL
    """
        + where
    )

    insert_query_template = (
        """\
        INSERT INTO %(dist_write_table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(dist_read_table_name)s FINAL
    """
        + where
    )

    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
        "previous_group_ids": ", ".join(str(gid) for gid in previous_group_ids),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    query_time_flags = (EXCLUDE_GROUPS, message["project_id"], previous_group_ids)

    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )
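
The templates above deliberately leave %(dist_read_table_name)s and %(dist_write_table_name)s unresolved; only the replacer knows the concrete table names. A hedged sketch of the final rendering, reusing the names from the function above and assuming the executor merges the table names into query_args before applying Python %-formatting ("errors_dist" is a made-up table name):

# Hypothetical rendering step; the merge of table names into the argument dict
# is an assumption about the replacer, not code from this excerpt.
final_args = {
    **query_args,
    "dist_read_table_name": "errors_dist",
    "dist_write_table_name": "errors_dist",
}
count_query = count_query_template % final_args
insert_query = insert_query_template % final_args
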
Example #3
def _build_event_tombstone_replacement(
    message: Mapping[str, Any],
    required_columns: Sequence[str],
    where: str,
    query_args: Mapping[str, str],
    query_time_flags: Tuple[Any, ...],
) -> Replacement:
    select_columns = map(lambda i: i if i != "deleted" else "1", required_columns)
    count_query_template = (
        """\
        SELECT count()
        FROM %(table_name)s FINAL
    """
        + where
    )

    insert_query_template = (
        """\
        INSERT INTO %(table_name)s (%(required_columns)s)
        SELECT %(select_columns)s
        FROM %(table_name)s FINAL
    """
        + where
    )

    final_query_args = {
        "required_columns": ", ".join(required_columns),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
    }
    final_query_args.update(query_args)

    return Replacement(
        count_query_template, insert_query_template, final_query_args, query_time_flags
    )
Example #4
def process_unmerge(
    message: Mapping[str, Any],
    all_columns: Sequence[FlattenedColumn],
    state_name: ReplacerState,
) -> Optional[Replacement]:
    hashes = message["hashes"]
    if not hashes:
        return None

    assert all(isinstance(h, str) for h in hashes)

    timestamp = datetime.strptime(message["datetime"],
                                  settings.PAYLOAD_DATETIME_FORMAT)
    all_column_names = [c.escaped for c in all_columns]
    select_columns = map(
        lambda i: i if i != "group_id" else str(message["new_group_id"]),
        all_column_names,
    )

    where = """\
        PREWHERE group_id = %(previous_group_id)s
        WHERE project_id = %(project_id)s
        AND primary_hash IN (%(hashes)s)
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    count_query_template = ("""\
        SELECT count()
        FROM %(table_name)s FINAL
    """ + where)

    insert_query_template = ("""\
        INSERT INTO %(table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(table_name)s FINAL
    """ + where)

    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "previous_group_id": message["previous_group_id"],
        "project_id": message["project_id"],
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    if state_name == ReplacerState.ERRORS:
        query_args["hashes"] = ", ".join(
            ["'%s'" % str(uuid.UUID(_hashify(h))) for h in hashes])
    else:
        query_args["hashes"] = ", ".join("'%s'" % _hashify(h) for h in hashes)

    query_time_flags = (NEEDS_FINAL, message["project_id"])

    return Replacement(count_query_template, insert_query_template, query_args,
                       query_time_flags)
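
The only state-dependent part above is how "hashes" is quoted: for ReplacerState.ERRORS each digest is re-rendered as a dashed UUID string, otherwise it is quoted as-is. A hedged illustration, assuming _hashify (not shown in this excerpt) returns a 32-character hex digest:

# Hypothetical digest value; _hashify's definition is not part of this excerpt.
import uuid

hex_hash = "00000000000000000000000000abc123"
errors_style = "'%s'" % str(uuid.UUID(hex_hash))  # "'00000000-0000-0000-0000-000000abc123'"
default_style = "'%s'" % hex_hash                 # "'00000000000000000000000000abc123'"
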
Example #5
def _build_group_replacement(
    txn: Optional[str],
    project_id: int,
    new_group_id: str,
    where: str,
    query_args: Mapping[str, str],
    query_time_flags: Tuple[Any, ...],
    all_columns: Sequence[FlattenedColumn],
) -> Optional[Replacement]:
    # HACK: We were sending duplicates of the `end_merge` message from Sentry,
    # this is only for performance of the backlog.
    if txn:
        if txn in SEEN_MERGE_TXN_CACHE:
            return None
        else:
            SEEN_MERGE_TXN_CACHE.append(txn)

    all_column_names = [c.escaped for c in all_columns]
    select_columns = map(
        lambda i: i if i != "group_id" else str(new_group_id), all_column_names,
    )

    count_query_template = (
        """\
        SELECT count()
        FROM %(table_name)s FINAL
    """
        + where
    )

    insert_query_template = (
        """\
        INSERT INTO %(table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(table_name)s FINAL
    """
        + where
    )

    final_query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": project_id,
    }
    final_query_args.update(query_args)

    return Replacement(
        count_query_template, insert_query_template, final_query_args, query_time_flags
    )
Example #6
def process_delete_groups(
    message: Mapping[str, Any], required_columns: Sequence[str]
) -> Optional[Replacement]:
    group_ids = message["group_ids"]
    if not group_ids:
        return None

    assert all(isinstance(gid, int) for gid in group_ids)
    timestamp = datetime.strptime(message["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    select_columns = map(lambda i: i if i != "deleted" else "1", required_columns)

    where = """\
        PREWHERE group_id IN (%(group_ids)s)
        WHERE project_id = %(project_id)s
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    count_query_template = (
        """\
        SELECT count()
        FROM %(dist_read_table_name)s FINAL
    """
        + where
    )

    insert_query_template = (
        """\
        INSERT INTO %(dist_write_table_name)s (%(required_columns)s)
        SELECT %(select_columns)s
        FROM %(dist_read_table_name)s FINAL
    """
        + where
    )

    query_args = {
        "required_columns": ", ".join(required_columns),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
        "group_ids": ", ".join(str(gid) for gid in group_ids),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    query_time_flags = (EXCLUDE_GROUPS, message["project_id"], group_ids)

    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )
Example #7
    def _check_timing_and_write_to_redis(self, replacement: Replacement,
                                         start_time: float) -> None:
        """
        Write the offset just processed to Redis if execution took longer than the threshold.
        Also store the offset locally to avoid Read calls to Redis.

        If the Consumer dies while the insert query for the message was being executed,
        the most recently executed offset would be present in Redis.
        """
        elapsed = time.time() - start_time
        if elapsed < settings.REPLACER_PROCESSING_TIMEOUT_THRESHOLD:
            return
        message_metadata = replacement.get_message_metadata()
        key = self._build_topic_group_index_key(message_metadata)
        redis_client.set(
            key,
            message_metadata.offset,
            ex=settings.REPLACER_PROCESSING_TIMEOUT_THRESHOLD_KEY_TTL,
        )
        self.__last_offset_processed_per_partition[key] = message_metadata.offset
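
On restart, the stored offset can presumably be read back so already-executed messages are not replayed. A hedged sketch of that read path, assuming the key produced by _build_topic_group_index_key is reused and that a message is skipped when its offset is at or below the stored one (the actual recovery code is not part of this excerpt):

# Hypothetical recovery check; the skip rule and the key reuse are assumptions.
def should_skip(message_metadata, key: str) -> bool:
    stored = redis_client.get(key)
    if stored is None:
        return False
    return message_metadata.offset <= int(stored)
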
Example #8
def process_exclude_groups(message: Mapping[str, Any]) -> Optional[Replacement]:
    """
    Exclude a group ID from being searched.

    This together with process_tombstone_events and process_merge_events is
    used by reprocessing to split up a group into multiple, event by event.
    Assuming a group with n events:

    1. insert m events that have been selected for reprocessing (with same event ID).
    2. process_merge_events for n - m events that have not been selected, i.e.
       move them into a new group ID
    3. exclude old group ID from search queries. This group ID must not receive
       new events.

    See docstring in `sentry.reprocessing2` for more information.
    """

    group_ids = message["group_ids"]
    if not group_ids:
        return None

    query_time_flags = (EXCLUDE_GROUPS, message["project_id"], group_ids)
    return Replacement(None, None, {}, query_time_flags)
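
Since this replacement carries no SQL at all, everything it does is driven by query_time_flags. A minimal call with a hypothetical payload; only the "group_ids" and "project_id" keys are actually read:

# Hypothetical reprocessing message; the values are made up.
message = {
    "project_id": 42,
    "group_ids": [1001, 1002],
}
replacement = process_exclude_groups(message)
# The two query templates in the returned Replacement are None; only
# (EXCLUDE_GROUPS, 42, [1001, 1002]) is carried as the query-time flags.
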
Example #9
def process_delete_tag(
    message: Mapping[str, Any],
    schema: TableSchema,
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
) -> Optional[Replacement]:
    tag = message["tag"]
    if not tag:
        return None

    assert isinstance(tag, str)
    timestamp = datetime.strptime(message["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    tag_column_name = tag_column_map["tags"].get(tag, tag)
    is_promoted = tag in promoted_tags["tags"]

    where = """\
        WHERE project_id = %(project_id)s
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    if is_promoted:
        prewhere = " PREWHERE %(tag_column)s IS NOT NULL "
    else:
        prewhere = " PREWHERE has(`tags.key`, %(tag_str)s) "

    insert_query_template = (
        """\
        INSERT INTO %(dist_write_table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(dist_read_table_name)s FINAL
    """
        + prewhere + where
    )

    all_columns = [
        col
        for col in schema.get_columns()
        if Materialized not in col.type.get_all_modifiers()
    ]
    select_columns = []
    for col in all_columns:
        if is_promoted and col.flattened == tag_column_name:
            select_columns.append("NULL")
        elif col.flattened == "tags.key":
            select_columns.append(
                "arrayFilter(x -> (indexOf(`tags.key`, x) != indexOf(`tags.key`, %s)), `tags.key`)"
                % escape_string(tag)
            )
        elif col.flattened == "tags.value":
            select_columns.append(
                "arrayMap(x -> arrayElement(`tags.value`, x), arrayFilter(x -> x != indexOf(`tags.key`, %s), arrayEnumerate(`tags.value`)))"
                % escape_string(tag)
            )
        elif col.flattened == "_tags_flattened":
            select_columns.append(FLATTENED_COLUMN_TEMPLATE % escape_string(tag))
        else:
            select_columns.append(col.escaped)

    all_column_names = [col.escaped for col in all_columns]
    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
        "tag_str": escape_string(tag),
        "tag_column": escape_identifier(tag_column_name),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    count_query_template = (
        """\
        SELECT count()
        FROM %(dist_read_table_name)s FINAL
    """
        + prewhere + where
    )

    query_time_flags = (NEEDS_FINAL, message["project_id"])

    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )
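
The arrayFilter/arrayMap expressions above drop one entry from the parallel tags.key / tags.value arrays while leaving the rest untouched. A small pure-Python sketch of the same transformation (ClickHouse's indexOf is 1-based, hence the +1):

# Python equivalent of the ClickHouse array expressions used above.
def delete_tag(keys, values, tag):
    idx = keys.index(tag) + 1 if tag in keys else 0  # indexOf: 1-based, 0 if absent
    new_keys = [k for k in keys if keys.index(k) + 1 != idx]
    new_values = [values[i - 1] for i in range(1, len(values) + 1) if i != idx]
    return new_keys, new_values

assert delete_tag(["a", "b", "c"], ["1", "2", "3"], "b") == (["a", "c"], ["1", "3"])
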
Example #10
    def __get_insert_executor(self,
                              replacement: Replacement) -> InsertExecutor:
        """
        Some replacements need to be executed on each storage node of the
        cluster instead that through a query node on distributed tables.
        This happens when the number of shards changes and specific rows
        resolve to a different node than they were doing before.

        example: writing the tombstone for an event id. When we change the
        shards number the tombstone may end up on a different shard than
        the original row.

        This returns the InsertExecutor that implements the appropriate
        policy for the replacement provided. So it can return either a basic
        DistributedExecutor or a ShardedExecutor to write on each storage
        node.
        """
        def run_query(
            connection: ClickhousePool,
            query: str,
            records_count: int,
            metrics: MetricsBackend,
        ) -> None:
            t = time.time()

            logger.debug("Executing replace query: %s" % query)
            connection.execute_robust(query)
            duration = int((time.time() - t) * 1000)

            logger.info("Replacing %s rows took %sms" %
                        (records_count, duration))
            metrics.timing(
                "replacements.count",
                records_count,
                tags={"host": connection.host},
            )
            metrics.timing(
                "replacements.duration",
                duration,
                tags={"host": connection.host},
            )

        query_table_name = self.__replacer_processor.get_schema().get_table_name()
        local_table_name = self.__replacer_processor.get_schema().get_local_table_name()
        cluster = self.__storage.get_cluster()

        query_connection = cluster.get_query_connection(
            ClickhouseClientSettings.REPLACE)
        write_every_node = replacement.should_write_every_node()
        query_node_executor = QueryNodeExecutor(
            runner=run_query,
            connection=query_connection,
            table=query_table_name,
            metrics=self.metrics,
        )
        if not write_every_node or cluster.is_single_node():
            return query_node_executor

        return ShardedExecutor(
            runner=run_query,
            cluster=cluster,
            thread_pool=executor,
            main_connection_pool=self.__sharded_pool,
            local_table_name=local_table_name,
            backup_executor=query_node_executor,
            metrics=self.metrics,
        )
Example #11
    def execute(self, replacement: Replacement, records_count: int) -> int:
        query = replacement.get_insert_query(self.__table)
        if query is None:
            return 0
        self.__runner(self.__connection, query, records_count, self.__metrics)
        return records_count
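
This is the single-connection counterpart to Example #1: it renders the insert query for its table and delegates to the runner. A hedged wiring sketch, assuming the constructor keyword arguments shown in Example #10 (runner, connection, table, metrics); the table name and record count are illustrative only:

# Hypothetical wiring; "errors_dist" and records_count=100 are made up.
executor = QueryNodeExecutor(
    runner=run_query,            # the runner defined in Example #10
    connection=query_connection,
    table="errors_dist",
    metrics=metrics,
)
processed = executor.execute(replacement, records_count=100)
# Returns 100, or 0 if the replacement has no insert query for this table.
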
Example #12
def process_delete_tag(
    message: Mapping[str, Any],
    all_columns: Sequence[FlattenedColumn],
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
    use_promoted_prewhere: bool,
    schema: WritableTableSchema,
) -> Optional[Replacement]:
    tag = message["tag"]
    if not tag:
        return None

    assert isinstance(tag, str)
    timestamp = datetime.strptime(message["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    tag_column_name = tag_column_map["tags"].get(tag, tag)
    is_promoted = tag in promoted_tags["tags"]

    where = """\
        WHERE project_id = %(project_id)s
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    if is_promoted and use_promoted_prewhere:
        prewhere = " PREWHERE %(tag_column)s IS NOT NULL "
    else:
        prewhere = " PREWHERE has(`tags.key`, %(tag_str)s) "

    insert_query_template = (
        """\
        INSERT INTO %(table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(table_name)s FINAL
    """
        + prewhere
        + where
    )

    select_columns = []
    for col in all_columns:
        if is_promoted and col.flattened == tag_column_name:
            # The promoted tag columns of events are non-nullable, but those of
            # errors are nullable. We check the column against the schema to
            # determine whether to write an empty string or NULL.
            column_type = schema.get_data_source().get_columns().get(tag_column_name)
            assert column_type is not None
            is_nullable = column_type.type.has_modifier(Nullable)
            if is_nullable:
                select_columns.append("NULL")
            else:
                select_columns.append("''")
        elif col.flattened == "tags.key":
            select_columns.append(
                "arrayFilter(x -> (indexOf(`tags.key`, x) != indexOf(`tags.key`, %s)), `tags.key`)"
                % escape_string(tag)
            )
        elif col.flattened == "tags.value":
            select_columns.append(
                "arrayMap(x -> arrayElement(`tags.value`, x), arrayFilter(x -> x != indexOf(`tags.key`, %s), arrayEnumerate(`tags.value`)))"
                % escape_string(tag)
            )
        else:
            select_columns.append(col.escaped)

    all_column_names = [col.escaped for col in all_columns]
    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
        "tag_str": escape_string(tag),
        "tag_column": escape_identifier(tag_column_name),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    count_query_template = (
        """\
        SELECT count()
        FROM %(table_name)s FINAL
    """
        + prewhere
        + where
    )

    query_time_flags = (NEEDS_FINAL, message["project_id"])

    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )