Example #1
    def replicated_table_operations(self, table: TableMigrationData):
        yield AsyncMigrationOperationSQL(
            sql=f"""
            CREATE TABLE {table.tmp_table_name} AS {table.name}
            ENGINE = {self.get_new_engine(table)}
            """,
            rollback=f"DROP TABLE IF EXISTS {table.tmp_table_name}",
        )

        if table.kafka_table_name is not None:
            yield AsyncMigrationOperationSQL(
                sql=f"DROP TABLE IF EXISTS {table.kafka_table_name}",
                rollback=cast(str, table.create_kafka_table))

        yield AsyncMigrationOperation(
            fn=lambda _: self.move_partitions(table.name, table.tmp_table_name),
            rollback_fn=lambda _: self.move_partitions(table.tmp_table_name, table.name),
        )

        yield AsyncMigrationOperation(
            fn=lambda _: self.rename_tables(
                [table.name, table.backup_table_name],
                [table.tmp_table_name, table.renamed_table_name],
            ),
            rollback_fn=lambda _: self.rename_tables(
                [table.renamed_table_name, table.tmp_table_name],
                [table.backup_table_name, table.name],
                verification_table=table.backup_table_name,
            ),
        )
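Note: `move_partitions` is not shown in this example. A minimal sketch of what it could look like, assuming the `sync_execute` helper from the later examples and ClickHouse's standard `ALTER TABLE ... MOVE PARTITION ... TO TABLE` statement; the real helper may differ:

def move_partitions(from_table: str, to_table: str):
    # Enumerate the active partitions of the source table...
    rows = sync_execute(
        "SELECT DISTINCT partition_id FROM system.parts WHERE table = %(table)s AND active",
        {"table": from_table},
    )
    # ...and move each one to the target table.
    for (partition_id,) in rows:
        sync_execute(f"ALTER TABLE {from_table} MOVE PARTITION ID '{partition_id}' TO TABLE {to_table}")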
Example #2
    def finalize_table_operations(self, table: TableMigrationData):
        # NOTE: Relies on IF NOT EXISTS in the CREATE queries
        if isinstance(table, ShardedTableMigrationData):
            for table_name, create_table_query in table.extra_tables.items():
                yield AsyncMigrationOperationSQL(
                    sql=create_table_query,
                    rollback=f"DROP TABLE IF EXISTS {table_name}")

        if isinstance(table, ShardedTableMigrationData) and table.materialized_view_name is not None:
            yield AsyncMigrationOperationSQL(
                sql=f"DROP TABLE IF EXISTS {table.materialized_view_name}",
                rollback=None,
            )

        if table.kafka_table_name is not None:
            yield AsyncMigrationOperationSQL(
                sql=cast(str, table.create_kafka_table),
                rollback=f"DROP TABLE IF EXISTS {table.kafka_table_name}")

        if isinstance(table, ShardedTableMigrationData) and table.materialized_view_name is not None:
            yield AsyncMigrationOperationSQL(
                sql=table.create_materialized_view,
                rollback=f"DROP TABLE IF EXISTS {table.materialized_view_name}",
            )
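Ordering note: the extra shard tables are created first (relying on IF NOT EXISTS, per the comment above), the old materialized view is dropped before anything is recreated, and only then are the Kafka table and a fresh materialized view brought back, so ingestion resumes against the migrated tables. Since every DROP uses IF EXISTS, the whole generator is safe to re-run.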
Example #3
    def operations(self):
        TABLE_MIGRATION_OPERATIONS = list(
            flatten([
                list(self.replicated_table_operations(table))
                for table in self.tables_to_migrate()
            ]))
        RE_ENABLE_INGESTION_OPERATIONS = list(
            flatten([
                list(self.finalize_table_operations(table))
                for table in self.tables_to_migrate()
            ]))

        return [
            AsyncMigrationOperationSQL(sql="SYSTEM STOP MERGES",
                                       rollback="SYSTEM START MERGES"),
            # Pause materialized column computation while tables are swapped out
            AsyncMigrationOperation(
                fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", False),
                rollback_fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", True),
            ),
            *TABLE_MIGRATION_OPERATIONS,
            *RE_ENABLE_INGESTION_OPERATIONS,
            # Resume materialized column computation once ingestion is back
            AsyncMigrationOperation(
                fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", True),
                rollback_fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", False),
            ),
            AsyncMigrationOperationSQL(
                sql="SYSTEM START MERGES",
                rollback="SYSTEM STOP MERGES",
            ),
        ]
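Note: `flatten` comes from a shared util that is not shown; a minimal stand-in with the one-level behaviour the comprehensions above rely on:

from itertools import chain
from typing import Iterable, List, TypeVar

T = TypeVar("T")

def flatten(list_of_lists: Iterable[Iterable[T]]) -> List[T]:
    # One level deep, matching how the per-table operation lists are merged above.
    return list(chain.from_iterable(list_of_lists))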
Example #4
class Migration(AsyncMigrationDefinition):

    # For testing only!!
    fail = False
    error_message = "Healthcheck failed"

    description = "Another example async migration that's less realistic and used in tests."

    sec = SideEffects()

    operations = [
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.POSTGRES,
            sql="CREATE TABLE test_async_migration ( key VARCHAR, value VARCHAR )",
            rollback="DROP TABLE test_async_migration",
        ),
        AsyncMigrationOperation(
            fn=sec.side_effect,
            rollback_fn=sec.side_effect_rollback,
        ),
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.POSTGRES,
            sql="INSERT INTO test_async_migration (key, value) VALUES ('a', 'b')",
            rollback="TRUNCATE TABLE test_async_migration",
        ),
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.POSTGRES,
            sql="SELECT pg_sleep(1)",
            rollback=None,
        ),
        AsyncMigrationOperation(
            fn=sec.side_effect,
            rollback_fn=sec.side_effect_rollback,
        ),
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.POSTGRES,
            sql="UPDATE test_async_migration SET value='c' WHERE key='a'",
            rollback="UPDATE test_async_migration SET value='b' WHERE key='a'",
        ),
        AsyncMigrationOperation(
            fn=sec.side_effect,
            rollback_fn=sec.side_effect_rollback,
        ),
    ]

    def healthcheck(self):
        if self.fail:
            return (False, self.error_message)

        return (True, None)
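A sketch of how the test-only `fail` flag might be exercised; the constructor argument is an assumption, since `AsyncMigrationDefinition`'s signature is not shown here:

migration = Migration("example_test_migration")  # constructor argument assumed
assert migration.healthcheck() == (True, None)

migration.fail = True  # the test-only switch defined on the class above
assert migration.healthcheck() == (False, "Healthcheck failed")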
Example #5
    def migrate_team_operation(self, team_id: int):
        return AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=f"""
                INSERT INTO person_distinct_id2(team_id, distinct_id, person_id, is_deleted, version)
                SELECT
                    team_id,
                    distinct_id,
                    argMax(person_id, _timestamp) as person_id,
                    0 as is_deleted,
                    0 as version
                FROM (
                    SELECT
                        distinct_id,
                        person_id,
                        any(team_id) as team_id,
                        max(_timestamp) as _timestamp
                    FROM
                        person_distinct_id
                    WHERE
                        person_distinct_id.team_id = {team_id}
                    GROUP BY
                        person_id,
                        distinct_id
                    HAVING
                        max(is_deleted) = 0
                )
                GROUP BY team_id, distinct_id
            """,
            rollback=None,
        )
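Reading the query inside-out: the inner SELECT collapses person_distinct_id to one surviving row per (person_id, distinct_id), keeping only mappings whose latest state is not deleted (HAVING max(is_deleted) = 0). The outer argMax(person_id, _timestamp) then resolves each distinct_id to the person_id with the newest timestamp, and the rows are inserted with version = 0, the baseline person_distinct_id2's version column expects.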
Example #6
def generate_insert_into_op(partition_gte: int, partition_lt: Optional[int] = None) -> AsyncMigrationOperation:
    lt_expression = f"AND toYYYYMM(timestamp) < {partition_lt}" if partition_lt else ""
    op = AsyncMigrationOperationSQL(
        database=AnalyticsDBMS.CLICKHOUSE,
        sql=f"""
        INSERT INTO {TEMPORARY_TABLE_NAME}
        SELECT *
        FROM {EVENTS_TABLE}
        WHERE
            toYYYYMM(timestamp) >= {partition_gte} {lt_expression}
        """,
        rollback=f"TRUNCATE TABLE IF EXISTS {TEMPORARY_TABLE_NAME} ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
        timeout_seconds=2 * 24 * 60 * 60,  # two days
    )
    return op
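Usage sketch (partition values hypothetical): consecutive toYYYYMM partitions are paired so each INSERT covers a half-open range, with an open-ended operation for the newest partition, mirroring Example #9:

partitions = [202101, 202102, 202103]  # hypothetical toYYYYMM values

ops = [generate_insert_into_op(gte, lt) for gte, lt in zip(partitions, partitions[1:])]
ops.append(generate_insert_into_op(partitions[-1]))  # open-ended tail, no upper bound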
Example #7
import pytest
from unittest.mock import patch

from posthog.async_migrations.definition import AsyncMigrationOperationSQL
from posthog.async_migrations.test.util import create_async_migration
from posthog.async_migrations.utils import (
    complete_migration,
    execute_op,
    force_stop_migration,
    process_error,
    trigger_migration,
)
from posthog.constants import AnalyticsDBMS
from posthog.models.async_migration import AsyncMigrationError, MigrationStatus
from posthog.test.base import BaseTest

DEFAULT_CH_OP = AsyncMigrationOperationSQL(sql="SELECT 1",
                                           rollback=None,
                                           timeout_seconds=10)

DEFAULT_POSTGRES_OP = AsyncMigrationOperationSQL(
    database=AnalyticsDBMS.POSTGRES, sql="SELECT 1", rollback=None)


class TestUtils(BaseTest):
    @pytest.mark.ee
    @patch("posthog.client.sync_execute")
    def test_execute_op_clickhouse(self, mock_sync_execute):
        execute_op(DEFAULT_CH_OP, "some_id")

        # correctly routes to ch
        mock_sync_execute.assert_called_once_with(
            "/* some_id */ SELECT 1", settings={"max_execution_time": 10})
Example #8
class Migration(AsyncMigrationDefinition):

    description = "An example async migration."

    posthog_min_version = "1.29.0"
    posthog_max_version = "1.30.0"

    service_version_requirements = [
        ServiceVersionRequirement(service="clickhouse",
                                  supported_version=">=21.6.0,<21.7.0"),
    ]

    operations = [
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=PERSONS_DISTINCT_ID_TABLE_SQL().replace(
                PERSONS_DISTINCT_ID_TABLE, TEMPORARY_TABLE_NAME, 1),
            rollback=f"DROP TABLE IF EXISTS {TEMPORARY_TABLE_NAME} ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
        ),
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=f"DROP TABLE person_distinct_id_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
            rollback=PERSONS_DISTINCT_ID_TABLE_MV_SQL,
        ),
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=f"DROP TABLE kafka_person_distinct_id ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
            rollback=KAFKA_PERSONS_DISTINCT_ID_TABLE_SQL(),
        ),
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=f"""
                INSERT INTO {TEMPORARY_TABLE_NAME} (distinct_id, person_id, team_id, _sign, _timestamp, _offset)
                SELECT
                    distinct_id,
                    person_id,
                    team_id,
                    if(is_deleted==0, 1, -1) as _sign,
                    _timestamp,
                    _offset
                FROM {PERSONS_DISTINCT_ID_TABLE}
            """,
            rollback=f"DROP TABLE IF EXISTS {TEMPORARY_TABLE_NAME}",
        ),
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=f"""
                RENAME TABLE
                    {CLICKHOUSE_DATABASE}.{PERSONS_DISTINCT_ID_TABLE} to {CLICKHOUSE_DATABASE}.person_distinct_id_async_migration_backup,
                    {CLICKHOUSE_DATABASE}.{TEMPORARY_TABLE_NAME} to {CLICKHOUSE_DATABASE}.{PERSONS_DISTINCT_ID_TABLE}
                ON CLUSTER '{CLICKHOUSE_CLUSTER}'
            """,
            rollback=f"""
                RENAME TABLE
                    {CLICKHOUSE_DATABASE}.{PERSONS_DISTINCT_ID_TABLE} to {CLICKHOUSE_DATABASE}.{TEMPORARY_TABLE_NAME}
                    {CLICKHOUSE_DATABASE}.person_distinct_id_async_migration_backup to {CLICKHOUSE_DATABASE}.person_distinct_id,
                ON CLUSTER '{CLICKHOUSE_CLUSTER}'
            """,
        ),
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=KAFKA_PERSONS_DISTINCT_ID_TABLE_SQL(),
            rollback=f"DROP TABLE IF EXISTS kafka_person_distinct_id ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
        ),
        AsyncMigrationOperationSQL(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=PERSONS_DISTINCT_ID_TABLE_MV_SQL,
            rollback=f"DROP TABLE IF EXISTS person_distinct_id_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
        ),
        AsyncMigrationOperation(fn=example_fn,
                                rollback_fn=example_rollback_fn),
    ]

    def healthcheck(self):
        result = sync_execute("SELECT total_space, free_space FROM system.disks")
        total_space, free_space = result[0]
        if free_space > total_space / 3:
            return (True, None)
        else:
            return (False, "Upgrade your ClickHouse storage.")

    def progress(self, _):
        result = sync_execute(f"SELECT COUNT(1) FROM {TEMPORARY_TABLE_NAME}")
        result2 = sync_execute(
            f"SELECT COUNT(1) FROM {PERSONS_DISTINCT_ID_TABLE}")
        total_events_to_move = result2[0][0]
        total_events_moved = result[0][0]

        progress = 100 * total_events_moved / total_events_to_move
        return progress

    def is_required(self):
        res = sync_execute("SHOW CREATE TABLE person_distinct_id")
        return "ReplacingMergeTree" in res[0][0]
Example #9
    def operations(self):
        if self._events_table_engine() == "Distributed":
            # Note: This _should_ be impossible but hard to ensure.
            raise RuntimeError(
                "Cannot run the migration as `events` table is already Distributed engine."
            )

        create_table_op: List[AsyncMigrationOperation] = [
            AsyncMigrationOperationSQL(
                database=AnalyticsDBMS.CLICKHOUSE,
                sql=f"""
                CREATE TABLE IF NOT EXISTS {TEMPORARY_TABLE_NAME} ON CLUSTER '{CLICKHOUSE_CLUSTER}' AS {EVENTS_TABLE_NAME}
                ENGINE = ReplacingMergeTree(_timestamp)
                PARTITION BY toYYYYMM(timestamp)
                ORDER BY (team_id, toDate(timestamp), event, cityHash64(distinct_id), cityHash64(uuid))
                SAMPLE BY cityHash64(distinct_id)
                """,
                rollback=f"DROP TABLE IF EXISTS {TEMPORARY_TABLE_NAME} ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
            )
        ]

        old_partition_ops = []
        previous_partition = self._partitions[0] if len(self._partitions) > 0 else None
        for partition in self._partitions[1:]:
            old_partition_ops.append(
                generate_insert_into_op(previous_partition, partition))
            previous_partition = partition

        detach_mv_ops = [
            AsyncMigrationOperation(
                fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", False),
                rollback_fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", True),
            ),
            AsyncMigrationOperationSQL(
                database=AnalyticsDBMS.CLICKHOUSE,
                sql=f"DETACH TABLE {EVENTS_TABLE_NAME}_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
                rollback=f"ATTACH TABLE {EVENTS_TABLE_NAME}_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
            ),
        ]

        last_partition_op = [
            generate_insert_into_op(
                self._partitions[-1] if len(self._partitions) > 0 else 0)
        ]

        def optimize_table_fn(query_id):
            default_timeout = ASYNC_MIGRATIONS_DEFAULT_TIMEOUT_SECONDS
            try:
                execute_op_clickhouse(
                    f"OPTIMIZE TABLE {EVENTS_TABLE_NAME} FINAL",
                    query_id,
                    settings={
                        "max_execution_time": default_timeout,
                        "send_timeout": default_timeout,
                        "receive_timeout": default_timeout,
                    },
                )
            except Exception:  # TODO: swallow only the timeout error here (see the sketch after this example)
                pass

        post_insert_ops = [
            AsyncMigrationOperationSQL(
                database=AnalyticsDBMS.CLICKHOUSE,
                sql=f"""
                    RENAME TABLE
                        {EVENTS_TABLE_NAME} to {BACKUP_TABLE_NAME},
                        {TEMPORARY_TABLE_NAME} to {EVENTS_TABLE_NAME}
                    ON CLUSTER '{CLICKHOUSE_CLUSTER}'
                """,
                rollback=f"""
                    RENAME TABLE
                        {EVENTS_TABLE_NAME} to {FAILED_EVENTS_TABLE_NAME},
                        {BACKUP_TABLE_NAME} to {EVENTS_TABLE_NAME}
                    ON CLUSTER '{CLICKHOUSE_CLUSTER}'
                """,
            ),
            AsyncMigrationOperationSQL(
                database=AnalyticsDBMS.CLICKHOUSE,
                sql=f"ATTACH TABLE {EVENTS_TABLE_NAME}_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
                rollback=f"DETACH TABLE {EVENTS_TABLE_NAME}_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
            ),
            AsyncMigrationOperation(
                fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", True),
                rollback_fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", False),
            ),
            AsyncMigrationOperation(fn=optimize_table_fn),
        ]

        _operations = create_table_op + old_partition_ops + detach_mv_ops + last_partition_op + post_insert_ops
        return _operations
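On the TODO in `optimize_table_fn`: the bare `except` also hides real failures. A hedged sketch that swallows only timeouts, assuming clickhouse-driver's exception type (ClickHouse error code 159 is TIMEOUT_EXCEEDED); the real fix may differ:

from clickhouse_driver.errors import ServerException

TIMEOUT_EXCEEDED = 159  # ClickHouse error code for an exceeded max_execution_time

def optimize_table_fn(query_id):
    try:
        execute_op_clickhouse(
            f"OPTIMIZE TABLE {EVENTS_TABLE_NAME} FINAL",
            query_id,
            settings={"max_execution_time": ASYNC_MIGRATIONS_DEFAULT_TIMEOUT_SECONDS},
        )
    except ServerException as err:
        if err.code != TIMEOUT_EXCEEDED:
            raise  # anything other than a timeout is a real failure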