예제 #1
0
def attempt_migration_rollback(migration_instance: AsyncMigration):
    """
    Run the rollback of every operation, newest first, starting at the current op.

    The instance is refreshed from the DB before the walk begins. If a rollback
    raises, the failure is handed to ``process_error`` (with ``rollback=False``
    and ``alert=True``) together with the index of the failing operation, and
    the function returns without touching the migration status. When every
    rollback succeeds the migration is updated to ``RolledBack`` with progress 0.
    """
    migration_instance.refresh_from_db()
    operations = get_async_migration_definition(migration_instance.name).operations

    # A completed migration stores an index one past the final op, so clamp it
    # to the last valid position; otherwise we start from the current op.
    start_index = min(migration_instance.current_operation_index, len(operations) - 1)

    for op_index in reversed(range(start_index + 1)):
        try:
            execute_op(operations[op_index], str(UUIDT()), rollback=True)
        except Exception as exc:
            process_error(
                migration_instance=migration_instance,
                error=f"At operation {op_index} rollback failed with error:{str(exc)}",
                rollback=False,
                alert=True,
                current_operation_index=op_index,
            )
            # Stop at the first failing rollback; earlier ops are left as they are.
            return

    update_async_migration(
        migration_instance=migration_instance,
        status=MigrationStatus.RolledBack,
        progress=0,
    )
예제 #2
0
def run_async_migration_next_op(
        migration_name: str,
        migration_instance: Optional[AsyncMigration] = None):
    """
    Execute the next pending operation of the named migration.

    When no instance is passed, the migration with this name and status
    ``Running`` is loaded from the DB; a passed instance is refreshed instead.

    Returns a ``(run_next, success)`` tuple:
    - ``(True, False)``: the operation ran and the operation index was advanced
    - ``(False, True)``: the index was already past the last operation, so
      ``complete_migration`` was called
    - ``(False, False)``: no running migration with this name was found, or the
      operation raised (the error is reported via ``process_error`` with
      ``alert=True``)

    Terminology:
    - migration_instance: The migration object as stored in the DB
    - migration_definition: The actual migration class outlining the operations (e.g. async_migrations/examples/example.py)
    """
    if migration_instance:
        migration_instance.refresh_from_db()
    else:
        try:
            migration_instance = AsyncMigration.objects.get(
                name=migration_name, status=MigrationStatus.Running)
        except AsyncMigration.DoesNotExist:
            return (False, False)

    assert migration_instance is not None

    migration_definition = get_async_migration_definition(migration_name)

    # Every operation has already run: finalise instead of executing anything.
    if migration_instance.current_operation_index >= len(migration_definition.operations):
        complete_migration(migration_instance)
        return (False, True)

    query_id = str(UUIDT())

    try:
        next_op = migration_definition.operations[migration_instance.current_operation_index]
        execute_op(next_op, query_id)
        update_async_migration(
            migration_instance=migration_instance,
            current_query_id=query_id,
            current_operation_index=migration_instance.current_operation_index + 1,
        )
    except Exception as exc:
        failure = f"Exception was thrown while running operation {migration_instance.current_operation_index} : {str(exc)}"
        process_error(migration_instance, failure, alert=True)
        return (False, False)

    update_migration_progress(migration_instance)
    return (True, False)
예제 #3
0
    def test_process_error(self, _):
        """Two process_error calls leave the migration Errored with both errors stored in order."""
        migration = create_async_migration()
        for message in ("some error", "second error"):
            process_error(migration, message)

        migration.refresh_from_db()
        self.assertEqual(migration.status, MigrationStatus.Errored)

        # finished_at should have been set recently (within the last hour).
        one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1)
        self.assertGreater(migration.finished_at, one_hour_ago)

        recorded = AsyncMigrationError.objects.filter(async_migration=migration).order_by("created_at")
        self.assertEqual(recorded.count(), 2)
        self.assertEqual(recorded[0].description, "some error")
        self.assertEqual(recorded[1].description, "second error")
예제 #4
0
def check_async_migration_health() -> None:
    """
    Check on the currently running async migration and react if it is unhealthy.

    Steps:
    1. Load the migration whose status is ``Running``; do nothing if there is none.
    2. Do nothing unless the migration's Celery task state is ``STARTED``.
    3. If the task id is not among the tasks reported as active by the workers,
       treat it as a worker crash: re-trigger the migration with
       ``fresh_start=False`` when the ``ASYNC_MIGRATIONS_AUTO_CONTINUE`` config
       value is truthy, otherwise record an error.
    4. Otherwise run the migration healthcheck; force-stop the migration when it
       fails and update its progress when it passes.
    """
    from posthog.models.async_migration import AsyncMigration, MigrationStatus

    # NOTE(review): .get() presumes at most one Running migration; more than one
    # would presumably raise MultipleObjectsReturned, which is not handled here.
    try:
        migration_instance: AsyncMigration = AsyncMigration.objects.get(status=MigrationStatus.Running)
    except AsyncMigration.DoesNotExist:
        return

    celery_state = AsyncResult(migration_instance.celery_task_id).state

    # we only care about "supposedly running" tasks here
    # failures and successes are handled elsewhere
    # pending means we haven't picked up the task yet
    # retry is not possible as max_retries == 0
    if celery_state != states.STARTED:
        return

    tasks_by_node = app.control.inspect().active()

    # Flatten the per-node task lists into one list of ids; a falsy inspection
    # result is treated as "no active tasks".
    running_task_ids = [
        task["id"]
        for node_tasks in (tasks_by_node or {}).values()
        for task in node_tasks
    ]

    # the worker crashed - this is how we find out and process the error
    if migration_instance.celery_task_id not in running_task_ids:
        if getattr(config, "ASYNC_MIGRATIONS_AUTO_CONTINUE"):
            trigger_migration(migration_instance, fresh_start=False)
        else:
            process_error(migration_instance, "Celery worker crashed while running migration.")
        return

    healthy, reason = run_migration_healthcheck(migration_instance)

    if healthy:
        update_migration_progress(migration_instance)
    else:
        force_stop_migration(migration_instance, f"Healthcheck failed with error: {reason}")
예제 #5
0
def start_async_migration(migration_name: str,
                          ignore_posthog_version=False) -> bool:
    """
    Validate that a migration may run and, if so, kick off its chain of operations.

    Returns ``False`` when any check fails, ``True`` when the migration is not
    required (it is then completed directly, without email), and otherwise the
    result of ``run_async_migration_operations``.

    Checks, in the order they are performed:
    1. We're not over the concurrent migrations limit
    2. The migration can be run with the current PostHog version
       (skipped when ``ignore_posthog_version`` is true)
    3. The migration is not already running
    4. The migration is required given the instance configuration
    5. The service version requirements are met (e.g. X < ClickHouse version < Y)
    6. The migration's dependency has been completed
    7. The migration's precheck passes
    8. The migration's healthcheck passes

    Failing checks 1-3 returns ``False`` without recording an error; failing
    checks 5-8 records the error with status ``FailedAtStartup``.
    """
    migration_instance = AsyncMigration.objects.get(name=migration_name)

    def fail_at_startup(reason):
        # Record the startup failure on the instance and report "not started".
        process_error(migration_instance,
                      reason,
                      status=MigrationStatus.FailedAtStartup)
        return False

    running_count = len(get_all_running_async_migrations())
    too_many_running = running_count >= MAX_CONCURRENT_ASYNC_MIGRATIONS
    version_ok = ignore_posthog_version or is_posthog_version_compatible(
        migration_instance.posthog_min_version,
        migration_instance.posthog_max_version)

    # NOTE(review): objects.get() above presumably raises DoesNotExist rather
    # than returning a falsy value, so the first condition likely never fires.
    blocked = (not migration_instance
               or too_many_running
               or not version_ok
               or migration_instance.status == MigrationStatus.Running)
    if blocked:
        return False

    migration_definition = get_async_migration_definition(migration_name)

    # Nothing to do for this instance: mark it complete right away.
    if not migration_definition.is_required():
        complete_migration(migration_instance, email=False)
        return True

    passed, reason = check_service_version_requirements(
        migration_definition.service_version_requirements)
    if not passed:
        return fail_at_startup(reason)

    passed, reason = is_migration_dependency_fulfilled(migration_instance.name)
    if not passed:
        return fail_at_startup(reason)

    passed, reason = run_migration_precheck(migration_instance)
    if not passed:
        return fail_at_startup(
            f"Migration precheck failed with error:{reason}")

    passed, reason = run_migration_healthcheck(migration_instance)
    if not passed:
        return fail_at_startup(
            f"Migration healthcheck failed with error:{reason}")

    mark_async_migration_as_running(migration_instance)

    return run_async_migration_operations(migration_name, migration_instance)