Example #1
def clean_up_stuck_tasks():
    if not settings.TASK_TIMEOUT:
        return

    # Celery should clean up stale tasks automatically, so add a buffer to
    # give that a chance to happen before we intervene.
    task_timeout = settings.TASK_TIMEOUT + 120
    client, app_name = get_scale_client()
    time_threshold = datetime.datetime.now(timezone.utc) - datetime.timedelta(
        seconds=task_timeout)
    export_task_records = (
        ExportTaskRecord.objects
        .prefetch_related("export_provider_task__tasks")
        .select_related("export_provider_task__run")
        .filter(Q(status=TaskState.RUNNING.value) & Q(started_at__lt=time_threshold))
    )
    run_uids = []
    for export_task_record in export_task_records:
        run = export_task_record.export_provider_task.run
        run_uids.append(str(run.uid))

        # Cancel the export task record, since it has exceeded the timeout.
        export_task_record.status = TaskState.CANCELED.value
        export_task_record.save()

        # Set the data provider task record (DPTR) back to PENDING so it can be picked up again.
        data_provider_task_record = export_task_record.export_provider_task
        data_provider_task_record.status = TaskState.PENDING.value
        data_provider_task_record.save()

        # Set the run back to SUBMITTED so it can be picked up again.
        run.status = TaskState.SUBMITTED.value
        run.save()

    kill_workers(run_uids, client)
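
Stuck-task cleanup like this only helps if it runs on a schedule. Below is a minimal sketch of wiring it up with Celery beat; the schedule name, interval, and the myapp.tasks module path are illustrative assumptions, not part of the original project:

from celery.schedules import crontab

# Assumes `app` is the project's Celery application instance and that
# clean_up_stuck_tasks is registered as a Celery task in myapp.tasks
# (both names are assumptions for illustration).
app.conf.beat_schedule = {
    "clean-up-stuck-tasks": {
        "task": "myapp.tasks.clean_up_stuck_tasks",
        "schedule": crontab(minute="*/5"),  # sweep for stuck tasks every five minutes
    },
}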
Example #2
def scale_by_tasks(celery_tasks, max_tasks_memory):
    """
    @param celery_tasks: A mapping of celery task names to their configuration
        (memory footprint, optional worker limit, run command).
    @param max_tasks_memory: The amount of memory in MB to allow for all of the tasks.
    @type max_tasks_memory: int
    """
    client, app_name = get_scale_client()

    broker_api_url = settings.BROKER_API_URL
    queue_class = "queues"

    celery_pcf_task_details = get_celery_task_details(client, app_name,
                                                      celery_tasks)

    logger.info(
        f"Running Tasks Memory used: {celery_pcf_task_details['memory']} MB")

    celery_tasks = order_celery_tasks(celery_tasks,
                                      celery_pcf_task_details["task_counts"])

    # We don't want to exceed our memory cap, but we also don't want to prevent
    # tasks that _can_ fit from running, so track the smallest task footprint.
    smallest_memory_required = int(
        min((v["memory"] for v in celery_tasks.values()), default=0))
    logger.info(f"smallest_memory_required: {smallest_memory_required}")
    logger.info(f"max_tasks_memory: {max_tasks_memory}")
    running_tasks_memory = celery_pcf_task_details["memory"]
    while running_tasks_memory + smallest_memory_required <= max_tasks_memory:
        queues = get_all_rabbitmq_objects(broker_api_url, queue_class)
        dicts = list_to_dict(queues, "name")
        # If no tasks were run, give up... otherwise try to run another task.
        has_run_task = False
        running_tasks = client.get_running_tasks(app_name)
        if not any(queue.get("messages", 0) for queue in dicts.values()):
            running_task_names = []
            for running_task in running_tasks.get("resources", []):
                running_task_name = running_task.get("name")
                running_task_names.append(running_task_name)
                logger.info(
                    f"No messages left in the queue, shutting down {running_task_name}."
                )
            kill_workers(task_names=running_task_names, client=client)
            break

        queues_to_kill = []
        for celery_task_name, celery_task in celery_tasks.items():
            queue = dicts.get(celery_task_name)
            if not queue:
                continue
            queue_name = queue.get("name")
            pending_messages = queue.get("messages", 0)
            if pending_messages:
                logger.info(
                    f"Queue {queue_name} has {pending_messages} pending messages."
                )
            # Get updated information...
            running_tasks_by_queue = client.get_running_tasks(
                app_name, queue_name)
            running_tasks_by_queue_count = running_tasks_by_queue["pagination"].get(
                "total_results", 0)
            if pending_messages > running_tasks_by_queue_count:
                # Allow queues to have a limit, so that we don't spin up
                # 30 workers for a single priority queue.
                limit = celery_task.get("limit")
                if limit and running_tasks_by_queue_count >= limit:
                    continue
                if running_tasks_memory + celery_task["memory"] <= max_tasks_memory:
                    run_task_command(client, app_name, queue_name, celery_task)
                    # Record that this pass launched a task so the outer loop
                    # tries another pass instead of giving up.
                    has_run_task = True
            elif running_tasks_by_queue_count and not pending_messages:
                logger.info(
                    f"The {queue_name} queue has no messages but still has "
                    f"{running_tasks_by_queue_count} running workers. Scheduling shutdown..."
                )
                queues_to_kill.append(queue_name)
            elif running_tasks_by_queue_count:
                logger.info(
                    f"Already {running_tasks_by_queue_count} workers processing the "
                    f"{pending_messages} pending messages left in the {queue_name} queue.")

            # Refresh the memory figure and stop any workers already marked
            # for shutdown before evaluating the next queue.
            running_tasks_memory = client.get_running_tasks_memory(app_name)
            kill_workers(queues_to_kill, client)

        if not has_run_task:
            break
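
The list_to_dict helper used above is not shown in this example. Below is a minimal sketch of what it likely does, inferred from how it is called with the RabbitMQ queue objects and the "name" field; this is an inference from usage, not the project's actual implementation:

def list_to_dict(objects, key):
    # Index a list of dicts (here, RabbitMQ queue objects returned by the
    # management API) by the given field, so each queue can be looked up
    # by name in constant time.
    return {obj[key]: obj for obj in objects}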
Example #3
def scale_by_runs(max_tasks_memory):
    """
    @param max_tasks_memory: The amount of memory in MB to allow for all of the tasks.
    @type max_tasks_memory: int
    """
    from audit_logging.utils import get_user_details

    client, app_name = get_scale_client()

    celery_task_details = get_celery_task_details(client, app_name)
    running_tasks_memory = int(celery_task_details["memory"])
    celery_tasks = get_celery_tasks_scale_by_run()

    # Check if we need to scale for default system tasks.
    scale_default_tasks(client, app_name, celery_tasks)

    # Get the runs that have been submitted and not deleted.
    runs = ExportRun.objects.filter(status=TaskState.SUBMITTED.value,
                                    deleted=False)
    total_tasks = 0
    running_tasks = client.get_running_tasks(app_name)
    logger.info(f"Running tasks: {running_tasks}")

    if running_tasks:
        total_tasks = running_tasks["pagination"].get("total_results", 0)
        # Get a list of running task names excluding the default celery tasks.
        running_task_names = [
            resource.get("name")
            for resource in running_tasks.get("resources", [])
            if resource.get("name") != "celery"
        ]
        finished_runs = ExportRun.objects.filter(
            Q(uid__in=running_task_names)
            & (Q(status__in=[
                state.value for state in TaskState.get_finished_states()
            ]) | Q(deleted=True)))

        finished_run_uids = []
        for finished_run in finished_runs:
            logger.info(
                f"Stopping {finished_run.uid} because it is in a finished state ({finished_run.status}) "
                f"or was deleted ({finished_run.deleted}).")
            finished_run_uids.append(str(finished_run.uid))
        kill_workers(task_names=finished_run_uids, client=client)

    max_runs = int(os.getenv("RUNS_CONCURRENCY", 3))
    for run in runs:
        celery_run_task = copy.deepcopy(celery_tasks["run"])

        logger.info(
            f"Checking to see if submitted run {run.uid} needs a new worker.")

        if max_runs and total_tasks >= max_runs:
            logger.info(
                f"total_tasks ({total_tasks}) >= max_runs ({max_runs})")
            break
        if running_tasks_memory + celery_run_task["memory"] >= max_tasks_memory:
            logger.info("Not enough available memory to scale another run.")
            break
        task_name = run.uid

        running_tasks_by_queue = client.get_running_tasks(app_name, task_name)
        running_tasks_by_queue_count = running_tasks_by_queue["pagination"].get(
            "total_results", 0)

        logger.info(
            f"Currently {running_tasks_by_queue_count} tasks running for {task_name}."
        )
        if running_tasks_by_queue_count:
            logger.info(f"Already a consumer for {task_name}")
            continue
        user_session = UserSession.objects.filter(user=run.user).last()
        session_token = None
        if user_session:
            # Guard against the session having expired or been purged.
            session = Session.objects.filter(
                session_key=user_session.session_id).first()
            if session:
                session_token = session.get_decoded().get("session_token")

        user_details = get_user_details(run.user)
        pick_up_run_task.s(run_uid=str(run.uid),
                           session_token=session_token,
                           user_details=user_details).apply_async(
                               queue=str(task_name),
                               routing_key=str(task_name))
        celery_run_task["command"] = celery_run_task["command"].format(
            celery_group_name=task_name)
        run_task_command(client, app_name, str(task_name), celery_run_task)
        # Keep track of new resources being used.
        total_tasks += 1
        running_tasks_memory += celery_run_task["memory"]
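
The celery_tasks["run"] entry consumed above is a template: scale_by_runs substitutes the run UID for {celery_group_name} and hands the result to run_task_command. Below is a plausible shape for that entry, purely as illustration; the memory figure, the Celery app name, and the worker flags are assumptions, and the real template lives in the project's scaling configuration:

celery_run_task_template = {
    "memory": 2048,  # MB reserved per run worker (value is an assumption)
    "command": (
        # Start a worker dedicated to the per-run queue; the queue name is
        # the run UID, substituted for {celery_group_name} by scale_by_runs.
        "celery -A myapp worker"
        " --queues {celery_group_name}"
        " --hostname {celery_group_name}@%h"
        " --concurrency 1"
    ),
}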