Example #1
def test_job_not_blocking_inventory_update(default_instance_group, job_template_factory, inventory_source_factory):
    objects = job_template_factory('jt', organization='org1', project='proj',
                                   inventory='inv', credential='cred',
                                   jobs=["job"])
    job = objects.jobs["job"]
    job.instance_group = default_instance_group
    job.status = "running"
    job.save()

    with mock.patch("awx.main.scheduler.TaskManager.start_task"):
        task_manager = TaskManager()
        task_manager._schedule()

        inv = objects.inventory
        inv_source = inventory_source_factory("ec2")
        inv_source.source = "ec2"
        inv.inventory_sources.add(inv_source)
        inventory_update = inv_source.create_inventory_update()
        inventory_update.instance_group = default_instance_group
        inventory_update.status = "pending"
        inventory_update.save()

        assert not task_manager.is_job_blocked(inventory_update)

        dependency_graph = DependencyGraph(None)
        dependency_graph.add_job(job)
        assert not dependency_graph.is_job_blocked(inventory_update)
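These test examples rely on pytest fixtures and imports defined elsewhere in the AWX test suite. A minimal module header for running them standalone might look like the following sketch (the module paths assume AWX's usual layout, and the test function itself would also carry a pytest.mark.django_db marker):

import pytest
from unittest import mock

from awx.main.scheduler import TaskManager
from awx.main.scheduler.dependency_graph import DependencyGraph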
Example #2
    def after_lock_init(self):
        """
        Init AFTER we know this instance of the task manager will run because the lock is acquired.
        """
        instances = Instance.objects.filter(hostname__isnull=False, enabled=True).exclude(node_type='hop')
        self.real_instances = {i.hostname: i for i in instances}
        self.controlplane_ig = None
        self.dependency_graph = DependencyGraph()

        instances_partial = [
            SimpleNamespace(
                obj=instance,
                node_type=instance.node_type,
                remaining_capacity=instance.remaining_capacity,
                capacity=instance.capacity,
                jobs_running=instance.jobs_running,
                hostname=instance.hostname,
            )
            for instance in instances
        ]

        instances_by_hostname = {i.hostname: i for i in instances_partial}

        for rampart_group in InstanceGroup.objects.prefetch_related('instances'):
            if rampart_group.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME:
                self.controlplane_ig = rampart_group
            self.graph[rampart_group.name] = dict(
                instances=[
                    instances_by_hostname[instance.hostname] for instance in rampart_group.instances.all() if instance.hostname in instances_by_hostname
                ],
            )
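The SimpleNamespace "partial instance" pattern above lets the scheduler do capacity bookkeeping on lightweight in-memory copies of each Instance, so tentative placement decisions never write back to the database. A minimal sketch of the idea (illustrative data, not AWX code):

from types import SimpleNamespace

real_row = {"hostname": "node1", "capacity": 100, "remaining_capacity": 100, "jobs_running": 0}
snapshot = SimpleNamespace(obj=real_row, **real_row)

# Tentatively place a job that consumes 30 units of capacity on this node.
snapshot.remaining_capacity -= 30
snapshot.jobs_running += 1
# real_row (and the DB row it stands for) is untouched; only an explicit
# start_task()/save() would persist the decision.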
Example #3
def test_job_not_blocking_project_update(default_instance_group,
                                         job_template_factory):
    objects = job_template_factory('jt',
                                   organization='org1',
                                   project='proj',
                                   inventory='inv',
                                   credential='cred',
                                   jobs=["job"])
    job = objects.jobs["job"]
    job.instance_group = default_instance_group
    job.status = "running"
    job.save()

    with mock.patch("awx.main.scheduler.TaskManager.start_task"):
        task_manager = TaskManager()
        task_manager._schedule()

        proj = objects.project
        project_update = proj.create_project_update()
        project_update.instance_group = default_instance_group
        project_update.status = "pending"
        project_update.save()
        assert not task_manager.job_blocked_by(project_update)

        dependency_graph = DependencyGraph()
        dependency_graph.add_job(job)
        assert not dependency_graph.task_blocked_by(project_update)
Example #4
    def after_lock_init(self, all_sorted_tasks):
        """
        Init AFTER we know this instance of the task manager will run because the lock is acquired.
        """
        self.dependency_graph = DependencyGraph()
        self.instances = TaskManagerInstances(all_sorted_tasks)
        self.instance_groups = TaskManagerInstanceGroups(instances_by_hostname=self.instances)
        self.controlplane_ig = self.instance_groups.controlplane_ig
Example #5
    def after_lock_init(self):
        '''
        Init AFTER we know this instance of the task manager will run because the lock is acquired.
        '''
        instances = Instance.objects.filter(~Q(hostname=None),
                                            capacity__gt=0,
                                            enabled=True)
        self.real_instances = {i.hostname: i for i in instances}

        instances_partial = [
            SimpleNamespace(obj=instance,
                            remaining_capacity=instance.remaining_capacity,
                            capacity=instance.capacity,
                            jobs_running=instance.jobs_running,
                            hostname=instance.hostname)
            for instance in instances
        ]

        instances_by_hostname = {i.hostname: i for i in instances_partial}

        for rampart_group in InstanceGroup.objects.prefetch_related(
                'instances'):
            self.graph[rampart_group.name] = dict(
                graph=DependencyGraph(rampart_group.name),
                capacity_total=rampart_group.capacity,
                consumed_capacity=0,
                instances=[])
            for instance in rampart_group.instances.filter(
                    capacity__gt=0, enabled=True).order_by('hostname'):
                if instance.hostname in instances_by_hostname:
                    self.graph[rampart_group.name]['instances'].append(
                        instances_by_hostname[instance.hostname])
Example #6
    def __init__(self):
        self.graph = dict()
        for rampart_group in InstanceGroup.objects.prefetch_related(
                'instances'):
            self.graph[rampart_group.name] = dict(
                graph=DependencyGraph(rampart_group.name),
                capacity_total=rampart_group.capacity,
                consumed_capacity=0)
Example #7
    def __init__(self):
        self.graph = dict()
        # start task limit indicates how many pending jobs can be started on this
        # .schedule() run. Starting jobs is expensive, and there is code in place to reap
        # the task manager after 5 minutes. At scale, the task manager can easily take more than
        # 5 minutes to start pending jobs. If this limit is reached, the remaining pending
        # jobs will not be started until the next task manager cycle.
        self.start_task_limit = settings.START_TASK_LIMIT
        for rampart_group in InstanceGroup.objects.prefetch_related(
                'instances'):
            self.graph[rampart_group.name] = dict(
                graph=DependencyGraph(rampart_group.name),
                capacity_total=rampart_group.capacity,
                consumed_capacity=0)
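A self-contained sketch of the gating behaviour the comment above describes (stand-in names, not AWX code): the limit caps how many jobs a single scheduling pass may start, and everything past the cap waits for the next cycle.

start_task_limit = 3                                 # stand-in for settings.START_TASK_LIMIT
pending_tasks = ["job-%d" % i for i in range(10)]    # stand-ins for pending unified jobs
started = []

for task in pending_tasks:
    if start_task_limit <= 0:
        break                                        # defer the rest to the next scheduler cycle
    started.append(task)                             # stand-in for start_task(...)
    start_task_limit -= 1

assert started == ["job-0", "job-1", "job-2"]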
Example #8
    def after_lock_init(self):
        """
        Init AFTER we know this instance of the task manager will run because the lock is acquired.
        """
        instances = Instance.objects.filter(
            hostname__isnull=False, enabled=True).exclude(node_type='hop')
        self.real_instances = {i.hostname: i for i in instances}
        self.controlplane_ig = None

        instances_partial = [
            SimpleNamespace(
                obj=instance,
                node_type=instance.node_type,
                remaining_capacity=instance.remaining_capacity,
                capacity=instance.capacity,
                jobs_running=instance.jobs_running,
                hostname=instance.hostname,
            ) for instance in instances
        ]

        instances_by_hostname = {i.hostname: i for i in instances_partial}

        for rampart_group in InstanceGroup.objects.prefetch_related(
                'instances'):
            if rampart_group.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME:
                self.controlplane_ig = rampart_group
            self.graph[rampart_group.name] = dict(
                graph=DependencyGraph(),
                execution_capacity=0,
                control_capacity=0,
                consumed_capacity=0,
                consumed_control_capacity=0,
                consumed_execution_capacity=0,
                instances=[],
            )
            for instance in rampart_group.instances.all():
                if not instance.enabled:
                    continue
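                # Hybrid nodes contribute to both the control and the execution capacity pools;
                # control-only and execution-only nodes contribute to exactly one of them.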
                for capacity_type in ('control', 'execution'):
                    if instance.node_type in (capacity_type, 'hybrid'):
                        self.graph[rampart_group.name][
                            f'{capacity_type}_capacity'] += instance.capacity
            for instance in rampart_group.instances.filter(
                    enabled=True).order_by('hostname'):
                if instance.hostname in instances_by_hostname:
                    self.graph[rampart_group.name]['instances'].append(
                        instances_by_hostname[instance.hostname])
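A worked example of the capacity accounting in the loop above (illustrative numbers, not AWX code): a hybrid node feeds both pools, while control and execution nodes feed exactly one.

nodes = [("ctrl-1", "control", 50), ("exec-1", "execution", 100), ("hybrid-1", "hybrid", 80)]

capacity = {"control": 0, "execution": 0}
for hostname, node_type, cap in nodes:
    for capacity_type in ("control", "execution"):
        if node_type in (capacity_type, "hybrid"):
            capacity[capacity_type] += cap

assert capacity == {"control": 130, "execution": 180}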
Example #9
class TaskManager:
    def __init__(self):
        """
        Do NOT put database queries or other potentially expensive operations
        in the task manager init. The task manager object is created every time a
        job is created or transitions state, and every 30 seconds on each tower node.
        More often than not, the object is destroyed quickly because the NOOP case is hit.

        The NOOP case is short-circuit logic. If the task manager realizes that another instance
        of the task manager is already running, then it short-circuits and decides not to run.
        """
        self.graph = dict()
        # start task limit indicates how many pending jobs can be started on this
        # .schedule() run. Starting jobs is expensive, and there is code in place to reap
        # the task manager after 5 minutes. At scale, the task manager can easily take more than
        # 5 minutes to start pending jobs. If this limit is reached, the remaining pending
        # jobs will not be started until the next task manager cycle.
        self.start_task_limit = settings.START_TASK_LIMIT

        self.time_delta_job_explanation = timedelta(seconds=30)

    def after_lock_init(self):
        """
        Init AFTER we know this instance of the task manager will run because the lock is acquired.
        """
        instances = Instance.objects.filter(hostname__isnull=False, enabled=True).exclude(node_type='hop')
        self.real_instances = {i.hostname: i for i in instances}
        self.controlplane_ig = None
        self.dependency_graph = DependencyGraph()

        instances_partial = [
            SimpleNamespace(
                obj=instance,
                node_type=instance.node_type,
                remaining_capacity=instance.remaining_capacity,
                capacity=instance.capacity,
                jobs_running=instance.jobs_running,
                hostname=instance.hostname,
            )
            for instance in instances
        ]

        instances_by_hostname = {i.hostname: i for i in instances_partial}

        for rampart_group in InstanceGroup.objects.prefetch_related('instances'):
            if rampart_group.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME:
                self.controlplane_ig = rampart_group
            self.graph[rampart_group.name] = dict(
                instances=[
                    instances_by_hostname[instance.hostname] for instance in rampart_group.instances.all() if instance.hostname in instances_by_hostname
                ],
            )

    def job_blocked_by(self, task):
        # TODO: I'm not happy with this, I think blocking behavior should be decided outside of the dependency graph
        #       in the old task manager this was handled as a method on each task object outside of the graph and
        #       probably has the side effect of cutting down *a lot* of the logic from this task manager class
        blocked_by = self.dependency_graph.task_blocked_by(task)
        if blocked_by:
            return blocked_by

        if not task.dependent_jobs_finished():
            blocked_by = task.dependent_jobs.first()
            if blocked_by:
                return blocked_by

        return None

    def get_tasks(self, status_list=('pending', 'waiting', 'running')):
        jobs = [j for j in Job.objects.filter(status__in=status_list).prefetch_related('instance_group')]
        inventory_updates_qs = (
            InventoryUpdate.objects.filter(status__in=status_list).exclude(source='file').prefetch_related('inventory_source', 'instance_group')
        )
        inventory_updates = [i for i in inventory_updates_qs]
        # Notice the job_type='check': we want to prevent implicit project updates from blocking our jobs.
        project_updates = [p for p in ProjectUpdate.objects.filter(status__in=status_list, job_type='check').prefetch_related('instance_group')]
        system_jobs = [s for s in SystemJob.objects.filter(status__in=status_list).prefetch_related('instance_group')]
        ad_hoc_commands = [a for a in AdHocCommand.objects.filter(status__in=status_list).prefetch_related('instance_group')]
        workflow_jobs = [w for w in WorkflowJob.objects.filter(status__in=status_list)]
        all_tasks = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands + workflow_jobs, key=lambda task: task.created)
        return all_tasks

    def get_running_workflow_jobs(self):
        graph_workflow_jobs = [wf for wf in WorkflowJob.objects.filter(status='running')]
        return graph_workflow_jobs

    def get_inventory_source_tasks(self, all_sorted_tasks):
        inventory_ids = set()
        for task in all_sorted_tasks:
            if isinstance(task, Job):
                inventory_ids.add(task.inventory_id)
        return [invsrc for invsrc in InventorySource.objects.filter(inventory_id__in=inventory_ids, update_on_launch=True)]

    def spawn_workflow_graph_jobs(self, workflow_jobs):
        for workflow_job in workflow_jobs:
            if workflow_job.cancel_flag:
                logger.debug('Not spawning jobs for %s because it is pending cancellation.', workflow_job.log_format)
                continue
            dag = WorkflowDAG(workflow_job)
            spawn_nodes = dag.bfs_nodes_to_run()
            if spawn_nodes:
                logger.debug('Spawning jobs for %s', workflow_job.log_format)
            else:
                logger.debug('No nodes to spawn for %s', workflow_job.log_format)
            for spawn_node in spawn_nodes:
                if spawn_node.unified_job_template is None:
                    continue
                kv = spawn_node.get_job_kwargs()
                job = spawn_node.unified_job_template.create_unified_job(**kv)
                spawn_node.job = job
                spawn_node.save()
                logger.debug('Spawned %s in %s for node %s', job.log_format, workflow_job.log_format, spawn_node.pk)
                can_start = True
                if isinstance(spawn_node.unified_job_template, WorkflowJobTemplate):
                    workflow_ancestors = job.get_ancestor_workflows()
                    if spawn_node.unified_job_template in set(workflow_ancestors):
                        can_start = False
                        logger.info(
                            'Refusing to start recursive workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
                                job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
                            )
                        )
                        display_list = [spawn_node.unified_job_template] + workflow_ancestors
                        job.job_explanation = gettext_noop(
                            "Workflow Job spawned from workflow could not start because it " "would result in recursion (spawn order, most recent first: {})"
                        ).format(', '.join(['<{}>'.format(tmp) for tmp in display_list]))
                    else:
                        logger.debug(
                            'Starting workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
                                job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
                            )
                        )
                if not job._resources_sufficient_for_launch():
                    can_start = False
                    job.job_explanation = gettext_noop(
                        "Job spawned from workflow could not start because it " "was missing a related resource such as project or inventory"
                    )
                if can_start:
                    if workflow_job.start_args:
                        start_args = json.loads(decrypt_field(workflow_job, 'start_args'))
                    else:
                        start_args = {}
                    can_start = job.signal_start(**start_args)
                    if not can_start:
                        job.job_explanation = gettext_noop(
                            "Job spawned from workflow could not start because it " "was not in the right state or required manual credentials"
                        )
                if not can_start:
                    job.status = 'failed'
                    job.save(update_fields=['status', 'job_explanation'])
                    job.websocket_emit_status('failed')

                # TODO: should we emit a status on the socket here similar to tasks.py awx_periodic_scheduler() ?
                # emit_websocket_notification('/socket.io/jobs', '', dict(id=))

    def process_finished_workflow_jobs(self, workflow_jobs):
        result = []
        for workflow_job in workflow_jobs:
            dag = WorkflowDAG(workflow_job)
            status_changed = False
            if workflow_job.cancel_flag:
                workflow_job.workflow_nodes.filter(do_not_run=False, job__isnull=True).update(do_not_run=True)
                logger.debug('Canceling spawned jobs of %s due to cancel flag.', workflow_job.log_format)
                cancel_finished = dag.cancel_node_jobs()
                if cancel_finished:
                    logger.info('Marking %s as canceled, all spawned jobs have concluded.', workflow_job.log_format)
                    workflow_job.status = 'canceled'
                    workflow_job.start_args = ''  # blank field to remove encrypted passwords
                    workflow_job.save(update_fields=['status', 'start_args'])
                    status_changed = True
            else:
                workflow_nodes = dag.mark_dnr_nodes()
                for n in workflow_nodes:
                    n.save(update_fields=['do_not_run'])
                is_done = dag.is_workflow_done()
                if not is_done:
                    continue
                has_failed, reason = dag.has_workflow_failed()
                logger.debug('Marking %s as %s.', workflow_job.log_format, 'failed' if has_failed else 'successful')
                result.append(workflow_job.id)
                new_status = 'failed' if has_failed else 'successful'
                logger.debug("Transitioning {} to {} status.".format(workflow_job.log_format, new_status))
                update_fields = ['status', 'start_args']
                workflow_job.status = new_status
                if reason:
                    logger.info(f'Workflow job {workflow_job.id} failed due to reason: {reason}')
                    workflow_job.job_explanation = gettext_noop("No error handling paths found, marking workflow as failed")
                    update_fields.append('job_explanation')
                workflow_job.start_args = ''  # blank field to remove encrypted passwords
                workflow_job.save(update_fields=update_fields)
                status_changed = True
            if status_changed:
                workflow_job.websocket_emit_status(workflow_job.status)
                # Operations whose queries rely on modifications made during the atomic scheduling session
                workflow_job.send_notification_templates('succeeded' if workflow_job.status == 'successful' else 'failed')
                if workflow_job.spawned_by_workflow:
                    schedule_task_manager()
        return result

    def start_task(self, task, rampart_group, dependent_tasks=None, instance=None):
        self.start_task_limit -= 1
        if self.start_task_limit == 0:
            # schedule another run immediately after this task manager
            schedule_task_manager()
        from awx.main.tasks.system import handle_work_error, handle_work_success

        dependent_tasks = dependent_tasks or []

        task_actual = {
            'type': get_type_for_model(type(task)),
            'id': task.id,
        }
        dependencies = [{'type': get_type_for_model(type(t)), 'id': t.id} for t in dependent_tasks]

        task.status = 'waiting'

        (start_status, opts) = task.pre_start()
        if not start_status:
            task.status = 'failed'
            if task.job_explanation:
                task.job_explanation += ' '
            task.job_explanation += 'Task failed pre-start check.'
            task.save()
            # TODO: run error handler to fail sub-tasks and send notifications
        else:
            if type(task) is WorkflowJob:
                task.status = 'running'
                task.send_notification_templates('running')
                logger.debug('Transitioning %s to running status.', task.log_format)
                schedule_task_manager()
            # at this point we already have control/execution nodes selected for the following cases
            else:
                task.instance_group = rampart_group
                execution_node_msg = f' and execution node {task.execution_node}' if task.execution_node else ''
                logger.debug(
                    f'Submitting job {task.log_format} controlled by {task.controller_node} to instance group {rampart_group.name}{execution_node_msg}.'
                )
            with disable_activity_stream():
                task.celery_task_id = str(uuid.uuid4())
                task.save()
                task.log_lifecycle("waiting")

        def post_commit():
            if task.status != 'failed' and type(task) is not WorkflowJob:
                # Before task is dispatched, ensure that job_event partitions exist
                create_partition(task.event_class._meta.db_table, start=task.created)
                task_cls = task._get_task_class()
                task_cls.apply_async(
                    [task.pk],
                    opts,
                    queue=task.get_queue_name(),
                    uuid=task.celery_task_id,
                    callbacks=[{'task': handle_work_success.name, 'kwargs': {'task_actual': task_actual}}],
                    errbacks=[{'task': handle_work_error.name, 'args': [task.celery_task_id], 'kwargs': {'subtasks': [task_actual] + dependencies}}],
                )

        task.websocket_emit_status(task.status)  # adds to on_commit
        connection.on_commit(post_commit)

    def process_running_tasks(self, running_tasks):
        for task in running_tasks:
            self.dependency_graph.add_job(task)

    def create_project_update(self, task):
        project_task = Project.objects.get(id=task.project_id).create_project_update(_eager_fields=dict(launch_type='dependency'))

        # Back-date the project update 1 second behind the job so it can later be
        # recognized as the dependency spawned for this job (see should_update_related_project).
        project_task.created = task.created - timedelta(seconds=1)
        project_task.status = 'pending'
        project_task.save()
        logger.debug('Spawned {} as dependency of {}'.format(project_task.log_format, task.log_format))
        return project_task

    def create_inventory_update(self, task, inventory_source_task):
        inventory_task = InventorySource.objects.get(id=inventory_source_task.id).create_inventory_update(_eager_fields=dict(launch_type='dependency'))

        # Back-date the inventory update 2 seconds behind the job (and 1 second behind any
        # spawned project update) so it sorts ahead of both when tasks are ordered by created.
        inventory_task.created = task.created - timedelta(seconds=2)
        inventory_task.status = 'pending'
        inventory_task.save()
        logger.debug('Spawned {} as dependency of {}'.format(inventory_task.log_format, task.log_format))
        # inventory_sources = self.get_inventory_source_tasks([task])
        # self.process_inventory_sources(inventory_sources)
        return inventory_task

    def capture_chain_failure_dependencies(self, task, dependencies):
        with disable_activity_stream():
            task.dependent_jobs.add(*dependencies)

            for dep in dependencies:
                # Add task + all deps except self
                dep.dependent_jobs.add(*([task] + [d for d in dependencies if d != dep]))

    def get_latest_inventory_update(self, inventory_source):
        latest_inventory_update = InventoryUpdate.objects.filter(inventory_source=inventory_source).order_by("-created")
        if not latest_inventory_update.exists():
            return None
        return latest_inventory_update.first()

    def should_update_inventory_source(self, job, latest_inventory_update):
        now = tz_now()

        if latest_inventory_update is None:
            return True
        '''
        If there's already an inventory update for this source that is queued or
        about to run, then we don't need to create another one.
        '''
        if latest_inventory_update.status in ['waiting', 'pending', 'running']:
            return False

        timeout_seconds = timedelta(seconds=latest_inventory_update.inventory_source.update_cache_timeout)
        if (latest_inventory_update.finished + timeout_seconds) < now:
            return True
        if latest_inventory_update.inventory_source.update_on_launch is True and latest_inventory_update.status in ['failed', 'canceled', 'error']:
            return True
        return False
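
    # Worked example of the cache-timeout check above (illustrative numbers): with
    # update_cache_timeout=600 and a last update that finished 11 minutes ago,
    # finished + 600s < now, so a fresh inventory update is spawned as a dependency.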

    def get_latest_project_update(self, job):
        latest_project_update = ProjectUpdate.objects.filter(project=job.project, job_type='check').order_by("-created")
        if not latest_project_update.exists():
            return None
        return latest_project_update.first()

    def should_update_related_project(self, job, latest_project_update):
        now = tz_now()

        if latest_project_update is None:
            return True

        if latest_project_update.status in ['failed', 'canceled']:
            return True

        '''
        If there's already a project update for this project that is queued or
        about to run, then we don't need to create another one.
        '''
        if latest_project_update.status in ['waiting', 'pending', 'running']:
            return False

        '''
        If the latest project update was created exactly 1 second before the job, it is
        the dependency update that was spawned for this job, so reuse it. This prevents
        an infinite loop of project updates when the cache timeout is 0.
        '''
        if (
            latest_project_update.project.scm_update_cache_timeout == 0
            and latest_project_update.launch_type == 'dependency'
            and latest_project_update.created == job.created - timedelta(seconds=1)
        ):
            return False
        '''
        Normal Cache Timeout Logic
        '''
        timeout_seconds = timedelta(seconds=latest_project_update.project.scm_update_cache_timeout)
        if (latest_project_update.finished + timeout_seconds) < now:
            return True
        return False

    def generate_dependencies(self, undeped_tasks):
        created_dependencies = []
        for task in undeped_tasks:
            task.log_lifecycle("acknowledged")
            dependencies = []
            if type(task) is not Job:
                continue
            # TODO: Can remove task.project None check after scan-job-default-playbook is removed
            if task.project is not None and task.project.scm_update_on_launch is True:
                latest_project_update = self.get_latest_project_update(task)
                if self.should_update_related_project(task, latest_project_update):
                    project_task = self.create_project_update(task)
                    created_dependencies.append(project_task)
                    dependencies.append(project_task)
                else:
                    dependencies.append(latest_project_update)

            # Inventory created 2 seconds behind job
            try:
                start_args = json.loads(decrypt_field(task, field_name="start_args"))
            except ValueError:
                start_args = dict()
            for inventory_source in [invsrc for invsrc in self.all_inventory_sources if invsrc.inventory == task.inventory]:
                if "inventory_sources_already_updated" in start_args and inventory_source.id in start_args['inventory_sources_already_updated']:
                    continue
                if not inventory_source.update_on_launch:
                    continue
                latest_inventory_update = self.get_latest_inventory_update(inventory_source)
                if self.should_update_inventory_source(task, latest_inventory_update):
                    inventory_task = self.create_inventory_update(task, inventory_source)
                    created_dependencies.append(inventory_task)
                    dependencies.append(inventory_task)
                else:
                    dependencies.append(latest_inventory_update)

            if len(dependencies) > 0:
                self.capture_chain_failure_dependencies(task, dependencies)

        UnifiedJob.objects.filter(pk__in=[task.pk for task in undeped_tasks]).update(dependencies_processed=True)
        return created_dependencies

    def process_pending_tasks(self, pending_tasks):
        running_workflow_templates = {wf.unified_job_template_id for wf in self.get_running_workflow_jobs()}
        tasks_to_update_job_explanation = []
        for task in pending_tasks:
            if self.start_task_limit <= 0:
                break
            blocked_by = self.job_blocked_by(task)
            if blocked_by:
                task.log_lifecycle("blocked", blocked_by=blocked_by)
                job_explanation = gettext_noop(f"waiting for {blocked_by._meta.model_name}-{blocked_by.id} to finish")
                if task.job_explanation != job_explanation:
                    if task.created < (tz_now() - self.time_delta_job_explanation):
                        task.job_explanation = job_explanation
                        tasks_to_update_job_explanation.append(task)
                continue

            found_acceptable_queue = False
            preferred_instance_groups = task.preferred_instance_groups

            if isinstance(task, WorkflowJob):
                if task.unified_job_template_id in running_workflow_templates:
                    if not task.allow_simultaneous:
                        logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
                        continue
                else:
                    running_workflow_templates.add(task.unified_job_template_id)
                self.start_task(task, None, task.get_jobs_fail_chain(), None)
                continue

            # Determine if there is control capacity for the task
            if task.capacity_type == 'control':
                control_impact = task.task_impact + settings.AWX_CONTROL_NODE_TASK_IMPACT
            else:
                control_impact = settings.AWX_CONTROL_NODE_TASK_IMPACT
            control_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                task, self.graph[settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME]['instances'], impact=control_impact, capacity_type='control'
            )
            if not control_instance:
                self.task_needs_capacity(task, tasks_to_update_job_explanation)
                logger.debug(f"Skipping task {task.log_format} in pending, not enough capacity left on controlplane to control new tasks")
                continue

            task.controller_node = control_instance.hostname

            # All task.capacity_type == 'control' jobs should run on control plane, no need to loop over instance groups
            if task.capacity_type == 'control':
                task.execution_node = control_instance.hostname
                control_instance.remaining_capacity = max(0, control_instance.remaining_capacity - control_impact)
                control_instance.jobs_running += 1
                self.dependency_graph.add_job(task)
                execution_instance = self.real_instances[control_instance.hostname]
                task.log_lifecycle("controller_node_chosen")
                task.log_lifecycle("execution_node_chosen")
                self.start_task(task, self.controlplane_ig, task.get_jobs_fail_chain(), execution_instance)
                found_acceptable_queue = True
                continue

            for rampart_group in preferred_instance_groups:
                if rampart_group.is_container_group:
                    control_instance.jobs_running += 1
                    self.dependency_graph.add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), None)
                    found_acceptable_queue = True
                    break

                # TODO: remove this after we have confidence that OCP control nodes are reporting node_type=control
                if settings.IS_K8S and task.capacity_type == 'execution':
                    logger.debug("Skipping group {}, task cannot run on control plane".format(rampart_group.name))
                    continue
                # at this point we know the instance group is NOT a container group
                # because if it was, it would have started the task and broken out of the loop.
                execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                    task, self.graph[rampart_group.name]['instances'], add_hybrid_control_cost=True
                ) or InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'], capacity_type=task.capacity_type)

                if execution_instance:
                    task.execution_node = execution_instance.hostname
                    # If our execution instance is a hybrid, prefer to do control tasks there as well.
                    if execution_instance.node_type == 'hybrid':
                        control_instance = execution_instance
                        task.controller_node = execution_instance.hostname

                    control_instance.remaining_capacity = max(0, control_instance.remaining_capacity - settings.AWX_CONTROL_NODE_TASK_IMPACT)
                    task.log_lifecycle("controller_node_chosen")
                    if control_instance != execution_instance:
                        control_instance.jobs_running += 1
                    execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
                    execution_instance.jobs_running += 1
                    task.log_lifecycle("execution_node_chosen")
                    logger.debug(
                        "Starting {} in group {} instance {} (remaining_capacity={})".format(
                            task.log_format, rampart_group.name, execution_instance.hostname, execution_instance.remaining_capacity
                        )
                    )
                    execution_instance = self.real_instances[execution_instance.hostname]
                    self.dependency_graph.add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance)
                    found_acceptable_queue = True
                    break
                else:
                    logger.debug(
                        "No instance available in group {} to run job {} w/ capacity requirement {}".format(
                            rampart_group.name, task.log_format, task.task_impact
                        )
                    )
            if not found_acceptable_queue:
                self.task_needs_capacity(task, tasks_to_update_job_explanation)
        UnifiedJob.objects.bulk_update(tasks_to_update_job_explanation, ['job_explanation'])

    def task_needs_capacity(self, task, tasks_to_update_job_explanation):
        task.log_lifecycle("needs_capacity")
        job_explanation = gettext_noop("This job is not ready to start because there is not enough available capacity.")
        if task.job_explanation != job_explanation:
            if task.created < (tz_now() - self.time_delta_job_explanation):
                # Many launched jobs are immediately blocked, but most blocks will resolve in a few seconds.
                # Therefore we should only update the job_explanation after some time has elapsed to
                # prevent excessive task saves.
                task.job_explanation = job_explanation
                tasks_to_update_job_explanation.append(task)
        logger.debug("{} couldn't be scheduled on graph, waiting for next cycle".format(task.log_format))

    def timeout_approval_node(self):
        workflow_approvals = WorkflowApproval.objects.filter(status='pending')
        now = tz_now()
        for task in workflow_approvals:
            approval_timeout_seconds = timedelta(seconds=task.timeout)
            if task.timeout == 0:
                continue
            if (now - task.created) >= approval_timeout_seconds:
                timeout_message = _("The approval node {name} ({pk}) has expired after {timeout} seconds.").format(
                    name=task.name, pk=task.pk, timeout=task.timeout
                )
                logger.warning(timeout_message)
                task.timed_out = True
                task.status = 'failed'
                task.send_approval_notification('timed_out')
                task.websocket_emit_status(task.status)
                task.job_explanation = timeout_message
                task.save(update_fields=['status', 'job_explanation', 'timed_out'])

    def reap_jobs_from_orphaned_instances(self):
        # discover jobs that are in running state but aren't on an execution node
        # that we know about; this is a fairly rare event, but it can occur if you,
        # for example, SQL backup an awx install with running jobs and restore it
        # elsewhere
        for j in UnifiedJob.objects.filter(
            status__in=['pending', 'waiting', 'running'],
        ).exclude(execution_node__in=Instance.objects.exclude(node_type='hop').values_list('hostname', flat=True)):
            if j.execution_node and not j.is_container_group_task:
                logger.error(f'{j.execution_node} is not a registered instance; reaping {j.log_format}')
                reap_job(j, 'failed')

    def process_tasks(self, all_sorted_tasks):
        running_tasks = [t for t in all_sorted_tasks if t.status in ['waiting', 'running']]

        self.process_running_tasks(running_tasks)

        pending_tasks = [t for t in all_sorted_tasks if t.status == 'pending']
        undeped_tasks = [t for t in pending_tasks if not t.dependencies_processed]
        dependencies = self.generate_dependencies(undeped_tasks)
        self.process_pending_tasks(dependencies)
        self.process_pending_tasks(pending_tasks)

    def _schedule(self):
        finished_wfjs = []
        all_sorted_tasks = self.get_tasks()

        self.after_lock_init()

        if len(all_sorted_tasks) > 0:
            # TODO: Deal with
            # latest_project_updates = self.get_latest_project_update_tasks(all_sorted_tasks)
            # self.process_latest_project_updates(latest_project_updates)

            # latest_inventory_updates = self.get_latest_inventory_update_tasks(all_sorted_tasks)
            # self.process_latest_inventory_updates(latest_inventory_updates)

            self.all_inventory_sources = self.get_inventory_source_tasks(all_sorted_tasks)

            running_workflow_tasks = self.get_running_workflow_jobs()
            finished_wfjs = self.process_finished_workflow_jobs(running_workflow_tasks)

            previously_running_workflow_tasks = running_workflow_tasks
            running_workflow_tasks = []
            for workflow_job in previously_running_workflow_tasks:
                if workflow_job.status == 'running':
                    running_workflow_tasks.append(workflow_job)
                else:
                    logger.debug('Removed %s from job spawning consideration.', workflow_job.log_format)

            self.spawn_workflow_graph_jobs(running_workflow_tasks)

            self.timeout_approval_node()
            self.reap_jobs_from_orphaned_instances()

            self.process_tasks(all_sorted_tasks)
        return finished_wfjs

    def schedule(self):
        # Lock
        with advisory_lock('task_manager_lock', wait=False) as acquired:
            with transaction.atomic():
                if acquired is False:
                    logger.debug("Not running scheduler, another task holds lock")
                    return
                logger.debug("Starting Scheduler")
                with task_manager_bulk_reschedule():
                    self._schedule()
                logger.debug("Finishing Scheduler")
Example #10
class TaskManager(TaskBase):
    def __init__(self):
        """
        Do NOT put database queries or other potentially expensive operations
        in the task manager init. The task manager object is created every time a
        job is created or transitions state, and every 30 seconds on each tower node.
        More often than not, the object is destroyed quickly because the NOOP case is hit.

        The NOOP case is short-circuit logic. If the task manager realizes that another instance
        of the task manager is already running, then it short-circuits and decides not to run.
        """
        # start task limit indicates how many pending jobs can be started on this
        # .schedule() run. Starting jobs is expensive, and there is code in place to reap
        # the task manager after 5 minutes. At scale, the task manager can easily take more than
        # 5 minutes to start pending jobs. If this limit is reached, the remaining pending
        # jobs will not be started until the next task manager cycle.
        self.time_delta_job_explanation = timedelta(seconds=30)
        super().__init__(prefix="task_manager")

    def after_lock_init(self):
        """
        Init AFTER we know this instance of the task manager will run because the lock is acquired.
        """
        self.dependency_graph = DependencyGraph()
        self.instances = TaskManagerInstances(self.all_tasks)
        self.instance_groups = TaskManagerInstanceGroups(
            instances_by_hostname=self.instances)
        self.controlplane_ig = self.instance_groups.controlplane_ig

    def job_blocked_by(self, task):
        # TODO: I'm not happy with this, I think blocking behavior should be decided outside of the dependency graph
        #       in the old task manager this was handled as a method on each task object outside of the graph and
        #       probably has the side effect of cutting down *a lot* of the logic from this task manager class
        blocked_by = self.dependency_graph.task_blocked_by(task)
        if blocked_by:
            return blocked_by

        for dep in task.dependent_jobs.all():
            if dep.status in ACTIVE_STATES:
                return dep
            # if we detect a failed or error dependency, go ahead and fail this
            # task. The errback on the dependency takes some time to trigger,
            # and we don't want the task to enter running state if its
            # dependency has failed or errored.
            elif dep.status in ("error", "failed"):
                task.status = 'failed'
                task.job_explanation = 'Previous Task Failed: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % (
                    get_type_for_model(type(dep)),
                    dep.name,
                    dep.id,
                )
                task.save(update_fields=['status', 'job_explanation'])
                task.websocket_emit_status('failed')
                return dep

        return None

    @timeit
    def start_task(self,
                   task,
                   instance_group,
                   dependent_tasks=None,
                   instance=None):
        self.dependency_graph.add_job(task)
        self.subsystem_metrics.inc(f"{self.prefix}_tasks_started", 1)
        self.start_task_limit -= 1
        if self.start_task_limit == 0:
            # schedule another run immediately after this task manager
            ScheduleTaskManager().schedule()
        from awx.main.tasks.system import handle_work_error, handle_work_success

        # update capacity for control node and execution node
        if task.controller_node:
            self.instances[task.controller_node].consume_capacity(
                settings.AWX_CONTROL_NODE_TASK_IMPACT)
        if task.execution_node:
            self.instances[task.execution_node].consume_capacity(
                task.task_impact)

        dependent_tasks = dependent_tasks or []

        task_actual = {
            'type': get_type_for_model(type(task)),
            'id': task.id,
        }
        dependencies = [{
            'type': get_type_for_model(type(t)),
            'id': t.id
        } for t in dependent_tasks]

        task.status = 'waiting'

        (start_status, opts) = task.pre_start()
        if not start_status:
            task.status = 'failed'
            if task.job_explanation:
                task.job_explanation += ' '
            task.job_explanation += 'Task failed pre-start check.'
            task.save()
            # TODO: run error handler to fail sub-tasks and send notifications
        else:
            if type(task) is WorkflowJob:
                task.status = 'running'
                task.send_notification_templates('running')
                logger.debug('Transitioning %s to running status.',
                             task.log_format)
                # Call this to ensure Workflow nodes get spawned in timely manner
                ScheduleWorkflowManager().schedule()
            # at this point we already have control/execution nodes selected for the following cases
            else:
                task.instance_group = instance_group
                execution_node_msg = f' and execution node {task.execution_node}' if task.execution_node else ''
                logger.debug(
                    f'Submitting job {task.log_format} controlled by {task.controller_node} to instance group {instance_group.name}{execution_node_msg}.'
                )
            with disable_activity_stream():
                task.celery_task_id = str(uuid.uuid4())
                task.save()
                task.log_lifecycle("waiting")

        def post_commit():
            if task.status != 'failed' and type(task) is not WorkflowJob:
                # Before task is dispatched, ensure that job_event partitions exist
                create_partition(task.event_class._meta.db_table,
                                 start=task.created)
                task_cls = task._get_task_class()
                task_cls.apply_async(
                    [task.pk],
                    opts,
                    queue=task.get_queue_name(),
                    uuid=task.celery_task_id,
                    callbacks=[{
                        'task': handle_work_success.name,
                        'kwargs': {
                            'task_actual': task_actual
                        }
                    }],
                    errbacks=[{
                        'task': handle_work_error.name,
                        'args': [task.celery_task_id],
                        'kwargs': {
                            'subtasks': [task_actual] + dependencies
                        }
                    }],
                )

        task.websocket_emit_status(task.status)  # adds to on_commit
        connection.on_commit(post_commit)

    @timeit
    def process_running_tasks(self, running_tasks):
        for task in running_tasks:
            if type(task) is WorkflowJob:
                ScheduleWorkflowManager().schedule()
            self.dependency_graph.add_job(task)

    @timeit
    def process_pending_tasks(self, pending_tasks):
        tasks_to_update_job_explanation = []
        for task in pending_tasks:
            if self.start_task_limit <= 0:
                break
            if self.timed_out():
                logger.warning(
                    "Task manager has reached time out while processing pending jobs, exiting loop early"
                )
                break
            blocked_by = self.job_blocked_by(task)
            if blocked_by:
                self.subsystem_metrics.inc(f"{self.prefix}_tasks_blocked", 1)
                task.log_lifecycle("blocked", blocked_by=blocked_by)
                job_explanation = gettext_noop(
                    f"waiting for {blocked_by._meta.model_name}-{blocked_by.id} to finish"
                )
                if task.job_explanation != job_explanation:
                    if task.created < (tz_now() -
                                       self.time_delta_job_explanation):
                        task.job_explanation = job_explanation
                        tasks_to_update_job_explanation.append(task)
                continue

            if isinstance(task, WorkflowJob):
                # Previously we were tracking allow_simultaneous blocking both here and in DependencyGraph.
                # Double check that using just the DependencyGraph works for Workflows and Sliced Jobs.
                self.start_task(task, None, task.get_jobs_fail_chain(), None)
                continue

            found_acceptable_queue = False

            preferred_instance_groups = self.instance_groups.get_instance_groups_from_task_cache(
                task)

            # Determine if there is control capacity for the task
            if task.capacity_type == 'control':
                control_impact = task.task_impact + settings.AWX_CONTROL_NODE_TASK_IMPACT
            else:
                control_impact = settings.AWX_CONTROL_NODE_TASK_IMPACT
            control_instance = self.instance_groups.fit_task_to_most_remaining_capacity_instance(
                task,
                instance_group_name=settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME,
                impact=control_impact,
                capacity_type='control')
            if not control_instance:
                self.task_needs_capacity(task, tasks_to_update_job_explanation)
                logger.debug(
                    f"Skipping task {task.log_format} in pending, not enough capacity left on controlplane to control new tasks"
                )
                continue

            task.controller_node = control_instance.hostname

            # All task.capacity_type == 'control' jobs should run on control plane, no need to loop over instance groups
            if task.capacity_type == 'control':
                task.execution_node = control_instance.hostname
                execution_instance = self.instances[
                    control_instance.hostname].obj
                task.log_lifecycle("controller_node_chosen")
                task.log_lifecycle("execution_node_chosen")
                self.start_task(task, self.controlplane_ig,
                                task.get_jobs_fail_chain(), execution_instance)
                found_acceptable_queue = True
                continue

            for instance_group in preferred_instance_groups:
                if instance_group.is_container_group:
                    self.start_task(task, instance_group,
                                    task.get_jobs_fail_chain(), None)
                    found_acceptable_queue = True
                    break

                # TODO: remove this after we have confidence that OCP control nodes are reporting node_type=control
                if settings.IS_K8S and task.capacity_type == 'execution':
                    logger.debug(
                        "Skipping group {}, task cannot run on control plane".
                        format(instance_group.name))
                    continue
                # at this point we know the instance group is NOT a container group
                # because if it was, it would have started the task and broken out of the loop.
                execution_instance = self.instance_groups.fit_task_to_most_remaining_capacity_instance(
                    task,
                    instance_group_name=instance_group.name,
                    add_hybrid_control_cost=True
                ) or self.instance_groups.find_largest_idle_instance(
                    instance_group_name=instance_group.name,
                    capacity_type=task.capacity_type)

                if execution_instance:
                    task.execution_node = execution_instance.hostname
                    # If our execution instance is a hybrid, prefer to do control tasks there as well.
                    if execution_instance.node_type == 'hybrid':
                        control_instance = execution_instance
                        task.controller_node = execution_instance.hostname

                    task.log_lifecycle("controller_node_chosen")
                    task.log_lifecycle("execution_node_chosen")
                    logger.debug(
                        "Starting {} in group {} instance {} (remaining_capacity={})"
                        .format(task.log_format, instance_group.name,
                                execution_instance.hostname,
                                execution_instance.remaining_capacity))
                    execution_instance = self.instances[
                        execution_instance.hostname].obj
                    self.start_task(task, instance_group,
                                    task.get_jobs_fail_chain(),
                                    execution_instance)
                    found_acceptable_queue = True
                    break
                else:
                    logger.debug(
                        "No instance available in group {} to run job {} w/ capacity requirement {}"
                        .format(instance_group.name, task.log_format,
                                task.task_impact))
            if not found_acceptable_queue:
                self.task_needs_capacity(task, tasks_to_update_job_explanation)
        UnifiedJob.objects.bulk_update(tasks_to_update_job_explanation,
                                       ['job_explanation'])

    def task_needs_capacity(self, task, tasks_to_update_job_explanation):
        task.log_lifecycle("needs_capacity")
        job_explanation = gettext_noop(
            "This job is not ready to start because there is not enough available capacity."
        )
        if task.job_explanation != job_explanation:
            if task.created < (tz_now() - self.time_delta_job_explanation):
                # Many launched jobs are immediately blocked, but most blocks will resolve in a few seconds.
                # Therefore we should only update the job_explanation after some time has elapsed to
                # prevent excessive task saves.
                task.job_explanation = job_explanation
                tasks_to_update_job_explanation.append(task)
        logger.debug(
            "{} couldn't be scheduled on graph, waiting for next cycle".format(
                task.log_format))

    def reap_jobs_from_orphaned_instances(self):
        # discover jobs that are in running state but aren't on an execution node
        # that we know about; this is a fairly rare event, but it can occur if you,
        # for example, SQL backup an awx install with running jobs and restore it
        # elsewhere
        for j in UnifiedJob.objects.filter(status__in=[
                'pending', 'waiting', 'running'
        ], ).exclude(execution_node__in=Instance.objects.exclude(
                node_type='hop').values_list('hostname', flat=True)):
            if j.execution_node and not j.is_container_group_task:
                logger.error(
                    f'{j.execution_node} is not a registered instance; reaping {j.log_format}'
                )
                reap_job(j, 'failed')

    def process_tasks(self):
        running_tasks = [
            t for t in self.all_tasks if t.status in ['waiting', 'running']
        ]
        self.process_running_tasks(running_tasks)
        self.subsystem_metrics.inc(f"{self.prefix}_running_processed",
                                   len(running_tasks))

        pending_tasks = [t for t in self.all_tasks if t.status == 'pending']

        self.process_pending_tasks(pending_tasks)
        self.subsystem_metrics.inc(f"{self.prefix}_pending_processed",
                                   len(pending_tasks))

    def timeout_approval_node(self, task):
        if self.timed_out():
            logger.warning(
                "Task manager has reached time out while processing approval nodes, exiting loop early"
            )
            # Do not process any more workflow approval nodes. Stop here.
            # Maybe we should schedule another TaskManager run
            return
        timeout_message = _(
            "The approval node {name} ({pk}) has expired after {timeout} seconds."
        ).format(name=task.name, pk=task.pk, timeout=task.timeout)
        logger.warning(timeout_message)
        task.timed_out = True
        task.status = 'failed'
        task.send_approval_notification('timed_out')
        task.websocket_emit_status(task.status)
        task.job_explanation = timeout_message
        task.save(update_fields=['status', 'job_explanation', 'timed_out'])

    def get_expired_workflow_approvals(self):
        # timeout of 0 indicates that it never expires
        qs = WorkflowApproval.objects.filter(status='pending').exclude(
            timeout=0).filter(expires__lt=tz_now())
        return qs

    @timeit
    def _schedule(self):
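        # Unlike the older TaskManager._schedule() shown earlier, only tasks whose
        # dependencies have already been processed are fetched here.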
        self.get_tasks(
            dict(status__in=["pending", "waiting", "running"],
                 dependencies_processed=True))

        self.after_lock_init()
        self.reap_jobs_from_orphaned_instances()

        if len(self.all_tasks) > 0:
            self.process_tasks()

        for workflow_approval in self.get_expired_workflow_approvals():
            self.timeout_approval_node(workflow_approval)