Example #1
    def handle(self, *arg, **options):
        if options.get('status'):
            print(Control('dispatcher').status())
            return
        if options.get('running'):
            print(Control('dispatcher').running())
            return
        if options.get('reload'):
            return Control('dispatcher').control({'control': 'reload'})

        # It's important to close these because we're _about_ to fork, and we
        # don't want the forked processes to inherit the open sockets
        # for the DB and memcached connections (that way lies race conditions)
        django_connection.close()
        django_cache.close()

        # spawn a daemon thread to periodically enqueue scheduled tasks
        # (like the node heartbeat)
        periodic.run_continuously()

        reaper.reap()
        consumer = None

        try:
            queues = ['tower_broadcast_all', get_local_queuename()]
            consumer = AWXConsumerPG('dispatcher', TaskWorker(), queues,
                                     AutoscalePool(min_workers=4))
            consumer.run()
        except KeyboardInterrupt:
            logger.debug('Terminating Task Dispatcher')
            if consumer:
                consumer.stop()
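
The handle() method above is the entry point of a Django management command: it answers status/running/reload queries via Control('dispatcher'), closes inherited DB and cache connections before forking, starts the periodic scheduler, reaps stale jobs, and then runs the consumer loop. A minimal sketch of driving such a command programmatically; the command name run_dispatcher is an assumption (it is not shown in the snippet), as is the flag wiring in add_arguments():

from django.core.management import call_command

# Hypothetical: assumes the handle() above is registered as the
# "run_dispatcher" command and that --status/--running are boolean flags.
call_command('run_dispatcher', status=True)   # prints Control('dispatcher').status()
call_command('run_dispatcher', running=True)  # prints Control('dispatcher').running()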
Example #2
    def test_workflow_does_not_reap(self):
        i = Instance(hostname='awx')
        i.save()
        j = WorkflowJob(status='running', execution_node='awx')
        j.save()
        reaper.reap(i)

        assert WorkflowJob.objects.first().status == 'running'
Example #3
    def cleanup(self):
        """
        Perform some internal accounting and cleanup.  This is run on
        every cluster node heartbeat:

        1.  Discover worker processes that exited, and recover messages they
            were handling.
        2.  Clean up unnecessary, idle workers.
        3.  Check to see if the database says this node is running any tasks
            that aren't actually running.  If so, reap them.

        IMPORTANT: this function is one of the few places in the dispatcher
        (aside from setting lookups) where we talk to the database.  As such,
        if there's an outage, this method _can_ throw various
        django.db.utils.Error exceptions.  Act accordingly.
        """
        orphaned = []
        for w in self.workers[::]:
            if not w.alive:
                # the worker process has exited
                # 1. take the task it was running and enqueue the error
                #    callbacks
                # 2. take any pending tasks delivered to its queue and
                #    send them to another worker
                logger.error('worker pid:{} is gone (exit={})'.format(w.pid, w.exitcode))
                if w.current_task:
                    if w.current_task != 'QUIT':
                        try:
                            for j in UnifiedJob.objects.filter(celery_task_id=w.current_task['uuid']):
                                reaper.reap_job(j, 'failed')
                        except Exception:
                            logger.exception('failed to reap job UUID {}'.format(w.current_task['uuid']))
                orphaned.extend(w.orphaned_tasks)
                self.workers.remove(w)
            elif w.idle and len(self.workers) > self.min_workers:
                # the process has an empty queue (it's idle) and we have
                # more processes in the pool than we need (> min)
                # send this process a message so it will exit gracefully
                # at the next opportunity
                logger.warn('scaling down worker pid:{}'.format(w.pid))
                w.quit()
                self.workers.remove(w)

        for m in orphaned:
            # if all the workers are dead, spawn at least one
            if not len(self.workers):
                self.up()
            idx = random.choice(range(len(self.workers)))
            self.write(idx, m)

        # if the database says a job is running on this node, but it's *not*,
        # then reap it
        running_uuids = []
        for worker in self.workers:
            worker.calculate_managed_tasks()
            running_uuids.extend(list(worker.managed_tasks.keys()))
        reaper.reap(excluded_uuids=running_uuids)
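
The docstring above warns that cleanup() can raise django.db.utils.Error subclasses during a database outage. A minimal sketch, not taken from the project, of how a caller might "act accordingly"; the pool variable and the retry policy are assumptions:

from django.db.utils import Error as DBError

try:
    # cleanup() as defined above; it talks to the database
    pool.cleanup()
except DBError:
    # hypothetical policy: log and let the next heartbeat retry
    logger.warning('database unavailable during dispatcher cleanup; will retry on next heartbeat')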
Example #4
    def handle(self, *arg, **options):
        if options.get('status'):
            print(Control('dispatcher').status())
            return
        if options.get('running'):
            print(Control('dispatcher').running())
            return
        if options.get('reload'):
            return Control('dispatcher').control({'control': 'reload'})

        # It's important to close these because we're _about_ to fork, and we
        # don't want the forked processes to inherit the open sockets
        # for the DB and memcached connections (that way lies race conditions)
        django_connection.close()
        django_cache.close()

        # spawn a daemon thread to periodically enqueue scheduled tasks
        # (like the node heartbeat)
        periodic.run_continuously()

        reaper.reap()
        consumer = None

        # don't ship external logs inside the dispatcher's parent process
        # this exists to work around a race condition + deadlock bug on fork
        # in cpython itself:
        # https://bugs.python.org/issue37429
        AWXProxyHandler.disable()
        with Connection(settings.BROKER_URL) as conn:
            try:
                bcast = 'tower_broadcast_all'
                queues = [
                    Queue(q, Exchange(q), routing_key=q)
                    for q in (settings.AWX_CELERY_QUEUES_STATIC + [get_local_queuename()])
                ]
                queues.append(
                    Queue(
                        construct_bcast_queue_name(bcast),
                        exchange=Exchange(bcast, type='fanout'),
                        routing_key=bcast,
                        reply=True
                    )
                )
                consumer = AWXConsumer(
                    'dispatcher',
                    conn,
                    TaskWorker(),
                    queues,
                    AutoscalePool(min_workers=4)
                )
                consumer.run()
            except KeyboardInterrupt:
                logger.debug('Terminating Task Dispatcher')
                if consumer:
                    consumer.stop()
Example #5
def inform_cluster_of_shutdown():
    try:
        this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID)
        this_inst.mark_offline(update_last_seen=True, errors=_('Instance received normal shutdown signal'))
        try:
            reaper.reap(this_inst)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(this_inst.hostname))
        logger.warning('Normal shutdown signal for instance {}, removed self from capacity pool.'.format(this_inst.hostname))
    except Exception:
        logger.exception('Encountered problem with normal shutdown signal.')
Example #6
    def handle(self, *arg, **options):
        if options.get('status'):
            print(Control('dispatcher').status())
            return
        if options.get('running'):
            print(Control('dispatcher').running())
            return
        if options.get('reload'):
            return Control('dispatcher').control({'control': 'reload'})

        # It's important to close these because we're _about_ to fork, and we
        # don't want the forked processes to inherit the open sockets
        # for the DB and memcached connections (that way lies race conditions)
        django_connection.close()
        django_cache.close()
        beat = Process(target=self.beat)
        beat.daemon = True
        beat.start()

        reaper.reap()
        consumer = None
        with Connection(settings.BROKER_URL) as conn:
            try:
                bcast = 'tower_broadcast_all'
                queues = [
                    Queue(q, Exchange(q), routing_key=q)
                    for q in (settings.AWX_CELERY_QUEUES_STATIC + [get_local_queuename()])
                ]
                queues.append(
                    Queue(
                        construct_bcast_queue_name(bcast),
                        exchange=Exchange(bcast, type='fanout'),
                        routing_key=bcast,
                        reply=True
                    )
                )
                consumer = AWXConsumer(
                    'dispatcher',
                    conn,
                    TaskWorker(),
                    queues,
                    AutoscalePool(min_workers=4)
                )
                consumer.run()
            except KeyboardInterrupt:
                logger.debug('Terminating Task Dispatcher')
                if consumer:
                    consumer.stop()
Example #7
    def test_do_not_reap_excluded_uuids(self, excluded_uuids, fail):
        i = Instance(hostname='awx')
        i.save()
        j = Job(
            status='running',
            execution_node='awx',
            controller_node='',
            start_args='SENSITIVE',
            celery_task_id='abc123',
        )
        j.save()

        # if the UUID is excluded, don't reap it
        reaper.reap(i, excluded_uuids=excluded_uuids)
        job = Job.objects.first()
        if fail:
            assert job.status == 'failed'
            assert 'marked as failed' in job.job_explanation
            assert job.start_args == ''
        else:
            assert job.status == 'running'
Example #8
    def test_should_reap(self, status, fail, execution_node, controller_node, modified):
        i = Instance(hostname='awx')
        i.save()
        j = Job(
            status=status,
            execution_node=execution_node,
            controller_node=controller_node,
            start_args='SENSITIVE',
        )
        j.save()
        if modified:
            # we have to edit the modification time _without_ calling save()
            # (because .save() overwrites it to _now_)
            Job.objects.filter(id=j.id).update(modified=modified)
        reaper.reap(i)
        job = Job.objects.first()
        if fail:
            assert job.status == 'failed'
            assert 'marked as failed' in job.job_explanation
            assert job.start_args == ''
        else:
            assert job.status == status
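
Taken together, the tests in Examples #2, #7, and #8 pin down the observable contract of reaper.reap(instance, excluded_uuids=...): running jobs on the given instance are marked failed with a 'marked as failed' explanation and their sensitive start_args cleared, workflow jobs are left alone, and so is any job whose celery_task_id appears in excluded_uuids. A deliberately simplified sketch consistent with those assertions (the field names come from the tests; the query and defaults are assumptions rather than the project's implementation, and the controller_node/modified-time handling exercised by Example #8 is omitted):

def reap_sketch(instance, excluded_uuids=(), job_explanation=None):
    # Hypothetical sketch, not AWX's actual reaper.reap().
    running = Job.objects.filter(status='running', execution_node=instance.hostname)
    for job in running.exclude(celery_task_id__in=excluded_uuids):
        # Only Job rows are queried, so workflow jobs (Example #2) are untouched.
        job.status = 'failed'
        job.start_args = ''  # the tests assert sensitive start_args are scrubbed
        job.job_explanation = job_explanation or 'Task was marked as failed by the dispatcher'
        job.save(update_fields=['status', 'start_args', 'job_explanation'])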
Example #9
def cluster_node_heartbeat():
    logger.debug("Cluster node heartbeat task.")
    nowtime = now()
    instance_list = list(Instance.objects.all())
    this_inst = None
    lost_instances = []

    for inst in instance_list:
        if inst.hostname == settings.CLUSTER_HOST_ID:
            this_inst = inst
            break

    inspect_execution_nodes(instance_list)

    for inst in list(instance_list):
        if inst == this_inst:
            continue
        if inst.is_lost(ref_time=nowtime):
            lost_instances.append(inst)
            instance_list.remove(inst)

    if this_inst:
        startup_event = this_inst.is_lost(ref_time=nowtime)
        this_inst.local_health_check()
        if startup_event and this_inst.capacity != 0:
            logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
            return
    else:
        raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
    # IFF any node has a greater version than we do, then we'll shut down services
    for other_inst in instance_list:
        if other_inst.node_type in ('execution', 'hop'):
            continue
        if other_inst.version == "" or other_inst.version.startswith('ansible-runner'):
            continue
        if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
            logger.error(
                "Host {} reports version {}, but this node {} is at {}, shutting down".format(
                    other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version
                )
            )
            # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
            # The heartbeat task will reset the capacity to the system capacity after upgrade.
            stop_local_services(communicate=False)
            raise RuntimeError("Shutting down.")

    for other_inst in lost_instances:
        try:
            reaper.reap(other_inst)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
        try:
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                deprovision_hostname = other_inst.hostname
                other_inst.delete()
                logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
            elif other_inst.capacity != 0 or (not other_inst.errors):
                other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
                logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))

        except DatabaseError as e:
            if 'did not affect any rows' in str(e):
                logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname))
            else:
                logger.exception('Error marking {} as lost'.format(other_inst.hostname))
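
The version gate above compares only the part of each version string before the first '-', so git/build suffixes never affect the ordering. A small illustration with made-up version strings; a PEP 440-style class such as packaging.version.Version is assumed here, since the snippet's own Version import is not shown:

from packaging.version import Version

# split('-', 1)[0] drops the '-0.git123' suffix before comparing.
Version('21.3.0-0.git123'.split('-', 1)[0]) > Version('21.2.0'.split('-', 1)[0])  # True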
Example #10
    def cleanup(self):
        """
        Perform some internal accounting and cleanup.  This is run on
        every cluster node heartbeat:

        1.  Discover worker processes that exited, and recover messages they
            were handling.
        2.  Clean up unnecessary, idle workers.
        3.  Check to see if the database says this node is running any tasks
            that aren't actually running.  If so, reap them.
        """
        orphaned = []
        for w in self.workers[::]:
            if not w.alive:
                # the worker process has exited
                # 1. take the task it was running and enqueue the error
                #    callbacks
                # 2. take any pending tasks delivered to its queue and
                #    send them to another worker
                logger.error('worker pid:{} is gone (exit={})'.format(
                    w.pid, w.exitcode))
                if w.current_task:
                    if w.current_task != 'QUIT':
                        try:
                            for j in UnifiedJob.objects.filter(
                                    celery_task_id=w.current_task['uuid']):
                                reaper.reap_job(j, 'failed')
                        except Exception:
                            logger.exception(
                                'failed to reap job UUID {}'.format(
                                    w.current_task['uuid']))
                orphaned.extend(w.orphaned_tasks)
                self.workers.remove(w)
            elif w.idle and len(self.workers) > self.min_workers:
                # the process has an empty queue (it's idle) and we have
                # more processes in the pool than we need (> min)
                # send this process a message so it will exit gracefully
                # at the next opportunity
                logger.warn('scaling down worker pid:{}'.format(w.pid))
                w.quit()
                self.workers.remove(w)

        for m in orphaned:
            # if all the workers are dead, spawn at least one
            if not len(self.workers):
                self.up()
            idx = random.choice(range(len(self.workers)))
            self.write(idx, m)

        # if the database says a job is running on this node, but it's *not*,
        # then reap it
        running_uuids = []
        for worker in self.workers:
            worker.calculate_managed_tasks()
            running_uuids.extend(list(worker.managed_tasks.keys()))
        try:
            reaper.reap(excluded_uuids=running_uuids)
        except Exception:
            # we _probably_ failed here due to DB connectivity issues, so
            # don't use our logger (it accesses the database for configuration)
            _, _, tb = sys.exc_info()
            traceback.print_tb(tb)
Example #11
    def cleanup(self):
        """
        Perform some internal accounting and cleanup.  This is run on
        every cluster node heartbeat:

        1.  Discover worker processes that exited, and recover messages they
            were handling.
        2.  Clean up unnecessary, idle workers.
        3.  Check to see if the database says this node is running any tasks
            that aren't actually running.  If so, reap them.

        IMPORTANT: this function is one of the few places in the dispatcher
        (aside from setting lookups) where we talk to the database.  As such,
        if there's an outage, this method _can_ throw various
        django.db.utils.Error exceptions.  Act accordingly.
        """
        orphaned = []
        for w in self.workers[::]:
            if not w.alive:
                # the worker process has exited
                # 1. take the task it was running and enqueue the error
                #    callbacks
                # 2. take any pending tasks delivered to its queue and
                #    send them to another worker
                logger.error('worker pid:{} is gone (exit={})'.format(
                    w.pid, w.exitcode))
                if w.current_task:
                    if w.current_task != 'QUIT':
                        try:
                            for j in UnifiedJob.objects.filter(
                                    celery_task_id=w.current_task['uuid']):
                                reaper.reap_job(j, 'failed')
                        except Exception:
                            logger.exception(
                                'failed to reap job UUID {}'.format(
                                    w.current_task['uuid']))
                orphaned.extend(w.orphaned_tasks)
                self.workers.remove(w)
            elif w.idle and len(self.workers) > self.min_workers:
                # the process has an empty queue (it's idle) and we have
                # more processes in the pool than we need (> min)
                # send this process a message so it will exit gracefully
                # at the next opportunity
                logger.debug('scaling down worker pid:{}'.format(w.pid))
                w.quit()
                self.workers.remove(w)
            if w.alive:
                # if we discover a task manager invocation that's been running
                # too long, reap it (because otherwise it'll just hold the postgres
                # advisory lock forever); the goal of this code is to discover
                # deadlocks or other serious issues in the task manager that cause
                # the task manager to never do more work
                current_task = w.current_task
                if current_task and isinstance(current_task, dict):
                    endings = [
                        'tasks.task_manager', 'tasks.dependency_manager',
                        'tasks.workflow_manager'
                    ]
                    current_task_name = current_task.get('task', '')
                    if any(current_task_name.endswith(e) for e in endings):
                        if 'started' not in current_task:
                            w.managed_tasks[
                                current_task['uuid']]['started'] = time.time()
                        age = time.time() - current_task['started']
                        w.managed_tasks[current_task['uuid']]['age'] = age
                        if age > (settings.TASK_MANAGER_TIMEOUT +
                                  settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD):
                            logger.error(
                                f'{current_task_name} has held the advisory lock for {age}, sending SIGTERM to {w.pid}'
                            )  # noqa
                            os.kill(w.pid, signal.SIGTERM)

        for m in orphaned:
            # if all the workers are dead, spawn at least one
            if not len(self.workers):
                self.up()
            idx = random.choice(range(len(self.workers)))
            self.write(idx, m)

        # if the database says a job is running on this node, but it's *not*,
        # then reap it
        running_uuids = []
        for worker in self.workers:
            worker.calculate_managed_tasks()
            running_uuids.extend(list(worker.managed_tasks.keys()))
        reaper.reap(excluded_uuids=running_uuids)
Example #12
def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
    logger.debug("Cluster node heartbeat task.")
    nowtime = now()
    instance_list = list(Instance.objects.all())
    this_inst = None
    lost_instances = []

    for inst in instance_list:
        if inst.hostname == settings.CLUSTER_HOST_ID:
            this_inst = inst
            break

    inspect_execution_nodes(instance_list)

    for inst in list(instance_list):
        if inst == this_inst:
            continue
        if inst.is_lost(ref_time=nowtime):
            lost_instances.append(inst)
            instance_list.remove(inst)

    if this_inst:
        startup_event = this_inst.is_lost(ref_time=nowtime)
        last_last_seen = this_inst.last_seen
        this_inst.local_health_check()
        if startup_event and this_inst.capacity != 0:
            logger.warning(
                f'Rejoining the cluster as instance {this_inst.hostname}. Prior last_seen {last_last_seen}'
            )
            return
        elif not last_last_seen:
            logger.warning(
                f'Instance does not have recorded last_seen, updating to {nowtime}'
            )
        elif (nowtime - last_last_seen) > timedelta(
                seconds=settings.CLUSTER_NODE_HEARTBEAT_PERIOD + 2):
            logger.warning(
                f'Heartbeat skew - interval={(nowtime - last_last_seen).total_seconds():.4f}, expected={settings.CLUSTER_NODE_HEARTBEAT_PERIOD}'
            )
    else:
        if settings.AWX_AUTO_DEPROVISION_INSTANCES:
            (changed, this_inst) = Instance.objects.register(
                ip_address=os.environ.get('MY_POD_IP'),
                node_type='control',
                uuid=settings.SYSTEM_UUID)
            if changed:
                logger.warning(
                    f'Recreated instance record {this_inst.hostname} after unexpected removal'
                )
            this_inst.local_health_check()
        else:
            raise RuntimeError("Cluster Host Not Found: {}".format(
                settings.CLUSTER_HOST_ID))
    # IFF any node has a greater version than we do, then we'll shut down services
    for other_inst in instance_list:
        if other_inst.node_type in ('execution', 'hop'):
            continue
        if other_inst.version == "" or other_inst.version.startswith(
                'ansible-runner'):
            continue
        if Version(other_inst.version.split('-', 1)[0]) > Version(
                awx_application_version.split('-',
                                              1)[0]) and not settings.DEBUG:
            logger.error(
                "Host {} reports version {}, but this node {} is at {}, shutting down"
                .format(other_inst.hostname, other_inst.version,
                        this_inst.hostname, this_inst.version))
            # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
            # The heartbeat task will reset the capacity to the system capacity after upgrade.
            stop_local_services(communicate=False)
            raise RuntimeError("Shutting down.")

    for other_inst in lost_instances:
        try:
            explanation = "Job reaped due to instance shutdown"
            reaper.reap(other_inst, job_explanation=explanation)
            reaper.reap_waiting(other_inst,
                                grace_period=0,
                                job_explanation=explanation)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(
                other_inst.hostname))
        try:
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                deprovision_hostname = other_inst.hostname
                other_inst.delete()
                logger.info("Host {} Automatically Deprovisioned.".format(
                    deprovision_hostname))
            elif other_inst.capacity != 0 or (not other_inst.errors):
                other_inst.mark_offline(errors=_(
                    'Another cluster node has determined this instance to be unresponsive'
                ))
                logger.error(
                    "Host {} last checked in at {}, marked as lost.".format(
                        other_inst.hostname, other_inst.last_seen))

        except DatabaseError as e:
            if 'did not affect any rows' in str(e):
                logger.debug('Another instance has marked {} as lost'.format(
                    other_inst.hostname))
            else:
                logger.exception('Error marking {} as lost'.format(
                    other_inst.hostname))

    # Run local reaper
    if worker_tasks is not None:
        active_task_ids = []
        for task_list in worker_tasks.values():
            active_task_ids.extend(task_list)
        reaper.reap(instance=this_inst, excluded_uuids=active_task_ids)
        if max(len(task_list) for task_list in worker_tasks.values()) <= 1:
            reaper.reap_waiting(instance=this_inst,
                                excluded_uuids=active_task_ids,
                                ref_time=datetime.fromisoformat(dispatch_time))
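
Example #12 feeds the local reaper from the periodic heartbeat itself: worker_tasks maps each dispatcher worker to the list of task UUIDs it currently holds (the key type is not shown above), and dispatch_time is an ISO-8601 timestamp parsed back with datetime.fromisoformat(). A hypothetical invocation sketch with illustrative values:

from datetime import datetime, timezone

# Hypothetical call: one worker holds task 'abc123', the other is idle,
# so reap_waiting() also runs.
cluster_node_heartbeat(
    dispatch_time=datetime.now(timezone.utc).isoformat(),
    worker_tasks={'w1': ['abc123'], 'w2': []},
)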