def handle(self, *arg, **options):
    if options.get('status'):
        print(Control('dispatcher').status())
        return
    if options.get('running'):
        print(Control('dispatcher').running())
        return
    if options.get('reload'):
        return Control('dispatcher').control({'control': 'reload'})

    # It's important to close these because we're _about_ to fork, and we
    # don't want the forked processes to inherit the open sockets
    # for the DB and memcached connections (that way lies race conditions)
    django_connection.close()
    django_cache.close()

    # spawn a daemon thread to periodically enqueue scheduled tasks
    # (like the node heartbeat)
    periodic.run_continuously()

    reaper.reap()
    consumer = None

    try:
        queues = ['tower_broadcast_all', get_local_queuename()]
        consumer = AWXConsumerPG('dispatcher', TaskWorker(), queues, AutoscalePool(min_workers=4))
        consumer.run()
    except KeyboardInterrupt:
        logger.debug('Terminating Task Dispatcher')
        if consumer:
            consumer.stop()
def test_workflow_does_not_reap(self):
    i = Instance(hostname='awx')
    i.save()
    j = WorkflowJob(status='running', execution_node='awx')
    j.save()
    reaper.reap(i)

    assert WorkflowJob.objects.first().status == 'running'
def cleanup(self):
    """
    Perform some internal accounting and cleanup. This is run on
    every cluster node heartbeat:

    1.  Discover worker processes that exited, and recover messages they
        were handling.
    2.  Clean up unnecessary, idle workers.
    3.  Check to see if the database says this node is running any tasks
        that aren't actually running.  If so, reap them.

    IMPORTANT: this function is one of the few places in the dispatcher
    (aside from setting lookups) where we talk to the database.  As such,
    if there's an outage, this method _can_ throw various
    django.db.utils.Error exceptions.  Act accordingly.
    """
    orphaned = []
    for w in self.workers[::]:
        if not w.alive:
            # the worker process has exited
            # 1. take the task it was running and enqueue the error
            #    callbacks
            # 2. take any pending tasks delivered to its queue and
            #    send them to another worker
            logger.error('worker pid:{} is gone (exit={})'.format(w.pid, w.exitcode))
            if w.current_task:
                if w.current_task != 'QUIT':
                    try:
                        for j in UnifiedJob.objects.filter(celery_task_id=w.current_task['uuid']):
                            reaper.reap_job(j, 'failed')
                    except Exception:
                        logger.exception('failed to reap job UUID {}'.format(w.current_task['uuid']))
            orphaned.extend(w.orphaned_tasks)
            self.workers.remove(w)
        elif w.idle and len(self.workers) > self.min_workers:
            # the process has an empty queue (it's idle) and we have
            # more processes in the pool than we need (> min)
            # send this process a message so it will exit gracefully
            # at the next opportunity
            logger.warn('scaling down worker pid:{}'.format(w.pid))
            w.quit()
            self.workers.remove(w)

    for m in orphaned:
        # if all the workers are dead, spawn at least one
        if not len(self.workers):
            self.up()
        idx = random.choice(range(len(self.workers)))
        self.write(idx, m)

    # if the database says a job is running on this node, but it's *not*,
    # then reap it
    running_uuids = []
    for worker in self.workers:
        worker.calculate_managed_tasks()
        running_uuids.extend(list(worker.managed_tasks.keys()))
    reaper.reap(excluded_uuids=running_uuids)
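The docstring above warns that cleanup() can raise django.db.utils errors during a database outage. A minimal sketch of a caller that acts on that warning follows; the safe_cleanup wrapper and its logger are assumptions for illustration, not part of the snippet.

# Illustrative only: a hypothetical guard around cleanup(), assuming a pool
# object that exposes the cleanup() method shown above.
import logging

from django.db import utils as db_utils

logger = logging.getLogger(__name__)


def safe_cleanup(pool):
    try:
        pool.cleanup()
    except db_utils.Error:
        # the database may be unreachable during an outage; skip this
        # heartbeat cycle and try again on the next one
        logger.exception('pool cleanup failed; database may be unavailable')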
def handle(self, *arg, **options):
    if options.get('status'):
        print(Control('dispatcher').status())
        return
    if options.get('running'):
        print(Control('dispatcher').running())
        return
    if options.get('reload'):
        return Control('dispatcher').control({'control': 'reload'})

    # It's important to close these because we're _about_ to fork, and we
    # don't want the forked processes to inherit the open sockets
    # for the DB and memcached connections (that way lies race conditions)
    django_connection.close()
    django_cache.close()

    # spawn a daemon thread to periodically enqueue scheduled tasks
    # (like the node heartbeat)
    periodic.run_continuously()

    reaper.reap()
    consumer = None

    # don't ship external logs inside the dispatcher's parent process
    # this exists to work around a race condition + deadlock bug on fork
    # in cpython itself:
    # https://bugs.python.org/issue37429
    AWXProxyHandler.disable()
    with Connection(settings.BROKER_URL) as conn:
        try:
            bcast = 'tower_broadcast_all'
            queues = [
                Queue(q, Exchange(q), routing_key=q)
                for q in (settings.AWX_CELERY_QUEUES_STATIC + [get_local_queuename()])
            ]
            queues.append(
                Queue(
                    construct_bcast_queue_name(bcast),
                    exchange=Exchange(bcast, type='fanout'),
                    routing_key=bcast,
                    reply=True
                )
            )
            consumer = AWXConsumer(
                'dispatcher',
                conn,
                TaskWorker(),
                queues,
                AutoscalePool(min_workers=4)
            )
            consumer.run()
        except KeyboardInterrupt:
            logger.debug('Terminating Task Dispatcher')
            if consumer:
                consumer.stop()
def inform_cluster_of_shutdown():
    try:
        this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID)
        this_inst.mark_offline(update_last_seen=True, errors=_('Instance received normal shutdown signal'))
        try:
            reaper.reap(this_inst)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(this_inst.hostname))
        logger.warning('Normal shutdown signal for instance {}, '
                       'removed self from capacity pool.'.format(this_inst.hostname))
    except Exception:
        logger.exception('Encountered problem with normal shutdown signal.')
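The snippet above does not show how inform_cluster_of_shutdown is invoked. One plausible wiring, purely an assumption for illustration and not taken from the source, is to register it as a process signal handler so the instance marks itself offline and reaps its jobs before exiting.

# Hypothetical wiring: run the shutdown hook on SIGTERM/SIGINT.
import signal
import sys


def _handle_shutdown(signum, frame):
    # hand off to the shutdown routine above, then exit the process
    inform_cluster_of_shutdown()
    sys.exit(0)


signal.signal(signal.SIGTERM, _handle_shutdown)
signal.signal(signal.SIGINT, _handle_shutdown)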
def handle(self, *arg, **options):
    if options.get('status'):
        print(Control('dispatcher').status())
        return
    if options.get('running'):
        print(Control('dispatcher').running())
        return
    if options.get('reload'):
        return Control('dispatcher').control({'control': 'reload'})

    # It's important to close these because we're _about_ to fork, and we
    # don't want the forked processes to inherit the open sockets
    # for the DB and memcached connections (that way lies race conditions)
    django_connection.close()
    django_cache.close()

    beat = Process(target=self.beat)
    beat.daemon = True
    beat.start()

    reaper.reap()
    consumer = None
    with Connection(settings.BROKER_URL) as conn:
        try:
            bcast = 'tower_broadcast_all'
            queues = [
                Queue(q, Exchange(q), routing_key=q)
                for q in (settings.AWX_CELERY_QUEUES_STATIC + [get_local_queuename()])
            ]
            queues.append(
                Queue(
                    construct_bcast_queue_name(bcast),
                    exchange=Exchange(bcast, type='fanout'),
                    routing_key=bcast,
                    reply=True
                )
            )
            consumer = AWXConsumer(
                'dispatcher',
                conn,
                TaskWorker(),
                queues,
                AutoscalePool(min_workers=4)
            )
            consumer.run()
        except KeyboardInterrupt:
            logger.debug('Terminating Task Dispatcher')
            if consumer:
                consumer.stop()
def test_do_not_reap_excluded_uuids(self, excluded_uuids, fail):
    i = Instance(hostname='awx')
    i.save()
    j = Job(
        status='running',
        execution_node='awx',
        controller_node='',
        start_args='SENSITIVE',
        celery_task_id='abc123',
    )
    j.save()

    # if the UUID is excluded, don't reap it
    reaper.reap(i, excluded_uuids=excluded_uuids)
    job = Job.objects.first()
    if fail:
        assert job.status == 'failed'
        assert 'marked as failed' in job.job_explanation
        assert job.start_args == ''
    else:
        assert job.status == 'running'
def test_should_reap(self, status, fail, execution_node, controller_node, modified):
    i = Instance(hostname='awx')
    i.save()
    j = Job(
        status=status,
        execution_node=execution_node,
        controller_node=controller_node,
        start_args='SENSITIVE',
    )
    j.save()
    if modified:
        # we have to edit the modification time _without_ calling save()
        # (because .save() overwrites it to _now_)
        Job.objects.filter(id=j.id).update(modified=modified)
    reaper.reap(i)
    job = Job.objects.first()
    if fail:
        assert job.status == 'failed'
        assert 'marked as failed' in job.job_explanation
        assert job.start_args == ''
    else:
        assert job.status == status
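Both test_do_not_reap_excluded_uuids and test_should_reap receive their extra arguments from pytest parametrization that is not shown in these snippets. A decorator along the following lines could drive the first test; the case values and expected outcomes are made up for illustration and are not the project's actual test matrix.

# Hypothetical parametrization sketch; only the argument names come from the
# test signatures above.  'abc123' matches the celery_task_id set in
# test_do_not_reap_excluded_uuids.
import pytest


@pytest.mark.parametrize(
    'excluded_uuids, fail',
    [
        (['abc123'], False),  # the job's UUID is excluded -> left running
        ([], True),           # nothing excluded -> reaped and marked failed
    ],
)
def test_do_not_reap_excluded_uuids(self, excluded_uuids, fail):
    ...  # body as shown above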
def cluster_node_heartbeat():
    logger.debug("Cluster node heartbeat task.")
    nowtime = now()
    instance_list = list(Instance.objects.all())
    this_inst = None
    lost_instances = []

    for inst in instance_list:
        if inst.hostname == settings.CLUSTER_HOST_ID:
            this_inst = inst
            break

    inspect_execution_nodes(instance_list)

    for inst in list(instance_list):
        if inst == this_inst:
            continue
        if inst.is_lost(ref_time=nowtime):
            lost_instances.append(inst)
            instance_list.remove(inst)

    if this_inst:
        startup_event = this_inst.is_lost(ref_time=nowtime)
        this_inst.local_health_check()
        if startup_event and this_inst.capacity != 0:
            logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
            return
    else:
        raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))

    # IFF any node has a greater version than we do, then we'll shut down services
    for other_inst in instance_list:
        if other_inst.node_type in ('execution', 'hop'):
            continue
        if other_inst.version == "" or other_inst.version.startswith('ansible-runner'):
            continue
        if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
            logger.error(
                "Host {} reports version {}, but this node {} is at {}, shutting down".format(
                    other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version
                )
            )
            # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
            # The heartbeat task will reset the capacity to the system capacity after upgrade.
            stop_local_services(communicate=False)
            raise RuntimeError("Shutting down.")

    for other_inst in lost_instances:
        try:
            reaper.reap(other_inst)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
        try:
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                deprovision_hostname = other_inst.hostname
                other_inst.delete()
                logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
            elif other_inst.capacity != 0 or (not other_inst.errors):
                other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
                logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
        except DatabaseError as e:
            if 'did not affect any rows' in str(e):
                logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname))
            else:
                logger.exception('Error marking {} as lost'.format(other_inst.hostname))
def cleanup(self):
    """
    Perform some internal accounting and cleanup. This is run on
    every cluster node heartbeat:

    1.  Discover worker processes that exited, and recover messages they
        were handling.
    2.  Clean up unnecessary, idle workers.
    3.  Check to see if the database says this node is running any tasks
        that aren't actually running.  If so, reap them.
    """
    orphaned = []
    for w in self.workers[::]:
        if not w.alive:
            # the worker process has exited
            # 1. take the task it was running and enqueue the error
            #    callbacks
            # 2. take any pending tasks delivered to its queue and
            #    send them to another worker
            logger.error('worker pid:{} is gone (exit={})'.format(w.pid, w.exitcode))
            if w.current_task:
                if w.current_task != 'QUIT':
                    try:
                        for j in UnifiedJob.objects.filter(celery_task_id=w.current_task['uuid']):
                            reaper.reap_job(j, 'failed')
                    except Exception:
                        logger.exception('failed to reap job UUID {}'.format(w.current_task['uuid']))
            orphaned.extend(w.orphaned_tasks)
            self.workers.remove(w)
        elif w.idle and len(self.workers) > self.min_workers:
            # the process has an empty queue (it's idle) and we have
            # more processes in the pool than we need (> min)
            # send this process a message so it will exit gracefully
            # at the next opportunity
            logger.warn('scaling down worker pid:{}'.format(w.pid))
            w.quit()
            self.workers.remove(w)

    for m in orphaned:
        # if all the workers are dead, spawn at least one
        if not len(self.workers):
            self.up()
        idx = random.choice(range(len(self.workers)))
        self.write(idx, m)

    # if the database says a job is running on this node, but it's *not*,
    # then reap it
    running_uuids = []
    for worker in self.workers:
        worker.calculate_managed_tasks()
        running_uuids.extend(list(worker.managed_tasks.keys()))
    try:
        reaper.reap(excluded_uuids=running_uuids)
    except Exception:
        # we _probably_ failed here due to DB connectivity issues, so
        # don't use our logger (it accesses the database for configuration)
        _, _, tb = sys.exc_info()
        traceback.print_tb(tb)
def cleanup(self):
    """
    Perform some internal accounting and cleanup. This is run on
    every cluster node heartbeat:

    1.  Discover worker processes that exited, and recover messages they
        were handling.
    2.  Clean up unnecessary, idle workers.
    3.  Check to see if the database says this node is running any tasks
        that aren't actually running.  If so, reap them.

    IMPORTANT: this function is one of the few places in the dispatcher
    (aside from setting lookups) where we talk to the database.  As such,
    if there's an outage, this method _can_ throw various
    django.db.utils.Error exceptions.  Act accordingly.
    """
    orphaned = []
    for w in self.workers[::]:
        if not w.alive:
            # the worker process has exited
            # 1. take the task it was running and enqueue the error
            #    callbacks
            # 2. take any pending tasks delivered to its queue and
            #    send them to another worker
            logger.error('worker pid:{} is gone (exit={})'.format(w.pid, w.exitcode))
            if w.current_task:
                if w.current_task != 'QUIT':
                    try:
                        for j in UnifiedJob.objects.filter(celery_task_id=w.current_task['uuid']):
                            reaper.reap_job(j, 'failed')
                    except Exception:
                        logger.exception('failed to reap job UUID {}'.format(w.current_task['uuid']))
            orphaned.extend(w.orphaned_tasks)
            self.workers.remove(w)
        elif w.idle and len(self.workers) > self.min_workers:
            # the process has an empty queue (it's idle) and we have
            # more processes in the pool than we need (> min)
            # send this process a message so it will exit gracefully
            # at the next opportunity
            logger.debug('scaling down worker pid:{}'.format(w.pid))
            w.quit()
            self.workers.remove(w)

        if w.alive:
            # if we discover a task manager invocation that's been running
            # too long, reap it (because otherwise it'll just hold the postgres
            # advisory lock forever); the goal of this code is to discover
            # deadlocks or other serious issues in the task manager that cause
            # the task manager to never do more work
            current_task = w.current_task
            if current_task and isinstance(current_task, dict):
                endings = ['tasks.task_manager', 'tasks.dependency_manager', 'tasks.workflow_manager']
                current_task_name = current_task.get('task', '')
                if any(current_task_name.endswith(e) for e in endings):
                    if 'started' not in current_task:
                        w.managed_tasks[current_task['uuid']]['started'] = time.time()
                    age = time.time() - current_task['started']
                    w.managed_tasks[current_task['uuid']]['age'] = age
                    if age > (settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD):
                        logger.error(f'{current_task_name} has held the advisory lock for {age}, sending SIGTERM to {w.pid}')  # noqa
                        os.kill(w.pid, signal.SIGTERM)

    for m in orphaned:
        # if all the workers are dead, spawn at least one
        if not len(self.workers):
            self.up()
        idx = random.choice(range(len(self.workers)))
        self.write(idx, m)

    # if the database says a job is running on this node, but it's *not*,
    # then reap it
    running_uuids = []
    for worker in self.workers:
        worker.calculate_managed_tasks()
        running_uuids.extend(list(worker.managed_tasks.keys()))
    reaper.reap(excluded_uuids=running_uuids)
def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
    logger.debug("Cluster node heartbeat task.")
    nowtime = now()
    instance_list = list(Instance.objects.all())
    this_inst = None
    lost_instances = []

    for inst in instance_list:
        if inst.hostname == settings.CLUSTER_HOST_ID:
            this_inst = inst
            break

    inspect_execution_nodes(instance_list)

    for inst in list(instance_list):
        if inst == this_inst:
            continue
        if inst.is_lost(ref_time=nowtime):
            lost_instances.append(inst)
            instance_list.remove(inst)

    if this_inst:
        startup_event = this_inst.is_lost(ref_time=nowtime)
        last_last_seen = this_inst.last_seen
        this_inst.local_health_check()
        if startup_event and this_inst.capacity != 0:
            logger.warning(f'Rejoining the cluster as instance {this_inst.hostname}. Prior last_seen {last_last_seen}')
            return
        elif not last_last_seen:
            logger.warning(f'Instance does not have recorded last_seen, updating to {nowtime}')
        elif (nowtime - last_last_seen) > timedelta(seconds=settings.CLUSTER_NODE_HEARTBEAT_PERIOD + 2):
            logger.warning(
                f'Heartbeat skew - interval={(nowtime - last_last_seen).total_seconds():.4f}, '
                f'expected={settings.CLUSTER_NODE_HEARTBEAT_PERIOD}'
            )
    else:
        if settings.AWX_AUTO_DEPROVISION_INSTANCES:
            (changed, this_inst) = Instance.objects.register(
                ip_address=os.environ.get('MY_POD_IP'),
                node_type='control',
                uuid=settings.SYSTEM_UUID,
            )
            if changed:
                logger.warning(f'Recreated instance record {this_inst.hostname} after unexpected removal')
            this_inst.local_health_check()
        else:
            raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))

    # IFF any node has a greater version than we do, then we'll shut down services
    for other_inst in instance_list:
        if other_inst.node_type in ('execution', 'hop'):
            continue
        if other_inst.version == "" or other_inst.version.startswith('ansible-runner'):
            continue
        if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
            logger.error(
                "Host {} reports version {}, but this node {} is at {}, shutting down".format(
                    other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version
                )
            )
            # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
            # The heartbeat task will reset the capacity to the system capacity after upgrade.
            stop_local_services(communicate=False)
            raise RuntimeError("Shutting down.")

    for other_inst in lost_instances:
        try:
            explanation = "Job reaped due to instance shutdown"
            reaper.reap(other_inst, job_explanation=explanation)
            reaper.reap_waiting(other_inst, grace_period=0, job_explanation=explanation)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
        try:
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                deprovision_hostname = other_inst.hostname
                other_inst.delete()
                logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
            elif other_inst.capacity != 0 or (not other_inst.errors):
                other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
                logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
        except DatabaseError as e:
            if 'did not affect any rows' in str(e):
                logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname))
            else:
                logger.exception('Error marking {} as lost'.format(other_inst.hostname))

    # Run local reaper
    if worker_tasks is not None:
        active_task_ids = []
        for task_list in worker_tasks.values():
            active_task_ids.extend(task_list)
        reaper.reap(instance=this_inst, excluded_uuids=active_task_ids)
        if max(len(task_list) for task_list in worker_tasks.values()) <= 1:
            reaper.reap_waiting(instance=this_inst, excluded_uuids=active_task_ids, ref_time=datetime.fromisoformat(dispatch_time))
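Across these snippets, reaper.reap is called with several different argument combinations (no arguments, a positional instance, excluded_uuids, job_explanation), and reaper.reap_waiting adds grace_period and ref_time. The stub below captures the interface those call sites assume; the defaults and parameter order are guesses for illustration, only the keyword names are taken from the snippets.

# Hypothetical stub of the reaper interface implied by the call sites above.
def reap(instance=None, excluded_uuids=None, ref_time=None, job_explanation=None):
    """Fail jobs the database says are running on `instance` but aren't."""


def reap_waiting(instance=None, grace_period=60, excluded_uuids=None, ref_time=None, job_explanation=None):
    """Fail jobs stuck in 'waiting' on `instance` beyond the grace period."""


def reap_job(job, status):
    """Move a single UnifiedJob to the given terminal status (e.g. 'failed')."""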