def cluster_node_heartbeat():
    """Run one heartbeat cycle for the node identified by ``settings.CLUSTER_HOST_ID``.

    The cycle, in order:

    1. Load every ``Instance`` and hand the list to ``inspect_execution_nodes``.
    2. Split the other instances into lost / not lost via ``is_lost``.
    3. Run this node's ``local_health_check``; when the node was itself lost
       and now reports non-zero capacity, log the rejoin and return early.
    4. Stop local services if a remaining instance reports a newer version.
    5. Reap jobs of each lost instance, then delete it or mark it offline.

    Raises:
        RuntimeError: when no instance matches ``settings.CLUSTER_HOST_ID``,
            or after local services were stopped for a version mismatch.
    """
    logger.debug("Cluster node heartbeat task.")
    ref_time = now()
    all_nodes = list(Instance.objects.all())

    # First record whose hostname matches this host, or None.
    local_node = next((node for node in all_nodes if node.hostname == settings.CLUSTER_HOST_ID), None)

    inspect_execution_nodes(all_nodes)

    # One pass: the local node is always kept; every other node is kept only
    # when it is not lost relative to ref_time.
    remaining_nodes = []
    lost_nodes = []
    for node in all_nodes:
        if node == local_node:
            remaining_nodes.append(node)
        elif node.is_lost(ref_time=ref_time):
            lost_nodes.append(node)
        else:
            remaining_nodes.append(node)

    if not local_node:
        raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))

    # Capture the lost state *before* the health check runs.
    was_lost = local_node.is_lost(ref_time=ref_time)
    local_node.local_health_check()
    if was_lost and local_node.capacity != 0:
        logger.warning('Rejoining the cluster as instance {}.'.format(local_node.hostname))
        return

    # A peer running a newer version than ours means we must shut down.
    for peer in remaining_nodes:
        skip_peer = peer.node_type in ('execution', 'hop') or peer.version == "" or peer.version.startswith('ansible-runner')
        if skip_peer:
            continue
        peer_is_newer = Version(peer.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0])
        if not peer_is_newer or settings.DEBUG:
            continue
        logger.error(
            "Host {} reports version {}, but this node {} is at {}, shutting down".format(
                peer.hostname, peer.version, local_node.hostname, local_node.version
            )
        )
        # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
        # The heartbeat task will reset the capacity to the system capacity after upgrade.
        stop_local_services(communicate=False)
        raise RuntimeError("Shutting down.")

    for peer in lost_nodes:
        # Reaping is best effort; a failure must not block the offline marking below.
        try:
            reaper.reap(peer)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(peer.hostname))

        try:
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                # Remember the hostname before the record is deleted.
                removed_hostname = peer.hostname
                peer.delete()
                logger.info("Host {} Automatically Deprovisioned.".format(removed_hostname))
            elif peer.capacity != 0 or (not peer.errors):
                peer.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
                logger.error("Host {} last checked in at {}, marked as lost.".format(peer.hostname, peer.last_seen))
        except DatabaseError as exc:
            if 'did not affect any rows' in str(exc):
                logger.debug('Another instance has marked {} as lost'.format(peer.hostname))
            else:
                logger.exception('Error marking {} as lost'.format(peer.hostname))
def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
    """Run one heartbeat cycle for the node identified by ``settings.CLUSTER_HOST_ID``.

    The cycle refreshes this node's health record, stops local services when
    another remaining instance reports a newer version, reaps and retires
    instances considered lost, and finally reaps this node's own jobs that
    are not held by a local worker.

    Args:
        dispatch_time: ISO-8601 string parsed with ``datetime.fromisoformat``
            and passed as ``ref_time`` when reaping this node's waiting jobs.
            Only read when ``worker_tasks`` is not None, in which case it
            must be a valid string.
        worker_tasks: mapping whose values are lists of task ids currently
            held by local workers (keys are presumably worker identifiers;
            confirm with the caller). ``None`` skips the local reaper; an
            empty mapping is treated as "no worker holds any task".

    Raises:
        RuntimeError: when no instance matches ``settings.CLUSTER_HOST_ID``
            and auto-deprovisioning is disabled, or after local services
            were stopped because of a version mismatch.
    """
    logger.debug("Cluster node heartbeat task.")
    nowtime = now()
    instance_list = list(Instance.objects.all())
    this_inst = None
    lost_instances = []

    for inst in instance_list:
        if inst.hostname == settings.CLUSTER_HOST_ID:
            this_inst = inst
            break

    inspect_execution_nodes(instance_list)

    # Iterate over a copy because lost instances are removed from instance_list.
    for inst in list(instance_list):
        if inst == this_inst:
            continue
        if inst.is_lost(ref_time=nowtime):
            lost_instances.append(inst)
            instance_list.remove(inst)

    if this_inst:
        # Capture the lost state and previous last_seen before the health check runs.
        startup_event = this_inst.is_lost(ref_time=nowtime)
        last_last_seen = this_inst.last_seen
        this_inst.local_health_check()
        if startup_event and this_inst.capacity != 0:
            logger.warning(f'Rejoining the cluster as instance {this_inst.hostname}. Prior last_seen {last_last_seen}')
            return
        elif not last_last_seen:
            logger.warning(f'Instance does not have recorded last_seen, updating to {nowtime}')
        elif (nowtime - last_last_seen) > timedelta(seconds=settings.CLUSTER_NODE_HEARTBEAT_PERIOD + 2):
            # More than the heartbeat period plus 2 seconds elapsed since the last check-in.
            logger.warning(
                f'Heartbeat skew - interval={(nowtime - last_last_seen).total_seconds():.4f}, expected={settings.CLUSTER_NODE_HEARTBEAT_PERIOD}'
            )
    else:
        if settings.AWX_AUTO_DEPROVISION_INSTANCES:
            # Our own record is missing; register it again instead of failing.
            (changed, this_inst) = Instance.objects.register(ip_address=os.environ.get('MY_POD_IP'), node_type='control', uuid=settings.SYSTEM_UUID)
            if changed:
                logger.warning(f'Recreated instance record {this_inst.hostname} after unexpected removal')
            this_inst.local_health_check()
        else:
            raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))

    # IFF any node has a greater version than we do, then we'll shutdown services
    for other_inst in instance_list:
        if other_inst.node_type in ('execution', 'hop'):
            continue
        if other_inst.version == "" or other_inst.version.startswith('ansible-runner'):
            continue
        if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
            logger.error(
                "Host {} reports version {}, but this node {} is at {}, shutting down".format(
                    other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version
                )
            )
            # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
            # The heartbeat task will reset the capacity to the system capacity after upgrade.
            stop_local_services(communicate=False)
            raise RuntimeError("Shutting down.")

    for other_inst in lost_instances:
        # Reaping is best effort; a failure must not block the offline marking below.
        try:
            explanation = "Job reaped due to instance shutdown"
            reaper.reap(other_inst, job_explanation=explanation)
            reaper.reap_waiting(other_inst, grace_period=0, job_explanation=explanation)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
        try:
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                # Remember the hostname before the record is deleted.
                deprovision_hostname = other_inst.hostname
                other_inst.delete()
                logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
            elif other_inst.capacity != 0 or (not other_inst.errors):
                other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
                logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
        except DatabaseError as e:
            if 'did not affect any rows' in str(e):
                logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname))
            else:
                logger.exception('Error marking {} as lost'.format(other_inst.hostname))

    # Run local reaper
    if worker_tasks is not None:
        # Every task id held by any local worker is excluded from reaping.
        active_task_ids = [task_id for task_list in worker_tasks.values() for task_id in task_list]
        reaper.reap(instance=this_inst, excluded_uuids=active_task_ids)
        # default=0 keeps an empty worker_tasks mapping from raising
        # ValueError out of max() and aborting the heartbeat.
        busiest_worker = max((len(task_list) for task_list in worker_tasks.values()), default=0)
        # Waiting jobs are only reaped when no worker holds more than one task.
        if busiest_worker <= 1:
            reaper.reap_waiting(instance=this_inst, excluded_uuids=active_task_ids, ref_time=datetime.fromisoformat(dispatch_time))