Example no. 1
0
File: system.py Project: mahak/awx
def awx_receptor_workunit_reaper():
    """
    Release receptor work units left behind by completed (or crashed) AWX jobs.

    When an AWX job is launched via receptor, files such as status, stdin, and stdout are created
    in a specific receptor directory. This directory on disk is a random 8 character string, e.g. qLL2JFNT
    This is also called the work Unit ID in receptor, and is used in various receptor commands,
    e.g. "work results qLL2JFNT"
    After an AWX job executes, the receptor work unit directory is cleaned up by
    issuing the work release command. In some cases the release process might fail, or
    if AWX crashes during a job's execution, the work release command is never issued to begin with.
    As such, this periodic task will obtain a list of all receptor work units, and find which ones
    belong to AWX jobs that are in a completed state (status is canceled, error, or succeeded).
    This task will call "work release" on each of these work units to clean up the files on disk.

    Note that when we call "work release" on a work unit that actually represents remote work
    both the local and remote work units are cleaned up.

    Since we are cleaning up jobs that controller considers to be inactive, we take the added
    precaution of calling "work cancel" in case the work unit is still active.
    """
    # Feature flag: sites may opt out of automatic work-unit cleanup entirely.
    if not settings.RECEPTOR_RELEASE_WORK:
        return
    logger.debug("Checking for unreleased receptor work units")
    receptor_ctl = get_receptor_ctl()
    # Presumably a mapping/iterable keyed by work unit ID — iterating yields the IDs.
    receptor_work_list = receptor_ctl.simple_command("work list")

    # list() instead of an identity comprehension; the original shadowed the builtin `id`.
    unit_ids = list(receptor_work_list)
    # Jobs that own one of these work units but are no longer running.
    jobs_with_unreleased_receptor_units = UnifiedJob.objects.filter(work_unit_id__in=unit_ids).exclude(status__in=ACTIVE_STATES)
    for job in jobs_with_unreleased_receptor_units:
        logger.debug(f"{job.log_format} is not active, reaping receptor work unit {job.work_unit_id}")
        # Cancel first as a precaution in case the unit is somehow still active,
        # then release to remove the on-disk directory (local and remote).
        receptor_ctl.simple_command(f"work cancel {job.work_unit_id}")
        receptor_ctl.simple_command(f"work release {job.work_unit_id}")

    # Clean up work units that belong to system/administrative work rather than jobs.
    administrative_workunit_reaper(receptor_work_list)
Example no. 2
0
def inspect_execution_nodes(instance_list):
    """
    Reconcile receptor mesh advertisements against the known instances.

    For each node advertising on the mesh: update its ``last_seen`` timestamp
    when the advertisement is newer, clear errors on hop nodes that rejoin,
    and schedule ``execution_node_health_check`` for execution nodes that were
    lost or that have proven connectivity but zero capacity.

    Skips silently if another process already holds the advisory lock.
    """
    with advisory_lock('inspect_execution_nodes_lock', wait=False):
        instances_by_hostname = {inst.hostname: inst for inst in instance_list}

        receptor_ctl = get_receptor_ctl()
        mesh_status = receptor_ctl.simple_command('status')

        ref_time = now()
        for advertisement in mesh_status['Advertisements']:
            hostname = advertisement['NodeID']

            if hostname not in instances_by_hostname:
                logger.warning(
                    f"Unrecognized node advertising on mesh: {hostname}")
                continue
            instance = instances_by_hostname[hostname]

            # Control-plane nodes are dealt with via local_health_check instead.
            if instance.node_type in ('control', 'hybrid'):
                continue

            # Capture lost-ness BEFORE updating last_seen so we can detect a rejoin.
            was_lost = instance.is_lost(ref_time=ref_time)
            advertised_time = parse_date(advertisement['Time'])

            # Nothing to do if the advertisement is not newer than what we recorded.
            if instance.last_seen and instance.last_seen >= advertised_time:
                continue
            instance.last_seen = advertised_time
            instance.save(update_fields=['last_seen'])

            # Only execution nodes should be dealt with by execution_node_health_check
            if instance.node_type == 'hop':
                if was_lost and (not instance.is_lost(ref_time=ref_time)):
                    logger.warning(
                        f'Hop node {hostname}, has rejoined the receptor mesh')
                    instance.save_health_data(errors='')
                continue

            if was_lost:
                # The instance *was* lost but has appeared again: attempt to
                # re-establish its capacity and version via a full health check.
                logger.warning(
                    f'Execution node attempting to rejoin as instance {hostname}.'
                )
                execution_node_health_check.apply_async([hostname])
            elif instance.capacity == 0 and instance.enabled:
                # nodes with proven connection but need remediation run health checks are reduced frequency
                never_checked = not instance.last_health_check
                overdue = never_checked or (
                    (ref_time - instance.last_health_check).total_seconds()
                    >= settings.EXECUTION_NODE_REMEDIATION_CHECKS
                )
                if overdue:
                    # Periodically re-run the health check of errored nodes, in case someone fixed it
                    # TODO: perhaps decrease the frequency of these checks
                    logger.debug(
                        f'Restarting health check for execution node {hostname} with known errors.'
                    )
                    execution_node_health_check.apply_async([hostname])