Exemplo n.º 1
0
def kill_looping_job(job):
    """
    Terminate a job that the pilot has judged to be looping.

    The function logs a few process and work directory listings for later
    debugging, stores an error code chosen from the current job state,
    marks the job as failed, removes any remaining input files from the
    work directory and finally kills the processes of the job.

    TODO: add allow_looping_job() exp. spec?

    :param job: job object (jobid, state, workdir and pid are read;
                piloterrorcodes and piloterrordiags are assigned).
    :return: None (the job object is modified in place).
    """

    def _log_command_output(command):
        # run a diagnostic command with mute=True and log its stdout
        _, output, _ = execute(command, mute=True)
        logger.info("%s: %s", command + '\n', output)

    logger.fatal("pilot has decided to kill looping job %s at %s" % (
        job.jobid, time_stamp()))

    # snapshot of the user's processes and of the work directory
    _log_command_output('ps -fwu %s' % whoami())
    _log_command_output('ls -ltr %s' % (job.workdir))
    _log_command_output(
        'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami())
    _log_command_output('pstree -g -a')

    # pick the error code matching the state the job was in
    if job.state == 'stagein':
        error_code = errors.STAGEINTIMEOUT
    elif job.state == 'stageout':
        error_code = errors.STAGEOUTTIMEOUT
    else:
        # anything else (most likely 'running') counts as a looping payload
        error_code = errors.LOOPINGJOB
    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
        error_code)
    set_pilot_state(job=job, state="failed")

    # clean out input files that are still present in the work dir
    lfns, _ = job.get_lfns_and_guids()
    if lfns and remove_files(job.workdir, lfns) != 0:
        logger.warning('failed to remove all files')

    kill_processes(job.pid)
Exemplo n.º 2
0
def kill_orphans():
    """
    Look for orphaned processes of the current pilot user and kill them.

    The process table of the user is listed with ps and every row whose
    parent pid is 1 is counted as an orphan. Rows mentioning cvmfs2 or a
    pilot launcher are only logged. Orphans whose command line mentions
    bash, or both python and pilot.py, are counted but left alone; every
    other orphan is passed to killpg() with SIGTERM and, after a pause of
    10 s, with SIGKILL.

    Nothing is done when PILOT_SITENAME contains 'BOINC' or when
    PILOT_NOKILL is present in the environment.

    :return:
    """

    # BOINC jobs are exempt from the orphan search
    if 'BOINC' in os.environ.get('PILOT_SITENAME', ''):
        logger.info("Do not look for orphan processes in BOINC jobs")
        return

    # PILOT_NOKILL switches the search off without any message
    if 'PILOT_NOKILL' in os.environ:
        return

    logger.info("searching for orphan processes")

    _, ps_output, _ = execute("ps -o pid,ppid,args -u %s" % whoami())
    # pid, ppid and the remainder of the row (the full command line)
    row_pattern = re.compile(r'(\d+)\s+(\d+)\s+([\S\s]+)')
    grace_period = 10

    orphans = 0
    for row in ps_output.split('\n'):
        found = row_pattern.search(row)
        if not found:
            # header or empty row
            continue

        pid_text, ppid, args = found.groups()
        try:
            pid = int(pid_text)
        except Exception as error:
            logger.warning('failed to convert pid to int: %s', error)
            continue

        if 'cvmfs2' in args:
            logger.info(
                "ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args=\'%s\'",
                pid, ppid, args)
            continue

        if 'pilots_starter.py' in args or 'runpilot2-wrapper.sh' in args:
            logger.info(
                "ignoring pilot launcher: pid=%s, ppid=%s, args='%s'", pid,
                ppid, args)
            continue

        if ppid != '1':
            # still has a parent other than pid 1, not an orphan
            continue

        orphans += 1
        logger.info("found orphan process: pid=%s, ppid=%s, args='%s'",
                    pid, ppid, args)

        is_shell = 'bash' in args
        is_pilot = 'python' in args and 'pilot.py' in args
        if is_shell or is_pilot:
            logger.info("will not kill bash process")
            continue

        # ask politely first, then force after the grace period
        killpg(pid, signal.SIGTERM, args)
        logger.info("sleeping %d s to allow processes to exit", grace_period)
        time.sleep(grace_period)
        killpg(pid, signal.SIGKILL, args)

    if orphans:
        logger.info("found %d orphan process(es)", orphans)
    else:
        logger.info("did not find any orphan processes")
Exemplo n.º 3
0
def kill_orphans():
    """
    Find and kill all orphan processes belonging to current pilot user.

    The user's processes are listed with ps; a process whose parent pid is 1
    is counted as an orphan. Processes mentioning cvmfs2 or pilots_starter.py
    are only logged, and orphans whose command line mentions bash are counted
    but not killed. For every other orphan the process group is sent SIGKILL
    through os.killpg(); if that raises, 'kill -9 <pid>' is executed instead.

    Nothing is done when PILOT_SITENAME contains 'BOINC' or when PILOT_NOKILL
    is present in the environment.

    :return:
    """

    # exception for BOINC
    if 'BOINC' in os.environ.get('PILOT_SITENAME', ''):
        logger.info("Do not look for orphan processes in BOINC jobs")
        return

    if 'PILOT_NOKILL' in os.environ:
        return

    logger.info("searching for orphan processes")

    cmd = "ps -o pid,ppid,args -u %s" % whoami()
    _, _processes, _ = execute(cmd)
    # pid, ppid and the remainder of the line (the full command line)
    pattern = re.compile(r'(\d+)\s+(\d+)\s+([\S\s]+)')

    count = 0
    for line in _processes.split('\n'):
        ids = pattern.search(line)
        if not ids:
            # header or empty line
            continue

        pid, ppid, args = ids.groups()
        if 'cvmfs2' in args:
            logger.info(
                "ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args='%s'",
                pid, ppid, args)
        elif 'pilots_starter.py' in args:
            logger.info(
                "ignoring pilot launcher: pid=%s, ppid=%s, args='%s'",
                pid, ppid, args)
        elif ppid == '1':
            count += 1
            logger.info(
                "found orphan process: pid=%s, ppid=%s, args='%s'",
                pid, ppid, args)
            # any command line mentioning bash is spared, not only those
            # ending in it
            if 'bash' in args:
                logger.info("will not kill bash process")
                continue

            try:
                os.killpg(int(pid), signal.SIGKILL)
            except Exception as error:
                # best effort: fall back to killing the single process
                logger.warning("failed to execute killpg(): %s", error)
                exit_code, rs, _ = execute('kill -9 %s' % (pid))
                if exit_code != 0:
                    logger.warning(rs)
                else:
                    logger.info("killed orphaned process %s (%s)", pid, args)
            else:
                logger.info("killed orphaned process group %s (%s)",
                            pid, args)

    if count == 0:
        logger.info("did not find any orphan processes")
    else:
        logger.info("found %d orphan process(es)", count)