def kill_looping_job(job):
    """
    Kill a job that the pilot has decided is looping.

    Before the kill, diagnostic information (process listings for the current
    user, a listing of the job work directory and the process tree) is written
    to the log. The job object is then updated in place with an error code
    that depends on the job state (stage-in timeout, stage-out timeout or
    looping job), its pilot state is set to 'failed', any input files still
    present in the work directory are removed and, finally, the processes
    belonging to job.pid are killed.

    TODO: add allow_looping_job() exp. spec?

    :param job: job object (updated in place).
    :return: None (no value is returned).
    """

    def _log_command_output(command):
        # run a diagnostic command quietly and dump its stdout to the log
        _, output, _ = execute(command, mute=True)
        logger.info("%s: %s", command + '\n', output)

    # the child process is looping, kill it
    message = "pilot has decided to kill looping job %s at %s" % (job.jobid, time_stamp())
    logger.fatal(message)

    # collect diagnostics; each command is built right before it is executed
    _log_command_output('ps -fwu %s' % whoami())
    _log_command_output('ls -ltr %s' % (job.workdir))
    _log_command_output('ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami())
    _log_command_output('pstree -g -a')

    # choose the relevant error code from the job state
    if job.state == 'stagein':
        error_code = errors.STAGEINTIMEOUT
    elif job.state == 'stageout':
        error_code = errors.STAGEOUTTIMEOUT
    else:
        # most likely in the 'running' state, but use the catch-all 'else'
        error_code = errors.LOOPINGJOB
    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error_code)
    set_pilot_state(job=job, state="failed")

    # remove any lingering input files from the work dir
    input_lfns, _ = job.get_lfns_and_guids()
    if input_lfns and remove_files(job.workdir, input_lfns) != 0:
        logger.warning('failed to remove all files')

    kill_processes(job.pid)
def kill_orphans():
    """
    Find and kill all orphan processes belonging to current pilot user.

    The processes of the current user are listed with ps; a process whose
    parent pid is 1 is counted as an orphan. Its process group is sent
    SIGTERM, followed 10 s later by SIGKILL. Processes running cvmfs2 and the
    pilot launchers (pilots_starter.py, runpilot2-wrapper.sh) are ignored
    altogether, while orphaned bash processes and 'python ... pilot.py'
    processes are counted but left alone.

    Nothing is done when PILOT_SITENAME contains 'BOINC' or when PILOT_NOKILL
    is present in the environment.

    :return:
    """

    # exception for BOINC
    if 'BOINC' in os.environ.get('PILOT_SITENAME', ''):
        logger.info("Do not look for orphan processes in BOINC jobs")
        return

    if 'PILOT_NOKILL' in os.environ:
        return

    logger.info("searching for orphan processes")

    _, ps_output, _ = execute("ps -o pid,ppid,args -u %s" % whoami())
    # one ps line: <pid> <ppid> <args...>
    line_pattern = re.compile(r'(\d+)\s+(\d+)\s+([\S\s]+)')
    launchers = ('pilots_starter.py', 'runpilot2-wrapper.sh')
    grace_period = 10  # seconds between SIGTERM and SIGKILL

    orphans = 0
    for entry in ps_output.split('\n'):
        found = line_pattern.search(entry)
        if not found:
            continue

        pid, ppid, args = found.groups()
        try:
            pid = int(pid)
        except Exception as error:
            logger.warning('failed to convert pid to int: %s', error)
            continue

        if 'cvmfs2' in args:
            logger.info("ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args='%s'",
                        pid, ppid, args)
            continue
        if any(launcher in args for launcher in launchers):
            logger.info("ignoring pilot launcher: pid=%s, ppid=%s, args='%s'", pid, ppid, args)
            continue
        if ppid != '1':
            # the parent is still around, i.e. not an orphan
            continue

        orphans += 1
        logger.info("found orphan process: pid=%s, ppid=%s, args='%s'", pid, ppid, args)

        is_pilot = 'python' in args and 'pilot.py' in args
        if 'bash' in args or is_pilot:
            logger.info("will not kill bash process")
            continue

        # ask politely first, then force the kill after the grace period
        killpg(pid, signal.SIGTERM, args)
        logger.info("sleeping %d s to allow processes to exit", grace_period)
        time.sleep(grace_period)
        killpg(pid, signal.SIGKILL, args)

    if orphans:
        logger.info("found %d orphan process(es)", orphans)
    else:
        logger.info("did not find any orphan processes")
def kill_orphans():
    """
    Find and kill all orphan processes belonging to current pilot user.

    The processes of the current user are listed with ps; a process whose
    parent pid is 1 is counted as an orphan. SIGKILL is sent to the process
    group of each orphan, and if that fails, 'kill -9 <pid>' is used as a
    fallback. Processes running cvmfs2 and the pilot launchers
    (pilots_starter.py, runpilot2-wrapper.sh) are ignored altogether, while
    orphaned bash processes and the pilot itself ('python ... pilot.py') are
    counted but never killed.

    Nothing is done when PILOT_SITENAME contains 'BOINC' or when PILOT_NOKILL
    is present in the environment.

    :return:
    """

    # exception for BOINC
    if 'BOINC' in os.environ.get('PILOT_SITENAME', ''):
        logger.info("Do not look for orphan processes in BOINC jobs")
        return

    if 'PILOT_NOKILL' in os.environ:
        return

    logger.info("searching for orphan processes")

    cmd = "ps -o pid,ppid,args -u %s" % whoami()
    _, _processes, _ = execute(cmd)
    # one ps line: <pid> <ppid> <args...>
    pattern = re.compile(r'(\d+)\s+(\d+)\s+([\S\s]+)')

    count = 0
    # tolerate a missing ps output (treated as an empty process list)
    for line in (_processes or '').split('\n'):
        ids = pattern.search(line)
        if not ids:
            continue
        pid, ppid, args = ids.groups()

        if 'cvmfs2' in args:
            logger.info("ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args='%s'",
                        pid, ppid, args)
        elif 'pilots_starter.py' in args or 'runpilot2-wrapper.sh' in args:
            logger.info("ignoring pilot launcher: pid=%s, ppid=%s, args='%s'", pid, ppid, args)
        elif ppid == '1':
            count += 1
            logger.info("found orphan process: pid=%s, ppid=%s, args='%s'", pid, ppid, args)

            # never kill shells or the pilot process itself
            if 'bash' in args or ('python' in args and 'pilot.py' in args):
                logger.info("will not kill bash process")
                continue

            try:
                os.killpg(int(pid), signal.SIGKILL)
            except (OSError, OverflowError) as error:
                # the group kill failed (e.g. no such group or no permission);
                # fall back to killing the single process
                logger.warning("failed to execute killpg(): %s", error)
                exit_code, rs, stderr = execute('kill -9 %s' % pid)
                if exit_code != 0:
                    # stdout may be empty on failure, so fall back to stderr
                    logger.warning("%s", rs or stderr)
                else:
                    logger.info("killed orphaned process %s (%s)", pid, args)
            else:
                logger.info("killed orphaned process group %s (%s)", pid, args)

    if count == 0:
        logger.info("did not find any orphan processes")
    else:
        logger.info("found %d orphan process(es)", count)