Example #1
File: memory.py Project: anisyonk/pilot2
def memory_usage(job):
    """
    Perform memory usage verification.

    :param job: job object
    :return: exit code (int), diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    # Get the maxPSS value from the memory monitor
    summary_dictionary = get_memory_values(job.workdir, name=job.memorymonitor)

    if not summary_dictionary:
        exit_code = errors.BADMEMORYMONITORJSON
        diagnostics = "Memory monitor output could not be read"
        return exit_code, diagnostics

    maxdict = summary_dictionary.get('Max', {})
    maxpss_int = maxdict.get('maxPSS', -1)

    # Only proceed if values are set
    if maxpss_int != -1:
        maxrss = job.infosys.queuedata.maxrss

        if maxrss and maxrss != "0":  # a string "0" would otherwise be treated as set
            # correction for SCORE/4CORE/nCORE jobs on UCORE queues
            scale = get_ucore_scale_factor(job)
            try:
                maxrss_int = 2 * int(maxrss * scale) * 1024  # Convert to int and kB
            except Exception as e:
                logger.warning("unexpected value for maxRSS: %s" % e)
            else:
                # Compare the maxRSS with the maxPSS from memory monitor
                if maxrss_int > 0 and maxpss_int > 0:
                    if maxpss_int > maxrss_int:
                        diagnostics = "job has exceeded the memory limit %d kB > %d kB (2 * queuedata.maxrss)" % \
                                      (maxpss_int, maxrss_int)
                        logger.warning(diagnostics)

                        # Create a lockfile to let RunJob know that it should not restart the memory monitor after it has been killed
                        #pUtil.createLockFile(False, self.__env['jobDic'][k][1].workdir, lockfile="MEMORYEXCEEDED")

                        # Kill the job
                        set_pilot_state(job=job, state="failed")
                        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXCEEDMAXMEM)
                        kill_processes(job.pid)
                    else:
                        logger.info("max memory (maxPSS) used by the payload is within the allowed limit: "
                                    "%d B (2 * maxRSS = %d B)" % (maxpss_int, maxrss_int))
        else:
            if maxrss == 0 or maxrss == "0":
                logger.info("queuedata.maxrss set to 0 (no memory checks will be done)")
            else:
                logger.warning("queuedata.maxrss is not set")

    return exit_code, diagnostics
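
The core of memory_usage() is a unit-sensitive comparison: the maxPSS reported by the memory monitor (in kB) against twice queuedata.maxrss, which the "* 1024" conversion above implies is given in MB, optionally scaled for UCORE queues. A minimal self-contained sketch of that rule; the function name and arguments are illustrative, not pilot2 API:

def exceeds_memory_limit(maxpss_kb, maxrss_mb, scale=1.0):
    """Return True if maxPSS exceeds 2 * maxRSS (both compared in kB)."""
    if maxpss_kb <= 0 or not maxrss_mb:
        return False  # nothing to compare, mirroring the guards above
    limit_kb = 2 * int(maxrss_mb * scale) * 1024  # MB -> kB, doubled
    return maxpss_kb > limit_kb

# 3 GB of PSS against a 1 GB maxRSS setting (limit = 2 GB) -> True
print(exceeds_memory_limit(maxpss_kb=3 * 1024 ** 2, maxrss_mb=1024))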
Example #2
File: monitoring.py Project: ptrlv/pilot2
def check_work_dir(job):
    """
    Check the size of the work directory.
    The function also updates the workdirsizes list in the job object.

    :param job: job object.
    :return: exit code (int), error diagnostics (string)
    """

    exit_code = 0
    diagnostics = ""

    log = get_logger(job.jobid)

    if os.path.exists(job.workdir):
        # get the limit of the workdir
        maxwdirsize = get_max_allowed_work_dir_size(job.infosys.queuedata)
        workdirsize = get_directory_size(directory=job.workdir)

        # is the work dir within the allowed size limit?
        if workdirsize > maxwdirsize:
            exit_code = errors.USERDIRTOOLARGE
            diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \
                          (job.workdir, workdirsize, maxwdirsize)
            log.fatal(diagnostics)

            cmd = 'ls -altrR %s' % job.workdir
            _ec, stdout, stderr = execute(cmd, mute=True)  # do not overwrite the USERDIRTOOLARGE exit code
            log.info("%s: %s" % (cmd + '\n', stdout))

            # kill the job
            # pUtil.createLockFile(True, self.__env['jobDic'][k][1].workdir, lockfile="JOBWILLBEKILLED")
            kill_processes(job.pid)

            # remove any lingering input files from the work dir
            lfns, guids = job.get_lfns_and_guids()
            if lfns:
                remove_files(job.workdir, lfns)

                # re-measure the size of the workdir at this point since the value is stored below
                workdirsize = get_directory_size(directory=job.workdir)
        else:
            log.info(
                "size of work directory %s: %d B (within %d B limit)" %
                (job.workdir, workdirsize, maxwdirsize))

        # store the measured disk space (the max value will later be sent with the job metrics)
        if workdirsize > 0:
            job.add_workdir_size(workdirsize)
    else:
        log.warning(
            'skipping size check of workdir since it has not been created yet')

    return exit_code, diagnostics
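
check_work_dir() delegates the actual measurement to get_directory_size() from pilot2's file handling utilities. As a rough stand-in, a plain recursive sum over os.walk() illustrates the idea (the real helper may work differently, e.g. by shelling out to du):

import os

def directory_size(directory):
    """Return the total size in bytes of all files under 'directory'."""
    total = 0
    for root, _dirs, files in os.walk(directory):
        for name in files:
            try:
                total += os.path.getsize(os.path.join(root, name))
            except OSError:
                pass  # file vanished or is unreadable; skip it
    return total

print(directory_size('/tmp'))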
Example #3
def kill_looping_job(job):
    """
    Kill the looping process.
    TODO: add allow_looping_job() exp. spec?

    :param job: job object.
    :return: (updated job object.)
    """

    # the child process is looping, kill it
    diagnostics = "pilot has decided to kill looping job %s at %s" % (
        job.jobid, time_stamp())
    logger.fatal(diagnostics)

    cmd = 'ps -fwu %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ls -ltr %s' % (job.workdir)
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'pstree -g -a'
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    # set the relevant error code
    if job.state == 'stagein':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEINTIMEOUT)
    elif job.state == 'stageout':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEOUTTIMEOUT)
    else:
        # most likely in the 'running' state, but use the catch-all 'else'
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.LOOPINGJOB)
    set_pilot_state(job=job, state="failed")

    # remove any lingering input files from the work dir
    lfns, guids = job.get_lfns_and_guids()
    if lfns:
        ec = remove_files(job.workdir, lfns)
        if ec != 0:
            logger.warning('failed to remove all files')

    kill_processes(job.pid)
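
The if/elif/else block that picks the error code is essentially a lookup table keyed on the job state. A condensed sketch with hypothetical integer values standing in for the errors.* constants:

# placeholder values; the real codes live in pilot2's ErrorCodes class
STAGEINTIMEOUT = 1
STAGEOUTTIMEOUT = 2
LOOPINGJOB = 3

def looping_error_code(state):
    """Map the job state to the error code reported for a looping job."""
    return {
        'stagein': STAGEINTIMEOUT,
        'stageout': STAGEOUTTIMEOUT,
    }.get(state, LOOPINGJOB)  # 'running' and anything else -> LOOPINGJOB

print(looping_error_code('stagein'))  # 1
print(looping_error_code('running'))  # 3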
Example #4
def interrupt(args, signum, frame):
    """
    Interrupt function on the receiving end of kill signals.
    This function is forwarded any incoming signals (SIGINT, SIGTERM, etc) and will set abort_job which instructs
    the threads to abort the job.

    :param args: pilot arguments.
    :param signum: signal.
    :param frame: stack/execution frame pointing to the frame that was interrupted by the signal.
    :return:
    """

    try:
        # Python 2: signal.__dict__ supports iteritems()
        sig = [v for v, k in signal.__dict__.iteritems() if k == signum][0]
    except Exception:
        # Python 3: iteritems() is gone, fall back to items()
        sig = [v for v, k in list(signal.__dict__.items()) if k == signum][0]

    args.signal_counter += 1

    # keep track of when first kill signal arrived, any stuck loops should abort at a defined cut off time
    if args.kill_time == 0:
        args.kill_time = int(time())

    max_kill_wait_time = MAX_KILL_WAIT_TIME + 60  # add another minute of grace to let threads finish
    current_time = int(time())
    if args.kill_time and current_time - args.kill_time > max_kill_wait_time:
        logger.warning(
            'passed maximum waiting time after first kill signal - will commit suicide - farewell'
        )
        try:
            rmtree(args.sourcedir)
        except Exception as e:
            logger.warning(e)
        logging.shutdown()
        kill_processes(getpid())

    add_to_pilot_timing('0', PILOT_KILL_SIGNAL, time(), args)
    add_to_pilot_timing('1', PILOT_KILL_SIGNAL, time(), args)
    logger.warning('caught signal: %s in FRAME=\n%s' %
                   (sig, '\n'.join(traceback.format_stack(frame))))

    args.signal = sig
    logger.warning('will instruct threads to abort and update the server')
    args.abort_job.set()
    logger.warning('waiting for threads to finish')
    args.job_aborted.wait()
    logger.warning(
        'setting graceful stop (in case it was not set already), pilot will abort'
    )
    args.graceful_stop.set()
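
A handler with this (args, signum, frame) signature cannot be registered with signal.signal() as-is; the extra args parameter must be bound first, for example with functools.partial. A minimal sketch of the wiring, using a stub Args object (an assumption; the real pilot arguments object carries many more fields):

import signal
from functools import partial

def signal_name(signum):
    # reverse-lookup of the SIG* name, as in the try/except above
    return [name for name, value in signal.__dict__.items() if value == signum][0]

class Args(object):
    signal_counter = 0

def handler(args, signum, frame):
    args.signal_counter += 1
    print('caught %s (count=%d)' % (signal_name(signum), args.signal_counter))

args = Args()
for sig in (signal.SIGINT, signal.SIGTERM):
    signal.signal(sig, partial(handler, args))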
Example #5
File: generic.py Project: mlassnig/pilot2
    def run(self):  # noqa: C901
        """
        Run all payload processes (including pre- and post-processes, and utilities).
        In the case of HPO jobs, this function will loop over all processes until the preprocess returns a special
        exit code.
        :return:
        """

        # get the payload command from the user specific code
        self.pre_setup(self.__job)

        cmd = self.get_payload_command(self.__job)
        # extract the setup in case the preprocess command needs it
        self.__job.setup = self.extract_setup(cmd)
        self.post_setup(self.__job)

        # a loop is needed for HPO jobs
        # abort when nothing more to run, or when the preprocess returns a special exit code
        iteration = 0
        state = ''  # keep defined for the HPO check below, even if no payload ran
        while True:

            logger.info('payload iteration loop #%d', iteration + 1)
            os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration
            show_memory_usage()

            # first run the preprocess (if necessary) - note: this might update jobparams -> must update cmd
            jobparams_pre = self.__job.jobparams
            exit_code = self.run_preprocess(self.__job)
            jobparams_post = self.__job.jobparams
            if exit_code:
                if 160 <= exit_code <= 162:
                    exit_code = 0
                    # wipe the output file list since there won't be any new files
                    # any output files from previous iterations, should have been transferred already
                    logger.debug(
                        'reset outdata since further output should not be expected after preprocess exit'
                    )
                    self.__job.outdata = []
                break
            if jobparams_pre != jobparams_post:
                logger.debug(
                    'jobparams were updated by utility_before_payload()')
                # must update cmd
                cmd = cmd.replace(jobparams_pre, jobparams_post)

            # now run the main payload, when it finishes, run the postprocess (if necessary)
            # note: no need to run any main payload in HPO Horovod jobs on Kubernetes
            if os.environ.get('HARVESTER_HOROVOD', '') == '':

                #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[before payload start] stdout=%s', _stdout)
                #logger.debug('[before payload start] stderr=%s', _stderr)

                proc = self.run_payload(self.__job, cmd, self.__out,
                                        self.__err)
            else:
                proc = None

            proc_co = None
            if proc is None:
                # run the post-process command even if there was no main payload
                if os.environ.get('HARVESTER_HOROVOD', '') != '':
                    logger.info('No need to execute any main payload')
                    exit_code = self.run_utility_after_payload_finished(
                        exit_code, True, UTILITY_AFTER_PAYLOAD_FINISHED2)
                    self.post_payload(self.__job)
                else:
                    break
            else:
                # the process is now running, update the server
                # test 'tobekilled' from here to try payload kill
                send_state(self.__job, self.__args, self.__job.state)

                # note: when sending a state change to the server, the server might respond with 'tobekilled'
                if self.__job.state == 'failed':
                    logger.warning(
                        'job state is \'failed\' - abort payload and run()')
                    kill_processes(proc.pid)
                    break

                # allow for a secondary command to be started after the payload (e.g. a coprocess)
                utility_cmd = self.get_utility_command(
                    order=UTILITY_AFTER_PAYLOAD_STARTED2)
                if utility_cmd:
                    logger.debug('starting utility command: %s', utility_cmd)
                    label = 'coprocess' if 'coprocess' in utility_cmd else None
                    proc_co = self.run_command(utility_cmd, label=label)

                logger.info('will wait for graceful exit')
                exit_code = self.wait_graceful(self.__args, proc)
                # reset error if Raythena decided to kill payload (no error)
                if errors.KILLPAYLOAD in self.__job.piloterrorcodes:
                    logger.debug('ignoring KILLPAYLOAD error')
                    self.__job.piloterrorcodes, self.__job.piloterrordiags = errors.remove_error_code(
                        errors.KILLPAYLOAD,
                        pilot_error_codes=self.__job.piloterrorcodes,
                        pilot_error_diags=self.__job.piloterrordiags)
                    exit_code = 0
                    state = 'finished'
                else:
                    state = 'finished' if exit_code == 0 else 'failed'
                set_pilot_state(job=self.__job, state=state)
                logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n',
                            proc.pid, exit_code, self.__job.state)

                #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[after payload finish] stdout=%s', _stdout)
                #logger.debug('[after payload finish] stderr=%s', _stderr)

                # stop the utility command (e.g. a coprocess) if necessary
                if proc_co:
                    logger.debug('stopping utility command: %s', utility_cmd)
                    kill_processes(proc_co.pid)

                if exit_code is None:
                    logger.warning(
                        'detected unset exit_code from wait_graceful - reset to -1'
                    )
                    exit_code = -1

                for order in [
                        UTILITY_AFTER_PAYLOAD_FINISHED,
                        UTILITY_AFTER_PAYLOAD_FINISHED2
                ]:
                    exit_code = self.run_utility_after_payload_finished(
                        exit_code, state, order)

                self.post_payload(self.__job)

                # stop any running utilities
                if self.__job.utilities != {}:
                    self.stop_utilities()

            if self.__job.is_hpo and state != 'failed':
                # in case there are more hyper-parameter points, move away the previous log files
                #self.rename_log_files(iteration)
                iteration += 1
            else:
                break

        return exit_code
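
The control flow of the HPO loop is easier to see in isolation: keep running preprocess and payload until the preprocess reports the end of the hyper-parameter point list with an exit code in the 160-162 range, which is treated as success. A stripped-down sketch with injected callables (hpo_loop and both callables are illustrative, not pilot2 API):

HPO_DONE_CODES = range(160, 163)  # special "no more points" exit codes

def hpo_loop(run_preprocess, run_payload):
    iteration = 0
    while True:
        exit_code = run_preprocess(iteration)
        if exit_code:
            if exit_code in HPO_DONE_CODES:
                exit_code = 0  # normal end of the point list, not an error
            break
        exit_code = run_payload(iteration)
        if exit_code != 0:
            break  # a failed payload ends the loop
        iteration += 1
    return exit_code

# two hyper-parameter points, then the preprocess reports 160 -> returns 0
codes = iter([0, 0, 160])
print(hpo_loop(lambda i: next(codes), lambda i: 0))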
Example #6
def check_payload_stdout(job):
    """
    Check the size of the payload stdout.

    :param job: job object.
    :return: exit code (int), diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    # get list of log files
    file_list = glob(os.path.join(job.workdir, 'log.*'))

    # is this a multi-trf job?
    n_jobs = job.jobparams.count("\n") + 1
    for _i in range(n_jobs):
        # get name of payload stdout file created by the pilot
        _stdout = config.Payload.payloadstdout
        if n_jobs > 1:
            _stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))

        # add the primary stdout file to the file list
        file_list.append(os.path.join(job.workdir, _stdout))

    tmp_list = glob(os.path.join(job.workdir, 'workDir/tmp.stdout.*'))
    if tmp_list:
        file_list += tmp_list
    logger.debug('file list=%s', str(file_list))

    # now loop over all files and check each individually (any large enough file will fail the job)
    for filename in file_list:

        logger.debug('check_payload_stdout: filename=%s', filename)
        if "job.log.tgz" in filename:
            logger.info("skipping file size check of file (%s) since it is a special log file", filename)
            continue

        if os.path.exists(filename):
            try:
                # get file size in bytes
                fsize = os.path.getsize(filename)
            except Exception as error:
                logger.warning("could not read file size of %s: %s", filename, error)
            else:
                # is the file too big?
                localsizelimit_stdout = get_local_size_limit_stdout()
                if fsize > localsizelimit_stdout:
                    exit_code = errors.STDOUTTOOBIG
                    diagnostics = "Payload stdout file too big: %d B (larger than limit %d B)" % \
                                  (fsize, localsizelimit_stdout)
                    logger.warning(diagnostics)

                    # kill the job
                    set_pilot_state(job=job, state="failed")
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)
                    kill_processes(job.pid)

                    # remove the payload stdout file after the log extracts have been created

                    # remove any lingering input files from the work dir
                    lfns, guids = job.get_lfns_and_guids()
                    if lfns:
                        _ec = remove_files(job.workdir, lfns)  # do not overwrite the STDOUTTOOBIG exit code
                else:
                    logger.info("payload log (%s) within allowed size limit (%d B): %d B", os.path.basename(filename), localsizelimit_stdout, fsize)
        else:
            logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet", filename)

    return exit_code, diagnostics
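
Stripped of the job bookkeeping, the check reduces to globbing the candidate files, exempting the special job.log.tgz, and comparing os.path.getsize() against a limit. A self-contained sketch; the 2 MB default is an arbitrary stand-in for get_local_size_limit_stdout():

import os
from glob import glob

def oversized_files(workdir, limit_bytes=2 * 1024 ** 2):
    """Yield (filename, size) for files under workdir exceeding limit_bytes."""
    candidates = glob(os.path.join(workdir, 'log.*'))
    candidates += glob(os.path.join(workdir, 'workDir/tmp.stdout.*'))
    for filename in candidates:
        if 'job.log.tgz' in filename:
            continue  # special log file, exempt from the check
        try:
            fsize = os.path.getsize(filename)
        except OSError:
            continue  # not created yet or unreadable
        if fsize > limit_bytes:
            yield filename, fsize

for name, size in oversized_files('.'):
    print('%s is too big: %d B' % (name, size))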