Example #1
def PrepareJobForDeferredStageout(job_state, **kwargs):
    """
    Prepares the job for staging out.
    :param job_state:   (JobState) decoded job state file
                        mandatory

    Other params can be passed into functions:
        1. To overwrite environment variables:
            1. uflag -- pUtil.isSameType
            2. pshttpurl, psport, pilot_initdir -- pUtil.getJobStatus
            3. thisSite -- compare the site names
            4. maxNumberOfRecoveryAttempts -- check recoveryAttempt
            5. psport, jobSchedulerId, pilotId -- server updates

    Unused parameters are omitted.

    :return: (integer)
        0. job is prepared
        1. job is not prepared, skip
        2. job is to be removed, proceed to cleanup
    """

    # This test checks the trf type and compares the PROD/ANAL type of the job with that of the current pilot
    if not pUtil.isSameType(job_state.job.trf.split(",")[0], DorE(kwargs, "uflag")):
        return lognret(ReturnCode.SkipJob, "Job is not the same type as current pilot")

    # This test ensures that the pilot runs on the same site as the job.
    # Maybe we should add an env var to switch this test on and off: there can be a number of identical nodes
    # sharing one FS, each of which could equally well recover lost jobs from the others
    if job_state.site.sitename != DorE(kwargs, "thisSite").sitename:
        return lognret(ReturnCode.SkipJob, "Job is not running on the same site")

    # This test ensures that the number of recovery attempts has not been exceeded; if it has, the server and
    # the state file are updated
    if job_state.recoveryAttempt >= DorE(kwargs, "maxNumberOfRecoveryAttempts"):
        log("!!WARNING!!1100!! Max number of recovery attempts exceeded: %d" % (env["maxNumberOfRecoveryAttempts"]))
        job_state.job.setState(["failed", job_state.job.result[1], PilotErrors().ERR_LOSTJOBMAXEDOUT])
        rt, retNode = updatePandaServer(job_state, **kwargs)
        if rt == 0:
            log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))

        else:
            log("Panda server returned a %d" % (rt))
            return lognret(ReturnCode.SkipJob, "(Failed to update panda server - leave for next pilot)")

    jobStatus = job_state.job.result[0]
    jobStatusCode = 0
    jobAttemptNr = job_state.job.attemptNr

    # The state may be unclear, so consult the server
    if jobStatus not in acceptedJobStatesFromFile:
        log("Job state may be unclear (found state %s), checking with the server" % jobStatus)
        jobStatus, jobAttemptNr, jobStatusCode = pUtil.getJobStatus(
            job_state.job.jobId, DorE(kwargs, "pshttpurl"), DorE(kwargs, "psport"), DorE(kwargs, "pilot_initdir")
        )
        if jobStatusCode != 0:
            return lognret(
                ReturnCode.SkipJob, "Received general error code from dispatcher call (leave job for later pilot)"
            )
        else:
            log("Job state is %s" % jobStatus)

    # If anything is inconsistent or the job has reached a final state, clean up
    if job_state.job.attemptNr != jobAttemptNr or jobStatus in finalJobStates or "tobekilled" in job_state.job.action:
        if job_state.job.attemptNr != jobAttemptNr:
            return lognret(
                ReturnCode.Cleanup,
                "Further recovery attempts will be prevented for this job. It has a "
                "mismatch in the attempt number record.",
            )
        if "tobekilled" in job_state.job.action:
            return lognret(
                ReturnCode.Cleanup,
                "Further recovery attempts will be prevented for this job. It was" " marked to be killed.",
            )
        return lognret(
            ReturnCode.Cleanup,
            "Further recovery attempts will be prevented for this job, it is in final " "state: %s." % jobStatus,
        )

    if jobStatus != "holding":
        # is the attemptNr defined?
        try:
            attemptNr = job_state.job.attemptNr
        except Exception as e:
            log("!!WARNING!!1100!! Attempt number not defined [ignore]: %s" % str(e))
        else:
            # check if the attemptNr (set during initial getJob command) is the same
            # as the current jobAttemptNr from the server (protection against failed lost
            # heartbeat jobs due to reassigned panda job id numbers)
            if attemptNr != jobAttemptNr:
                log(
                    "!!WARNING!!1100!! Attempt number mismatch for job %s (according to server - will not be"
                    " recovered)" % job_state.job.jobId
                )
                log("....Initial attempt number: %d" % attemptNr)
                log("....Current attempt number: %d" % jobAttemptNr)
                log("....Job status (server)   : %s" % jobStatus)
                log("....Job status (state)    : %s" % job_state.job.result[0])
                return lognret(ReturnCode.Cleanup, "Further recovery attempts will be prevented for this job")
            else:
                log("Attempt numbers from server and job state file agree: %d" % attemptNr)
Example #2
def DeferredStageoutHPCJob(job_dir, deferred_stageout_logfile=False, **kwargs):
    """
    Performs staging-out preparation for the HPC job in the specified directory.

    :param job_dir:     (string)    directory with a job.
                        mandatory parameter

    :param deferred_stageout_logfile: (string|False)    template name for deferred log stageout
                                                        Replaces "{job_id}" with current job id like
                                                        "log-{job_id}.txt" -> "log-124124.txt"
                                        Default False

    Other parameters are passed into other functions

    :return: (bool) whether stageout was performed
    """
    log('Deferred stageout from HPC job directory "%s"' % job_dir)

    file_path = job_dir + "/" + hpc_jobState_file_wildcart
    current_dir = os.getcwd()
    log("Working on %s" % file_path)
    log("Chdir from current dir %s to %s" % (current_dir, job_dir))
    pUtil.chdir(job_dir)

    try:
        with LockFileWrapper(file_path) as is_locked:
            if not is_locked:
                return False

            from json import load

            with open(file_path) as data_file:
                HPC_state = load(data_file)
            job_state_file = HPC_state["JobStateFile"]
            job_command = HPC_state["JobCommand"]
            # global_work_dir = HPC_state['GlobalWorkingDir']
            JS = JobState()
            JS.get(job_state_file)
            _job, _site, _node, _recoveryAttempt = JS.decode()

            with LogWrapper(deferred_stageout_logfile, _job.jobId) as logger:
                jobStatus, jobAttemptNr, jobStatusCode = pUtil.getJobStatus(
                    _job.jobId, DorE(kwargs, "pshttpurl"), DorE(kwargs, "psport"), DorE(kwargs, "pilot_initdir")
                )
                # recover this job?
                if jobStatusCode == 20:
                    log("Received general error code from dispatcher call (leave job for later pilot)")
                    # release the atomic lockfile and go to the next directory
                    # releaseAtomicLockFile(fd, lockfile_name)
                    return False
                elif jobStatus in finalJobStates or "tobekilled" in _job.action:
                    log(
                        "Job %s is currently in state '%s' with attemptNr = %d (according to server - will not"
                        " perform staging out)" % (_job.jobId, jobStatus, jobAttemptNr)
                    )
                    # releaseAtomicLockFile(fd, lockfile_name)
                    return False

                # update job state file at this point to prevent a parallel pilot from doing a simultaneous recovery
                _retjs = pUtil.updateJobState(_job, _site, _node, _recoveryAttempt)
                # releaseAtomicLockFile(fd, lockfile_name)

        monitor = Monitor(env)
        monitor.monitor_recovery_job(_job, _site, _node, job_command, job_state_file, recover_dir=job_dir)

        log("Chdir back to %s" % current_dir)
        pUtil.chdir(current_dir)

        panda_jobs = glob(job_dir + "/PandaJob_*_*")
        panda_logs = glob(job_dir + "/*.log.tgz.*")
        if panda_jobs or panda_logs:
            log(
                "Number of panda jobs found: %d, number of panda log tar files: %d, will not remove job dir"
                % (len(panda_jobs), len(panda_logs))
            )
        else:
            log(
                "Number of panda jobs found: %d, number of panda log tar files: %d, will remove job dir"
                % (len(panda_jobs), len(panda_logs))
            )
            log("Remove job dir %s" % job_dir)
            os.system("rm -rf %s" % job_dir)
        return True
    except Exception:
        log("Failed to start deferred stage out for HPC job: %s" % traceback.format_exc())
        return False
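For context, a hypothetical caller could sweep a base directory for leftover HPC job directories and attempt deferred stageout on each one. The sweep below is only a usage sketch: the Panda_Pilot_* directory pattern, the log file template and the keyword arguments are assumptions for illustration, not taken from the pilot source.

from glob import glob

def deferred_stageout_hpc_sweep(base_dir, **kwargs):
    # Try deferred stageout for every candidate job directory under base_dir
    # and report how many directories were actually staged out.
    performed = 0
    for job_dir in sorted(glob(base_dir + "/Panda_Pilot_*")):
        if DeferredStageoutHPCJob(job_dir,
                                  deferred_stageout_logfile="log-{job_id}.txt",
                                  **kwargs):
            performed += 1
    log("Deferred stageout performed for %d HPC job directories" % performed)
    return performed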