Example #1
def DeferredStageoutJob(job_dir, job_state_file="", deferred_stageout_logfile=False, **kwargs):
    """
    Performs stageing out preparation and stages out the job in specified directory.

    :param job_dir:     (string)    directory with a job.
                        mandatory parameter
    :param job_state_file:  (string)    path to job state file or other file containing job state. If empty, job
                                        state file is located as job_dir+'/jobState-*.*'.
                            defaults to ""

    :param deferred_stageout_logfile: (string|False)    template name for deferred log stageout
                                                        Replaces "{job_id}" with current job id like
                                                        "log-{job_id}.txt" -> "log-124124.txt"
                                        Default False

    Other parameters are passed into other functions

    :return: (bool) the fact of stageout being performed
    """
    log('Deferred stageout from job directory "%s"' % job_dir)

    job_state = JobState()

    if job_state_file == "":
        try:
            job_state_file = glob(job_dir + "/" + jobState_file_wildcart)[0]
        except IndexError:
            log("There is no job state file in the provided directory, exiting")
            return False

    log("Job state file is %s" % job_state_file)

    # LockFileWrapper creates an atomic lock file for the job directory and removes it on exit.

    with LockFileWrapper(job_dir):
        if not TestJobDirForDeferredStageoutNecessity(job_dir, job_state_file, **kwargs):
            log('Job "%s" does not need deferred stageout procedure (yet)' % job_dir)
            return False

        if not job_state.get(job_state_file):
            log("Job state file reading failed, exiting")
            return False

        log('Working with job in "%s"' % job_dir)
        _job, _site, _node, _recoveryAttempt = job_state.decode()

        if not (_job and _site and _node):
            log("Can not decode jobState file, exiting")
            return False

        with LogWrapper(deferred_stageout_logfile, _job.jobId) as logger:
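            # LogWrapper presumably mirrors log output into the per-job file named
            # by the deferred_stageout_logfile template (the helper is defined
            # elsewhere; this reading is an assumption).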

            rc = PrepareJobForDeferredStageout(job_state, **kwargs)
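            # ReturnCode semantics, as read from the branches below: PostJobOnly
            # means only post-job actions remain; any other positive code aborts,
            # with Cleanup additionally removing the job.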

            if rc == ReturnCode.PostJobOnly:
                pUtil.postJobTask(
                    job_state.job,
                    job_state.site,
                    DorE(kwargs, "workerNode"),
                    DorE(kwargs, "experiment"),
                    jr=True,
                    ra=job_state.recoveryAttempt,
                )
                return True

            if rc > 0:
                log("Job is not prepared for stageout, exiting")
                if rc == ReturnCode.Cleanup:
                    cleanup(job_state)
                return False

            rc, logfile, datadir, filelist = CreateTransferFileList(job_state, **kwargs)
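            # CreateTransferFileList is assumed to return the job log path, the
            # directory holding the output data, and the list of files still to
            # be transferred.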

            XMLStr = ""
            if datadir == "":
                try:
                    XMLStr = job_state.node["xml"]
                except (KeyError, TypeError):
                    pass

            if XMLStr == "":
                XMLStr = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

            currentdir = os.getcwd()
            pUtil.chdir(job_state.site.workdir)

            if filelist:
                log("Stageout will now transfer the files")
                rc = TransferFiles(job_state, datadir, filelist, **kwargs)

                if rc == ReturnCode.Holding:
                    job_state.job.result[0] = "holding"
                if rc == ReturnCode.FailedJob:
                    job_state.job.result[0] = "failed"

                job_state.job.setState(job_state.job.result)

            pUtil.chdir(job_state.site.workdir)
            ret = True
            if logfile != "" and not pUtil.isLogfileCopied(job_state.site.workdir):
                log("Stageout will now transfer the log")
                _log = JobLog()
                ret, _ = _log.transferLogFile(
                    job_state.job, job_state.site, DorE(kwargs, "experiment"), dest=None, jr=True
                )

            if not ret:
                rc = ReturnCode.Holding  # the log file must be transferred regardless of the data files

            if rc == ReturnCode.OK:
                if pUtil.verifyTransfer(job_state.site.workdir):
                    job_state.job.result[0] = "finished"
                else:
                    job_state.job.result[0] = "failed"
                job_state.job.setState(job_state.job.result)

            if job_state.job.result[0] in finalJobStates:
                job_state.job.final_state = job_state.job.result[0]

            log("Stageout will now update the server with new status")

            rt, retNode = updatePandaServer(job_state, xmlstr=XMLStr, **kwargs)

            if rt == 0:
                log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))

                # did the server send back a command?
                if "tobekilled" in job_state.job.action:
                    log("!!WARNING!!1120!! Panda server returned a 'tobekilled' command")
                    job_state.job.result[0] = "failed"

                # further recovery attempt unnecessary, but keep the work dir for debugging
                if job_state.job.result[0] == "failed":
                    log("Further recovery attempts will be prevented for failed job (will leave work dir)")
                    if not job_state.rename(job_state.site, job_state.job):
                        log("(Fate of job state file left for next pilot)")

            else:
                log("!!WARNING!!1120!! Panda server returned a %d" % (rt))

                # store the final state so that the next pilot will know

                # store the metadata xml
                retNode["xml"] = XMLStr

                # update the job state file with the new state information
                _retjs = pUtil.updateJobState(job_state.job, job_state.site, retNode, job_state.recoveryAttempt)

            log("Stageout will now proceed to post-job actions")

            if job_state.job.result[0] in finalJobStates:
                pUtil.postJobTask(
                    job_state.job,
                    job_state.site,
                    DorE(kwargs, "workerNode"),
                    DorE(kwargs, "experiment"),
                    jr=True,
                    ra=job_state.recoveryAttempt,
                )

            pUtil.chdir(currentdir)

            if job_state.job.result[0] == "finished":
                log("Stageout will now remove the job, it is in finished state and can be removed")
                cleanup(job_state)

            return True
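
For context, here is a minimal sketch of how this function might be driven. Only DeferredStageoutJob and log come from the listing above; the sweep function, the pilot_wd argument, and the "Panda_Pilot_*" directory pattern are assumptions for illustration.

import os
from glob import glob

def deferred_stageout_sweep(pilot_wd, **kwargs):
    """Hypothetical driver: attempt deferred stageout for every leftover job directory."""
    performed = 0
    # Assumption: abandoned job directories live under pilot_wd as "Panda_Pilot_*".
    for job_dir in glob(os.path.join(pilot_wd, "Panda_Pilot_*")):
        if not os.path.isdir(job_dir):
            continue
        # Each directory is handled independently; one failure must not stop the sweep.
        try:
            if DeferredStageoutJob(job_dir, deferred_stageout_logfile="log-{job_id}.txt", **kwargs):
                performed += 1
        except Exception as e:
            log("Deferred stageout failed for %s: %s" % (job_dir, e))
    return performed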
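
The listing also leans on a few helpers defined elsewhere in the pilot code (LockFileWrapper, LogWrapper, DorE). Here is a rough sketch of the contracts the function appears to assume; the bodies are illustrative guesses, not the pilot's actual implementations.

import os

class LockFileWrapper(object):
    """Illustrative guess: an atomic per-directory lock, taken on enter, removed on exit."""
    def __init__(self, job_dir):
        self.path = os.path.join(job_dir, "LOCKFILE")
        self.fd = None
    def __enter__(self):
        # O_CREAT | O_EXCL makes creation atomic: a concurrent pilot fails here.
        self.fd = os.open(self.path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        os.close(self.fd)
        os.remove(self.path)
        return False  # never swallow exceptions from the block

def DorE(kwargs, name):
    """Illustrative guess: return kwargs[name] if supplied, else a module-level default."""
    return kwargs.get(name, globals().get(name))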