def DeferredStageoutJob(job_dir, job_state_file="", deferred_stageout_logfile=False, **kwargs):
    """ Performs stageout preparation and stages out the job in the specified directory.

    :param job_dir: (string) directory containing the job. Mandatory parameter.
    :param job_state_file: (string) path to the job state file or another file containing the job state.
                           If empty, the job state file is located as job_dir+'/jobState-*.*'.
                           Defaults to "".
    :param deferred_stageout_logfile: (string|False) template name for the deferred stageout log.
                           "{job_id}" is replaced with the current job id, e.g.
                           "log-{job_id}.txt" -> "log-124124.txt".
                           Defaults to False.

    Other parameters are passed on to the called functions.

    :return: (bool) whether stageout was performed
    """
    log('Deferred stageout from job directory "%s"' % job_dir)
    job_state = JobState()

    if job_state_file == "":
        try:
            job_state_file = glob(job_dir + "/" + jobState_file_wildcart)[0]
        except IndexError:  # no job state file in this directory
            log("There is no job state file in the provided directory, exiting")
            return False

    log("Job state file is %s" % job_state_file)

    # lockfd, lockfn = createAtomicLockFile(job_dir)
    with LockFileWrapper(job_dir):
        if not TestJobDirForDeferredStageoutNecessity(job_dir, job_state_file, **kwargs):
            log('Job "%s" does not need deferred stageout procedure (yet)' % job_dir)
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        if not job_state.get(job_state_file):
            log("Job state file reading failed, exiting")
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        log('Working with job in "%s"' % job_dir)
        _job, _site, _node, _recoveryAttempt = job_state.decode()

        if not (_job and _site and _node):
            log("Cannot decode jobState file, exiting")
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        with LogWrapper(deferred_stageout_logfile, _job.jobId) as logger:
            rc = PrepareJobForDeferredStageout(job_state, **kwargs)

            if rc == ReturnCode.PostJobOnly:
                pUtil.postJobTask(job_state.job, job_state.site, DorE(kwargs, 'workerNode'),
                                  DorE(kwargs, 'experiment'), jr=True, ra=job_state.recoveryAttempt)
                # releaseAtomicLockFile(lockfd, lockfn)
                return True

            if rc > 0:
                log("Job is not prepared for stageout, exiting")
                if rc == ReturnCode.Cleanup:
                    cleanup(job_state)
                # releaseAtomicLockFile(lockfd, lockfn)
                return False

            rc, logfile, datadir, filelist = CreateTransferFileList(job_state, **kwargs)

            XMLStr = ""
            if datadir == "":
                try:
                    XMLStr = job_state.node['xml']
                except (KeyError, TypeError):  # no stored metadata in the job state node
                    pass

            if XMLStr == "":
                XMLStr = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

            currentdir = os.getcwd()
            pUtil.chdir(job_state.site.workdir)

            if filelist:
                log("Stageout will now transfer the files")
                rc = TransferFiles(job_state, datadir, filelist, **kwargs)

                if rc == ReturnCode.Holding:
                    job_state.job.result[0] = "holding"
                if rc == ReturnCode.FailedJob:
                    job_state.job.result[0] = "failed"

                job_state.job.setState(job_state.job.result)

            pUtil.chdir(job_state.site.workdir)
            ret = True

            if logfile != "" and not pUtil.isLogfileCopied(job_state.site.workdir):
                log("Stageout will now transfer the log")
                _log = JobLog()
                ret, _ = _log.transferLogFile(job_state.job, job_state.site, DorE(kwargs, 'experiment'),
                                              dest=None, jr=True)

            if not ret:
                rc = ReturnCode.Holding  # the log file must be transferred regardless of the data files

            if rc == ReturnCode.OK:
                if pUtil.verifyTransfer(job_state.site.workdir):
                    job_state.job.result[0] = "finished"
                else:
                    job_state.job.result[0] = "failed"
                job_state.job.setState(job_state.job.result)

            if job_state.job.result[0] in finalJobStates:
                job_state.job.final_state = job_state.job.result[0]

            log("Stageout will now update the server with new status")
            rt, retNode = updatePandaServer(job_state, xmlstr=XMLStr, **kwargs)

            if rt == 0:
                log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))

                # did the server send back a command?
                if "tobekilled" in job_state.job.action:
                    log("!!WARNING!!1120!! Panda server returned a 'tobekilled' command")
                    job_state.job.result[0] = "failed"

                # further recovery attempts are unnecessary, but keep the work dir for debugging
                if job_state.job.result[0] == "failed":
                    log("Further recovery attempts will be prevented for failed job (will leave work dir)")
                    if not job_state.rename(job_state.site, job_state.job):
                        log("(Fate of job state file left for next pilot)")
            else:
                log("!!WARNING!!1120!! Panda server returned a %d" % (rt))

                # store the final state so that the next pilot will know

                # store the metadata xml
                retNode['xml'] = XMLStr

                # update the job state file with the new state information
                _retjs = pUtil.updateJobState(job_state.job, job_state.site, retNode, job_state.recoveryAttempt)

            log("Stageout will now proceed to post-job actions")

            if job_state.job.result[0] in finalJobStates:
                pUtil.postJobTask(job_state.job, job_state.site, DorE(kwargs, 'workerNode'),
                                  DorE(kwargs, 'experiment'), jr=True, ra=job_state.recoveryAttempt)

            pUtil.chdir(currentdir)
            # releaseAtomicLockFile(lockfd, lockfn)

            if job_state.job.result[0] == "finished":
                log("Stageout will now remove the job, it is in finished state and can be removed")
                cleanup(job_state)

            return True
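
# A minimal usage sketch, not an exact pilot invocation: the directory path and the
# workerNode/experiment values below are assumptions for illustration. Extra keyword
# arguments are forwarded to the helper functions via **kwargs and read there with
# DorE():
#
#   performed = DeferredStageoutJob(
#       "/scratch/pilot/Panda_Pilot_12345",            # hypothetical job directory
#       deferred_stageout_logfile="log-{job_id}.txt",  # expands to e.g. "log-124124.txt"
#       workerNode=workerNode,                         # hypothetical node object, read via DorE(kwargs, 'workerNode')
#       experiment="ATLAS",                            # hypothetical value, read via DorE(kwargs, 'experiment')
#   )
#   # 'performed' is True when stageout ran (or only post-job actions were needed).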
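
# The commented-out createAtomicLockFile()/releaseAtomicLockFile() calls above show
# the manual locking pattern that LockFileWrapper (defined elsewhere in the pilot)
# replaces: as a context manager, it releases the lock on every exit path, including
# the early returns. A minimal sketch of that idea, assuming those two helpers exist
# with the signatures used above; the real class may differ:
#
#   class LockFileWrapper(object):
#       def __init__(self, job_dir):
#           self.job_dir = job_dir
#
#       def __enter__(self):
#           self.lockfd, self.lockfn = createAtomicLockFile(self.job_dir)
#           return self
#
#       def __exit__(self, exc_type, exc_value, traceback):
#           releaseAtomicLockFile(self.lockfd, self.lockfn)
#           return False  # propagate any exception after releasing the lock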