def TransferFiles(job_state, datadir, files, **kwargs):
    """ Transfers files from list 'files'
    May change CWD with pUtil.chdir (several times)

    :param job_state:
    :param datadir: job data dir
    :param files: list of filenames
    :param kwargs: specific arguments for other purposes
    :return:
    """
    job = job_state.job
    pUtil.chdir(datadir)

    XMLMetadata = pUtil.getMetadata(job_state.site.workdir, job.jobId)
    thisSite = DorE(kwargs, 'thisSite')

    if not setGuids(job_state, files, **kwargs):
        job.result[2] = PilotErrors().ERR_LOSTJOBPFC
        return ReturnCode.FailedJob

    outPFC = updateOutPFC(job, **kwargs)
    if not outPFC:
        return ReturnCode.FailedJob

    dsname = defaultDSname(job.destinationDblock)

    datasetDict = pUtil.getDatasetDict(job.outFiles, job.destinationDblock, job.logFile, job.logDblock)
    if not datasetDict:
        log("Output files will go to default dataset: %s" % (dsname))

    # the cmtconfig is needed by at least the xrdcp site mover
    cmtconfig = pUtil.getCmtconfig(job.cmtconfig)

    tin_0 = os.times()

    rf = None
    _state = ReturnCode.OK
    _msg = ""
    ec = -1
    try:
        # Note: alt stage-out numbers are not saved in recovery mode (job object not returned from this function)
        rc, pilotErrorDiag, rf, rs, job.filesNormalStageOut, job.filesAltStageOut, os_bucket_id = Mover.mover_put_data(
            "xmlcatalog_file:%s" % outPFC, dsname, thisSite.sitename, thisSite.computingElement,
            analysisJob=pUtil.isAnalysisJob(job.trf.split(",")[0]), proxycheck=DorE(kwargs, 'proxycheckFlag'),
            pinitdir=DorE(kwargs, 'pilot_initdir'), datasetDict=datasetDict,
            stageoutTries=DorE(kwargs, 'stageoutretry'), cmtconfig=cmtconfig, recoveryWorkDir=thisSite.workdir,
            job=job)
    except Exception, e:
        pilotErrorDiag = "Put function can not be called for staging out: %s" % str(e)
        log("!!%s!!1105!! %s" % (env['errorLabel'], pilotErrorDiag))
        ec = PilotErrors().ERR_PUTFUNCNOCALL
        _state = ReturnCode.Holding
        _msg = env['errorLabel']

def setGuids(job_state, files, **kwargs):
    job = job_state.job
    try:
        job.outFilesGuids = map(lambda x: pUtil.getGuidsFromXML(job_state.site.workdir,
                                                                id=job.jobId, filename=x)[0], files)
        job.outFiles = files
    except Exception, e:
        job.outFilesGuids = None
        log("!!FAILED!!1105!! Exception caught (Could not read guids for the remaining output files): %s" % str(e))
        return False
    # callers treat a falsy return as a failure, so report success explicitly
    return True

def updateOutPFC(job, **kwargs):
    file_name = "OutPutFileCatalog.xml"
    file_path = os.path.join(DorE(kwargs, 'thisSite').workdir, file_name)
    try:
        guids_status = pUtil.PFCxml(job.experiment, file_path, job.outFiles, fguids=job.outFilesGuids, fntag="pfn",
                                    analJob=pUtil.isAnalysisJob(job.trf.split(",")[0]), jr=True)
    except Exception, e:
        log("!!FAILED!!1105!! Exception caught (Could not generate xml for the remaining output files): %s" % str(e))
        job.result[2] = PilotErrors().ERR_LOSTJOBXML
        return False
    # the caller uses the returned path to build the "xmlcatalog_file:<path>" argument for the mover
    return file_path

def releaseAtomicLockFile(fd, lockfile_name):
    """ Release the atomic lock file """

    try:
        os.close(fd)
        os.unlink(lockfile_name)
    except Exception, e:
        if "Bad file descriptor" in str(e):
            log("Lock file already released")
        else:
            log("WARNING: Could not release lock file: %s" % (e))

def DeferredStageoutLocal(**kwargs):
    """
    This is a simple alias to `DeferredStageout.DeferredStageoutDir('.', lockfile_name="LOCKFILE")`

    :return: (integer) number of staged out jobs
    """
    log("Starting deferred stageout procedure. This procedure will scan the current dir for any leftover job files"
        " and try to perform stageout for each file that has not been sent.")

    kwargs.setdefault('lockfile_name', "LOCKFILE")

    return DeferredStageoutDir('.', **kwargs)

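# A minimal usage sketch for the deferred stageout entry points (not part of the pilot itself; the
# directory paths and the job limit are made-up examples, and the module is assumed to be importable
# as DeferredStageout). A recovery wrapper could call either the local alias or the multi-directory
# variant:
#
#     from DeferredStageout import DeferredStageoutLocal, DeferredStageout
#
#     # recover anything left behind in the current working directory
#     n = DeferredStageoutLocal()
#
#     # or scan several pilot work directories, stopping after 10 recovered jobs
#     n = DeferredStageout(['/scratch/pilot/wd1', '/scratch/pilot/wd2'], max_stageout_jobs=10)
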
    def __exit__(self, *args):
        global log_useful
        if not self.same:
            log("Staging down log file")
            self.useful = self.useful and log_useful
            log_useful = self.__old_useful
            os.chdir(os.path.dirname(self.__global_old))
            pUtil.setPilotlogFilename(self.__old_name)

            if not self.useful:
                os.remove(self.__global)

def createAtomicLockFile(file_path):
    """ Create an atomic lockfile while probing this dir to avoid a possible race-condition """

    lockfile_name = os.path.join(os.path.dirname(file_path), "ATOMIC_LOCKFILE")
    try:
        # acquire the lock
        fd = os.open(lockfile_name, os.O_EXCL | os.O_CREAT)
    except OSError:
        # work dir is locked, so exit
        log("Found lock file: %s (skip this dir)" % (lockfile_name))
        fd = None
    else:
        log("Created lock file: %s" % (lockfile_name))
    return fd, lockfile_name

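# Hedged sketch of how the atomic-lock helpers pair up (the job-state file path below is a made-up
# example). createAtomicLockFile returns (None, <lockfile name>) when another pilot already holds
# the lock for that directory, so a caller could guard its recovery like this:
#
#     fd, lockfile_name = createAtomicLockFile("/path/to/job_dir/jobState-12345.pickle")
#     if fd is None:
#         return False  # directory is being processed by another pilot, skip it
#     try:
#         pass  # ... recover the job ...
#     finally:
#         releaseAtomicLockFile(fd, lockfile_name)
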
    def __enter__(self):
        global log_useful
        self.__old_name = pUtil.getPilotlogFilename()
        if not self.__name or self.__name == self.__old_name:
            self.same = True
        else:
            log("Staging up log file, new log file is %s" % self.__name)
            self.__global_old = os.path.abspath(self.__old_name)
            self.__global = os.path.abspath(self.__name)
            pUtil.setPilotlogFilename(self.__global)
            self.__old_useful = log_useful
            log_useful = False
        return self

def DeferredStageout(deferred_stageout_dirs, max_stageout_jobs=0, **kwargs):
    """
    The procedure loops through provided dirs and performs staging out on each job in these dirs.

    Usage:
        `DeferredStageout(['path/to/dir1','path/to/dir2',...])`

    :param deferred_stageout_dirs: (array of strings) list of directories to search for jobs.
            mandatory parameter
    :param max_stageout_jobs: (integer) maximum stageout jobs to be finished, if zero, every job will be processed
            defaults to zero

    Other parameters are passed into DeferredStageoutDir

    :return: (integer) number of staged out jobs
    """
    log("Starting deferred stageout procedure. This procedure will scan directories for any leftover job files"
        " and try to perform stageout for each file that has not been sent.")

    stageout_jobs = 0
    d = dict(kwargs)

    # reduce duplicates
    dirs_set = set(map(os.path.abspath, deferred_stageout_dirs))

    for deferred_stageout_dir in dirs_set:
        # increment in two steps, because maybe I'll send some other params
        d.update({'max_stageout_jobs': max_stageout_jobs - stageout_jobs if max_stageout_jobs > 0 else 0})
        stageout_jobs_d = DeferredStageoutDir(deferred_stageout_dir, **d)
        # We don't need to do anything except stage out every job in these dirs
        # If there is something to do here -- think, maybe there should be another place to do it?
        stageout_jobs += stageout_jobs_d

        if stageout_jobs >= max_stageout_jobs > 0:
            return stageout_jobs

    return stageout_jobs

def cleanup(job_state):
    log("Cleanup job directory called")
    cmd = "rm -rf %s" % os.path.dirname(job_state.filename)
    if os.path.isdir(job_state.job.newDirNM):
        cmd += " %s" % job_state.job.newDirNM
    if os.path.isdir(job_state.job.datadir):
        cmd += " %s" % job_state.job.datadir
    if os.path.isdir(job_state.site.workdir):
        cmd += " %s" % job_state.site.workdir
    log("Executing command: %s" % (cmd))
    try:
        ec, rs = commands.getstatusoutput(cmd)
    except Exception, e:
        log("FAILURE: JobState cleanup failed to cleanup: %s " % str(e))

def DeferredStageoutDir(deferred_stageout_dir, max_stageout_jobs=0, remove_empty_dir=False, **kwargs):
    """
    The procedure loops through the directory and performs staging out on each job.

    Usage:
        `DeferredStageout.DeferredStageoutDir('path/to/dir'[, param_name=param_value])`

    :param deferred_stageout_dir: (string) directory to scan for deferred stageout jobs
            mandatory parameter
    :param max_stageout_jobs: (integer) maximum stageout jobs to be finished, if zero, every job will be processed
            defaults to zero
    :param job_state_mode: ("default"|"test") Mode of job state file
            defaults to "default"
    :param remove_empty_dir: (bool) Remove the directory or not
            defaults to False

    Other parameters are passed into DeferredStageoutJob

    :return: (integer) number of staged out jobs
    """
    log('Scanning directory "%s" for deferred stageout jobs.' % deferred_stageout_dir)

    stageout_jobs = 0
    d = dict(kwargs)

    job_state_files = glob(deferred_stageout_dir + "/*/" + jobState_file_wildcart)
    job_state_files = pUtil.removeTestFiles(job_state_files, mode=kwargs.get("job_state_mode", "default"))

    hpc_job_state_dirs = map(os.path.dirname, glob(deferred_stageout_dir + "/*/" + hpc_jobState_file_wildcart))
    job_state_files = filter(lambda jsf: os.path.dirname(jsf) not in hpc_job_state_dirs, job_state_files)

    for hpc_job_state_dir in hpc_job_state_dirs:
        # increment in two steps, because maybe I'll return some other params
        was_stageout = DeferredStageoutHPCJob(hpc_job_state_dir, **kwargs)
        if was_stageout:
            stageout_jobs += 1

        if stageout_jobs >= max_stageout_jobs > 0:
            return stageout_jobs

    for job_state_file in job_state_files:
        # increment in two steps, because maybe I'll return some other params
        d.update({'job_state_file': job_state_file})
        was_stageout = DeferredStageoutJob(os.path.dirname(job_state_file), **kwargs)
        if was_stageout:
            stageout_jobs += 1

        if stageout_jobs >= max_stageout_jobs > 0:
            return stageout_jobs

    # if not all dirs were processed, don't remove
    # remove only if there is no unprocessed dir left
    log('Finished processing directory "%s".' % deferred_stageout_dir)
    if remove_empty_dir:
        log('Directory "%s" is to be removed.' % deferred_stageout_dir)
        log("Contents:")
        o, e = commands.getstatusoutput("ls -la " + deferred_stageout_dir)
        log("%s" % o)
        dirs = filter(os.path.isdir, glob(deferred_stageout_dir + "/*"))
        if len(dirs) < 1:
            log('It is OK to remove "%s", proceeding.' % deferred_stageout_dir)
            o, e = commands.getstatusoutput("rm -rf " + deferred_stageout_dir)
        else:
            log("There are subdirs in this dir, can not remove.")
            log("Remaining subdirs: %s" % dirs)

    return stageout_jobs

def PrepareJobForDeferredStageout(job_state, **kwargs):
    """
    Prepares the job for staging out

    :param job_state: (JobState) decoded job state file
            mandatory

    Other params can be passed into functions:
        1. To overwrite environment variables:
            1. uflag -- pUtil.isSameType
            2. pshttpurl, psport, pilot_initdir -- pUtil.getJobStatus
            3. thisSite -- compare the site names
            4. maxNumberOfRecoveryAttempts -- check recoveryAttempt
            5. psport, jobSchedulerId, pilotId -- server updates
    Unused parameters are omitted.

    :return: (integer)
        0. job is prepared
        1. job is not prepared, skip
        2. job is to be removed, proceed to cleanup
    """
    # Next test checks the trf type and compares the PROD/ANAL job type with the pilot type
    if not pUtil.isSameType(job_state.job.trf.split(",")[0], DorE(kwargs, 'uflag')):
        return lognret(ReturnCode.SkipJob, "Job is not the same type as current pilot")

    # Next test ensures that the pilot is started on the same site.
    # Maybe we should add an env var to switch this test on and off, there can be a number of equal nodes sharing
    # one FS which can equally perform recovery of the lost jobs from each other
    if job_state.site.sitename != DorE(kwargs, 'thisSite').sitename:
        return lognret(ReturnCode.SkipJob, "Job is not running on the same site")

    # This test ensures that the number of recovery attempts has not been exceeded and, if it has, updates the server
    # and the state file
    if job_state.recoveryAttempt >= DorE(kwargs, 'maxNumberOfRecoveryAttempts'):
        log("!!WARNING!!1100!! Max number of recovery attempts exceeded: %d" % (env['maxNumberOfRecoveryAttempts']))
        job_state.job.setState(['failed', job_state.job.result[1], PilotErrors().ERR_LOSTJOBMAXEDOUT])

        rt, retNode = updatePandaServer(job_state, **kwargs)
        if rt == 0:
            log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))
        else:
            log("Panda server returned a %d" % (rt))
            return lognret(ReturnCode.SkipJob, "(Failed to update panda server - leave for next pilot)")

    jobStatus = job_state.job.result[0]
    jobStatusCode = 0
    jobAttemptNr = job_state.job.attemptNr

    # The state may be unclear, consult the server
    if jobStatus not in acceptedJobStatesFromFile:
        log("Job state may be unclear (found state %s), checking with the server" % jobStatus)
        jobStatus, jobAttemptNr, jobStatusCode = pUtil.getJobStatus(job_state.job.jobId, DorE(kwargs, 'pshttpurl'),
                                                                    DorE(kwargs, 'psport'),
                                                                    DorE(kwargs, 'pilot_initdir'))
        if jobStatusCode != 0:
            return lognret(ReturnCode.SkipJob, "Received general error code from dispatcher call (leave job for later"
                                               " pilot)")
        else:
            log("Job state is %s" % jobStatus)

    # If any inconsistency occurs or the job is finalised, clean up
    if job_state.job.attemptNr != jobAttemptNr or\
            jobStatus in finalJobStates or\
            "tobekilled" in job_state.job.action:
        if job_state.job.attemptNr != jobAttemptNr:
            return lognret(ReturnCode.Cleanup, "Further recovery attempts will be prevented for this job. It has a "
                                               "mismatch in the attempt number record.")

        if "tobekilled" in job_state.job.action:
            return lognret(ReturnCode.Cleanup, "Further recovery attempts will be prevented for this job. It was"
                                               " marked to be killed.")

        return lognret(ReturnCode.Cleanup, "Further recovery attempts will be prevented for this job, it is in final "
                                           "state: %s." % jobStatus)

    if jobStatus != 'holding':
        # is the attemptNr defined?
        try:
            attemptNr = job_state.job.attemptNr
        except Exception, e:
            log("!!WARNING!!1100!! Attempt number not defined [ignore]: %s" % str(e))
        else:
            # check if the attemptNr (set during initial getJob command) is the same
            # as the current jobAttemptNr from the server (protection against failed lost
            # heartbeat jobs due to reassigned panda job id numbers)
            if attemptNr != jobAttemptNr:
                log("!!WARNING!!1100!! Attempt number mismatch for job %s (according to server - will not be"
                    " recovered)" % job_state.job.jobId)
                log("....Initial attempt number: %d" % attemptNr)
                log("....Current attempt number: %d" % jobAttemptNr)
                log("....Job status (server) : %s" % jobStatus)
                log("....Job status (state) : %s" % job_state.job.result[0])
                return lognret(ReturnCode.Cleanup, "Further recovery attempts will be prevented for this job")
            else:
                log("Attempt numbers from server and job state file agree: %d" % attemptNr)

def lognret(ret, _str):
    log(_str)
    return ret

def DeferredStageoutJob(job_dir, job_state_file="", deferred_stageout_logfile=False, **kwargs):
    """
    Performs staging out preparation and stages out the job in the specified directory.

    :param job_dir: (string) directory with a job.
            mandatory parameter
    :param job_state_file: (string) path to job state file or other file containing job state. If empty, job state
            file is located as job_dir+'/jobState-*.*'.
            defaults to ""
    :param deferred_stageout_logfile: (string|False) template name for deferred log stageout
            Replaces "{job_id}" with current job id like "log-{job_id}.txt" -> "log-124124.txt"
            Default False

    Other parameters are passed into other functions

    :return: (bool) the fact of stageout being performed
    """
    log('Deferred stageout from job directory "%s"' % job_dir)

    job_state = JobState()

    if job_state_file == "":
        try:
            job_state_file = glob(job_dir + "/" + jobState_file_wildcart)[0]
        except:
            log("There is no job state file in the provided directory, exiting")
            return False

    log("Job state file is %s" % job_state_file)

    # lockfd, lockfn = createAtomicLockFile(job_dir)

    with LockFileWrapper(job_dir):
        if not TestJobDirForDeferredStageoutNecessity(job_dir, job_state_file, **kwargs):
            log('Job "%s" does not need deferred stageout procedure (yet)' % job_dir)
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        if not job_state.get(job_state_file):
            log("Job state file reading failed, exiting")
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        log('Working with job in "%s"' % job_dir)
        _job, _site, _node, _recoveryAttempt = job_state.decode()

        if not (_job and _site and _node):
            log("Can not decode jobState file, exiting")
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        with LogWrapper(deferred_stageout_logfile, _job.jobId) as logger:
            rc = PrepareJobForDeferredStageout(job_state, **kwargs)

            if rc == ReturnCode.PostJobOnly:
                pUtil.postJobTask(job_state.job, job_state.site, DorE(kwargs, 'workerNode'),
                                  DorE(kwargs, 'experiment'), jr=True, ra=job_state.recoveryAttempt)
                # releaseAtomicLockFile(lockfd, lockfn)
                return True

            if rc > 0:
                log("Job is not prepared for stageout, exiting")
                if rc == ReturnCode.Cleanup:
                    cleanup(job_state)
                # releaseAtomicLockFile(lockfd, lockfn)
                return False

            rc, logfile, datadir, filelist = CreateTransferFileList(job_state, **kwargs)

            XMLStr = ''
            if datadir == "":
                try:
                    XMLStr = job_state.node['xml']
                except:
                    pass

            if XMLStr == '':
                XMLStr = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

            currentdir = os.getcwd()
            pUtil.chdir(job_state.site.workdir)

            if len(filelist):
                log("Stageout will now transfer the files")
                rc = TransferFiles(job_state, datadir, filelist, **kwargs)

                if rc == ReturnCode.Holding:
                    job_state.job.result[0] = "holding"
                if rc == ReturnCode.FailedJob:
                    job_state.job.result[0] = "failed"

                job_state.job.setState(job_state.job.result)

            pUtil.chdir(job_state.site.workdir)
            ret = True

            if logfile != "" and not pUtil.isLogfileCopied(job_state.site.workdir):
                log("Stageout will now transfer the log")
                _log = JobLog()
                ret, _ = _log.transferLogFile(job_state.job, job_state.site, DorE(kwargs, 'experiment'), dest=None,
                                              jr=True)

            if not ret:
                rc = ReturnCode.Holding  # We need to transfer the log file regardless of the other files

            if rc == ReturnCode.OK:
                if pUtil.verifyTransfer(job_state.site.workdir):
                    job_state.job.result[0] = "finished"
                else:
                    job_state.job.result[0] = "failed"
                job_state.job.setState(job_state.job.result)

            if job_state.job.result[0] in finalJobStates:
                job_state.job.final_state = job_state.job.result[0]

            log("Stageout will now update the server with new status")

            rt, retNode = updatePandaServer(job_state, xmlstr=XMLStr, **kwargs)
            if rt == 0:
                log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))

                # did the server send back a command?
                if "tobekilled" in job_state.job.action:
                    log("!!WARNING!!1120!! Panda server returned a 'tobekilled' command")
                    job_state.job.result[0] = "failed"

                # further recovery attempt unnecessary, but keep the work dir for debugging
                if job_state.job.result[0] == "failed":
                    log("Further recovery attempts will be prevented for failed job (will leave work dir)")
                    if not job_state.rename(job_state.site, job_state.job):
                        log("(Fate of job state file left for next pilot)")
            else:
                log("!!WARNING!!1120!! Panda server returned a %d" % (rt))

                # store the final state so that the next pilot will know
                # store the metadata xml
                retNode['xml'] = XMLStr

                # update the job state file with the new state information
                _retjs = pUtil.updateJobState(job_state.job, job_state.site, retNode, job_state.recoveryAttempt)

            log("Stageout will now proceed to post-job actions")

            if job_state.job.result[0] in finalJobStates:
                pUtil.postJobTask(job_state.job, job_state.site, DorE(kwargs, 'workerNode'),
                                  DorE(kwargs, 'experiment'), jr=True, ra=job_state.recoveryAttempt)

            pUtil.chdir(currentdir)
            # releaseAtomicLockFile(lockfd, lockfn)

            if job_state.job.result[0] == "finished":
                log("Stageout will now remove the job, it is in finished state and can be removed")
                cleanup(job_state)

            return True

def DeferredStageoutHPCJob(job_dir, deferred_stageout_logfile=False, **kwargs):
    """
    Performs staging out preparation for the HPC job in the specified directory.

    :param job_dir: (string) directory with a job.
            mandatory parameter
    :param deferred_stageout_logfile: (string|False) template name for deferred log stageout
            Replaces "{job_id}" with current job id like "log-{job_id}.txt" -> "log-124124.txt"
            Default False

    Other parameters are passed into other functions

    :return: (bool) the fact of stageout being performed
    """
    log('Deferred stageout from HPC job directory "%s"' % job_dir)

    file_path = job_dir + "/" + hpc_jobState_file_wildcart
    current_dir = os.getcwd()
    log("Working on %s" % file_path)
    log("Chdir from current dir %s to %s" % (current_dir, job_dir))
    pUtil.chdir(job_dir)

    try:
        with LockFileWrapper(file_path) as is_locked:
            if not is_locked:
                return False

            from json import load
            with open(file_path) as data_file:
                HPC_state = load(data_file)
            job_state_file = HPC_state['JobStateFile']
            job_command = HPC_state['JobCommand']
            # global_work_dir = HPC_state['GlobalWorkingDir']
            JS = JobState()
            JS.get(job_state_file)
            _job, _site, _node, _recoveryAttempt = JS.decode()

            with LogWrapper(deferred_stageout_logfile, _job.jobId) as logger:
                jobStatus, jobAttemptNr, jobStatusCode = pUtil.getJobStatus(_job.jobId, DorE(kwargs, 'pshttpurl'),
                                                                            DorE(kwargs, 'psport'),
                                                                            DorE(kwargs, 'pilot_initdir'))

                # recover this job?
                if jobStatusCode == 20:
                    log("Received general error code from dispatcher call (leave job for later pilot)")
                    # release the atomic lockfile and go to the next directory
                    # releaseAtomicLockFile(fd, lockfile_name)
                    return False
                elif jobStatus in finalJobStates or "tobekilled" in _job.action:
                    log("Job %s is currently in state '%s' with attemptNr = %d (according to server - will not"
                        " perform staging out)" % (_job.jobId, jobStatus, jobAttemptNr))
                    # releaseAtomicLockFile(fd, lockfile_name)
                    return False

                # update job state file at this point to prevent a parallel pilot from doing a simultaneous recovery
                _retjs = pUtil.updateJobState(_job, _site, _node, _recoveryAttempt)
                # releaseAtomicLockFile(fd, lockfile_name)

                monitor = Monitor(env)
                monitor.monitor_recovery_job(_job, _site, _node, job_command, job_state_file, recover_dir=job_dir)

                log("Chdir back to %s" % current_dir)
                pUtil.chdir(current_dir)

                panda_jobs = glob(job_dir + "/PandaJob_*_*")
                panda_logs = glob(job_dir + "/*.log.tgz.*")
                if panda_jobs or panda_logs:
                    log("Number of found panda jobs: %d, number of panda log tar files: %d, will not remove job dir"
                        % (len(panda_jobs), len(panda_logs)))
                else:
                    log("Number of found panda jobs: %d, number of panda log tar files: %d, will remove job dir"
                        % (len(panda_jobs), len(panda_logs)))
                    log("Remove job dir %s" % job_dir)
                    os.system("rm -rf %s" % job_dir)
                return True
    except:
        log("Failed to start deferred stage out for HPC job: %s" % traceback.format_exc())
        return False

        # acquire the lock
        self.__fd = os.open(self.__name, os.O_EXCL | os.O_CREAT)
        return True

    def __exit__(self, *args):
        try:
            os.close(self.__fd)
            os.unlink(self.__name)
        except Exception, e:
            if "Bad file descriptor" in str(e):
                log("Lock file already released")
            else:
                log("WARNING: Could not release lock file: %s" % (e))
        else:
            log("Released lock file: %s" % (self.__name))


class ReturnCode:
    OK = 0
    SkipJob = 1
    Cleanup = 2
    PostJobOnly = 3
    FailedJob = 4
    Holding = 5


def GetDefaultDeferredStageoutDir(**kwargs):
    """ Resolves standard deferred stageout path.

    :param `thisSite` can be set