def kill(self, obj):
    """
    Kill the job instance
    """
    if type(obj) == Job:
        jobList = [obj]
    elif type(obj) == Task:
        jobList = obj.jobs
    else:
        raise SchedulerError('wrong argument type', str(type(obj)))

    jobsFile, arcId2job = self.createJobsFile(jobList, "Will kill")
    cmd = self.pre_arcCmd + "arckill -i " + jobsFile.name
    output, stat = self.ExecuteCommand(cmd)
    if stat != 0:
        raise SchedulerError('arckill returned %i' % stat, output, cmd)

    for line in output.split('\n'):
        # If a job URL ("arcId") occurs on a line of output, it tends
        # to be an error message:
        errorMatch = re.match(".*: *(gsiftp://[a-zA-Z0-9.-]+\S*/\w*)", line)
        if errorMatch:
            arcId = errorMatch.group(1)
            job = arcId2job[arcId]
            job.runningJob.errors.append("Killing job %s failed: %s" %
                                         (job['name'], line))
def getOutput(self, obj, outdir=''):
    """
    Retrieve (move) job output from cache directory to outdir.
    User files from CondorG appear asynchronously in the cache directory.
    """
    if type(obj) == RunningJob:
        # The object passed is a RunningJob
        raise SchedulerError(
            'Operation not possible',
            'CondorG cannot retrieve files when passed RunningJob')
    elif type(obj) == Job:
        # The object passed is a Job:
        # check for the RunningJob integrity
        if not self.valid(obj.runningJob):
            raise SchedulerError('invalid object', str(obj.runningJob))
        # retrieve output
        self.getCondorOutput(obj, outdir)

    # the object passed is a Task
    elif type(obj) == Task:
        if outdir == '':
            outdir = obj['outputDirectory']
        for job in obj.jobs:
            if self.valid(job.runningJob):
                self.getCondorOutput(job, outdir)

    # unknown object type
    else:
        raise SchedulerError('wrong argument type', str(type(obj)))
def kill(self, obj):
    """
    kill job
    """
    jobsToKill = []

    # the object passed is a job
    if type(obj) == Job and self.valid(obj.runningJob):
        # check for the RunningJob integrity
        schedIdList = str(obj.runningJob['schedulerId']).strip()
        command = "glite-wms-job-cancel --json --noint " + schedIdList
        out, ret = self.ExecuteCommand(self.proxyString + command)
        if ret != 0:
            raise SchedulerError('error executing glite-wms-job-cancel', out)
        else:
            try:
                ## try to see if we got a real json
                result = eval(out)
            except SyntaxError, ex:
                ## not possible to evaluate as json - fall back to a string check
                if out.find("result: success") == -1:
                    raise SchedulerError('error', out)
            else:
                ## it was a json...
                if 'result' in result:
                    if not result['result'] == "success":
                        raise SchedulerError('error', result)
                else:
                    raise SchedulerError('Missing result', result)
def checkUserProxy(self):
    """
    Retrieve the user proxy for the task.
    If the proxy is valid pass, otherwise raise an exception.
    """
    if self.validProxy is not None:
        return self.validProxy

    command = 'voms-proxy-info'
    if self.cert != '':
        command += ' --file ' + self.cert

    output, ret = self.ExecuteCommand(command)
    try:
        output = output.split("timeleft :")[1].strip()
    except IndexError:
        self.validProxy = False
        raise SchedulerError("Missing Proxy", output.strip())

    if output == "0:00:00":
        self.validProxy = False
        raise SchedulerError("Proxy Expired", output.strip())

    self.validProxy = True
    return self.validProxy
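# For reference, 'voms-proxy-info' prints a report like the following
# (illustrative content; field spacing can vary between versions):
#
#   subject   : /DC=ch/DC=cern/OU=Organic Units/...
#   issuer    : /DC=ch/DC=cern/OU=Organic Units/...
#   type      : proxy
#   timeleft : 11:59:58
#
# The split on "timeleft :" isolates the remaining lifetime; a missing
# "timeleft" line raises IndexError ("Missing Proxy"), and "0:00:00"
# means the proxy has expired.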
def purgeService(self, obj):
    """
    purge the service used by the scheduler from job files
    not available for every scheduler
    """
    # check the proxy
    self.schedObj.checkUserProxy()

    # perform action
    self.schedObj.purgeService(obj)
    timestamp = int(time.time())

    # the object passed is a runningJob
    if type(obj) == RunningJob and self.schedObj.valid(obj):
        obj['status'] = 'E'
        obj['closed'] = 'Y'
        obj['getOutputTime'] = timestamp
        obj['statusScheduler'] = "Cleared"

    # the object passed is a job
    elif type(obj) == Job and self.schedObj.valid(obj.runningJob):
        obj.runningJob['status'] = 'E'
        obj.runningJob['closed'] = 'Y'
        obj.runningJob['getOutputTime'] = timestamp
        obj.runningJob['statusScheduler'] = "Cleared"

    # the object passed is a Task
    elif type(obj) == Task:

        # error messages collector
        errors = ''

        # update objects
        for job in obj.jobs:

            # skip jobs not requested for action
            if not self.schedObj.valid(job.runningJob):
                continue

            # evaluate errors: if none, update
            if job.runningJob.isError():
                errors += str(job.runningJob.errors)
            else:
                job.runningJob['status'] = 'E'
                job.runningJob['closed'] = 'Y'
                job.runningJob['getOutputTime'] = timestamp
                job.runningJob['statusScheduler'] = "Cleared"

        # handle errors
        if errors != '':
            raise SchedulerError('interaction failed for some jobs', errors)

    # unknown object type
    else:
        raise SchedulerError('wrong argument type', str(type(obj)))
def __init__(self, **args):

    # call super class init method
    super(SchedulerGLite, self).__init__(**args)

    # some initializations
    self.warnings = []

    # typical options
    self.vo = args.get("vo", "cms")
    self.service = args.get("service", "")
    self.config = args.get("config", "")
    self.delegationId = args.get("proxyname", "bossproxy")

    # rename output files with submission number
    self.renameOutputFiles = args.get("renameOutputFiles", 0)
    self.renameOutputFiles = int(self.renameOutputFiles)

    # x509 string & hackEnv for CLI commands
    if self.cert != '':
        self.proxyString = "env X509_USER_PROXY=" + self.cert + ' '
        self.hackEnv = hackTheEnv()
    else:
        self.proxyString = ''
        self.hackEnv = hackTheEnv('env')

    # this section requires an improvement....
    if os.environ.get('CRABDIR'):
        self.commandQueryPath = os.environ.get('CRABDIR') + \
            '/external/ProdCommon/BossLite/Scheduler/'
    elif os.environ.get('PRODCOMMON_ROOT'):
        self.commandQueryPath = os.environ.get('PRODCOMMON_ROOT') + \
            '/lib/ProdCommon/BossLite/Scheduler/'
    else:
        # impossible to locate GLiteQueryStatus.py ...
        raise SchedulerError('Impossible to locate GLiteQueryStatus.py ')

    # cache patterns to optimize reg-exp substitution
    self.pathPattern = re.compile('location:([\S]+)$', re.M)
    self.patternCE = re.compile('(?<= - ).*(?=:)', re.M)

    # init BossliteJsonDecoder specialized class
    self.myJSONDecoder = BossliteJsonDecoder()

    # raise an error if the UI is older than 3.2
    version, ret = self.ExecuteCommand('glite-version')
    version = version.strip()
    if version.find('3.2') != 0:
        version1, ret1 = self.ExecuteCommand('glite-version -n glite-UI -v')
        version1 = version1.strip()
        if version1.find('3.2') != 0:
            raise SchedulerError('SchedulerGLite is allowed on UI >3.2')

    # jobs killed per CLI call (tunable value)
    self.killThreshold = 100
def getOutput(self, obj, outdir):
    """
    retrieve output or just put it in the destination directory
    """
    # check the proxy
    self.schedObj.checkUserProxy()

    # perform action
    self.schedObj.getOutput(obj, outdir)
    timestamp = int(time.time())

    # the object passed is a runningJob
    if type(obj) == RunningJob and self.schedObj.valid(obj):
        obj['status'] = 'E'
        obj['closed'] = 'Y'
        obj['getOutputTime'] = timestamp
        obj['statusScheduler'] = "Retrieved"

    # the object passed is a job
    elif type(obj) == Job and self.schedObj.valid(obj.runningJob):
        obj.runningJob['status'] = 'E'
        obj.runningJob['closed'] = 'Y'
        obj.runningJob['getOutputTime'] = timestamp
        obj.runningJob['statusScheduler'] = "Retrieved"

    # the object passed is a Task
    elif type(obj) == Task:

        # error messages collector
        errors = ''

        # update objects
        for job in obj.jobs:

            # skip jobs not requested for action
            if not self.schedObj.valid(job.runningJob):
                continue

            # evaluate errors: if none, update
            if job.runningJob.isError():
                errors += str(job.runningJob.errors)
            else:
                job.runningJob['status'] = 'E'
                job.runningJob['closed'] = 'Y'
                job.runningJob['getOutputTime'] = timestamp
                job.runningJob['statusScheduler'] = "Retrieved"

        # handle errors
        if errors != '':
            raise SchedulerError('interaction failed for some jobs', errors)

    # unknown object type
    else:
        raise SchedulerError('wrong argument type', str(type(obj)))
def postMortem(self, obj, schedulerId, outfile, service):
    """
    Get detailed postMortem job info
    """
    if not type(obj) == Task:
        raise SchedulerError('Wrong argument type or object type',
                             str(type(obj)))

    if not outfile:
        raise SchedulerError('Empty filename',
                             'postMortem called with empty logfile name')

    taskId = obj['name']
    condorId = schedulerId.split('//')[-1]
    header = '========= LOGGING INFO FOR %s =========\n' % schedulerId
    horsep = '\n' + 80 * '=' + '\n'
    sep1 = '\n========= OUTPUT OF : Condor_history -match 1 -l %s =========\n' % condorId
    sep2 = '\n========= OUTPUT OF : Condor_q -l %s =========\n' % condorId

    self.initializeGsissh(obj)
    fp = open(outfile, 'w')
    fp.write(header)
    fp.write(horsep)
    fp.write(sep1)
    command = '%s %s %s %s ' \
              % (self.unsetenvScram, self.remoteCommand,
                 self.gsisshOptions, self.remoteUserHost)
    command += ' "condor_history -match 1 -userlog %s/condor.log -l %s"' % \
               (taskId, condorId)
    (status, output) = commands.getstatusoutput(command)
    if status:
        if "already exists" in output:
            self.removeGsisshSocket()
    fp.write(output)
    fp.write(horsep)

    # the following condor_q only makes sense if the job status is
    # 1 (Idle), 2 (Run) or 5 (Held), but it costs little to do it always
    fp.write(sep2)
    command = '%s %s %s %s ' \
              % (self.unsetenvScram, self.remoteCommand,
                 self.gsisshOptions, self.remoteUserHost)
    command += ' "condor_q -l %s"' % condorId
    (status, output) = commands.getstatusoutput(command)
    fp.write(output)
    fp.write('\n')
    fp.write(horsep)
    fp.close()
    return
def getOutput(self, obj, outdir=''):
    """
    Retrieve (move) job output from cache directory to outdir
    """
    self.initializeGsissh(obj)
    filesToGet = []

    if type(obj) == RunningJob:
        # The object passed is a RunningJob
        raise SchedulerError(
            'Operation not possible',
            'Condor cannot retrieve files when passed RunningJob')
    elif type(obj) == Job:
        # The object passed is a Job:
        # check for the RunningJob integrity
        if not self.valid(obj.runningJob):
            raise SchedulerError('Invalid object', str(obj.runningJob))
        filesToGet = obj['outputFiles']

    # the object passed is a Task
    elif type(obj) == Task:
        taskId = obj['name']
        self.taskId = taskId
        if outdir == '':
            outdir = obj['outputDirectory']
        for job in obj.jobs:
            if self.valid(job.runningJob):
                filesToGet.extend(job['outputFiles'])

    # unknown object type
    else:
        raise SchedulerError('Wrong argument type', str(type(obj)))

    retval, stdout = self.rsyncFromRemoteHost(self.remoteUserHost,
                                              outdir, filesToGet)
    if retval:
        # rsync failed: flag every file that did not arrive
        if type(obj) == Job:
            jobList = [obj]
        else:
            jobList = obj.jobs
        for job in jobList:
            for fileName in job['outputFiles']:
                if not os.access(outdir + '/' + fileName, os.F_OK):
                    msg = "Could not retrieve file %s." % fileName
                    self.logging.error(msg)
                    msg += " Rsync failed with status,output=\n%d\n%s" % \
                           (retval, stdout)
                    job.runningJob.errors.append(msg)
        if "already exists" in stdout:
            self.removeGsisshSocket()
def query(self, obj, service='', objType='node'):
    """
    query status and possibly other scheduler related information
    It may use single 'node' scheduler id or bulk id for association
    """
    if type(obj) != Task:
        raise SchedulerError('wrong argument type', str(type(obj)))

    jobids = []
    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue
        id = str(job.runningJob['schedulerId']).strip()
        #p = subprocess.Popen( ['qstat', '-x', id], stdout=subprocess.PIPE,
        p = subprocess.Popen(['squeue', '-h', '-o',
                              '<jobid>%i</jobid><exec_host>%B</exec_host><job_state>%t</job_state>',
                              '-j', id],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        qstat_output, qstat_error = p.communicate()
        qstat_return = p.returncode
        if qstat_return:
            # squeue returns 1 when the job is no longer known
            # (the old qstat used 153 for the same condition)
            if qstat_return != 1:
                self.logging.error('Error in job query for ' + id)
                self.logging.error('SLURM stdout: \n %s' % qstat_output)
                self.logging.error('SLURM stderr: \n %s' % qstat_error)
                raise SchedulerError('SLURM error',
                                     '%s: %s' % (qstat_error, qstat_return))

        host = ''
        if len(qstat_output) == 0:
            pbs_stat = 'Done'
        else:
            if qstat_output.find('</exec_host>') >= 0:
                host = qstat_output[qstat_output.find('<exec_host>') +
                                    len('<exec_host>'):
                                    qstat_output.find('</exec_host>')]
            if qstat_output.find('</job_state>') >= 0:
                pbs_stat = qstat_output[qstat_output.find('<job_state>') +
                                        len('<job_state>'):
                                        qstat_output.find('</job_state>')]

        job.runningJob['statusScheduler'] = pbs_stat
        job.runningJob['status'] = self.status_map[pbs_stat]
        job.runningJob['destination'] = host
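# For reference, with the format string above squeue emits one line per job,
# e.g. (illustrative id, host and state):
#
#   <jobid>123456</jobid><exec_host>node042</exec_host><job_state>R</job_state>
#
# which the substring searches above slice apart; an empty reply means the
# job has left the queue and is mapped to 'Done'.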
def kill(self, obj):
    """
    kill the job instance
    does not return
    """
    r = re.compile("Job <(\d+)> is being terminated")
    rFinished = re.compile("Job <(\d+)>: Job has already finished")

    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue
        jobid = str(job.runningJob['schedulerId']).strip()
        command = 'bkill ' + str(jobid)

        if self.ksuCmd:
            # write a ksu tmpFile
            cmd = '%s\n' % command
            command, fname = self.createCommand(cmd, obj)

        out, ret = self.executeCommandWrapper(command)

        if self.ksuCmd:
            os.unlink(fname)

        mFailed = rFinished.search(out)
        if mFailed:
            raise SchedulerError("Unable to kill job " + jobid +
                                 " . Reason: ", out, command)
def kill(self, obj):
    """
    kill the job instance
    does not return
    """
    r = re.compile("has registered the job (\d+) for deletion")
    rFinished = re.compile("Job <(\d+)>: Job has already finished")
    r2 = re.compile("has deleted job (\d+)")  # by Leo

    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue
        jobid = str(job.runningJob['schedulerId']).strip()
        cmd = 'qdel ' + str(jobid)
        out, ret = self.ExecuteCommand(cmd)

        mKilled = r.search(out)
        mKilled2 = r2.search(out)
        if not mKilled and not mKilled2:
            raise SchedulerError("Unable to kill job #" + str(job['jobId']) +
                                 " (SGE id: " + jobid + "). Reason: ", out)
def postMortem(self, obj, outfile):
    """
    execute any post mortem command such as logging-info
    """
    # check the proxy
    self.schedObj.checkUserProxy()

    # the object passed is a runningJob
    if type(obj) == RunningJob:
        self.schedObj.postMortem(obj, obj['schedulerId'], outfile,
                                 self.parameters['service'])

    # the object passed is a job
    elif type(obj) == Job:
        self.schedObj.postMortem(obj, obj.runningJob['schedulerId'],
                                 outfile, self.parameters['service'])

    # the object passed is a Task
    elif type(obj) == Task:
        for job in obj.jobs:
            if job.runningJob is None:
                continue
            self.schedObj.postMortem(obj, job.runningJob['schedulerId'],
                                     outfile, self.parameters['service'])

    # unknown object type
    else:
        raise SchedulerError('wrong argument type', str(type(obj)))
def getOutput(self, obj, outdir=''):
    """
    retrieve output or just put it in the destination directory
    """
    # obj can be a Task, a Job or even a RunningJob
    # several possibilities (a sketch of option 2 follows this stub):
    # 1) connect to a service and perform a remote copy
    # 2) possibly just copy the local output to the destination dir
    # 3) wrap a CLI command like glite-wms-job-output
    errorList = []

    if outdir == '' and obj['outputDirectory'] is not None:
        outdir = obj['outputDirectory']

    if outdir != '' and not os.path.exists(outdir):
        raise SchedulerError('Permission denied',
                             'Unable to write files in ' + outdir)

    # retrieve scheduler id list
    schedIdList = {}
    for job in obj.jobs:
        if self.valid(job.runningJob):
            # retrieve output
            # if error: job.runningJob.errors.append( error )
            pass
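# A minimal sketch of option 2) above (local copy), kept here as an
# illustration only: it assumes each job output already sits in a per-job
# cache directory named by a hypothetical job['cacheDirectory'] key, moves
# the files into outdir, and collects per-job errors the way the stub above
# suggests.

import os
import shutil

def _localCopySketch(self, obj, outdir):
    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue
        for fileName in job['outputFiles']:
            # 'cacheDirectory' is a hypothetical key, not part of the real Job schema
            source = os.path.join(job['cacheDirectory'], fileName)
            try:
                shutil.move(source, os.path.join(outdir, fileName))
            except (IOError, OSError), error:
                # record the failure on the job, as the stub comment suggests
                job.runningJob.errors.append(str(error))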
def getOutput(self, obj, outdir):
    """
    retrieve output or just put it in the destination directory
    does not return
    """
    # With a user defined output directory the output ends up in the
    # wrong location, so it has to be moved to the correct directory here.
    if type(obj) == Task:
        oldoutdir = obj['outputDirectory'] + '/temp'

        # copy new output files from temp
        if outdir != oldoutdir:
            for job in obj.jobs:
                jobid = job['id']
                if self.valid(job.runningJob):
                    for outFile in job['outputFiles']:
                        command = "mv " + oldoutdir + "/" + outFile + \
                                  " " + outdir + "/. \n"
                        out, ret = self.ExecuteCommand(command)
                        if out != "":
                            raise SchedulerError('unable to move file', out)
def query(self, obj, objType='node'):
    """
    query status and possibly other scheduler related information
    """
    # check the proxy
    self.schedObj.checkUserProxy()

    # error messages collector
    errors = ''

    # delegate query to scheduler plugin
    self.schedObj.query(obj, self.parameters['service'], objType)

    # collect errors
    for job in obj.jobs:
        if job.runningJob.isError():
            errors += str(job.runningJob.errors)

    # handle errors
    if errors != '':
        raise SchedulerError('interaction failed for some jobs', errors)
class Scheduler(object):
    """
    Upper layer for scheduler interaction
    """

    def __init__(self, scheduler, parameters=None):
        """
        initialization
        """
        # define scheduler parameters
        self.scheduler = scheduler
        defaults = {'user_proxy': '', 'service': '', 'config': ''}
        if parameters is not None:
            defaults.update(parameters)
        self.parameters = defaults

        # load scheduler plugin
        try:
            module = __import__(
                'ProdCommon.BossLite.Scheduler.' + self.scheduler,
                globals(), locals(), [self.scheduler])
            schedClass = vars(module)[self.scheduler]
            self.schedObj = schedClass(**self.parameters)
        except KeyError, e:
            msg = 'Scheduler interface ' + self.scheduler + ' not found'
            raise SchedulerError(msg, str(e))
        except Exception, e:
            raise SchedulerError(e.__class__.__name__, str(e))
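# Illustrative usage of the wrapper (the scheduler name and parameter values
# below are examples, not the only valid ones): the plugin class is looked up
# by name inside ProdCommon.BossLite.Scheduler and instantiated with the
# merged parameter dictionary, so the caller never touches the plugin directly.
#
#   sched = Scheduler('SchedulerGLite',
#                     {'user_proxy': '/tmp/x509up_u500', 'service': ''})
#   sched.query(task, objType='node')
#   sched.getOutput(task, outdir='/data/output')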
def submitJob(self, job, task=None, requirements=''):
    """Need to copy the inputsandbox to WN before submitting a job"""

    arg = self.decode(job, task, requirements)

    command = "qsub " + arg
    self.logging.debug(command)
    out, ret = self.ExecuteCommand(command)
    self.logging.debug("crab: %s" % out)

    r = re.compile("Your job (\d+) .* has been submitted")
    m = r.search(out)
    if m is not None:
        jobId = m.group(1)
        command = "qstat -j " + jobId
        #out, ret = self.ExecuteCommand(command)
        # the queue is not parsed from the submission output,
        # so a generic value is used
        queue = "all"
    else:
        raise SchedulerError('error', out)

    taskId = None
    map = {job['name']: jobId}
    return map, taskId, queue
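# For reference, the regular expression above matches the standard SGE
# submission acknowledgement (illustrative job id and name):
#
#   Your job 4711 ("myjob.sh") has been submitted
#
# Any other reply (e.g. an "Unable to run job ..." error) falls through to
# the SchedulerError branch.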
def pbs_conn(self):
    """
    open a connection to the default PBS server
    """
    conn = pbs.pbs_connect(pbs.pbs_default())
    if conn < 0:
        err, err_text = pbs.error()
        self.logging.error('Error in PBS server connect')
        self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
        raise SchedulerError('PBS error', str(err) + ': ' + err_text)
    return conn
def getOutput(self, obj, outdir=''):
    """
    Get output files from jobs in 'obj' and put them in 'outdir', and
    remove the job from the CE.
    """
    if type(obj) == Task:
        joblist = obj.jobs
        if outdir == '':
            outdir = obj['outputDirectory']
    elif type(obj) == Job:
        joblist = [obj]
    else:
        raise SchedulerError('wrong argument type', str(type(obj)))

    assert outdir != ''
    if outdir[-1] != '/':
        outdir += '/'

    jobsFile, arcId2job = self.createJobsFile(joblist, "Will fetch")

    # Create a tmp dir where arcget can create its subdirs of job output.
    # Use outdir as the parent dir, to keep the subsequent moving of files
    # within the same file system (faster!)
    tmpdir = tempfile.mkdtemp(prefix="joboutputs.", dir=outdir)
    cmd = self.pre_arcCmd + 'arcget -i %s -dir %s' % (jobsFile.name, tmpdir)
    self.logging.debug("Running command: %s" % cmd)
    output, stat = self.ExecuteCommand(cmd)
    self.logging.debug("Output of arcget: %s" % output)
    jobsFile.close()
    if stat != 0:
        raise SchedulerError('arcget returned %i' % stat, output, cmd)

    # Copy the downloaded files to their final destination
    cmd = 'mv %s/*/* %s' % (tmpdir, outdir)
    self.logging.debug("Moving files from %s/* to %s" % (tmpdir, outdir))
    output, stat = self.ExecuteCommand(cmd)
    if stat != 0:
        raise SchedulerError('mv returned %i' % stat, output, cmd)

    # Remove the tmp output dir
    cmd = 'rm -r %s' % tmpdir
    output, stat = self.ExecuteCommand(cmd)
    if stat != 0:
        raise SchedulerError('rm returned %i' % stat, output, cmd)
def submitJob(self, job, task=None, requirements=''):
    """Need to copy the inputsandbox to WN before submitting a job"""

    arg = self.decode(job, task, requirements)

    chDir = "pushd . > /dev/null ; "
    resetDir = " ; popd > /dev/null"
    command = " bsub " + arg + resetDir

    if self.ksuCmd:
        chDir += "cd /tmp; "
        cmd = "#!/usr/bin/pagsh.krb\n"
        cmd += "aklog\n"
        cmd += '%s %s\n' % (chDir, command)
        command, fname = self.createCommand(cmd, task)
    else:
        # execute bsub in the directory where files have to be returned
        chDir += " cd %s ;" % task['outputDirectory']
        command = '%s %s' % (chDir, command)

    out, ret = self.executeCommandWrapper(command)

    if self.ksuCmd:
        os.unlink(fname)

    if ret != 0:
        raise SchedulerError('Error in submit', out, command)

    r = re.compile("Job <(\d+)> is submitted.*<(\w+)>")
    m = r.search(out)
    if m is not None:
        jobId = m.group(1)
        queue = m.group(2)
    else:
        rNot = re.compile("Job not submitted.*<(\w+)>")
        m = rNot.search(out)
        if m is not None:
            self.logging.error(m)
            self.logging.error("Job NOT submitted")
            self.logging.error(out)
            job.runningJob.errors.append(out)
        raise SchedulerError('Cannot submit ', out, command)

    taskId = None
    map = {job['name']: jobId}
    return map, taskId, queue
def query(self, obj, service='', objType='node'):
    """
    query status and possibly other scheduler related information
    It may use single 'node' scheduler id or bulk id for association
    """
    if type(obj) != Task:
        raise SchedulerError('wrong argument type', str(type(obj)))

    jobids = []
    conn = self.pbs_conn()
    attrl = pbs.new_attrl(2)
    attrl[0].name = 'job_state'
    attrl[1].name = 'exec_host'

    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue
        id = str(job.runningJob['schedulerId']).strip()
        jobstat = pbs.pbs_statjob(conn, id, attrl, 'Null')
        if not jobstat:
            err, err_text = pbs.error()
            if err != 15001:  # 15001 = unknown job (probably finished)
                self.logging.error('Error in job query for ' + id)
                self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
                self.pbs_disconn(conn)
                raise SchedulerError('PBS error', str(err) + ': ' + err_text)

        host = ''
        if len(jobstat) == 0:
            pbs_stat = 'Done'
        else:
            pbs_stat = jobstat[0].attribs[0].value
            if len(jobstat[0].attribs) > 1:
                host = jobstat[0].attribs[1].value

        job.runningJob['statusScheduler'] = pbs_stat
        job.runningJob['status'] = self.status_map[pbs_stat]
        job.runningJob['destination'] = host

    self.pbs_disconn(conn)
def purgeService(self, obj):
    """
    Purge job (even bulk) from service
    """
    # not always available...
    # it may be useful to connect to a remote service and purge the
    # job sandbox (a sketch follows this stub)

    out = "whatever"
    if out.find('error') >= 0:
        raise SchedulerError("Unable to purge job", out)
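# A minimal sketch of what a concrete plugin could do here, assuming the
# scheduler exposes a purge CLI. The command name 'scheduler-purge' below is
# a placeholder, not a real tool; self.ExecuteCommand and self.valid mirror
# the helpers the plugins above already use.

def _purgeSketch(self, obj):
    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue
        schedulerId = str(job.runningJob['schedulerId']).strip()
        # placeholder CLI invocation; a real plugin substitutes its own command
        out, ret = self.ExecuteCommand('scheduler-purge ' + schedulerId)
        if ret != 0:
            job.runningJob.errors.append(out)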
def __init__(self, **args):
    super(SchedulerPbs, self).__init__(**args)
    self.jobScriptDir = args['jobScriptDir']
    self.jobResDir = args['jobResDir']
    self.queue = args['queue']
    self.workerNodeWorkDir = args.get('workDir', '')

    self.res_dict = {}
    for a in args['resources'].split(','):
        if len(a) > 0:
            if a.find("=") != -1:
                res, val = a.split('=')
                self.res_dict.update({res: val})
            else:
                raise SchedulerError("PBS error",
                                     "Unknown resource format: " + a)

    env = []
    for v in ('HOME', 'LANG', 'LOGNAME', 'MAIL', 'PATH', 'SHELL'):
        env.append('PBS_O_' + v + '=' + os.environ[v])
    env.append('PBS_O_WORKDIR=' + os.getcwd())
    env.append('PBS_O_HOST=' + pbs.pbs_default())

    #if 'use_proxy' in args:
    #    if args['use_proxy'] == 1:
    #        proxy_location = ''
    #        try:
    #            proxy_location = os.environ['X509_USER_PROXY']
    #        except:
    #            proxy_location = '/tmp/x509up_u' + repr(os.getuid())
    #        msg, ret = self.ExecuteCommand('cp ' + proxy_location + " " + self.cert)
    #        # proxy_path = self.getUserProxy()
    #        env.append('X509_USER_PROXY=' + self.cert)
    #        env.append('X509_USER_CERT=' + self.cert)
    #        env.append('X509_USER_KEY=' + self.cert)
    #    else:
    #        raise SchedulerError(str(args), self.cert)

    self.pbs_env = ','.join(env)

    self.status_map = {'E': 'R',
                       'H': 'SS',
                       'Q': 'SS',
                       'R': 'R',
                       'S': 'R',
                       'T': 'R',
                       'W': 'SS',
                       'Done': 'SD',
                       'C': 'SD'}
def matchResources(self, obj, requirements='', config='', service=''):
    """
    resources list match
    """
    # several possibilities (a sketch of option 2 follows this stub):
    # 1) connect to a service and ask
    # 2) wrap a CLI command like glite-wms-job-list-match
    # 3) not available... skip
    # 4) there is a useful lcgInfo...

    out = "whatever"
    if out.find('error') >= 0:
        raise SchedulerError("Unable to find resources", out)
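# A minimal sketch of option 2) above, wrapping the gLite CLI. It assumes a
# gLite UI with glite-wms-job-list-match available, a JDL file path in the
# hypothetical 'jdlFile' argument, and the proxyString/ExecuteCommand helpers
# used elsewhere in this module; exact flags may differ between UI versions.

def _listMatchSketch(self, jdlFile, config=''):
    command = 'glite-wms-job-list-match --noint -a ' + jdlFile
    if config != '':
        command += ' -c ' + config
    out, ret = self.ExecuteCommand(self.proxyString + command)
    if ret != 0 or out.find('error') >= 0:
        raise SchedulerError("Unable to find resources", out)
    # the matching CE ids appear in the command output, one per line
    return out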
def getOutput(self, obj, outdir=''):
    """
    Get output files from jobs in 'obj' and put them in 'outdir', and
    remove the job from the CE.
    """
    if type(obj) == Task:
        self.logging.debug("getOutput called for %i jobs" % len(obj.jobs))
        joblist = obj.jobs
        if outdir == '':
            outdir = obj['outputDirectory']
    elif type(obj) == Job:
        self.logging.debug("getOutput called for 1 job")
        joblist = [obj]
    else:
        raise SchedulerError('wrong argument type', str(type(obj)))

    assert outdir != ''
    if outdir[-1] != '/':
        outdir += '/'

    for job in joblist:
        tmpdir = tempfile.mkdtemp(prefix="joboutputs.", dir=outdir)
        cmd = self.pre_arcCmd + 'arcget --timeout=600 %s --dir %s' % (
            job.runningJob['schedulerId'], tmpdir)
        self.logging.debug("Running command: %s" % cmd)
        output, stat = self.ExecuteCommand(cmd)
        self.logging.debug("Status and output of arcget: %i, '%s'" %
                           (stat, output))
        if stat != 0:
            msg = "arcget failed with status %i: %s" % (stat, output)
            self.logging.warning(msg)
        else:
            # Copy the downloaded files to their final destination
            cmd = 'mv %s/*/* %s' % (tmpdir, outdir)
            self.logging.debug("Moving files from %s/* to %s" % (tmpdir, outdir))
            output, stat = self.ExecuteCommand(cmd)
            if stat != 0:
                msg = "Moving files to final destination failed: %s" % output
                self.logging.warning(msg)
            else:
                cmd = 'rm -r %s' % tmpdir
                self.logging.debug("Removing tempdir %s" % tmpdir)
                output, stat = self.ExecuteCommand(cmd)
                if stat != 0:
                    msg = "Removing tempdir failed: %s" % output
                    self.logging.warning(msg)
def kill(self, obj):
    """
    kill the job instances via qdel
    """
    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue
        id = str(job.runningJob['schedulerId']).strip()
        p = subprocess.Popen(['qdel', id],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        qdel_output, qdel_error = p.communicate()
        qdel_return = p.returncode
        if qdel_return != 0:
            self.logging.error('Error in job kill for ' + id)
            self.logging.error('PBS Error stdout: %s' % qdel_output)
            raise SchedulerError('PBS Error in kill', qdel_output)
def submit(self, task, requirements='', config='', service=''):
    """
    user submission function

    takes as arguments:
    - a finite, dedicated jdl
    - optionally a list of services to connect to
    - optionally a config file

    the passed config file or, if not provided, a default one is used

    the function returns the parent id if any, the service used for the
    submission and a map associating each job name to its node id.
    If the submission is not bulk, the parent id is the node id of the
    unique entry of the map
    """
    taskId = None
    queue = None
    retMap = {}

    for job in task.jobs:
        command = self.decodeJob(job, task, requirements)
        out, ret = self.ExecuteCommand(command)
        if ret != 0:
            raise SchedulerError('Error in submit', out, command)

        r = re.compile("Job <(\d+)> is submitted.*<(\w+)>")
        m = r.search(out)
        if m is not None:
            jobId = m.group(1)
            queue = m.group(2)
            retMap[job['name']] = jobId
        else:
            rNot = re.compile("Job not submitted.*<(\w+)>")
            m = rNot.search(out)
            if m is not None:
                self.logging.error("Job NOT submitted: %s" % out)
                job.runningJob.errors.append('Cannot submit using %s: %s' %
                                             (out, command))

    return retMap, taskId, queue
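# For reference, a successful LSF submission is acknowledged as
# (illustrative job id and queue name):
#
#   Job <2710> is submitted to queue <8nm>.
#
# which the first regular expression above captures as (jobId, queue);
# a "Job not submitted ..." reply is routed to the error branch instead.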
def query(self, obj, service='', objType='node'):
    """
    query status and possibly other scheduler related information
    """
    # ask for the job information, mainly the status
    # some systems allow a query job per job, others also bulk queries

    r = re.compile("(\d+)\s+\w+\s+(\w+).*")
    rfull = re.compile("(\d+)\s+\w+\s+(\w+)\s+(\w+)\s+\w+\s+(\w+).*")
    rnotfound = re.compile("Job <(\d+)> is not found")

    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue
        jobid = str(job.runningJob['schedulerId']).strip()
        command = 'bjobs ' + str(jobid)
        out, ret = self.ExecuteCommand(command)
        if ret != 0:
            raise SchedulerError('Error in status query', out, command)

        mnotfound = rnotfound.search(out)
        queue = None
        host = None
        sid = None
        st = None
        if mnotfound:
            sid = mnotfound.group(1)
            st = 'DONE'
        else:
            mfull = rfull.search(out)
            if mfull:
                sid, st, queue, host = mfull.groups()
            else:
                m = r.search(out)
                if m:
                    sid, st = m.groups()

        if st:
            job.runningJob['statusScheduler'] = st
            job.runningJob['status'] = self.statusMap[st]
        if host:
            job.runningJob['destination'] = host
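# For reference, bjobs output has the form (illustrative values):
#
#   JOBID  USER  STAT  QUEUE  FROM_HOST  EXEC_HOST  JOB_NAME  SUBMIT_TIME
#   2710   crab  RUN   8nm    lxplus001  lxbatch42  job1      Oct  1 10:00
#
# 'rfull' captures (jobid, stat, queue, exec_host) from such a line, the
# looser 'r' falls back to (jobid, stat) when fewer fields match, and
# 'rnotfound' handles jobs that have already left the queue.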
def kill(self, obj):
    """
    kill the job instances via the PBS API
    """
    conn = self.pbs_conn()
    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue
        id = str(job.runningJob['schedulerId']).strip()
        res = pbs.pbs_deljob(conn, id, '')
        if res != 0:
            err, err_text = pbs.error()
            self.logging.error('Error in job kill for ' + id)
            self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
            self.pbs_disconn(conn)
            raise SchedulerError('PBS error', str(err) + ': ' + err_text)
    self.pbs_disconn(conn)