Example #1
    def kill(self, obj):
        """
        Kill the job instance
        """
        if type(obj) == Job:
            jobList = [obj]
        elif type(obj) == Task:
            jobList = obj.jobs
        else:
            raise SchedulerError('wrong argument type', str(type(obj)))

        jobsFile, arcId2job = self.createJobsFile(jobList, "Will kill")

        cmd = self.pre_arcCmd + "arckill -i " + jobsFile.name
        output, stat = self.ExecuteCommand(cmd)
        if stat != 0:
            raise SchedulerError('arckill returned %i' % stat, output, cmd)

        for line in output.split('\n'):
            # If a job URL ("arcId") occurs on a line of output, it tends
            # to be an error message:
            errorMatch = re.match(".*: *(gsiftp://[a-zA-Z0-9.-]+\S*/\w*)",
                                  line)
            if errorMatch:
                arcId = errorMatch.group(1)
                job = arcId2job[arcId]
                job.runningJob.errors.append("Killing job %s failed: %s" %
                                             (job['name'], line))
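The pattern above can be exercised standalone; a minimal sketch (the exact arckill message wording is an assumption, only the gsiftp URL shape matters):

    import re

    # Hypothetical arckill output line; only the URL shape is load-bearing.
    line = "Failed to kill: gsiftp://ce.example.org:2811/jobs/12345"
    m = re.match(".*: *(gsiftp://[a-zA-Z0-9.-]+\S*/\w*)", line)
    assert m.group(1) == "gsiftp://ce.example.org:2811/jobs/12345"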
Example #2
    def getOutput(self, obj, outdir=''):
        """
        Retrieve (move) job output from cache directory to outdir
        User files from CondorG appear asynchronously in the cache directory
        """

        if type(obj) == RunningJob:  # The object passed is a RunningJob
            raise SchedulerError(
                'Operation not possible',
                'CondorG cannot retrieve files when passed RunningJob')
        elif type(obj) == Job:  # The object passed is a Job

            # check for the RunningJob integrity
            if not self.valid(obj.runningJob):
                raise SchedulerError('invalid object', str(obj.runningJob))

            # retrieve output
            self.getCondorOutput(obj, outdir)

        # the object passed is a Task
        elif type(obj) == Task:

            if outdir == '':
                outdir = obj['outputDirectory']

            for job in obj.jobs:
                if self.valid(job.runningJob):
                    self.getCondorOutput(job, outdir)

        # unknown object type
        else:
            raise SchedulerError('wrong argument type', str(type(obj)))
Example #3
    def kill(self, obj):
        """
        kill job
        """

        jobsToKill = []

        # the object passed is a job
        if type(obj) == Job and self.valid(obj.runningJob):

            # check for the RunningJob integrity
            schedIdList = str(obj.runningJob['schedulerId']).strip()

            command = "glite-wms-job-cancel --json --noint " + schedIdList

            out, ret = self.ExecuteCommand(self.proxyString + command)

            if ret != 0:
                raise SchedulerError('error executing glite-wms-job-cancel',
                                     out)
            else:
                try:
                    ## try to see if we got a real json
                    result = eval(out)
                except SyntaxError, ex:
                    ## not possible to evaluate json - try as string
                    if out.find("result: success") == -1:
                        raise SchedulerError('error', out)
                else:
                    ## if it was valid JSON...
                    if 'result' in result:
                        if not result['result'] == "success":
                            raise SchedulerError('error', result)
                    else:
                        raise SchedulerError('Missing result', result)
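The eval-based JSON check above predates a bundled json module; a minimal standalone sketch of the same decision using json.loads (assuming the "--json" output and the top-level 'result' key shown above):

    import json

    def cancel_succeeded(out):
        # Mirror the logic above: prefer real JSON, fall back to a plain
        # string check when the output is not valid JSON.
        try:
            result = json.loads(out)
        except ValueError:
            return out.find("result: success") != -1
        if isinstance(result, dict):
            return result.get('result') == "success"
        return False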
Example #4
    def checkUserProxy(self):
        """
        Retrieve the user proxy for the task
        If the proxy is valid, pass; otherwise raise an exception
        """

        if self.validProxy is not None:
            return self.validProxy

        command = 'voms-proxy-info'

        if self.cert != '':
            command += ' --file ' + self.cert

        output, ret = self.ExecuteCommand(command)

        try:
            output = output.split("timeleft  :")[1].strip()
        except IndexError:
            self.validProxy = False
            raise SchedulerError("Missing Proxy", output.strip())

        if output == "0:00:00":
            self.validProxy = False
            raise SchedulerError("Proxy Expired", output.strip())

        self.validProxy = True
        return self.validProxy
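A standalone sketch of the timeleft parsing above (the `timeleft  : HH:MM:SS` line is the assumed voms-proxy-info output format):

    def proxy_time_left(output):
        # Return the value of the 'timeleft' field, or None when the
        # field is missing (i.e. no valid proxy was found).
        for line in output.splitlines():
            if line.startswith("timeleft"):
                return line.split(":", 1)[1].strip()
        return None

    # proxy_time_left("subject   : /DC=ch/CN=user\ntimeleft  : 11:59:58")
    # returns '11:59:58'; a value of '0:00:00' means the proxy has expired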
Example #5
    def purgeService(self, obj):
        """
        remove job files from the service used by the scheduler
        (not available for every scheduler)
        """

        # check the proxy
        self.schedObj.checkUserProxy()

        # perform action
        self.schedObj.purgeService(obj)
        timestamp = int(time.time())

        # the object passed is a runningJob
        if type(obj) == RunningJob and self.schedObj.valid(obj):
            obj['status'] = 'E'
            obj['closed'] = 'Y'
            obj['getOutputTime'] = timestamp
            obj['statusScheduler'] = "Cleared"

        # the object passed is a job
        elif type(obj) == Job and self.schedObj.valid(obj.runningJob):
            obj.runningJob['status'] = 'E'
            obj.runningJob['closed'] = 'Y'
            obj.runningJob['getOutputTime'] = timestamp
            obj.runningJob['statusScheduler'] = "Cleared"

        # the object passed is a Task
        elif type(obj) == Task:

            # error messages collector
            errors = ''

            # update objects
            for job in obj.jobs:

                # skip jobs not requested for action
                if not self.schedObj.valid(job.runningJob):
                    continue

                # evaluate errors: if not, update
                if job.runningJob.isError():
                    errors += str(job.runningJob.errors)
                else:
                    job.runningJob['status'] = 'E'
                    job.runningJob['closed'] = 'Y'
                    job.runningJob['getOutputTime'] = timestamp
                    job.runningJob['statusScheduler'] = "Cleared"

            # handle errors
            if errors != '':
                raise SchedulerError('interaction failed for some jobs',
                                     errors)

        # unknown object type
        else:
            raise SchedulerError('wrong argument type', str(type(obj)))
Example #6
    def __init__(self, **args):

        # call super class init method
        super(SchedulerGLite, self).__init__(**args)

        # some initializations
        self.warnings = []

        # typical options
        self.vo = args.get("vo", "cms")
        self.service = args.get("service", "")
        self.config = args.get("config", "")
        self.delegationId = args.get("proxyname", "bossproxy")

        # rename output files with submission number
        self.renameOutputFiles = args.get("renameOutputFiles", 0)
        self.renameOutputFiles = int(self.renameOutputFiles)

        # x509 string & hackEnv for CLI commands
        if self.cert != '':
            self.proxyString = "env X509_USER_PROXY=" + self.cert + ' '
            self.hackEnv = hackTheEnv()
        else:
            self.proxyString = ''
            self.hackEnv = hackTheEnv('env')

        # this section requires an improvement....
        if os.environ.get('CRABDIR'):
            self.commandQueryPath = os.environ.get('CRABDIR') + \
                                    '/external/ProdCommon/BossLite/Scheduler/'
        elif os.environ.get('PRODCOMMON_ROOT'):
            self.commandQueryPath = os.environ.get('PRODCOMMON_ROOT') + \
                                        '/lib/ProdCommon/BossLite/Scheduler/'
        else:
            # Impossible to locate GLiteQueryStatus.py ...
            raise SchedulerError('Impossible to locate GLiteQueryStatus.py ')

        # cache pattern to optimize reg-exp substitution
        self.pathPattern = re.compile('location:([\S]+)$', re.M)
        self.patternCE = re.compile('(?<= - ).*(?=:)', re.M)

        # init BossliteJsonDecoder specialized class
        self.myJSONDecoder = BossliteJsonDecoder()

        # Raise an error if the UI is older than 3.2 ...
        version, ret = self.ExecuteCommand('glite-version')
        version = version.strip()
        if version.find('3.2') != 0:
            version1, ret1 = self.ExecuteCommand(
                'glite-version -n glite-UI -v')
            version1 = version1.strip()
            if version1.find('3.2') != 0:
                raise SchedulerError('SchedulerGLite is allowed on UI >3.2')

        # jobs killed per CLI call (tunable value)
        self.killThreshold = 100
Example #7
    def getOutput(self, obj, outdir):
        """
        retrieve output or just put it in the destination directory
        """

        # check the proxy
        self.schedObj.checkUserProxy()

        # perform action
        self.schedObj.getOutput(obj, outdir)
        timestamp = int(time.time())

        # the object passed is a runningJob
        if type(obj) == RunningJob and self.schedObj.valid(obj):
            obj['status'] = 'E'
            obj['closed'] = 'Y'
            obj['getOutputTime'] = timestamp
            obj['statusScheduler'] = "Retrieved"

        # the object passed is a job
        elif type(obj) == Job and self.schedObj.valid(obj.runningJob):
            obj.runningJob['status'] = 'E'
            obj.runningJob['closed'] = 'Y'
            obj.runningJob['getOutputTime'] = timestamp
            obj.runningJob['statusScheduler'] = "Retrieved"

        # the object passed is a Task
        elif type(obj) == Task:

            # error messages collector
            errors = ''

            # update objects
            for job in obj.jobs:

                # skip jobs not requested for action
                if not self.schedObj.valid(job.runningJob):
                    continue

                # evaluate errors: if not, update
                if job.runningJob.isError():
                    errors += str(job.runningJob.errors)
                else:
                    job.runningJob['status'] = 'E'
                    job.runningJob['closed'] = 'Y'
                    job.runningJob['getOutputTime'] = timestamp
                    job.runningJob['statusScheduler'] = "Retrieved"

            # handle errors
            if errors != '':
                raise SchedulerError('interaction failed for some jobs',
                                     errors)

        # unknown object type
        else:
            raise SchedulerError('wrong argument type', str(type(obj)))
Example #8
    def postMortem(self, obj, schedulerId, outfile, service):
        """
        Get detailed postMortem job info
        """

        if not type(obj) == Task:
            raise SchedulerError('Wrong argument type', str(type(obj)))

        if not outfile:
            raise SchedulerError('Empty filename',
                                 'postMortem called with empty logfile name')

        taskId = obj['name']
        condorId = schedulerId.split('//')[-1]
        header = '========= LOGGING INFO FOR %s =========\n' % schedulerId
        horsep = '\n' + 80 * '=' + '\n'
        sep1 = '\n========= OUTPUT OF : condor_history -match 1 -l %s =========\n' % condorId
        sep2 = '\n========= OUTPUT OF : condor_q -l %s =========\n' % condorId

        self.initializeGsissh(obj)

        fp = open(outfile, 'w')
        fp.write(header)
        fp.write(horsep)
        fp.write(sep1)

        command = '%s %s %s %s ' \
                  % (self.unsetenvScram, self.remoteCommand, self.gsisshOptions, self.remoteUserHost)
        command += ' "condor_history -match 1 -userlog %s/condor.log -l %s"' % \
                   (taskId, condorId)
        (status, output) = commands.getstatusoutput(command)
        if (status):
            if "already exists" in output:
                self.removeGsisshSocket()

        fp.write(output)
        fp.write(horsep)

        # the following condor_q only makes sense if job status
        # is 1(Idle), 2(Run) or 5(Held) but may cost little to do always

        fp.write(sep2)
        command = '%s %s %s %s ' \
                  % (self.unsetenvScram, self.remoteCommand, self.gsisshOptions, self.remoteUserHost)
        command += ' "condor_q -l %s"' % condorId
        (status, output) = commands.getstatusoutput(command)

        fp.write(output)
        fp.write('\n')
        fp.write(horsep)
        fp.close()

        return
Example #9
    def getOutput(self, obj, outdir=''):
        """
        Retrieve (move) job output from cache directory to outdir
        """

        self.initializeGsissh(obj)
        filesToGet = []
        if type(obj) == RunningJob:  # The object passed is a RunningJob
            raise SchedulerError(
                'Operation not possible',
                'Condor cannot retrieve files when passed RunningJob')
        elif type(obj) == Job:  # The object passed is a Job
            # check for the RunningJob integrity
            if not self.valid(obj.runningJob):
                raise SchedulerError('Invalid object',
                                     str(obj.runningJob))
            filesToGet = obj['outputFiles']

        # the object passed is a Task
        elif type(obj) == Task:
            taskId = obj['name']
            self.taskId = taskId
            if outdir == '':
                outdir = obj['outputDirectory']
            for job in obj.jobs:
                if self.valid(job.runningJob):
                    filesToGet.extend(job['outputFiles'])

        # unknown object type
        else:
            raise SchedulerError('Wrong argument type', str(type(obj)))

        retval, stdout = self.rsyncFromRemoteHost(self.remoteUserHost,
                                                  outdir, filesToGet)

        if retval:
            # rsync failed; record an error for each file that is missing
            if type(obj) == Job:
                jobList = [obj]
            else:
                jobList = obj.jobs
            for job in jobList:
                for fileName in job['outputFiles']:
                    if not os.access(outdir + '/' + fileName, os.F_OK):
                        msg = "Could not retrieve file %s." % fileName
                        self.logging.error(msg)
                        msg += " Rsync failed with status,output=\n%d\n%s" % \
                               (retval, stdout)
                        job.runningJob.errors.append(msg)

            if "already exists" in stdout:
                self.removeGsisshSocket()
Example #10
    def query(self, obj, service='', objType='node'):
        """
        query status and possibly other scheduler-related information
        It may use a single 'node' scheduler id or the bulk id for association
        """
        if type(obj) != Task:
            raise SchedulerError('wrong argument type', str(type(obj)))

        jobids = []
        for job in obj.jobs:
            if not self.valid(job.runningJob): continue
            id = str(job.runningJob['schedulerId']).strip()
            p = subprocess.Popen(
                ['squeue', '-h', '-o',
                 '<jobid>%i</jobid><exec_host>%B</exec_host><job_state>%t</job_state>',
                 '-j', id],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            qstat_output, qstat_error = p.communicate()
            qstat_return = p.returncode

            if qstat_return:
                if qstat_return != 1:  # 1 means the job is not known (finished)
                    self.logging.error('Error in job query for ' + id)
                    self.logging.error('SLURM stdout: \n %s' % qstat_output)
                    self.logging.error('SLURM stderr: \n %s' % qstat_error)
                    raise SchedulerError(
                        'SLURM error', '%s: %s' % (qstat_error, qstat_return))

            host = ''
            pbs_stat = 'Done'  # assume finished when squeue returns nothing
            if len(qstat_output) != 0:
                if qstat_output.find('</exec_host>') >= 0:
                    host = qstat_output[qstat_output.find('<exec_host>') +
                                        len('<exec_host>'):
                                        qstat_output.find('</exec_host>')]
                if qstat_output.find('</job_state>') >= 0:
                    pbs_stat = qstat_output[qstat_output.find('<job_state>') +
                                            len('<job_state>'):
                                            qstat_output.find('</job_state>')]

            job.runningJob['statusScheduler'] = pbs_stat
            job.runningJob['status'] = self.status_map[pbs_stat]
            job.runningJob['destination'] = host
示例#11
0
    def kill(self, obj):
        """
        kill the job instance

        does not return
        """
        r = re.compile("Job <(\d+)> is being terminated")
        rFinished = re.compile("Job <(\d+)>: Job has already finished")
        for job in obj.jobs:
            if not self.valid(job.runningJob):
                continue
            jobid = str(job.runningJob['schedulerId']).strip()
            command = 'bkill ' + str(jobid)
            if self.ksuCmd:
                # write a ksu tmpFile
                cmd = '%s\n' % command
                command, fname = self.createCommand(cmd, obj)

            out, ret = self.executeCommandWrapper(command)

            if self.ksuCmd: os.unlink(fname)
            mFailed = rFinished.search(out)
            if mFailed:
                raise SchedulerError(
                    "Unable to kill job " + jobid + " . Reason: ", out,
                    command)
Example #12
    def kill(self, obj):
        """
        kill the job instance

        does not return
        """
        r = re.compile("has registered the job (\d+) for deletion")
        rFinished = re.compile("Job <(\d+)>: Job has already finished")
        r2 = re.compile("has deleted job (\d+)")  #by Leo
        for job in obj.jobs:
            if not self.valid(job.runningJob):
                continue
            jobid = str(job.runningJob['schedulerId']).strip()
            cmd = 'qdel ' + str(jobid)
            out, ret = self.ExecuteCommand(cmd)
            mKilled = r.search(out)
            mKilled2 = r2.search(out)

            if not mKilled and not mKilled2:
                raise SchedulerError(
                    "Unable to kill job #" + str(job['jobId']) + " (SGE id:" +
                    jobid + ") . Reason: ", out)
Example #13
    def postMortem(self, obj, outfile):
        """
        execute any post mortem command such as logging-info
        """

        # check the proxy
        self.schedObj.checkUserProxy()

        # the object passed is a runningJob
        if type(obj) == RunningJob:
            self.schedObj.postMortem(obj, obj['schedulerId'], outfile,
                                     self.parameters['service'])

        # the object passed is a job
        elif type(obj) == Job:
            self.schedObj.postMortem(obj, obj.runningJob['schedulerId'],
                                     outfile, self.parameters['service'])

        # the object passed is a Task
        elif type(obj) == Task:
            for job in obj.jobs:
                if job.runningJob is None:
                    continue
                self.schedObj.postMortem(obj, job.runningJob['schedulerId'],
                                         outfile, self.parameters['service'])

        # unknown object type
        else:
            raise SchedulerError('wrong argument type', str(type(obj)))
Example #14
    def getOutput(self, obj, outdir=''):
        """
        retrieve output or just put it in the destination directory
        """

        # obj can be a task, a job or even a running job
        # several possibilities:
        # 1) connect to a service and perform a remote copy
        # 2) just copy the local output to the destination dir, if needed
        # 3) wrap a CLI command like glite-wms-job-output (see the sketch
        #    after this method)

        errorList = []

        if outdir == '' and obj['outputDirectory'] is not None:
            outdir = obj['outputDirectory']

        if outdir != '' and not os.path.exists(outdir):
            raise SchedulerError('Permission denied',
                                 'Unable to write files in ' + outdir)

        # retrieve scheduler id list
        schedIdList = {}
        for job in obj.jobs:
            if self.valid(job.runningJob):
                # retrieve output
                # if error: job.runningJob.errors.append( error )
                pass
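A minimal sketch of possibility 3 above, filling the skeleton loop by wrapping glite-wms-job-output per job; the '--noint'/'--dir' flags and the self.proxyString/self.ExecuteCommand helpers (seen in the glite examples elsewhere in this collection) are assumptions here, not a verified contract:

        # Sketch only: wrap glite-wms-job-output per job (possibility 3);
        # flag names and self.proxyString/self.ExecuteCommand are assumed
        # from the other glite examples in this collection.
        for job in obj.jobs:
            if not self.valid(job.runningJob):
                continue
            schedulerId = str(job.runningJob['schedulerId']).strip()
            command = 'glite-wms-job-output --noint --dir %s %s' % \
                      (outdir, schedulerId)
            out, ret = self.ExecuteCommand(self.proxyString + command)
            if ret != 0:
                job.runningJob.errors.append(out)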
Example #15
    def getOutput(self, obj, outdir):
        """
        retrieve output or just put it in the destination directory

        does not return
        """
        # With a user-defined output directory the output ends up in the
        # wrong location, so we have to move it to the correct directory
        # here.

        if type(obj) == Task:
            # copy the new output files from the temp directory
            oldoutdir = obj['outputDirectory'] + '/temp'
            if (outdir != oldoutdir):
                for job in obj.jobs:
                    if self.valid(job.runningJob):
                        for outFile in job['outputFiles']:
                            command = "mv " + oldoutdir + "/" + outFile + \
                                      " " + outdir + "/. \n"
                            out, ret = self.ExecuteCommand(command)
                            if (out != ""):
                                raise SchedulerError('unable to move file',
                                                     out)
Example #16
    def query(self, obj, objType='node'):
        """
        query status and possibly other scheduler-related information
        """

        # check the proxy
        self.schedObj.checkUserProxy()

        # error messages collector
        errors = ''

        # delegate query to scheduler plugin
        self.schedObj.query(obj, self.parameters['service'], objType)

        # collect errors from the jobs
        for job in obj.jobs:
            if job.runningJob.isError():
                errors += str(job.runningJob.errors)

        # handle errors
        if errors != '':
            raise SchedulerError('interaction failed for some jobs', errors)
Example #17
class Scheduler(object):
    """
    Upper layer for scheduler interaction

    """
    def __init__(self, scheduler, parameters=None):
        """
        initialization
        """

        # define scheduler parameters
        self.scheduler = scheduler
        defaults = {'user_proxy': '', 'service': '', 'config': ''}
        if parameters is not None:
            defaults.update(parameters)
        self.parameters = defaults

        # load scheduler plugin
        try:
            module = __import__(
                'ProdCommon.BossLite.Scheduler.' + self.scheduler,
                globals(), locals(), [self.scheduler])
            schedClass = vars(module)[self.scheduler]
            self.schedObj = schedClass(**self.parameters)
        except KeyError, e:
            msg = 'Scheduler interface ' + self.scheduler + ' not found'
            raise SchedulerError(msg, str(e))
        except Exception, e:
            raise SchedulerError(e.__class__.__name__, str(e))
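A usage sketch for the wrapper class; the plugin name, proxy path and the pre-built 'task' object are illustrative, not taken from the source:

    # Usage sketch: load a plugin by name and query a task's status.
    # 'task' is assumed to be a populated BossLite Task object.
    scheduler = Scheduler('SchedulerGLite', {'user_proxy': '/tmp/x509up_u500'})
    scheduler.query(task)
    for job in task.jobs:
        print job.runningJob['statusScheduler']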
Example #18
    def submitJob(self, job, task=None, requirements=''):
        """Need to copy the inputsandbox to WN before submitting a job"""

        arg = self.decode(job, task, requirements)

        command = "qsub " + arg
        self.logging.debug(command)
        out, ret = self.ExecuteCommand(command)
        self.logging.debug("crab:  %s" % out)
        r = re.compile("Your job (\d+) .* has been submitted")

        m = r.search(out)
        if m is not None:
            jobId = m.group(1)
            command = "qstat -j " + jobId
            #out, ret = self.ExecuteCommand(command)
            #print "out:" + out + "\n"
            #queue = m.group(2)
            queue = "all"
        else:
            raise SchedulerError('error', out)
        taskId = None
        retMap = {job['name']: jobId}
        return retMap, taskId, queue
Example #19
    def pbs_conn(self):
        """
        Open a connection to the default PBS server
        """
        conn = pbs.pbs_connect(pbs.pbs_default())
        if (conn < 0):
            err, err_text = pbs.error()
            self.logging.error('Error in PBS server connect')
            self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
            raise SchedulerError('PBS error', str(err) + ': ' + err_text)
        return conn
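Callers pair this with pbs_disconn (see the PBS query/kill examples below); a sketch that guarantees disconnection with try/finally instead of the per-error-path calls used there:

        # Sketch: acquire the connection and guarantee its release,
        # whatever the enclosed PBS calls do.
        conn = self.pbs_conn()
        try:
            pass  # pbs_statjob / pbs_deljob calls as in the examples below
        finally:
            self.pbs_disconn(conn)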
Example #20
    def getOutput(self, obj, outdir=''):
        """
        Get output files from jobs in 'obj' and put them in 'outdir', and  
        remove the job from the CE.
        """
        if type(obj) == Task:
            joblist = obj.jobs
            if outdir == '':
                outdir = obj['outputDirectory']
        elif type(obj) == Job:
            joblist = [obj]
        else:
            raise SchedulerError('wrong argument type', str(type(obj)))

        assert outdir != ''
        if outdir[-1] != '/': outdir += '/'

        jobsFile, arcId2job = self.createJobsFile(joblist, "Will fetch")

        # Create a tmp dir where arcget can create its subdirs of job
        # output. Use outdir as the parent dir, to keep the later move of
        # files within the same file system (faster!)
        tmpdir = tempfile.mkdtemp(prefix="joboutputs.", dir=outdir)

        cmd = self.pre_arcCmd + 'arcget -i %s -dir %s' % (jobsFile.name,
                                                          tmpdir)
        self.logging.debug("Running command: %s" % cmd)
        output, stat = self.ExecuteCommand(cmd)
        self.logging.debug("Output of arcget: %s" % output)
        jobsFile.close()
        if stat != 0:
            raise SchedulerError('arcget returned %i' % stat, output, cmd)

        # Move the downloaded files to their final destination
        cmd = 'mv %s/*/* %s' % (tmpdir, outdir)
        self.logging.debug("Moving files from %s/* to %s" % (tmpdir, outdir))
        output, stat = self.ExecuteCommand(cmd)
        if stat != 0:
            raise SchedulerError('mv returned %i' % stat, output, cmd)

        # Remove the tmp output dir
        cmd = 'rm -r %s' % tmpdir
        output, stat = self.ExecuteCommand(cmd)
        if stat != 0:
            raise SchedulerError('rm returned %i' % stat, output, cmd)
Example #21
    def submitJob(self, job, task=None, requirements=''):
        """ Need to copy the inputsandbox to WN before submitting a job"""

        arg = self.decode(job, task, requirements)

        # command = "bsub " + arg
        chDir = "pushd . > /dev/null ; "
        resetDir = " ; popd > /dev/null"
        command = " bsub " + arg + resetDir

        if self.ksuCmd:
            chDir += "cd /tmp; "
            cmd = "#!/usr/bin/pagsh.krb\n"
            cmd += "aklog\n"
            cmd += '%s %s\n' % (chDir, command)
            command, fname = self.createCommand(cmd, task)
        else:
            # execute bsub in the directory where files have to be returned
            chDir += " cd %s ;" % task['outputDirectory']
            command = '%s %s' % (chDir, command)
        out, ret = self.executeCommandWrapper(command)

        if self.ksuCmd: os.unlink(fname)
        if ret != 0:
            raise SchedulerError('Error in submit', out, command)
        r = re.compile("Job <(\d+)> is submitted.*<(\w+)>")

        m = r.search(out)
        if m is not None:
            jobId = m.group(1)
            queue = m.group(2)
        else:
            rNot = re.compile("Job not submitted.*<(\w+)>")
            m = rNot.search(out)
            if m is not None:
                self.logging.error(m.group(0))
                self.logging.error("Job NOT submitted")
                self.logging.error(out)
                job.runningJob.errors.append(out)
            raise SchedulerError('Cannot submit ', out, command)
        taskId = None
        retMap = {job['name']: jobId}
        return retMap, taskId, queue
Example #22
    def query(self, obj, service='', objType='node'):
        """
        query status and possibly other scheduler-related information
        It may use a single 'node' scheduler id or the bulk id for association
        """
        if type(obj) != Task:
            raise SchedulerError('wrong argument type', str(type(obj)))

        jobids = []

        conn = self.pbs_conn()
        attrl = pbs.new_attrl(2)
        attrl[0].name = 'job_state'
        attrl[1].name = 'exec_host'

        for job in obj.jobs:
            if not self.valid(job.runningJob): continue
            id = str(job.runningJob['schedulerId']).strip()
            jobstat = pbs.pbs_statjob(conn, id, attrl, 'Null')

            if not jobstat:
                err, err_text = pbs.error()
                if err != 15001:  # unknown job (probably finished)
                    self.logging.error('Error in job query for ' + id)
                    self.logging.error('PBS error code ' + str(err) + ': ' +
                                       err_text)
                    self.pbs_disconn(conn)
                    raise SchedulerError('PBS error',
                                         str(err) + ': ' + err_text)

            host = ''
            if len(jobstat) == 0:
                pbs_stat = 'Done'
            else:
                pbs_stat = jobstat[0].attribs[0].value
                if len(jobstat[0].attribs) > 1:
                    host = jobstat[0].attribs[1].value
            job.runningJob['statusScheduler'] = pbs_stat
            job.runningJob['status'] = self.status_map[pbs_stat]
            job.runningJob['destination'] = host

        self.pbs_disconn(conn)
Example #23
    def purgeService(self, obj):
        """
        Purge job (even bulk) from service
        """

        # not always available...
        # it may be useful to connect to a remote service and purge the job
        # sandbox (see the sketch after this method)

        out = "whatever"
        if out.find('error') >= 0:
            raise SchedulerError("Unable to purge job", out)
Example #24
    def __init__(self, **args):
        super(SchedulerPbs, self).__init__(**args)
        self.jobScriptDir = args['jobScriptDir']
        self.jobResDir = args['jobResDir']
        self.queue = args['queue']
        self.workerNodeWorkDir = args.get('workDir', '')

        self.res_dict = {}
        for a in args['resources'].split(','):
            if len(a) > 0:
                if a.find("=") != -1:
                    res, val = a.split('=')
                    self.res_dict.update({res: val})
                else:
                    raise SchedulerError("PBS error", +\
                                         "Unkown resource format: " + a)

        env = []
        for v in ('HOME', 'LANG', 'LOGNAME', 'MAIL', 'PATH', 'SHELL'):
            env.append('PBS_O_' + v + '=' + os.environ[v])

        env.append('PBS_O_WORKDIR=' + os.getcwd())
        env.append('PBS_O_HOST=' + pbs.pbs_default())

        self.pbs_env = ','.join(env)

        self.status_map = {
            'E': 'R',
            'H': 'SS',
            'Q': 'SS',
            'R': 'R',
            'S': 'R',
            'T': 'R',
            'W': 'SS',
            'Done': 'SD',
            'C': 'SD'
        }
Example #25
    def matchResources(self, obj, requirements='', config='', service=''):
        """
        resources list match
        """

        # several possibilities:
        # 1) connect to a service and ask
        # 2) wrap a CLI command like glite-wms-job-list-match (see the
        #    sketch after this method)
        # 3) not available... skip
        # 4) there is a useful lcgInfo...

        out = "whatever"
        if out.find('error') >= 0:
            raise SchedulerError("Unable to find resources", out)
Example #26
    def getOutput(self, obj, outdir=''):
        """
        Get output files from jobs in 'obj' and put them in 'outdir', and  
        remove the job from the CE.
        """
        if type(obj) == Task:
            self.logging.debug("getOutput called for %i jobs" % len(obj.jobs))
            joblist = obj.jobs
            if outdir == '':
                outdir = obj['outputDirectory']
        elif type(obj) == Job:
            self.logging.debug("getOutput called for 1 job")
            joblist = [obj]
        else:
            raise SchedulerError('wrong argument type', str(type(obj)))

        assert outdir != ''
        if outdir[-1] != '/': outdir += '/'

        for job in joblist:
            tmpdir = tempfile.mkdtemp(prefix="joboutputs.", dir=outdir)

            cmd = self.pre_arcCmd + 'arcget --timeout=600 %s --dir %s' % (
                job.runningJob['schedulerId'], tmpdir)
            self.logging.debug("Running command: %s" % cmd)
            output, stat = self.ExecuteCommand(cmd)
            self.logging.debug("Status and output of arcget: %i, '%s'" %
                               (stat, output))
            if stat != 0:
                msg = "arcget failed with status %i: %s" % (stat, output)
                self.logging.warning(msg)
            else:
                # Move the downloaded files to their final destination
                cmd = 'mv %s/*/* %s' % (tmpdir, outdir)
                self.logging.debug("Moving files from %s/* to %s" %
                                   (tmpdir, outdir))
                output, stat = self.ExecuteCommand(cmd)
                if stat != 0:
                    msg = "Moving files to final destination failed: %s" % (
                        output)
                    self.logging.warning(msg)
                else:
                    cmd = ' rm -r %s' % (tmpdir)
                    self.logging.debug("Removing tempdir %s" % (tmpdir))
                    output, stat = self.ExecuteCommand(cmd)
                    if stat != 0:
                        msg = "Removing tempdir: %s" % (output)
                        self.logging.warning(msg)
Example #27
    def kill(self, obj):

        for job in obj.jobs:
            if not self.valid(job.runningJob):
                continue
            id = str(job.runningJob['schedulerId']).strip()

            p = subprocess.Popen(['qdel', id], stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
            qdel_output, qdel_error = p.communicate()
            qdel_return = p.returncode

            if qdel_return != 0:
                self.logging.error('Error in job kill for ' + id)
                self.logging.error('PBS Error stdout: %s' % qdel_output)
                raise SchedulerError('PBS Error in kill', qdel_output)
Example #28
    def submit(self, task, requirements='', config='', service=''):
        """
        user submission function
        
        takes as arguments:
        - a finite, dedicated jdl
        - eventually a list of services to connect
        - eventually a config file

        the passed config file or, if not provided, a default one can be
        used from eventual defaults

        the function returns an eventual parent id, the service of the
        successfully submission and a map associating the jobname to the
        node id. If the submission is not bulk, the parent id should be the
        node id of the unique entry of the map
        
        """

        taskId = None
        queue = None
        retMap = {}

        for job in task.jobs:
            command = self.decodeJob(job, task, requirements)
            out, ret = self.ExecuteCommand(command)
            if ret != 0:
                raise SchedulerError('Error in submit', out, command)

            r = re.compile("Job <(\d+)> is submitted.*<(\w+)>")

            m = r.search(out)
            if m is not None:
                jobId = m.group(1)
                queue = m.group(2)
                retMap[job['name']] = jobId
            else:
                rNot = re.compile("Job not submitted.*<(\w+)>")
                m = rNot.search(out)
                if m is not None:
                    self.logging.error("Job NOT submitted: %s" % out)
                    job.runningJob.errors.append('Cannot submit using %s: %s' %
                                                 (command, out))

        return retMap, taskId, queue
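A sketch of how the returned triple is meant to be consumed (the 'scheduler' and 'task' names are illustrative):

    # Consuming the (retMap, taskId, queue) contract from the docstring:
    retMap, taskId, queue = scheduler.submit(task)
    for jobName, schedulerId in retMap.iteritems():
        print "job %s submitted with scheduler id %s" % (jobName, schedulerId)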
Example #29
    def query(self, obj, service='', objType='node'):
        """
        query status and eventually other scheduler related information
        """

        # ask for the job information, mainly status
        # some systems only allow querying job by job, others support bulk
        # queries as well

        r = re.compile("(\d+)\s+\w+\s+(\w+).*")
        rfull = re.compile("(\d+)\s+\w+\s+(\w+)\s+(\w+)\s+\w+\s+(\w+).*")
        rnotfound = re.compile("Job <(\d+)> is not found")
        for job in obj.jobs:

            if not self.valid(job.runningJob):
                continue

            jobid = str(job.runningJob['schedulerId']).strip()
            command = 'bjobs ' + str(jobid)
            out, ret = self.ExecuteCommand(command)
            if ret != 0:
                raise SchedulerError('Error in status query', out, command)

            mnotfound = rnotfound.search(out)
            queue = None
            host = None
            sid = None
            st = None
            if (mnotfound):
                sid = mnotfound.group(1)
                st = 'DONE'
            else:
                mfull = rfull.search(out)
                if (mfull):
                    sid, st, queue, host = mfull.groups()
                else:
                    m = r.search(out)
                    if (m):
                        sid, st = m.groups()

            if (st):
                job.runningJob['statusScheduler'] = st
                job.runningJob['status'] = self.statusMap[st]
            if (host):
                job.runningJob['destination'] = host
Example #30
    def kill(self, obj):

        conn = self.pbs_conn()

        for job in obj.jobs:
            if not self.valid(job.runningJob): continue
            id = str(job.runningJob['schedulerId']).strip()
            res = pbs.pbs_deljob(conn, id, '')

            if res != 0:
                err, err_text = pbs.error()
                self.logging.error('Error in job kill for ' + id)
                self.logging.error('PBS error code ' + str(err) + ': ' +
                                   err_text)
                self.pbs_disconn(conn)
                raise SchedulerError('PBS error', str(err) + ': ' + err_text)

        self.pbs_disconn(conn)