示例#1
0
    def processJobErrors(self, id, appjobid, failedjob):
        '''
        Inspect the errors of a failed ARC job and choose its next state.

        id is the DB row id used for the attemptsleft bookkeeping, appjobid
        is only used as a log prefix and failedjob is the arc job object.

        Returns one of:
          "torerun"    - the restart state is PREPARING or FINISHING and the
                         user output file list error is not reported
          "toresubmit" - an error matches the configured resubmit list and
                         attempts remain after decrementing the counter
          "failed"     - anything else

        Unless "torerun" is returned, the stored 'attemptsleft' value is
        decremented (never below zero) and written back. When the job is to
        be rerun or resubmitted its State is reset to Undefined.
        '''
        errors = ";".join(failedjob.Error)
        arcjobid = failedjob.JobID
        self.log.info("%s: Job failure for %s: %s" %
                      (appjobid, arcjobid, errors))

        # Data staging failures can be rerun, unless the list of user
        # generated output files could not be read
        restartstate = failedjob.RestartState
        stagingfailure = (restartstate == arc.JobState.PREPARING
                          or restartstate == arc.JobState.FINISHING)
        outputlistlost = 'Error reading user generated output file list' in errors
        if stagingfailure and not outputlistlost:
            self.log.info("%s: Will rerun job %s" % (appjobid, arcjobid))
            # Reset arc job state so that next time new state will be picked up
            failedjob.State = arc.JobState('Undefined')
            return "torerun"

        # Configured error patterns which occur in this job's errors
        resubmittable = [
            pattern
            for pattern in self.conf.getList(
                ['errors', 'toresubmit', 'arcerrors', 'item'])
            if pattern in errors
        ]

        # One attempt is used up regardless of the outcome below
        stored = self.db.getArcJobInfo(id, ['attemptsleft'])['attemptsleft']
        attemptsleft = max(int(stored) - 1, 0)
        self.db.updateArcJob(id, {'attemptsleft': str(attemptsleft)})

        if not resubmittable:
            self.log.info("%s: Job %s has fatal errors, cannot resubmit" %
                          (appjobid, arcjobid))
            return "failed"

        if attemptsleft == 0:
            self.log.info("%s: Job %s out of retries" % (appjobid, arcjobid))
            return "failed"

        self.log.info("%s: Will resubmit job %s, %i attempts left" %
                      (appjobid, arcjobid, attemptsleft))
        failedjob.State = arc.JobState('Undefined')
        return "toresubmit"
示例#2
0
    def checkJobs(self):
        '''
        Query ARC for the status of this cluster's active jobs and store
        any changes in the DB.

        Returns immediately (None) when called less than the configured
        jobs/checkmintime seconds after the previous check, or when the DB
        query finds no jobs to check. Otherwise the selected jobs are
        refreshed through one arc.JobSupervisor per proxy and, for each job:

          - if the general state is unchanged only 'tarcstate' is refreshed
            (except on the clusters listed in alwaysupdate below);
          - otherwise the ARC state is mapped to an arcstate value, the
            reported wall and CPU times are sanitised, and the whole job
            object is written back together with new timestamps.
        '''

        # minimum time between checks
        if time.time() < self.checktime + int(
                self.conf.get(['jobs', 'checkmintime'])):
            self.log.debug("mininterval not reached")
            return
        self.checktime = time.time()

        # check jobs which were last checked more than checkinterval ago
        jobstocheck = self.db.getArcJobs(
            "arcstate in ('submitted', 'running', 'finishing', 'cancelling', 'holding') and "
            "jobid not like '' and cluster='" + self.cluster + "' and " +
            self.db.timeStampLessThan("tarcstate",
                                      self.conf.get(['jobs', 'checkinterval'])) +
            " limit 100000")

        # jobstocheck maps proxyid -> list of (id, appjobid, arc job, created)
        njobstocheck = sum(len(v) for v in jobstocheck.values())
        if not njobstocheck:
            return
        self.log.info("%d jobs to check" % njobstocheck)
        self.resetJobs(jobstocheck)

        # Clusters for which the job numbers are stored on every check even
        # when the state did not change (original note: "Update numbers every
        # time for superMUC since walltime is missing for finished jobs").
        # The membership test does not depend on the job, so do it once.
        alwaysupdate = (
            'gsiftp://gar-ex-etpgrid1.garching.physik.uni-muenchen.de:2811/preempt',
            'gsiftp://arc1-it4i.farm.particle.cz/qfree',
            'gsiftp://arc2-it4i.farm.particle.cz/qfree',
        )
        forceupdate = self.cluster in alwaysupdate

        # Loop over proxies
        for proxyid, jobs in jobstocheck.items():
            self.uc.CredentialString(str(self.db.getProxy(proxyid)))

            job_supervisor = arc.JobSupervisor(self.uc, [j[2] for j in jobs])
            job_supervisor.Update()
            jobsupdated = job_supervisor.GetAllJobs()
            jobsnotupdated = job_supervisor.GetIDsNotProcessed()

            for (originaljobinfo, updatedjob) in zip(jobs, jobsupdated):
                (dbid, appjobid, originaljob, created) = originaljobinfo
                if updatedjob.JobID in jobsnotupdated:
                    self.log.error("%s: Failed to find information on %s" %
                                   (appjobid, updatedjob.JobID))
                    continue
                if updatedjob.JobID != originaljob.JobID:
                    # something went wrong with list order
                    self.log.warning(
                        "%s: Bad job id (%s), expected %s" %
                        (appjobid, updatedjob.JobID, originaljob.JobID))
                    continue
                # compare strings here to get around limitations of JobState API
                # map INLRMS:S and O to HOLD (not necessary when ARC 4.1 is used)
                if updatedjob.State.GetGeneralState() == 'Queuing' and (
                        updatedjob.State.GetSpecificState() == 'INLRMS:S'
                        or updatedjob.State.GetSpecificState() == 'INLRMS:O'):
                    updatedjob.State = arc.JobState('Hold')
                if originaljob.State.GetGeneralState() == updatedjob.State.GetGeneralState() \
                        and not forceupdate:
                    # just update timestamp
                    self.db.updateArcJob(dbid,
                                         {'tarcstate': self.db.getTimeStamp()})
                    continue

                self.log.info("%s: Job %s: %s -> %s (%s)" %
                              (appjobid, originaljob.JobID,
                               originaljob.State.GetGeneralState(),
                               updatedjob.State.GetGeneralState(),
                               updatedjob.State.GetSpecificState()))

                # state changed, update whole Job object
                # Any ARC state not handled below is stored as 'submitted'
                arcstate = 'submitted'
                if updatedjob.State == arc.JobState.FINISHED:
                    if updatedjob.ExitCode == -1:
                        # Missing exit code, but assume success
                        self.log.warning(
                            "%s: Job %s FINISHED but has missing exit code, setting to zero"
                            % (appjobid, updatedjob.JobID))
                        updatedjob.ExitCode = 0
                    arcstate = 'finished'
                    self.log.debug(
                        '%s: reported walltime %d, cputime %d' %
                        (appjobid, updatedjob.UsedTotalWallTime.GetPeriod(),
                         updatedjob.UsedTotalCPUTime.GetPeriod()))
                elif updatedjob.State == arc.JobState.FAILED:
                    # EMI-ES reports cancelled jobs as failed so check substate (this is fixed in ARC 6.8)
                    if 'cancel' in updatedjob.State.GetSpecificState():
                        arcstate = 'cancelled'
                    else:
                        arcstate = self.processJobErrors(
                            dbid, appjobid, updatedjob)
                elif updatedjob.State == arc.JobState.KILLED:
                    arcstate = 'cancelled'
                elif updatedjob.State == arc.JobState.RUNNING:
                    arcstate = 'running'
                elif updatedjob.State == arc.JobState.FINISHING:
                    arcstate = 'finishing'
                elif updatedjob.State == arc.JobState.HOLD:
                    arcstate = 'holding'
                elif updatedjob.State == arc.JobState.DELETED or \
                     updatedjob.State == arc.JobState.OTHER:
                    # unexpected
                    arcstate = 'failed'

                # Walltime reported by ARC 6 is multiplied by cores
                if arc.ARC_VERSION_MAJOR >= 6 and updatedjob.RequestedSlots > 0:
                    updatedjob.UsedTotalWallTime = arc.Period(
                        updatedjob.UsedTotalWallTime.GetPeriod() //
                        updatedjob.RequestedSlots)

                # Fix crazy wallclock and CPU times: a job cannot have used
                # more walltime than has passed since it was created.
                # time.mktime() on the time tuple is the portable form of the
                # glibc-only strftime("%s") conversion, and the period is
                # computed once so the value compared is the value stored.
                createdepoch = int(time.mktime(created.timetuple()))
                sincecreation = arc.Time() - arc.Time(createdepoch)
                if updatedjob.UsedTotalWallTime > sincecreation:
                    self.log.warning(
                        "%s: Fixing reported walltime %d to %d" %
                        (appjobid, updatedjob.UsedTotalWallTime.GetPeriod(),
                         sincecreation.GetPeriod()))
                    updatedjob.UsedTotalWallTime = sincecreation
                if updatedjob.UsedTotalCPUTime > arc.Period(10**7):
                    self.log.warning(
                        "%s: Discarding reported CPUtime %d" %
                        (appjobid, updatedjob.UsedTotalCPUTime.GetPeriod()))
                    updatedjob.UsedTotalCPUTime = arc.Period(-1)
                self.db.updateArcJob(
                    dbid, {
                        'arcstate': arcstate,
                        'tarcstate': self.db.getTimeStamp(),
                        'tstate': self.db.getTimeStamp()
                    }, updatedjob)

        self.log.info('Done')