Exemplo n.º 1
0
 def performRetry(self, job):
     '''Retry a previous job. Update stored information.'''
     # Get current info for this job
     oldJobID = job.JobID
     jobInfo = PdbUtils.readJobDB(oldJobID, False)
     # Determine whether we need to redo build
     if jobInfo.buildStatus in ['', 'finished']:
         print "Build OK."
         self.pbookCore.retry(oldJobID, newSite=self.useNewSite)
     else:
         print "Retrying build."
         self.pbookCore.retry(oldJobID,
                              retryBuild=True,
                              newSite=self.useNewSite)
     # Retrieve ID of new job
     jobInfo = PdbUtils.readJobDB(oldJobID, False)
     newJobID = jobInfo.retryID
     if newJobID == 0:
         newJobID == oldJobID
     # Remove old information in currentJobs
     if not isinstance(oldJobID, str):
         oldJobID = '%d' % oldJobID
     del self.currentJobs[oldJobID]
     # Put current information in currentJobs
     self.addJobToList(newJobID)
     print "Retrying job", oldJobID, ". New JobID:", newJobID
Exemplo n.º 2
0
 def retryFailed(self, job):
     '''Retry a failed job once db entry is frozen.'''
     print "retrying failed job"
     #Is anything in the job still running? If so, kill it.
     jobInfo = PdbUtils.readJobDB(job.JobID, False)
     if jobInfo.dbStatus != 'frozen':
         self.pbookCore.kill(job.JobID)
     self.performRetry(job)
     job.prunAttemptCount += 1
Exemplo n.º 3
0
    def checkCurrentStatus(self, job):
        '''I use combinations of panda job statuses to define
    jobs as finished, failed, stuck, or running.'''

        # Panda job status options are:
        # [defined, assigned, activated, running
        #  holding, transferring, finished, failed]
        # https://www.gridpp.ac.uk/wiki/ATLAS_Monitoring_For_Sites

        jobInfo = PdbUtils.readJobDB(job.JobID)
        statusstring = jobInfo.jobStatus
        status = statusstring.split(",")
        print "Status of job", job.JobID, "is", status
        if sorted(status) != sorted(job.status):
            job.status = status
            job.statusSince = time.time()  # seconds
        timeInThisState = time.time() - job.statusSince

        # If anything has failed, need to retry
        #    if status.count('failed') :
        #      return 'failed'

        # If everything finished, so is this job.
        #    elif status.count('finished')==len(status) \
        #           and jobInfo.dbStatus=='frozen':
        #      return 'finished'

        if jobInfo.dbStatus == 'frozen':
            if status.count('failed'):
                return 'failed'
            elif status.count('finished') == len(status):
                return 'finished'
            else:
                print "Unrecognized option!"
                return 'none'

        # Don't currently count holding at the end as stuck
        elif status.count('defined') or status.count('assigned') \
               or status.count('activated') or status.count('transferring') \
               or status.count('starting') :
            if timeInThisState < self.defineJobAsStuck:
                return 'stillRunning'
            else:
                return 'stuck'

        # If not stuck or failed but something is still running, keep waiting
        elif status.count('running') or status.count('holding') \
               or status.count('sent') :
            return 'stillRunning'

        else:
            print "Unrecognized status!"
            return 'none'
Exemplo n.º 4
0
 def getJobInfo(self,JobID):
     '''Return the local-repository record for a job, or None if absent.

     JobID -- identifier that is first passed through
              self.convertTaskToJobID before the lookup.

     A warning is logged when no record is found.
     '''
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # convert taskID to jobsetID
     JobID = self.convertTaskToJobID(JobID)
     # get job info from local repository
     job = PdbUtils.readJobDB(JobID,self.verbose)
     # not found; identity test so a custom __eq__ on the record cannot interfere
     if job is None:
         tmpLog.warning("JobID=%s not found in local repository. Synchronization may be needed" % JobID)
     # the record, or None when it was not found
     return job
Exemplo n.º 5
0
 def getJobInfo(self, JobID):
     '''Return the local-repository record for JobID, or None if absent.

     A warning is logged when no record is found.
     '''
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # get job info from local repository
     job = PdbUtils.readJobDB(JobID, self.verbose)
     # not found; identity test so a custom __eq__ on the record cannot interfere
     if job is None:
         tmpLog.warning(
             "JobID=%s not found in local repository. Synchronization may be needed"
             % JobID)
     # the record, or None when it was not found
     return job
Exemplo n.º 6
0
 def getJobJobsetInfo(self,id):
     '''Return the jobset or job record stored locally under id, or None.

     id -- identifier looked up first as a jobset and, if that yields
           nothing, as a job.  (The name shadows the builtin but is kept
           because it is part of the method's signature.)

     A warning is logged when neither lookup finds a record.
     '''
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # try to get jobset
     job = PdbUtils.readJobsetDB(id,self.verbose)
     # fall back to job info from local repository
     if job is None:
         job = PdbUtils.readJobDB(id,self.verbose)
     # not found
     if job is None:
         # format the parameter 'id'; there is no 'JobID' name in this scope
         tmpLog.warning("JobsetID/JobID=%s not found in local repository. Synchronization may be needed" % id)
     # the record, or None when it was not found
     return job
Exemplo n.º 7
0
 def addJobToList(self, jobID):
     '''Get job info for jobID and add GridJob to self.currentJobs'''
     self.pbookCore.sync()
     if not isinstance(jobID, str):
         jobID = '%d' % jobID
     if jobID not in self.currentJobs.keys():
         jobinfo = PdbUtils.readJobDB(jobID, False)
         if jobinfo == None:
             print "No job found with ID", jobID, "!"
             self.failedJobs.append(jobID)
             return
         newjob = GridJob(jobinfo,self.outputdir,self.dq2SetupScript,\
                  self.defineDownloadAsStuck,self.dq2RetryLimit)
         self.currentJobs[jobID] = newjob
Exemplo n.º 8
0
 def sync(self):
     '''Synchronize the local job repository with JEDI tasks from the panda server.

     Tasks are requested from the server starting at the stored
     last_synctime (180 days back when none is stored or when
     self.restoreDB is set).  Tasks missing locally are inserted; tasks
     already present are updated unless their local record is 'frozen'.

     On any failure an error is logged and the method returns early,
     leaving last_synctime untouched.  Returns None in all cases.
     '''
     # get logger
     tmpLog = PLogger.getPandaLogger()
     tmpLog.info("Synchronizing local repository ...")
     # check proxy
     self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
             self.gridPassPhrase,
             False,
             self.verbose,
             useCache=True)
     # get nickname
     nickName = PsubUtils.getNickname()
     # set Rucio accounting
     PsubUtils.setRucioAccount(nickName,'pbook',True)
     # get JobIDs in local repository
     localJobIDs = PdbUtils.getListOfJobIDs()
     # timestamp taken before querying; stored as last_synctime on success
     syncTimeRaw = datetime.datetime.utcnow()
     syncTime = syncTimeRaw.strftime('%Y-%m-%d %H:%M:%S')
     # set sync time for the first attempt
     bookConf = BookConfig.getConfig()
     if self.restoreDB:
         # reset last_synctime to restore database
         bookConf.last_synctime = ''
         # disable, and announce the restore only when one was requested
         self.restoreDB = False
         tmpLog.info("It may take several minutes to restore local repository ...")
     if bookConf.last_synctime == '':
         startTime = datetime.datetime.utcnow() - datetime.timedelta(days=180)
         bookConf.last_synctime = startTime.strftime('%Y-%m-%d %H:%M:%S')
     # Fetch repeatedly, passing the highest task ID seen so far as
     # minTaskID, until the server returns no tasks.
     maxTaskID = None
     while True:
         status, jediTaskDicts = Client.getJobIDsJediTasksInTimeRange(bookConf.last_synctime,
                                                                      minTaskID=maxTaskID,
                                                                      verbose=self.verbose)
         if status != 0:
             tmpLog.error("Failed to get tasks from panda server")
             return
         if not jediTaskDicts:
             break
         tmpLog.info("Got %s tasks to be updated" % len(jediTaskDicts))
         # insert if missing
         for remoteJobID, jediTask in jediTaskDicts.items():
             taskID = jediTask['jediTaskID']
             # get max
             if maxTaskID is None or taskID > maxTaskID:
                 maxTaskID = taskID
             # check local status
             isLocal = remoteJobID in localJobIDs
             job = None
             if isLocal:
                 # get job info from local repository
                 job = PdbUtils.readJobDB(remoteJobID, self.verbose)
                 # skip if frozen
                 if job.dbStatus == 'frozen':
                     continue
             tmpLog.info("Updating taskID=%s ..." % taskID)
             # convert JEDI task
             localJob = PdbUtils.convertJTtoD(jediTask, job)
             # update database; 'except Exception' lets KeyboardInterrupt
             # and SystemExit propagate instead of being reported as DB errors
             if not isLocal:
                 # insert to DB
                 try:
                     PdbUtils.insertJobDB(localJob,self.verbose)
                 except Exception:
                     tmpLog.error("Failed to insert taskID=%s to local repository" % taskID)
                     return
             else:
                 # update
                 try:
                     PdbUtils.updateJobDB(localJob,self.verbose,syncTimeRaw)
                 except Exception:
                     tmpLog.error("Failed to update local repository for taskID=%s" % taskID)
                     return
     # update sync time (config is re-read so only last_synctime changes)
     bookConf = BookConfig.getConfig()
     bookConf.last_synctime = syncTime
     BookConfig.updateConfig(bookConf)
     self.updateTaskJobsetMap()
     tmpLog.info("Synchronization Completed")
Exemplo n.º 9
0
    def sync(self):
        '''Synchronize the local job repository with jobs from the panda server.

        JobIDs are requested from the server starting at the stored
        last_synctime (180 days back when none is stored or when
        self.restoreDB is set).  Jobs missing locally are inserted, with
        their retryID set when provenanceID is non-zero; jobs already
        present are updated unless their local record is 'frozen'.

        On any failure an error is logged and the method returns early,
        leaving last_synctime untouched.  Returns None in all cases.
        '''
        # get logger
        tmpLog = PLogger.getPandaLogger()
        tmpLog.info("Synchronizing local repository ...")
        # check proxy
        self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy(
            self.gridPassPhrase, False, self.verbose)
        # get JobIDs in local repository
        localJobIDs = PdbUtils.getListOfJobIDs()
        # timestamp taken before querying; stored as last_synctime on success
        syncTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        # set sync time for the first attempt
        bookConf = BookConfig.getConfig()
        if self.restoreDB:
            # reset last_synctime to restore database
            bookConf.last_synctime = ''
            # disable
            self.restoreDB = False
            tmpLog.info(
                "It may take several minutes to restore local repository ...")
        if bookConf.last_synctime == '':
            startTime = datetime.datetime.utcnow() - datetime.timedelta(days=180)
            bookConf.last_synctime = startTime.strftime('%Y-%m-%d %H:%M:%S')
        # get recent JobIDs from panda server
        status, remoteJobIDs = Client.getJobIDsInTimeRange(
            bookConf.last_synctime, verbose=self.verbose)
        if status != 0:
            tmpLog.error("Failed to get JobIDs from panda server")
            return
        tmpLog.info("Got %s jobs to be updated" % len(remoteJobIDs))
        # insert if missing
        for remoteJobID in remoteJobIDs:
            # check local status
            isLocal = remoteJobID in localJobIDs
            job = None
            if isLocal:
                # get job info from local repository
                job = PdbUtils.readJobDB(remoteJobID, self.verbose)
                # skip if frozen
                if job.dbStatus == 'frozen':
                    continue
            tmpLog.info("Updating JobID=%s ..." % remoteJobID)
            # get PandaIDs
            status, pandaIDstatus = Client.getPandIDsWithJobID(
                remoteJobID, verbose=self.verbose)
            if status != 0:
                tmpLog.error("Failed to get PandaIDs for %s" % remoteJobID)
                return
            # sorted() yields a list on both Python 2 and 3, unlike keys().sort()
            # NOTE(review): an empty pandaIDstatus would raise IndexError below
            # -- confirm the server never returns one.
            pandaIDs = sorted(pandaIDstatus.keys())
            # get full JobSpec
            pandaJobs = []
            pandaFileInfo = {}
            pandaJobForSiteID = None
            if job is None:
                # only the first and last PandaIDs are fetched in full
                tmpIDs = [pandaIDs[0], pandaIDs[-1]]
                status, pandaJobs = Client.getFullJobStatus(
                    tmpIDs, verbose=self.verbose)
                if status != 0:
                    tmpLog.error("Failed to get PandaJobs for %s" %
                                 remoteJobID)
                    return
                # get slimmed file info
                status, pandaFileInfo = Client.getSlimmedFileInfoPandaIDs(
                    pandaIDs, verbose=self.verbose)
                if status != 0:
                    tmpLog.error("Failed to get file info  for %s" %
                                 remoteJobID)
                    return
            else:
                # get one job to set computingSite which may have changed due to rebrokerage
                status, tmpPandaJobs = Client.getFullJobStatus(
                    [pandaIDs[0]], verbose=self.verbose)
                if status != 0:
                    tmpLog.error("Failed to get PandaJobs for %s" %
                                 remoteJobID)
                    return
                pandaJobForSiteID = tmpPandaJobs[0]
            # convert to local job spec
            localJob = PdbUtils.convertPtoD(pandaJobs, pandaIDstatus, job,
                                            pandaFileInfo, pandaJobForSiteID)
            # update database; 'except Exception' lets KeyboardInterrupt
            # and SystemExit propagate instead of being reported as DB errors
            if not isLocal:
                # insert to DB
                try:
                    PdbUtils.insertJobDB(localJob, self.verbose)
                except Exception:
                    tmpLog.error(
                        "Failed to insert JobID=%s to local repository" %
                        remoteJobID)
                    return
                # set retryID
                if localJob.provenanceID not in [0, '0']:
                    try:
                        PdbUtils.setRetryID(localJob, self.verbose)
                    except Exception:
                        tmpLog.error(
                            "Failed to set retryID for JobID=%s in local repository"
                            % remoteJobID)
                        return
            else:
                # update
                try:
                    PdbUtils.updateJobDB(localJob, self.verbose)
                except Exception:
                    tmpLog.error(
                        "Failed to update local repository for JobID=%s" %
                        remoteJobID)
                    return
        # update sync time (config is re-read so only last_synctime changes)
        bookConf = BookConfig.getConfig()
        bookConf.last_synctime = syncTime
        BookConfig.updateConfig(bookConf)
        tmpLog.info("Synchronization Completed")