Пример #1
0
    def execute(self):

        # Hold output jobs
        self.failedJobs = []
        self.successfulJobs = []

        # Run all commands to submit new jobsets to the grid.
        # Retrieve the JobIDs and stats of all newly created jobs.
        print "About to submit requested jobs"
        self.currentJobs = {}
        for item in self.commandList:
            newjobset = GridJobset(item)
            newjobset.submit()
            createdJobs = newjobset.JobIDs
            # Register newly existing JobIDS
            self.pbookCore.sync()
            PdbUtils.getListOfJobIDs(True, False)
            for jobID in createdJobs:
                self.addJobToList(jobID)
        print "Submitted jobs", self.currentJobs.keys()

        # If specified jobs to pick up, get those:
        if len(self.additionalJobs) != 0:
            print "Adding specified jobs to list."
            for item in self.additionalJobs:
                self.addJobToList(item)

        # If we are supposed to include all running jobs, add those
        if self.syncToRunningJobs == True:
            print "Adding currently running jobs to list."
            # Param 'True' means only jobs not 'frozen' are kept
            runningJobs = PdbUtils.getListOfJobIDs(True, False)
            for item in runningJobs:
                self.addJobToList(item)

        print "Total list of jobs to monitor is now:", self.currentJobs.keys(
        ), "\n"

        # Run this until all jobs are complete.
        while len(self.currentJobs.keys()) > 0:

            # Synchronise pbook.
            self.pbookCore.sync()

            # Check each job and act accordingly.
            self.currentJobIDs = sorted(self.currentJobs.keys())
            for jobID in self.currentJobIDs:
                job = self.currentJobs[jobID]
                currentStatus = self.checkCurrentStatus(job)

                if currentStatus == 'stillRunning':
                    continue

                elif currentStatus == 'stuck':
                    self.unstick(job)

                elif currentStatus == 'failed':
                    if job.prunAttemptCount < self.pandaRetryLimit:
                        self.retryFailed(job)
                    else:
                        self.failedJobs.append[jobID]
                        del self.currentJobs[jobID]

                elif currentStatus == 'finished':
                    ## If running a test code which does not produce an
                    ## output dataset, the outDS will be blank.
                    if job.outDS == "":
                        del self.currentJobs[jobID]
                        self.successfulJobs.append(jobID)
                    ## dq2-get output.
                    else:
                        self.getOutput(job)

                else:
                    print "Error!"
                    self.currentJobs = {}
                    break

            # Wait required gap time, then go to next iteration.
            print "\n"
            time.sleep(self.downtime)

        print "All jobs finished."
        print "Successful jobs:", self.successfulJobs
        print "Failed jobs:", self.failedJobs

        sys.exit(0)
Пример #2
0
 def sync(self):
     """Synchronize the local job repository with JEDI tasks from the panda server.

     The grid proxy is checked and the Rucio account set, then
     ``Client.getJobIDsJediTasksInTimeRange`` is queried repeatedly, starting
     from ``bookConf.last_synctime`` and passing the largest task ID seen so
     far as ``minTaskID``, until it returns no tasks. Each returned task is
     converted with ``PdbUtils.convertJTtoD`` and inserted into, or updated
     in, the local repository; tasks whose local record is 'frozen' are
     skipped.

     When ``self.restoreDB`` is set, the stored sync time is cleared, so the
     query window starts 180 days back, and the flag is reset.

     Returns None. On any server or database failure an error is logged and
     the method returns early without recording a new sync time.
     """
     # get logger
     tmpLog = PLogger.getPandaLogger()
     tmpLog.info("Synchronizing local repository ...")
     # check proxy
     self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy(
             self.gridPassPhrase,
             False,
             self.verbose,
             useCache=True)
     # get nickname
     nickName = PsubUtils.getNickname()
     # set Rucio accounting
     PsubUtils.setRucioAccount(nickName, 'pbook', True)
     # get JobIDs in local repository
     localJobIDs = PdbUtils.getListOfJobIDs()
     # take the sync timestamp before querying the server
     syncTimeRaw = datetime.datetime.utcnow()
     syncTime = syncTimeRaw.strftime('%Y-%m-%d %H:%M:%S')
     # set sync time for the first attempt
     bookConf = BookConfig.getConfig()
     if self.restoreDB:
         # reset last_synctime to restore database
         bookConf.last_synctime = ''
         # disable, so that only this sync performs the restore
         self.restoreDB = False
         tmpLog.info("It may take several minutes to restore local repository ...")
     if bookConf.last_synctime == '':
         # no previous sync recorded: look back 180 days
         startTime = datetime.datetime.utcnow() - datetime.timedelta(days=180)
         bookConf.last_synctime = startTime.strftime('%Y-%m-%d %H:%M:%S')
     maxTaskID = None
     while True:
         status, jediTaskDicts = Client.getJobIDsJediTasksInTimeRange(bookConf.last_synctime,
                                                                      minTaskID=maxTaskID,
                                                                      verbose=self.verbose)
         if status != 0:
             tmpLog.error("Failed to get tasks from panda server")
             return
         if len(jediTaskDicts) == 0:
             break
         tmpLog.info("Got %s tasks to be updated" % len(jediTaskDicts))
         # insert if missing
         for remoteJobID, jediTask in jediTaskDicts.items():
             taskID = jediTask['jediTaskID']
             # track the largest task ID; it becomes minTaskID of the next query
             if maxTaskID is None or taskID > maxTaskID:
                 maxTaskID = taskID
             # check local status
             isLocal = remoteJobID in localJobIDs
             job = None
             if isLocal:
                 # get job info from local repository
                 job = PdbUtils.readJobDB(remoteJobID, self.verbose)
                 # skip if frozen
                 if job.dbStatus == 'frozen':
                     continue
             tmpLog.info("Updating taskID=%s ..." % taskID)
             # convert JEDI task
             localJob = PdbUtils.convertJTtoD(jediTask, job)
             # update database
             if not isLocal:
                 # insert to DB
                 try:
                     PdbUtils.insertJobDB(localJob, self.verbose)
                 except Exception:
                     # Exception (not a bare except) so Ctrl-C / SystemExit
                     # still propagate instead of being logged as a DB error
                     tmpLog.error("Failed to insert taskID=%s to local repository" % taskID)
                     return
             else:
                 # update
                 try:
                     PdbUtils.updateJobDB(localJob, self.verbose, syncTimeRaw)
                 except Exception:
                     tmpLog.error("Failed to update local repository for taskID=%s" % taskID)
                     return
     # update sync time
     bookConf = BookConfig.getConfig()
     bookConf.last_synctime = syncTime
     BookConfig.updateConfig(bookConf)
     self.updateTaskJobsetMap()
     tmpLog.info("Synchronization Completed")
Пример #3
0
    def sync(self):
        """Synchronize the local job repository with recent jobs on the panda server.

        The grid proxy is checked, then ``Client.getJobIDsInTimeRange`` is
        asked for the JobIDs changed since ``bookConf.last_synctime``. For
        each JobID that is not 'frozen' locally, its PandaIDs are fetched,
        the job is converted with ``PdbUtils.convertPtoD`` and then inserted
        into, or updated in, the local repository. A newly inserted job whose
        ``provenanceID`` is neither ``0`` nor ``'0'`` also gets its retry ID
        set.

        When ``self.restoreDB`` is set, the stored sync time is cleared, so
        the query window starts 180 days back, and the flag is reset.

        Returns None. On any server or database failure an error is logged
        and the method returns early without recording a new sync time.
        """
        # get logger
        tmpLog = PLogger.getPandaLogger()
        tmpLog.info("Synchronizing local repository ...")
        # check proxy
        self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy(
            self.gridPassPhrase, False, self.verbose)
        # get JobIDs in local repository
        localJobIDs = PdbUtils.getListOfJobIDs()
        # take the sync timestamp before querying the server
        syncTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        # set sync time for the first attempt
        bookConf = BookConfig.getConfig()
        if self.restoreDB:
            # reset last_synctime to restore database
            bookConf.last_synctime = ''
            # disable
            self.restoreDB = False
            tmpLog.info(
                "It may take several minutes to restore local repository ...")
        if bookConf.last_synctime == '':
            # no previous sync recorded: look back 180 days
            startTime = datetime.datetime.utcnow() - datetime.timedelta(
                days=180)
            bookConf.last_synctime = startTime.strftime('%Y-%m-%d %H:%M:%S')
        status, remoteJobIDs = Client.getJobIDsInTimeRange(
            bookConf.last_synctime, verbose=self.verbose)
        if status != 0:
            tmpLog.error("Failed to get JobIDs from panda server")
            return
        tmpLog.info("Got %s jobs to be updated" % len(remoteJobIDs))
        # insert if missing
        for remoteJobID in remoteJobIDs:
            # check local status
            isLocal = remoteJobID in localJobIDs
            job = None
            if isLocal:
                # get job info from local repository
                job = PdbUtils.readJobDB(remoteJobID, self.verbose)
                # skip if frozen
                if job.dbStatus == 'frozen':
                    continue
            tmpLog.info("Updating JobID=%s ..." % remoteJobID)
            # get PandaIDs
            status, pandaIDstatus = Client.getPandIDsWithJobID(
                remoteJobID, verbose=self.verbose)
            if status != 0:
                tmpLog.error("Failed to get PandaIDs for %s" % remoteJobID)
                return
            # sorted() instead of keys() + in-place sort, so this does not
            # depend on keys() returning a list
            pandaIDs = sorted(pandaIDstatus.keys())
            if not pandaIDs:
                # The first/last PandaID is indexed below; an empty result is
                # treated like any other failed lookup.
                tmpLog.error("Failed to get PandaIDs for %s" % remoteJobID)
                return
            # get full JobSpec
            pandaJobs = []
            pandaFileInfo = {}
            pandaJobForSiteID = None
            if job is None:
                # unknown locally: fetch the first and last PandaJobs
                tmpIDs = [pandaIDs[0], pandaIDs[-1]]
                status, pandaJobs = Client.getFullJobStatus(
                    tmpIDs, verbose=self.verbose)
                if status != 0:
                    tmpLog.error("Failed to get PandaJobs for %s" %
                                 remoteJobID)
                    return
                # get slimmed file info
                status, pandaFileInfo = Client.getSlimmedFileInfoPandaIDs(
                    pandaIDs, verbose=self.verbose)
                if status != 0:
                    tmpLog.error("Failed to get file info  for %s" %
                                 remoteJobID)
                    return
            else:
                # get one job to set computingSite which may have changed due to rebrokerage
                status, tmpPandaJobs = Client.getFullJobStatus(
                    [pandaIDs[0]], verbose=self.verbose)
                if status != 0:
                    tmpLog.error("Failed to get PandaJobs for %s" %
                                 remoteJobID)
                    return
                pandaJobForSiteID = tmpPandaJobs[0]
            # convert to local job spec
            localJob = PdbUtils.convertPtoD(pandaJobs, pandaIDstatus, job,
                                            pandaFileInfo, pandaJobForSiteID)
            # update database
            if not isLocal:
                # insert to DB
                try:
                    PdbUtils.insertJobDB(localJob, self.verbose)
                except Exception:
                    # Exception (not a bare except) so Ctrl-C / SystemExit
                    # still propagate instead of being logged as a DB error
                    tmpLog.error(
                        "Failed to insert JobID=%s to local repository" %
                        remoteJobID)
                    return
                # set retryID
                if localJob.provenanceID not in [0, '0']:
                    try:
                        PdbUtils.setRetryID(localJob, self.verbose)
                    except Exception:
                        tmpLog.error(
                            "Failed to set retryID for JobID=%s in local repository"
                            % remoteJobID)
                        return
            else:
                # update
                try:
                    PdbUtils.updateJobDB(localJob, self.verbose)
                except Exception:
                    tmpLog.error(
                        "Failed to update local repository for JobID=%s" %
                        remoteJobID)
                    return
        # update sync time
        bookConf = BookConfig.getConfig()
        bookConf.last_synctime = syncTime
        BookConfig.updateConfig(bookConf)
        tmpLog.info("Synchronization Completed")