class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """

    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        # DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        # Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set this up when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 case - kept only for clarity (this private variable shouldn't be used)
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a jobID figure out which packageCollection it should belong in.
        """
        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk once they contain
        self.packageSize jobs.  The flushJobPackages() method must be called
        after all jobs have been added to the cache and before they are
        actually submitted to make sure all the job packages have been
        written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(sandboxDir,
                                         'PackageCollection_%i' % collectionIndex,
                                         'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid,
                                                         'id': loadedJob['id'],
                                                         "package": JobPackage(directory=collectionDir)}

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any job packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple with five items:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sandbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        dbJobs = set()

        logging.info("Refreshing priority cache with currently %i jobs", len(self.cachedJobIDs))

        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
                self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0

            if self.useReqMgrForCompletionCheck:
                # if reqmgr is used (not Tier0 agent) get the aborted/forceCompleted records
                abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
            else:
                # T0Agent
                abortedAndForceCompleteRequests = []

            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            abortedAndForceCompleteRequests = []
            logging.info("Skipping cache update to be submitted. (%s job in cache)", len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            # check whether newJob belongs to an aborted or force-completed workflow, and skip it if it does
            if (newJob['request_name'] in abortedAndForceCompleteRequests) and \
                    (newJob['type'] not in ['LogCollect', "Cleanup"]):
                continue

            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s", pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = pickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for the job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [x for x in possibleLocations if x not in self.abortSites]
                if nonAbortSites:
                    # if there is at least one non-aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in self.ioboundTypes:
                nonDrainingSites = [x for x in possibleLocations if x not in self.drainSites]
                if nonDrainingSites:
                    # if there is at least one viable non-draining site, remove the draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            # locations clear of abort and draining sites
            newJob['possibleLocations'] = possibleLocations

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = self.taskTypePrioMap.get(newJob['type'], 0) + newJob['wf_priority']
            if jobPrio not in self.cachedJobs:
                self.cachedJobs[jobPrio] = {}

            # now add basic information keyed by the jobid
            self.cachedJobs[jobPrio][jobID] = newJob

            # allow job baggage to override numberOfCores
            #     => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {'id': jobID,
                       'requestName': newJob['request_name'],
                       'taskName': newJob['task_name'],
                       'taskType': newJob['type'],
                       'cache_dir': newJob["cache_dir"],
                       'priority': newJob['wf_priority'],
                       'taskID': newJob['task_id'],
                       'retry_count': newJob["retry_count"],
                       'taskPriority': None,  # update from the thresholds
                       'custom': {'location': None},  # update later
                       'packageDir': batchDir,
                       'sandbox': loadedJob["sandbox"],  # remove before submit
                       'userdn': loadedJob.get("ownerDN", None),
                       'usergroup': loadedJob.get("ownerGroup", ''),
                       'userrole': loadedJob.get("ownerRole", ''),
                       'possibleSites': frozenset(possibleLocations),  # abort and drain sites filtered out
                       'potentialSites': frozenset(potentialLocations),  # original list of sites
                       'scramArch': loadedJob.get("scramArch", None),
                       'swVersion': loadedJob.get("swVersion", None),
                       'name': loadedJob["name"],
                       'proxyPath': loadedJob.get("proxyPath", None),
                       'estimatedJobTime': loadedJob.get("estimatedJobTime", None),
                       'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None),
                       'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None),
                       'numberOfCores': loadedJob.get("numberOfCores", 1),  # may update it later
                       'inputDataset': loadedJob.get('inputDataset', None),
                       'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None),
                       'allowOpportunistic': loadedJob.get('allowOpportunistic', False)}

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug("The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \
                    (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        self.cachedJobIDs -= jobIDsToPurge

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.cachedJobs:
                if self.cachedJobs[jobPrio].pop(jobid, None):
                    # then the jobid was found, go to the next one
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        _handleSubmitFailedJobs_

        Create a default job report for the exitCode, register it in the job
        and preserve it on disk as well.  Propagate the failure to the
        JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                     WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleLocations']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                         WM_JOB_ERROR_CODES[exitCode] +
                                         ": file locations: " + ', '.join(job['fileLocations']) +
                                         ": site white list: " + ', '.join(job['siteWhitelist']) +
                                         ": site black list: " + ', '.join(job['siteBlacklist']))
                else:
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode])

            fwjrPath = os.path.join(job['cache_dir'], 'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error("Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve submit thresholds, which consider what is pending and running
        for those sites.  Also update the list of draining and aborted/down
        sites.  Finally, create a map between task type and its priority.
        """
        self.taskTypePrioMap = {}
        newDrainSites = set()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            # then update the task type x task priority mapping
            if not self.taskTypePrioMap:
                for task, value in rcThresholds[siteName]['thresholds'].items():
                    self.taskTypePrioMap[task] = value.get('priority', 0) * self.maxTaskPriority

        # When the list of drain/abort sites changes between iterations, a location
        # refresh is needed; for now it forces a full cache refresh
        if newDrainSites != self.drainSites or newAbortSites != self.abortSites:
            logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.")
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return

    def _getJobSubmitCondition(self, jobPrio, siteName, jobType):
        """
        Returns a string describing whether a job is ready to be submitted,
        or the reason it can't be submitted.

        Only jobs with the "JobSubmitReady" return value will be added to the
        submit list.  Other return values indicate the reason jobs cannot be
        submitted, e.g. "NoPendingSlot" - the pending slots are already filled
        with pending jobs.
        """
        try:
            totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"]
            totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"]
            totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"]
            totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"]

            taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"]
            taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"]
            taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"]
            taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"]
            highestPriorityInJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]['wf_highest_priority']

            # set the initial totalPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName].setdefault("init_total_pending_jobs", totalPendingJobs)

            # set the initial taskPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName]['thresholds'][jobType].setdefault("init_task_pending_jobs",
                                                                                 taskPendingJobs)

            initTotalPending = self.currentRcThresholds[siteName]["init_total_pending_jobs"]
            initTaskPending = self.currentRcThresholds[siteName]['thresholds'][jobType]["init_task_pending_jobs"]

        except KeyError as ex:
            msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType)
            logging.exception(msg)
            return "NoJobType_%s_%s" % (siteName, jobType)

        if (highestPriorityInJobs is None) or (jobPrio <= highestPriorityInJobs) or (jobType in self.ioboundTypes):
            # there are no pending or running jobs in the system (None case), or
            # the priority of the job is lower or equal, so don't allow overflow.
            # Also if jobType is in ioboundTypes don't allow overflow
            totalPendingThreshold = totalPendingSlots
            taskPendingThreshold = taskPendingSlots
            totalJobThreshold = totalPendingSlots + totalRunningSlots
            totalTaskTheshold = taskPendingSlots + taskRunningSlots
        else:
            # In case the priority of the job is higher than any of the currently pending or running jobs,
            # increase the threshold by condorOverflowFraction * original pending slots.
            totalPendingThreshold = max(totalPendingSlots, initTotalPending) + (
                totalPendingSlots * self.condorOverflowFraction)
            taskPendingThreshold = max(taskPendingSlots, initTaskPending) + (
                taskPendingSlots * self.condorOverflowFraction)
            totalJobThreshold = totalPendingThreshold + totalRunningSlots
            totalTaskTheshold = taskPendingThreshold + taskRunningSlots

        jobStats = [{"Condition": "NoPendingSlot",
                     "Current": totalPendingJobs,
                     "Threshold": totalPendingThreshold},
                    {"Condition": "NoTaskPendingSlot",
                     "Current": taskPendingJobs,
                     "Threshold": taskPendingThreshold},
                    {"Condition": "NoRunningSlot",
                     "Current": totalPendingJobs + totalRunningJobs,
                     "Threshold": totalJobThreshold},
                    {"Condition": "NoTaskRunningSlot",
                     "Current": taskPendingJobs + taskRunningJobs,
                     "Threshold": totalTaskTheshold}]
        return jobSubmitCondition(jobStats)

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull jobs out of the job cache
        as we discover open slots.  This returns a dictionary keyed by job
        package directory, where each value is the list of job dictionaries
        (including retry count, sandbox, cache directory and chosen site)
        ready to be handed to the submit plugin.
        """
        jobsToSubmit = {}
        jobsToUncache = []
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(Counter)
        jobSubmitLogByPriority = defaultdict(Counter)

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.cachedJobs, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # start eating through the elder jobs first
            for job in sorted(self.cachedJobs[jobPrio].values(), key=itemgetter('timestamp')):
                jobid = job['id']
                jobType = job['type']
                possibleSites = job['possibleLocations']
                jobSubmitLogByPriority[jobPrio]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    if siteName not in self.currentRcThresholds:
                        logging.warn("Have a job for %s which is not in the resource control", siteName)
                        continue

                    condition = self._getJobSubmitCondition(jobPrio, siteName, jobType)
                    if condition != "JobSubmitReady":
                        jobSubmitLogBySites[siteName][condition] += 1
                        logging.debug("Found a job for %s : %s", siteName, condition)
                        continue

                    # otherwise, update the site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName]["total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] += 1
                    jobsCount += 1

                    # load (and remove) the job dictionary object from jobDataCache
                    cachedJob = self.jobDataCache.pop(jobid)
                    jobsToUncache.append((jobPrio, jobid))

                    # Sort jobs by jobPackage
                    package = cachedJob['packageDir']
                    if package not in jobsToSubmit.keys():
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob.pop('sandbox')

                    # Now update the job dictionary object
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['taskPriority'] = self.currentRcThresholds[siteName]['thresholds'][jobType]["priority"]

                    # Get this job in place to be submitted by the plugin
                    jobsToSubmit[package].append(cachedJob)
                    jobSubmitLogBySites[siteName]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio]['submitted'] += 1
                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsThisCycle:
                    logging.info("Submitter reached limit of submit slots for this cycle: %i", self.maxJobsThisCycle)
                    exitLoop = True
                    break

        # jobs that are going to be submitted must be removed from all caches
        for prio, jobid in jobsToUncache:
            self.cachedJobs[prio].pop(jobid)
            self.cachedJobIDs.remove(jobid)

        logging.info("Site submission report: %s", dict(jobSubmitLogBySites))
        logging.info("Priority submission report: %s", dict(jobSubmitLogByPriority))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """
        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():
            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({'jobid': job['id'], 'location': job['custom']['location']})

            # Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn, transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """
        if jobSite not in self.locationDict.keys():
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
          1) Refresh the cache
          2) Find jobs for all the necessary sites
          3) Submit the jobs to the plugin
        """
        myThread = threading.currentThread()

        if self.useReqMgrForCompletionCheck:
            # only runs when reqmgr is used (not Tier0)
            self.removeAbortedForceCompletedWorkflowFromCache()
            agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
            self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75)
            self.condorOverflowFraction = agentConfig.get("CondorOverflowFraction", 0.2)
        else:
            # For Tier0 agent
            self.condorFraction = 1
            self.condorOverflowFraction = 0

        if not self.passSubmitConditions():
            msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle."
            logging.warning(msg)
            myThread.logdbClient.post("JobSubmitter_submitWork", msg, "warning")
            return

        try:
            myThread.logdbClient.delete("JobSubmitter_submitWork", "warning", this_thread=True)
            self.getThresholds()
            self.refreshCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            # msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def passSubmitConditions(self):
        """
        _passSubmitConditions_

        Check whether the component is allowed to submit jobs to condor.

        Initially it has only one condition, which is the total number of jobs
        we can have in condor (pending + running) per schedd, set by
        MAX_JOBS_PER_OWNER.
        """
        myThread = threading.currentThread()
        freeSubmitSlots = availableScheddSlots(dbi=myThread.dbi, logger=logging,
                                               condorFraction=self.condorFraction)
        self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll)

        return (self.maxJobsThisCycle > 0)

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
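

# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the component: the helper below
# condenses the pending-slot overflow arithmetic used by
# JobSubmitterPoller._getJobSubmitCondition into a standalone function so the
# threshold behaviour can be checked in isolation.  The function name, its
# plain-integer signature and the doctest values are assumptions made for
# this example; the poller itself works on the nested resource-control
# dictionaries handled above.
def _examplePendingThreshold(pendingSlots, initPendingJobs, allowOverflow,
                             overflowFraction=0.2):
    """
    Compute an effective pending threshold the way _getJobSubmitCondition does.

    Without overflow the threshold is simply the configured pending slots.
    With overflow (job priority higher than anything pending/running and a
    non IO-bound task type) it grows by overflowFraction of the slots on top
    of whichever is larger: the configured slots or the pending jobs counted
    at the start of the cycle.

    >>> _examplePendingThreshold(100, 40, allowOverflow=False)
    100
    >>> _examplePendingThreshold(100, 40, allowOverflow=True)
    120.0
    >>> _examplePendingThreshold(100, 150, allowOverflow=True)
    170.0
    """
    if not allowOverflow:
        return pendingSlots
    return max(pendingSlots, initPendingJobs) + pendingSlots * overflowFraction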


class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """

    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                    database=config.ACDC.database)

        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

        return

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion
        """
        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        # Do not build ACDC for utilitarian job types
        jobList = [job for job in jobList if job['type'] not in ['LogCollect', 'Cleanup']]

        self.handleACDC(jobList)
        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed", len(jobList), state)
        retrydoneJobs = []
        cooloffJobs = []
        passJobs = []

        # Categorise jobs by whether they are below the allowed retry count
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                logging.debug("JobInfo: %s", job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs, 'retrydone',
                                       '%sfailed' % state, updatesummary=True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs, '%scooloff' % state,
                                       '%sfailed' % state, updatesummary=True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs", len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs and determine those that can be
        retried and which must be retried without going through cooloff.
        Returns a triplet with cooloff, passed and exhausted jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []

        if self.reqAuxDB:
            self.exitCodesNoRetry = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName).get("NoRetryExitCodes", [])

        for job in jobList:
            report = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff.", job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error("Failed to find FWJR for job %i in location %s.\n Passing it to cooloff.",
                              job['id'], reportPath)
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                # correct the location if the original location is different from the one recorded in wmbs
                # WARNING: we are not updating the job location in wmbs, only updating it in couchdb by doing this.
                # If the location in wmbs needs to be updated, it should happen in JobAccountant.
                locationFromFWJR = report.getSiteName()
                if locationFromFWJR:
                    job["location"] = locationFromFWJR
                    job["site_cms_name"] = locationFromFWJR

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i", job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.exitCodesNoRetry]):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    logging.error(msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.passCodes]):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'],
                                                                                     str(report.getExitCodes()))
                    logging.debug(msg)
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                logging.warning("Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs

    def handleRetryDoneJobs(self, jobList):
        """
        _handleRetryDoneJobs_

        """
        myThread = threading.currentThread()
        logging.info("About to process %d retry done jobs", len(jobList))
        myThread.transaction.begin()
        self.exhaustJobs(jobList)
        myThread.transaction.commit()

        return

    def handleFailedJobs(self, jobList, state):
        """
        _handleFailedJobs_

        """
        myThread = threading.currentThread()
        logging.info("About to process %d failures", len(jobList))
        myThread.transaction.begin()
        self.processRetries(jobList, state)
        myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Pull failed jobs out of WMBS in chunks and process each failure type:
        jobs that failed at create, submit or execution time are sent through
        the retry logic, and jobs that have used up all of their retries are
        exhausted.
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state="%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed", len(idList), state)
            while len(idList) > 0:
                tmpList = idList[:self.maxProcessSize]
                idList = idList[self.maxProcessSize:]
                jobList = self.loadJobsFromList(tmpList)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state='retrydone')
        logging.info("Found %d jobs done with all retries", len(idList))
        while len(idList) > 0:
            tmpList = idList[:self.maxProcessSize]
            idList = idList[self.maxProcessSize:]
            jobList = self.loadJobsFromList(tmpList)
            self.handleRetryDoneJobs(jobList)

        return

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """
        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})
        results = self.idLoad.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def loadJobsFromListFull(self, idList):
        """
        _loadJobsFromListFull_

        Load jobs in bulk.  Include the full metadata.
        """
        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})

        results = self.loadAction.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    @timeFunction
    def algorithm(self, parameters=None):
        """
        Run the handleErrors method, looking for each type of failure
        and dealing with it as desired.
        """
        logging.debug("Running error handling algorithm")
        myThread = threading.currentThread()
        try:
            self.handleErrors()
        except (CouchConnectionError, HTTPException) as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught CouchConnectionError/HTTPException exception in ErrorHandler. "
            msg += "Transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.error(msg)
        except Exception as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught unexpected exception in ErrorHandler:\n"
            msg += str(ex)
            logging.exception(msg)
            raise ErrorHandlerException(msg)
class JobSubmitterPoller(BaseWorkerThread): """ _JobSubmitterPoller_ The jobSubmitterPoller takes the jobs and organizes them into packages before sending them to the individual plugin submitters. """ def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() self.config = config #DAO factory for WMBS objects self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) #Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=self.config) self.hostName = self.config.Agent.hostName self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000)) self.maxJobsThisCycle = self.maxJobsPerPoll # changes as per schedd limit self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000)) self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20)) self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7) self.condorFraction = 0.75 # update during every algorithm cycle self.condorOverflowFraction = 0.2 self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting') # Additions for caching-based JobSubmitter self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.jobsToPackage = {} self.sandboxPackage = {} self.locationDict = {} self.taskTypePrioMap = {} self.drainSites = set() self.abortSites = set() self.refreshPollingCount = 0 try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except OSError as ex: msg = "Error while trying to create packageDir %s\n!" msg += str(ex) logging.error(msg) logging.debug("PackageDir: %s", self.packageDir) logging.debug("Config: %s", config) raise JobSubmitterPollerException(msg) # Now the DAOs self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter") self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation") self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo") self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath") self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter") # Keep a record of the thresholds in memory self.currentRcThresholds = {} self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True) if self.useReqMgrForCompletionCheck: # only set up this when reqmgr is used (not Tier0) self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL) self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache() self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) else: # Tier0 Case - just for the clarity (This private variable shouldn't be used self.abortedAndForceCompleteWorkflowCache = None return def getPackageCollection(self, sandboxDir): """ _getPackageCollection_ Given a jobID figure out which packageCollection it should belong in. 
""" rawList = os.listdir(sandboxDir) collections = [] numberList = [] for entry in rawList: if 'PackageCollection' in entry: collections.append(entry) # If we have no collections, return 0 (PackageCollection_0) if len(collections) < 1: return 0 # Loop over the list of PackageCollections for collection in collections: collectionPath = os.path.join(sandboxDir, collection) packageList = os.listdir(collectionPath) collectionNum = int(collection.split('_')[1]) if len(packageList) < self.collSize: return collectionNum else: numberList.append(collectionNum) # If we got here, then all collections are full. We'll need # a new one. Find the highest number, increment by one numberList.sort() return numberList[-1] + 1 def addJobsToPackage(self, loadedJob): """ _addJobsToPackage_ Add a job to a job package and then return the batch ID for the job. Packages are only written out to disk when they contain 100 jobs. The flushJobsPackages() method must be called after all jobs have been added to the cache and before they are actually submitted to make sure all the job packages have been written to disk. """ if loadedJob["workflow"] not in self.jobsToPackage: # First, let's pull all the information from the loadedJob batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"]) sandboxDir = os.path.dirname(loadedJob["sandbox"]) # Second, assemble the jobPackage location collectionIndex = self.getPackageCollection(sandboxDir) collectionDir = os.path.join(sandboxDir, 'PackageCollection_%i' % collectionIndex, 'batch_%s' % batchid) # Now create the package object self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid, 'id': loadedJob['id'], "package": JobPackage(directory=collectionDir)} jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"] jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob() batchDir = jobPackage['directory'] if len(jobPackage.keys()) == self.packageSize: if not os.path.exists(batchDir): os.makedirs(batchDir) batchPath = os.path.join(batchDir, "JobPackage.pkl") jobPackage.save(batchPath) del self.jobsToPackage[loadedJob["workflow"]] return batchDir def flushJobPackages(self): """ _flushJobPackages_ Write any jobs packages to disk that haven't been written out already. """ workflowNames = self.jobsToPackage.keys() for workflowName in workflowNames: jobPackage = self.jobsToPackage[workflowName]["package"] batchDir = jobPackage['directory'] if not os.path.exists(batchDir): os.makedirs(batchDir) batchPath = os.path.join(batchDir, "JobPackage.pkl") jobPackage.save(batchPath) del self.jobsToPackage[workflowName] return def refreshCache(self): """ _refreshCache_ Query WMBS for all jobs in the 'created' state. For all jobs returned from the query, check if they already exist in the cache. If they don't, unpickle them and combine their site white and black list with the list of locations they can run at. Add them to the cache. 
Each entry in the cache is a tuple with five items: - WMBS Job ID - Retry count - Batch ID - Path to sanbox - Path to cache directory """ badJobs = dict([(x, []) for x in range(71101, 71105)]) dbJobs = set() logging.info("Refreshing priority cache with currently %i jobs", len(self.cachedJobIDs)) if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \ self.refreshPollingCount >= self.skipRefreshCount: newJobs = self.listJobsAction.execute() self.refreshPollingCount = 0 if self.useReqMgrForCompletionCheck: # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData() else: #T0Agent abortedAndForceCompleteRequests = [] logging.info("Found %s new jobs to be submitted.", len(newJobs)) else: self.refreshPollingCount += 1 newJobs = [] dbJobs = self.cachedJobIDs abortedAndForceCompleteRequests = [] logging.info("Skipping cache update to be submitted. (%s job in cache)", len(dbJobs)) logging.info("Determining possible sites for new jobs...") jobCount = 0 for newJob in newJobs: # whether newJob belongs to aborted or force-complete workflow, and skip it if it is. if (newJob['request_name'] in abortedAndForceCompleteRequests) and \ (newJob['type'] not in ['LogCollect', "Cleanup"]): continue jobID = newJob['id'] dbJobs.add(jobID) if jobID in self.cachedJobIDs: continue jobCount += 1 if jobCount % 5000 == 0: logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs)) pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl") if not os.path.isfile(pickledJobPath): # Then we have a problem - there's no file logging.error("Could not find pickled jobObject %s", pickledJobPath) badJobs[71103].append(newJob) continue try: jobHandle = open(pickledJobPath, "r") loadedJob = pickle.load(jobHandle) jobHandle.close() except Exception as ex: msg = "Error while loading pickled job object %s\n" % pickledJobPath msg += str(ex) logging.error(msg) raise JobSubmitterPollerException(msg) loadedJob['retry_count'] = newJob['retry_count'] # figure out possible locations for job possibleLocations = loadedJob["possiblePSN"] # Create another set of locations that may change when a site goes white/black listed # Does not care about the non_draining or aborted sites, they may change and that is the point potentialLocations = set() potentialLocations.update(possibleLocations) # now check for sites in drain and adjust the possible locations # also check if there is at least one site left to run the job if len(possibleLocations) == 0: newJob['name'] = loadedJob['name'] newJob['fileLocations'] = loadedJob.get('fileLocations', []) newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', []) newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', []) badJobs[71101].append(newJob) continue else: nonAbortSites = [x for x in possibleLocations if x not in self.abortSites] if nonAbortSites: # if there is at least a non aborted/down site then run there, otherwise fail the job possibleLocations = nonAbortSites else: newJob['name'] = loadedJob['name'] newJob['possibleLocations'] = possibleLocations badJobs[71102].append(newJob) continue # try to remove draining sites if possible, this is needed to stop # jobs that could run anywhere blocking draining sites # if the job type is Merge, LogCollect or Cleanup this is skipped if newJob['type'] not in self.ioboundTypes: nonDrainingSites = [x for x in possibleLocations if x not in self.drainSites] if nonDrainingSites: # if >1 viable non-draining site remove 
draining ones possibleLocations = nonDrainingSites else: newJob['name'] = loadedJob['name'] newJob['possibleLocations'] = possibleLocations badJobs[71104].append(newJob) continue # locations clear of abort and draining sites newJob['possibleLocations'] = possibleLocations batchDir = self.addJobsToPackage(loadedJob) self.cachedJobIDs.add(jobID) # calculate the final job priority such that we can order cached jobs by prio jobPrio = self.taskTypePrioMap.get(newJob['type'], 0) + newJob['wf_priority'] if jobPrio not in self.cachedJobs: self.cachedJobs[jobPrio] = {} # now add basic information keyed by the jobid self.cachedJobs[jobPrio][jobID] = newJob # allow job baggage to override numberOfCores # => used for repacking to get more slots/disk numberOfCores = loadedJob.get('numberOfCores', 1) if numberOfCores == 1: baggage = loadedJob.getBaggage() numberOfCores = getattr(baggage, "numberOfCores", 1) loadedJob['numberOfCores'] = numberOfCores # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob) jobInfo = {'id': jobID, 'requestName': newJob['request_name'], 'taskName': newJob['task_name'], 'taskType': newJob['type'], 'cache_dir': newJob["cache_dir"], 'priority': newJob['wf_priority'], 'taskID': newJob['task_id'], 'retry_count': newJob["retry_count"], 'taskPriority': None, # update from the thresholds 'custom': {'location': None}, # update later 'packageDir': batchDir, 'sandbox': loadedJob["sandbox"], # remove before submit 'userdn': loadedJob.get("ownerDN", None), 'usergroup': loadedJob.get("ownerGroup", ''), 'userrole': loadedJob.get("ownerRole", ''), 'possibleSites': frozenset(possibleLocations), # abort and drain sites filtered out 'potentialSites': frozenset(potentialLocations), # original list of sites 'scramArch': loadedJob.get("scramArch", None), 'swVersion': loadedJob.get("swVersion", None), 'name': loadedJob["name"], 'proxyPath': loadedJob.get("proxyPath", None), 'estimatedJobTime': loadedJob.get("estimatedJobTime", None), 'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None), 'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None), 'numberOfCores': loadedJob.get("numberOfCores", 1), # may update it later 'inputDataset': loadedJob.get('inputDataset', None), 'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None), 'allowOpportunistic': loadedJob.get('allowOpportunistic', False)} self.jobDataCache[jobID] = jobInfo # Register failures in submission for errorCode in badJobs: if badJobs[errorCode]: logging.debug("The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode) self._handleSubmitFailedJobs(badJobs[errorCode], errorCode) # If there are any leftover jobs, we want to get rid of them. self.flushJobPackages() # We need to remove any jobs from the cache that were not returned in # the last call to the database. 
jobIDsToPurge = self.cachedJobIDs - dbJobs self._purgeJobsFromCache(jobIDsToPurge) logging.info("Done pruning killed jobs, moving on to submit.") return def removeAbortedForceCompletedWorkflowFromCache(self): abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData() jobIDsToPurge = set() for jobID, jobInfo in self.jobDataCache.iteritems(): if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \ (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]): jobIDsToPurge.add(jobID) self._purgeJobsFromCache(jobIDsToPurge) return def _purgeJobsFromCache(self, jobIDsToPurge): if len(jobIDsToPurge) == 0: return self.cachedJobIDs -= jobIDsToPurge for jobid in jobIDsToPurge: self.jobDataCache.pop(jobid, None) for jobPrio in self.cachedJobs: if self.cachedJobs[jobPrio].pop(jobid, None): # then the jobid was found, go to the next one break return def _handleSubmitFailedJobs(self, badJobs, exitCode): """ __handleSubmitFailedJobs_ For a default job report for the exitCode and register in the job. Preserve it on disk as well. Propagate the failure to the JobStateMachine. """ fwjrBinds = [] for job in badJobs: job['couch_record'] = None job['fwjr'] = Report() if exitCode in [71102, 71104]: job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleLocations'])) elif exitCode in [71101]: # there is no possible site if job.get("fileLocations"): job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ": file locations: " + ', '.join(job['fileLocations']) + ": site white list: " + ', '.join(job['siteWhitelist']) + ": site black list: " + ', '.join(job['siteBlacklist'])) else: job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]) fwjrPath = os.path.join(job['cache_dir'], 'Report.%d.pkl' % int(job['retry_count'])) job['fwjr'].setJobID(job['id']) try: job['fwjr'].save(fwjrPath) fwjrBinds.append({"jobid" : job["id"], "fwjrpath" : fwjrPath}) except IOError as ioer: logging.error("Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer)) self.changeState.propagate(badJobs, "submitfailed", "created") self.setFWJRPathAction.execute(binds=fwjrBinds) return def getThresholds(self): """ _getThresholds_ Retrieve submit thresholds, which considers what is pending and running for those sites. Also update the list of draining and abort/down sites. Finally, creates a map between task type and its priority. 
""" self.taskTypePrioMap = {} newDrainSites = set() newAbortSites = set() rcThresholds = self.resourceControl.listThresholdsForSubmit() for siteName in rcThresholds.keys(): # Add threshold if we don't have it already state = rcThresholds[siteName]["state"] if state == "Draining": newDrainSites.add(siteName) if state in ["Down", "Aborted"]: newAbortSites.add(siteName) # then update the task type x task priority mapping if not self.taskTypePrioMap: for task, value in rcThresholds[siteName]['thresholds'].items(): self.taskTypePrioMap[task] = value.get('priority', 0) * self.maxTaskPriority # When the list of drain/abort sites change between iteration then a location # refresh is needed, for now it forces a full cache refresh if newDrainSites != self.drainSites or newAbortSites != self.abortSites: logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.") self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.currentRcThresholds = rcThresholds self.abortSites = newAbortSites self.drainSites = newDrainSites return def _getJobSubmitCondition(self, jobPrio, siteName, jobType): """ returns the string describing whether a job is ready to be submitted or the reason can't be submitted Only jobs with "JobSubmitReady" return value will be added to submit job. Other return values will indicate the reason jobs cannot be submitted. i.e. "NoPendingSlot" - pending slot is full with pending job """ try: totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"] totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"] totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"] totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"] taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"] taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"] taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"] highestPriorityInJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]['wf_highest_priority'] # set the initial totalPendingJobs since it increases in every cycle when a job is submitted self.currentRcThresholds[siteName].setdefault("init_total_pending_jobs", totalPendingJobs) # set the initial taskPendingJobs since it increases in every cycle when a job is submitted self.currentRcThresholds[siteName]['thresholds'][jobType].setdefault("init_task_pending_jobs", taskPendingJobs) initTotalPending = self.currentRcThresholds[siteName]["init_total_pending_jobs"] initTaskPending = self.currentRcThresholds[siteName]['thresholds'][jobType]["init_task_pending_jobs"] except KeyError as ex: msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType) logging.exception(msg) return "NoJobType_%s_%s" % (siteName, jobType) if (highestPriorityInJobs is None) or (jobPrio <= highestPriorityInJobs) or (jobType in self.ioboundTypes): # there is no pending or running jobs in the system (None case) or # priority of the job is lower or equal don't allow overflow # Also if jobType is in ioboundTypes don't allow overflow totalPendingThreshold = totalPendingSlots taskPendingThreshold = taskPendingSlots totalJobThreshold = totalPendingSlots + totalRunningSlots totalTaskTheshold = taskPendingSlots + taskRunningSlots else: # In case the priority of the job is higher than any of currently pending or 
running jobs. # Then increase the threshold by condorOverflowFraction * original pending slot. totalPendingThreshold = max(totalPendingSlots, initTotalPending) + ( totalPendingSlots * self.condorOverflowFraction) taskPendingThreshold = max(taskPendingSlots, initTaskPending) + ( taskPendingSlots * self.condorOverflowFraction) totalJobThreshold = totalPendingThreshold + totalRunningSlots totalTaskTheshold = taskPendingThreshold + taskRunningSlots jobStats = [{"Condition": "NoPendingSlot", "Current": totalPendingJobs, "Threshold": totalPendingThreshold}, {"Condition": "NoTaskPendingSlot", "Current": taskPendingJobs, "Threshold": taskPendingThreshold}, {"Condition": "NoRunningSlot", "Current": totalPendingJobs + totalRunningJobs, "Threshold": totalJobThreshold}, {"Condition": "NoTaskRunningSlot", "Current": taskPendingJobs + taskRunningJobs, "Threshold": totalTaskTheshold}] return jobSubmitCondition(jobStats) def assignJobLocations(self): """ _assignJobLocations_ Loop through the submit thresholds and pull sites out of the job cache as we discover open slots. This will return a list of tuple where each tuple will have six elements: - WMBS Job ID - Retry count - Batch ID - Path to sanbox - Path to cache directory - SE name of the site to run at """ jobsToSubmit = {} jobsToUncache = [] jobsCount = 0 exitLoop = False jobSubmitLogBySites = defaultdict(Counter) jobSubmitLogByPriority = defaultdict(Counter) # iterate over jobs from the highest to the lowest prio for jobPrio in sorted(self.cachedJobs, reverse=True): # then we're completely done and have our basket full of jobs to submit if exitLoop: break # start eating through the elder jobs first for job in sorted(self.cachedJobs[jobPrio].values(), key=itemgetter('timestamp')): jobid = job['id'] jobType = job['type'] possibleSites = job['possibleLocations'] jobSubmitLogByPriority[jobPrio]['Total'] += 1 # now look for sites with free pending slots for siteName in possibleSites: if siteName not in self.currentRcThresholds: logging.warn("Have a job for %s which is not in the resource control", siteName) continue condition = self._getJobSubmitCondition(jobPrio, siteName, jobType) if condition != "JobSubmitReady": jobSubmitLogBySites[siteName][condition] += 1 logging.debug("Found a job for %s : %s", siteName, condition) continue # otherwise, update the site/task thresholds and the component job counter self.currentRcThresholds[siteName]["total_pending_jobs"] += 1 self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] += 1 jobsCount += 1 # load (and remove) the job dictionary object from jobDataCache cachedJob = self.jobDataCache.pop(jobid) jobsToUncache.append((jobPrio, jobid)) # Sort jobs by jobPackage package = cachedJob['packageDir'] if package not in jobsToSubmit.keys(): jobsToSubmit[package] = [] # Add the sandbox to a global list self.sandboxPackage[package] = cachedJob.pop('sandbox') # Now update the job dictionary object cachedJob['custom'] = {'location': siteName} cachedJob['taskPriority'] = self.currentRcThresholds[siteName]['thresholds'][jobType]["priority"] # Get this job in place to be submitted by the plugin jobsToSubmit[package].append(cachedJob) jobSubmitLogBySites[siteName]["submitted"] += 1 jobSubmitLogByPriority[jobPrio]['submitted'] += 1 # found a site to submit this job, so go to the next job break # set the flag and get out of the job iteration if jobsCount >= self.maxJobsThisCycle: logging.info("Submitter reached limit of submit slots for this cycle: %i", self.maxJobsThisCycle) exitLoop = True break # jobs 
that are going to be submitted must be removed from all caches for prio, jobid in jobsToUncache: self.cachedJobs[prio].pop(jobid) self.cachedJobIDs.remove(jobid) logging.info("Site submission report: %s", dict(jobSubmitLogBySites)) logging.info("Priority submission report: %s", dict(jobSubmitLogByPriority)) logging.info("Have %s packages to submit.", len(jobsToSubmit)) logging.info("Have %s jobs to submit.", jobsCount) logging.info("Done assigning site locations.") return jobsToSubmit def submitJobs(self, jobsToSubmit): """ _submitJobs_ Actually do the submission of the jobs """ jobList = [] idList = [] if len(jobsToSubmit) == 0: logging.debug("There are no packages to submit.") return for package in jobsToSubmit.keys(): sandbox = self.sandboxPackage[package] jobs = jobsToSubmit.get(package, []) for job in jobs: job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location']) job['sandbox'] = sandbox idList.append({'jobid': job['id'], 'location': job['custom']['location']}) #Clean out the package reference del self.sandboxPackage[package] jobList.extend(jobs) myThread = threading.currentThread() myThread.transaction.begin() # Run the actual underlying submit code using bossAir successList, failList = self.bossAir.submit(jobs=jobList) logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), len(failList)) # Propagate states in the WMBS database logging.debug("Propagating success state to WMBS.") self.changeState.propagate(successList, 'executing', 'created') logging.debug("Propagating fail state to WMBS.") self.changeState.propagate(failList, 'submitfailed', 'created') # At the end we mark the locations of the jobs # This applies even to failed jobs, since the location # could be part of the failure reason. logging.debug("Updating job location...") self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn, transaction=True) myThread.transaction.commit() logging.info("Transaction cycle successfully completed.") return def getSiteInfo(self, jobSite): """ _getSiteInfo_ This is how you get the name of a CE and the plugin for a job """ if not jobSite in self.locationDict.keys(): siteInfo = self.locationAction.execute(siteName=jobSite) self.locationDict[jobSite] = siteInfo[0] return (self.locationDict[jobSite].get('ce_name'), self.locationDict[jobSite].get('plugin'), self.locationDict[jobSite].get('cms_name')) @timeFunction def algorithm(self, parameters=None): """ _algorithm_ Try to, in order: 1) Refresh the cache 2) Find jobs for all the necessary sites 3) Submit the jobs to the plugin """ myThread = threading.currentThread() if self.useReqMgrForCompletionCheck: # only runs when reqmgr is used (not Tier0) self.removeAbortedForceCompletedWorkflowFromCache() agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName) self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75) self.condorOverflowFraction = agentConfig.get("CondorOverflowFraction", 0.2) else: # For Tier0 agent self.condorFraction = 1 self.condorOverflowFraction = 0 if not self.passSubmitConditions(): msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle." 
logging.warning(msg) myThread.logdbClient.post("JobSubmitter_submitWork", msg, "warning") return try: myThread.logdbClient.delete("JobSubmitter_submitWork", "warning", this_thread=True) self.getThresholds() self.refreshCache() jobsToSubmit = self.assignJobLocations() self.submitJobs(jobsToSubmit=jobsToSubmit) except WMException: if getattr(myThread, 'transaction', None) != None: myThread.transaction.rollback() raise except Exception as ex: msg = 'Fatal error in JobSubmitter:\n' msg += str(ex) #msg += str(traceback.format_exc()) msg += '\n\n' logging.error(msg) if getattr(myThread, 'transaction', None) != None: myThread.transaction.rollback() raise JobSubmitterPollerException(msg) return def passSubmitConditions(self): """ _passSubmitConditions_ Check whether the component is allowed to submit jobs to condor. Initially it has only one condition, which is the total number of jobs we can have in condor (pending + running) per schedd, set by MAX_JOBS_PER_OWNER. """ myThread = threading.currentThread() freeSubmitSlots = availableScheddSlots(dbi=myThread.dbi, logger=logging, condorFraction=self.condorFraction) self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll) return (freeSubmitSlots > 0) def terminate(self, params): """ _terminate_ Kill the code after one final pass when called by the master thread. """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params)
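# --- Illustrative sketch (not part of WMCore) -------------------------------
# A minimal, self-contained rendering of the overflow-threshold arithmetic
# used by _getJobSubmitCondition above, reduced to the site-level checks and
# assuming plain integers instead of the resource-control structures.
# Function and variable names here are hypothetical; initTotalPending is
# assumed to be the pending-job count taken at the start of the cycle, and
# the final comparison only mimics what jobSubmitCondition appears to do.

def evaluateSubmitCondition(totalPendingSlots, initTotalPending, totalPendingJobs,
                            totalRunningSlots, totalRunningJobs,
                            condorOverflowFraction=0.2):
    """Return the first failing condition, or 'JobSubmitReady' if none fail."""
    # allow an overflow margin on top of whichever is larger: the configured
    # pending slots or the pending jobs already counted at cycle start
    totalPendingThreshold = max(totalPendingSlots, initTotalPending) + (
        totalPendingSlots * condorOverflowFraction)
    totalJobThreshold = totalPendingThreshold + totalRunningSlots

    jobStats = [{"Condition": "NoPendingSlot",
                 "Current": totalPendingJobs,
                 "Threshold": totalPendingThreshold},
                {"Condition": "NoRunningSlot",
                 "Current": totalPendingJobs + totalRunningJobs,
                 "Threshold": totalJobThreshold}]
    for stat in jobStats:
        if stat["Current"] >= stat["Threshold"]:
            return stat["Condition"]
    return "JobSubmitReady"

# Example: 100 pending slots with 90 jobs pending stays under the
# 100 + 20 overflow margin, so the site is still ready for submission.
# evaluateSubmitCondition(100, 80, 90, 200, 150)  -> 'JobSubmitReady'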
class DrainStatusPoller(BaseWorkerThread): """ Collects information related to the agent drain status """ # class variable that contains drain statistics drainStats = {} def __init__(self, config): """ initialize properties specified from config """ BaseWorkerThread.__init__(self) self.config = config self.drainAPI = DrainStatusAPI() self.condorAPI = PyCondorAPI() self.agentConfig = {} self.validSpeedDrainConfigKeys = [ 'CondorPriority', 'NoJobRetries', 'EnableAllSites' ] self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) @timeFunction def algorithm(self, parameters): """ Update drainStats if agent is in drain mode """ logging.info("Running agent drain algorithm...") self.agentConfig = self.reqAuxDB.getWMAgentConfig( self.config.Agent.hostName) if isDrainMode(self.config): # check to see if the agent hit any speed drain thresholds thresholdsHit = self.checkSpeedDrainThresholds() if thresholdsHit: logging.info("Updating agent configuration for speed drain...") self.updateAgentSpeedDrainConfig(thresholdsHit) try: DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo() logging.info("Finished collecting agent drain status.") logging.info("Drain stats: " + str(DrainStatusPoller.drainStats)) except Exception as ex: msg = "Error occurred, will retry later:\n" msg += str(ex) logging.exception(msg) else: logging.info( "Agent not in drain mode. Resetting flags and skipping drain check..." ) self.resetAgentSpeedDrainConfig() @classmethod def getDrainInfo(cls): """ Return drainStats class variable """ return cls.drainStats def updateAgentSpeedDrainConfig(self, thresholdsHit): """ Takes a list of speed drain configuration keys and updates the agent configuration """ updateConfig = False condorPriorityFlag = False speedDrainConfig = self.agentConfig.get("SpeedDrainConfig") if 'CondorPriority' in thresholdsHit: logging.info( "Bumping condor job priority to 999999 for Production/Processing pending jobs." ) self.condorAPI.editCondorJobs( "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")", "JobPrio", "999999") condorPriorityFlag = True if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']: # CondorPriority setting is irreversible so the flag only indicates weather # priority is increased or not. It is not checked by other components logging.info("Enabling CondorPriority flag.") speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag updateConfig = True if 'NoJobRetries' in thresholdsHit: logging.info( "Enabling NoJobRetries flag: Error Handler won't retry the jobs" ) # ErrorHandler will pick this up and set max retries to 0 speedDrainConfig['NoJobRetries']['Enabled'] = True updateConfig = True if 'EnableAllSites' in thresholdsHit: logging.info( "Enabling EnableAllSites flag: Updating agent to submit to all sites." 
) # setting this value to True makes JobSubmitterPoller ignore site status speedDrainConfig['EnableAllSites']['Enabled'] = True updateConfig = True # update the aux db speed drain config with any changes if updateConfig: self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", True) self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig) return def resetAgentSpeedDrainConfig(self): """ Reset SpeedDrainMode to False and the SpeedDrainConfig 'Enabled' flags to False """ if self.agentConfig.get("SpeedDrainMode"): self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", False) speedDrainConfig = self.agentConfig.get("SpeedDrainConfig") for key, v in speedDrainConfig.items(): if key in self.validSpeedDrainConfigKeys and v['Enabled']: speedDrainConfig[key]['Enabled'] = False self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig) return def checkSpeedDrainThresholds(self): """ Check the current number of jobs in Condor and create a list of agent configuration parameters that need to be updated for speed draining """ enableKeys = [] # get the current speed drain status speedDrainConfig = self.agentConfig.get("SpeedDrainConfig") # get condor jobs jobs = self.condorAPI.getCondorJobs("", []) if jobs is None: logging.warning( "There was an error querying the schedd. Not checking speed drain thresholds." ) return [] # loop through the speed drain configuration and make a list of what thresholds have been hit for k, v in speedDrainConfig.items(): # make sure keys in the speed drain config are valid if k in self.validSpeedDrainConfigKeys and isinstance( v['Threshold'], int) and isinstance(v['Enabled'], bool): # we always want to apply the condor priority change if the threshold is hit if not v['Enabled'] or k == 'CondorPriority': logging.info("Checking speed drain threshold for %s. ", k) if len(jobs) < v['Threshold']: logging.info( "Agent will update speed drain configuration for %s. ", k) enableKeys.append(k) else: logging.warning( "Speed drain configuration error for %s. Please check aux db contents.", k) return enableKeys
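# --- Illustrative sketch (not part of WMCore) -------------------------------
# A stand-alone version of the threshold scan performed by
# checkSpeedDrainThresholds above, assuming the SpeedDrainConfig document is a
# plain dict and the condor job count is already known. All names here are
# illustrative only, not the WMCore API.

VALID_SPEED_DRAIN_KEYS = ('CondorPriority', 'NoJobRetries', 'EnableAllSites')

def speedDrainThresholdsHit(speedDrainConfig, totalCondorJobs):
    """Return the config keys whose thresholds have been crossed."""
    enableKeys = []
    for key, val in speedDrainConfig.items():
        # skip unknown keys and malformed entries
        if key not in VALID_SPEED_DRAIN_KEYS:
            continue
        if not isinstance(val.get('Threshold'), int) or not isinstance(val.get('Enabled'), bool):
            continue
        # CondorPriority is always re-applied once the threshold is hit;
        # the other flags are only enabled if they are not enabled yet
        if not val['Enabled'] or key == 'CondorPriority':
            if totalCondorJobs < val['Threshold']:
                enableKeys.append(key)
    return enableKeys

# Example: with 150 jobs left in condor, only CondorPriority qualifies below.
# speedDrainThresholdsHit({'CondorPriority': {'Threshold': 500, 'Enabled': False},
#                          'NoJobRetries': {'Threshold': 100, 'Enabled': False},
#                          'EnableAllSites': {'Threshold': 200, 'Enabled': True}}, 150)
# -> ['CondorPriority']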
class ErrorHandlerPoller(BaseWorkerThread): """ Polls for Error Conditions, handles them """ def __init__(self, config): """ Initialise class members """ BaseWorkerThread.__init__(self) self.config = config myThread = threading.currentThread() self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) self.changeState = ChangeState(self.config) if hasattr(self.config, "Tier0Feeder"): self.reqAuxDB = None self.maxRetries = self.config.ErrorHandler.maxRetries else: self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) self.maxRetries = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName).get("MaxRetries") if not isinstance(self.maxRetries, dict): self.maxRetries = {'default': self.maxRetries} if 'default' not in self.maxRetries: raise ErrorHandlerException('Max retries for the default job type must be specified') self.exitCodesNoRetry = [] self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250) self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600) self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False) self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', []) self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs") self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType") self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler") self.dataCollection = DataCollectionService(url=config.ACDC.couchurl, database=config.ACDC.database) return def setup(self, parameters=None): """ Load DB objects required for queries """ # For now, does nothing return def terminate(self, params): """ _terminate_ Do one pass, then commit suicide """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params) def exhaustJobs(self, jobList): """ _exhaustJobs_ Actually do the jobs exhaustion """ # Remove all the files in the exhausted jobs. 
logging.debug("About to fail input files for exhausted jobs") for job in jobList: job.failInputFiles() # Do not build ACDC for utilitarian job types acdcJobList = [job for job in jobList if job['type'] not in ['LogCollect', 'Cleanup']] self.handleACDC(acdcJobList) self.changeState.propagate(jobList, 'exhausted', 'retrydone') return def processRetries(self, jobList, state): """ _processRetries_ Actually do the retries """ logging.info("Processing retries for %d failed jobs of type %sfailed", len(jobList), state) retrydoneJobs = [] cooloffJobs = [] passJobs = [] # Query auxiliary db for current state of maxRetries if self.reqAuxDB: self.maxRetries = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName).get("MaxRetries", self.maxRetries) if not isinstance(self.maxRetries, dict): self.maxRetries = {'default': self.maxRetries} if 'default' not in self.maxRetries: raise ErrorHandlerException('Max retries for the default job type must be specified') # Retries < max retry count for job in jobList: allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default']) # Retries < allowed max retry count if job['retry_count'] < allowedRetries and state != 'create': cooloffJobs.append(job) # Check if Retries >= allowed max retry count elif job['retry_count'] >= allowedRetries or state == 'create': retrydoneJobs.append(job) msg = "Stopping retries for job %d" % job['id'] logging.debug(msg) logging.debug("JobInfo: %s", job) if self.readFWJR: # Then we have to check each FWJR for exit status cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs) retrydoneJobs.extend(retrydoneFWJRJobs) # Now to actually do something. logging.debug("About to propagate jobs") if len(retrydoneJobs) > 0: self.changeState.propagate(retrydoneJobs, 'retrydone', '%sfailed' % state, updatesummary=True) if len(cooloffJobs) > 0: self.changeState.propagate(cooloffJobs, '%scooloff' % state, '%sfailed' % state, updatesummary=True) if len(passJobs) > 0: # Overwrite the transition states and move directly to created self.changeState.propagate(passJobs, 'created', 'new') return def handleACDC(self, jobList): """ _handleACDC_ Do the ACDC creation and hope it works """ idList = [x['id'] for x in jobList] logging.info("Starting to build ACDC with %i jobs", len(idList)) logging.info("This operation will take some time...") loadList = self.loadJobsFromListFull(idList) for job in loadList: job.getMask() self.dataCollection.failedJobs(loadList) return def readFWJRForErrors(self, jobList): """ _readFWJRForErrors_ Check the FWJRs of the failed jobs and determine those that can be retried and which must be retried without going through cooloff. Returns a triplet with cooloff, passed and exhausted jobs. 
""" cooloffJobs = [] passJobs = [] exhaustJobs = [] if self.reqAuxDB: self.exitCodesNoRetry = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName).get("NoRetryExitCodes", []) for job in jobList: report = Report() reportPath = job['fwjr_path'] if reportPath is None: logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff.", job['id']) cooloffJobs.append(job) continue if not os.path.isfile(reportPath): logging.error( "Failed to find FWJR for job %i in location %s.\n Passing it to cooloff.", job['id'], reportPath) cooloffJobs.append(job) continue try: report.load(reportPath) # First let's check the time conditions times = report.getFirstStartLastStop() startTime = None stopTime = None if times is not None: startTime = times['startTime'] stopTime = times['stopTime'] # correct the location if the original location is different from recorded in wmbs # WARNING: we are not updating job location in wmbs only updating in couchdb by doing this. # If location in wmbs needs to be updated, it should happen in JobAccountant. locationFromFWJR = report.getSiteName() if locationFromFWJR: job["location"] = locationFromFWJR job["site_cms_name"] = locationFromFWJR if startTime is None or stopTime is None: # We have no information to make a decision, keep going. logging.debug("No start, stop times for steps for job %i", job['id']) elif stopTime - startTime > self.maxFailTime: msg = "Job %i exhausted after running on node for %i seconds" % (job['id'], stopTime - startTime) logging.debug(msg) exhaustJobs.append(job) continue if len([x for x in report.getExitCodes() if x in self.exitCodesNoRetry]): msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(report.getExitCodes())) logging.debug(msg) exhaustJobs.append(job) continue if len([x for x in report.getExitCodes() if x in self.passCodes]): msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'], str(report.getExitCodes())) logging.debug(msg) passJobs.append(job) continue cooloffJobs.append(job) except Exception as ex: logging.warning("Exception while trying to check jobs for failures!") logging.warning(str(ex)) logging.warning("Ignoring and sending job to cooloff") cooloffJobs.append(job) return cooloffJobs, passJobs, exhaustJobs def handleRetryDoneJobs(self, jobList): """ _handleRetryDoneJobs_ """ myThread = threading.currentThread() logging.info("About to process %d retry done jobs", len(jobList)) myThread.transaction.begin() self.exhaustJobs(jobList) myThread.transaction.commit() return def handleFailedJobs(self, jobList, state): """ _handleFailedJobs_ """ myThread = threading.currentThread() logging.info("About to process %d failures", len(jobList)) myThread.transaction.begin() self.processRetries(jobList, state) myThread.transaction.commit() return def handleErrors(self): """ Queries DB for all watched filesets, if matching filesets become available, create the subscriptions """ # Run over created, submitted and executed job failures failure_states = ['create', 'submit', 'job'] for state in failure_states: idList = self.getJobs.execute(state="%sfailed" % state) logging.info("Found %d failed jobs in state %sfailed", len(idList), state) for jobSlice in grouper(idList, self.maxProcessSize): jobList = self.loadJobsFromList(jobSlice) self.handleFailedJobs(jobList, state) # Run over jobs done with retries idList = self.getJobs.execute(state='retrydone') logging.info("Found %d jobs done with all retries", len(idList)) for jobSlice in grouper(idList, self.maxProcessSize): jobList = 
self.loadJobsFromList(jobSlice) self.handleRetryDoneJobs(jobList) return def loadJobsFromList(self, idList): """ _loadJobsFromList_ Load jobs in bulk """ binds = [] for jobID in idList: binds.append({"jobid": jobID}) results = self.idLoad.execute(jobID=binds) # You have to have a list if isinstance(results, dict): results = [results] listOfJobs = [] for entry in results: # One job per entry tmpJob = Job(id=entry['id']) tmpJob.update(entry) listOfJobs.append(tmpJob) return listOfJobs def loadJobsFromListFull(self, idList): """ _loadJobsFromList_ Load jobs in bulk. Include the full metadata. """ binds = [] for jobID in idList: binds.append({"jobid": jobID}) results = self.loadAction.execute(jobID=binds) # You have to have a list if isinstance(results, dict): results = [results] listOfJobs = [] for entry in results: # One job per entry tmpJob = Job(id=entry['id']) tmpJob.update(entry) listOfJobs.append(tmpJob) return listOfJobs @timeFunction def algorithm(self, parameters=None): """ Performs the handleErrors method, looking for each type of failure And deal with it as desired. """ logging.debug("Running error handling algorithm") try: myThread = threading.currentThread() self.handleErrors() except (CouchConnectionError, HTTPException) as ex: if getattr(myThread, 'transaction', None) is not None: myThread.transaction.rollback() msg = "Caught CouchConnectionError/HTTPException exception in ErrorHandler. " msg += "Transactions postponed until the next polling cycle\n" msg += str(ex) logging.error(msg) except Exception as ex: if getattr(myThread, 'transaction', None) is not None: myThread.transaction.rollback() msg = "Caught unexpected exception in ErrorHandler:\n" msg += str(ex) logging.exception(msg) raise ErrorHandlerException(msg)
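# --- Illustrative sketch (not part of WMCore) -------------------------------
# A compact rendering of the retry-count split done in processRetries above:
# jobs below the per-type retry limit go to cooloff, everything else is done
# retrying. It deliberately ignores the FWJR-based pass/exhaust checks. Jobs
# are plain dicts here and all names are illustrative only.

def classifyRetries(jobs, maxRetries, state):
    """Split jobs into (cooloffJobs, retrydoneJobs) for the given failure state."""
    cooloffJobs, retrydoneJobs = [], []
    for job in jobs:
        # per-type limit, falling back to the mandatory 'default' entry
        allowedRetries = maxRetries.get(job['type'], maxRetries['default'])
        if job['retry_count'] < allowedRetries and state != 'create':
            cooloffJobs.append(job)
        else:
            # either out of retries, or a create-stage failure (never retried)
            retrydoneJobs.append(job)
    return cooloffJobs, retrydoneJobs

# Example with a lower limit for Cleanup jobs than the default:
# classifyRetries([{'id': 1, 'type': 'Processing', 'retry_count': 1},
#                  {'id': 2, 'type': 'Cleanup', 'retry_count': 1}],
#                 {'default': 3, 'Cleanup': 1}, state='job')
# -> first job goes to cooloff, second job is done retrying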
class DrainStatusPoller(BaseWorkerThread): """ Collects information related to the agent drain status """ # class variable that contains drain statistics drainStats = {} def __init__(self, config): """ initialize properties specified from config """ BaseWorkerThread.__init__(self) self.config = config self.drainAPI = DrainStatusAPI(config) self.condorAPI = PyCondorAPI() self.agentConfig = {} self.previousConfig = {} self.validSpeedDrainConfigKeys = [ 'CondorPriority', 'NoJobRetries', 'EnableAllSites' ] self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) self.emailAlert = EmailAlert(config.EmailAlert.dictionary_()) self.condorStates = ("Running", "Idle") @timeFunction def algorithm(self, parameters): """ Update drainStats if agent is in drain mode """ logging.info("Running agent drain algorithm...") if self.agentConfig: # make a copy of the previous agent aux db configuration to compare against later self.previousConfig = copy.deepcopy(self.agentConfig) # grab a new copy of the agent aux db configuration self.agentConfig = self.reqAuxDB.getWMAgentConfig( self.config.Agent.hostName) if not self.agentConfig: logging.error( "Failed to fetch agent configuration from the auxiliary DB") return try: # see if the agent is in drain mode if self.agentConfig["UserDrainMode"] or self.agentConfig[ "AgentDrainMode"]: # check to see if the agent hit any speed drain thresholds thresholdsHit = self.checkSpeedDrainThresholds() if thresholdsHit: logging.info( "Updating agent configuration for speed drain...") self.updateAgentSpeedDrainConfig(thresholdsHit) # now collect drain statistics DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo() logging.info("Finished collecting agent drain status.") logging.info("Drain stats: %s", str(DrainStatusPoller.drainStats)) else: logging.info( "Agent not in drain mode. Resetting flags and skipping drain check..." ) self.resetAgentSpeedDrainConfig() # finally, check for any changes in drain status self.checkDrainStatusChanges() except Exception as ex: msg = "Error occurred, will retry later:\n" msg += str(ex) logging.exception(msg) @classmethod def getDrainInfo(cls): """ Return drainStats class variable """ return cls.drainStats def checkDrainStatusChanges(self): """ Check to see if any drain statuses have changed in the auxiliary db If yes, send email notification and update local drain thread variables """ message = "" drainStatusKeys = ['UserDrainMode', 'AgentDrainMode', 'SpeedDrainMode'] if not self.previousConfig: return for key in drainStatusKeys: if self.previousConfig[key] != self.agentConfig[key]: message += "Agent had a drain status transition to %s = %s\n" % ( str(key), str(self.agentConfig[key])) if message: self.emailAlert.send( "DrainMode status change on " + getattr(self.config.Agent, "hostName"), message) logging.info("Drain mode status change: %s", message) return def updateAgentSpeedDrainConfig(self, thresholdsHit): """ Takes a list of speed drain configuration keys and updates the agent configuration """ updateConfig = False condorPriorityFlag = False speedDrainConfig = self.agentConfig.get("SpeedDrainConfig") if 'CondorPriority' in thresholdsHit: logging.info( "Bumping condor job priority to 999999 for Production/Processing pending jobs." 
) self.condorAPI.editCondorJobs( "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")", "JobPrio", "999999") condorPriorityFlag = True if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']: # CondorPriority setting is irreversible so the flag only indicates weather # priority is increased or not. It is not checked by other components logging.info("Enabling CondorPriority flag.") speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag updateConfig = True if 'NoJobRetries' in thresholdsHit: logging.info( "Enabling NoJobRetries flag: Error Handler won't retry the jobs" ) # ErrorHandler will pick this up and set max retries to 0 speedDrainConfig['NoJobRetries']['Enabled'] = True updateConfig = True if 'EnableAllSites' in thresholdsHit: logging.info( "Enabling EnableAllSites flag: Updating agent to submit to all sites." ) # setting this value to True makes JobSubmitterPoller ignore site status speedDrainConfig['EnableAllSites']['Enabled'] = True updateConfig = True # update the aux db speed drain config with any changes if updateConfig: self.agentConfig['SpeedDrainMode'] = True self.reqAuxDB.updateWMAgentConfig(self.config.Agent.hostName, self.agentConfig) return def resetAgentSpeedDrainConfig(self): """ resetting SpeedDrainMode to False and SpeedDrainConfig Enabled to False """ if self.agentConfig.get("SpeedDrainMode"): self.agentConfig['SpeedDrainMode'] = False speedDrainConfig = self.agentConfig.get("SpeedDrainConfig") for key, v in viewitems(speedDrainConfig): if key in self.validSpeedDrainConfigKeys and v['Enabled']: speedDrainConfig[key]['Enabled'] = False self.reqAuxDB.updateWMAgentConfig(self.config.Agent.hostName, self.agentConfig) return def checkSpeedDrainThresholds(self): """ Check the current number of jobs in Condor and create a list of agent configuration parameters that need updated for speed draining """ enableKeys = [] # first, update our summary of condor jobs totalJobs = self.getTotalCondorJobs() if totalJobs is None: msg = "Cannot check speed drain because there was an error fetching job summary from HTCondor." msg += " Will retry again in the next cycle." logging.warning(msg) return [] # get the current speed drain status speedDrainConfig = self.agentConfig.get("SpeedDrainConfig") # loop through the speed drain configuration and make a list of what thresholds have been hit for k, v in viewitems(speedDrainConfig): # make sure keys in the speed drain config are valid if k in self.validSpeedDrainConfigKeys and isinstance( v['Threshold'], int) and isinstance(v['Enabled'], bool): # we always want to apply the condor priority change if the threshold is hit if not v['Enabled'] or k == 'CondorPriority': logging.info("Checking speed drain threshold for %s. ", k) if totalJobs < v['Threshold']: logging.info( "Agent will update speed drain configuration for %s. ", k) enableKeys.append(k) else: logging.warning( "Speed drain configuration error for %s. Please check aux db contents.", k) return enableKeys def getTotalCondorJobs(self): """ Retrieve a summary of the jobs in condor and return an absolute number of the jobs in Idle and Running states. :return: returns an integer with the total number of jobs, or None if it failed. """ jobs = self.condorAPI.getCondorJobsSummary() if not jobs: return None results = 0 if jobs: for state in self.condorStates: results += int(jobs[0].get(state)) return results
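# --- Illustrative sketch (not part of WMCore) -------------------------------
# The drain-transition detection in checkDrainStatusChanges above boils down
# to a keyed comparison of two configuration snapshots. This is a stand-alone
# sketch over plain dicts; names are illustrative only.

DRAIN_STATUS_KEYS = ('UserDrainMode', 'AgentDrainMode', 'SpeedDrainMode')

def drainTransitions(previousConfig, currentConfig):
    """Return one human-readable line per drain flag that changed."""
    if not previousConfig:
        # first cycle: nothing to compare against yet
        return []
    messages = []
    for key in DRAIN_STATUS_KEYS:
        if previousConfig.get(key) != currentConfig.get(key):
            messages.append("Agent had a drain status transition to %s = %s"
                            % (key, currentConfig.get(key)))
    return messages

# Example: only UserDrainMode flipped, so a single message is produced.
# drainTransitions({'UserDrainMode': False, 'AgentDrainMode': False, 'SpeedDrainMode': False},
#                  {'UserDrainMode': True, 'AgentDrainMode': False, 'SpeedDrainMode': False})
# -> ['Agent had a drain status transition to UserDrainMode = True']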