def __init__(self, config):
    """
    Set up the DBS/condor helpers and the work queue backends.
    """
    self.dbsUtil = DBSBufferUtil()
    self.condorAPI = PyCondorAPI()
    # queue url used in WorkQueueManager
    self.thisAgentUrl = "http://" + config.Agent.hostName + ":5984"
    self.globalBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
    self.localBackend = WorkQueueBackend(config.WorkQueueManager.couchurl)
def __init__(self, queue, config):
    """
    Keep references to the work queue, the component config and condor.
    """
    BaseWorkerThread.__init__(self)
    self.condorAPI = PyCondorAPI()
    self.config = config
    self.queue = queue
def __init__(self, config):
    """
    Initialise the poller state from the component configuration.
    """
    BaseWorkerThread.__init__(self)
    self.config = config
    self.agentConfig = {}
    self.condorAPI = PyCondorAPI()
    self.drainAPI = DrainStatusAPI()
    self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
    # speed-drain settings this poller is allowed to toggle in the aux db
    self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']
def __init__(self, queue, config):
    """
    Initialise class members and the WMBS DAO used to spot idle subscriptions.
    """
    BaseWorkerThread.__init__(self)
    thisThread = threading.currentThread()
    self.queue = queue
    self.config = config
    self.condorAPI = PyCondorAPI()
    self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging,
                                 dbinterface=thisThread.dbi)
    # DAO listing subscriptions the JobCreator has not created jobs for yet
    self.listSubsWithoutJobs = self.daoFactory(classname="Subscriptions.GetSubsWithoutJobGroup")
def __init__(self, config):
    """
    Initialise the drain-status poller from the component configuration.
    """
    BaseWorkerThread.__init__(self)
    self.config = config
    self.agentConfig = {}
    self.previousConfig = {}
    self.condorAPI = PyCondorAPI()
    self.drainAPI = DrainStatusAPI(config)
    self.emailAlert = EmailAlert(config.EmailAlert.dictionary_())
    self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
    self.condorStates = ("Running", "Idle")
    # speed-drain settings this poller is allowed to toggle in the aux db
    self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']
class DrainStatusAPI(object):
    """
    Provides methods for querying dbs and condor for drain statistics
    """

    def __init__(self):
        self.condorAPI = PyCondorAPI()
        self.dbsUtil = DBSBufferUtil()

    def collectDrainInfo(self):
        """Gather all drain-status information into a single dictionary."""
        info = {'workflows_completed': self.checkWorkflows()}
        # detailed stats are only worth collecting once workflows are done
        if info['workflows_completed']:
            info['upload_status'] = self.checkFileUploadStatus()
            info['condor_status'] = self.checkCondorStates()
        return info

    def checkWorkflows(self):
        """Return whether every workflow has reached a 'completed' status."""
        return self.dbsUtil.isAllWorkflowCompleted()

    def checkCondorStates(self):
        """Count idle and running jobs in Condor."""
        counts = {}
        for code, label in (("1", "idle"), ("2", "running")):
            jobs = self.condorAPI.getCondorJobs("JobStatus==" + code, [])
            if jobs is None:
                # schedd query failed: report the error rather than a bogus 0
                counts[label] = "unknown (schedd query error)"
            else:
                counts[label] = len(jobs)
        return counts

    def checkFileUploadStatus(self):
        """
        Check file upload status:
            Blocks open in DBS
            Files not uploaded in DBS
            Files not uploaded to Phedex
        """
        uploadInfo = {}
        uploadInfo['dbs_open_blocks'] = self.dbsUtil.countOpenBlocks()
        uploadInfo['dbs_notuploaded'] = self.dbsUtil.countFilesByStatus(status="NOTUPLOADED")
        uploadInfo['phedex_notuploaded'] = self.dbsUtil.countPhedexNotUploaded()
        return uploadInfo
def __init__(self, config):
    """
    Initialise the poller from the component configuration.
    """
    BaseWorkerThread.__init__(self)
    self.config = config
    self.agentConfig = {}
    self.condorAPI = PyCondorAPI()
    self.drainAPI = DrainStatusAPI(config)
    self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
    # speed-drain settings this poller is allowed to toggle in the aux db
    self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']
def __init__(self, queue, config):
    """
    Set up the queue handle, condor API and the WMBS subscription DAO.
    """
    BaseWorkerThread.__init__(self)
    workerThread = threading.currentThread()
    self.config = config
    self.queue = queue
    self.condorAPI = PyCondorAPI()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=logging,
                                 dbinterface=workerThread.dbi)
    self.listSubsWithoutJobs = self.daoFactory(classname="Subscriptions.GetSubsWithoutJobGroup")
def __init__(self):
    """Create the condor and DBS buffer helper APIs."""
    self.condorAPI = PyCondorAPI()
    self.dbsUtil = DBSBufferUtil()
class WorkQueueManagerWorkPoller(BaseWorkerThread):
    """
    Worker thread that periodically pulls work from the parent queue
    and splits whatever is waiting in the local inbox.
    """

    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        thisThread = threading.currentThread()
        self.queue = queue
        self.config = config
        self.condorAPI = PyCondorAPI()
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging,
                                     dbinterface=thisThread.dbi)
        # DAO listing subscriptions the JobCreator has not processed yet
        self.listSubsWithoutJobs = self.daoFactory(classname="Subscriptions.GetSubsWithoutJobGroup")

    def setup(self, parameters):
        """
        Called at startup - introduce random delay to avoid workers all starting at once
        """
        delay = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop', delay)
        time.sleep(delay)

    @timeFunction
    def algorithm(self, parameters):
        """
        Pull in work
        """
        self.logger.info("Starting WorkQueueManagerWorkPoller thread ...")
        try:
            self.pullWork()
        except Exception as ex:
            self.queue.logger.error("Error in work pull loop: %s", str(ex))
        # split whether or not new work arrived - old work may still need
        # splitting, e.g. after transient errors during a previous split
        try:
            self.processWork()
        except Exception as ex:
            self.queue.logger.error("Error in new work split loop: %s", str(ex))
        return

    def passRetrieveCondition(self):
        """
        _passRetrieveCondition_

        Return "OK" when this cycle may fetch work; otherwise return a string
        explaining why pulling is skipped (drain mode, MAX_JOBS_PER_OWNER
        reached, overloaded schedd, or subscriptions still awaiting jobs).
        """
        myThread = threading.currentThread()
        if isDrainMode(self.config):
            return "agent is in drain mode"
        if availableScheddSlots(myThread.dbi) <= 0:
            return "schedd slot is maxed: MAX_JOBS_PER_OWNER"
        if self.condorAPI.isScheddOverloaded():
            return "schedd is overloaded"
        subscriptions = self.listSubsWithoutJobs.execute()
        if subscriptions:
            return "JobCreator hasn't created jobs for subscriptions %s" % subscriptions
        return "OK"

    def pullWork(self):
        """Get work from parent"""
        self.queue.logger.info("Pulling work from %s", self.queue.parent_queue.queueUrl)
        myThread = threading.currentThread()
        try:
            reason = self.passRetrieveCondition()
            if reason != "OK":
                self.queue.logger.warning("No work will be pulled, reason: %s", reason)
                myThread.logdbClient.post("LocalWorkQueue_pullWork", reason, "warning")
            else:
                work = self.queue.pullWork()
                self.queue.logger.info("Obtained %s unit(s) of work", work)
                # pulling succeeded, so clear any earlier warning record
                myThread.logdbClient.delete("LocalWorkQueue_pullWork", "warning", this_thread=True)
        except IOError as ex:
            self.queue.logger.exception("Error opening connection to work queue: %s", str(ex))
        except Exception as ex:
            self.queue.logger.exception("Unable to pull work from parent Error: %s", str(ex))

    def processWork(self):
        """Process new work"""
        self.queue.logger.info("Splitting new work")
        try:
            self.queue.processInboundWork()
        except Exception as ex:
            self.queue.logger.exception('Error during split: %s', str(ex))
        self.logger.info('Splitting finished')
        return
class DrainStatusAPI(object):
    """
    Provides methods for querying dbs and condor for drain statistics
    """

    def __init__(self, config):
        # queue url used in WorkQueueManager
        self.thisAgentUrl = "http://" + config.Agent.hostName + ":5984"
        self.dbsUtil = DBSBufferUtil()
        self.condorAPI = PyCondorAPI()
        self.globalBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
        self.localBackend = WorkQueueBackend(config.WorkQueueManager.couchurl)

    def collectDrainInfo(self):
        """Gather all drain-status information into a single dictionary."""
        info = {'workflows_completed': self.checkWorkflows()}
        # the detailed stats are only worth collecting once workflows are done
        if info['workflows_completed']:
            info['upload_status'] = self.checkFileUploadStatus()
            info['condor_status'] = self.checkCondorStates()
            info['local_wq_status'] = self.checkLocalWQStatus(dbname="workqueue")
            info['local_wqinbox_status'] = self.checkLocalWQStatus(dbname="workqueue_inbox")
            info['global_wq_status'] = self.checkGlobalWQStatus()
        return info

    def checkWorkflows(self):
        """Return whether every workflow has reached a 'completed' status."""
        return self.dbsUtil.isAllWorkflowCompleted()

    def checkCondorStates(self):
        """Count idle and running jobs in Condor."""
        counts = {}
        for code, label in (("1", "idle"), ("2", "running")):
            jobs = self.condorAPI.getCondorJobs("JobStatus==" + code, [])
            if jobs is None:
                # schedd query failed: report the error rather than a bogus 0
                counts[label] = "unknown (schedd query error)"
            else:
                counts[label] = len(jobs)
        return counts

    def checkFileUploadStatus(self):
        """
        Check file upload status:
            Blocks open in DBS
            Files not uploaded in DBS
            Files not uploaded to Phedex
        """
        uploadInfo = {}
        uploadInfo['dbs_open_blocks'] = self.dbsUtil.countOpenBlocks()
        uploadInfo['dbs_notuploaded'] = self.dbsUtil.countFilesByStatus(status="NOTUPLOADED")
        uploadInfo['phedex_notuploaded'] = self.dbsUtil.countPhedexNotUploaded()
        return uploadInfo

    def checkLocalWQStatus(self, dbname):
        """
        Query local WorkQueue workqueue/workqueue_inbox database to see
        whether there are any active elements in this agent.
        """
        counts = {}
        for status in ('Available', 'Negotiating', 'Acquired', 'Running'):
            if dbname == "workqueue":
                elements = self.localBackend.getElements(status=status, returnIdOnly=True)
            else:
                elements = self.localBackend.getInboxElements(status=status, returnIdOnly=True)
            counts[status] = len(elements)
        return counts

    def checkGlobalWQStatus(self):
        """
        Query Global WorkQueue workqueue database to see whether there are
        any active elements assigned to this agent.
        """
        counts = {}
        for status in ("Acquired", "Running"):
            elements = self.globalBackend.getElements(status=status, returnIdOnly=True,
                                                      ChildQueueUrl=self.thisAgentUrl)
            counts[status] = len(elements)
        return counts
class WorkQueueManagerWorkPoller(BaseWorkerThread):
    """
    Polls for Work
    """

    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.condorAPI = PyCondorAPI()

    def setup(self, parameters):
        """
        Called at startup - introduce random delay to avoid workers all starting at once
        """
        t = random.randrange(self.idleTime)
        # lazy %-args: the message is only formatted if this level is enabled
        self.logger.info('Sleeping for %d seconds before 1st loop', t)
        time.sleep(t)

    def algorithm(self, parameters):
        """
        Pull in work
        """
        try:
            self.pullWork()
        except Exception as ex:
            self.queue.logger.error("Error in work pull loop: %s", str(ex))
        try:
            # process if we get work or not - we may have to split old work
            # i.e. if transient errors were seen during splitting
            self.processWork()
        except Exception as ex:
            self.queue.logger.error("Error in new work split loop: %s", str(ex))
        return

    def passRetrieveCondition(self):
        """
        _passRetrieveCondition_

        Return True if the component can proceed with fetching work.
        False if the component should skip pulling work this cycle.
        For now, it only checks whether the agent is in drain mode or
        if the condor schedd is overloaded.
        """
        passCond = True
        if isDrainMode(self.config):
            passCond = False
        elif self.condorAPI.isScheddOverloaded():
            passCond = False
        return passCond

    def pullWork(self):
        """Get work from parent; return the number of work units obtained."""
        self.queue.logger.info("Pulling work from %s", self.queue.parent_queue.queueUrl)
        work = 0
        myThread = threading.currentThread()
        try:
            if self.passRetrieveCondition():
                work = self.queue.pullWork()
                # pulling succeeded, so clear any earlier warning record
                myThread.logdbClient.delete("LocalWorkQueue_pullWork", "warning", this_thread=True)
            else:
                msg = "Workqueue didn't pass the retrieve condition: NOT pulling work"
                self.queue.logger.warning(msg)
                myThread.logdbClient.post("LocalWorkQueue_pullWork", msg, "warning")
        except IOError as ex:
            # logger.exception records the traceback, replacing the manual
            # traceback.format_exc() concatenation
            self.queue.logger.exception("Error opening connection to work queue: %s", str(ex))
        except Exception as ex:
            self.queue.logger.exception("Unable to pull work from parent Error: %s", str(ex))
        self.queue.logger.info("Obtained %s unit(s) of work", work)
        return work

    def processWork(self):
        """Process new work"""
        self.queue.logger.info("Splitting new work")
        try:
            self.queue.processInboundWork()
        except Exception as ex:
            self.queue.logger.exception('Error during split: %s', str(ex))
        self.logger.info('Splitting finished')
        return
class WorkQueueManagerWorkPoller(BaseWorkerThread):
    """
    Worker thread polling the parent queue for new work to split locally.
    """

    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        workerThread = threading.currentThread()
        self.config = config
        self.queue = queue
        self.condorAPI = PyCondorAPI()
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging,
                                     dbinterface=workerThread.dbi)
        self.listSubsWithoutJobs = self.daoFactory(classname="Subscriptions.GetSubsWithoutJobGroup")

    def setup(self, parameters):
        """
        Called at startup - introduce random delay to avoid workers all starting at once
        """
        delay = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop' % delay)
        time.sleep(delay)

    @timeFunction
    def algorithm(self, parameters):
        """
        Pull in work
        """
        try:
            self.pullWork()
        except Exception as ex:
            self.queue.logger.error("Error in work pull loop: %s" % str(ex))
        # split whether or not new work arrived - old work may still need
        # splitting after transient errors in a previous cycle
        try:
            self.processWork()
        except Exception as ex:
            self.queue.logger.error("Error in new work split loop: %s" % str(ex))
        return

    def passRetrieveCondition(self):
        """
        _passRetrieveCondition_

        Return "OK" when work may be fetched this cycle; otherwise return a
        message explaining why pulling is skipped (drain mode, schedd slots
        exhausted, overloaded schedd, or subscriptions awaiting the JobCreator).
        """
        myThread = threading.currentThread()
        if isDrainMode(self.config):
            return "No work will be pulled: Agent is in drain"
        if availableScheddSlots(myThread.dbi) <= 0:
            return "No work will be pulled: schedd slot is maxed: MAX_JOBS_PER_OWNER"
        if self.condorAPI.isScheddOverloaded():
            return "No work will be pulled: schedd is overloaded"
        subscriptions = self.listSubsWithoutJobs.execute()
        if subscriptions:
            reason = "No work will be pulled: "
            reason += "JobCreator hasn't created jobs for subscriptions %s" % subscriptions
            return reason
        return "OK"

    def pullWork(self):
        """Get work from parent"""
        self.queue.logger.info("Pulling work from %s" % self.queue.parent_queue.queueUrl)
        work = 0
        myThread = threading.currentThread()
        try:
            cond = self.passRetrieveCondition()
            if cond != "OK":
                self.queue.logger.warning(cond)
                myThread.logdbClient.post("LocalWorkQueue_pullWork", cond, "warning")
            else:
                work = self.queue.pullWork()
                # pulling succeeded, so clear any earlier warning record
                myThread.logdbClient.delete("LocalWorkQueue_pullWork", "warning", this_thread=True)
        except IOError as ex:
            self.queue.logger.error("Error opening connection to work queue: %s \n%s" %
                                    (str(ex), traceback.format_exc()))
        except Exception as ex:
            self.queue.logger.error("Unable to pull work from parent Error: %s\n%s" %
                                    (str(ex), traceback.format_exc()))
        self.queue.logger.info("Obtained %s unit(s) of work" % work)
        return work

    def processWork(self):
        """Process new work"""
        self.queue.logger.info("Splitting new work")
        try:
            self.queue.processInboundWork()
        except Exception as ex:
            self.queue.logger.exception('Error during split: %s' % str(ex))
        self.logger.info('Splitting finished')
        return
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI()
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode
        """
        logging.info("Running agent drain algorithm...")
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
        if not self.agentConfig:
            # guard against a failed aux-db fetch: the speed-drain checks
            # below would otherwise blow up on an empty/None config
            logging.error("Failed to fetch agent configuration from the auxiliary DB")
            return
        if isDrainMode(self.config):
            # check to see if the agent hit any speed drain thresholds
            thresholdsHit = self.checkSpeedDrainThresholds()
            if thresholdsHit:
                logging.info("Updating agent configuration for speed drain...")
                self.updateAgentSpeedDrainConfig(thresholdsHit)
            try:
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: %s", str(DrainStatusPoller.drainStats))
            except Exception as ex:
                msg = "Error occurred, will retry later:\n"
                msg += str(ex)
                logging.exception(msg)
        else:
            logging.info("Agent not in drain mode. Resetting flags and skipping drain check...")
            self.resetAgentSpeedDrainConfig()

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration
        """
        updateConfig = False
        condorPriorityFlag = False
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        if 'CondorPriority' in thresholdsHit:
            logging.info("Bumping condor job priority to 999999 for Production/Processing pending jobs.")
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates weather
            # priority is increased or not. It is not checked by other components
            logging.info("Enabling CondorPriority flag.")
            speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info("Enabling NoJobRetries flag: Error Handler won't retry the jobs")
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info("Enabling EnableAllSites flag: Updating agent to submit to all sites.")
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", True)
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)
        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainConfig Enabled to False
        """
        if self.agentConfig.get("SpeedDrainMode"):
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", False)
            speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
            for key, v in speedDrainConfig.items():
                if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                    speedDrainConfig[key]['Enabled'] = False
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent
        configuration parameters that need updated for speed draining
        """
        enableKeys = []
        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
        # get condor jobs
        jobs = self.condorAPI.getCondorJobs("", [])
        if jobs is None:
            logging.warning("There was an error querying the schedd. Not checking speed drain thresholds.")
            return []
        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in speedDrainConfig.items():
            # make sure keys in the speed drain config are valid
            if k in self.validSpeedDrainConfigKeys and isinstance(v['Threshold'], int) and isinstance(v['Enabled'], bool):
                # we always want to apply the condor priority change if the threshold is hit
                if not v['Enabled'] or k == 'CondorPriority':
                    logging.info("Checking speed drain threshold for %s. ", k)
                    if len(jobs) < v['Threshold']:
                        logging.info("Agent will update speed drain configuration for %s. ", k)
                        enableKeys.append(k)
            else:
                logging.warning("Speed drain configuration error for %s. Please check aux db contents.", k)
        return enableKeys
class DrainStatusAPI(object):
    """
    Provides methods for querying dbs and condor for drain statistics
    """

    def __init__(self, config):
        # queue url used in WorkQueueManager
        self.thisAgentUrl = "http://" + config.Agent.hostName + ":5984"
        self.globalBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
        self.localBackend = WorkQueueBackend(config.WorkQueueManager.couchurl)
        self.dbsUtil = DBSBufferUtil()
        self.condorAPI = PyCondorAPI()
        # condor job states reported by checkCondorStates
        self.condorStates = ("Running", "Idle")

    def collectDrainInfo(self):
        """
        Call methods to check the drain status
        """
        results = {}
        results['workflows_completed'] = self.checkWorkflows()
        # if workflows are completed, collect additional drain statistics
        if results['workflows_completed']:
            results['upload_status'] = self.checkFileUploadStatus()
            results['condor_status'] = self.checkCondorStates()
            results['local_wq_status'] = self.checkLocalWQStatus(dbname="workqueue")
            results['local_wqinbox_status'] = self.checkLocalWQStatus(dbname="workqueue_inbox")
            results['global_wq_status'] = self.checkGlobalWQStatus()
        return results

    def checkWorkflows(self):
        """
        Check to see if all workflows have a 'completed' status
        """
        return self.dbsUtil.isAllWorkflowCompleted()

    def checkCondorStates(self):
        """
        Check idle and running jobs in Condor
        """
        results = {}
        jobs = self.condorAPI.getCondorJobsSummary()
        for state in self.condorStates:
            # if there is an error, report it instead of the length of an empty list
            if not jobs:
                results[state.lower()] = None
            else:
                # default to 0: a summary classad missing this state key would
                # otherwise raise TypeError via int(None)
                results[state.lower()] = int(jobs[0].get(state, 0))
        return results

    def checkFileUploadStatus(self):
        """
        Check file upload status:
            Blocks open in DBS
            Files not uploaded in DBS
            Files not uploaded to Phedex
        """
        results = {}
        results['dbs_open_blocks'] = self.dbsUtil.countOpenBlocks()
        results['dbs_notuploaded'] = self.dbsUtil.countFilesByStatus(status="NOTUPLOADED")
        results['phedex_notuploaded'] = self.dbsUtil.countPhedexNotUploaded()
        return results

    def checkLocalWQStatus(self, dbname):
        """
        Query local WorkQueue workqueue/workqueue_inbox database to see
        whether there are any active elements in this agent.
        """
        results = {}
        for st in ('Available', 'Negotiating', 'Acquired', 'Running'):
            if dbname == "workqueue":
                elements = self.localBackend.getElements(status=st, returnIdOnly=True)
            else:
                elements = self.localBackend.getInboxElements(status=st, returnIdOnly=True)
            results[st] = len(elements)
        return results

    def checkGlobalWQStatus(self):
        """
        Query Global WorkQueue workqueue database to see whether there are
        any active elements set to this agent.
        """
        results = {}
        for st in ("Acquired", "Running"):
            elements = self.globalBackend.getElements(status=st, returnIdOnly=True,
                                                      ChildQueueUrl=self.thisAgentUrl)
            results[st] = len(elements)
        return results
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        # cached copy of the agent document fetched from the aux db each cycle
        self.agentConfig = {}
        # only these speed-drain settings may be toggled by this component
        self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode
        """
        logging.info("Running agent drain algorithm...")
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
        if not self.agentConfig:
            # without the aux-db document the speed-drain logic below cannot run
            logging.error("Failed to fetch agent configuration from the auxiliary DB")
            return
        if isDrainMode(self.config):
            # check to see if the agent hit any speed drain thresholds
            thresholdsHit = self.checkSpeedDrainThresholds()
            if thresholdsHit:
                logging.info("Updating agent configuration for speed drain...")
                self.updateAgentSpeedDrainConfig(thresholdsHit)
            # now collect drain statistics
            try:
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: " + str(DrainStatusPoller.drainStats))
            except Exception as ex:
                msg = "Error occurred, will retry later:\n"
                msg += str(ex)
                logging.exception(msg)
        else:
            logging.info("Agent not in drain mode. Resetting flags and skipping drain check...")
            self.resetAgentSpeedDrainConfig()

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration
        """
        updateConfig = False
        condorPriorityFlag = False
        # speedDrainConfig is mutated in place, then written back below
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        if 'CondorPriority' in thresholdsHit:
            logging.info("Bumping condor job priority to 999999 for Production/Processing pending jobs.")
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates weather
            # priority is increased or not. It is not checked by other components
            logging.info("Enabling CondorPriority flag.")
            speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info("Enabling NoJobRetries flag: Error Handler won't retry the jobs")
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info("Enabling EnableAllSites flag: Updating agent to submit to all sites.")
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", True)
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)
        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainiConfig Enabled to False
        """
        # only write back to the aux db when speed drain was actually on
        if self.agentConfig.get("SpeedDrainMode"):
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", False)
            speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
            for key, v in speedDrainConfig.items():
                if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                    speedDrainConfig[key]['Enabled'] = False
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent
        configuration parameters that need updated for speed draining
        """
        enableKeys = []
        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
        # get condor jobs
        jobs = self.condorAPI.getCondorJobs("", [])
        if jobs is None:
            logging.warning("There was an error querying the schedd. Not checking speed drain thresholds.")
            return []
        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in speedDrainConfig.items():
            # make sure keys in the speed drain config are valid
            if k in self.validSpeedDrainConfigKeys and isinstance(v['Threshold'], int) and isinstance(v['Enabled'], bool):
                # we always want to apply the condor priority change if the threshold is hit
                if not v['Enabled'] or k == 'CondorPriority':
                    logging.info("Checking speed drain threshold for %s. ", k)
                    if len(jobs) < v['Threshold']:
                        logging.info("Agent will update speed drain configuration for %s. ", k)
                        enableKeys.append(k)
            else:
                logging.warning("Speed drain configuration error for %s. Please check aux db contents.", k)
        return enableKeys
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.previousConfig = {}
        self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        self.emailAlert = EmailAlert(config.EmailAlert.dictionary_())
        # condor job states summed by getTotalCondorJobs
        self.condorStates = ("Running", "Idle")

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode
        """
        logging.info("Running agent drain algorithm...")
        if self.agentConfig:
            # make a copy of the previous agent aux db configuration to compare against later
            self.previousConfig = copy.deepcopy(self.agentConfig)
        # grab a new copy of the agent aux db configuration
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
        if not self.agentConfig:
            logging.error("Failed to fetch agent configuration from the auxiliary DB")
            return

        try:
            # see if the agent is in drain mode
            if self.agentConfig["UserDrainMode"] or self.agentConfig["AgentDrainMode"]:
                # check to see if the agent hit any speed drain thresholds
                thresholdsHit = self.checkSpeedDrainThresholds()
                if thresholdsHit:
                    logging.info("Updating agent configuration for speed drain...")
                    self.updateAgentSpeedDrainConfig(thresholdsHit)
                # now collect drain statistics
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: %s", str(DrainStatusPoller.drainStats))
            else:
                logging.info("Agent not in drain mode. Resetting flags and skipping drain check...")
                self.resetAgentSpeedDrainConfig()
            # finally, check for any changes in drain status
            self.checkDrainStatusChanges()
        except Exception as ex:
            msg = "Error occurred, will retry later:\n"
            msg += str(ex)
            logging.exception(msg)

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def checkDrainStatusChanges(self):
        """
        Check to see if any drain statuses have changed in the auxiliary db
        If yes, send email notification and update local drain thread variables
        """
        message = ""
        drainStatusKeys = ['UserDrainMode', 'AgentDrainMode', 'SpeedDrainMode']

        # nothing to compare against on the very first cycle
        if not self.previousConfig:
            return

        for key in drainStatusKeys:
            if self.previousConfig[key] != self.agentConfig[key]:
                message += "Agent had a drain status transition to %s = %s\n" % (
                    str(key), str(self.agentConfig[key]))

        if message:
            self.emailAlert.send("DrainMode status change on " +
                                 getattr(self.config.Agent, "hostName"), message)
            logging.info("Drain mode status change: %s", message)
        return

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration
        """
        updateConfig = False
        condorPriorityFlag = False
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        if 'CondorPriority' in thresholdsHit:
            logging.info("Bumping condor job priority to 999999 for Production/Processing pending jobs.")
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates weather
            # priority is increased or not. It is not checked by other components
            logging.info("Enabling CondorPriority flag.")
            speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info("Enabling NoJobRetries flag: Error Handler won't retry the jobs")
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info("Enabling EnableAllSites flag: Updating agent to submit to all sites.")
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.agentConfig['SpeedDrainMode'] = True
            self.reqAuxDB.updateWMAgentConfig(self.config.Agent.hostName, self.agentConfig)
        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainConfig Enabled to False
        """
        if self.agentConfig.get("SpeedDrainMode"):
            self.agentConfig['SpeedDrainMode'] = False
            speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
            for key, v in viewitems(speedDrainConfig):
                if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                    speedDrainConfig[key]['Enabled'] = False
            self.reqAuxDB.updateWMAgentConfig(self.config.Agent.hostName, self.agentConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent
        configuration parameters that need updated for speed draining
        """
        enableKeys = []

        # first, update our summary of condor jobs
        totalJobs = self.getTotalCondorJobs()
        if totalJobs is None:
            msg = "Cannot check speed drain because there was an error fetching job summary from HTCondor."
            msg += " Will retry again in the next cycle."
            logging.warning(msg)
            return []

        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in viewitems(speedDrainConfig):
            # make sure keys in the speed drain config are valid
            if k in self.validSpeedDrainConfigKeys and isinstance(v['Threshold'], int) and isinstance(v['Enabled'], bool):
                # we always want to apply the condor priority change if the threshold is hit
                if not v['Enabled'] or k == 'CondorPriority':
                    logging.info("Checking speed drain threshold for %s. ", k)
                    if totalJobs < v['Threshold']:
                        logging.info("Agent will update speed drain configuration for %s. ", k)
                        enableKeys.append(k)
            else:
                logging.warning("Speed drain configuration error for %s. Please check aux db contents.", k)
        return enableKeys

    def getTotalCondorJobs(self):
        """
        Retrieve a summary of the jobs in condor and return an absolute number of
        the jobs in Idle and Running states.
        :return: returns an integer with the total number of jobs, or None if it failed.
        """
        jobs = self.condorAPI.getCondorJobsSummary()
        if not jobs:
            return None
        # (removed an unreachable `if jobs:` re-check that followed the early return)
        results = 0
        for state in self.condorStates:
            # default to 0 so a summary missing a state key cannot raise int(None)
            results += int(jobs[0].get(state, 0))
        return results