class DrainStatusPoller(BaseWorkerThread):
    """
    Gathers the drain status information of the agent
    """
    # shared by all instances: the most recently collected drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        Set up the base worker thread, keep a reference to the agent
        configuration and create the drain status API object
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI()

    @timeFunction
    def algorithm(self, parameters):
        """
        Refresh the drainStats class variable, but only while the agent
        is in drain mode. The parameters argument is not used.
        """
        if not isDrainMode(self.config):
            logging.info("Agent not in drain mode. Skipping drain check...")
            return

        logging.info("Checking agent drain status...")
        try:
            collected = self.drainAPI.collectDrainInfo()
            DrainStatusPoller.drainStats = collected
            logging.info("Finished collecting agent drain status.")
            logging.info("Drain stats: " + str(collected))
        except Exception as exc:
            # log with traceback and leave the previous statistics in place
            logging.exception("Error occurred, will retry later:\n" + str(exc))

    @classmethod
    def getDrainInfo(cls):
        """
        Give back whatever is currently stored in the drainStats class variable
        """
        return cls.drainStats
class DrainStatusPoller(BaseWorkerThread):
    """
    Gathers the drain status information of the agent
    """
    # shared by all instances: the most recently collected drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        Set up the base worker thread, keep a reference to the agent
        configuration and create the drain status API object
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI()

    def algorithm(self, parameters):
        """
        Refresh the drainStats class variable, but only while the agent
        is in drain mode. The parameters argument is not used.
        """
        draining = isDrainMode(self.config)
        if not draining:
            logging.info("Agent not in drain mode. Skipping drain check...")
            return

        logging.info("Checking agent drain status...")
        try:
            DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
            logging.info("Finished collecting agent drain status.")
            statsText = str(DrainStatusPoller.drainStats)
            logging.info("Drain stats: " + statsText)
        except Exception as error:
            # log with traceback and leave the previous statistics in place
            failure = "Error occurred, will retry later:\n" + str(error)
            logging.exception(failure)

    @classmethod
    def getDrainInfo(cls):
        """
        Give back whatever is currently stored in the drainStats class variable
        """
        return cls.drainStats
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status and, while the
    agent is in drain mode, enables the speed drain settings whose job
    thresholds have been reached
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config

        :param config: agent configuration object; General.ReqMgr2ServiceURL
            and Agent.hostName are read from it
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI()
        self.condorAPI = PyCondorAPI()
        # agent configuration fetched from the aux db at the start of every algorithm() call
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode, otherwise reset the
        speed drain flags

        :param parameters: not used by this method
        """
        logging.info("Running agent drain algorithm...")
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
        if not self.agentConfig:
            # every step below reads the aux db configuration, so there is
            # nothing that can be done in this cycle without it
            logging.error("Failed to fetch agent configuration from the auxiliary DB")
            return

        if isDrainMode(self.config):
            # check to see if the agent hit any speed drain thresholds
            thresholdsHit = self.checkSpeedDrainThresholds()
            if thresholdsHit:
                logging.info("Updating agent configuration for speed drain...")
                self.updateAgentSpeedDrainConfig(thresholdsHit)
            # now collect drain statistics
            try:
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: %s", DrainStatusPoller.drainStats)
            except Exception as ex:
                # keep the previous statistics; the next cycle tries again
                logging.exception("Error occurred, will retry later:\n%s", str(ex))
        else:
            logging.info("Agent not in drain mode. Resetting flags and skipping drain check...")
            self.resetAgentSpeedDrainConfig()

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration

        :param thresholdsHit: list of speed drain configuration keys whose
            threshold has been reached
        """
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
        if not speedDrainConfig:
            # .get() returns None when the key is absent; nothing can be enabled then
            logging.warning("No SpeedDrainConfig found in the agent configuration. Skipping speed drain update.")
            return

        updateConfig = False
        condorPriorityFlag = False

        if 'CondorPriority' in thresholdsHit:
            logging.info("Bumping condor job priority to 999999 for Production/Processing pending jobs.")
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        condorPriorityConfig = speedDrainConfig.get('CondorPriority')
        if condorPriorityConfig is not None and condorPriorityFlag != condorPriorityConfig['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates whether
            # priority is increased or not. It is not checked by other components
            logging.info("%s CondorPriority flag.", "Enabling" if condorPriorityFlag else "Disabling")
            condorPriorityConfig['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info("Enabling NoJobRetries flag: Error Handler won't retry the jobs")
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info("Enabling EnableAllSites flag: Updating agent to submit to all sites.")
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", True)
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)
        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainConfig Enabled to False
        """
        if not self.agentConfig.get("SpeedDrainMode"):
            return

        self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", False)
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
        if speedDrainConfig is None:
            # no per-setting flags are stored, so there is nothing else to reset
            return
        for key, v in speedDrainConfig.items():
            if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                v['Enabled'] = False

        self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent configuration parameters
        that need to be updated for speed draining

        :return: list of speed drain configuration keys whose threshold is above
            the current number of condor jobs; empty when nothing was hit or
            when the check could not be performed
        """
        enableKeys = []
        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
        if not speedDrainConfig:
            logging.warning("No SpeedDrainConfig found in the agent configuration. Not checking speed drain thresholds.")
            return enableKeys

        # get condor jobs
        jobs = self.condorAPI.getCondorJobs("", [])
        if jobs is None:
            logging.warning("There was an error querying the schedd. Not checking speed drain thresholds.")
            return []
        totalJobs = len(jobs)

        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in speedDrainConfig.items():
            # make sure keys in the speed drain config are valid; .get() makes an entry
            # with a missing field end up in the warning branch instead of raising
            validEntry = (k in self.validSpeedDrainConfigKeys and isinstance(v, dict)
                          and isinstance(v.get('Threshold'), int) and isinstance(v.get('Enabled'), bool))
            if not validEntry:
                logging.warning("Speed drain configuration error for %s. Please check aux db contents.", k)
                continue
            # we always want to apply the condor priority change if the threshold is hit
            if not v['Enabled'] or k == 'CondorPriority':
                logging.info("Checking speed drain threshold for %s. ", k)
                if totalJobs < v['Threshold']:
                    logging.info("Agent will update speed drain configuration for %s. ", k)
                    enableKeys.append(k)

        return enableKeys
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status, enables the
    speed drain settings whose job thresholds have been reached and sends an
    email when one of the drain mode flags changes
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config

        :param config: agent configuration object; General.ReqMgr2ServiceURL,
            Agent.hostName and EmailAlert are read from it
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        # agent configuration fetched from the aux db at the start of every algorithm() call
        self.agentConfig = {}
        # copy of the configuration seen in the previous call, used to detect transitions
        self.previousConfig = {}
        self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        self.emailAlert = EmailAlert(config.EmailAlert.dictionary_())
        # condor job states that are added up by getTotalCondorJobs
        self.condorStates = ("Running", "Idle")

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode, otherwise reset the
        speed drain flags; then report any drain status transition

        :param parameters: not used by this method
        """
        logging.info("Running agent drain algorithm...")
        if self.agentConfig:
            # make a copy of the previous agent aux db configuration to compare against later
            self.previousConfig = copy.deepcopy(self.agentConfig)
        # grab a new copy of the agent aux db configuration
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
        if not self.agentConfig:
            logging.error("Failed to fetch agent configuration from the auxiliary DB")
            return

        try:
            # see if the agent is in drain mode
            if self.agentConfig["UserDrainMode"] or self.agentConfig["AgentDrainMode"]:
                # check to see if the agent hit any speed drain thresholds
                thresholdsHit = self.checkSpeedDrainThresholds()
                if thresholdsHit:
                    logging.info("Updating agent configuration for speed drain...")
                    self.updateAgentSpeedDrainConfig(thresholdsHit)
                # now collect drain statistics
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: %s", DrainStatusPoller.drainStats)
            else:
                logging.info("Agent not in drain mode. Resetting flags and skipping drain check...")
                self.resetAgentSpeedDrainConfig()

            # finally, check for any changes in drain status
            self.checkDrainStatusChanges()
        except Exception as ex:
            # nothing is re-raised: the next call of algorithm() tries again
            logging.exception("Error occurred, will retry later:\n%s", str(ex))

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def checkDrainStatusChanges(self):
        """
        Check to see if any drain statuses have changed in the auxiliary db
        If yes, send email notification and log the transition

        Does nothing when there is no previous configuration to compare against.
        """
        drainStatusKeys = ['UserDrainMode', 'AgentDrainMode', 'SpeedDrainMode']

        if not self.previousConfig:
            return

        transitions = []
        for key in drainStatusKeys:
            if self.previousConfig[key] != self.agentConfig[key]:
                transitions.append("Agent had a drain status transition to %s = %s\n"
                                   % (key, self.agentConfig[key]))
        message = "".join(transitions)

        if message:
            self.emailAlert.send("DrainMode status change on " + self.config.Agent.hostName, message)
            logging.info("Drain mode status change: %s", message)

        return

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration

        :param thresholdsHit: list of speed drain configuration keys whose
            threshold has been reached
        """
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
        if not speedDrainConfig:
            # .get() returns None when the key is absent; nothing can be enabled then
            logging.warning("No SpeedDrainConfig found in the agent configuration. Skipping speed drain update.")
            return

        updateConfig = False
        condorPriorityFlag = False

        if 'CondorPriority' in thresholdsHit:
            logging.info("Bumping condor job priority to 999999 for Production/Processing pending jobs.")
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        condorPriorityConfig = speedDrainConfig.get('CondorPriority')
        if condorPriorityConfig is not None and condorPriorityFlag != condorPriorityConfig['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates whether
            # priority is increased or not. It is not checked by other components
            logging.info("%s CondorPriority flag.", "Enabling" if condorPriorityFlag else "Disabling")
            condorPriorityConfig['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info("Enabling NoJobRetries flag: Error Handler won't retry the jobs")
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info("Enabling EnableAllSites flag: Updating agent to submit to all sites.")
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.agentConfig['SpeedDrainMode'] = True
            self.reqAuxDB.updateWMAgentConfig(self.config.Agent.hostName, self.agentConfig)

        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainConfig Enabled to False
        """
        if not self.agentConfig.get("SpeedDrainMode"):
            return

        self.agentConfig['SpeedDrainMode'] = False
        # a missing SpeedDrainConfig simply means there are no per-setting flags to reset
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig") or {}
        for key, v in viewitems(speedDrainConfig):
            if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                v['Enabled'] = False

        self.reqAuxDB.updateWMAgentConfig(self.config.Agent.hostName, self.agentConfig)

        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent configuration parameters
        that need to be updated for speed draining

        :return: list of speed drain configuration keys whose threshold is above
            the current number of condor jobs; empty when nothing was hit or
            when the check could not be performed
        """
        enableKeys = []

        # first, update our summary of condor jobs
        totalJobs = self.getTotalCondorJobs()
        if totalJobs is None:
            msg = "Cannot check speed drain because there was an error fetching job summary from HTCondor."
            msg += " Will retry again in the next cycle."
            logging.warning(msg)
            return []

        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
        if not speedDrainConfig:
            logging.warning("No SpeedDrainConfig found in the agent configuration. Not checking speed drain thresholds.")
            return enableKeys

        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in viewitems(speedDrainConfig):
            # make sure keys in the speed drain config are valid; .get() makes an entry
            # with a missing field end up in the warning branch instead of raising
            validEntry = (k in self.validSpeedDrainConfigKeys and isinstance(v, dict)
                          and isinstance(v.get('Threshold'), int) and isinstance(v.get('Enabled'), bool))
            if not validEntry:
                logging.warning("Speed drain configuration error for %s. Please check aux db contents.", k)
                continue
            # we always want to apply the condor priority change if the threshold is hit
            if not v['Enabled'] or k == 'CondorPriority':
                logging.info("Checking speed drain threshold for %s. ", k)
                if totalJobs < v['Threshold']:
                    logging.info("Agent will update speed drain configuration for %s. ", k)
                    enableKeys.append(k)

        return enableKeys

    def getTotalCondorJobs(self):
        """
        Retrieve a summary of the jobs in condor and return an absolute number of the
        jobs in Idle and Running states.
        :return: returns an integer with the total number of jobs, or None if it failed.
        """
        jobs = self.condorAPI.getCondorJobsSummary()
        if not jobs:
            return None

        summary = jobs[0]
        try:
            return sum(int(summary.get(state)) for state in self.condorStates)
        except (TypeError, ValueError):
            # a state that is absent or not numeric is reported as a failure (None), so the
            # caller skips the threshold check instead of acting on a wrong job count
            logging.warning("Condor job summary has no valid job count for one of the states: %s",
                            ", ".join(self.condorStates))
            return None
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status and, while the
    agent is in drain mode, enables the speed drain settings whose job
    thresholds have been reached
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config

        :param config: agent configuration object; General.ReqMgr2ServiceURL
            and Agent.hostName are read from it
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        # agent configuration fetched from the aux db at the start of every algorithm() call
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode, otherwise reset the
        speed drain flags

        :param parameters: not used by this method
        """
        logging.info("Running agent drain algorithm...")
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
        if not self.agentConfig:
            logging.error("Failed to fetch agent configuration from the auxiliary DB")
            return

        if isDrainMode(self.config):
            # check to see if the agent hit any speed drain thresholds
            thresholdsHit = self.checkSpeedDrainThresholds()
            if thresholdsHit:
                logging.info("Updating agent configuration for speed drain...")
                self.updateAgentSpeedDrainConfig(thresholdsHit)
            # now collect drain statistics
            try:
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: %s", DrainStatusPoller.drainStats)
            except Exception as ex:
                # keep the previous statistics; the next cycle tries again
                logging.exception("Error occurred, will retry later:\n%s", str(ex))
        else:
            logging.info("Agent not in drain mode. Resetting flags and skipping drain check...")
            self.resetAgentSpeedDrainConfig()

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration

        :param thresholdsHit: list of speed drain configuration keys whose
            threshold has been reached
        """
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
        if not speedDrainConfig:
            # .get() returns None when the key is absent; nothing can be enabled then
            logging.warning("No SpeedDrainConfig found in the agent configuration. Skipping speed drain update.")
            return

        updateConfig = False
        condorPriorityFlag = False

        if 'CondorPriority' in thresholdsHit:
            logging.info("Bumping condor job priority to 999999 for Production/Processing pending jobs.")
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        condorPriorityConfig = speedDrainConfig.get('CondorPriority')
        if condorPriorityConfig is not None and condorPriorityFlag != condorPriorityConfig['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates whether
            # priority is increased or not. It is not checked by other components
            logging.info("%s CondorPriority flag.", "Enabling" if condorPriorityFlag else "Disabling")
            condorPriorityConfig['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info("Enabling NoJobRetries flag: Error Handler won't retry the jobs")
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info("Enabling EnableAllSites flag: Updating agent to submit to all sites.")
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", True)
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)

        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainConfig Enabled to False
        """
        if not self.agentConfig.get("SpeedDrainMode"):
            return

        self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", False)
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
        if speedDrainConfig is None:
            # no per-setting flags are stored, so there is nothing else to reset
            return
        for key, v in speedDrainConfig.items():
            if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                v['Enabled'] = False

        self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent configuration parameters
        that need to be updated for speed draining

        :return: list of speed drain configuration keys whose threshold is above
            the current number of condor jobs; empty when nothing was hit or
            when the check could not be performed
        """
        enableKeys = []
        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
        if not speedDrainConfig:
            logging.warning("No SpeedDrainConfig found in the agent configuration. Not checking speed drain thresholds.")
            return enableKeys

        # get condor jobs
        jobs = self.condorAPI.getCondorJobs("", [])
        if jobs is None:
            logging.warning("There was an error querying the schedd. Not checking speed drain thresholds.")
            return []
        totalJobs = len(jobs)

        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in speedDrainConfig.items():
            # make sure keys in the speed drain config are valid; .get() makes an entry
            # with a missing field end up in the warning branch instead of raising
            validEntry = (k in self.validSpeedDrainConfigKeys and isinstance(v, dict)
                          and isinstance(v.get('Threshold'), int) and isinstance(v.get('Enabled'), bool))
            if not validEntry:
                logging.warning("Speed drain configuration error for %s. Please check aux db contents.", k)
                continue
            # we always want to apply the condor priority change if the threshold is hit
            if not v['Enabled'] or k == 'CondorPriority':
                logging.info("Checking speed drain threshold for %s. ", k)
                if totalJobs < v['Threshold']:
                    logging.info("Agent will update speed drain configuration for %s. ", k)
                    enableKeys.append(k)

        return enableKeys