Пример #1
0
    def passRetrieveCondition(self):
        """
        _passRetrieveCondition_
        Decide whether the component may pull work in this cycle.

        The checks are evaluated in order and the first one that fails
        determines the result:
          1. the agent is in drain mode
          2. no schedd slots are left (MAX_JOBS_PER_OWNER reached)
          3. the condor schedd is overloaded
          4. there are subscriptions for which no jobs were created yet

        :return: the string "OK" when work can be pulled, otherwise a
                 message describing why no work will be pulled
        """
        # current_thread() replaces the deprecated currentThread() alias
        myThread = threading.current_thread()

        if isDrainMode(self.config):
            return "No work will be pulled: Agent is in drain"
        if availableScheddSlots(myThread.dbi) <= 0:
            return "No work will be pulled: schedd slot is maxed: MAX_JOBS_PER_OWNER"
        if self.condorAPI.isScheddOverloaded():
            return "No work will be pulled: schedd is overloaded"

        subscriptions = self.listSubsWithoutJobs.execute()
        if subscriptions:
            passCond = "No work will be pulled: "
            passCond += "JobCreator hasn't created jobs for subscriptions %s" % subscriptions
            return passCond

        return "OK"
Пример #2
0
    def markInjected(self):
        """
        _markInjected_

        Mark any workflows that have been fully injected as injected.

        Nothing is done when ``self.tier0Mode`` is set. Otherwise every
        workflow name returned by the GetInjectedWorkflows DAO is checked
        against the work queue; workflows reported as injected, as well as
        workflows unknown to the work queue, are flagged through the
        MarkInjectedWorkflows DAO within a single transaction.
        """
        if self.tier0Mode:
            logging.debug("Component will not check workflows for injection status")
            return

        # current_thread() replaces the deprecated currentThread() alias
        myThread = threading.current_thread()
        getAction = self.daoFactory(classname="Workflow.GetInjectedWorkflows")
        markAction = self.daoFactory(classname="Workflow.MarkInjectedWorkflows")
        result = getAction.execute()

        # Check each result to see if it is injected:
        injected = []
        for name in result:
            try:
                if self.workQueue.getWMBSInjectionStatus(name, isDrainMode(self.config)):
                    injected.append(name)
            except WorkQueueNoMatchingElements:
                # workflow not known - free to cleanup
                injected.append(name)
            except Exception as ex:
                # A failure for one workflow must not prevent checking the others
                logging.exception("Injection status checking failed, investigate: %s", str(ex))

        logging.info("Found %d workflows to mark as injected", len(injected))
        # Now, mark as injected those that returned True
        if injected:
            myThread.transaction.begin()
            markAction.execute(names=injected, injected=True)
            myThread.transaction.commit()
        return
Пример #3
0
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode

        The agent configuration is first fetched from the auxiliary DB;
        if that yields nothing, the cycle is aborted. In drain mode the
        speed drain thresholds are checked (and the agent configuration is
        updated if any was hit) before the drain statistics are collected.
        Outside drain mode the speed drain configuration is reset.

        :param parameters: not used by this method
        """
        logging.info("Running agent drain algorithm...")
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
        if not self.agentConfig:
            logging.error("Failed to fetch agent configuration from the auxiliary DB")
            return

        if not isDrainMode(self.config):
            logging.info("Agent not in drain mode. Resetting flags and skipping drain check...")
            self.resetAgentSpeedDrainConfig()
            return

        # check to see if the agent hit any speed drain thresholds
        thresholdsHit = self.checkSpeedDrainThresholds()
        if thresholdsHit:
            logging.info("Updating agent configuration for speed drain...")
            self.updateAgentSpeedDrainConfig(thresholdsHit)

        # now collect drain statistics
        try:
            DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
            logging.info("Finished collecting agent drain status.")
            # let the logging module do the string formatting lazily
            logging.info("Drain stats: %s", DrainStatusPoller.drainStats)
        except Exception as ex:
            # best effort: the error is logged and the exception is not re-raised
            logging.exception("Error occurred, will retry later:\n%s", str(ex))
Пример #4
0
    def collectAgentInfo(self):
        """
        Build a snapshot of the general health of the agent, covering:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        info = self.wmagentDB.getComponentStatus(self.config)
        info.update(self.agentInfo)
        info['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        # Drain statistics are only attached while the agent is draining
        draining = bool(isDrainMode(self.config))
        if draining:
            logging.info("Agent is in DrainMode")
        info['drain_mode'] = draining
        if draining:
            info['drain_stats'] = DrainStatusPoller.getDrainInfo()

        # A non-ok CouchDB is reported like any other component that is down
        couchStatus = self.collectCouchDBInfo()
        if couchStatus['status'] != 'ok':
            info['down_components'].append(couchStatus['name'])
            info['status'] = couchStatus['status']
            info['down_component_detail'].append(couchStatus)

        # Couch process warning: keep the process count only when the
        # configured threshold is reached, otherwise store 0
        nCouchProcs = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", nCouchProcs)
        procLimit = self.config.AnalyticsDataCollector.couchProcessThreshold
        info['couch_process_warning'] = nCouchProcs if nCouchProcs >= procLimit else 0

        # Copy over the last data upload time and error message, when set
        uploadInfo = DataUploadTime.getInfo()
        for key in ('data_last_update', 'data_error'):
            if uploadInfo[key]:
                info[key] = uploadInfo[key]

        # Drain mode or a disk warning downgrade an otherwise healthy agent
        if info['status'] == 'ok' and (info['drain_mode'] or info['disk_warning']):
            info['status'] = "warning"

        # A data upload error or too many couch processes escalate to error
        if info['status'] in ('ok', 'warning') and (
                info.get('data_error', 'ok') != 'ok' or info.get('couch_process_warning', 0)):
            info['status'] = "error"

        logging.info("List of agent components down: %s", info['down_components'])

        return info
Пример #5
0
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode

        When the agent is not in drain mode, only an informational message
        is logged. Failures while collecting the drain information are
        logged and not re-raised.

        :param parameters: not used by this method
        """
        if not isDrainMode(self.config):
            logging.info("Agent not in drain mode. Skipping drain check...")
            return

        logging.info("Checking agent drain status...")
        try:
            DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
            logging.info("Finished collecting agent drain status.")
            # let the logging module do the string formatting lazily
            logging.info("Drain stats: %s", DrainStatusPoller.drainStats)
        except Exception as ex:
            # best effort: the error is logged and the exception is not re-raised
            logging.exception("Error occurred, will retry later:\n%s", str(ex))
Пример #6
0
    def passRetrieveCondition(self):
        """
        _passRetrieveCondition_
        Decide whether the component may pull work in this cycle.

        The checks are evaluated in order and the first one that fails
        determines the result:
          1. the agent is in drain mode
          2. no schedd slots are left (MAX_JOBS_PER_OWNER reached)
          3. the condor schedd is overloaded

        :return: the string "OK" when work can be pulled, otherwise a
                 message describing why no work will be pulled
        """
        # current_thread() replaces the deprecated currentThread() alias
        myThread = threading.current_thread()

        if isDrainMode(self.config):
            return "No work will be pulled: Agent is in drain"
        if availableScheddSlots(myThread.dbi) <= 0:
            return "No work will be pulled: schedd slot is maxed: MAX_JOBS_PER_OWNER"
        if self.condorAPI.isScheddOverloaded():
            return "No work will be pulled: schedd is overloaded"

        return "OK"
Пример #7
0
    def markInjected(self):
        """
        _markInjected_

        Mark any workflows that have been fully injected as injected.

        Nothing is done unless ``self.handleWorkflowInjection`` is set.
        Otherwise every workflow name returned by the GetInjectedWorkflows
        DAO is checked against the work queue; workflows reported as
        injected, as well as workflows unknown to the work queue, are
        flagged through the MarkInjectedWorkflows DAO within a single
        transaction.
        """
        if not self.handleWorkflowInjection:
            logging.debug(
                "Component will not check workflows for injection status")
            return

        # current_thread() replaces the deprecated currentThread() alias
        myThread = threading.current_thread()
        getAction = self.daoFactory(classname="Workflow.GetInjectedWorkflows")
        markAction = self.daoFactory(
            classname="Workflow.MarkInjectedWorkflows")
        result = getAction.execute()

        # Check each result to see if it is injected:
        injected = []
        for name in result:
            try:
                if self.workQueue.getWMBSInjectionStatus(
                        name, isDrainMode(self.config)):
                    injected.append(name)
            except WorkQueueNoMatchingElements:
                # workflow not known - free to cleanup
                injected.append(name)
            except Exception as ex:
                # A failure for one workflow must not prevent checking the others
                logging.exception(
                    "Injection status checking failed, investigate: %s",
                    str(ex))

        logging.info("Found %d workflows to mark as injected", len(injected))
        # Now, mark as injected those that returned True
        if injected:
            myThread.transaction.begin()
            markAction.execute(names=injected, injected=True)
            myThread.transaction.commit()
        return
Пример #8
0
    def passRetrieveCondition(self):
        """
        _passRetrieveCondition_
        Decide whether the component may pull work in this cycle.

        The checks are evaluated in order and the first one that fails
        determines the result:
          1. the agent is in drain mode
          2. no schedd slots are left (MAX_JOBS_PER_OWNER reached)
          3. the condor schedd is overloaded
          4. there are subscriptions for which no jobs were created yet

        :return: the string "OK" when work can be pulled, otherwise a
                 short description of the condition preventing it
        """
        # current_thread() replaces the deprecated currentThread() alias
        myThread = threading.current_thread()

        if isDrainMode(self.config):
            return "agent is in drain mode"
        if availableScheddSlots(myThread.dbi) <= 0:
            return "schedd slot is maxed: MAX_JOBS_PER_OWNER"
        if self.condorAPI.isScheddOverloaded():
            return "schedd is overloaded"

        subscriptions = self.listSubsWithoutJobs.execute()
        if subscriptions:
            return "JobCreator hasn't created jobs for subscriptions %s" % subscriptions

        return "OK"
    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Refresh ``self.agentsNumByTeam`` from the WMStats view about
        agents and teams. In drain mode the value is forced to 1; if the
        view cannot be read or returns nothing, the current value is kept.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        try:
            teamCounts = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")
            teamCounts = {}

        if teamCounts:
            # fall back to the current value when this team is not listed
            self.agentsNumByTeam = teamCounts.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d", self.agentsNumByTeam)
        else:
            logging.warning("agentInfo couch view is not available, use default value %s", self.agentsNumByTeam)
Пример #10
0
    def collectAgentInfo(self):
        """
        Build a snapshot of the general health of the agent, covering:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        summary = self.wmagentDB.getComponentStatus(self.config)
        summary.update(self.agentInfo)
        summary['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        # Drain statistics are only attached while the agent is draining
        inDrain = bool(isDrainMode(self.config))
        if inDrain:
            logging.info("Agent is in DrainMode")
        summary['drain_mode'] = inDrain
        if inDrain:
            summary['drain_stats'] = DrainStatusPoller.getDrainInfo()

        # A non-ok CouchDB is reported like any other component that is down
        couchReport = self.collectCouchDBInfo()
        couchIsDown = couchReport['status'] != 'ok'
        if couchIsDown:
            summary['down_components'].append(couchReport['name'])
            summary['status'] = couchReport['status']
            summary['down_component_detail'].append(couchReport)

        # Couch process warning: keep the process count only when the
        # configured threshold is reached, otherwise store 0
        procCount = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", procCount)
        threshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        summary['couch_process_warning'] = 0 if procCount < threshold else procCount

        # Copy over the last data upload time and error message, when set
        lastUpload = DataUploadTime.getInfo()
        for field in ('data_last_update', 'data_error'):
            if lastUpload[field]:
                summary[field] = lastUpload[field]

        # Drain mode or a disk warning downgrade an otherwise healthy agent
        if summary['status'] == 'ok':
            if summary['drain_mode'] or summary['disk_warning']:
                summary['status'] = "warning"

        # A data upload error or too many couch processes escalate to error
        if summary['status'] in ('ok', 'warning') and (
                summary.get('data_error', 'ok') != 'ok'
                or summary.get('couch_process_warning', 0)):
            summary['status'] = "error"

        logging.info("List of agent components down: %s", summary['down_components'])

        return summary