Пример #1
0
    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Fill with step-ending information
        """
        helper = WMStepHelper(step)
        stepName = helper.name()

        # The job exit code is the first non-zero step exit code seen
        stepSuccess = stepReport.getStepExitCode(stepName=stepName)
        if self.jobSuccess == 0:
            self.jobSuccess = int(stepSuccess)

        data = {
            'MessageType': 'jobRuntime',
            'MessageTS': time.time(),
            'taskId': self.taskName,
            'jobId': self.jobName,
            'retryCount': self.job['retry_count'],
            'ExeEnd': stepName,
            'ExeExitCode': stepReport.getStepExitCode(stepName=stepName),
        }
        # Stage-out gets an explicit success flag in the payload
        if stepName == 'StageOut':
            data['StageOutExitStatus'] = stepReport.stepSuccessful(stepName=stepName)

        self.publish(data=data)

        return
Пример #2
0
    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Fill with step-ending information for the step that just finished and
        publish it to the monitoring service.

        :param step: the WMStep that ended
        :param stepReport: report object holding exit code, counters and timing
        :return: dict with the published monitoring payload
        """
        helper = WMStepHelper(step)
        stepName = helper.name()

        stepSuccess = stepReport.getStepExitCode(stepName=stepName)
        stepReport.setStepCounter(stepName=stepName,
                                  counter=self.stepCount)
        # The job exit code is the first non-zero step exit code seen
        if self.jobSuccess == 0:
            self.jobSuccess = int(stepSuccess)
        if int(stepSuccess) != 0:
            self.failedStep = stepName

        data = {}
        data['MessageType'] = 'jobRuntime-stepEnd'
        data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
        data['taskId'] = self.taskName
        data['jobId'] = self.jobName
        data['%d_ExeEnd' % self.stepCount] = stepName
        # Reuse the exit code fetched above instead of querying the report twice
        data['%d_ExeExitCode' % self.stepCount] = stepSuccess
        data['%d_NCores' % self.stepCount] = helper.getNumberOfCores()

        if stepName == 'StageOut':
            data['%d_StageOutExitStatus' % self.stepCount] = int(
                stepReport.stepSuccessful(stepName=stepName))

        times = stepReport.getTimes(stepName=stepName)
        if times['stopTime'] is not None and times['startTime'] is not None:
            data['%d_ExeWCTime' %
                 self.stepCount] = times['stopTime'] - times['startTime']
        else:
            logging.error('Failed to retrieve start/end step time: %s', times)
            data['%d_ExeWCTime' % self.stepCount] = 0
        self.WrapperWCTime += data['%d_ExeWCTime' % self.stepCount]

        step = stepReport.retrieveStep(step=stepName)
        try:
            data['%d_ExeCPUTime' % self.stepCount] = getattr(
                step.performance.cpu, 'TotalJobCPU', 0)
        except AttributeError:
            # logging.warn is deprecated; use warning() with lazy %-args
            logging.warning(
                "Failed to retrieve cpu performance for step %s. Defaulting to 0",
                stepName)
            data['%d_ExeCPUTime' % self.stepCount] = 0
        self.WrapperCPUTime += float(data['%d_ExeCPUTime' % self.stepCount])

        self.lastStep = stepName
        self.maxCores = max(self.maxCores, helper.getNumberOfCores())

        self.publish(data=data)

        return data
Пример #3
0
    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Fill with step-ending information for the step that just finished and
        publish it to the monitoring service.

        :param step: the WMStep that ended
        :param stepReport: report object holding exit code, counters and timing
        :return: dict with the published monitoring payload
        """
        helper = WMStepHelper(step)
        stepName = helper.name()

        stepSuccess = stepReport.getStepExitCode(stepName=stepName)
        stepReport.setStepCounter(stepName=stepName, counter=self.stepCount)
        # The job exit code is the first non-zero step exit code seen
        if self.jobSuccess == 0:
            self.jobSuccess = int(stepSuccess)
        if int(stepSuccess) != 0:
            self.failedStep = stepName

        data = {}
        data['MessageType'] = 'jobRuntime-stepEnd'
        data['MessageTS'] = time.strftime(self.tsFormat,
                                          time.gmtime())
        data['taskId'] = self.taskName
        data['jobId'] = self.jobName
        data['%d_ExeEnd' % self.stepCount] = stepName
        # Reuse the exit code fetched above instead of querying the report twice
        data['%d_ExeExitCode' % self.stepCount] = stepSuccess
        data['%d_NCores' % self.stepCount] = helper.getNumberOfCores()

        if stepName == 'StageOut':
            data['%d_StageOutExitStatus' % self.stepCount] = int(
                stepReport.stepSuccessful(stepName=stepName))

        times = stepReport.getTimes(stepName=stepName)
        if times['stopTime'] is not None and times['startTime'] is not None:
            data['%d_ExeWCTime' % self.stepCount] = times['stopTime'] - times['startTime']
        else:
            logging.error('Failed to retrieve start/end step time: %s', times)
            data['%d_ExeWCTime' % self.stepCount] = 0
        self.WrapperWCTime += data['%d_ExeWCTime' % self.stepCount]

        step = stepReport.retrieveStep(step=stepName)
        try:
            data['%d_ExeCPUTime' % self.stepCount] = getattr(step.performance.cpu,
                                                             'TotalJobCPU', 0)
        except AttributeError:
            # logging.warn is deprecated; use warning() with lazy %-args
            logging.warning(
                "Failed to retrieve cpu performance for step %s. Defaulting to 0",
                stepName)
            data['%d_ExeCPUTime' % self.stepCount] = 0
        self.WrapperCPUTime += float(data['%d_ExeCPUTime' % self.stepCount])

        self.lastStep = stepName
        self.maxCores = max(self.maxCores, helper.getNumberOfCores())

        self.publish(data=data)

        return data
Пример #4
0
    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Fill with step-ending information for the step that just finished and
        publish it to the monitoring service.

        :param step: the WMStep that ended
        :param stepReport: report object holding exit code, counters and timing
        :return: dict with the published monitoring payload
        """
        helper = WMStepHelper(step)

        stepSuccess = stepReport.getStepExitCode(stepName=helper.name())
        stepReport.setStepCounter(stepName=helper.name(),
                                  counter=self.stepCount)
        # The job exit code is the first non-zero step exit code seen
        if self.jobSuccess == 0:
            self.jobSuccess = int(stepSuccess)
        if int(stepSuccess) != 0:
            self.failedStep = helper.name()

        data = {}
        data['MessageType'] = 'jobRuntime-stepEnd'
        data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
        data['taskId'] = self.taskName
        data['jobId'] = self.jobName
        data['%d_ExeEnd' % self.stepCount] = helper.name()
        data['%d_ExeExitCode' % self.stepCount] = stepReport.getStepExitCode(
            stepName=helper.name())

        if helper.name() == 'StageOut':
            data['%d_StageOutExitStatus' % self.stepCount] = int(
                stepReport.stepSuccessful(stepName=helper.name()))

        times = stepReport.getTimes(stepName=helper.name())
        if times['stopTime'] is not None and times['startTime'] is not None:
            data['%d_ExeWCTime' % self.stepCount] = \
                                       times['stopTime'] - times['startTime']
            data['%d_NCores' % self.stepCount] = helper.getNumberOfCores()
            self.maxCores = max(self.maxCores, helper.getNumberOfCores())
        else:
            # Without a default the '%d_ExeWCTime' key would be missing and
            # the WrapperWCTime accumulation below would raise KeyError
            data['%d_ExeWCTime' % self.stepCount] = 0

        step = stepReport.retrieveStep(step=helper.name())

        if hasattr(step, 'performance'):
            if hasattr(step.performance, 'cpu'):
                data['%d_ExeCPUTime' % self.stepCount] = \
                                                getattr(step.performance.cpu,
                                                        'TotalJobCPU', 0)
                self.WrapperCPUTime += float(data['%d_ExeCPUTime' %
                                                  self.stepCount])

        self.WrapperWCTime += data['%d_ExeWCTime' % self.stepCount]
        self.lastStep = helper.name()

        self.publish(data=data)

        return data
Пример #5
0
    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Fill with step-ending information for the step that just finished and
        publish it to the monitoring service.

        :param step: the WMStep that ended
        :param stepReport: report object holding exit code, counters and timing
        :return: dict with the published monitoring payload
        """
        helper = WMStepHelper(step)

        stepSuccess = stepReport.getStepExitCode(stepName=helper.name())
        stepReport.setStepCounter(stepName=helper.name(), counter=self.stepCount)
        # The job exit code is the first non-zero step exit code seen
        if self.jobSuccess == 0:
            self.jobSuccess = int(stepSuccess)
        if int(stepSuccess) != 0:
            self.failedStep = helper.name()

        data = {}
        data['MessageType']                     = 'jobRuntime-stepEnd'
        data['MessageTS']                       = time.strftime(self.tsFormat,
                                                         time.gmtime())
        data['taskId']                          = self.taskName
        data['jobId']                           = self.jobName
        data['%d_ExeEnd' % self.stepCount]      = helper.name()
        data['%d_ExeExitCode' % self.stepCount] = stepReport.getStepExitCode(
                                                    stepName=helper.name())

        if helper.name() == 'StageOut':
            data['%d_StageOutExitStatus' % self.stepCount] = int(
                        stepReport.stepSuccessful(stepName=helper.name()))

        times = stepReport.getTimes(stepName=helper.name())
        if times['stopTime'] is not None and times['startTime'] is not None:
            data['%d_ExeWCTime' % self.stepCount] = \
                                       times['stopTime'] - times['startTime']
            data['%d_NCores' % self.stepCount] = helper.getNumberOfCores()
            self.maxCores = max(self.maxCores, helper.getNumberOfCores())
        else:
            # Without a default the '%d_ExeWCTime' key would be missing and
            # the WrapperWCTime accumulation below would raise KeyError
            data['%d_ExeWCTime' % self.stepCount] = 0

        step = stepReport.retrieveStep(step=helper.name())

        if hasattr(step, 'performance'):
            if hasattr(step.performance, 'cpu'):
                data['%d_ExeCPUTime' % self.stepCount] = \
                                                getattr(step.performance.cpu,
                                                        'TotalJobCPU', 0)
                self.WrapperCPUTime += float(data['%d_ExeCPUTime'
                                                        % self.stepCount])

        self.WrapperWCTime += data['%d_ExeWCTime' % self.stepCount]
        self.lastStep = helper.name()

        self.publish(data=data)

        return data
Пример #6
0
    def stepStart(self, step):
        """
        _stepStart_

        Fill with the step-based information. If it is the first step, report
        that the job started its execution.
        """
        helper = WMStepHelper(step)
        self.stepCount += 1

        jobStartData = None
        if not self.jobStarted:
            # First step: report which executable started and on which node,
            # as the monitoring consumers request.
            jobStartData = {
                'MessageType': 'jobRuntime-jobStart',
                'MessageTS': time.strftime(self.tsFormat, time.gmtime()),
                'taskId': self.taskName,
                'jobId': self.jobName,
                'GridName': getUserProxyDN(),
                'ExeStart': helper.name(),
                'SyncCE': getSyncCE(),
                'WNHostName': socket.gethostname(),
            }
            self.publish(data=jobStartData)
            self.jobStarted = True

        # Now publish the step-level information
        data = {
            'MessageType': 'jobRuntime-stepStart',
            'MessageTS': time.strftime(self.tsFormat, time.gmtime()),
            'taskId': self.taskName,
            'jobId': self.jobName,
            '%d_stepName' % self.stepCount: helper.name(),
            '%d_ExeStart' % self.stepCount: helper.name(),
        }
        self.publish(data=data)

        # Attach the jobStart payload (or None) so the caller sees both
        data.update({'jobStart': jobStartData})

        return data
Пример #7
0
    def stepStart(self, step):
        """
        _stepStart_

        Fill with the step-based information. If it is the first step, report
        that the job started its execution.
        """
        helper = WMStepHelper(step)
        self.stepCount += 1

        startPayload = None
        if not self.jobStarted:
            # First step of the job: publish a one-off jobStart record
            # naming the executable and the worker node.
            startPayload = {}
            startPayload['MessageType'] = 'jobRuntime-jobStart'
            startPayload['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
            startPayload['taskId'] = self.taskName
            startPayload['jobId'] = self.jobName
            startPayload['GridName'] = getUserProxyDN()
            startPayload['ExeStart'] = helper.name()
            startPayload['SyncCE'] = getSyncCE()
            startPayload['WNHostName'] = socket.gethostname()

            self.publish(data=startPayload)
            self.jobStarted = True

        # Publish the per-step record
        data = {
            'MessageType': 'jobRuntime-stepStart',
            'MessageTS': time.strftime(self.tsFormat, time.gmtime()),
            'taskId': self.taskName,
            'jobId': self.jobName,
            '%d_stepName' % self.stepCount: helper.name(),
            '%d_ExeStart' % self.stepCount: helper.name(),
        }
        self.publish(data=data)

        # Return both records; 'jobStart' is None if the job already started
        data['jobStart'] = startPayload

        return data
Пример #8
0
    def stepKilled(self, step):
        """
        _stepKilled_

        Fill with step-ending information assuming utter failure
        """
        helper = WMStepHelper(step)
        stepName = helper.name()

        data = {
            'MessageType': 'jobRuntime-stepKilled',
            'MessageTS': time.strftime(self.tsFormat, time.gmtime()),
            'taskId': self.taskName,
            'jobId': self.jobName,
            '%d_ExeEnd' % self.stepCount: stepName,
        }

        self.lastStep = stepName
        self.publish(data=data)

        return data
Пример #9
0
    def stepKilled(self, step):
        """
        _stepKilled_

        Fill with step-ending information assuming utter failure
        """
        helper = WMStepHelper(step)
        stepName = helper.name()

        data = {
            'MessageType': 'jobRuntime-stepKilled',
            'MessageTS': time.strftime(self.tsFormat, time.gmtime()),
            'taskId': self.taskName,
            'jobId': self.jobName,
            'ExeEnd': stepName,
        }

        self.lastStep = stepName
        self.publish(data=data)

        return data
Пример #10
0
    def emulateAll(self):
        """
        _emulateAll_

        Traverse all Steps and load up the default Emulator based on
        type.
        """
        # Register an emulator for every step node in the task tree
        for node in self.task.steps().nodeIterator():
            nodeHelper = WMStepHelper(node)
            nodeType = nodeHelper.stepType()
            nodeName = nodeHelper.name()
            self.addEmulator(nodeName, nodeType)
Пример #11
0
    def emulateAll(self):
        """
        _emulateAll_

        Traverse all Steps and load up the default Emulator based on
        type.
        """
        # Walk the task's step tree, registering the default emulator per step
        for stepNode in self.task.steps().nodeIterator():
            stepHelper = WMStepHelper(stepNode)
            emulatorType = stepHelper.stepType()
            emulatorName = stepHelper.name()
            self.addEmulator(emulatorName, emulatorType)
Пример #12
0
 def setStep(self, wmStep):
     """set topStep to be the step instance provided"""
     # Accept either a helper or a raw step; normalize to both forms
     if isinstance(wmStep, WMStepHelper):
         stepHelper = wmStep
         stepData = wmStep.data
     else:
         stepHelper = WMStepHelper(wmStep)
         stepData = wmStep

     stepName = stepHelper.name()
     stepHelper.setTopOfTree()
     # Attach the step under its own name and record it as the top step
     setattr(self.data.steps, stepName, stepData)
     setattr(self.data.steps, "topStepName", stepName)
     return
Пример #13
0
    def setStep(self, wmStep):
        """set topStep to be the step instance provided"""
        # Normalize the argument: callers may hand us a WMStepHelper
        # or the underlying step data object.
        if isinstance(wmStep, WMStepHelper):
            stepHelper, stepData = wmStep, wmStep.data
        else:
            stepHelper, stepData = WMStepHelper(wmStep), wmStep

        topName = stepHelper.name()
        stepHelper.setTopOfTree()
        # Store the step under its name and mark it as the tree's top step
        setattr(self.data.steps, topName, stepData)
        setattr(self.data.steps, "topStepName", topName)
        return
Пример #14
0
    def stepStart(self, step):
        """
        _stepStart_

        Fill with the step-based information
        """
        helper = WMStepHelper(step)

        data = {
            'MessageType': 'jobRuntime',
            'MessageTS': time.time(),
            'taskId': self.taskName,
            'jobId': self.jobName,
            'retryCount': self.job['retry_count'],
            'ExeStart': helper.name(),
        }

        self.publish(data=data)

        return data
Пример #15
0
    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step

        Records the current step name and step space, and disables
        monitoring when the step type is not in the configured watch list.
        """
        stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = getStepSpace(stepHelper.name())

        # Use `not in` instead of `not X in Y`; only watch configured types
        if stepHelper.stepType() not in self.watchStepTypes:
            self.disableStep = True
            # lazy %-args: the message is only formatted if DEBUG is enabled
            logging.debug("PerformanceMonitor ignoring step of type %s",
                          stepHelper.stepType())
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False

        return
Пример #16
0
    def stepKilled(self, step):
        """
        _stepKilled_

        Fill with step-ending information assuming utter failure
        """
        helper = WMStepHelper(step)

        data = {
            'MessageType': 'jobRuntime',
            'MessageTS': time.time(),
            'taskId': self.taskName,
            'jobId': self.jobName,
            'retryCount': self.job['retry_count'],
            'ExeEnd': helper.name(),
        }

        self.publish(data=data)

        return
Пример #17
0
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and
    recording data regarding the current step
    """
    def __init__(self):
        """
        Actual variable initialization
        in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        # ps invocation template; both %i slots are filled with the step PID
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww | grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None

        # Sample accumulators
        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []

        # Limits and bookkeeping; filled in by initMonitor/jobStart
        self.maxRSS = None
        self.maxVSize = None
        self.softTimeout = None
        self.hardTimeout = None
        self.logPath = None
        self.startTime = None

        self.watchStepTypes = []

        self.disableStep = False

        WMRuntimeMonitor.__init__(self)

        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.

        args keys used: WatchStepTypes, maxRSS, maxVSize, softTimeout,
        hardTimeout.
        """
        # args=None avoids the shared mutable-default-argument pitfall
        args = args or {}

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)

        self.logPath = os.path.join(logPath)

        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()

        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None

        # Only monitor step types we were configured to watch
        if self.stepHelper.stepType() not in self.watchStepTypes:
            self.disableStep = True
            # lazy %-args: formatted only when DEBUG logging is enabled
            logging.debug("PerformanceMonitor ignoring step of type %s",
                          self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False

        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        # If the step was not monitored there is no information to correlate.
        # NOTE(review): the original guard was inverted (`if not
        # self.disableStep: return`), which left currentStepName/Space
        # pointing at a finished step, so periodicUpdate would keep
        # monitoring it between steps.
        if self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Samples RSS/VSize via ps for the current step's PID, and kills the
        step (soft SIGUSR2 or hard SIGTERM) when memory or time limits are
        exceeded, writing a PerformanceError step into the job report first.
        """
        killProc = False
        killHard = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return
        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        # lazy %-args instead of eager string interpolation
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s",
                     output[2], output[3], output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        # Let's check the running time
        currentTime = time.time()

        if self.hardTimeout is not None and self.softTimeout is not None:
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                msg += "Job has been running for more than: %s\n" % str(
                    self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime -
                                                              self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                # NOTE(review): message says "soft" in the hard-timeout
                # branch; wording kept as-is — confirm intended text
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location, '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug(
                        "Found pre-existant error report in PerformanceMonitor termination."
                    )
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=99900,
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Py2-only `except Exception, ex` replaced with `as ex`.
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return
Пример #18
0
    def __call__(self, task, wmbsJob):
        """
        _operator(task)_

        Load and run executors for all steps in Task, if an emulator is
        available for that step, use it instead.

        """
        # currentThread must be CALLED: the original bound the function
        # object itself, so attribute lookups (watchdogMonitor) would not
        # reach the monitor stored on the current Thread.
        myThread = threading.currentThread()

        try:
            myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
            myThread.watchdogMonitor.notifyJobStart(task)
        except WMException:
            self.toTaskDirectory()
            raise
        except Exception as ex:
            msg = "Encountered unhandled exception while starting monitors:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()
            raise WMExecutionFailure(msg)

        skipToStep = None
        for step in task.steps().nodeIterator():
            try:
                helper = WMStepHelper(step)
                stepType = helper.stepType()
                stepName = helper.name()
                if skipToStep and skipToStep != stepName:
                    # Then we continue until we get to the required step
                    continue
                skipToStep = None  # Reset this when we get to the right step
                executor = StepFactory.getStepExecutor(stepType)
                result = self.doExecution(executor, step, wmbsJob)
                # identity check against None, not `not result == None`
                if result is not None:
                    skipToStep = result
            except WMException as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                logging.error(msg)
                self.toTaskDirectory()
                break
            except Exception as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                msg += str(traceback.format_exc()) + "\n"
                self.toTaskDirectory()
                logging.error(msg)
                break

        try:
            myThread.watchdogMonitor.notifyJobEnd(task)
        except WMException:
            self.toTaskDirectory()
        except Exception as ex:
            msg = "Encountered unhandled exception while ending the job:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()

        return
Пример #19
0
    def __call__(self, task, wmbsJob):
        """
        _operator(task)_

        Load and run executors for all steps in Task, if an emulator is
        available for that step, use it instead.

        """
        # currentThread must be CALLED: the original bound the function
        # object itself, so attribute lookups (watchdogMonitor) would not
        # reach the monitor stored on the current Thread.
        myThread = threading.currentThread()

        try:
            myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
            myThread.watchdogMonitor.notifyJobStart(task)
        except WMException:
            self.toTaskDirectory()
            raise
        except Exception as ex:
            msg = "Encountered unhandled exception while starting monitors:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()
            raise WMExecutionFailure(msg)

        failureUpstream = False
        for step in task.steps().nodeIterator():
            try:
                helper = WMStepHelper(step)
                stepType = helper.stepType()
                stepName = helper.name()
                if failureUpstream:
                    # for chained steps, don't execute further steps if a
                    # failure has already happened
                    helper.addOverride("previousCmsRunFailure", True)

                executor = StepFactory.getStepExecutor(stepType)
                result = self.doExecution(executor, step, wmbsJob)
                logging.info("StepName: %s, StepType: %s, with result: %r",
                             stepName, stepType, result)
                if result:  # can be either None, or the step exit code
                    failureUpstream = True
            except WMException as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                logging.error(msg)
                self.toTaskDirectory()
                break
            except Exception as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                msg += str(traceback.format_exc()) + "\n"
                self.toTaskDirectory()
                logging.error(msg)
                break

        try:
            myThread.watchdogMonitor.notifyJobEnd(task)
        except WMException:
            self.toTaskDirectory()
        except Exception as ex:
            msg = "Encountered unhandled exception while ending the job:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()

        return
Пример #20
0
    def __call__(self, task, wmbsJob):
        """
        _operator(task)_

        Load and run executors for all steps in Task, if an emulator is
        available for that step, use it instead.

        """
        # currentThread must be CALLED: the original bound the function
        # object itself, so attribute lookups (watchdogMonitor) would not
        # reach the monitor stored on the current Thread.
        myThread = threading.currentThread()

        try:
            myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
            myThread.watchdogMonitor.notifyJobStart(task)
        except WMException:
            self.toTaskDirectory()
            raise
        except Exception as ex:
            msg = "Encountered unhandled exception while starting monitors:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()
            raise WMExecutionFailure(msg)

        skipToStep = None
        for step in task.steps().nodeIterator():
            try:
                helper = WMStepHelper(step)
                stepType = helper.stepType()
                stepName = helper.name()
                if skipToStep and skipToStep != stepName:
                    # Then we continue until we get to the required step
                    continue
                skipToStep = None  # Reset this when we get to the right step
                executor = StepFactory.getStepExecutor(stepType)
                result = self.doExecution(executor, step, wmbsJob)
                if result is not None:
                    skipToStep = result
            except WMException as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                logging.error(msg)
                self.toTaskDirectory()
                break
            except Exception as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                msg += str(traceback.format_exc()) + "\n"
                self.toTaskDirectory()
                logging.error(msg)
                break

        try:
            myThread.watchdogMonitor.notifyJobEnd(task)
        except WMException:
            self.toTaskDirectory()
        except Exception as ex:
            msg = "Encountered unhandled exception while ending the job:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()

        return
Пример #21
0
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and
    recording data regarding the current step
    """

    def __init__(self):
        """
        Set up placeholder attributes; the actual configuration
        happens in initMonitor().
        """
        self.pid = None
        self.uid = os.getuid()
        # ps is restricted to the step PID; the grep keeps only that PID's line
        self.monitorBase = "ps -p %i -o pid,ppid,rss,pcpu,pmem,cmd -ww | grep %i"
        # Sums every 'Pss' field in /proc/<pid>/smaps, i.e. total PSS in kB
        self.pssMemoryCommand = "awk '/^Pss/ {pss += $2} END {print pss}' /proc/%i/smaps"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None

        self.rss = []
        self.pcpu = []
        self.pmem = []

        self.numOfCores = None  # populated in initMonitor()
        self.maxPSS = None
        self.softTimeout = None
        self.hardTimeout = None
        self.logPath = None
        self.startTime = None
        self.killRetry = False  # will trigger a hard (SIGTERM) instead of soft kill

        self.watchStepTypes = []

        self.disableStep = False

        WMRuntimeMonitor.__init__(self)

        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Pull the monitoring thresholds (memory and wallclock limits)
        and the list of step types to watch out of the args dictionary.
        """
        args = args or {}

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        # fall back to maxRSS for workflows that predate the PSS limit
        self.maxPSS = args.get('maxPSS', args.get('maxRSS'))
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)
        self.numOfCores = args.get('cores', None)

        self.logPath = os.path.join(logPath)

        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()

        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None

        if not self.stepHelper.stepType() in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s", self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False

        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Clear the step bookkeeping so periodicUpdate() stops monitoring
        once the step has finished.
        """
        if self.disableStep:
            # Step was never monitored, so there is nothing to reset.
            # NOTE(review): the original tested `not self.disableStep`, which
            # skipped the reset exactly for monitored steps and left
            # periodicUpdate() pointing at a finished step.
            return

        self.currentStepName = None
        self.currentStepSpace = None

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Samples RSS/CPU via ps and PSS via /proc/<pid>/smaps for the
        current step's process; kills the step (soft SIGUSR2 first,
        escalating to SIGTERM) when it exceeds the memory or wallclock
        limits.
        """
        killProc = False
        killHard = False
        reason = ''
        # reason -> WMCore performance-kill exit code
        errorCodeLookup = {'PSS': 50660,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the ps monitor command and collate the data
        # Gathers RSS, %CPU and %MEM statistics from ps
        ps_cmd = self.monitorBase % (stepPID, stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(ps_cmd)

        ps_output = stdout.split()
        if not len(ps_output) > 6:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % ps_output
            msg += "command = %s\n" % ps_cmd
            logging.error(msg)
            return

        # run the command to gather PSS memory statistics from /proc/<pid>/smaps
        smaps_cmd = self.pssMemoryCommand % (stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(smaps_cmd)

        smaps_output = stdout.split()
        if not len(smaps_output) == 1:
            # Then something went wrong in getting the smaps data
            msg = "Error when grabbing output from smaps\n"
            msg += "output = %s\n" % smaps_output
            msg += "command = %s\n" % smaps_cmd
            logging.error(msg)
            return

        # smaps also returns data in kiloBytes, let's make it megaBytes
        # I'm also confused with these megabytes and mebibytes...
        pss = int(smaps_output[0]) // 1000

        logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s", smaps_output[0], ps_output[2], ps_output[3], ps_output[4])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxPSS is not None and pss >= self.maxPSS:
            msg += "Job has exceeded maxPSS: %s MB\n" % self.maxPSS
            msg += "Job has PSS: %s MB\n" % pss
            killProc = True
            reason = 'PSS'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                # the original message wrongly said "soft timeout" here
                msg += "Job exceeded hard timeout"

        if not killProc:
            # then job is behaving well, there is nothing to do
            return

        # make sure we persist the performance error only once
        if not self.killRetry:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += "Error: %s" % str(ex)
                logging.exception(msg2)

        try:
            # first attempt is a soft kill (SIGUSR2); a retry or a hard
            # timeout escalates straight to SIGTERM
            if not killHard and not self.killRetry:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
        finally:
            self.killRetry = True

        return
# Example #22 (score: 0)
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and
    recording data regarding the current step
    """

    def __init__(self):
        """
        Set up placeholder attributes; the actual configuration
        happens in initMonitor().
        """
        self.pid = None
        self.uid = os.getuid()
        # ps is restricted to the step PID; the grep keeps only that PID's line
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww |grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None

        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []

        self.maxRSS = None
        self.maxVSize = None
        self.logPath = None

        self.watchStepTypes = []

        self.disableStep = False

        WMRuntimeMonitor.__init__(self)

        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Pull the memory limits and the list of step types to watch
        out of the args dictionary.
        """
        # args=None avoids the shared mutable default argument pitfall
        args = args or {}

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)

        self.logPath = os.path.join(logPath)

        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None

        if not self.stepHelper.stepType() in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s", self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False

        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Clear the step bookkeeping so periodicUpdate() stops monitoring
        once the step has finished.
        """
        if self.disableStep:
            # Step was never monitored, so there is nothing to reset.
            # NOTE(review): the original tested `not self.disableStep`, which
            # skipped the reset exactly for monitored steps and left
            # periodicUpdate() pointing at a finished step.
            return

        self.currentStepName = None
        self.currentStepSpace = None

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Samples RSS/VSize/CPU via ps for the current step's process and
        kills the step (SIGUSR2 first, SIGTERM as a fallback) when it
        exceeds the configured memory limits.
        """
        killProc = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            # report the command actually executed (self.monitorCommand is never set)
            msg += "command = %s\n" % cmd
            logging.error(msg)
            return
        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s", output[2], output[3],
                     output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in DashboardMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError", exitCode=99900,
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                # soft kill first; fall back to SIGTERM if SIGUSR2 fails
                logging.error("Attempting to kill job using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            except Exception:
                os.kill(stepPID, signal.SIGTERM)

        return
# Example #23 (score: 0)
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and
    recording data regarding the current step
    """

    def __init__(self):
        """
        Set up placeholder attributes; the actual configuration
        happens in initMonitor().
        """
        self.pid = None
        self.uid = os.getuid()
        # ps is restricted to the step PID; the grep keeps only that PID's line
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww | grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None

        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []

        self.numOfCores = None  # populated in initMonitor()
        self.maxRSS = None
        self.maxVSize = None
        self.softTimeout = None
        self.hardTimeout = None
        self.logPath = None
        self.startTime = None

        self.watchStepTypes = []

        self.disableStep = False

        WMRuntimeMonitor.__init__(self)

        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Pull the monitoring thresholds (memory and wallclock limits)
        and the list of step types to watch out of the args dictionary.
        """
        args = args or {}

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)
        self.numOfCores = args.get('cores', None)

        self.logPath = os.path.join(logPath)

        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()

        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None

        if not self.stepHelper.stepType() in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s", self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False

        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Clear the step bookkeeping so periodicUpdate() stops monitoring
        once the step has finished.
        """
        if self.disableStep:
            # Step was never monitored, so there is nothing to reset.
            # NOTE(review): the original tested `not self.disableStep`, which
            # skipped the reset exactly for monitored steps and left
            # periodicUpdate() pointing at a finished step.
            return

        self.currentStepName = None
        self.currentStepSpace = None

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Samples RSS/VSize/CPU via ps for the current step's process and
        kills the step (soft SIGUSR2 first, SIGTERM on hard timeout or
        as a fallback) when it exceeds the memory or wallclock limits.
        """
        killProc = False
        killHard = False
        reason = ''
        # reason -> WMCore performance-kill exit code
        errorCodeLookup = {'RSS': 50660,
                           'VSZ': 50661,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            # report the command actually executed (self.monitorCommand is never set)
            msg += "command = %s\n" % cmd
            logging.error(msg)
            return
        # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
        if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
            # then workload value is still in KiB (old way)
            rss = int(output[2])
            vsize = int(output[3])
        else:
            rss = int(output[2]) // 1024  # convert it to MiB
            vsize = int(output[3]) // 1024  # convert it to MiB
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s", output[2], output[3],
                     output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
            reason = 'RSS'
        elif self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True
            reason = 'VSZ'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                # the original message wrongly said "soft timeout" here
                msg += "Job exceeded hard timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                # soft kill unless the hard timeout was breached;
                # fall back to SIGTERM if SIGUSR2 fails
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return