def sendEventRangeToAthenaMP(self, eventRanges):
    """Put event ranges, or the "No more events" marker, on the input queue.

    If ``"No more events" in eventRanges`` is true the argument is queued
    unchanged. Otherwise a single event range is wrapped in a list, the
    list is JSON-encoded and queued, the queued-events counter is bumped
    once per call, and every contained event range is marked as
    ``'processing'`` in the status table.

    In both cases the AthenaMP "ready" flag is cleared afterwards.

    :param eventRanges: the "No more events" marker, one event range
        (a mapping with an ``'eventRangeID'`` key), or a list of them.
    """
    # block_sig/unblock_sig are defined elsewhere in this file; presumably
    # they mask/unmask delivery of the signal -- TODO confirm.
    block_sig(signal.SIGTERM)
    try:
        if "No more events" in eventRanges:
            self.__log.debug("Rank %s: sendEventRangeToAthenaMP: %s" % (self.__rank, eventRanges))
            self.__messageInQueue.put(eventRanges)
        else:
            if not isinstance(eventRanges, list):
                eventRanges = [eventRanges]
            eventRangeFormat = json.dumps(eventRanges)
            self.__log.debug("Rank %s: sendEventRangeToAthenaMP: %s" % (self.__rank, eventRangeFormat))
            self.__messageInQueue.put(eventRangeFormat)
            self.__totalQueuedEvents += 1

            for eventRange in eventRanges:
                eventRangeID = eventRange['eventRangeID']
                self.__eventRangesStatus[eventRangeID]['status'] = 'processing'

        self.__athenaMP_isReady = False
    finally:
        # Always undo the block, even if queuing or the status update
        # raised (e.g. KeyError for an unknown event range ID).
        unblock_sig(signal.SIGTERM)
def handleMessage(self):
    """Consume at most one message from the message queue without waiting.

    Recognised messages:

    * contains ``"Ready for events"``: sets the AthenaMP ready flag and
      increments the need-events counter;
    * starts with ``"/"``: treated as an output record; the event range ID
      is taken from the third comma-separated field counted from the end
      and the range is marked ``'finished'``;
    * starts with ``"ERR"``: parsed with ``extractErrorMessage``; when an
      event range ID is returned the range is marked ``'failed'``.

    :return: ``False`` if the queue was empty, ``True`` if a message was
        taken from the queue (whether or not it could be interpreted).
    """
    # block_sig/unblock_sig are defined elsewhere in this file; presumably
    # they mask/unmask delivery of the signal -- TODO confirm.
    block_sig(signal.SIGTERM)
    try:
        try:
            message = self.__messageQueue.get(False)
        except Queue.Empty:
            return False

        if self.__readyForEventTime is None:
            self.__readyForEventTime = time.time()
        self.__log.debug("Rank %s: Received message: %s" % (self.__rank, message))

        if "Ready for events" in message:
            self.__athenaMP_isReady = True
            self.__athenaMP_needEvents += 1
        elif message.startswith("/"):
            self.__totalProcessedEvents += 1
            self.__numOutputs += 1
            try:
                # Strip the longer "ID: " prefix first; stripping "ID:"
                # first would leave a leading blank in the ID and make the
                # second replace a no-op.
                eventRangeID = message.split(',')[-3].replace("ID: ", "").replace("ID:", "")
                self.__eventRangesStatus[eventRangeID]['status'] = 'finished'
                self.__eventRangesStatus[eventRangeID]['output'] = message
                self.__outputMessage.append((eventRangeID, 'finished', message))
            except Exception as e:
                self.__log.warning("Rank %s: output message format is not recognized: %s " % (self.__rank, message))
                self.__log.warning("Rank %s: %s" % (self.__rank, str(e)))
        elif message.startswith('ERR'):
            self.__log.error("Rank %s: Received an error message: %s" % (self.__rank, message))
            error_acronym, eventRangeID, error_diagnostics = self.extractErrorMessage(message)
            if eventRangeID != "":
                try:
                    self.__log.error("Rank %s: !!WARNING!!2144!! Extracted error acronym %s and error diagnostics \'%s\' for event range %s" % (self.__rank, error_acronym, error_diagnostics, eventRangeID))
                    self.__eventRangesStatus[eventRangeID]['status'] = 'failed'
                    self.__eventRangesStatus[eventRangeID]['output'] = message
                    self.__outputMessage.append((eventRangeID, error_acronym, message))
                except Exception as e:
                    self.__log.warning("Rank %s: output message format is not recognized: %s " % (self.__rank, message))
                    self.__log.warning("Rank %s: %s" % (self.__rank, str(e)))

        # A message was consumed: report it so that drain loops of the
        # form ``while self.handleMessage(): pass`` keep going.
        return True
    finally:
        # Runs on every exit path, including the early return above.
        unblock_sig(signal.SIGTERM)
def flushMessages(self):
    """Tell AthenaMP there are no more events, then drain pending messages.

    While ``isReady()`` is true, "No more events" is sent through
    ``sendEventRangeToAthenaMP``. Afterwards ``handleMessage`` is called
    repeatedly until it returns a false value.
    """
    # block_sig/unblock_sig are defined elsewhere in this file; presumably
    # they mask/unmask delivery of the signal -- TODO confirm.
    block_sig(signal.SIGTERM)
    try:
        self.__log.info("Rank %s: ESJobManager flush messages" % self.__rank)
        while self.isReady():
            self.__log.info("Rank %s: AthenaMP is ready, send 'No more events' to it." % self.__rank)
            self.sendEventRangeToAthenaMP("No more events")
        while self.handleMessage():
            pass
    finally:
        # Do not leave SIGTERM blocked if sending or handling raised.
        unblock_sig(signal.SIGTERM)
def stop(self, signum=None, frame=None):
    """Perform the final dump and post-exec step when asked to stop.

    The signature matches a ``signal.signal`` handler, but neither
    ``signum`` nor ``frame`` is used: SIGTERM is always the signal that is
    blocked for the duration of the cleanup.

    :param signum: signal number supplied by the signal machinery (unused).
    :param frame: interrupted stack frame (unused).
    """
    self.tmpLog.info('stop signal received')
    # block_sig/unblock_sig are defined elsewhere in this file; presumably
    # they mask/unmask delivery of the signal -- TODO confirm.
    block_sig(signal.SIGTERM)
    try:
        self.tmpLog.info('final dumping')
        self.updateEventRangesToDB(force=True, final=True)

        self.tmpLog.info("post Exec job")
        self.postExecJob()
        self.tmpLog.info('stop')
    finally:
        # Undo the block even when the dump or post-exec step raised.
        unblock_sig(signal.SIGTERM)
def stop(self, signum=None, frame=None):
    """Terminate the ES job manager, flush its messages and finalise outputs.

    The signature matches a ``signal.signal`` handler, but neither
    ``signum`` nor ``frame`` is used: SIGTERM is always the signal that is
    blocked for the duration of the cleanup.

    :param signum: signal number supplied by the signal machinery (unused).
    :param frame: interrupted stack frame (unused).
    """
    self.__tmpLog.info('Rank %s: stop signal received' % self.__rank)
    # block_sig/unblock_sig are defined elsewhere in this file; presumably
    # they mask/unmask delivery of the signal -- TODO confirm.
    block_sig(signal.SIGTERM)
    try:
        self.__esJobManager.terminate()
        self.__esJobManager.flushMessages()
        self.updateOutputs(signal=True, final=True)

        self.__tmpLog.info("Rank %s: post exec job" % self.__rank)
        self.postExecJob()

        self.__tmpLog.info('Rank %s: stop' % self.__rank)
    finally:
        # Undo the block even when one of the cleanup steps raised.
        unblock_sig(signal.SIGTERM)
def handleMessage(self):
    """Take one message off the message queue, if any, and act on it.

    The queue is polled without blocking. A message containing
    ``"Ready for events"`` marks AthenaMP as ready and bumps the
    need-events counter. A message starting with ``"/"`` is recorded as
    the output of a finished event range (the ID is the third
    comma-separated field from the end). A message starting with ``"ERR"``
    is passed to ``extractErrorMessage`` and, if that yields an event
    range ID, the range is recorded as failed.

    :return: ``True`` when a message was consumed, ``False`` when the
        queue was empty.
    """
    # block_sig/unblock_sig are defined elsewhere in this file; presumably
    # they mask/unmask delivery of the signal -- TODO confirm.
    block_sig(signal.SIGTERM)
    try:
        try:
            message = self.__messageQueue.get(False)
        except Queue.Empty:
            return False

        if self.__readyForEventTime is None:
            self.__readyForEventTime = time.time()
        self.__log.debug("Rank %s: Received message: %s" % (self.__rank, message))

        if "Ready for events" in message:
            self.__athenaMP_isReady = True
            self.__athenaMP_needEvents += 1
        elif message.startswith("/"):
            self.__totalProcessedEvents += 1
            self.__numOutputs += 1
            try:
                # "ID: " must be removed before "ID:"; the other order
                # leaves a leading blank and turns the second call into
                # dead code.
                eventRangeID = message.split(',')[-3].replace("ID: ", "").replace("ID:", "")
                self.__eventRangesStatus[eventRangeID]['status'] = 'finished'
                self.__eventRangesStatus[eventRangeID]['output'] = message
                self.__outputMessage.append((eventRangeID, 'finished', message))
            except Exception as e:
                self.__log.warning("Rank %s: output message format is not recognized: %s " % (self.__rank, message))
                self.__log.warning("Rank %s: %s" % (self.__rank, str(e)))
        elif message.startswith('ERR'):
            self.__log.error("Rank %s: Received an error message: %s" % (self.__rank, message))
            error_acronym, eventRangeID, error_diagnostics = self.extractErrorMessage(message)
            if eventRangeID != "":
                try:
                    self.__log.error("Rank %s: !!WARNING!!2144!! Extracted error acronym %s and error diagnostics \'%s\' for event range %s" % (self.__rank, error_acronym, error_diagnostics, eventRangeID))
                    self.__eventRangesStatus[eventRangeID]['status'] = 'failed'
                    self.__eventRangesStatus[eventRangeID]['output'] = message
                    self.__outputMessage.append((eventRangeID, error_acronym, message))
                except Exception as e:
                    self.__log.warning("Rank %s: output message format is not recognized: %s " % (self.__rank, message))
                    self.__log.warning("Rank %s: %s" % (self.__rank, str(e)))

        # Signal "one message handled" so that callers looping on the
        # return value continue draining the queue.
        return True
    finally:
        # Executed for the empty-queue return as well as the normal path.
        unblock_sig(signal.SIGTERM)
def stop(self, signum=None, frame=None):
    """Dump metrics, close job timestamps, do the final dump and exit.

    For every job in ``jobsTimestamp`` the ``'endTime'`` is set to the
    current time when it is still ``None`` or when the job still has
    entries in ``jobsRuningRanks``. After the final DB update and the
    post-exec step the process exits with status 0.

    The signature matches a ``signal.signal`` handler, but neither
    ``signum`` nor ``frame`` is used: SIGTERM is always the signal that is
    blocked for the duration of the cleanup.

    :param signum: signal number supplied by the signal machinery (unused).
    :param frame: interrupted stack frame (unused).
    :raises SystemExit: always, via ``sys.exit(0)``, once cleanup succeeded.
    """
    self.tmpLog.info('stop signal received')
    # block_sig/unblock_sig are defined elsewhere in this file; presumably
    # they mask/unmask delivery of the signal -- TODO confirm.
    block_sig(signal.SIGTERM)
    try:
        self.dumpJobMetrics()
        for jobId in self.jobsTimestamp:
            if self.jobsTimestamp[jobId]['endTime'] is None:
                self.jobsTimestamp[jobId]['endTime'] = time.time()
            if len(self.jobsRuningRanks[jobId]) > 0:
                self.jobsTimestamp[jobId]['endTime'] = time.time()
        self.dumpJobsStartTime()

        self.tmpLog.info('final dumping')
        self.updateEventRangesToDB(force=True, final=True)

        self.tmpLog.info("post Exec job")
        self.postExecJob()
        self.tmpLog.info('stop')
    finally:
        # Undo the block even when one of the cleanup steps raised.
        unblock_sig(signal.SIGTERM)
    sys.exit(0)
def stop(self, signum=None, frame=None):
    """Dump metrics, close job timestamps and do the final dump.

    For every job in ``jobsTimestamp`` the ``'endTime'`` is set to the
    current time when it is still ``None`` or when the job still has
    entries in ``jobsRuningRanks``.

    :param signum: number of the signal being handled. When the method is
        called directly (``signum`` is ``None``) SIGTERM is used for the
        block/siginterrupt calls, because ``signal.siginterrupt`` does not
        accept ``None``.
    :param frame: interrupted stack frame (unused).
    """
    self.tmpLog.info('stop signal %s received' % signum)
    if signum is None:
        guarded_signal = signal.SIGTERM
    else:
        guarded_signal = signum

    # block_sig/unblock_sig are defined elsewhere in this file; presumably
    # they mask/unmask delivery of the signal -- TODO confirm.
    block_sig(guarded_signal)
    try:
        # Restart system calls interrupted by this signal instead of
        # failing them. This is not switched back afterwards.
        signal.siginterrupt(guarded_signal, False)

        self.dumpJobMetrics()
        for jobId in self.jobsTimestamp:
            if self.jobsTimestamp[jobId]['endTime'] is None:
                self.jobsTimestamp[jobId]['endTime'] = time.time()
            if len(self.jobsRuningRanks[jobId]) > 0:
                self.jobsTimestamp[jobId]['endTime'] = time.time()
        self.dumpJobsStartTime()

        self.tmpLog.info('final dumping')
        self.updateEventRangesToDB(force=True, final=True)

        self.tmpLog.info("post Exec job")
        self.postExecJob()
        self.tmpLog.info('stop')
    finally:
        # Undo the block even when one of the cleanup steps raised.
        unblock_sig(guarded_signal)
def stop(self, signum=None, frame=None):
    """Stop this rank: terminate the ES job manager and finalise outputs.

    Sets the stop flag, terminates the ES job manager (if there is one),
    collects accounting metrics, dumps job metrics, sends a heartbeat,
    flushes the manager's messages, updates outputs and runs the
    post-exec step.

    :param signum: number of the signal being handled. When the method is
        called directly (``signum`` is ``None``) SIGTERM is used for the
        block/siginterrupt calls, because ``signal.siginterrupt`` does not
        accept ``None``.
    :param frame: interrupted stack frame (unused).
    """
    self.__tmpLog.info('Rank %s: stop signal %s received' % (self.__rank, signum))
    self.__stop = True
    if signum is None:
        guarded_signal = signal.SIGTERM
    else:
        guarded_signal = signum

    # block_sig/unblock_sig are defined elsewhere in this file; presumably
    # they mask/unmask delivery of the signal -- TODO confirm.
    block_sig(guarded_signal)
    try:
        # Restart system calls interrupted by this signal instead of
        # failing them. This is not switched back afterwards.
        signal.siginterrupt(guarded_signal, False)

        if self.__esJobManager:
            self.__esJobManager.terminate()
        self.getAccountingMetrics()
        self.dumpJobMetrics()
        self.heartbeat()
        # Same guard as for terminate(): there may be no manager to flush.
        if self.__esJobManager:
            self.__esJobManager.flushMessages()
        self.updateOutputs(signal=True, final=True)

        self.__tmpLog.info("Rank %s: post exec job" % self.__rank)
        self.postExecJob()

        self.__tmpLog.info('Rank %s: stop' % self.__rank)
    finally:
        # Undo the block even when one of the cleanup steps raised.
        unblock_sig(guarded_signal)
error_acronym, eventRangeID, error_diagnostics = self.extractErrorMessage(message) if eventRangeID != "": try: self.__log.error("Rank %s: !!WARNING!!2144!! Extracted error acronym %s and error diagnostics \'%s\' for event range %s" % (self.__rank, error_acronym, error_diagnostics, eventRangeID)) self.__eventRangesStatus[eventRangeID]['status'] = 'failed' self.__eventRangesStatus[eventRangeID]['output'] = message self.__outputMessage.append((eventRangeID, error_acronym, message)) except Exception, e: self.__log.warning("Rank %s: output message format is not recognized: %s " % (self.__rank, message)) self.__log.warning("Rank %s: %s" % (self.__rank, str(e))) if "FATAL" in error_acronym: self.__log.error("Rank %s: !!WARNING!!2146!! A FATAL error was encountered, prepare to finish" % (self.__rank)) self.terminate() else: self.__log.error("Rank %s: Received an unknown message: %s" % (self.__rank, message)) unblock_sig(signal.SIGTERM) return True def findChildProcesses(self,pid): command = "/bin/ps -e --no-headers -o pid -o ppid -o fname" status,output = commands.getstatusoutput(command) #print "ps output: %s" % output pieces = [] result = [] for line in output.split("\n"): pieces= line.split() try: value=int(pieces[1]) except Exception,e: #print "trouble interpreting ps output %s: \n %s" % (e,pieces)