def __checkComp(self, comp, starved, stagnant, threshold):
    """
    Run the input, output and threshold checks for one component.

    The checks run in that order and stop after the first one which
    reports a problem list; that list is added (with `+=`) to `starved`,
    `stagnant` or `threshold` respectively.  A check which raises an
    exception is logged and is not treated as a problem, so the next
    check still runs.  Nothing is returned.
    """
    checks = (
        ('inputFields', starved, ' inputs: '),
        ('outputFields', stagnant, ' outputs: '),
        ('thresholdFields', threshold, ' thresholds: '),
    )
    for fieldAttr, bucket, label in checks:
        try:
            found = comp.checkList(getattr(comp, fieldAttr))
            if found is not None:
                bucket += found
                # the first reported problem ends the sequence
                break
        except Exception:
            self.__log.error(str(comp) + label + exc_string())
def monitorLoop(self):
    """
    Monitor components to ensure they're still alive

    Loops until `self.__monitoring` is cleared.  Each pass polls the
    clients, reports a changed component count on stderr (unless quiet),
    restarts or returns every runset reported in an error state, and
    then sleeps for one second.
    """
    previous = 0
    self.__monitoring = True
    while self.__monitoring:
        try:
            current = self.monitorClients(self.__log)
        except:
            self.__log.error("Monitoring clients: " + exc_string())
            # reuse the last count so that no change is reported
            current = previous

        if current != previous and not self.__quiet:
            print >>sys.stderr, "%d bins, %d comps" % \
                (self.numUnused(), current)
        previous = current

        for rs in self.getRunsetsInErrorState():
            self.__log.error("Returning runset#%d (state=%s)" %
                             (rs.id(), rs.state()))
            try:
                if self.__forceRestart:
                    recover = self.restartRunset
                else:
                    recover = self.returnRunset
                recover(rs, self.__log)
            except:
                self.__log.error("Failed to return %s: %s" %
                                 (rs, exc_string()))

        time.sleep(1)
def run(self):
    """
    Run the watchdog check once and record the outcome.

    On success `healthy` holds the value returned by `realWatch()` and
    `done` is set to True.  If anything raises an Exception it is logged
    and `error` is set to True instead.
    """
    try:
        verdict = self.__watchdog.realWatch()
        self.healthy = verdict
        self.done = True
    except Exception:
        self.__log.error("Exception in run watchdog: %s" % exc_string())
        self.error = True
def __init__(self, daqLog, moniPath, IDs, shortNameOf, daqIDof, rpcAddrOf,
             mbeanPortOf, moniType, quiet=False):
    """
    Create the monitoring data object and monitoring thread for every
    component in `IDs` whose MBean port is greater than zero.

    daqLog - logger used for progress and error messages
    moniPath - directory argument passed to DAQMoni.fileName()
    IDs - iterable of component IDs
    shortNameOf, daqIDof, rpcAddrOf, mbeanPortOf - mappings keyed by ID
    moniType - DAQMoni.TYPE_LIVE, DAQMoni.TYPE_FILE, or anything else
               (which selects createBothData)
    quiet - passed on to every MoniThread

    A component whose file-based monitoring object cannot be created is
    logged and skipped.
    """
    self.__log = daqLog
    self.__quiet = quiet
    self.__moniList = {}
    self.__threadList = {}

    for c in IDs:
        if not mbeanPortOf[c] > 0:
            # no MBean port, nothing to monitor
            continue

        if moniType == DAQMoni.TYPE_LIVE:
            data = self.createLiveData(shortNameOf[c], daqIDof[c],
                                       rpcAddrOf[c], mbeanPortOf[c])
        else:
            fname = DAQMoni.fileName(moniPath, shortNameOf[c], daqIDof[c])
            self.__log.info(
                "Creating moni output file %s (remote is %s:%d)" %
                (fname, rpcAddrOf[c], mbeanPortOf[c]))
            try:
                if moniType == DAQMoni.TYPE_FILE:
                    builder = self.createFileData
                else:
                    builder = self.createBothData
                data = builder(shortNameOf[c], daqIDof[c], rpcAddrOf[c],
                               mbeanPortOf[c], fname)
            except Exception:
                self.__log.error(
                    "Couldn't create monitoring output (%s) for"
                    " component %d!: %s" % (fname, c, exc_string()))
                continue

        self.__moniList[c] = data
        self.__threadList[c] = MoniThread(data, self.__log, self.__quiet)
def unhealthyRecord(self, value):
    """
    Build the UnhealthyRecord describing a failed check.

    If `value` is an Exception other than a TaskException, the message
    is built from exc_string() (presumably the exception currently being
    handled -- confirm against its definition); otherwise the message
    quotes `value` itself.  The record is ordered by the component's
    order().
    """
    unexpected = isinstance(value, Exception) and \
        not isinstance(value, TaskException)
    if unexpected:
        detail = "%s: %s" % (str(self), exc_string())
    else:
        detail = "%s (value=%s)" % (str(self), str(value))
    return UnhealthyRecord(detail, self.__comp.order())
def startRun(self, runSet, runNum, runOptions, logDir=None):
    """
    Start run `runNum` on `runSet` and watch for leaking file descriptors.

    runSet - runset to start
    runNum - run number
    runOptions - options passed through to runSet.startRun()
    logDir - log directory; the server's default is used when None

    The open file count is sampled before the run is started (0 if it
    cannot be counted).  The first sample becomes the baseline; a later,
    larger sample is reported through runSet.logToDash() and becomes the
    new baseline.
    """
    if logDir is None:
        logDir = self.__defaultLogDir

    try:
        nowOpen = self.__countFileDescriptors()
    except:
        self.__log.error("Cannot count open files: %s" % exc_string())
        nowOpen = 0

    runSet.startRun(runNum, self.getClusterConfig().configName(),
                    runOptions, get_version_info(SVN_ID), self.__spadeDir,
                    copyDir=self.__copyDir, logDir=logDir,
                    quiet=self.__quiet)

    if self.__openFileCount is None:
        self.__openFileCount = nowOpen
    elif nowOpen > self.__openFileCount:
        warning = "WARNING: Possible file leak; open file count" + \
            " increased from %d to %d" % (self.__openFileCount, nowOpen)
        runSet.logToDash(warning)
        self.__openFileCount = nowOpen
def startRun(self, runNum):
    """
    Start component processing DAQ data

    Forwards `runNum` to the component's remote `startRun` call.

    Returns the value of the remote call, or None if the call raised an
    Exception (which is logged).  KeyboardInterrupt and SystemExit are
    deliberately not caught so the process can still be stopped.
    """
    try:
        return self.__client.xmlrpc.startRun(runNum)
    except Exception:
        self.__log.error(exc_string())
        return None
def stopRun(self):
    """
    Stop component processing DAQ data

    Returns the value of the component's remote `stopRun` call, or None
    if the call raised an Exception (which is logged).  KeyboardInterrupt
    and SystemExit are deliberately not caught.
    """
    try:
        return self.__client.xmlrpc.stopRun()
    except Exception:
        self.__log.error(exc_string())
        return None
def breakRunset(self, runSet):
    """
    Stop `runSet` if necessary, then restart or return it.

    A runset which is not ready is stopped first; the value returned by
    stopRun() is taken as the "had an error" flag.  The runset is
    restarted when restarts are forced, or when stopping reported an
    error and restart-on-error is enabled; otherwise it is returned.
    Failures in either step are logged, not raised.
    """
    stopFailed = False
    if not runSet.isReady():
        try:
            stopFailed = runSet.stopRun()
        except:
            self.__log.error("While breaking %s: %s" %
                             (runSet, exc_string()))

    try:
        mustRestart = self.__forceRestart or \
            (stopFailed and self.__restartOnError)
        if mustRestart:
            self.restartRunset(runSet, self.__log)
        else:
            self.returnRunset(runSet, self.__log)
    except:
        self.__log.error("Failed to break %s: %s" % (runSet, exc_string()))
def startSubrun(self, data):
    """
    Send subrun data to stringHubs

    Forwards `data` to the component's remote `startSubrun` call.

    Returns the value of the remote call, or None if the call raised an
    Exception (which is logged).  KeyboardInterrupt and SystemExit are
    deliberately not caught.
    """
    try:
        return self.__client.xmlrpc.startSubrun(data)
    except Exception:
        self.__log.error(exc_string())
        return None
def prepareSubrun(self, subrunNum):
    """
    Start marking events as bogus in preparation for subrun

    Forwards `subrunNum` to the component's remote `prepareSubrun` call.

    Returns the value of the remote call, or None if the call raised an
    Exception (which is logged).  KeyboardInterrupt and SystemExit are
    deliberately not caught.
    """
    try:
        return self.__client.xmlrpc.prepareSubrun(subrunNum)
    except Exception:
        self.__log.error(exc_string())
        return None
def queueForSpade(logger, spadeDir, copyDir, runDir, runNum, runTime,
                  runDuration):
    """
    Bundle the logs of one run and queue them for SPADE.

    logger - object with info() and error() methods
    spadeDir - directory which receives the tar file and semaphore
    copyDir - optional directory which receives a copy of the tar file
    runDir - directory holding the run's files
    runNum, runTime, runDuration - used to build the base file name

    Returns None in all cases.  If `runDir` or `spadeDir` is missing
    this is logged at info level and nothing is queued; any failure
    while queueing is logged as an error.
    """
    required = (
        (runDir, "Run directory \"%s\" does not exist"),
        (spadeDir, "SPADE directory \"%s\" does not exist"),
    )
    for path, complaint in required:
        if path is None or not os.path.exists(path):
            logger.info(complaint % path)
            return

    try:
        stamp = (runNum, runTime.year, runTime.month, runTime.day,
                 runTime.hour, runTime.minute, runTime.second, runDuration)
        spadeBaseName = \
            "SPS-pDAQ-run-%03d_%04d%02d%02d_%02d%02d%02d_%06d" % stamp

        tarFile = __writeSpadeTarFile(spadeDir, spadeBaseName, runDir)
        if copyDir is not None and os.path.exists(copyDir):
            __copySpadeTarFile(logger, copyDir, spadeBaseName, tarFile)
        __writeSpadeSemaphore(spadeDir, spadeBaseName)
        __indicate_daq_logs_queued(spadeDir)

        logger.info(("Queued data for SPADE (spadeDir=%s" +
                     ", runDir=%s, runNum=%s)...") %
                    (spadeDir, runDir, runNum))
    except:
        logger.error("FAILED to queue data for SPADE: " + exc_string())
def forcedStop(self):
    """
    Force component to stop running

    Returns the value of the component's remote `forcedStop` call, or
    None if the call raised an Exception (which is logged).
    KeyboardInterrupt and SystemExit are deliberately not caught.
    """
    try:
        return self.__client.xmlrpc.forcedStop()
    except Exception:
        self.__log.error(exc_string())
        return None
def getNonstoppedConnectorsString(self):
    """
    Return string describing states of all connectors which have not
    yet stopped

    Connectors whose state is 'idle' are left out.  The result looks
    like '[type:state, type:state]', or is an empty string when every
    connector is idle or the state list could not be fetched (the
    failure is logged).
    """
    try:
        connStates = self.__client.xmlrpc.listConnectorStates()
    except:
        self.__log.error(exc_string())
        connStates = []

    busy = ['%s:%s' % (cs["type"], cs["state"])
            for cs in connStates if cs["state"] != 'idle']
    if not busy:
        return ''
    return '[' + ', '.join(busy) + ']'
def monitorClients(self, logger=None):
    """
    check that all components in the pool are still alive

    Every client in every bin is polled with monitor().  A client which
    reports DAQClient.STATE_DEAD is removed from the pool and closed
    (a failing close() is reported to `logger` when one is supplied).

    Returns the number of clients which are neither dead nor missing.
    """
    count = 0
    # walk snapshots: self.remove() is called from inside the loop and
    # presumably edits the pool/bin being walked, which would otherwise
    # make the iteration skip the client after a removed one
    for k in list(self.__pool.keys()):
        try:
            clients = list(self.__pool[k])
        except KeyError:
            # bin may have been removed by daemon
            continue

        for c in clients:
            state = c.monitor()
            if state == DAQClient.STATE_DEAD:
                self.remove(c)
                try:
                    c.close()
                except Exception:
                    if logger is not None:
                        logger.error("Could not close %s: %s" %
                                     (c.fullName(), exc_string()))
            elif state != DAQClient.STATE_MISSING:
                count += 1

    return count
def commitSubrun(self, subrunNum, latestTime):
    """
    Start marking events with the subrun number

    Forwards `subrunNum` and `latestTime` to the component's remote
    `commitSubrun` call.

    Returns the value of the remote call, or None if the call raised an
    Exception (which is logged).  KeyboardInterrupt and SystemExit are
    deliberately not caught.
    """
    try:
        return self.__client.xmlrpc.commitSubrun(subrunNum, latestTime)
    except Exception:
        self.__log.error(exc_string())
        return None
def unhealthyRecord(self, value):
    """
    Build the UnhealthyRecord describing a failed check.

    If `value` is an Exception other than a TaskException, the message
    is built from exc_string() (presumably the exception currently being
    handled -- confirm against its definition); otherwise the message
    reports that the value is not changing from the previous value.
    """
    unexpected = isinstance(value, Exception) and \
        not isinstance(value, TaskException)
    if unexpected:
        detail = "%s: %s" % (str(self), exc_string())
    else:
        detail = "%s not changing from %s" % \
            (str(self), str(self.__prevValue))
    return UnhealthyRecord(detail, self.__order)
def _write(self, fd, time, msg):
    """
    Send one formatted log message through `fd`.

    fd - object with a send() method
    time - timestamp handed to the formatter
    msg - message text; a unicode message is converted with str() first

    Messages beginning with 'Start of log at ' are dropped.  If sending
    fails with a socket error, the message is printed on stderr instead.
    """
    if type(msg) is unicode:
        msg = str(msg)

    if msg.startswith('Start of log at '):
        return

    try:
        fd.send(self.__fmt.format('log', time, msg, self.__prio))
    except socket.error:
        print >> sys.stderr, "%s (Cannot send: %s)" % (msg, exc_string())
def run(self):
    """
    Collect one round of monitoring data.

    `done` is False while the monitoring call is in progress and True
    afterwards.  An Exception raised by the monitoring call is logged
    and otherwise ignored.
    """
    self.done = False
    try:
        self.__moniData.monitor(self.now)
    except Exception:
        complaint = "Ignoring %s: %s" % (str(self.__moniData), exc_string())
        self.__log.error(complaint)
    self.done = True
def configure(self, configName=None):
    """
    Configure this component

    configName - configuration name; when it is None or otherwise
                 false, the remote `configure` call is made without
                 arguments

    Returns the value of the remote call, or None if the call raised an
    Exception (which is logged).  KeyboardInterrupt and SystemExit are
    deliberately not caught.
    """
    try:
        if not configName:
            return self.__client.xmlrpc.configure()
        return self.__client.xmlrpc.configure(configName)
    except Exception:
        self.__log.error(exc_string())
        return None
def events(self, subrunNumber):
    """
    Get the number of events in the specified subrun

    Returns the value of the component's remote `getEvents` call.  When
    that value is a string, its last character is dropped and the rest
    is converted with long() (presumably the remote side appends a type
    suffix -- confirm against the component).  Returns None if anything
    raised an Exception (which is logged); KeyboardInterrupt and
    SystemExit are deliberately not caught.
    """
    try:
        evts = self.__client.xmlrpc.getEvents(subrunNumber)
        if isinstance(evts, str):
            evts = long(evts[:-1])
        return evts
    except Exception:
        self.__log.error(exc_string())
        return None
def run(self):
    """
    Main method for thread

    Runs the operation and sets the error flag if it raises.  A socket
    error is recorded silently; any other failure is logged first.
    """
    failed = False
    try:
        self.__runOperation()
    except socket.error:
        failed = True
    except:
        self.__log.error("%s(%s): %s" % (str(self.__operation),
                                         str(self.__comp), exc_string()))
        failed = True

    if failed:
        self.__error = True
def check(self, starved, stagnant, threshold):
    """
    Check the component's input, output and threshold beans.

    Problem entries are added (with `+=`) to `starved`, `stagnant` and
    `threshold` respectively.  A check which raises is logged and
    counts as a failure.

    Returns True only if no check found a problem or raised.
    """
    inputsOK = True
    try:
        found = self.__checkBeans(self.__inputFields)
        if found is not None:
            starved += found
            inputsOK = False
    except:
        self.__dashlog.error(self.__comp.fullName() + " inputs: " +
                             exc_string())
        inputsOK = False

    # don't report output problems if there are input problems
    outputsOK = inputsOK
    if inputsOK:
        try:
            found = self.__checkBeans(self.__outputFields)
            if found is not None:
                stagnant += found
                outputsOK = False
        except:
            self.__dashlog.error(self.__comp.fullName() + " outputs: " +
                                 exc_string())
            outputsOK = False

    # report threshold problems even if there are other problems
    thresholdsOK = True
    try:
        found = self.__checkBeans(self.__thresholdFields)
        if found is not None:
            threshold += found
            thresholdsOK = False
    except:
        self.__dashlog.error(self.__comp.fullName() + " thresholds: " +
                             exc_string())
        thresholdsOK = False

    return outputsOK and thresholdsOK
def _run(self):
    """
    Collect the active and total channel counts from every source
    component.

    Each source component is asked for its stringhub
    "NumberOfActiveAndTotalChannels" bean; the two values are added to
    the running totals and, when details are wanted, stored per hub
    number.  A component whose bean cannot be fetched or parsed is
    logged and skipped.

    NOTE(review): the totals are accumulated but not used anywhere in
    the code visible here -- the remainder of this method may lie
    outside this excerpt.
    """
    activeTotal = 0
    total = 0
    hubActiveDoms = 0
    hubTotalDoms = 0
    hubDOMs = {}
    hubInactiveDOMs = {}

    for c in self.__comps:
        if not c.isSource():
            continue

        # collect the number of active and total channels
        try:
            nList = c.getSingleBeanField(
                "stringhub", "NumberOfActiveAndTotalChannels")
        except Exception as e:
            self.__dashlog.error(
                "Cannot get # active and total DOMS from" +
                " %s: %s" % (c.fullName(), exc_string()))
            print("Exception: ")
            print(e)
            continue

        try:
            hubActiveDoms, hubTotalDoms = [int(val) for val in nList]
        except:
            self.__dashlog.error("Cannot get # active DOMS from" +
                                 " %s string: %s" %
                                 (c.fullName(), exc_string()))
            continue

        activeTotal += hubActiveDoms
        total += hubTotalDoms

        if self.__sendDetails:
            hubDOMs[str(c.num())] = (hubActiveDoms, hubTotalDoms)
def getClusterConfig(self):
    """
    Return the active cluster configuration.

    The configuration is loaded through
    DAQConfigParser.getClusterConfiguration() using this server's
    cluster description and run configuration directory.

    Raises CnCServerException if the configuration file cannot be found.
    """
    cdesc = self.__clusterDesc
    cfgDir = self.__runConfigDir
    try:
        return DAQConfigParser.getClusterConfiguration(
            None, useActiveConfig=True, clusterDesc=cdesc,
            configDir=cfgDir)
    except XMLFileNotFound:
        if cdesc is None:
            cdescStr = ""
        else:
            cdescStr = " for cluster \"%s\"" % (cdesc,)
        # cdescStr carries its own leading space, so the format must not
        # add another one (the message used to read "configuration  for"
        # or "configuration : ...")
        raise CnCServerException("Cannot find cluster configuration%s: %s" %
                                 (cdescStr, exc_string()))
def run(self):
    """
    Start a server

    Logs the version banner, starts the monitoring loop in a daemon
    thread named "CnCServer", tries to start the I3Live thread (a
    failure is logged and otherwise ignored) and then serves requests
    until the server stops.
    """
    banner = ("%(filename)s %(revision)s %(date)s %(time)s" +
              " %(author)s %(release)s %(repo_rev)s") % self.__versionInfo
    self.__log.info(banner)

    monitor = threading.Thread(name="CnCServer", target=self.monitorLoop)
    monitor.setDaemon(True)
    monitor.start()

    try:
        self.__live = self.startLiveThread()
    except:
        self.__log.error("Cannot start I3Live thread: " + exc_string())

    self.__server.serve_forever()
def makeRunsetFromRunConfig(self, runConfig, timeout=REGISTRATION_TIMEOUT,
                            strict=True):
    """
    Build a runset from the named run configuration.

    runConfig - run configuration name
    timeout - passed through to makeRunset()
    strict - passed through to makeRunset()

    Returns the new runset, or None if it could not be built (the
    failure is logged).
    """
    try:
        return self.makeRunset(self.__runConfigDir, runConfig, timeout,
                               self.__log,
                               forceRestart=self.__forceRestart,
                               strict=strict)
    except:
        self.__log.error("While making runset from \"%s\": %s" %
                         (runConfig, exc_string()))
        return None
def state(self):
    """
    Get current state

    Returns the state reported by the component.  When no state can be
    obtained the dead counter is incremented and
    DAQClient.STATE_MISSING is returned while the counter is below 3,
    DAQClient.STATE_DEAD after that.  Socket errors are not logged;
    other failures are.
    """
    try:
        current = self.__client.xmlrpc.getState()
    except socket.error:
        current = None
    except:
        self.__log.error(exc_string())
        current = None

    if current:
        return current

    self.__deadCount += 1
    if self.__deadCount < 3:
        return DAQClient.STATE_MISSING
    return DAQClient.STATE_DEAD
def restartRunset(self, rs, logger, verbose=False, killWith9=False,
                  eventCheck=False):
    """
    Remove runset `rs`, restart its components and destroy it.

    rs - runset to restart
    logger - object with an error() method
    verbose, killWith9, eventCheck - passed through to
        restartRunsetComponents()

    Failures while removing or restarting are logged to `logger`; the
    runset is destroyed (ignoring its components) in every case.
    """
    try:
        self.__removeRunset(rs)
    except ValueError:
        logger.error("Cannot remove %s (#%d available - %s)" %
                     (rs, len(self.__sets), self.__sets))

    options = dict(verbose=verbose, killWith9=killWith9,
                   eventCheck=eventCheck)
    try:
        self.restartRunsetComponents(rs, **options)
    except:
        logger.error("Cannot restart %s (#%d available - %s): %s" %
                     (rs, len(self.__sets), self.__sets, exc_string()))

    rs.destroy(ignoreComponents=True)
def __run(self):
    """
    Main loop of the task manager.

    Until `self.__running` is cleared, every task's check() is called
    and the loop then waits on the flag for the shortest delay any task
    asked for (never more than CnCTask.MAX_TASK_SECS).  A task whose
    check() raises is logged (when a dash log is available) and treated
    as asking for the maximum delay.  Once the loop ends, every task is
    closed.
    """
    self.__running = True
    while self.__running:
        waitSecs = CnCTask.MAX_TASK_SECS
        for task in self.__tasks:
            try:
                taskSecs = task.check()
            except:
                if self.__dashlog is not None:
                    self.__dashlog.error("%s exception: %s" %
                                         (str(task), exc_string()))
                taskSecs = CnCTask.MAX_TASK_SECS
            waitSecs = min(waitSecs, taskSecs)

        self.__flag.acquire()
        try:
            self.__flag.wait(waitSecs)
        finally:
            self.__flag.release()

    for task in self.__tasks:
        task.close()
def wu_get_thread_name(): return current_thread().name r = tp.enqueue(rq, wu_get_thread_name, (), {}).wait() assert r == "TP/1:2" class FooException(Exception): pass def wu_failed(): raise FooException("fails") try: tp.enqueue(rq, wu_failed, (), {}).wait() except FooException as e: assert exc_string().startswith("FooException(\"fails\") in wu_failed() (thread_pool.py:256)") else: assert False sleep(0.5) with expected(WorkUnitTimedOut("request deadline waiting for a work unit")): tp.enqueue(rq, wu_skip, (), {}).wait() finally: RegisteredResourcePool.stop_pools() print("ok") ###################################