Пример #1
0
    def __checkComp(self, comp, starved, stagnant, threshold):
        """
        Run the input, output and threshold checks for one component.

        The checks run in that order and stop after the first one which
        reports bad entries.  Those entries are added (with an in-place
        `+=`) to the matching argument: `starved` for inputs, `stagnant`
        for outputs, `threshold` for thresholds.  A check which raises an
        exception is logged and the next check is still attempted.
        """
        checks = (('inputFields', starved, ' inputs: '),
                  ('outputFields', stagnant, ' outputs: '),
                  ('thresholdFields', threshold, ' thresholds: '))

        for attrName, bucket, label in checks:
            foundBad = False
            try:
                badEntries = comp.checkList(getattr(comp, attrName))
                if badEntries is not None:
                    bucket += badEntries
                    foundBad = True
            except Exception:
                self.__log.error(str(comp) + label + exc_string())

            if foundBad:
                # later checks are skipped once a problem has been found
                break
Пример #2
0
    def monitorLoop(self):
        """
        Monitor components to ensure they're still alive.

        Runs until `self.__monitoring` is cleared.  Once per second it
        counts the clients via monitorClients(), prints the bin/component
        totals to stderr when the count changed (unless quiet), and then
        restarts or returns every runset listed by
        getRunsetsInErrorState().  Failures in either step are logged and
        the loop keeps going.
        """
        lastCount = 0
        self.__monitoring = True
        while self.__monitoring:
            try:
                count = self.monitorClients(self.__log)
            except Exception:
                # only real errors are absorbed; SystemExit and
                # KeyboardInterrupt must still be able to end the loop
                self.__log.error("Monitoring clients: " + exc_string())
                count = lastCount

            if count != lastCount and not self.__quiet:
                print >>sys.stderr, "%d bins, %d comps" % \
                    (self.numUnused(), count)

            lastCount = count

            for rs in self.getRunsetsInErrorState():
                self.__log.error("Returning runset#%d (state=%s)" %
                                 (rs.id(), rs.state()))
                try:
                    if self.__forceRestart:
                        self.restartRunset(rs, self.__log)
                    else:
                        self.returnRunset(rs, self.__log)
                except Exception:
                    self.__log.error("Failed to return %s: %s" %
                                     (rs, exc_string()))

            time.sleep(1)
Пример #3
0
 def run(self):
     """
     Run the watchdog check and record the outcome on this object.

     On success `healthy` receives the value returned by realWatch() and
     `done` is set.  If an exception is raised it is logged and `error`
     is set instead.
     """
     try:
         verdict = self.__watchdog.realWatch()
         self.healthy = verdict
         self.done = True
     except Exception:
         problem = "Exception in run watchdog: %s" % exc_string()
         self.__log.error(problem)
         self.error = True
Пример #4
0
    def __init__(self, daqLog, moniPath, IDs, shortNameOf, daqIDof, rpcAddrOf,
                 mbeanPortOf, moniType, quiet=False):
        """
        Create a monitoring data object and thread for every component
        in `IDs` whose MBean port is greater than zero.

        `shortNameOf`, `daqIDof`, `rpcAddrOf` and `mbeanPortOf` are indexed
        by component ID.  `moniType` selects live-only data
        (DAQMoni.TYPE_LIVE), file-only data (DAQMoni.TYPE_FILE) or both
        (any other value).  A component whose file-based data object
        cannot be created is logged and skipped.
        """
        self.__log = daqLog
        self.__quiet = quiet
        self.__moniList = {}
        self.__threadList = {}

        for compID in IDs:
            if not mbeanPortOf[compID] > 0:
                # nothing is created for a component without a positive port
                continue

            if moniType == DAQMoni.TYPE_LIVE:
                moniData = self.createLiveData(shortNameOf[compID],
                                               daqIDof[compID],
                                               rpcAddrOf[compID],
                                               mbeanPortOf[compID])
            else:
                fname = DAQMoni.fileName(moniPath, shortNameOf[compID],
                                         daqIDof[compID])
                self.__log.info(
                    "Creating moni output file %s (remote is %s:%d)" %
                    (fname, rpcAddrOf[compID], mbeanPortOf[compID]))
                try:
                    if moniType == DAQMoni.TYPE_FILE:
                        factory = self.createFileData
                    else:
                        factory = self.createBothData
                    moniData = factory(shortNameOf[compID], daqIDof[compID],
                                       rpcAddrOf[compID],
                                       mbeanPortOf[compID], fname)
                except Exception:
                    self.__log.error(
                        "Couldn't create monitoring output"
                        " (%s) for component %d!: %s" %
                        (fname, compID, exc_string()))
                    continue

            self.__moniList[compID] = moniData
            self.__threadList[compID] = MoniThread(moniData, self.__log,
                                                   self.__quiet)
Пример #5
0
 def unhealthyRecord(self, value):
     """
     Build the UnhealthyRecord describing a failed check of this object.

     An exception which is not a TaskException is described with
     exc_string(); anything else is described as "(value=...)".  The
     record is created with the order of this object's component.
     """
     if not isinstance(value, Exception) or isinstance(value, TaskException):
         detail = "%s (value=%s)" % (str(self), str(value))
     else:
         detail = "%s: %s" % (str(self), exc_string())
     return UnhealthyRecord(detail, self.__comp.order())
Пример #6
0
    def startRun(self, runSet, runNum, runOptions, logDir=None):
        """
        Start a run on `runSet` and watch for a growing open-file count.

        `logDir` defaults to this server's default log directory.  The
        number of open file descriptors is sampled before the run is
        started.  The first successful sample becomes the baseline; a
        later, larger sample is reported to the dash log as a possible
        file leak and becomes the new baseline.
        """
        if logDir is None:
            logDir = self.__defaultLogDir

        try:
            openCount = self.__countFileDescriptors()
        except Exception:
            self.__log.error("Cannot count open files: %s" % exc_string())
            # An unknown count must not be recorded as zero: that would
            # make the next successful sample look like a leak from 0.
            openCount = None

        runSet.startRun(runNum,
                        self.getClusterConfig().configName(),
                        runOptions,
                        get_version_info(SVN_ID),
                        self.__spadeDir,
                        copyDir=self.__copyDir,
                        logDir=logDir,
                        quiet=self.__quiet)

        if openCount is not None:
            if self.__openFileCount is None:
                self.__openFileCount = openCount
            elif openCount > self.__openFileCount:
                runSet.logToDash("WARNING: Possible file leak; open file count"
                                 " increased from %d to %d" %
                                 (self.__openFileCount, openCount))
                self.__openFileCount = openCount
Пример #7
0
 def startRun(self, runNum):
     """
     Start component processing DAQ data.

     Returns the result of the remote startRun() XML-RPC call, or None
     when the call raised an exception (which is logged).
     """
     try:
         return self.__client.xmlrpc.startRun(runNum)
     except Exception:
         # SystemExit/KeyboardInterrupt are left to propagate
         self.__log.error(exc_string())
         return None
Пример #8
0
 def stopRun(self):
     """
     Stop component processing DAQ data.

     Returns the result of the remote stopRun() XML-RPC call, or None
     when the call raised an exception (which is logged).
     """
     try:
         return self.__client.xmlrpc.stopRun()
     except Exception:
         # SystemExit/KeyboardInterrupt are left to propagate
         self.__log.error(exc_string())
         return None
Пример #9
0
    def breakRunset(self, runSet):
        """
        Stop `runSet` if necessary, then restart or return it.

        When the runset is not ready its run is stopped first and the
        value returned by stopRun() is kept as the error flag.  The runset
        is restarted when `forceRestart` is set, or when that flag is true
        and `restartOnError` is set; otherwise it is returned.  Exceptions
        from either step are logged instead of raised.
        """
        hadError = False
        if not runSet.isReady():
            try:
                hadError = runSet.stopRun()
            except Exception:
                self.__log.error("While breaking %s: %s" %
                                 (runSet, exc_string()))

        try:
            if self.__forceRestart or (hadError and self.__restartOnError):
                self.restartRunset(runSet, self.__log)
            else:
                self.returnRunset(runSet, self.__log)
        except Exception:
            self.__log.error("Failed to break %s: %s" % (runSet, exc_string()))
Пример #10
0
 def startSubrun(self, data):
     """
     Send subrun data to stringHubs.

     Returns the result of the remote startSubrun() XML-RPC call, or None
     when the call raised an exception (which is logged).
     """
     try:
         return self.__client.xmlrpc.startSubrun(data)
     except Exception:
         # SystemExit/KeyboardInterrupt are left to propagate
         self.__log.error(exc_string())
         return None
Пример #11
0
 def prepareSubrun(self, subrunNum):
     """
     Start marking events as bogus in preparation for subrun.

     Returns the result of the remote prepareSubrun() XML-RPC call, or
     None when the call raised an exception (which is logged).
     """
     try:
         return self.__client.xmlrpc.prepareSubrun(subrunNum)
     except Exception:
         # SystemExit/KeyboardInterrupt are left to propagate
         self.__log.error(exc_string())
         return None
Пример #12
0
def queueForSpade(logger, spadeDir, copyDir, runDir, runNum, runTime,
                  runDuration):
    """
    Bundle the files of one run and queue them for SPADE.

    Nothing is done (apart from an info message) when `runDir` or
    `spadeDir` is None or does not exist.  Otherwise a tar file named
    after the run number, start time and duration is written to
    `spadeDir`, copied to `copyDir` when that directory exists, and the
    SPADE semaphore and "logs queued" indicator are written.  Any error
    while doing so is logged; the function always returns None.
    """
    if runDir is None or not os.path.exists(runDir):
        logger.info("Run directory \"%s\" does not exist" % runDir)
        return

    if spadeDir is None or not os.path.exists(spadeDir):
        logger.info("SPADE directory \"%s\" does not exist" % spadeDir)
        return

    try:
        spadeBaseName = "SPS-pDAQ-run-%03d_%04d%02d%02d_%02d%02d%02d_%06d" % \
            (runNum, runTime.year, runTime.month, runTime.day,
             runTime.hour, runTime.minute, runTime.second, runDuration)

        tarFile = __writeSpadeTarFile(spadeDir, spadeBaseName, runDir)

        if copyDir is not None and os.path.exists(copyDir):
            __copySpadeTarFile(logger, copyDir, spadeBaseName, tarFile)

        __writeSpadeSemaphore(spadeDir, spadeBaseName)

        __indicate_daq_logs_queued(spadeDir)

        logger.info(
            ("Queued data for SPADE (spadeDir=%s" +
             ", runDir=%s, runNum=%s)...") % (spadeDir, runDir, runNum))
    except Exception:
        # Queueing is best-effort, but SystemExit/KeyboardInterrupt must
        # not be reported as a SPADE failure and then discarded.
        logger.error("FAILED to queue data for SPADE: " + exc_string())
Пример #13
0
 def forcedStop(self):
     """
     Force component to stop running.

     Returns the result of the remote forcedStop() XML-RPC call, or None
     when the call raised an exception (which is logged).
     """
     try:
         return self.__client.xmlrpc.forcedStop()
     except Exception:
         # SystemExit/KeyboardInterrupt are left to propagate
         self.__log.error(exc_string())
         return None
Пример #14
0
    def getNonstoppedConnectorsString(self):
        """
        Return string describing states of all connectors
        which have not yet stopped

        Connectors whose state is 'idle' are left out.  The result looks
        like "[type:state, type:state]", or is the empty string when no
        connector qualifies or the connector states could not be fetched
        (the failure is logged).
        """
        try:
            connStates = self.__client.xmlrpc.listConnectorStates()
        except Exception:
            self.__log.error(exc_string())
            connStates = []

        active = ['%s:%s' % (cs["type"], cs["state"])
                  for cs in connStates if cs["state"] != 'idle']
        if not active:
            return ''

        return '[' + ', '.join(active) + ']'
Пример #15
0
    def monitorClients(self, logger=None):
        """
        Check that all components in the pool are still alive.

        Every client is asked for its state via monitor().  Clients in
        STATE_DEAD are removed and closed (a failing close is logged when
        `logger` is given).  Returns the number of clients which are
        neither dead nor in STATE_MISSING.
        """
        count = 0

        # work on a snapshot of the keys: bins may be removed meanwhile
        for k in list(self.__pool.keys()):
            try:
                clients = self.__pool[k]
            except KeyError:
                # bin may have been removed by daemon
                continue

            # Iterate over a copy: self.remove(c) presumably takes the
            # client out of this bin, and removing from a sequence that is
            # being iterated would skip the following client.
            for c in list(clients):
                state = c.monitor()
                if state == DAQClient.STATE_DEAD:
                    self.remove(c)
                    try:
                        c.close()
                    except Exception:
                        if logger is not None:
                            logger.error("Could not close %s: %s" %
                                         (c.fullName(), exc_string()))
                elif state != DAQClient.STATE_MISSING:
                    count += 1

        return count
Пример #16
0
 def commitSubrun(self, subrunNum, latestTime):
     """
     Start marking events with the subrun number.

     Returns the result of the remote commitSubrun() XML-RPC call, or
     None when the call raised an exception (which is logged).
     """
     try:
         return self.__client.xmlrpc.commitSubrun(subrunNum, latestTime)
     except Exception:
         # SystemExit/KeyboardInterrupt are left to propagate
         self.__log.error(exc_string())
         return None
Пример #17
0
 def unhealthyRecord(self, value):
     """
     Build the UnhealthyRecord describing a failed check of this object.

     An exception which is not a TaskException is described with
     exc_string(); for anything else the record states that the value is
     not changing from the previously stored value.
     """
     if not isinstance(value, Exception) or isinstance(value, TaskException):
         detail = "%s not changing from %s" % (str(self),
                                               str(self.__prevValue))
     else:
         detail = "%s: %s" % (str(self), exc_string())
     return UnhealthyRecord(detail, self.__order)
Пример #18
0
 def _write(self, fd, time, msg):
     """
     Format one log message and send it through `fd`.

     Unicode messages are converted with str() first.  Messages starting
     with 'Start of log at ' are not sent.  If sending fails with a
     socket error the message is printed to stderr instead.
     """
     # isinstance also covers unicode subclasses
     if isinstance(msg, unicode):
         msg = str(msg)
     if not msg.startswith('Start of log at '):
         try:
             fd.send(self.__fmt.format('log', time, msg, self.__prio))
         except socket.error:
             print >> sys.stderr, "%s (Cannot send: %s)" % (msg,
                                                            exc_string())
Пример #19
0
    def run(self):
        """
        Perform one monitoring pass for this thread's data object.

        `done` is cleared on entry and set again once the pass is over.
        An exception raised by monitor() is logged and otherwise ignored.
        """
        self.done = False
        try:
            self.__moniData.monitor(self.now)
        except Exception:
            source = str(self.__moniData)
            self.__log.error("Ignoring %s: %s" % (source, exc_string()))

        self.done = True
Пример #20
0
 def configure(self, configName=None):
     """
     Configure this component.

     The remote configure() XML-RPC call is made without arguments when
     `configName` is empty or None, otherwise with `configName`.  Returns
     the result of the call, or None when it raised an exception (which
     is logged).
     """
     try:
         if not configName:
             return self.__client.xmlrpc.configure()
         else:
             return self.__client.xmlrpc.configure(configName)
     except Exception:
         # SystemExit/KeyboardInterrupt are left to propagate
         self.__log.error(exc_string())
         return None
Пример #21
0
 def events(self, subrunNumber):
     """
     Get the number of events in the specified subrun.

     A string result from the remote getEvents() call is converted to a
     long after dropping its last character; any other result is returned
     unchanged.  Returns None when the call or the conversion raised an
     exception (which is logged).
     """
     try:
         evts = self.__client.xmlrpc.getEvents(subrunNumber)
         if isinstance(evts, str):
             # NOTE(review): the dropped last character is presumably a
             # type suffix added by the remote side -- confirm
             evts = long(evts[:-1])
         return evts
     except Exception:
         # SystemExit/KeyboardInterrupt are left to propagate
         self.__log.error(exc_string())
         return None
Пример #22
0
 def run(self):
     """
     Main method for thread.

     Runs the operation and sets the error flag when it fails.  A socket
     error only sets the flag; any other exception is logged as well.
     """
     try:
         self.__runOperation()
     except socket.error:
         self.__error = True
     except Exception:
         # SystemExit/KeyboardInterrupt are left to propagate
         self.__log.error("%s(%s): %s" % (str(self.__operation),
                                          str(self.__comp),
                                          exc_string()))
         self.__error = True
Пример #23
0
    def check(self, starved, stagnant, threshold):
        """
        Check the input, output and threshold beans of this component.

        Bad entries are added (in place) to `starved` for inputs,
        `stagnant` for outputs and `threshold` for thresholds.  Outputs
        are only checked when the inputs were fine; thresholds are always
        checked.  A check which raises an exception is logged and counts
        as a failure.

        Returns True when every check that ran was clean, False otherwise.
        """
        isOK = True
        try:
            badList = self.__checkBeans(self.__inputFields)
            if badList is not None:
                starved += badList
                isOK = False
        except Exception:
            self.__dashlog.error(self.__comp.fullName() + " inputs: " +
                                 exc_string())
            isOK = False

        if isOK:
            # don't report output problems if there are input problems
            #
            try:
                badList = self.__checkBeans(self.__outputFields)
                if badList is not None:
                    stagnant += badList
                    isOK = False
            except Exception:
                self.__dashlog.error(self.__comp.fullName() + " outputs: " +
                                     exc_string())
                isOK = False

        # report threshold problems even if there are other problems
        #
        try:
            badList = self.__checkBeans(self.__thresholdFields)
            if badList is not None:
                threshold += badList
                isOK = False
        except Exception:
            self.__dashlog.error(self.__comp.fullName() + " thresholds: " +
                                 exc_string())
            isOK = False

        return isOK
Пример #24
0
    def _run(self):
        """
        Tally the active and total channel counts of all source components.

        For every component whose isSource() is true, the bean field
        "NumberOfActiveAndTotalChannels" of the "stringhub" bean is read
        and added to the running totals.  Components whose field cannot
        be read or parsed are logged and skipped.

        NOTE(review): in the visible code the totals, `hubDOMs` and
        `hubInactiveDOMs` are never reported or returned -- the method
        looks truncated here; confirm against the full file.
        """
        # totals over all source components
        activeTotal = 0
        total = 0
        # counts of the component currently being processed
        hubActiveDoms = 0
        hubTotalDoms = 0
        # per-component (active, total) pairs, keyed by str(c.num())
        hubDOMs = {}
        hubInactiveDOMs = {}

        for c in self.__comps:
            if c.isSource():

                # collect the number of active and total channels
                try:
                    nList = c.getSingleBeanField(
                        "stringhub", "NumberOfActiveAndTotalChannels")
                except Exception, e:
                    self.__dashlog.error(
                        "Cannot get # active and total DOMS from" + " %s: %s" %
                        (c.fullName(), exc_string()))
                    print "Exception: "
                    print e

                    continue

                # the field must hold exactly two values convertible to int
                try:
                    hubActiveDoms, hubTotalDoms = [int(a) for a in nList]
                except:
                    self.__dashlog.error("Cannot get # active DOMS from" +
                                         " %s string: %s" %
                                         (c.fullName(), exc_string()))
                    continue

                activeTotal += hubActiveDoms
                total += hubTotalDoms

                # per-component numbers are only kept when details are wanted
                if self.__sendDetails:
                    hubDOMs[str(c.num())] = (hubActiveDoms, hubTotalDoms)
Пример #25
0
 def getClusterConfig(self):
     """
     Return the active cluster configuration for this server.

     The configuration is looked up with this server's cluster
     description and run configuration directory.  Raises
     CnCServerException when the configuration file cannot be found.
     """
     clusterName = self.__clusterDesc
     configDir = self.__runConfigDir
     try:
         clusterCfg = DAQConfigParser.getClusterConfiguration(
             None,
             useActiveConfig=True,
             clusterDesc=clusterName,
             configDir=configDir)
     except XMLFileNotFound:
         if clusterName is not None:
             suffix = " for cluster \"%s\"" % clusterName
         else:
             suffix = ""
         raise CnCServerException("Cannot find cluster configuration %s: %s" %
                                  (suffix, exc_string()))
     return clusterCfg
Пример #26
0
    def run(self):
        """
        Start a server.

        Logs the version information, starts monitorLoop() in a daemon
        thread named "CnCServer", tries to start the I3Live thread (a
        failure is logged and does not stop the server) and then serves
        requests forever.
        """
        self.__log.info(
            ("%(filename)s %(revision)s %(date)s %(time)s" +
             " %(author)s %(release)s %(repo_rev)s") % self.__versionInfo)

        t = threading.Thread(name="CnCServer", target=self.monitorLoop)
        t.setDaemon(True)
        t.start()

        try:
            self.__live = self.startLiveThread()
        except Exception:
            # SystemExit/KeyboardInterrupt are left to propagate
            self.__log.error("Cannot start I3Live thread: " + exc_string())

        self.__server.serve_forever()
Пример #27
0
    def makeRunsetFromRunConfig(self,
                                runConfig,
                                timeout=REGISTRATION_TIMEOUT,
                                strict=True):
        """
        Build a runset from the named run configuration.

        The configuration is looked up in this server's run configuration
        directory.  Returns the new runset, or None when building it
        raised an exception (which is logged).
        """
        try:
            runSet = self.makeRunset(self.__runConfigDir,
                                     runConfig,
                                     timeout,
                                     self.__log,
                                     forceRestart=self.__forceRestart,
                                     strict=strict)
        except Exception:
            # SystemExit/KeyboardInterrupt are left to propagate
            self.__log.error("While making runset from \"%s\": %s" %
                             (runConfig, exc_string()))
            runSet = None

        return runSet
Пример #28
0
    def state(self):
        """
        Get current state.

        The state is fetched with the remote getState() XML-RPC call.  A
        failed call (socket errors silently, other errors logged) or an
        empty result increments the dead counter; the state is then
        reported as STATE_MISSING while that counter is below 3 and as
        STATE_DEAD afterwards.
        """
        try:
            state = self.__client.xmlrpc.getState()
        except socket.error:
            state = None
        except Exception:
            # SystemExit/KeyboardInterrupt are left to propagate
            self.__log.error(exc_string())
            state = None

        if not state:
            self.__deadCount += 1
            if self.__deadCount < 3:
                state = DAQClient.STATE_MISSING
            else:
                state = DAQClient.STATE_DEAD

        return state
Пример #29
0
    def restartRunset(self,
                      rs,
                      logger,
                      verbose=False,
                      killWith9=False,
                      eventCheck=False):
        """
        Remove runset `rs`, restart its components and destroy it.

        A ValueError while removing the runset and any error while
        restarting its components are reported through `logger`; the
        runset is destroyed (ignoring its components) afterwards.
        """
        try:
            self.__removeRunset(rs)
        except ValueError:
            logger.error("Cannot remove %s (#%d available - %s)" %
                         (rs, len(self.__sets), self.__sets))

        try:
            self.restartRunsetComponents(rs,
                                         verbose=verbose,
                                         killWith9=killWith9,
                                         eventCheck=eventCheck)
        except Exception:
            # SystemExit/KeyboardInterrupt are left to propagate
            logger.error("Cannot restart %s (#%d available - %s): %s" %
                         (rs, len(self.__sets), self.__sets, exc_string()))

        rs.destroy(ignoreComponents=True)
Пример #30
0
    def __run(self):
        """
        Run the task loop until `self.__running` is cleared.

        Each pass calls check() on every task and then waits on
        `self.__flag` for the smallest number of seconds any task asked
        for, capped at CnCTask.MAX_TASK_SECS.  A task whose check() raises
        is logged (when a dash log is set) and treated as asking for the
        maximum wait.  All tasks are closed once the loop ends.
        """
        self.__running = True
        while self.__running:
            waitSecs = CnCTask.MAX_TASK_SECS
            for t in self.__tasks:
                try:
                    taskSecs = t.check()
                except Exception:
                    # one failing task must not stop the others, but
                    # SystemExit/KeyboardInterrupt still end the loop
                    if self.__dashlog is not None:
                        self.__dashlog.error("%s exception: %s" %
                                             (str(t), exc_string()))
                    taskSecs = CnCTask.MAX_TASK_SECS
                if waitSecs > taskSecs:
                    waitSecs = taskSecs

            self.__flag.acquire()
            try:
                self.__flag.wait(waitSecs)
            finally:
                self.__flag.release()

        for t in self.__tasks:
            t.close()
Пример #31
0
        def wu_get_thread_name():
            return current_thread().name

        r = tp.enqueue(rq, wu_get_thread_name, (), {}).wait()
        assert r == "TP/1:2"

        class FooException(Exception): pass

        def wu_failed():
            raise FooException("fails")

        try:
            tp.enqueue(rq, wu_failed, (), {}).wait()
        except FooException as e:
            assert exc_string().startswith("FooException(\"fails\") in wu_failed() (thread_pool.py:256)")
        else:
            assert False

        sleep(0.5)

        with expected(WorkUnitTimedOut("request deadline waiting for a work unit")):
            tp.enqueue(rq, wu_skip, (), {}).wait()

    finally:
        RegisteredResourcePool.stop_pools()

    print("ok")

    ###################################