Exemplo n.º 1
0
    def __createCompressedTar(self, directory, tarOutputFilePath):
        """Pack `directory` into a tar archive written to `tarOutputFilePath`.

        The archive is opened in the mode given by TAR_GZIP_FLAG. Symbolic
        links are deliberately NOT dereferenced (tar.dereference is left
        alone): a link could point at huge files/directories - don't add this!

        Returns:
            True when the archive was created, filled and closed; False when
            any of these steps raised (the failure is logged, not re-raised).
        """
        try:
            tar = tarfile.open(tarOutputFilePath, TAR_GZIP_FLAG)
        except Exception:
            self.myLogger.error(
                "Exception occured while creating the Tar file - tar file can't be created"
                + stats_queue_objects.formatExceptionInfo())
            return False

        try:
            tar.add(directory)
        except Exception:
            # Log first: the cleanup below may raise and replace the active
            # exception information.
            self.myLogger.error(
                "Exception occured while creating the Tar file" +
                stats_queue_objects.formatExceptionInfo())
            # Release the archive handle - it used to leak on this path.
            try:
                tar.close()
            except Exception:
                # Best effort only; the real failure was already reported.
                pass
            return False

        try:
            tar.close()
        except Exception:
            self.myLogger.error(
                "Exception occured while creating the Tar file - tar file can't be closed"
                + stats_queue_objects.formatExceptionInfo())
            return False

        self.myLogger.debug3("Tar file created successfully")
        return True
Exemplo n.º 2
0
    def __addCommandToFile(self, createNewFile, closeFileAfter,
                           commandToBeAdded):
        """Append one command line to the current RRD commands file.

        createNewFile    - when true, bump self.myCommandsFileIndex and open a
                           fresh file named after the new index inside
                           self.myRRDComandFilesDirPath before writing.
        closeFileAfter   - when true, close the commands file after writing.
        commandToBeAdded - the command; it is written followed by a newline.

        Returns 11236 if anything raised (the failure is logged); otherwise
        falls off the end and returns None.
        """
        try:
            if createNewFile:
                # Files are named by a running index
                self.myCommandsFileIndex += 1
                targetPath = os.path.join(self.myRRDComandFilesDirPath,
                                          str(self.myCommandsFileIndex))
                self.myCommandsFileHandle = open(targetPath, "w")
                openedMsg = "Opened file index %s" % (self.myCommandsFileIndex)
                self.myLogger.debug5(openedMsg)

            commandLine = "%s\n" % commandToBeAdded
            self.myCommandsFileHandle.write(commandLine)
            wroteMsg = "Wrote %s to file index %s" % (
                commandToBeAdded, self.myCommandsFileIndex)
            self.myLogger.debug5(wroteMsg)

            if closeFileAfter:
                self.__closeCommandFile()

        except:
            failureMsg = "Failed. Exception info: %s" % formatExceptionInfo()
            self.myLogger.error(failureMsg)
            return 11236
Exemplo n.º 3
0
 def __delDirsIfNeeded(self, dirPath):
     """Delete the directory tree at `dirPath` when it is an existing directory.

     A missing path or a non-directory is left untouched. Any exception is
     logged and swallowed; nothing is returned.
     """
     try:
         # isdir() is already False for a path that does not exist, so a
         # separate existence check is not needed.
         if not os.path.isdir(dirPath):
             return
         shutil.rmtree(dirPath)
     except:
         details = stats_queue_objects.formatExceptionInfo()
         self.myLogger.error(
             "UNExpected exception on Stats Aggregator.__delDirsIfNeeded. Exception info: %s"
             % details)
Exemplo n.º 4
0
    def loadCountersConfigurations(self):
        """Parse the counters list file and turn it into counters DB info.

        The file at self.myCountersListFilePath is read with a ConfigParser and
        the parser is handed to __createDBInfoObject, whose result is returned
        unchanged. If reading raises, the error is logged and (None, None) is
        returned instead.
        """
        parser = ConfigParser()
        try:
            parser.read(self.myCountersListFilePath)
        except:
            errorMsg = (
                "Couldn't open counters configuration file. Path: %s. Exception details: %s"
                % (self.myCountersListFilePath, formatExceptionInfo()))
            self.myLogger.error(errorMsg)
            return (None, None)
        else:
            # deserialize (kept outside the guarded section, as before)
            return self.__createDBInfoObject(parser)
Exemplo n.º 5
0
 def __moveSubDirsToDir(self, srcDir, dstDir):
     """Move every sub-directory found directly under `srcDir` into `dstDir`.

     `dstDir` is created first if needed (via __makeDirsIfNeeded). Plain files
     are not moved, and an entry whose joined path equals `dstDir` is skipped.
     Any exception is logged and swallowed; nothing is returned.
     """
     try:
         self.__makeDirsIfNeeded(dstDir)
         for entryName in os.listdir(srcDir):
             candidate = os.path.join(srcDir, entryName)
             # Never move the destination into itself; only directories move.
             if candidate != dstDir and os.path.isdir(candidate):
                 shutil.move(candidate, dstDir)
     except:
         details = stats_queue_objects.formatExceptionInfo()
         self.myLogger.error(
             "UNExpected exception on Stats Aggregator.__moveSubDirsToDir. Exception info: %s"
             % details)
Exemplo n.º 6
0
    def getRRDFilesMapping(self):
        """Return the counterID -> filePath mapping from table rrdFilesMapping.

        The query is run on self.myCur; each fetched row contributes its first
        column as key and its second column as value. Returns None, after
        logging the failure, if anything raises.
        """
        selectSql = 'Select counterID, filePath FROM rrdFilesMapping'
        try:
            self.myCur.execute(selectSql)
            # Built in row order, so a repeated counterID keeps its last path.
            return dict((row[0], row[1]) for row in self.myCur.fetchall())
        except:
            errorMsg = (
                "UNExpected exception was raised during sqlite query execution. Query: %s. Exception info: %s"
                % (selectSql, formatExceptionInfo()))
            self.myLogger.error(errorMsg)
            return None
Exemplo n.º 7
0
    def __executeSQLiteQueryArray(self, queries):
        """Execute a batch of queries on self.myCur and commit them together.

        Each item of `queries` must expose `myQueryStr` and `myParamsList`;
        both are passed to the cursor's execute().

        Returns:
            True after a successful commit; False if anything raised. The
            failure is logged and no rollback is attempted.
        """
        # Last query handed to the cursor. Starts as None so the error handler
        # never reads an unbound name when the failure happens before the
        # first query (e.g. `queries` is empty/not iterable and commit or the
        # iteration itself raises).
        queryExecuted = None
        try:
            for queryObj in queries:
                queryExecuted = queryObj
                #If params list length = 0 it still executes OK
                self.myCur.execute(queryObj.myQueryStr, queryObj.myParamsList)
            self.myCon.commit()
            return True
        except Exception:
            self.myLogger.error(
                "UNExpected exception was raised during sqlite query execution. Exception info: %s"
                % (formatExceptionInfo()))
            if queryExecuted is not None:
                self.myLogger.error(
                    "Query: %s Num of params: %d" %
                    (queryExecuted.myQueryStr, len(queryExecuted.myParamsList)))
            #self.myCon.rollback() - dangerous and uneccessary since the file will be thrown. It might help debugging
            return False
Exemplo n.º 8
0
    def connectToDb(self):
        """Open the SQLite database file named by self.myFileName.

        On success the connection is stored in self.myCon, a cursor on it in
        self.myCur, and True is returned. Returns False (after logging) when a
        connection is already held or when connecting raises.
        """
        try:
            # A missing attribute and a falsy one both mean "not connected yet".
            if getattr(self, 'myCon', None):
                self.myLogger.error(
                    "SQLite connectToDB. An attempt to connect to the DB was made twice"
                )
                return False

            self.myLogger.debug1("Connecting to " + self.myFileName)
            connection = sqlite.connect(self.myFileName)
            self.myCon = connection
            self.myCur = connection.cursor()
            return True
        except:
            errorMsg = (
                "UNExpected exception during SQLite connectToDB. Exception info: %s"
                % formatExceptionInfo())
            self.myLogger.error(errorMsg)
            return False
Exemplo n.º 9
0
    def __addCommandToFile (self, createNewFile, closeFileAfter, commandToBeAdded):
        """Append one command line to the current RRD commands file.

        createNewFile    - when true, bump self.myCommandsFileIndex and open a
                           fresh file named after the new index inside
                           self.myRRDComandFilesDirPath before writing.
        closeFileAfter   - when true, close the commands file after writing.
        commandToBeAdded - the command; it is written followed by a newline.

        Returns 11236 if anything raised (the failure is logged); otherwise
        falls off the end and returns None.
        """
        try:
            if createNewFile:
                # Files are named by a running index
                self.myCommandsFileIndex += 1
                targetPath = os.path.join(self.myRRDComandFilesDirPath, str(self.myCommandsFileIndex))
                self.myCommandsFileHandle = open(targetPath, "w")
                openedMsg = "Opened file index %s" % (self.myCommandsFileIndex)
                self.myLogger.debug5(openedMsg)

            commandLine = "%s\n" % commandToBeAdded
            self.myCommandsFileHandle.write(commandLine)
            wroteMsg = "Wrote %s to file index %s" % (commandToBeAdded, self.myCommandsFileIndex)
            self.myLogger.debug5(wroteMsg)

            if closeFileAfter:
                self.__closeCommandFile()

        except:
            failureMsg = "Failed. Exception info: %s" % formatExceptionInfo()
            self.myLogger.error(failureMsg)
            return 11236
Exemplo n.º 10
0
    def run(self):
        """Thread body of the stats RRD writer.

        Sleeps self._startupGracePeriod seconds, then loops while
        self._runFlag is true: calls self._doUpdates() and
        self._statsMgr.notifyNoMoreGrace(), then either reports a too-long
        cycle (subject to self._updateReportLimiter) or sleeps out the rest of
        self._updatePeriod. Any exception is logged, clears self._runFlag and
        ends the loop. Returns nothing.
        """
        self._log.notice(
            "Stats RRD writer running. Entering grace period of %s seconds",
            self._startupGracePeriod)
        time.sleep(self._startupGracePeriod)
        self._log.notice(
            "Stats RRD writer finished grace period. Start main loop")

        try:
            self._runFlag = True
            while self._runFlag:
                cycleStart = time.time()
                self._doUpdates()
                self._statsMgr.notifyNoMoreGrace()

                # The whole history dict has been updated at this point.
                timeBudget = self._updatePeriod * 0.75
                elapsed = time.time() - cycleStart

                if elapsed > timeBudget:
                    # Too slow: warn now, or let the limiter count it for the
                    # next warning. No sleep on this path.
                    (shouldReport,
                     numEvents) = self._updateReportLimiter.shouldReport()
                    if shouldReport:
                        self._log.error(
                            "StatsRrdWriter full update took too long: %s seconds, max allowed is %s. (Occured %s times since last reported)"
                            % (elapsed, timeBudget, numEvents))
                    continue

                # Pad the cycle so it lasts _updatePeriod overall
                idleTime = self._updatePeriod - elapsed
                if idleTime > 0:
                    time.sleep(idleTime)

            self._log.notice("Stats RRD Writer thread exited")
        except:
            # Loop ended - thread is shutting down
            self._runFlag = False
            details = stats_queue_objects.formatExceptionInfo()
            self._log.error(
                "UNExpected exception on Stats RRD Writer. Exception info: %s"
                % details)
Exemplo n.º 11
0
 def __createCompressedTar(self, directory, tarOutputFilePath):
     """Pack `directory` into a tar archive written to `tarOutputFilePath`.

     The archive is opened in the mode given by TAR_GZIP_FLAG. Symbolic
     links are deliberately NOT dereferenced (tar.dereference is left
     alone): a link could point at huge files/directories - don't add this!

     Returns:
         True when the archive was created, filled and closed; False when
         any of these steps raised (the failure is logged, not re-raised).
     """
     try:
         tar = tarfile.open(tarOutputFilePath, TAR_GZIP_FLAG)
     except Exception:
         self.myLogger.error("Exception occured while creating the Tar file - tar file can't be created" + stats_queue_objects.formatExceptionInfo())
         return False

     try:
         tar.add(directory)
     except Exception:
         # Log first: the cleanup below may raise and replace the active
         # exception information.
         self.myLogger.error("Exception occured while creating the Tar file" + stats_queue_objects.formatExceptionInfo())
         # Release the archive handle - it used to leak on this path.
         try:
             tar.close()
         except Exception:
             # Best effort only; the real failure was already reported.
             pass
         return False

     try:
         tar.close()
     except Exception:
         self.myLogger.error("Exception occured while creating the Tar file - tar file can't be closed" + stats_queue_objects.formatExceptionInfo())
         return False

     self.myLogger.debug3("Tar file created successfully")
     return True
Exemplo n.º 12
0
    def run (self):
        """Thread body of the stats RRD writer.

        Sleeps self._startupGracePeriod seconds, then loops while
        self._runFlag is true: calls self._doUpdates() and
        self._statsMgr.notifyNoMoreGrace(), then either reports a too-long
        cycle (subject to self._updateReportLimiter) or sleeps out the rest of
        self._updatePeriod. Any exception is logged, clears self._runFlag and
        ends the loop. Returns nothing.
        """
        self._log.notice("Stats RRD writer running. Entering grace period of %s seconds", self._startupGracePeriod)
        time.sleep(self._startupGracePeriod)
        self._log.notice("Stats RRD writer finished grace period. Start main loop")

        try:
            self._runFlag = True
            while self._runFlag:
                cycleStart = time.time()
                self._doUpdates()
                self._statsMgr.notifyNoMoreGrace()

                # The whole history dict has been updated at this point.
                timeBudget = self._updatePeriod * 0.75
                elapsed = time.time() - cycleStart

                if elapsed > timeBudget:
                    # Too slow: warn now, or let the limiter count it for the
                    # next warning. No sleep on this path.
                    (shouldReport, numEvents) = self._updateReportLimiter.shouldReport()
                    if shouldReport:
                        self._log.error("StatsRrdWriter full update took too long: %s seconds, max allowed is %s. (Occured %s times since last reported)" %
                                        (elapsed, timeBudget, numEvents))
                    continue

                # Pad the cycle so it lasts _updatePeriod overall
                idleTime = self._updatePeriod - elapsed
                if idleTime > 0:
                    time.sleep(idleTime)

            self._log.notice("Stats RRD Writer thread exited")
        except:
            # Loop ended - thread is shutting down
            self._runFlag = False
            details = stats_queue_objects.formatExceptionInfo()
            self._log.error("UNExpected exception on Stats RRD Writer. Exception info: %s" % details)
Exemplo n.º 13
0
    def run (self):
        """Thread body of the stats sampler.

        Samples the counters once per 60-second slot tracked in
        self.myNextSampleTime and, whenever self.myNextDiscoveryTime has
        passed, runs regex-counter discovery. The loop runs while
        self.myRunFlag is true; this method clears the flag itself when
        discovery reports new or deleted counters. Any exception is logged
        and ends the loop. self.myRunFlag is always False on return.
        """

        self.myLogger.notice("Stats Sampler running")

        if self.myNextSampleTime is None:
            # One time initialization
            self.myNextSampleTime = time.time()
        else:
            # If we quit this thread due to discovering new counters, we need to sleep here before the next sample
            now = time.time()
            sleepTime = self.myNextSampleTime - now
            # Skip whole 60-second slots that are already in the past
            while sleepTime < 0:
                self.myNextSampleTime += 60
                sleepTime = self.myNextSampleTime - now

            self.myLogger.debug1("%d Compensation sleep %d sec (before loop)" % (int(now), int(sleepTime)))
            time.sleep(sleepTime)

        try:
            while (self.myRunFlag):

                # Sample counters
                startTime = time.time()
                self.myLogger.debug1("%d Start sampling counters" % int(startTime))
                self.__sampleValues()

                # Discover new regex counters
                if self.myNextDiscoveryTime - time.time() < 0:
                    self.myLogger.debug1("%d Start discover counters" % int(time.time()))
                    (self.myNewCounters, self.myDeletedCounters) = self.__discoverRegexCounters()
                    self.myNextDiscoveryTime = time.time() + REGEX_COUNTERS_DISCOVERY_INTERVAL
                    if self.myNewCounters or self.myDeletedCounters:
                        # If we discovered new counters we will exit the loop and re-initialize
                        self.myRunFlag = False

                endTime = time.time()
                self.myLogger.debug1("%d Finish sample/discover cycle. Elasped %d sec" % (int(endTime), int(endTime-startTime)))

                # Advance to the next 60-second slot; if the cycle overran,
                # keep advancing until the slot lies in the future
                self.myNextSampleTime += 60
                sleepTime = self.myNextSampleTime - endTime
                while sleepTime < 0:
                    self.myNextSampleTime += 60
                    sleepTime = self.myNextSampleTime - endTime

                if self.myRunFlag:
                    # If we need to quit we will sleep when we come back
                    self.myLogger.debug1("%d Sleep %d sec" % (int(endTime), int(sleepTime)))
                    time.sleep(sleepTime)
        except:
            self.myLogger.error("Unexpected exception on Stats Sampler. Exception info: %s" % formatExceptionInfo())

        # The flag is still set only when the loop was left through the
        # exception handler above
        if self.myRunFlag:
            self.myLogger.notice("Stats Sampler thread ended enexpectedly")
        else:
            self.myLogger.notice("Stats Sampler thread ended")

        self.myRunFlag = False
Exemplo n.º 14
0
 def __moveSubDirsToDir(self, srcDir, dstDir):
     """Move every sub-directory found directly under `srcDir` into `dstDir`.

     `dstDir` is created first if needed (via __makeDirsIfNeeded). Plain files
     are not moved, and an entry whose joined path equals `dstDir` is skipped.
     Any exception is logged and swallowed; nothing is returned.
     """
     try:
         self.__makeDirsIfNeeded(dstDir)
         for entryName in os.listdir(srcDir):
             candidate = os.path.join(srcDir, entryName)
             # Never move the destination into itself; only directories move.
             if candidate != dstDir and os.path.isdir(candidate):
                 shutil.move(candidate, dstDir)
     except:
         details = stats_queue_objects.formatExceptionInfo()
         self.myLogger.error("UNExpected exception on Stats Aggregator.__moveSubDirsToDir. Exception info: %s" % details)
Exemplo n.º 15
0
 def __delDirsIfNeeded (self, dirPath):
     """Delete the directory tree at `dirPath` when it is an existing directory.

     A missing path or a non-directory is left untouched. Any exception is
     logged and swallowed; nothing is returned.
     """
     try:
         # isdir() is already False for a path that does not exist, so a
         # separate existence check is not needed.
         if not os.path.isdir(dirPath):
             return
         shutil.rmtree(dirPath)
     except:
         details = stats_queue_objects.formatExceptionInfo()
         self.myLogger.error("UNExpected exception on Stats Aggregator.__delDirsIfNeeded. Exception info: %s" % details)
Exemplo n.º 16
0
    def run (self):
        """Thread body of the stats aggregator.

        Connects the descriptions DB, then loops while self.myRunFlag is true,
        dequeuing jobs. For a VALUES job the counters in job.myCountersArray
        are pushed into self.myAggrHistory in bursts of at most
        self.myUpdateBurstCount updates (the history is locked for the
        duration of each burst), with bursts paced self.myUpdateBurstInterval
        seconds apart; the formatted updates are then passed to
        saveHistoryFile(). After the loop - or after an exception, which is
        logged - both DB connections are closed. Returns nothing.
        """

        self.myLogger.notice("Stats Aggregator running")
        reportLimiter = EventReportLimiter(3600)

        try:
            self.myRunFlag = True
            self.myDescriptionsDB.connectToDb()

            while self.myRunFlag:

                self.myLogger.debug2("StatsAggregator blocking in order to dequeue. Time = %d" % time.time())

                job = self.__dequeue()
                if job is None:
                    self.myLogger.debug1("StatsAggregator dequeued a job from jobs queue. Message is None -> unblock message")
                    continue

                self.myLogger.debug2("StatsAggregator message dequeued successfully. Time = %d" % time.time())

                startTime = time.time()

                # Reset per job: the summary log at the bottom runs for every
                # job type and must not hit an unbound name (first job not a
                # VALUES job) or report the previous job's count.
                numCounters = 0

                # If this is a sample result job (The only legal message for v1.0)
                if job.quack() == stats_queue_objects.AggregationQueueJobType.VALUES:
                    self.myLogger.debug2("StatsAggregator received values job")

                    # Iterate on all received counters and update global aggr dictionary.
                    # Respect counter burst params

                    historyLines = []  # Joined below and saved to a history file

                    counterIdx = 0
                    numCounters = len(job.myCountersArray)
                    while counterIdx < numCounters:
                        burstStartTime = time.time()
                        self.myAggrHistory.lock()
                        try:
                            i = 0
                            self.myLogger.debug3("Start burts")
                            while i < self.myUpdateBurstCount:
                                counterVal = job.myCountersArray[counterIdx]
                                self.myLogger.debug5("Updating couner id %s values (%s, %s)",
                                                     counterVal.myCounterId, counterVal.myValue, counterVal.myTimeStamp)
                                valueType = type(counterVal.myValue).__name__
                                self.myAggrHistory.update(counterVal.myCounterId, counterVal.myValue, valueType, counterVal.myTimeStamp)

                                historyLines.append(self.__formatHistoryLine(counterVal.myCounterId, counterVal.myValue,
                                                                             counterVal.myTimeStamp, valueType))

                                i += 1
                                counterIdx += 1
                                if counterIdx == numCounters:
                                    break
                        finally:
                            # Always release the history lock, even when an
                            # update raises, so other users of the history
                            # are not left blocked.
                            self.myAggrHistory.unlock()
                        burstTime = time.time() - burstStartTime
                        self.myLogger.debug3("End burts")

                        # Warn about a long burst, or count it for the next warning
                        maxAllowedBurstTime = self.myUpdateBurstInterval * 0.75
                        if burstTime > maxAllowedBurstTime:
                            (shouldReport, numEvents) = reportLimiter.shouldReport()
                            if shouldReport:
                                self.myLogger.warning("Aggregator update burst took too long: %s seconds, max allowed is %s. (Occured %s times since last reported)",
                                                      burstTime, maxAllowedBurstTime, numEvents)

                        # If we have more counters, sleep
                        if counterIdx < numCounters:
                            remainingTime = self.myUpdateBurstInterval - burstTime
                            if remainingTime > 0:
                                time.sleep(remainingTime)

                    self.saveHistoryFile("".join(historyLines))

                insertTimeSpan = time.time()-startTime
                # NOTE(review): _qsize() is a private accessor of the jobs
                # queue; kept as is because the queue type is not visible here.
                if insertTimeSpan > MAX_INSERT_TO_DB_TIME_SPAN:
                    self.myLogger.warning("Execute job exceeds the time limit. Limit is %d sec. Elapsed time is %s sec, numCounters=%s. Queue size is: %d",
                                          MAX_INSERT_TO_DB_TIME_SPAN, insertTimeSpan, numCounters, self.myJobsQueue._qsize())
                else:
                    self.myLogger.notice("Execute job. Elapsed time is %s sec, numCounters=%s. Queue size is: %d",
                                         insertTimeSpan, numCounters, self.myJobsQueue._qsize())

            if self.myRunFlag:
                self.myLogger.notice("Stats Aggregator thread ended enexpectedly")
            else:
                self.myLogger.notice("Stats Aggregator thread ended")

            self.myRunFlag = False

        except:
            # Thread boundary: catch everything so the DB connections below
            # are still closed. Loop ended - thread is shutting down
            self.myRunFlag = False
            self.myLogger.error("UNExpected exception on Stats Aggregator. Exception info: %s" % stats_queue_objects.formatExceptionInfo())

        #When the while loop ended = shutdown - close the DBs
        self.myValuesDB.closeDBConnection()
        self.myDescriptionsDB.closeDBConnection()
        self.myLogger.debug2("StatsAggregator thread ended")
Exemplo n.º 17
0
    def start(self):
        """Initialize the stats module, start its worker threads and supervise them.

        Calls postStartInit(), starts the aggregator and the sampler-pacer,
        then polls them in an endless loop. When the pacer stops because it
        needs an update, the aggregator is ended as well, the counters DB info
        is updated with the pacer's new/deleted counters and both workers are
        re-created and restarted. When either worker stops for any other
        reason, the other one is ended and the method returns.

        Returns:
            2112      - self.statsInitOK is false after postStartInit().
            10101010  - a worker thread died and the module shut down.
            8873      - an exception was raised while supervising.
        """
        self.postStartInit()
        self.myLogger.info("Stats mgr - thread started")

        if not self.statsInitOK:
            self.myLogger.error(
                "StatsMgr starting before proper initialization - crashing")
            return 2112

        try:
            self.myLogger.debug1("StatsMgr starting all threads")
            #Set logger must be called since Oscar launches Stats as a process with 'fork'. If the counter is not
            #created again in the child process then a collision will occur. The logger is thread-safe BUT NOT process-safe
            self.myAggregator.start()
            self.myPacer.start()
            self.myLogger.debug1("StatsMgr threads started")

            # This loop makes sure that if a thread dies all the module dies with it

            # True from the moment the pacer asked for a reload until the
            # aggregator has stopped and the reload was handled
            reloadCounters = False

            while (True):

                abort = False

                if not self.myAggregator.isRunning():
                    if not reloadCounters:
                        #Aggregator really died
                        self.myLogger.error(
                            "Stats aggregator thread died - script aborts")
                        abort = True
                        #Kill other thread
                        self.myPacer.end()
                    else:
                        # rebuild the database and restart the threads
                        (newCounters, deletedCounters
                         ) = self.myPacer.getNewAndDeletedCounters()
                        if newCounters or deletedCounters:
                            self.myLogger.notice(
                                "Stats reloading new counters (%d new. %d deleted)"
                                % (len(newCounters), len(deletedCounters)))
                            self.myCountersDBInfo.update(newCounters)
                            for key in deletedCounters:
                                del (self.myCountersDBInfo[key])
                            self.__initWorkers()
                            self.myAggregator.start()
                            self.myPacer.start()
                            self.myLogger.notice(
                                "Stats reloaded with %d new counters",
                                len(newCounters))
                        else:
                            self.myLogger.error(
                                "Asked to load new counters but list is empty")

                        reloadCounters = False

                if not self.myPacer.isRunning() and not abort:

                    if not reloadCounters and self.myPacer.needUpdate():
                        #Pacer discovered new regex counters and need to re-initialize
                        self.myLogger.notice(
                            "Stats discovered new counters - Ask to reload")
                        self.myAggregator.end()
                        reloadCounters = True

                    if not reloadCounters:
                        #Pacer really died
                        abort = True
                        self.myLogger.error(
                            "Stats sampler-pacer thread died - Script aborts")
                        #Kill other thread
                        self.myAggregator.end()

                # While a reload is pending the loop polls without sleeping
                if not reloadCounters:
                    #We want the script to die if one of its threads dies but it is not urgent
                    time.sleep(3)

                #Shut down
                if abort:
                    self.myLogger.notice(
                        "Stats main thread finished and is shutting down")
                    return 10101010
        except:
            self.myLogger.notice(
                "Exception in main thread. Exception details: %s" %
                (formatExceptionInfo()))
            self.myLogger.debug2("Killing sampler-pacer thread")
            self.myPacer.end()
            self.myLogger.debug2("Killing aggregation thread")
            self.myAggregator.end()
            self.myLogger.debug2("Killing rrd writer thread")
            self.myRrdWriter.end()
            return 8873

        # NOTE(review): not reachable as written - the loop above is left only
        # through `return` or through the exception handler.
        self.myLogger.notice("Stats main thread finished and is shutting down")
        return 0
Exemplo n.º 18
0
    def postStartInit (self):
        """Finish initializing the stats module once the process is running.

        Sets up the optional alternative output directory, the captain and
        the logger, waits for the q-shell of the configured processes, loads
        and validates the counters configuration, creates the jobs queue, the
        aggregation history and the workers, and starts the RRD writer.
        self.statsInitOK is set to True only when all of that succeeded.

        Returns:
            0       - success.
            8349    - the alternative output directory could not be created.
            69403   - an exception was raised while waiting for the q-shells.
            69401   - self.myProcessesDictionary has no processes.
            69402   - q-shell processes exist but none of them responded.
            548731  - no counters were found in the counters configuration.
            787544  - __validateProperties reported an error.
        """
        #Enables us to replace the counters if needed
        #TODO(galm) - this is not relevant to you output, only to you inputs - please separate the outpt
        alternativeOutputPath = getSysParamOrDefault(self.myStatsConfigurations, CFG_SECTION, CFG_DEBUG_OUTPUT_PATH, configurationsDefaults)
        if alternativeOutputPath:
            self.myDescriptionDbFilesPath = alternativeOutputPath
        
            #Create the output dir if doesn't exist
            if not self.__makeDirsIfNeeded(self.myDescriptionDbFilesPath):
                # NOTE(review): self.myLogger is used here before this method
                # assigns it below - presumably set earlier; confirm.
                self.myLogger.error("Couldn't create db's output directory - aborts. Dir path: %s" % self.myDescriptionDbFilesPath)
                return 8349


        #TEMP till we have a captain
        self.captain = a.infra.process.captain.Captain()
        self.captain._initKickNumber(self.myKickNumber)
        a.infra.process.setGlobalCaptain(self.captain)
        #TEMP END
        self._mainLogger = MainLogger(processName = "Stats")
        self._baseLogger = self._mainLogger.getLoggerManager().createLogger("Stats", "Stats")
        self._mainLogger.initLoggerToUse(self._baseLogger)
        self._mainLogger.init(initialLogLevel=self._logLevel, logDir = self.myLogPath, 
                              logFileSize = self.myLogFileSize, logTotalSize = self.myLogTotalSize,
                              pearlConfigurationFilesFullName = self.myLogConfigurationFile, 
                              pearlConfigurationLoadPeriodInSeconds = self.myLogConfigurationLoadPeriod)
        self.myLogger = self._baseLogger("stats-msg")
        self.myLogger.info("Stats Script - Starting...")
        
        #The next code block checks that the q-shell of the sampled processes is responding. This solves the problem of sampling the q-shell too soon
        allProcesses = self.myProcessesDictionary.keys()
        processesWithQShell = getSysParamOrDefault(self.myStatsConfigurations, CFG_SECTION, CFG_PROCESSES_WITH_Q_SHELL, configurationsDefaults)
        # successes counts the q-shell processes that responded
        successes = 0
        hasQshellProcess = False
        try:
            for process_key in allProcesses:
                if not process_key in processesWithQShell:
                    continue

                hasQshellProcess = True

                # Given that line might take a lot of time to start, try a few times before giving up.
                maxRetries=20
                warnThreshold=3
                retryIndex=0
                processStarted=False
                while (not processStarted) and (retryIndex < maxRetries):
                    retryIndex += 1
                    self.myLogger.debug5("Waiting for process %s to start, attempt %s out of %s" % (process_key, retryIndex, maxRetries))
                    if self.myProcessesDictionary[process_key].waitForQshellRunningIndicationOrTimeout(process_key, datetime.datetime.now()+DATE_TIME_QSHELL_CHECK_INTERVAL):
                        processStarted = True # Terminate loop
                    else:
                        # Warn on every failed attempt from the warnThreshold-th one on
                        if retryIndex >= warnThreshold:
                            self.myLogger.warning("Process %s q-shell is not responding after %s attempts" % (process_key, retryIndex))

                if processStarted:
                    successes += 1
                    self.myLogger.notice("Process %s started, attempt %s out of %s" % (process_key, retryIndex, maxRetries))
                else:
                    self.myLogger.notice("Process %s q-shell is not responding" % process_key)
        except:
            self.myLogger.error("Unknown Exception in stats init - aborts. Exception info: %s" % (formatExceptionInfo()))
            return 69403
        
        #There is no logic for stats to run if there are no responding processes
        if (not allProcesses):
            self.myLogger.error("Stats initialized with zero processes - aborts")
            return 69401

        if (hasQshellProcess and successes == 0):
            self.myLogger.error("Stats initialized with zero processes running - aborts")
            return 69402
        
        # At this point, the line configuration file must exist.
        self.__readLineRunningCfg()

        #Create consumer design pattern Queue
        self.myJobsQueue = Queue()
        self.myLogger.debug1("StatsMgr queues created")
        
        #Load counters configurations 
        # NOTE(review): three values are unpacked here - confirm that every
        # return path of loadCountersConfigurations() yields a 3-tuple.
        (self.myCountersDBInfo, self.myRegexCounters, self.myCommChanDict) = self.loadCountersConfigurations()
        if not self.myCountersDBInfo and not self.myRegexCounters:
            #No counters were found? Nothing to do - die
            self.myLogger.error("StatsMgr zero counters were found in counter configurations file. File path: %s" % self.myCountersListFilePath)
            return 548731

        #Check that the properties names are unique.
        #We need to combine the 2 counter dictionaries to one.
        #Since the dict keys are derived from a config file section names, 
        #it is safe to assume that there will be no overlapping keys
        tempCountersDict = dict(self.myCountersDBInfo)
        tempCountersDict.update(self.myRegexCounters)
        errFlag = self.__validateProperties(tempCountersDict)
        if errFlag:
            return 787544

        self.myLogger.debug1("StatsMgr counters configuration loaded")
        self.myMaxJobsQueueSize = int(getSysParamOrDefault(self.myStatsConfigurations, CFG_SECTION, CFG_JOBS_QUEUE_MAX_SIZE, configurationsDefaults))

        self.myNumHistoryFiles = self.__calcNumHistoryFiles(self.myWritePeriod + self.myStartupGracePeriod)
        self.myLogger.notice("StatsMgr: Grace period started, keeping %s history files", self.myNumHistoryFiles)
        self.myAggrHistory = AggrHistory(self.myLogger)
        self.myAggrHistory.setMaxHistoryEntries(self.myNumHistoryFiles * 2)

        self.__initWorkers()

        # Now, the aggregator is already created. We want to tell him to load its prev-run files
        # Into the newly created aggrHistory object
        self.myAggregator.loadHistoryFiles()

        self.myRrdWriter = StatsRrdWriter()
        self.myRrdWriter.init(self.myLogger, self, self.myAggrHistory, self.myWritePeriod, self.myRrdUpdateBurstCount, 
                              self.myRrdUpdateBurstInterval, self.myStartupGracePeriod)
        self.myRrdWriter.start()

        self.myLogger.info("Stats Script - Launched")
        self.myLogger.closeLog()
        # Only reached when every step above succeeded
        self.statsInitOK = True
        return 0
Exemplo n.º 19
0
    def postStartInit(self):
        """Complete the stats manager start-up and launch the RRD writer.

        The steps run strictly in this order: optional redirect of the
        description DB output directory, captain and logger bring-up, polling
        of the q-shell of every configured process, loading and validating the
        counters configuration, creation of the history/worker objects and
        finally the start of the RRD writer.

        Returns:
            0 on success (self.statsInitOK is set to True), otherwise a numeric
            code identifying the failed step:
                8349   - the alternative output directory couldn't be created
                69403  - unexpected exception while polling the q-shells
                69401  - the processes dictionary is empty
                69402  - q-shell processes exist but none of them responded
                548731 - the counters configuration yielded no counters
                787544 - the counters' properties failed validation
        """
        # Debug hook that lets the description DB files be written elsewhere.
        # TODO(galm) - this is not relevant to your output, only to your inputs - please separate the output
        debugOutputPath = getSysParamOrDefault(self.myStatsConfigurations,
                                               CFG_SECTION,
                                               CFG_DEBUG_OUTPUT_PATH,
                                               configurationsDefaults)
        if debugOutputPath:
            self.myDescriptionDbFilesPath = debugOutputPath

            # The directory is created on demand.
            # NOTE(review): self.myLogger is used here although it is only
            # assigned further down in this method - presumably an earlier
            # init step already installed a logger; confirm.
            dirIsReady = self.__makeDirsIfNeeded(self.myDescriptionDbFilesPath)
            if not dirIsReady:
                self.myLogger.error(
                    "Couldn't create db's output directory - aborts. Dir path: %s"
                    % self.myDescriptionDbFilesPath)
                return 8349

        # TEMP till we have a captain
        self.captain = a.infra.process.captain.Captain()
        self.captain._initKickNumber(self.myKickNumber)
        a.infra.process.setGlobalCaptain(self.captain)
        # TEMP END

        # Logger bring-up: main logger -> base logger -> the "stats-msg" logger
        self._mainLogger = MainLogger(processName="Stats")
        loggerManager = self._mainLogger.getLoggerManager()
        self._baseLogger = loggerManager.createLogger("Stats", "Stats")
        self._mainLogger.initLoggerToUse(self._baseLogger)
        self._mainLogger.init(
            initialLogLevel=self._logLevel,
            logDir=self.myLogPath,
            logFileSize=self.myLogFileSize,
            logTotalSize=self.myLogTotalSize,
            pearlConfigurationFilesFullName=self.myLogConfigurationFile,
            pearlConfigurationLoadPeriodInSeconds=self.myLogConfigurationLoadPeriod)
        self.myLogger = self._baseLogger("stats-msg")
        self.myLogger.info("Stats Script - Starting...")

        # Verify that the q-shell of the sampled processes is responding.
        # This solves the problem of sampling the q-shell too soon.
        allProcesses = self.myProcessesDictionary.keys()
        processesWithQShell = getSysParamOrDefault(self.myStatsConfigurations,
                                                   CFG_SECTION,
                                                   CFG_PROCESSES_WITH_Q_SHELL,
                                                   configurationsDefaults)

        # Given that line might take a lot of time to start, every process
        # gets a few attempts before we give up on it.
        maxRetries = 20
        warnThreshold = 3

        respondingCount = 0
        sawQshellProcess = False
        try:
            for process_key in allProcesses:
                if process_key not in processesWithQShell:
                    continue

                sawQshellProcess = True

                attempt = 0
                isResponding = False
                for attempt in range(1, maxRetries + 1):
                    self.myLogger.debug5(
                        "Waiting for process %s to start, attempt %s out of %s"
                        % (process_key, attempt, maxRetries))
                    sampledProcess = self.myProcessesDictionary[process_key]
                    deadline = (datetime.datetime.now() +
                                DATE_TIME_QSHELL_CHECK_INTERVAL)
                    if sampledProcess.waitForQshellRunningIndicationOrTimeout(
                            process_key, deadline):
                        isResponding = True
                        break

                    # Stay quiet for the first few failures, then warn on each one
                    if attempt >= warnThreshold:
                        self.myLogger.warning(
                            "Process %s q-shell is not responding after %s attempts"
                            % (process_key, attempt))

                if isResponding:
                    respondingCount += 1
                    self.myLogger.notice(
                        "Process %s started, attempt %s out of %s" %
                        (process_key, attempt, maxRetries))
                else:
                    self.myLogger.notice(
                        "Process %s q-shell is not responding" % process_key)
        except:
            self.myLogger.error(
                "Unknown Exception in stats init - aborts. Exception info: %s"
                % (formatExceptionInfo()))
            return 69403

        # There is no logic for stats to run if there are no responding processes
        if not allProcesses:
            self.myLogger.error(
                "Stats initialized with zero processes - aborts")
            return 69401

        if sawQshellProcess and respondingCount == 0:
            self.myLogger.error(
                "Stats initialized with zero processes running - aborts")
            return 69402

        # At this point, the line configuration file must exist.
        self.__readLineRunningCfg()

        # Queue used to hand jobs over to the consumer side
        self.myJobsQueue = Queue()
        self.myLogger.debug1("StatsMgr queues created")

        # Load counters configurations
        loadedConfigurations = self.loadCountersConfigurations()
        (self.myCountersDBInfo, self.myRegexCounters,
         self.myCommChanDict) = loadedConfigurations
        if not (self.myCountersDBInfo or self.myRegexCounters):
            # No counters were found? Nothing to do - die
            self.myLogger.error(
                "StatsMgr zero counters were found in counter configurations file. File path: %s"
                % self.myCountersListFilePath)
            return 548731

        # The properties names must be unique, so both counter dictionaries
        # are validated together. The keys are derived from config file
        # section names, hence it is safe to assume they never overlap.
        combinedCounters = dict(self.myCountersDBInfo)
        combinedCounters.update(self.myRegexCounters)
        if self.__validateProperties(combinedCounters):
            return 787544

        self.myLogger.debug1("StatsMgr counters configuration loaded")
        configuredQueueSize = getSysParamOrDefault(self.myStatsConfigurations,
                                                   CFG_SECTION,
                                                   CFG_JOBS_QUEUE_MAX_SIZE,
                                                   configurationsDefaults)
        self.myMaxJobsQueueSize = int(configuredQueueSize)

        # The history has to span the write period plus the startup grace period
        historySpan = self.myWritePeriod + self.myStartupGracePeriod
        self.myNumHistoryFiles = self.__calcNumHistoryFiles(historySpan)
        self.myLogger.notice(
            "StatsMgr: Grace period started, keeping %s history files",
            self.myNumHistoryFiles)
        self.myAggrHistory = AggrHistory(self.myLogger)
        self.myAggrHistory.setMaxHistoryEntries(self.myNumHistoryFiles * 2)

        self.__initWorkers()

        # The aggregator exists by now - tell it to load the files of the
        # previous run into the newly created aggrHistory object
        self.myAggregator.loadHistoryFiles()

        self.myRrdWriter = StatsRrdWriter()
        self.myRrdWriter.init(self.myLogger, self, self.myAggrHistory,
                              self.myWritePeriod, self.myRrdUpdateBurstCount,
                              self.myRrdUpdateBurstInterval,
                              self.myStartupGracePeriod)
        self.myRrdWriter.start()

        self.myLogger.info("Stats Script - Launched")
        self.myLogger.closeLog()
        self.statsInitOK = True
        return 0
Exemplo n.º 20
0
    def getConfigurationsFromDB(self):
        """Rebuild the counter descriptors stored in the previous run's sqlite file.

        Only the data relevant for the configuration is loaded, i.e. the
        counterUISettings table is not read.

        Returns:
            {}   - when no previous sqlite file exists.
            None - when more than one sqlite file is found, when connecting to
                   the DB fails, or when any query / row handling raises.
            dict - otherwise; maps the first column of every counterID row to
                   a populated CounterDescriptor.
        """
        previousDbFiles = self.getPreviousDbPath()
        if len(previousDbFiles) == 0:
            self.myLogger.debug2(
                "No sqlite files found - no previous configurations")
            return {}

        if len(previousDbFiles) > 1:
            self.myLogger.error(
                "Too many sqlite files in the output directory. There should be one or none"
            )
            return None

        # Exactly one file was found - this is the file to work with
        self.myFileName = os.path.join(self.myDbFilesOutputPath,
                                       previousDbFiles[0])

        if not self.connectToDb():
            self.myLogger.error("SQLite Connect to DB failed")
            return None
        self.myLogger.debug2("SQLiteDescriptorsDB connected")

        descriptorsById = {}
        # Always holds the statement being processed, so that the error log
        # below can tell which one failed
        query = ""
        try:
            # Schema dump - for the log only
            query = 'SELECT * FROM sqlite_master'
            self.myCur.execute(query)
            schemaRows = self.myCur.fetchall()
            self.myLogger.debug2("sqlite_master query = %s" % str(schemaRows))

            # One descriptor object per counter
            query = 'SELECT * FROM counterID'
            self.myCur.execute(query)
            for counterRow in self.myCur.fetchall():
                isRateValue = counterRow[6] > 0  # booleans represented as integers
                descriptor = CounterDescriptor()
                descriptor.init(-1, counterRow[1], counterRow[3],
                                counterRow[2], counterRow[5], counterRow[4],
                                isRateValue, "", [], [], counterRow[7],
                                counterRow[8], False, "", None)
                descriptorsById[counterRow[0]] = descriptor

            # Descriptions (a NULL description is stored as an empty string)
            query = 'SELECT * FROM counterDescriptions'
            self.myCur.execute(query)
            for descriptionRow in self.myCur.fetchall():
                shortDescription = descriptionRow[1]
                if shortDescription is None:
                    shortDescription = ""
                descriptor = descriptorsById[descriptionRow[0]]
                descriptor.myCounterShortDescriptionString = shortDescription
                descriptor.myCounterDescriptionIsOverride = descriptionRow[2]

            # Sampling rate
            query = 'SELECT * FROM counterSamplingRate'
            self.myCur.execute(query)
            for samplingRow in self.myCur.fetchall():
                descriptorsById[samplingRow[0]].mySamplingRate = samplingRow[1]

            # The generic 'properties' table ('SELECT * FROM properties') is
            # disabled on purpose; the properties are rebuilt from the typed
            # value tables below instead.

            # Properties' values - integers first, then strings
            typedPropertyTables = (
                ('SELECT * FROM counterPropertiesInt', VariableTypes.INTEGER),
                ('SELECT * FROM counterPropertiesString', VariableTypes.STRING),
            )
            for query, valueType in typedPropertyTables:
                self.myCur.execute(query)
                for propertyRow in self.myCur.fetchall():
                    ownerProperties = descriptorsById[propertyRow[0]].myProperties
                    counterProperty = CounterProperty()
                    counterProperty.init(propertyRow[1], valueType,
                                         propertyRow[2])
                    ownerProperties.append(counterProperty)

            # Archaives
            query = 'SELECT * FROM counterArchaives'
            self.myCur.execute(query)
            for archaiveRow in self.myCur.fetchall():
                dataSetArchaive = DataSetArchaive()
                dataSetArchaive.init(archaiveRow[1], archaiveRow[2],
                                     archaiveRow[3], archaiveRow[4])
                descriptorsById[archaiveRow[0]].myArchaives.append(
                    dataSetArchaive)

            self.closeDBConnection()

            return descriptorsById
        except:
            # Anything that went wrong above invalidates the whole result
            self.closeDBConnection()
            self.myLogger.error(
                "UNExpected exception was raised during sqlite query execution. Query: %s. Exception info: %s"
                % (query, formatExceptionInfo()))
            return None
Exemplo n.º 21
0
    def run(self):
        """Sampler thread body: sample the counters on a 60 second cadence.

        On the first call self.myNextSampleTime is initialized to "now". When
        the method is re-entered (the previous run quit because counters were
        discovered or deleted) it first sleeps until the next sample slot.

        Every cycle samples the values and, once self.myNextDiscoveryTime has
        passed, runs the regex counters discovery. A discovery that finds new
        or deleted counters clears self.myRunFlag so that the loop exits.
        Any exception is logged and ends the loop; self.myRunFlag is always
        False when the method returns.
        """
        self.myLogger.notice("Stats Sampler running")

        if self.myNextSampleTime is not None:
            # We quit this thread earlier due to discovering new counters, so
            # sleep here until the next sample slot, skipping missed slots
            resumeTime = time.time()
            while True:
                initialDelay = self.myNextSampleTime - resumeTime
                if not initialDelay < 0:
                    break
                self.myNextSampleTime += 60

            self.myLogger.debug1("%d Compensation sleep %d sec (before loop)" %
                                 (int(resumeTime), int(initialDelay)))
            time.sleep(initialDelay)
        else:
            # One time initialization
            self.myNextSampleTime = time.time()

        try:
            while self.myRunFlag:

                # Sample counters
                cycleStart = time.time()
                self.myLogger.debug1("%d Start sampling counters" %
                                     int(cycleStart))
                self.__sampleValues()

                # Discover new regex counters once the discovery time has passed
                if self.myNextDiscoveryTime < time.time():
                    self.myLogger.debug1("%d Start discover counters" %
                                         int(time.time()))
                    discoveryResult = self.__discoverRegexCounters()
                    (self.myNewCounters,
                     self.myDeletedCounters) = discoveryResult
                    self.myNextDiscoveryTime = (
                        time.time() + REGEX_COUNTERS_DISCOVERY_INTERVAL)
                    if self.myNewCounters or self.myDeletedCounters:
                        # A change in the counters set means we exit the loop
                        # and get re-initialized
                        self.myRunFlag = False

                cycleEnd = time.time()
                self.myLogger.debug1(
                    "%d Finish sample/discover cycle. Elasped %d sec" %
                    (int(cycleEnd), int(cycleEnd - cycleStart)))

                # Advance to the next slot that is not already in the past
                while True:
                    self.myNextSampleTime += 60
                    pause = self.myNextSampleTime - cycleEnd
                    if not pause < 0:
                        break

                if self.myRunFlag:
                    # If we need to quit we will sleep when we come back
                    self.myLogger.debug1("%d Sleep %d sec" %
                                         (int(cycleEnd), int(pause)))
                    time.sleep(pause)
        except:
            # Top level of the thread: log whatever happened and fall through
            self.myLogger.error(
                "Unexpected exception on Stats Sampler. Exception info: %s" %
                formatExceptionInfo())

        # A run flag that is still raised means the loop was left by an exception
        endMessage = ("Stats Sampler thread ended enexpectedly"
                      if self.myRunFlag else "Stats Sampler thread ended")
        self.myLogger.notice(endMessage)

        self.myRunFlag = False
Exemplo n.º 22
0
    def run(self):
        """Aggregator thread body: drain the jobs queue into the aggregation history.

        Connects the descriptions DB and then, while self.myRunFlag is set,
        dequeues jobs. A None job is only an unblock message and is skipped.
        For a VALUES job every counter value is pushed into self.myAggrHistory
        in bursts of at most self.myUpdateBurstCount updates. The history lock
        is held only for the duration of a burst, and between bursts the
        thread sleeps for whatever is left of self.myUpdateBurstInterval.
        The formatted lines of the job are then saved via saveHistoryFile().

        Any exception is logged and ends the loop. Both DB connections are
        closed before the method returns and self.myRunFlag is left False.
        """
        self.myLogger.notice("Stats Aggregator running")
        # Limits how often the "burst took too long" warning is emitted
        # (presumably 3600 is a period in seconds - confirm against
        # EventReportLimiter)
        reportLimiter = EventReportLimiter(3600)

        try:
            self.myRunFlag = True
            self.myDescriptionsDB.connectToDb()

            while self.myRunFlag:

                self.myLogger.debug2(
                    "StatsAggregator blocking in order to dequeue. Time = %d" %
                    time.time())

                job = self.__dequeue()
                if job is None:
                    self.myLogger.debug1(
                        "StatsAggregator dequeued a job from jobs queue. Message is None -> unblock message"
                    )
                    continue

                self.myLogger.debug2(
                    "StatsAggregator message dequeued successfully. Time = %d"
                    % time.time())

                startTime = time.time()

                # Reset per job: the summary log at the end of the iteration
                # reads this for every job type, so it must never be unbound
                # or carry the count of a previous job
                numCounters = 0

                # If this is a sample result job (The only legal message for v1.0)
                if (job.quack() ==
                        stats_queue_objects.AggregationQueueJobType.VALUES):
                    self.myLogger.debug2("StatsAggregator received values job")

                    # Iterate on all received counters and update global aggr dictionary.
                    # Respect counter burst params

                    historyLines = []  # Joined and saved to a history file

                    counterIdx = 0
                    numCounters = len(job.myCountersArray)
                    while counterIdx < numCounters:
                        burstStartTime = time.time()
                        self.myAggrHistory.lock()
                        try:
                            i = 0
                            self.myLogger.debug3("Start burts")
                            while (i < self.myUpdateBurstCount
                                   and counterIdx < numCounters):
                                counterVal = job.myCountersArray[counterIdx]
                                self.myLogger.debug5(
                                    "Updating couner id %s values (%s, %s)",
                                    counterVal.myCounterId, counterVal.myValue,
                                    counterVal.myTimeStamp)
                                valueType = type(counterVal.myValue).__name__
                                self.myAggrHistory.update(
                                    counterVal.myCounterId, counterVal.myValue,
                                    valueType, counterVal.myTimeStamp)

                                historyLines.append(self.__formatHistoryLine(
                                    counterVal.myCounterId, counterVal.myValue,
                                    counterVal.myTimeStamp, valueType))

                                i += 1
                                counterIdx += 1
                        finally:
                            # Never leave the history locked, even when an
                            # update raises in the middle of a burst
                            self.myAggrHistory.unlock()
                        burstTime = time.time() - burstStartTime
                        self.myLogger.debug3("End burts")

                        # Warn about a long burst, or count it for the next warning
                        maxAllowedBurstTime = self.myUpdateBurstInterval * 0.75
                        if burstTime > maxAllowedBurstTime:
                            (shouldReport,
                             numEvents) = reportLimiter.shouldReport()
                            if shouldReport:
                                self.myLogger.warning(
                                    "Aggregator update burst took too long: %s seconds, max allowed is %s. (Occured %s times since last reported)",
                                    burstTime, maxAllowedBurstTime, numEvents)

                        # If we have more counters, sleep out the rest of the interval
                        if counterIdx < numCounters:
                            remainingTime = self.myUpdateBurstInterval - burstTime
                            if remainingTime > 0:
                                time.sleep(remainingTime)

                    self.saveHistoryFile("".join(historyLines))

                insertTimeSpan = time.time() - startTime
                if insertTimeSpan > MAX_INSERT_TO_DB_TIME_SPAN:
                    self.myLogger.warning(
                        "Execute job exceeds the time limit. Limit is %d sec. Elapsed time is %s sec, numCounters=%s. Queue size is: %d",
                        MAX_INSERT_TO_DB_TIME_SPAN, insertTimeSpan,
                        numCounters, self.myJobsQueue._qsize())
                else:
                    self.myLogger.notice(
                        "Execute job. Elapsed time is %s sec, numCounters=%s. Queue size is: %d",
                        insertTimeSpan, numCounters, self.myJobsQueue._qsize())

            if self.myRunFlag:
                self.myLogger.notice(
                    "Stats Aggregator thread ended enexpectedly")
            else:
                self.myLogger.notice("Stats Aggregator thread ended")

            self.myRunFlag = False

        except:
            # Top level of the thread: whatever was raised, the loop has
            # ended and the thread is shutting down
            self.myRunFlag = False
            self.myLogger.error(
                "UNExpected exception on Stats Aggregator. Exception info: %s"
                % stats_queue_objects.formatExceptionInfo())

        # When the while loop ended = shutdown - close the DBs
        self.myValuesDB.closeDBConnection()
        self.myDescriptionsDB.closeDBConnection()
        self.myLogger.debug2("StatsAggregator thread ended")
Exemplo n.º 23
0
    def loadCountersConfigurations(self):
        """Parse the counters configuration file and deserialize it.

        Reads self.myCountersListFilePath with a ConfigParser and hands the
        parser to __createDBInfoObject, whose result is returned as is.

        Returns:
            (None, None) when the file could not be read or parsed (the
            failure is logged), otherwise whatever __createDBInfoObject
            returns.

        NOTE(review): a postStartInit variant in this code base unpacks three
        values from this method - confirm that the arity of the failure tuple
        matches what __createDBInfoObject returns.
        """
        countersConfig = ConfigParser()
        try:
            filesRead = countersConfig.read(self.myCountersListFilePath)
        except Exception:
            # Raised for malformed content (parsing errors)
            self.myLogger.error("Couldn't open counters configuration file. Path: %s. Exception details: %s" % (self.myCountersListFilePath, formatExceptionInfo()))
            return (None, None)

        # ConfigParser.read() does not raise for a missing or unreadable file;
        # it silently skips it and reports only the files it did parse.
        if not filesRead:
            self.myLogger.error("Couldn't open counters configuration file. Path: %s. Exception details: %s" % (self.myCountersListFilePath, "file is missing or unreadable"))
            return (None, None)

        #deserialize
        return self.__createDBInfoObject(countersConfig)
Exemplo n.º 24
0
    def start(self):
        """Initialize the stats manager, start its threads and supervise them.

        After postStartInit() the aggregator and the pacer threads are started
        and watched in an endless loop:
          * when the pacer stopped because it needs an update (new or deleted
            regex counters), the aggregator is asked to end, the counters are
            reloaded, the workers are re-created and both threads restarted;
          * when either thread stopped for any other reason, the other one is
            ended as well and the method returns.

        Returns:
            2112     - self.statsInitOK is not set after postStartInit()
            10101010 - one of the threads died and the module shut down
            8873     - an exception was raised in the supervising loop
            0        - normal end (not reached by the current loop, which
                       only exits through one of the returns above)
        """
        self.postStartInit()
        self.myLogger.info("Stats mgr - thread started")

        if not self.statsInitOK:
            self.myLogger.error("StatsMgr starting before proper initialization - crashing")
            return 2112

        try:
            self.myLogger.debug1("StatsMgr starting all threads")
            # Set logger must be called since Oscar launches Stats as a process with 'fork'. If the counter is not
            # created again in the child process then a collision will occur. The logger is thread-safe BUT NOT process-safe
            self.myAggregator.start()
            self.myPacer.start()
            self.myLogger.debug1("StatsMgr threads started")

            # This loop makes sure that if a thread dies all the module dies with it

            reloadPending = False

            while True:

                mustAbort = False

                if not self.myAggregator.isRunning():
                    if reloadPending:
                        # The aggregator stopped on our request:
                        # rebuild the database and restart the threads
                        addedCounters, removedCounters = self.myPacer.getNewAndDeletedCounters()
                        if not (addedCounters or removedCounters):
                            self.myLogger.error("Asked to load new counters but list is empty")
                        else:
                            self.myLogger.notice("Stats reloading new counters (%d new. %d deleted)" % (len(addedCounters), len(removedCounters)))
                            self.myCountersDBInfo.update(addedCounters)
                            for removedKey in removedCounters:
                                del self.myCountersDBInfo[removedKey]
                            self.__initWorkers()
                            self.myAggregator.start()
                            self.myPacer.start()
                            self.myLogger.notice("Stats reloaded with %d new counters", len(addedCounters))

                        reloadPending = False
                    else:
                        # Aggregator really died
                        self.myLogger.error("Stats aggregator thread died - script aborts")
                        mustAbort = True
                        # Kill other thread
                        self.myPacer.end()

                if not (self.myPacer.isRunning() or mustAbort):
                    # Nothing to decide while a reload is already pending
                    if not reloadPending:
                        if self.myPacer.needUpdate():
                            # Pacer discovered new regex counters and need to re-initialize
                            self.myLogger.notice("Stats discovered new counters - Ask to reload")
                            self.myAggregator.end()
                            reloadPending = True
                        else:
                            # Pacer really died
                            mustAbort = True
                            self.myLogger.error("Stats sampler-pacer thread died - Script aborts")
                            # Kill other thread
                            self.myAggregator.end()

                if not reloadPending:
                    # We want the script to die if one of its threads dies but it is not urgent
                    time.sleep(3)

                # Shut down
                if mustAbort:
                    self.myLogger.notice("Stats main thread finished and is shutting down")
                    return 10101010
        except:
            # Whatever was raised, take all the worker threads down with us
            self.myLogger.notice("Exception in main thread. Exception details: %s" % (formatExceptionInfo()))
            self.myLogger.debug2("Killing sampler-pacer thread")
            self.myPacer.end()
            self.myLogger.debug2("Killing aggregation thread")
            self.myAggregator.end()
            self.myLogger.debug2("Killing rrd writer thread")
            self.myRrdWriter.end()
            return 8873

        self.myLogger.notice("Stats main thread finished and is shutting down")
        return 0