def execute(sqlQuery):
    """
    Simulate the billing SELECT against the in-memory test table.

    The start date, end date and row limit are recovered from the query
    text with the module-level ``reExp`` pattern (groups 1, 2 and 3).  The
    simulated table is generated on first use and kept in the module-level
    ``sqlTableContent``.

    @param sqlQuery: Text of the query; newlines are replaced by spaces
       before it is matched.
    @return: List of copies of the records whose 'tm' value lies in
       [start, end).  Collection stops as soon as the number of gathered
       records equals the limit found in the query.
    @raise Exception: If the query text does not match ``reExp``.
    """
    global sqlTableContent
    if sqlTableContent is None:
        sqlTableContent = generateTableContent()

    sqlQuery = sqlQuery.replace('\n', ' ')
    dateMatch = reExp.match(sqlQuery)
    if dateMatch is None:
        raise Exception("Misspecified query argument:" + sqlQuery)

    startDate = DateStrToSecs(dateMatch.group(1))
    endDate = DateStrToSecs(dateMatch.group(2))
    limit = int(dateMatch.group(3))

    results = []
    for record in sqlTableContent:
        if startDate <= record['tm'] < endDate:
            # Hand out a copy so callers cannot alter the cached table.
            results.append(record.copy())
        if len(results) == limit:
            break

    # Only a non-empty answer is reported to the test container.
    if results:
        TestContainer.sendInterrupt(300)
    return results
def _processDBRow(self, row):
    """
    Handle one billing DB row from start to finish.

    The row is turned into a usage record and handed to Gratia.Send; the
    textual response decides what happens next.  Rows for which
    _skipIntraSiteXfer(row) is true are not sent at all.  In self-test
    mode the row is handed to TestContainer.processRow instead.

    A "too many pending files" response makes the probe sleep for 15
    minutes; this method does not resend the record itself.  Any other
    response starting with 'Fatal Error' or 'Internal Error' terminates
    the probe with exit status 2.

    @param row: Mapping with (at least) 'njobs', 'datestamp' and
       'transaction' entries.
    @return: row['njobs'], regardless of whether the record was accepted;
       in self-test mode, whatever TestContainer.processRow returns.
    """
    # Intra-site transfers are dropped when we are told not to send them.
    if self._skipIntraSiteXfer(row):
        return row['njobs']

    # Self-test: the test container takes over the row.
    if TestContainer.isTest():
        if self._summarize:
            TestContainer.sendInterrupt(15)
        return TestContainer.processRow(row, self._log)

    record = self._convertBillingInfoToGratiaUsageRecord(row)

    # Hand the record to Gratia and look at the answer.
    reply = Gratia.Send(record)
    description = "Record: %s, %s, njobs %i" % (
        str(row['datestamp']), row['transaction'], row['njobs'])

    if reply == "Fatal Error: too many pending files":
        # The server is not accepting records right now and Gratia.py
        # could not store this one either; back off for a long while.
        nap = 15 * 60
        self._log.error("Error sending : too many pending files")
        self._log.warn("sleeping for = %i seconds." % nap)
        sleep_check(nap, self._stopFileName)
    else:
        unrecoverable = (reply.startswith('Fatal Error')
                         or reply.startswith('Internal Error'))
        if unrecoverable:
            self._log.critical(
                'error sending ' + description + '\ngot response ' + reply)
            sys.exit(2)

    self._log.debug('sent ' + description)

    # Anything that is not an OK answer is reported as an error.
    if not reply.startswith('OK'):
        self._log.error(
            'error sending ' + description + '\ngot response ' + reply)

    return row['njobs']
def _processDBRow(self, row):
    """
    Handle one billing DB row from start to finish.

    The row is turned into a usage record and handed to Gratia.Send; the
    textual response decides what happens next.  Rows for which
    _skipIntraSiteXfer(row) is true are not sent at all.  In self-test
    mode the row is handed to TestContainer.processRow instead.

    A "too many pending files" response makes the probe sleep for 15
    minutes; this method does not resend the record itself.  Any other
    response starting with 'Fatal Error' or 'Internal Error' terminates
    the probe with exit status 2.

    @param row: Mapping with (at least) 'njobs', 'datestamp' and
       'transaction' entries.
    @return: row['njobs'], regardless of whether the record was accepted;
       in self-test mode, whatever TestContainer.processRow returns.
    """
    # Intra-site transfers are dropped when we are told not to send them.
    if self._skipIntraSiteXfer(row):
        return row['njobs']

    # Self-test: the test container takes over the row.
    if TestContainer.isTest():
        if self._summarize:
            TestContainer.sendInterrupt(15)
        return TestContainer.processRow(row, self._log)

    record = self._convertBillingInfoToGratiaUsageRecord(row)

    # Hand the record to Gratia and look at the answer.
    reply = Gratia.Send(record)
    description = "Record: %s, %s, njobs %i" % (
        str(row['datestamp']), row['transaction'], row['njobs'])

    if reply == "Fatal Error: too many pending files":
        # The server is not accepting records right now and Gratia.py
        # could not store this one either; back off for a long while.
        nap = 15 * 60
        self._log.error("Error sending : too many pending files")
        self._log.warn("sleeping for = %i seconds." % nap)
        sleep_check(nap, self._stopFileName)
    else:
        unrecoverable = (reply.startswith('Fatal Error')
                         or reply.startswith('Internal Error'))
        if unrecoverable:
            self._log.critical(
                'error sending ' + description + '\ngot response ' + reply)
            sys.exit(2)

    self._log.debug('sent ' + description)

    # Anything that is not an OK answer is reported as an error.
    if not reply.startswith('OK'):
        self._log.error(
            'error sending ' + description + '\ngot response ' + reply)

    return row['njobs']
def sendBillingInfoRecordsToGratia(self):
    """
    This is the public method for starting the dCache-transfer reporting.

    This will query records no more than _maxAge days old; that lower
    bound is truncated to an hour boundary (i.e., 1:00:00 not 1:02:00).
    The actual start is the later of that bound and the last checkpoint.

    This will continue to query until the next interval would start less
    than 75 minutes ago (in self-test mode: at the end time supplied by
    TestContainer), or until the stop file appears, then return.

    The interval boundaries come from _determineNextEndtime; _execute may
    end an interval earlier than requested when it meets a lot of data.

    If not summarizing: this method uses _execute to get all the data for
       a given interval, then uses _processResults to send them to Gratia.
       Once the query for a time interval is done, then we immediately
       checkpoint.

    If summarizing: this method keeps collapsing the rows it collects
       with Collapse.collapse until the next interval would end after the
       pending summary time.  At that point it sends the summaries up to
       Gratia and checkpoints at the summary time.

    Side effects: updates the billing checkpoint, may reset self._range
    to STARTING_RANGE, and closes the DB cursor and connection when the
    stop file is found.
    """
    self._log.debug("sendBillingInfoRecordsToGratia")

    # Query no more than a set number of days in the past
    # (timedelta's first positional argument is days, the second seconds).
    minTime = datetime.datetime.now() - datetime.timedelta(self._maxAge, 0)
    # Truncate the lower bound to the start of its hour.
    minTime = datetime.datetime(minTime.year, minTime.month, minTime.day, minTime.hour, 0, 0)

    # The latest allowed record is 75 minutes in the past, in order to make
    # sure we only query complete intervals
    latestAllowed = datetime.datetime.now() - datetime.timedelta(0, 75*60)

    # The self test supplies its own upper bound.
    if ( TestContainer.isTest() ):
        latestAllowed = TestContainer.getEndDateTime()

    # Start with either the last checkpoint or _maxAge days ago, whichever
    # is more recent.
    starttime = max(self._BIcheckpoint.lastDateStamp(), minTime)
    self._log.info("Starting queries at time %s." % starttime)

    dictRecordAgg = TimeBinRange.DictRecordAggregator(DCACHE_AGG_FIELDS, DCACHE_SUM_FIELDS)

    nextSummary = self._determineNextEndtime(starttime, summary=True)
    if self._summarize:
        self._log.debug("Next summary send time: %s." % nextSummary)

    # Rows gathered but not yet sent, and the raw row count behind them.
    results = []
    endtime = self._determineNextEndtime(starttime)
    totalRecords = 0
    # Loop until we have caught up to latestAllowed.
    while starttime < latestAllowed:
        assert starttime < endtime
        self._log.debug('sendBillingInfoRecordsToGratia: Processing ' \
            'starting at %s.' % starttime)

        # _execute returns the time up to which its rows are complete; it
        # is asserted below to be later than starttime, so every pass
        # moves forward.
        next_starttime, rows = self._execute(starttime, endtime, self._maxSelect)

        results += rows
        totalRecords += len(rows)
        if self._summarize:
            # Summarize the partial results
            results = Collapse.collapse(results, dictRecordAgg)
        assert next_starttime > starttime
        next_endtime = self._determineNextEndtime(next_starttime)

        # If we're not summarizing, we send up records each loop.
        if (not self._summarize) and results:
            totalRecords = 0
            # We now have all the rows we want; process them
            # NOTE(review): the pending checkpoint is stamped with the
            # requested endtime, not with next_starttime.  _execute's
            # docstring says the returned time need not equal endtime --
            # confirm a restart after this commit cannot skip the records
            # between next_starttime and endtime.
            self._BIcheckpoint.createPending(endtime, '')
            self._processResults(results)
            self._BIcheckpoint.commit()
            # A previously shrunk window is widened again once the batch
            # is well below the select limit.
            if (self._range < STARTING_RANGE and len(results)*4 < \
                self._maxSelect):
                self._range = STARTING_RANGE
            results = []
        # If we are summarizing, send records only once the next interval
        # would end after the pending summary time
        elif (next_endtime > nextSummary) and results:
            # Number of raw rows that were merged away by the collapse.
            num_agg = totalRecords - len(results)
            if num_agg:
                factor = float(totalRecords)/float(len(results))
                self._log.info("Aggregated %i of %i records for time " \
                    "interval ending in %s. %.1fx reduction." % \
                    (num_agg, totalRecords, nextSummary, factor))
            else:
                self._log.debug("Unable to aggregate any of %i records" \
                    % totalRecords)
            totalRecords = 0
            self._BIcheckpoint.createPending(nextSummary, '')
            self._processResults(results)
            self._BIcheckpoint.commit()
            results = []
            self._range = STARTING_RANGE

        nextSummary = self._determineNextEndtime(next_starttime, summary=True)

        endtime = next_endtime
        starttime = next_starttime

        # Check to see if the stop file has been created. If so, break
        if os.path.exists(self._stopFileName):
            #Neha - 03/17/2011
            #Don't need to commit anything since we are only doing select and no inserts or updates
            # NOTE(review): confirm _cur and _connection exist when
            # running under the self test.
            self._cur.close()
            self._connection.close()
            break
def _execute(self, starttime, endtime, maxSelect):
    """
    Execute the select command against the Billing DB and return the
    rows found, together with the time up to which they are complete.

    Note on the time returned as the first part of the tuple.  We
    guarantee two things:
       a) the returned time is strictly greater than starttime;
       b) *all* records in the interval [starttime, returned time) are
          returned.
    We do not guarantee that the returned time equals the endtime
    parameter.  It is therefore suitable as the start time of the next
    select query.

    When the query returns exactly maxSelect rows there may be more rows
    than were fetched, so the interval is shortened (to 60, 30, 15, 5 and
    finally 1 second) and queried again.  Once the interval cannot be
    shortened any further, the row limit is doubled instead.

    Side effect: self._range is set to the shortened interval length
    whenever the interval is reduced.

    @param starttime: Datetime object for the start of the query interval.
    @param endtime: Datetime object for the end of the query interval.
    @param maxSelect: The maximum number of rows to select.
    @return: Tuple (time, rows).  rows is a list of dicts, or the empty
       query result when nothing matched.
    @raise Exception: If maxSelect exceeds MAX_SELECT while the interval
       is no longer than MIN_RANGE seconds, or if the query takes longer
       than MAX_QUERY_TIME_SECS.
    """
    assert starttime < endtime

    # Full length of the requested window in seconds.  timedelta.seconds
    # on its own leaves out the days component (a 24 hour window would
    # read as 0 seconds), so fold the days in before using the value.
    span = endtime - starttime
    interval = span.days * 86400 + span.seconds

    if (maxSelect > MAX_SELECT) and (interval <= MIN_RANGE):
        raise Exception("Fatal error - more than %i transfers in %i" \
            " second(s)." % (MAX_SELECT, interval))

    datestr = str(starttime)
    datestr_end = str(endtime)

    # Query the database.  If it takes more than MAX_QUERY_TIME_SECS, then
    # have the probe self-destruct.
    # The values interpolated here are the two interval bounds and the
    # integer row limit supplied by the caller.
    query = BILLINGDB_SELECT_CMD % (
        (datestr, datestr_end, datestr, datestr_end, maxSelect))
    self._log.debug('_sendToGratia: will execute ' + query)
    select_time = -time.time()
    if not TestContainer.isTest():
        self._cur.execute(query)
        result = self._cur.fetchall()
    else:
        # Self test: the simulator answers instead of the database.
        result = BillingRecSimulator.execute(query)
    select_time += time.time()
    if select_time > MAX_QUERY_TIME_SECS:
        raise Exception("Postgres query took %i seconds, more than " \
            "the maximum allowable of %i; this is a sign the DB is " \
            "not properly optimized!" % (int(select_time),
            MAX_QUERY_TIME_SECS))
    self._log.debug("BillingDB query finished in %.02f seconds and " \
        "returned %i records." % (select_time, len(result)))

    if not result:
        self._log.debug("No results from %s to %s." % (starttime, endtime))
        return endtime, result

    # dCache sometimes returns a negative transfer size; when this happens,
    # it also tosses up a complete garbage duration
    filtered_result = []
    for row in result:
        row = dict(row)
        if row['transfersize'] < 0:
            row['transfersize'] = 0
            row['connectiontime'] = 0
        filtered_result.append(row)
    result = filtered_result

    # Fewer rows than the limit: the interval was read completely.
    if len(result) != maxSelect:
        return endtime, result

    # We hit our limit, so there is no telling how many more records the
    # interval holds; we must re-query with a smaller interval or a higher
    # limit on the select.
    #
    # The shortened interval is chosen so that we always end up on a
    # minute boundary (eventually): each step divides what is left of the
    # interval to the next minute.  I.e. the transitions are:
    #   60s -> 30s
    #   30s -> 15s (which can only happen at :30s)
    #   15s -> 5s  (which can only happen at :15s :30s or :45s)
    #   5s  -> 1s
    if interval > 60:
        new_interval = 60
    elif interval > 30:
        new_interval = 30
    elif interval > 15:
        new_interval = 15
    elif interval > 5:
        new_interval = 5
    else:
        new_interval = 1

    new_endtime = starttime + datetime.timedelta(0, new_interval)

    # Guard against the DST jump by making sure new_endtime > starttime.
    if (interval == new_interval) or (new_endtime <= starttime):
        # The interval cannot shrink any further: raise the limit.
        self._log.warning("Limit hit; increasing from %i to %i." % \
            (maxSelect, maxSelect*2))
        endtime, result = self._execute(starttime, endtime, maxSelect*2)
    else:
        self._log.warning("Limit hit; decreasing time interval from %i" \
            " to %i." % (interval, new_interval))
        self._range = new_interval
        endtime, result = self._execute(starttime, new_endtime, maxSelect)

    assert endtime > starttime
    return endtime, result
def __init__(self, configuration, chkptdir=None):
    """
    Set up the aggregator from the probe configuration.

    This picks up the 'DCacheAggregator' logger, loads the gid-to-group
    map when the gid list file exists, reads the probe settings, creates
    the billing checkpoint and the send alarm, and (outside of the self
    test) connects to the dCache billing database with psycopg2.

    @param configuration: Probe configuration object providing the get_*
       accessors used below.
    @param chkptdir: Optional directory for the checkpoint file.  When
       None, the bare checkpoint file name is used.
    @raise: Whatever the database connection attempt raises; the failure
       is logged first.
    """
    # Pick up the logger
    self._log = logging.getLogger('DCacheAggregator')

    # Fermilab dCache billing node doesn't support user to uid mapping in
    # /etc/passwd; instead there is a gid list file that contains the gid
    # to group mapping.  The group should be present in the user-vo-map
    # file to be mapped correctly.
    self.__gid_file_mod_time = int(time.time())
    self.__group_map = {}
    self._unix_gid_list_file_name = configuration.get_UnixGidListFileName()
    if os.path.exists(self._unix_gid_list_file_name):
        self.__gid_file_mod_time = os.stat(
            self._unix_gid_list_file_name).st_mtime
        self.__refresh_group_map()

    # Neha - 03/17/2011
    # Using psycopg2 instead of sqlalchemy
    dbName = configuration.get_DBName()
    DBurl = 'dbname=%s user=%s ' % (dbName,
                                    configuration.get_DBLoginName())
    DBurl += 'password=%s ' % (configuration.get_DBPassword())
    dbHost = configuration.get_DBHostName()
    DBurl += 'host=%s' % (dbHost)

    self._skipIntraSite = configuration.get_OnlySendInterSiteTransfers()
    self._stopFileName = configuration.get_StopFileName()
    self._dCacheSvrHost = configuration.get_DCacheServerHost()

    # Create the billinginfo database checkpoint.
    self._maxAge = configuration.get_MaxBillingHistoryDays()
    if TestContainer.isTest():
        # The self test dictates how far back to look.
        self._maxAge = TestContainer.getMaxAge()

    billinginfoChkpt = 'chkpt_dcache_xfer_DoNotDelete'
    if chkptdir is not None:
        billinginfoChkpt = os.path.join(chkptdir, billinginfoChkpt)
    self._BIcheckpoint = Checkpoint(billinginfoChkpt, self._maxAge)

    self._sendAlarm = Alarm(
        configuration.get_EmailServerHost(),
        configuration.get_EmailFromAddress(),
        configuration.get_EmailToList(),
        'dCacheTransfer probe aggregator alarm',
        'The dCache Transfer Probe was not able to send to Gratia.',
        2,     # If more than two errors have occurred
        1800,  # Max of once per half hour complaining
        True)

    self._summarize = configuration.get_Summarize()

    # Connect to the dCache postgres database.
    # Neha: 03/17/2011 - Removing sqlalchemy. Using psycopg2 instead
    try:
        if TestContainer.isTest():
            # No database is contacted during the self test.
            self._db = None
        else:
            self._connection = psycopg2.connect(DBurl)
            self._cur = self._connection.cursor(
                cursor_factory=psycopg2.extras.DictCursor)
    except:
        # Log and re-raise.  The connection string contains the database
        # password, so only the database name and host are written to
        # the log.
        errmsg = 'Failed to connect to database %s on host %s\n\n%s' % (
            dbName, dbHost, traceback.format_exc())
        self._log.error(errmsg)
        raise

    self._grid = configuration.get_Grid()
def _CalcMaxSelect():
    """
    Returns the maximum number of sql results so that we do not
    use more than half of the installed RAM on the current machine.

    The value is a quarter of the "MemTotal" figure reported by
    _Meminfo(), where figures below 2048000 are treated as 2048000.
    If anything goes wrong while obtaining that figure, 512000 is
    returned instead.

    NOTE(review): MemTotal is presumably in kB and the "half of RAM"
    statement presumably assumes a per-row memory cost -- confirm.
    """
    try:
        mem = _Meminfo()["MemTotal"]
        # Never work with less than this floor.
        if ( mem < 2048000 ):
            mem = 2048000
        return int(mem / 4)
    except:
        # Fixed fallback when the memory figure cannot be obtained.
        return 512000

# Query limits.  The self test works with much smaller row limits than a
# real run; the range values are the same in both modes.
if TestContainer.isTest():
    STARTING_MAX_SELECT = 50
    MAX_SELECT = 100
    STARTING_RANGE = 60
    MIN_RANGE = 1
else:
    STARTING_MAX_SELECT = 32000
    MAX_SELECT = _CalcMaxSelect()
    STARTING_RANGE = 60
    MIN_RANGE = 1

# Template of the billing query (continues beyond this point).
BILLINGDB_SELECT_CMD = """ SELECT b.datestamp AS datestamp, b.transaction AS transaction, b.cellname AS cellname,
#                                                                        #
# You should have received a copy of the GNU Lesser General Public       #
# License along with AVANGO. If not, see <http://www.gnu.org/licenses/>. #
#                                                                        #
##########################################################################

import unittest
import TestFieldContainer
import TestField
import TestUpdate
import TestScript
import TestNodefactory
import TestContainer
import TestUtility
import sys

# Run every test module's suite in one verbose run; the exit status is 1
# when any test fails.
if __name__ == '__main__':
    suite_modules = (
        TestFieldContainer,
        TestField,
        TestUpdate,
        TestScript,
        TestNodefactory,
        TestContainer,
        TestUtility,
    )
    suites = []
    for suite_module in suite_modules:
        suites.append(suite_module.Suite())

    alltests = unittest.TestSuite(suites)
    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(alltests)
    if result.wasSuccessful():
        pass
    else:
        sys.exit(1)
def main():
    """
    Entry point of the dCache transfer probe.

    Installs the signal handlers, registers the probe with Gratia, sets
    up the rotating log file and the termination alarm, and then keeps
    calling DCacheAggregator.sendBillingInfoRecordsToGratia(), sleeping
    for the configured update frequency between passes, until the stop
    file exists (or after a single pass in self-test mode).

    Whether the loop ends normally or through an exception, the function
    dumps the test statistics, shuts logging down, triggers the
    termination alarm if it was created, and exits with status 1.
    KeyboardInterrupt and SystemExit are passed through untouched.

    Passing '-profile' on the command line runs each pass under the
    hotshot profiler and prints the statistics at the end.
    """
    # We need the logger variable in the exception handler.
    # So we create it here.
    logger = logging.getLogger('DCacheAggregator')

    # The shutdown code below looks at terminationAlarm even when the
    # try block failed before the alarm was created, so the name has to
    # be bound beforehand.
    terminationAlarm = None

    # Ignore hangup signals. We shouldn't die just because our parent
    # shell logs out.
    signal.signal(signal.SIGHUP, signal.SIG_IGN)
    # Try to catch common signals and send email before we die
    signal.signal(signal.SIGINT, warn_of_signal)
    signal.signal(signal.SIGQUIT, warn_of_signal)
    signal.signal(signal.SIGTERM, warn_of_signal)

    try:
        # Tell Gratia what versions we are using.
        # CHRIS: is there a way to automate the version extraction
        #        using the pkg_resource package?
        Gratia.RegisterReporterLibrary("psycopg2", "2.0.6")
        rev = Gratia.ExtractCvsRevision("$Revision: 1.13 $")
        tag = Gratia.ExtractCvsRevision("$Name: $")
        Gratia.RegisterReporter("dCacheBillingAggregator.py",
                                str(rev) + " (tag " + str(tag) + ")")

        # BRIAN: attempt to pull the dCache version from RPM.
        # Best effort only: any failure leaves the version as "UNKNOWN".
        version = "UNKNOWN"
        try:
            version = os.popen("rpm -q --qf '%{VERSION}-%{RELEASE}' " \
                "dcache-server").read()
        except:
            pass
        Gratia.RegisterService("dCache", version)

        # Initialize gratia before attempting to read its config file.
        Gratia.Initialize()
        # Extract the configuration information into local variables.
        myconf = dCacheProbeConfig()

        # Get the name of the directory where we are to store the log files.
        logDir = myconf.get_LogFolder()

        # Make sure that the logging directory is present
        if not os.path.isdir(logDir):
            # Mode rwxr-xr-x (octal 755), spelled so that it parses on
            # every Python version.
            os.mkdir(logDir, int('755', 8))

        logFileName = os.path.join(logDir, "dcacheTransfer.log")

        # Set up an alarm to send an email if the program terminates.
        termSubject = "dCache-transfer probe is going down"
        termMessage = "The dCache transfer probe for Gratia has " + \
                      "terminated.\nPlease check the logfile\n\n " + \
                      logFileName + \
                      "\n\nfor the cause.\n"

        terminationAlarm = Alarm(myconf.get_EmailServerHost(),
                                 myconf.get_EmailFromAddress(),
                                 myconf.get_EmailToList(),
                                 termSubject, termMessage, 0, 0, False)

        # Set up the logger with a suitable format
        hdlr = RotatingFileHandler(logFileName, 'a', 512000, 10)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.setLevel(myconf.get_AggrLogLevel())
        logger.info("starting " + ProgramName)

        stopFileName = myconf.get_StopFileName()
        updateFreq = float(myconf.get_UpdateFrequency())
        logger.warn("update freq = %.2f" % updateFreq)

        # Create the aggregator instance that we will use.
        dataDir = myconf.get_DataFolder()
        aggregator = DCacheAggregator(myconf, dataDir)

        # If profiling was requested, turn it on.
        profiling = sys.argv.count('-profile') > 0
        if profiling:
            profiler = hotshot.Profile("profile.dat")
            logger.info("Enabling Profiling")

        # Now aggregate new records, then sleep, until somebody creates
        # the stop file...
        while True:
            # Make sure we (still) have a connection to Gratia.
            if not TestContainer.isTest():  # not needed during self test
                Gratia.Maintenance()

            if profiling:
                # Profile the bound method directly: a command string
                # would be evaluated in the __main__ namespace, where the
                # local name 'aggregator' is not visible.
                profiler.runcall(aggregator.sendBillingInfoRecordsToGratia)
            else:
                try:
                    aggregator.sendBillingInfoRecordsToGratia()
                except TestContainer.SimInterrupt:
                    # The self test interrupted the pass: start over with
                    # a fresh aggregator.
                    logger.info("BillingRecSimulator.SimInterrupt caught, " \
                        "restarting")
                    aggregator = DCacheAggregator(myconf, dataDir)
                    continue

            # Are we are shutting down?
            if os.path.exists(stopFileName):
                break

            # The self test makes a single pass only.
            if TestContainer.isTest():
                break

            logger.warn("sleeping for = %.2f seconds" % updateFreq)
            sleep_check(updateFreq, stopFileName)

        # If we are profiling, print the results...
        if profiling:
            profiler.close()
            stats = hotshot.stats.load("profile.dat")
            stats.sort_stats('time', 'calls')
            stats.print_stats()

        logger.warn(ProgramName + " stop file detected.")

    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # Last line of defence: report whatever went wrong, then fall
        # through to the shutdown code below.
        msg = ProgramName + " caught an exception:\n" + traceback.format_exc()
        print(msg)
        logger.error(msg)

    TestContainer.dumpStatistics(logger)

    # shut down the logger to make sure nothing is lost.
    logger.critical(ProgramName + " shutting down.")
    logging.shutdown()
    # try to send an email warning of the shutdown.
    if terminationAlarm is not None:
        terminationAlarm.event()

    sys.exit(1)
def main():
    """
    Entry point of the dCache transfer probe.

    Installs the signal handlers, registers the probe with Gratia, sets
    up the rotating log file and the termination alarm, and then keeps
    calling DCacheAggregator.sendBillingInfoRecordsToGratia(), sleeping
    for the configured update frequency between passes, until the stop
    file exists (or after a single pass in self-test mode).

    Whether the loop ends normally or through an exception, the function
    dumps the test statistics, shuts logging down, triggers the
    termination alarm if it was created, and exits with status 1.
    KeyboardInterrupt and SystemExit are passed through untouched.

    Passing '-profile' on the command line runs each pass under the
    hotshot profiler and prints the statistics at the end.
    """
    # We need the logger variable in the exception handler.
    # So we create it here.
    logger = logging.getLogger('DCacheAggregator')

    # The shutdown code below looks at terminationAlarm even when the
    # try block failed before the alarm was created, so the name has to
    # be bound beforehand.
    terminationAlarm = None

    # Ignore hangup signals. We shouldn't die just because our parent
    # shell logs out.
    signal.signal(signal.SIGHUP, signal.SIG_IGN)
    # Try to catch common signals and send email before we die
    signal.signal(signal.SIGINT, warn_of_signal)
    signal.signal(signal.SIGQUIT, warn_of_signal)
    signal.signal(signal.SIGTERM, warn_of_signal)

    try:
        # Tell Gratia what versions we are using.
        # CHRIS: is there a way to automate the version extraction
        #        using the pkg_resource package?
        Gratia.RegisterReporterLibrary("psycopg2", "2.0.6")
        rev = Gratia.ExtractCvsRevision("$Revision: 1.13 $")
        tag = Gratia.ExtractCvsRevision("$Name: $")
        Gratia.RegisterReporter("dCacheBillingAggregator.py",
                                str(rev) + " (tag " + str(tag) + ")")

        # BRIAN: attempt to pull the dCache version from RPM.
        # Best effort only: any failure leaves the version as "UNKNOWN".
        version = "UNKNOWN"
        try:
            version = os.popen("rpm -q --qf '%{VERSION}-%{RELEASE}' " \
                "dcache-server").read()
        except:
            pass
        Gratia.RegisterService("dCache", version)

        # Initialize gratia before attempting to read its config file.
        Gratia.Initialize()
        # Extract the configuration information into local variables.
        myconf = dCacheProbeConfig()

        # Get the name of the directory where we are to store the log files.
        logDir = myconf.get_LogFolder()

        # Make sure that the logging directory is present
        if not os.path.isdir(logDir):
            # Mode rwxr-xr-x (octal 755), spelled so that it parses on
            # every Python version.
            os.mkdir(logDir, int('755', 8))

        logFileName = os.path.join(logDir, "dcacheTransfer.log")

        # Set up an alarm to send an email if the program terminates.
        termSubject = "dCache-transfer probe is going down"
        termMessage = "The dCache transfer probe for Gratia has " + \
                      "terminated.\nPlease check the logfile\n\n " + \
                      logFileName + \
                      "\n\nfor the cause.\n"

        terminationAlarm = Alarm(myconf.get_EmailServerHost(),
                                 myconf.get_EmailFromAddress(),
                                 myconf.get_EmailToList(),
                                 termSubject, termMessage, 0, 0, False)

        # Set up the logger with a suitable format
        hdlr = RotatingFileHandler(logFileName, 'a', 512000, 10)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.setLevel(myconf.get_AggrLogLevel())
        logger.info("starting " + ProgramName)

        stopFileName = myconf.get_StopFileName()
        updateFreq = float(myconf.get_UpdateFrequency())
        logger.warn("update freq = %.2f" % updateFreq)

        # Create the aggregator instance that we will use.
        dataDir = myconf.get_DataFolder()
        aggregator = DCacheAggregator(myconf, dataDir)

        # If profiling was requested, turn it on.
        profiling = sys.argv.count('-profile') > 0
        if profiling:
            profiler = hotshot.Profile("profile.dat")
            logger.info("Enabling Profiling")

        # Now aggregate new records, then sleep, until somebody creates
        # the stop file...
        while True:
            # Make sure we (still) have a connection to Gratia.
            if not TestContainer.isTest():  # not needed during self test
                Gratia.Maintenance()

            if profiling:
                # Profile the bound method directly: a command string
                # would be evaluated in the __main__ namespace, where the
                # local name 'aggregator' is not visible.
                profiler.runcall(aggregator.sendBillingInfoRecordsToGratia)
            else:
                try:
                    aggregator.sendBillingInfoRecordsToGratia()
                except TestContainer.SimInterrupt:
                    # The self test interrupted the pass: start over with
                    # a fresh aggregator.
                    logger.info("BillingRecSimulator.SimInterrupt caught, " \
                        "restarting")
                    aggregator = DCacheAggregator(myconf, dataDir)
                    continue

            # Are we are shutting down?
            if os.path.exists(stopFileName):
                break

            # The self test makes a single pass only.
            if TestContainer.isTest():
                break

            logger.warn("sleeping for = %.2f seconds" % updateFreq)
            sleep_check(updateFreq, stopFileName)

        # If we are profiling, print the results...
        if profiling:
            profiler.close()
            stats = hotshot.stats.load("profile.dat")
            stats.sort_stats('time', 'calls')
            stats.print_stats()

        logger.warn(ProgramName + " stop file detected.")

    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # Last line of defence: report whatever went wrong, then fall
        # through to the shutdown code below.
        msg = ProgramName + " caught an exception:\n" + traceback.format_exc()
        print(msg)
        logger.error(msg)

    TestContainer.dumpStatistics(logger)

    # shut down the logger to make sure nothing is lost.
    logger.critical(ProgramName + " shutting down.")
    logging.shutdown()
    # try to send an email warning of the shutdown.
    if terminationAlarm is not None:
        terminationAlarm.event()

    sys.exit(1)
def sendBillingInfoRecordsToGratia(self):
    """
    This is the public method for starting the dCache-transfer reporting.

    This will query records no more than _maxAge days old; that lower
    bound is truncated to an hour boundary (i.e., 1:00:00 not 1:02:00).
    The actual start is the later of that bound and the last checkpoint.

    This will continue to query until the next interval would start less
    than 75 minutes ago (in self-test mode: at the end time supplied by
    TestContainer), or until the stop file appears, then return.

    The interval boundaries come from _determineNextEndtime; _execute may
    end an interval earlier than requested when it meets a lot of data.

    If not summarizing: this method uses _execute to get all the data for
       a given interval, then uses _processResults to send them to Gratia.
       Once the query for a time interval is done, then we immediately
       checkpoint.

    If summarizing: this method keeps collapsing the rows it collects
       with Collapse.collapse until the next interval would end after the
       pending summary time.  At that point it sends the summaries up to
       Gratia and checkpoints at the summary time.

    Side effects: updates the billing checkpoint, may reset self._range
    to STARTING_RANGE, and closes the DB cursor and connection when the
    stop file is found.
    """
    self._log.debug("sendBillingInfoRecordsToGratia")

    # Query no more than a set number of days in the past
    # (timedelta's first positional argument is days, the second seconds).
    minTime = datetime.datetime.now() - datetime.timedelta(self._maxAge, 0)
    # Truncate the lower bound to the start of its hour.
    minTime = datetime.datetime(minTime.year, minTime.month, minTime.day,
                                minTime.hour, 0, 0)

    # The latest allowed record is 75 minutes in the past, in order to make
    # sure we only query complete intervals
    latestAllowed = datetime.datetime.now() - datetime.timedelta(
        0, 75 * 60)

    # The self test supplies its own upper bound.
    if (TestContainer.isTest()):
        latestAllowed = TestContainer.getEndDateTime()

    # Start with either the last checkpoint or _maxAge days ago, whichever
    # is more recent.
    starttime = max(self._BIcheckpoint.lastDateStamp(), minTime)
    self._log.info("Starting queries at time %s." % starttime)

    dictRecordAgg = TimeBinRange.DictRecordAggregator(
        DCACHE_AGG_FIELDS, DCACHE_SUM_FIELDS)

    nextSummary = self._determineNextEndtime(starttime, summary=True)
    if self._summarize:
        self._log.debug("Next summary send time: %s." % nextSummary)

    # Rows gathered but not yet sent, and the raw row count behind them.
    results = []
    endtime = self._determineNextEndtime(starttime)
    totalRecords = 0
    # Loop until we have caught up to latestAllowed.
    while starttime < latestAllowed:
        assert starttime < endtime
        self._log.debug('sendBillingInfoRecordsToGratia: Processing ' \
            'starting at %s.' % starttime)

        # _execute returns the time up to which its rows are complete; it
        # is asserted below to be later than starttime, so every pass
        # moves forward.
        next_starttime, rows = self._execute(starttime, endtime,
                                             self._maxSelect)

        results += rows
        totalRecords += len(rows)
        if self._summarize:
            # Summarize the partial results
            results = Collapse.collapse(results, dictRecordAgg)
        assert next_starttime > starttime
        next_endtime = self._determineNextEndtime(next_starttime)

        # If we're not summarizing, we send up records each loop.
        if (not self._summarize) and results:
            totalRecords = 0
            # We now have all the rows we want; process them
            # NOTE(review): the pending checkpoint is stamped with the
            # requested endtime, not with next_starttime.  _execute's
            # docstring says the returned time need not equal endtime --
            # confirm a restart after this commit cannot skip the records
            # between next_starttime and endtime.
            self._BIcheckpoint.createPending(endtime, '')
            self._processResults(results)
            self._BIcheckpoint.commit()
            # A previously shrunk window is widened again once the batch
            # is well below the select limit.
            if (self._range < STARTING_RANGE and len(results)*4 < \
                self._maxSelect):
                self._range = STARTING_RANGE
            results = []
        # If we are summarizing, send records only once the next interval
        # would end after the pending summary time
        elif (next_endtime > nextSummary) and results:
            # Number of raw rows that were merged away by the collapse.
            num_agg = totalRecords - len(results)
            if num_agg:
                factor = float(totalRecords) / float(len(results))
                self._log.info("Aggregated %i of %i records for time " \
                    "interval ending in %s. %.1fx reduction." % \
                    (num_agg, totalRecords, nextSummary, factor))
            else:
                self._log.debug("Unable to aggregate any of %i records" \
                    % totalRecords)
            totalRecords = 0
            self._BIcheckpoint.createPending(nextSummary, '')
            self._processResults(results)
            self._BIcheckpoint.commit()
            results = []
            self._range = STARTING_RANGE

        nextSummary = self._determineNextEndtime(next_starttime,
                                                 summary=True)

        endtime = next_endtime
        starttime = next_starttime

        # Check to see if the stop file has been created. If so, break
        if os.path.exists(self._stopFileName):
            #Neha - 03/17/2011
            #Don't need to commit anything since we are only doing select and no inserts or updates
            # NOTE(review): confirm _cur and _connection exist when
            # running under the self test.
            self._cur.close()
            self._connection.close()
            break
def _execute(self, starttime, endtime, maxSelect):
    """
    Execute the select command against the Billing DB and return the
    rows found, together with the time up to which they are complete.

    Note on the time returned as the first part of the tuple.  We
    guarantee two things:
       a) the returned time is strictly greater than starttime;
       b) *all* records in the interval [starttime, returned time) are
          returned.
    We do not guarantee that the returned time equals the endtime
    parameter.  It is therefore suitable as the start time of the next
    select query.

    When the query returns exactly maxSelect rows there may be more rows
    than were fetched, so the interval is shortened (to 60, 30, 15, 5 and
    finally 1 second) and queried again.  Once the interval cannot be
    shortened any further, the row limit is doubled instead.

    Side effect: self._range is set to the shortened interval length
    whenever the interval is reduced.

    @param starttime: Datetime object for the start of the query interval.
    @param endtime: Datetime object for the end of the query interval.
    @param maxSelect: The maximum number of rows to select.
    @return: Tuple (time, rows).  rows is a list of dicts, or the empty
       query result when nothing matched.
    @raise Exception: If maxSelect exceeds MAX_SELECT while the interval
       is no longer than MIN_RANGE seconds, or if the query takes longer
       than MAX_QUERY_TIME_SECS.
    """
    assert starttime < endtime

    # Full length of the requested window in seconds.  timedelta.seconds
    # on its own leaves out the days component (a 24 hour window would
    # read as 0 seconds), so fold the days in before using the value.
    span = endtime - starttime
    interval = span.days * 86400 + span.seconds

    if (maxSelect > MAX_SELECT) and (interval <= MIN_RANGE):
        raise Exception("Fatal error - more than %i transfers in %i" \
            " second(s)." % (MAX_SELECT, interval))

    datestr = str(starttime)
    datestr_end = str(endtime)

    # Query the database.  If it takes more than MAX_QUERY_TIME_SECS, then
    # have the probe self-destruct.
    # The values interpolated here are the two interval bounds and the
    # integer row limit supplied by the caller.
    query = BILLINGDB_SELECT_CMD % (
        (datestr, datestr_end, datestr, datestr_end, maxSelect))
    self._log.debug('_sendToGratia: will execute ' + query)
    select_time = -time.time()
    if not TestContainer.isTest():
        self._cur.execute(query)
        result = self._cur.fetchall()
    else:
        # Self test: the simulator answers instead of the database.
        result = BillingRecSimulator.execute(query)
    select_time += time.time()
    if select_time > MAX_QUERY_TIME_SECS:
        raise Exception("Postgres query took %i seconds, more than " \
            "the maximum allowable of %i; this is a sign the DB is " \
            "not properly optimized!" % (int(select_time),
            MAX_QUERY_TIME_SECS))
    self._log.debug("BillingDB query finished in %.02f seconds and " \
        "returned %i records." % (select_time, len(result)))

    if not result:
        self._log.debug("No results from %s to %s." % (starttime, endtime))
        return endtime, result

    # dCache sometimes returns a negative transfer size; when this happens,
    # it also tosses up a complete garbage duration
    filtered_result = []
    for row in result:
        row = dict(row)
        if row['transfersize'] < 0:
            row['transfersize'] = 0
            row['connectiontime'] = 0
        filtered_result.append(row)
    result = filtered_result

    # Fewer rows than the limit: the interval was read completely.
    if len(result) != maxSelect:
        return endtime, result

    # We hit our limit, so there is no telling how many more records the
    # interval holds; we must re-query with a smaller interval or a higher
    # limit on the select.
    #
    # The shortened interval is chosen so that we always end up on a
    # minute boundary (eventually): each step divides what is left of the
    # interval to the next minute.  I.e. the transitions are:
    #   60s -> 30s
    #   30s -> 15s (which can only happen at :30s)
    #   15s -> 5s  (which can only happen at :15s :30s or :45s)
    #   5s  -> 1s
    if interval > 60:
        new_interval = 60
    elif interval > 30:
        new_interval = 30
    elif interval > 15:
        new_interval = 15
    elif interval > 5:
        new_interval = 5
    else:
        new_interval = 1

    new_endtime = starttime + datetime.timedelta(0, new_interval)

    # Guard against the DST jump by making sure new_endtime > starttime.
    if (interval == new_interval) or (new_endtime <= starttime):
        # The interval cannot shrink any further: raise the limit.
        self._log.warning("Limit hit; increasing from %i to %i." % \
            (maxSelect, maxSelect*2))
        endtime, result = self._execute(starttime, endtime, maxSelect * 2)
    else:
        self._log.warning("Limit hit; decreasing time interval from %i" \
            " to %i." % (interval, new_interval))
        self._range = new_interval
        endtime, result = self._execute(starttime, new_endtime, maxSelect)

    assert endtime > starttime
    return endtime, result
def __init__(self, configuration, chkptdir=None):
    """
    Build the aggregator from the probe configuration.

    Sets up the logger, the gid-to-group map, the billinginfo checkpoint
    and the e-mail alarm, then opens the connection to the dCache billing
    database (skipped when running under the test container).

    @param configuration: Probe configuration object; only its get_*()
        accessors are used here.
    @param chkptdir: Optional directory holding the checkpoint file. When
        None, the bare checkpoint file name is used.
    @raise Exception: Re-raises whatever the database connection attempt
        raised, after logging it (without the database password).
    """
    # Pick up the logger
    self._log = logging.getLogger('DCacheAggregator')

    # The Fermilab dCache billing node doesn't support user-to-uid mapping
    # through /etc/passwd; instead GROUP_ID_LIST_FILE_NAME holds a
    # gid-to-group mapping. The group must be present in the user-vo-map
    # file to be mapped correctly.
    self.__gid_file_mod_time = int(time.time())
    self.__group_map = {}
    self._unix_gid_list_file_name = configuration.get_UnixGidListFileName()
    if os.path.exists(self._unix_gid_list_file_name):
        self.__gid_file_mod_time = os.stat(
            self._unix_gid_list_file_name).st_mtime
        self.__refresh_group_map()

    # Neha - 03/17/2011: psycopg2 is used instead of sqlalchemy, so the
    # connection is described by a libpq-style DSN string.
    db_name = configuration.get_DBName()
    db_user = configuration.get_DBLoginName()
    db_password = configuration.get_DBPassword()
    db_host = configuration.get_DBHostName()
    DBurl = 'dbname=%s user=%s password=%s host=%s' % (
        db_name, db_user, db_password, db_host)
    # Same DSN minus the password; this is the only form that may be
    # written to the log.
    loggable_dsn = 'dbname=%s user=%s host=%s' % (db_name, db_user, db_host)

    self._skipIntraSite = configuration.get_OnlySendInterSiteTransfers()
    self._stopFileName = configuration.get_StopFileName()
    self._dCacheSvrHost = configuration.get_DCacheServerHost()

    # Create the billinginfo database checkpoint.
    self._maxAge = configuration.get_MaxBillingHistoryDays()
    if TestContainer.isTest():
        # The test container overrides the configured history window.
        self._maxAge = TestContainer.getMaxAge()

    billinginfoChkpt = 'chkpt_dcache_xfer_DoNotDelete'
    if chkptdir is not None:
        billinginfoChkpt = os.path.join(chkptdir, billinginfoChkpt)
    self._BIcheckpoint = Checkpoint(billinginfoChkpt, self._maxAge)

    self._sendAlarm = Alarm(
        configuration.get_EmailServerHost(),
        configuration.get_EmailFromAddress(),
        configuration.get_EmailToList(),
        'dCacheTransfer probe aggregator alarm',
        'The dCache Transfer Probe was not able to send to Gratia.',
        2,     # If more than two errors have occurred
        1800,  # Max of once per half hour complaining
        True)
    self._summarize = configuration.get_Summarize()

    # Connect to the dCache postgres database.
    # Neha: 03/17/2011 - sqlalchemy removed; psycopg2 is used directly.
    try:
        if TestContainer.isTest():
            # No real database in test mode; records come from the
            # test container instead.
            self._db = None
        else:
            self._connection = psycopg2.connect(DBurl)
            self._cur = self._connection.cursor(
                cursor_factory=psycopg2.extras.DictCursor)
    except Exception:
        # Log the failure with the active traceback, then let the caller
        # decide what to do. The password is deliberately not logged.
        errmsg = 'Failed to connect to %s\n\n%s' % (
            loggable_dsn, traceback.format_exc())
        self._log.error(errmsg)
        raise

    self._grid = configuration.get_Grid()
def _CalcMaxSelect(): """ Returns the maximum number of sql results so that we do not use more than half of the install RAM on the current machine. """ try: mem = _Meminfo()["MemTotal"] if (mem < 2048000): mem = 2048000 return int(mem / 4) except: return 512000 if TestContainer.isTest(): STARTING_MAX_SELECT = 50 MAX_SELECT = 100 STARTING_RANGE = 60 MIN_RANGE = 1 else: STARTING_MAX_SELECT = 32000 MAX_SELECT = _CalcMaxSelect() STARTING_RANGE = 60 MIN_RANGE = 1 BILLINGDB_SELECT_CMD = """ SELECT b.datestamp AS datestamp, b.transaction AS transaction, b.cellname AS cellname,