def execute(sqlQuery):
    """
    Simulate a billing-DB select against the in-memory test table.

    The query text is matched against the module-level ``reExp`` pattern,
    whose three capture groups are read as start date, end date and row
    limit.  Every record of ``sqlTableContent`` whose ``'tm'`` value lies in
    the half-open interval [start, end) is copied into the result, stopping
    as soon as ``limit`` rows have been collected.

    Side effects:
      * ``sqlTableContent`` is filled in lazily via
        ``generateTableContent()`` the first time it is found to be None.
      * ``TestContainer.sendInterrupt(300)`` is called whenever at least one
        row is about to be returned.

    @param sqlQuery: The SQL text; newlines are replaced by spaces before
       matching.
    @return: A list holding copies of the matching records (possibly empty).
    @raise Exception: If the query text does not match ``reExp``.
    """
    global sqlTableContent

    if sqlTableContent is None:
        sqlTableContent = generateTableContent()

    sqlQuery = sqlQuery.replace('\n', ' ')
    dateMatch = reExp.match(sqlQuery)
    if dateMatch is None:
        raise Exception("Misspecified query argument:"+sqlQuery)

    startDate = DateStrToSecs(dateMatch.group(1))
    endDate = DateStrToSecs(dateMatch.group(2))
    limit = int(dateMatch.group(3))

    results = []
    for record in sqlTableContent:
        if startDate <= record['tm'] < endDate:
            # Hand out a copy so callers cannot modify the shared table.
            results.append(record.copy())
            # The cut-off is an equality test made after each append, so a
            # limit that is never reached (e.g. 0) yields every match.
            if len(results) == limit:
                break

    if results:
        TestContainer.sendInterrupt(300)

    return results
Exemplo n.º 2
0
    def _processDBRow(self, row):
        """
        Completely process a single DB row.  Take the row, convert it to a
        UsageRecord, and send it up to Gratia.  Process any recoverable errors
        which occurred during the process.

        Note we skip a row if it is an Intra-site transfer and we are instructed
        not to send them.

        In self-test mode the row is handed to TestContainer.processRow
        instead of Gratia, and that call's result is returned.

        Otherwise, we process the row in Gratia or exit the probe: a
        'Fatal Error' / 'Internal Error' response (other than "too many
        pending files") terminates the process via sys.exit(2).

        @param row: Mapping for one billing record; the keys 'njobs',
           'datestamp' and 'transaction' are read here.
        @return: The number of jobs in this row, regardless of whether we sent
           them successfully or not.
        """
        # Skip intra-site transfers if required
        if self._skipIntraSiteXfer(row):
            return row['njobs']

        if TestContainer.isTest():
            if self._summarize:
                TestContainer.sendInterrupt(15)
            return TestContainer.processRow(row, self._log)

        usageRecord = self._convertBillingInfoToGratiaUsageRecord(row)

        # Send to gratia, and see what it says.
        response = Gratia.Send(usageRecord)
        baseMsg = "Record: %s, %s, njobs %i" % (str(row['datestamp']),
            row['transaction'], row['njobs'])
        if response == "Fatal Error: too many pending files":
            # The server is currently not accepting records and
            # Gratia.py was not able to store the record.  Take a long
            # nap to give it time to recover.
            # NOTE(review): nothing in this method resends the record
            # after the nap - confirm whether a retry happens elsewhere.
            self._log.error("Error sending : too many pending files")
            longsleep = 15*60
            self._log.warning("sleeping for = %i seconds." % longsleep)
            sleep_check(longsleep, self._stopFileName)
        elif response.startswith('Fatal Error') or \
            response.startswith('Internal Error'):
            self._log.critical('error sending ' + baseMsg + \
                '\ngot response ' + response)
            sys.exit(2)

        if response[:2] != 'OK':
            # Any non-OK answer (including the pending-files case handled
            # above) is reported as an error.
            self._log.error('error sending ' + baseMsg + \
                            '\ngot response ' + response)
        else:
            # This trace used to sit after sys.exit(2), where it could
            # never run; it belongs on the success path.
            self._log.debug('sent ' + baseMsg)

        return row['njobs']
Exemplo n.º 3
0
    def _processDBRow(self, row):
        """
        Completely process a single DB row.  Take the row, convert it to a
        UsageRecord, and send it up to Gratia.  Process any recoverable errors
        which occurred during the process.

        Note we skip a row if it is an Intra-site transfer and we are instructed
        not to send them.

        In self-test mode the row is passed to TestContainer.processRow and
        that call's result is returned; Gratia is not contacted.

        Otherwise, we process the row in Gratia or exit the probe: any
        response starting with 'Fatal Error' or 'Internal Error', except the
        "too many pending files" one, ends the process with sys.exit(2).

        @param row: Mapping for one billing record; this method reads the
           'njobs', 'datestamp' and 'transaction' keys.
        @return: The number of jobs in this row, regardless of whether we sent
           them successfully or not.
        """
        # Skip intra-site transfers if required
        if self._skipIntraSiteXfer(row):
            return row['njobs']

        if TestContainer.isTest():
            if self._summarize:
                TestContainer.sendInterrupt(15)
            return TestContainer.processRow(row, self._log)

        usageRecord = self._convertBillingInfoToGratiaUsageRecord(row)

        # Send to gratia, and see what it says.
        response = Gratia.Send(usageRecord)
        baseMsg = "Record: %s, %s, njobs %i" % (
            str(row['datestamp']), row['transaction'], row['njobs'])
        failureMsg = 'error sending ' + baseMsg + '\ngot response ' + response

        if response == "Fatal Error: too many pending files":
            # The server is not accepting records right now and Gratia.py
            # could not store this one, so back off for a while.
            # NOTE(review): this method does not resend the record after
            # sleeping - confirm whether the caller is expected to retry.
            self._log.error("Error sending : too many pending files")
            longsleep = 15 * 60
            self._log.warning("sleeping for = %i seconds." % longsleep)
            sleep_check(longsleep, self._stopFileName)
        elif response.startswith('Fatal Error') or \
                response.startswith('Internal Error'):
            self._log.critical(failureMsg)
            sys.exit(2)

        if response[:2] == 'OK':
            # Previously this trace followed sys.exit(2) and was dead code;
            # log it where the send actually succeeded.
            self._log.debug('sent ' + baseMsg)
        else:
            # Every non-OK answer, the pending-files case included, is
            # reported as an error.
            self._log.error(failureMsg)

        return row['njobs']
Exemplo n.º 4
0
    def sendBillingInfoRecordsToGratia(self):
        """
        This is the public method for starting the dCache-transfer reporting.

        This will query records no more than _maxAge days old; that lower
        bound is rounded down to an hour boundary (i.e., 1:00:00 not 1:02:00).

        This will continue to query until we hit records starting less than 75
        minutes ago, then return.  It also returns early when the stop file
        exists, after closing the DB cursor and connection.

        By default, we start with querying 60-second intervals, but will shrink
        this window if we encounter lots of data.

        If not summarizing: this method uses _execute to get all the data for
           a given interval, then uses _processResults to send them to Gratia.
           Once the query for a time interval is done, then we immediately
           checkpoint at the time _execute reported as fully covered.

        If summarizing: this method continues to query until it hits the end of
           an hour interval.  At that point, it summarizes once again, and sends
           the summaries up to Gratia.  We then only checkpoint on the hour.
        """
        self._log.debug("sendBillingInfoRecordsToGratia")

        now = datetime.datetime.now()

        # Query no more than a set number of days in the past, rounded down
        # to the start of that hour.
        minTime = now - datetime.timedelta(self._maxAge, 0)
        minTime = minTime.replace(minute=0, second=0, microsecond=0)

        # The latest allowed record is 75 minutes in the past, in order to make
        # sure we only query complete intervals
        latestAllowed = now - datetime.timedelta(0, 75*60)

        if TestContainer.isTest():
            latestAllowed = TestContainer.getEndDateTime()

        # Start with either the last checkpoint or minTime days ago, whichever
        # is more recent.
        starttime = max(self._BIcheckpoint.lastDateStamp(), minTime)
        self._log.info("Starting queries at time %s." % starttime)

        dictRecordAgg = TimeBinRange.DictRecordAggregator(DCACHE_AGG_FIELDS,
            DCACHE_SUM_FIELDS)

        nextSummary = self._determineNextEndtime(starttime, summary=True)
        if self._summarize:
            self._log.debug("Next summary send time: %s." % nextSummary)

        results = []
        endtime = self._determineNextEndtime(starttime)
        totalRecords = 0
        # Loop until we have caught up to latestAllowed.
        while starttime < latestAllowed:
            assert starttime < endtime
            self._log.debug('sendBillingInfoRecordsToGratia: Processing ' \
                'starting at %s.' % starttime)
            # _execute guarantees the returned time is strictly later than
            # starttime, but it may be earlier than the endtime we asked for
            # (it shrinks the window when the row limit is hit).
            next_starttime, rows = self._execute(starttime, endtime,
                self._maxSelect)

            results += rows
            totalRecords += len(rows)
            if self._summarize:
                # Summarize the partial results
                results = Collapse.collapse(results, dictRecordAgg)
            assert next_starttime > starttime
            next_endtime = self._determineNextEndtime(next_starttime)

            # If we're not summarizing, we send up records each loop.
            if (not self._summarize) and results:
                totalRecords = 0
                # We now have all the rows we want; process them.
                # Checkpoint at next_starttime, not at the requested endtime:
                # the rows only cover [starttime, next_starttime), so
                # recording endtime would skip records after a restart
                # whenever _execute shortened the interval.
                self._BIcheckpoint.createPending(next_starttime, '')
                self._processResults(results)
                self._BIcheckpoint.commit()
                # Traffic is light again: go back to the default window.
                if (self._range < STARTING_RANGE and len(results)*4 < \
                       self._maxSelect):
                    self._range = STARTING_RANGE
                results = []
            # If we are summarizing, send records only per hour of data
            elif (next_endtime > nextSummary) and results:
                num_agg = totalRecords - len(results)
                if num_agg:
                    factor = float(totalRecords)/float(len(results))
                    self._log.info("Aggregated %i of %i records for time " \
                        "interval ending in %s.  %.1fx reduction." % \
                        (num_agg, totalRecords, nextSummary, factor))
                else:
                    self._log.debug("Unable to aggregate any of %i records" \
                        % totalRecords)
                totalRecords = 0
                self._BIcheckpoint.createPending(nextSummary, '')
                self._processResults(results)
                self._BIcheckpoint.commit()
                results = []
                self._range = STARTING_RANGE

            nextSummary = self._determineNextEndtime(next_starttime,
                summary=True)

            endtime = next_endtime
            starttime = next_starttime

            # Check to see if the stop file has been created.  If so, break
            if os.path.exists(self._stopFileName):
                # Neha - 03/17/2011
                # Don't need to commit anything since we are only doing
                # select and no inserts or updates.
                # NOTE(review): __init__ only creates _cur/_connection
                # outside self-test mode - confirm this path cannot be
                # reached during a self-test run.
                self._cur.close()
                self._connection.close()
                break
Exemplo n.º 5
0
    def _execute(self, starttime, endtime, maxSelect):
        """
        Execute the select command against the Billing DB and return the
        results.  In self-test mode the query is answered by
        BillingRecSimulator.execute instead of the database cursor.

        It is guaranteed this function will return an endtime greater than the
        starttime, but not guaranteed by how much.

        Note on the time returned as the first part of the tuple:
        We guarantee two things:
           a) returned time is strictly greater than starttime
           b) We return *all* records in the interval [starttime, return time).
        We do not guarantee that return time == parameter endtime.
        Thus it is suitable to use as the start time of the next select query.
        To do this, we reduce the range until it reaches 1 second or the
        query returns less than maxSelect results.   If the interval is one
        second and it still returns maxSelect results then we extend the limit
        of the query until all records fit.

        Side effect: self._range is set to the reduced interval whenever the
        window is shrunk.

        @param starttime: Datetime object for the start of the query interval.
        @param endtime: Datetime object for the end of the query interval.
        @param maxSelect: The maximum number of rows to select
        @return: Tuple containing a time that is greater than all the
           records and the results (a list of dicts, or the raw empty result
           when nothing matched).
        @raise Exception: If maxSelect has grown past MAX_SELECT on an
           interval of at most MIN_RANGE seconds, or if the query takes longer
           than MAX_QUERY_TIME_SECS.
        """
        assert starttime < endtime
        # Length of the requested window in whole seconds.  timedelta.seconds
        # alone leaves out the days component, so fold the days in here and
        # use the same figure for the guard, the message and the shrink logic.
        diff = endtime - starttime
        interval = diff.days*86400 + diff.seconds
        if (maxSelect > MAX_SELECT) and (interval <= MIN_RANGE):
            raise Exception("Fatal error - more than %i transfers in %i" \
                " second(s)." % (MAX_SELECT, interval))
        datestr = str(starttime)
        datestr_end = str(endtime)

        # Query the database.  If it takes more than MAX_QUERY_TIME_SECS, then
        # have the probe self-destruct.
        query = BILLINGDB_SELECT_CMD % (datestr, datestr_end, datestr,
            datestr_end, maxSelect)
        self._log.debug('_sendToGratia: will execute ' + query)
        select_time = -time.time()
        if not TestContainer.isTest():
            self._cur.execute(query)
            result = self._cur.fetchall()
        else:
            result = BillingRecSimulator.execute(query)
        select_time += time.time()
        if select_time > MAX_QUERY_TIME_SECS:
            raise Exception("Postgres query took %i seconds, more than " \
                "the maximum allowable of %i; this is a sign the DB is " \
                "not properly optimized!" % (int(select_time),
                MAX_QUERY_TIME_SECS))
        self._log.debug("BillingDB query finished in %.02f seconds and " \
            "returned %i records." % (select_time, len(result)))

        if not result:
            self._log.debug("No results from %s to %s." % (starttime, endtime))
            return endtime, result
        # dCache sometimes returns a negative transfer size; when this happens,
        # it also tosses up a complete garbage duration
        filtered_result = []
        for row in result:
            # Work on a plain dict copy of the row.
            row = dict(row)
            if row['transfersize'] < 0:
                row['transfersize'] = 0
                row['connectiontime'] = 0
            filtered_result.append(row)
        result = filtered_result

        # If we hit our limit, there's no telling how many identical records
        # there are on the final millisecond; we must re-query with a smaller
        # interval or a higher limit on the select.
        if len(result) == maxSelect:
            # Ensure that self._range is such that we always end up on a
            # minute boundary (eventually).  Whenever we decrease the interval
            # size it is guaranteed to be a multiple of what's left of the
            # interval to the next minute.  I.e the transitions are:
            #   60s ->  30s
            #   30s ->  15s (which can only happen at :30s)
            #   15s ->   5s (which can only happen at :15s :30s or :45s)
            #    5s ->   1s
            if interval > 60:
                new_interval = 60
            elif interval > 30:
                new_interval = 30
            elif interval > 15:
                new_interval = 15
            elif interval > 5:
                new_interval = 5
            else:
                new_interval = 1
            new_endtime = starttime + datetime.timedelta(0, new_interval)
            # Guard against the DST jump by making sure new_endtime > starttime.
            if (interval == new_interval) or (new_interval == 0) or \
                (new_endtime <= starttime):
                # The window cannot shrink any further: raise the row limit.
                self._log.warning("Limit hit; increasing from %i to %i." % \
                    (maxSelect, maxSelect*2))
                endtime, result = self._execute(starttime, endtime, maxSelect*2)
                assert endtime > starttime
                return endtime, result
            else:
                self._log.warning("Limit hit; decreasing time interval from %i" \
                   " to %i." % (interval, new_interval))
                self._range = new_interval
                endtime, result = self._execute(starttime, new_endtime,
                    maxSelect)
                assert endtime > starttime
                return endtime, result

        return endtime, result
Exemplo n.º 6
0
    def __init__( self, configuration, chkptdir=None ):
        """
        Set up the aggregator: logger, gid-to-group map, billing checkpoint,
        send-failure alarm and, outside self-test mode, the psycopg2
        connection and cursor for the billing database.

        @param configuration: Probe configuration object; only the get_*()
           accessors called below are used.
        @param chkptdir: Optional directory for the checkpoint file.  When
           None the bare checkpoint file name is used.
        @raise Exception: Whatever is raised while connecting to the
           database is logged (without the password) and re-raised.
        """
        # Pick up the logger
        self._log = logging.getLogger( 'DCacheAggregator' )
        # Fermilab dCache billing node doesn't support user to uid mapping in
        # /etc/passwd; instead there is GROUP_ID_LIST_FILE_NAME that contains
        # the gid to group mapping.  The group should be present in the
        # user-vo-map file to be mapped correctly.
        self.__gid_file_mod_time = int(time.time())
        self.__group_map = {}
        self._unix_gid_list_file_name = configuration.get_UnixGidListFileName()
        if os.path.exists(self._unix_gid_list_file_name):
            self.__gid_file_mod_time = os.stat(self._unix_gid_list_file_name).st_mtime
            self.__refresh_group_map()

        # Neha - 03/17/2011
        # Using psycopg2 instead of sqlalchemy.  The individual settings are
        # kept in locals so the failure message below can name the target
        # without repeating the password.
        dbName = configuration.get_DBName()
        dbUser = configuration.get_DBLoginName()
        DBurl = 'dbname=%s user=%s ' % (dbName, dbUser)
        DBurl += 'password=%s ' % (configuration.get_DBPassword())
        dbHost = configuration.get_DBHostName()
        DBurl += 'host=%s' % (dbHost)

        self._skipIntraSite = configuration.get_OnlySendInterSiteTransfers()
        self._stopFileName = configuration.get_StopFileName()
        self._dCacheSvrHost = configuration.get_DCacheServerHost()
        # Create the billinginfo database checkpoint.
        self._maxAge = configuration.get_MaxBillingHistoryDays()
        if TestContainer.isTest():
            self._maxAge = TestContainer.getMaxAge()

        billinginfoChkpt = 'chkpt_dcache_xfer_DoNotDelete'
        if chkptdir is not None:
            billinginfoChkpt = os.path.join(chkptdir, billinginfoChkpt)
        self._BIcheckpoint = Checkpoint(billinginfoChkpt, self._maxAge)

        self._sendAlarm = Alarm(
                configuration.get_EmailServerHost(),
                configuration.get_EmailFromAddress(),
                configuration.get_EmailToList(),
                'dCacheTransfer probe aggregator alarm',
                'The dCache Transfer Probe was not able to send to Gratia.',
                2,    # If more than two errors have occurred
                1800, # Max of once per half hour complaining
                True )

        self._summarize = configuration.get_Summarize()

        # Connect to the dCache postgres database.
        # Neha: 03/17/2011 - Removing sqlalchemy. Using psycopg2 instead
        try:
            if TestContainer.isTest():
                # Self-test mode talks to the simulator; no connection or
                # cursor is created.
                self._db = None
            else:
                self._connection = psycopg2.connect(DBurl)
                self._cur = self._connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
        except Exception:
            # Never write DBurl itself to the log: it carries the password.
            errmsg = 'Failed to connect to dbname=%s user=%s host=%s\n\n%s' % \
                (dbName, dbUser, dbHost, traceback.format_exc())
            self._log.error(errmsg)
            raise

        self._grid = configuration.get_Grid()
Exemplo n.º 7
0
def _CalcMaxSelect():
    """
    Returns the maximum number of sql results to select at once, derived
    from the "MemTotal" entry reported by _Meminfo().

    The value is one quarter of MemTotal, with MemTotal treated as at least
    2048000.  If the memory information cannot be obtained or used, 512000
    is returned instead.

    NOTE(review): the earlier docstring spoke of "half of the installed RAM"
    while the code divides by four - confirm the intended budget and the
    unit of MemTotal.
    """
    try:
        mem = max(_Meminfo()["MemTotal"], 2048000)
        return int(mem / 4)
    except Exception:
        # Best-effort: fall back to a fixed limit, but let
        # KeyboardInterrupt/SystemExit propagate.
        return 512000

# Query tuning knobs.  The time-window settings are the same in both modes;
# only the row limits depend on whether we run under the self-test harness.
STARTING_RANGE = 60
MIN_RANGE = 1

if TestContainer.isTest():
    # Tiny limits for the self-test run.
    STARTING_MAX_SELECT = 50
    MAX_SELECT = 100
else:
    STARTING_MAX_SELECT = 32000
    MAX_SELECT = _CalcMaxSelect()

BILLINGDB_SELECT_CMD = """
 SELECT
        b.datestamp AS datestamp,
        b.transaction AS transaction,
        b.cellname AS cellname,
Exemplo n.º 8
0
#                                                                        #
# You should have received a copy of the GNU Lesser General Public       #
# License along with AVANGO. If not, see <http://www.gnu.org/licenses/>. #
#                                                                        #
##########################################################################

import unittest
import TestFieldContainer
import TestField
import TestUpdate
import TestScript
import TestNodefactory
import TestContainer
import TestUtility
import sys

if __name__ == '__main__':
    # Collect one suite per test module, in a fixed order, run them all
    # verbosely and report failure through the process exit status.
    test_modules = (
        TestFieldContainer,
        TestField,
        TestUpdate,
        TestScript,
        TestNodefactory,
        TestContainer,
        TestUtility,
    )
    combined = unittest.TestSuite([module.Suite() for module in test_modules])
    outcome = unittest.TextTestRunner(verbosity=2).run(combined)
    if not outcome.wasSuccessful():
        sys.exit(1)
Exemplo n.º 9
0
def main():
    """
    Run the dCache transfer probe until the stop file appears.

    Installs the signal handlers, registers the probe with Gratia, sets up a
    rotating log file and then loops: send new billing records, sleep for
    the configured update frequency, repeat.  The loop ends when the stop
    file exists or, in self-test mode, after one pass.  Passing ``-profile``
    on the command line runs each pass under the hotshot profiler and prints
    the statistics once the loop ends.

    Any exception other than KeyboardInterrupt/SystemExit is printed and
    logged instead of being propagated.  On the way out the termination
    alarm is fired if it was created, and the process always exits with
    status 1.
    """
    # We need the logger variable in the exception handler.
    # So we create it here.
    logger = logging.getLogger('DCacheAggregator')

    # Bound before the try block: the shutdown code at the bottom tests this
    # name, and it must exist even when start-up fails before the alarm has
    # been constructed.
    terminationAlarm = None

    # Ignore hangup signals. We shouldn't die just because our parent
    # shell logs out.
    signal.signal(signal.SIGHUP, signal.SIG_IGN)
    # Try to catch common signals and send email before we die
    signal.signal(signal.SIGINT, warn_of_signal)
    signal.signal(signal.SIGQUIT, warn_of_signal)
    signal.signal(signal.SIGTERM, warn_of_signal)

    try:
        # Tell Gratia what versions we are using.
        # CHRIS: is there a way to automate the version extraction
        #        using the pkg_resource package?
        Gratia.RegisterReporterLibrary("psycopg2", "2.0.6")
        rev = Gratia.ExtractCvsRevision("$Revision: 1.13 $")
        tag = Gratia.ExtractCvsRevision("$Name:  $")
        Gratia.RegisterReporter("dCacheBillingAggregator.py",
                                str(rev) + " (tag " + str(tag) + ")")

        # BRIAN: attempt to pull the dCache version from RPM.
        version = "UNKNOWN"
        try:
            version = os.popen("rpm -q --qf '%{VERSION}-%{RELEASE}' " \
                               "dcache-server").read()
        except Exception:
            # Best effort only: keep "UNKNOWN" if rpm cannot be queried.
            pass
        Gratia.RegisterService("dCache", version)

        # Initialize gratia before attempting to read its config file.
        Gratia.Initialize()
        # Extract the configuration information into local variables.
        myconf = dCacheProbeConfig()

        # Get the name of the directory where we are to store the log files.
        logDir = myconf.get_LogFolder()

        # Make sure that the logging directory is present
        if not os.path.isdir(logDir):
            # Mode 0755, spelled so that it parses on Python 2 and 3 alike.
            os.mkdir(logDir, int('755', 8))

        logFileName = os.path.join(logDir, "dcacheTransfer.log")

        # Set up an alarm to send an email if the program terminates.
        termSubject = "dCache-transfer probe is going down"
        termMessage = "The dCache transfer probe for Gratia has " + \
                      "terminated.\nPlease check the logfile\n\n   " + \
                      logFileName + \
                      "\n\nfor the cause.\n"

        terminationAlarm = Alarm(myconf.get_EmailServerHost(),
                                 myconf.get_EmailFromAddress(),
                                 myconf.get_EmailToList(),
                                 termSubject, termMessage, 0, 0, False)

        # Set up the logger with a suitable format
        hdlr = RotatingFileHandler(logFileName, 'a', 512000, 10)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.setLevel(myconf.get_AggrLogLevel())
        logger.info("starting " + ProgramName)

        stopFileName = myconf.get_StopFileName()
        updateFreq = float(myconf.get_UpdateFrequency())
        logger.warning("update freq = %.2f" % updateFreq)

        # Create the aggregator instance that we will use.
        dataDir = myconf.get_DataFolder()
        aggregator = DCacheAggregator(myconf, dataDir)

        # If profiling was requested, turn it on.
        profiling = sys.argv.count('-profile') > 0
        if profiling:
            profiler = hotshot.Profile("profile.dat")
            logger.info("Enabling Profiling")

        # Now aggregate new records, then sleep, until somebody creates
        # the stop file...
        while True:
            # Make sure we (still) have a connection to Gratia.
            # No need for that during self test.
            if not TestContainer.isTest():
                Gratia.Maintenance()

            if profiling:
                profiler.run("aggregator.sendBillingInfoRecordsToGratia()")
            else:
                try:
                    aggregator.sendBillingInfoRecordsToGratia()
                except TestContainer.SimInterrupt:
                    # The self-test harness simulates a crash: start over
                    # with a fresh aggregator.
                    logger.info("BillingRecSimulator.SimInterrupt caught, " \
                        "restarting")
                    aggregator = DCacheAggregator(myconf, dataDir)
                    continue
            # Are we are shutting down?
            if os.path.exists(stopFileName):
                break

            # A self-test run makes a single pass.
            if TestContainer.isTest():
                break

            logger.warning("sleeping for = %.2f seconds" % updateFreq)
            sleep_check(updateFreq, stopFileName)

        # If we are profiling, print the results...
        if profiling:
            profiler.close()
            stats = hotshot.stats.load("profile.dat")
            stats.sort_stats('time', 'calls')
            stats.print_stats()

        logger.warning(ProgramName + " stop file detected.")
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # Deliberately broad last-chance handler: whatever went wrong is
        # reported, then the normal shutdown sequence below still runs.
        msg = ProgramName + " caught an exception:\n" + traceback.format_exc()
        print(msg)
        logger.error(msg)

    TestContainer.dumpStatistics(logger)

    # shut down the logger to make sure nothing is lost.
    logger.critical(ProgramName + " shutting down.")
    logging.shutdown()
    # try to send an email warning of the shutdown.
    if terminationAlarm is not None:
        terminationAlarm.event()

    sys.exit(1)
Exemplo n.º 10
0
def main():
    # We need the logger variable in the exception handler.
    # So we create it here.
    logger = logging.getLogger('DCacheAggregator')

    # Ignore hangup signals. We shouldn't die just because our parent
    # shell logs out.
    signal.signal(signal.SIGHUP, signal.SIG_IGN)
    # Try to catch common signals and send email before we die
    signal.signal(signal.SIGINT, warn_of_signal)
    signal.signal(signal.SIGQUIT, warn_of_signal)
    signal.signal(signal.SIGTERM, warn_of_signal)

    try:
        # Tell Gratia what versions we are using.
        # CHRIS: is there a way to automate the version extraction
        #        using the pkg_resource package?
        Gratia.RegisterReporterLibrary("psycopg2", "2.0.6")
        #Gratia.RegisterReporterLibrary( "SQLAlchemy", "0.4.1" )
        rev = Gratia.ExtractCvsRevision("$Revision: 1.13 $")
        tag = Gratia.ExtractCvsRevision("$Name:  $")
        Gratia.RegisterReporter("dCacheBillingAggregator.py",
                                str(rev) + " (tag " + str(tag) + ")")

        # BRIAN: attempt to pull the dCache version from RPM.
        version = "UNKNOWN"
        try:
            version = os.popen("rpm -q --qf '%{VERSION}-%{RELEASE}' " \
                               "dcache-server").read()
        except:
            pass
        Gratia.RegisterService("dCache", version)

        # Initialize gratia before attempting to read its config file.
        Gratia.Initialize()
        # Extract the configuration information into local variables.
        myconf = dCacheProbeConfig()

        # Get the name of the directory where we are to store the log files.
        logDir = myconf.get_LogFolder()

        # Make sure that the logging directory is present
        if not os.path.isdir(logDir):
            os.mkdir(logDir, 0755)

        logFileName = os.path.join(logDir, "dcacheTransfer.log")

        # Set up an alarm to send an email if the program terminates.
        termSubject = "dCache-transfer probe is going down"
        termMessage = "The dCache transfer probe for Gratia has " + \
                      "terminated.\nPlease check the logfile\n\n   " + \
                      logFileName + \
                      "\n\nfor the cause.\n"

        terminationAlarm = Alarm(myconf.get_EmailServerHost(),
                                 myconf.get_EmailFromAddress(),
                                 myconf.get_EmailToList(), termSubject,
                                 termMessage, 0, 0, False)

        # Set up the logger with a suitable format
        hdlr = RotatingFileHandler(logFileName, 'a', 512000, 10)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.setLevel(myconf.get_AggrLogLevel())
        logger.info("starting " + ProgramName)

        stopFileName = myconf.get_StopFileName()
        updateFreq = float(myconf.get_UpdateFrequency())
        logger.warn("update freq = %.2f" % updateFreq)

        # Create the aggregator instance that we will use.
        dataDir = myconf.get_DataFolder()
        aggregator = DCacheAggregator(myconf, dataDir)

        # If profiling was requested, turn it on.
        profiling = sys.argv.count('-profile') > 0
        if profiling:
            profiler = hotshot.Profile("profile.dat")
            logger.info("Enabling Profiling")

        # Now aggregate new records, then sleep, until somebody creates
        # the stop file...
        while 1:
            # Make sure we (still) have a connection to Gratia.
            if (not TestContainer.isTest()
                ):  # no need in that during self test
                Gratia.Maintenance()

            if profiling:
                profiler.run("aggregator.sendBillingInfoRecordsToGratia()")
            else:
                try:
                    aggregator.sendBillingInfoRecordsToGratia()
                except TestContainer.SimInterrupt:
                    logger.info("BillingRecSimulator.SimInterrupt caught, " \
                        "restarting")
                    aggregator = DCacheAggregator(myconf, dataDir)
                    continue
            # Are we are shutting down?
            if os.path.exists(stopFileName):
                break

            if TestContainer.isTest():
                break

            logger.warn("sleeping for = %.2f seconds" % updateFreq)
            sleep_check(updateFreq, stopFileName)

        # If we are profiling, print the results...
        if profiling:
            profiler.close()
            stats = hotshot.stats.load("profile.dat")
            stats.sort_stats('time', 'calls')
            stats.print_stats()

        logger.warn(ProgramName + " stop file detected.")
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # format the traceback into a string
        tblist = traceback.format_exception(sys.exc_type, sys.exc_value,
                                            sys.exc_traceback)
        msg = ProgramName + " caught an exception:\n" + "".join(tblist)
        print msg
        logger.error(msg)

    TestContainer.dumpStatistics(logger)

    # shut down the logger to make sure nothing is lost.
    logger.critical(ProgramName + " shutting down.")
    logging.shutdown()
    # try to send an email warning of the shutdown.
    if terminationAlarm != None:
        terminationAlarm.event()

    sys.exit(1)
Exemplo n.º 11
0
    def sendBillingInfoRecordsToGratia(self):
        """
        This is the public method for starting the dCache-transfer reporting.

        This will query records no more than _maxAge days old, and always
        starts queries on hour time boundaries (i.e., 1:00:00 not 1:02:00).

        This will continue to query until we hit records starting less than 75
        minutes ago, then return.  In self-test mode the cut-off is taken from
        TestContainer.getEndDateTime() instead.

        By default, we start with querying 60-second intervals, but will shrink
        this window if we encounter lots of data.

        If not summarizing: this method uses _execute to get all the data for
           a given interval, then uses _processResults to send them to Gratia.
           Once the query for a time interval is done, then we immediately
           checkpoint at the time _execute reports as fully covered.

        If summarizing: this method continues to query until it hits the end of
           an hour interval.  At that point, it summarizes once again, and sends
           the summaries up to Gratia.  We then only checkpoint on the hour.

        The loop also ends early when the stop file exists; in that case the
        database cursor and connection are closed before returning.
        """
        self._log.debug("sendBillingInfoRecordsToGratia")

        # Query no more than a set number of days in the past, rounded down
        # to the start of the hour.
        minTime = datetime.datetime.now() - datetime.timedelta(self._maxAge, 0)
        minTime = datetime.datetime(minTime.year, minTime.month, minTime.day,
                                    minTime.hour, 0, 0)

        # The latest allowed record is 75 minutes in the past, in order to make
        # sure we only query complete intervals
        latestAllowed = datetime.datetime.now() - datetime.timedelta(
            0, 75 * 60)

        if TestContainer.isTest():
            latestAllowed = TestContainer.getEndDateTime()

        # Start with either the last checkpoint or minTime days ago, whichever
        # is more recent.
        starttime = max(self._BIcheckpoint.lastDateStamp(), minTime)
        self._log.info("Starting queries at time %s." % starttime)

        dictRecordAgg = TimeBinRange.DictRecordAggregator(
            DCACHE_AGG_FIELDS, DCACHE_SUM_FIELDS)

        nextSummary = self._determineNextEndtime(starttime, summary=True)
        if self._summarize:
            self._log.debug("Next summary send time: %s." % nextSummary)

        results = []
        endtime = self._determineNextEndtime(starttime)
        totalRecords = 0
        # Loop until we have caught up to latestAllowed.
        while starttime < latestAllowed:
            assert starttime < endtime
            self._log.debug('sendBillingInfoRecordsToGratia: Processing ' \
                'starting at %s.' % starttime)
            # _execute returns every record in [starttime, next_starttime).
            # next_starttime is always later than starttime, but it can be
            # earlier than the endtime we asked for when _execute had to
            # shrink the window.
            next_starttime, rows = self._execute(starttime, endtime,
                                                 self._maxSelect)

            results += rows
            totalRecords += len(rows)
            if self._summarize:
                # Summarize the partial results
                results = Collapse.collapse(results, dictRecordAgg)
            assert next_starttime > starttime
            next_endtime = self._determineNextEndtime(next_starttime)

            # If we're not summarizing, we send up records each loop.
            if (not self._summarize) and results:
                totalRecords = 0
                # We now have all the rows we want; process them.
                # Checkpoint at the time actually covered by the query
                # (next_starttime), not at the requested endtime: if the
                # window was shrunk, a restart from endtime would skip the
                # records in [next_starttime, endtime).
                self._BIcheckpoint.createPending(next_starttime, '')
                self._processResults(results)
                self._BIcheckpoint.commit()
                # Go back to the default window once the data rate is well
                # below the select limit again.
                if (self._range < STARTING_RANGE and len(results)*4 < \
                       self._maxSelect):
                    self._range = STARTING_RANGE
                results = []
            # If we are summarizing, send records only per hour of data
            elif (next_endtime > nextSummary) and results:
                num_agg = totalRecords - len(results)
                if num_agg:
                    factor = float(totalRecords) / float(len(results))
                    self._log.info("Aggregated %i of %i records for time " \
                        "interval ending in %s.  %.1fx reduction." % \
                        (num_agg, totalRecords, nextSummary, factor))
                else:
                    self._log.debug("Unable to aggregate any of %i records" \
                        % totalRecords)
                totalRecords = 0
                self._BIcheckpoint.createPending(nextSummary, '')
                self._processResults(results)
                self._BIcheckpoint.commit()
                results = []
                self._range = STARTING_RANGE

            nextSummary = self._determineNextEndtime(next_starttime,
                                                     summary=True)

            endtime = next_endtime
            starttime = next_starttime

            # Check to see if the stop file has been created.  If so, break
            if os.path.exists(self._stopFileName):
                # Neha - 03/17/2011
                # Don't need to commit anything since we are only doing
                # select and no inserts or updates.
                # No cursor or connection is opened in self-test mode, so
                # there is nothing to close then.
                if not TestContainer.isTest():
                    self._cur.close()
                    self._connection.close()
                break
Exemplo n.º 12
0
    def _execute(self, starttime, endtime, maxSelect):
        """
        Execute the select command against the Billing DB return the results
        (possibly summarized)

        It is guaranteed this function will return an endtime greater than the
        starttime, but not guaranteed by how much.

        Note on the time returned as the first part of the tuple:
        We guarantee two things:
           a) returned time is strictly greater than starttime
           b) We return *all* records in the interval [starttime, return time).
        We do not guarantee that return time == parameter endtime.
        Thus it is suitable to use as the start time of the next select query.
        To do this, we reduce the range until it reaches 1 second or the
        query returns less than maxSelect results.   If the interval is one
        second and it still returns maxSelect results then we extend the limit
        of the query until all records fit.

        Side effect: when the interval is reduced, self._range is set to the
        new interval length in seconds.

        @param starttime: Datetime object for the start of the query interval.
        @param endtime: Datetime object for the end of the query interval.
        @param maxSelect: The maximum number of rows to select
        @return: Tuple containing the a time that is greater than all the
           records and the results
        @raise Exception: if maxSelect exceeds MAX_SELECT while the interval
           is already MIN_RANGE seconds or shorter, or if the query takes
           longer than MAX_QUERY_TIME_SECS.
        """
        assert starttime < endtime
        # Length of the query window in whole seconds.  timedelta.seconds
        # alone drops the days part, so a window of exactly N days would
        # otherwise look like a zero-second window.
        diff = endtime - starttime
        interval = diff.days * 86400 + diff.seconds
        if (maxSelect > MAX_SELECT) and (interval <= MIN_RANGE):
            raise Exception("Fatal error - more than %i transfers in %i" \
                " second(s)." % (MAX_SELECT, interval))
        datestr = str(starttime)
        datestr_end = str(endtime)

        # Query the database.  If it takes more than MAX_QUERY_TIME_SECS, then
        # have the probe self-destruct.
        query = BILLINGDB_SELECT_CMD % (
            (datestr, datestr_end, datestr, datestr_end, maxSelect))
        self._log.debug('_sendToGratia: will execute ' + query)
        # select_time starts negative so that adding the end time below
        # leaves the elapsed wall-clock seconds.
        select_time = -time.time()
        if not TestContainer.isTest():
            self._cur.execute(query)
            result = self._cur.fetchall()
        else:
            # Self test: the simulator answers the query instead of the DB.
            result = BillingRecSimulator.execute(query)
        select_time += time.time()
        if select_time > MAX_QUERY_TIME_SECS:
            raise Exception("Postgres query took %i seconds, more than " \
                "the maximum allowable of %i; this is a sign the DB is " \
                "not properly optimized!" % (int(select_time),
                MAX_QUERY_TIME_SECS))
        self._log.debug("BillingDB query finished in %.02f seconds and " \
            "returned %i records." % (select_time, len(result)))

        if not result:
            self._log.debug("No results from %s to %s." % (starttime, endtime))
            return endtime, result
        # dCache sometimes returns a negative transfer size; when this happens,
        # it also tosses up a complete garbage duration
        filtered_result = []
        for row in result:
            # Copy each row into a plain dict so it can be modified.
            row = dict(row)
            if row['transfersize'] < 0:
                row['transfersize'] = 0
                row['connectiontime'] = 0
            filtered_result.append(row)
        result = filtered_result

        # If we hit our limit, there's no telling how many identical records
        # there are on the final millisecond; we must re-query with a smaller
        # interval or a higher limit on the select.
        if len(result) == maxSelect:
            # Ensure that self._range is such that we always end up on a
            # minute boundary (eventually).  Whenever we decrease the interval
            # size it is guaranteed to be a multiple of what's left of the
            # interval to the next minute.  I.e the transitions are:
            #   60s ->  30s
            #   30s ->  15s (which can only happen at :30s)
            #   15s ->   5s (which can only happen at :15s :30s or :45s)
            #    5s ->   1s
            if (interval > 60):
                new_interval = 60
            elif (interval > 30):
                new_interval = 30
            elif (interval > 15):
                new_interval = 15
            elif (interval > 5):
                new_interval = 5
            else:
                new_interval = 1
            new_endtime = starttime + datetime.timedelta(0, new_interval)
            # Guard against the DST jump by making sure new_endtime > starttime.
            if (interval == new_interval) or (new_interval == 0) or \
                (new_endtime <= starttime):
                # The window cannot shrink any further: retry the same
                # window with twice the row limit instead.
                self._log.warning("Limit hit; increasing from %i to %i." % \
                    (maxSelect, maxSelect*2))
                endtime, result = self._execute(starttime, endtime,
                                                maxSelect * 2)
                assert endtime > starttime
                return endtime, result
            else:
                self._log.warning("Limit hit; decreasing time interval from %i" \
                   " to %i." % (interval, new_interval))
                self._range = new_interval
                endtime, result = self._execute(starttime, new_endtime,
                                                maxSelect)
                assert endtime > starttime
                return endtime, result

        return endtime, result
Exemplo n.º 13
0
    def __init__(self, configuration, chkptdir=None):
        """
        Set up the aggregator from the probe configuration.

        Reads the settings used by this class from configuration, creates the
        billing-info checkpoint and the "cannot send to Gratia" alarm, and,
        unless running in self-test mode, opens a psycopg2 connection and a
        DictCursor to the dCache billing database.

        @param configuration: Probe configuration object providing the
           get_* accessors called below.
        @param chkptdir: Optional directory for the checkpoint file; when
           None the checkpoint file name is used as-is (no directory part).
        @raise: Any exception from the database connection attempt is logged
           and re-raised.
        """
        # Pick up the logger
        self._log = logging.getLogger('DCacheAggregator')
        # Fermilab dCache billing node doesn't support user to uid mapping in
        # the /etc/passwd; instead of that there is GROUP_ID_LIST_FILE_NAME
        # that contains gid to group mapping.  The group should be present in
        # the user-vo-map file to be mapped correctly.
        self.__gid_file_mod_time = int(time.time())
        self.__group_map = {}
        self._unix_gid_list_file_name = configuration.get_UnixGidListFileName()
        if os.path.exists(self._unix_gid_list_file_name):
            self.__gid_file_mod_time = os.stat(
                self._unix_gid_list_file_name).st_mtime
            self.__refresh_group_map()

        # Neha - 03/17/2011
        # Using psycopg2 instead of sqlalchemy, so build a libpq-style
        # connection string.
        dbName = configuration.get_DBName()
        dbUser = configuration.get_DBLoginName()
        dbPassword = configuration.get_DBPassword()
        dbHost = configuration.get_DBHostName()
        DBurl = 'dbname=%s user=%s password=%s host=%s' % (
            dbName, dbUser, dbPassword, dbHost)
        # Same string with the password masked, for use in log messages, so
        # the database password never ends up in the log file.
        DBurlForLog = 'dbname=%s user=%s password=*** host=%s' % (
            dbName, dbUser, dbHost)

        self._skipIntraSite = configuration.get_OnlySendInterSiteTransfers()
        self._stopFileName = configuration.get_StopFileName()
        self._dCacheSvrHost = configuration.get_DCacheServerHost()
        # Create the billinginfo database checkpoint.
        self._maxAge = configuration.get_MaxBillingHistoryDays()
        if TestContainer.isTest():
            self._maxAge = TestContainer.getMaxAge()

        billinginfoChkpt = 'chkpt_dcache_xfer_DoNotDelete'
        if chkptdir is not None:
            billinginfoChkpt = os.path.join(chkptdir, billinginfoChkpt)
        self._BIcheckpoint = Checkpoint(billinginfoChkpt, self._maxAge)

        self._sendAlarm = Alarm(
            configuration.get_EmailServerHost(),
            configuration.get_EmailFromAddress(),
            configuration.get_EmailToList(),
            'dCacheTransfer probe aggregator alarm',
            'The dCache Transfer Probe was not able to send to Gratia.',
            2,  # If more than two errors have occurred
            1800,  # Max of once per half hour complaining
            True)

        self._summarize = configuration.get_Summarize()

        # Connect to the dCache postgres database.
        # Neha: 03/17/2011 - Removing sqlalchemy. Using psycopg2 instead
        # Bind the handles first so they exist (as None) in self-test mode,
        # where no database connection is made.
        self._connection = None
        self._cur = None
        try:
            if TestContainer.isTest():
                self._db = None
            else:
                self._connection = psycopg2.connect(DBurl)
                self._cur = self._connection.cursor(
                    cursor_factory=psycopg2.extras.DictCursor)
        except:
            # Log the failure with the masked connection string, then let
            # the caller deal with the original exception.
            errmsg = 'Failed to connect to %s\n\n%s' % (
                DBurlForLog, traceback.format_exc())
            self._log.error(errmsg)
            raise

        self._grid = configuration.get_Grid()
Exemplo n.º 14
0
def _CalcMaxSelect():
    """
    Returns the maximum number of sql results so that
    we do not use more than half of the install RAM on
    the current machine.
    """
    try:
        mem = _Meminfo()["MemTotal"]
        if (mem < 2048000):
            mem = 2048000
        return int(mem / 4)
    except:
        return 512000


# Limits for the billing DB queries.
#   MAX_SELECT          - hard cap on the row limit of a single select.
#   STARTING_MAX_SELECT - initial row limit (presumably the starting value
#                         of the aggregator's _maxSelect - confirm).
# The self test uses tiny limits; a real run sizes the cap from the
# machine's memory.
if not TestContainer.isTest():
    STARTING_MAX_SELECT = 32000
    MAX_SELECT = _CalcMaxSelect()
else:
    STARTING_MAX_SELECT = 50
    MAX_SELECT = 100

# Query window lengths in seconds; the same in both modes.
#   STARTING_RANGE - default window length.
#   MIN_RANGE      - shortest window a query is narrowed to.
STARTING_RANGE = 60
MIN_RANGE = 1

BILLINGDB_SELECT_CMD = """
 SELECT
        b.datestamp AS datestamp,
        b.transaction AS transaction,
        b.cellname AS cellname,