예제 #1
0
    def __init__(self, cp):
        """Import the Gratia modules on first use and initialize Gratia.

        The modules ``Gratia``, ``StorageElement`` and ``StorageElementRecord``
        are imported by name and published through module-level globals; the
        ``has_gratia`` flag records that this already happened so later
        instances skip the import.

        After ``Gratia.Initialize()`` the site name and probe name are
        replaced by values derived from this host's FQDN whenever the
        configured value contains the word "generic" (case-insensitive).

        Args:
            cp: not used by the code in this method.

        Raises:
            ImportError: if any of the three modules cannot be imported. A
                short diagnostic is written to stderr before re-raising.
        """
        global has_gratia
        global Gratia
        global StorageElement
        global StorageElementRecord
        if not has_gratia:
            try:
                Gratia = __import__("Gratia")
                StorageElement = __import__("StorageElement")
                StorageElementRecord = __import__("StorageElementRecord")
            except ImportError:
                # The original re-raised from a bare ``except`` and therefore
                # never reached its own diagnostic; emit it here and keep
                # propagating the same exception to the caller.
                sys.stderr.write(
                    "Unable to import Gratia and Storage modules!\n")
                raise
            has_gratia = True

        Gratia.Initialize()

        # Both overrides below are best effort: a failure to read or update
        # the configuration must not prevent the probe from starting.
        # ``except Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        try:
            if 'generic' in Gratia.Config.get_SiteName().lower():
                Gratia.Config.setSiteName(socket.getfqdn())
        except Exception:
            pass
        try:
            if 'generic' in Gratia.Config.get_ProbeName().lower():
                Gratia.Config.setProbeName('dCache-storage:%s' %
                                           socket.getfqdn())
        except Exception:
            pass
예제 #2
0
    def __init__(self):
        """Set up the probe: options, Gratia, lock, checkpoint and database.

        The steps run in a fixed order: parse the command-line options,
        initialize Gratia from the configured file, check the runtime
        preconditions, optionally sleep a random number of seconds, take the
        exclusive probe lock, register with Gratia, open the checkpoint,
        connect to the database and pick the accounting reader matching the
        SLURM version.

        Exits with status 1 if option parsing fails.

        Raises:
            Exception: if the Gratia config file is unset or does not exist.
        """
        try:
            self.opts, self.args = self.parse_opts()
        except Exception as err:
            print(err, file=sys.stderr)
            sys.exit(1)

        # Gratia can only be initialized from an existing config file.
        config_path = self.opts.gratia_config
        if not (config_path and os.path.exists(config_path)):
            raise Exception("Gratia config, %s, does not exist." %
                            config_path)
        Gratia.Initialize(config_path)

        if self.opts.verbose:
            Gratia.Config.set_DebugLevel(5)

        # Verify the probe's runtime environment before doing any work.
        GratiaWrapper.CheckPreconditions()

        if self.opts.sleep:
            delay = random.randint(1, int(self.opts.sleep))
            DebugPrint(2, "Sleeping for %d seconds before proceeding." % delay)
            time.sleep(delay)

        # Only one instance of this probe may run at a time.
        GratiaWrapper.ExclusiveLock()

        self.register_gratia("slurm_meter")

        # The checkpoint lives in Gratia's working folder when enabled;
        # otherwise the checkpoint object is created without a file.
        checkpoint_file = None
        if self.opts.checkpoint:
            checkpoint_file = os.path.join(Gratia.Config.get_WorkingFolder(),
                                           "checkpoint")
        self.checkpoint = SlurmCheckpoint(checkpoint_file)

        # Without a stored checkpoint value, start DataFileExpiration days
        # in the past.
        if self.checkpoint.val is None:
            history_seconds = Gratia.Config.get_DataFileExpiration() * 86400
            self.checkpoint.val = int(time.time() - history_seconds)

        self.conn = self.get_db_conn()
        self.cluster = Gratia.Config.getConfigAttribute('SlurmCluster')

        # The accounting schema changed in 15.08.0 (TRES, trackable
        # resources, were added in 15.08.0pre5): older versions use the
        # original-schema reader, newer ones the TRES-aware reader.
        slurm_version = self.get_slurm_version()
        acct_class = (SlurmAcct_v1
                      if LooseVersion(slurm_version) < LooseVersion("15.08.0")
                      else SlurmAcct_v2)
        self.sacct = acct_class(self.conn, self.cluster, slurm_version)
예제 #3
0
        boincjob = False

        for var2 in lines:
            if var2.count('QDate') > 0:
                starttime = var2.split()[2]
            elif var2.count('RemoteWallClockTime') > 0:
                walltime = var2.split()[2]
            elif var2.count('CompletionDate') > 0:
                endtime = var2.split()[2]
            elif var2.count('Owner') > 0:
                if var2.split()[2] == '"boinc"':
                    boincjob = True

        if boincjob == True:
            Gratia.setProbeBatchManager('Condor')
            Gratia.Initialize()
            r = Gratia.UsageRecord('Condor')
            r.ResourceType('Backfill')

            # parsing the filenames for the hostname/localjobid.
            # the files are in the format: history.<hostname>#<localjobid>#1#<localjobid>

            host = var.partition('.')[2].partition('#')[0]
            localjobid = var.partition('.')[2].partition('#')[2].partition(
                '#')[0]

            # print 'endtime: ' + endtime
            # print 'starttime: ' + starttime
            # print 'walltime: ' + walltime

            # Gratia likes ints, not strings, for times.
예제 #4
0
    def start(self):
        """Initialize Gratia, the probe input and the optional checkpoint.

        Steps, in order: validate the Gratia config file named in the parsed
        options, initialize Gratia from it, re-apply verbosity, check the
        runtime preconditions, optionally sleep a random number of seconds,
        take the exclusive probe lock, initialize the input, attach the
        checkpoint (if enabled) and finally register with Gratia.

        Must be invoked after options and parameters are parsed, because the
        option file name is needed.

        Raises:
            Exception: if no options were parsed, no Gratia config file was
                given, or the given file does not exist.
        """

        ### Initialize Gratia
        # Resolve the path once so the error message can be built even when
        # the options object itself is missing (previously that case raised
        # AttributeError while formatting the message).
        gratia_config = self._opts.gratia_config if self._opts else None
        if not gratia_config or not os.path.exists(gratia_config):
            # TODO: print a message instead of an exception?
            raise Exception("Gratia config file (%s) does not exist." %
                            gratia_config)
        # Print options and initial conditions
        DebugPrint(5, "Initial options: %s" % self._opts)

        # Initialization parses the config file. No debug print will work before this
        Gratia.Initialize(gratia_config)

        # Set to verbose in case the config changed it
        self.set_verbose()

        # Sanity checks for the probe's runtime environment.
        if self._opts.enable:
            GratiaWrapper.CheckPreconditions(check_enabled=False)
        else:
            GratiaWrapper.CheckPreconditions()

        if self._opts.sleep:
            rnd = random.randint(1, int(self._opts.sleep))
            DebugPrint(2, "Sleeping for %d seconds before proceeding." % rnd)
            time.sleep(rnd)

        # Make sure we have an exclusive lock for this probe.
        GratiaWrapper.ExclusiveLock()

        ### Initialize input (config file must be available)
        # Input must specify which parameters it requires from the config file
        # The probe provides static information from the config file
        if not self._probeinput:
            self._probeinput = ProbeInput()
        input_parameters = self._probeinput.get_init_params()
        input_ini = self.get_config_att_list(input_parameters)
        # Check for test mode: start and other methods may change
        if 'input' in self._opts.test:
            DebugPrint(3, "Running input in test mode")
            self._probeinput.do_test()
        # Finish input initialization, including DB connection (if used)
        self._probeinput.start(input_ini)

        # get_DataFileExpiration() returns the value in the config file or 31
        # TODO: Do we want to always not consider values older than 31 days or only when checkpointing is
        # enabled?

        # Find the checkpoint filename (if enabled) - after initializing the input!
        if self._opts.checkpoint:
            checkpoint_file = self.get_config_attribute('CheckpointFile')
            # A configured name is passed on as a full name; otherwise fall
            # back to "checkpoint" inside Gratia's working folder.
            full_checkpoint_name = bool(checkpoint_file)
            if not checkpoint_file:
                checkpoint_file = os.path.join(
                    Gratia.Config.get_WorkingFolder(), "checkpoint")
            data_expiration = Gratia.Config.get_DataFileExpiration()
            # Only process DataFileExpiration days of history
            # (unless we're resuming from a checkpoint file)
            # TODO: is datafileexpiration a maximum value or a default (if no checkpoint is specified)?
            #       Do we want both?
            # Open the checkpoint file
            self._probeinput.add_checkpoint(checkpoint_file,
                                            default_val=data_expiration,
                                            fullname=full_checkpoint_name)

        ### Complete Gratia initialization
        # This uses the input version (after Input initialization)
        self.register_gratia()
예제 #5
0
def main():
    """Run the dCache billing aggregator until a stop file appears.

    Installs signal handlers, registers the reporter and service versions
    with Gratia, initializes Gratia, sets up a rotating log file and a
    termination e-mail alarm, then repeatedly sends billing records to
    Gratia and sleeps for the configured update frequency. The loop ends
    when the stop file exists or when running under the test container.

    Any exception other than KeyboardInterrupt/SystemExit is formatted,
    printed and logged; the shutdown sequence then still runs. The process
    always terminates through ``sys.exit(1)``.
    NOTE(review): exit status 1 is also used after a clean stop-file
    shutdown - confirm that this is intended.
    """
    # We need the logger variable in the exception handler.
    # So we create it here.
    logger = logging.getLogger('DCacheAggregator')

    # Bound before the try block so the shutdown code at the end can test
    # it even when start-up fails before the alarm has been created.
    terminationAlarm = None

    # Ignore hangup signals. We shouldn't die just because our parent
    # shell logs out.
    signal.signal(signal.SIGHUP, signal.SIG_IGN)
    # Try to catch common signals and send email before we die
    signal.signal(signal.SIGINT, warn_of_signal)
    signal.signal(signal.SIGQUIT, warn_of_signal)
    signal.signal(signal.SIGTERM, warn_of_signal)

    try:
        # Tell Gratia what versions we are using.
        # CHRIS: is there a way to automate the version extraction
        #        using the pkg_resource package?
        Gratia.RegisterReporterLibrary("psycopg2", "2.0.6")
        rev = Gratia.ExtractCvsRevision("$Revision: 1.13 $")
        tag = Gratia.ExtractCvsRevision("$Name:  $")
        Gratia.RegisterReporter("dCacheBillingAggregator.py",
                                str(rev) + " (tag " + str(tag) + ")")

        # BRIAN: attempt to pull the dCache version from RPM.
        version = "UNKNOWN"
        try:
            rpm_pipe = os.popen("rpm -q --qf '%{VERSION}-%{RELEASE}' "
                                "dcache-server")
            try:
                version = rpm_pipe.read()
            finally:
                # Always release the pipe, even if reading fails.
                rpm_pipe.close()
        except Exception:
            # Best effort only: keep whatever value 'version' has so far.
            pass
        Gratia.RegisterService("dCache", version)

        # Initialize gratia before attempting to read its config file.
        Gratia.Initialize()
        # Extract the configuration information into local variables.
        myconf = dCacheProbeConfig()

        # Get the name of the directory where we are to store the log files.
        logDir = myconf.get_LogFolder()

        # Make sure that the logging directory is present.
        # 0o755 is the octal spelling accepted by Python 2.6+ and Python 3.
        if not os.path.isdir(logDir):
            os.mkdir(logDir, 0o755)

        logFileName = os.path.join(logDir, "dcacheTransfer.log")

        # Set up an alarm to send an email if the program terminates.
        termSubject = "dCache-transfer probe is going down"
        termMessage = "The dCache transfer probe for Gratia has " + \
                      "terminated.\nPlease check the logfile\n\n   " + \
                      logFileName + \
                      "\n\nfor the cause.\n"

        terminationAlarm = Alarm(myconf.get_EmailServerHost(),
                                 myconf.get_EmailFromAddress(),
                                 myconf.get_EmailToList(), termSubject,
                                 termMessage, 0, 0, False)

        # Set up the logger with a suitable format
        hdlr = RotatingFileHandler(logFileName, 'a', 512000, 10)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.setLevel(myconf.get_AggrLogLevel())
        logger.info("starting " + ProgramName)

        stopFileName = myconf.get_StopFileName()
        updateFreq = float(myconf.get_UpdateFrequency())
        logger.warning("update freq = %.2f" % updateFreq)

        # Create the aggregator instance that we will use.
        dataDir = myconf.get_DataFolder()
        aggregator = DCacheAggregator(myconf, dataDir)

        # If profiling was requested, turn it on.
        profiling = '-profile' in sys.argv
        if profiling:
            profiler = hotshot.Profile("profile.dat")
            logger.info("Enabling Profiling")

        # Now aggregate new records, then sleep, until somebody creates
        # the stop file...
        while True:
            # Make sure we (still) have a connection to Gratia.
            # No need for that during a self test.
            if not TestContainer.isTest():
                Gratia.Maintenance()

            if profiling:
                # Profile the bound method directly: Profile.run() executes
                # its command string in the __main__ namespace, where the
                # local variable 'aggregator' is not visible.
                profiler.runcall(aggregator.sendBillingInfoRecordsToGratia)
            else:
                try:
                    aggregator.sendBillingInfoRecordsToGratia()
                except TestContainer.SimInterrupt:
                    logger.info("BillingRecSimulator.SimInterrupt caught, " \
                        "restarting")
                    aggregator = DCacheAggregator(myconf, dataDir)
                    continue
            # Are we are shutting down?
            if os.path.exists(stopFileName):
                break

            if TestContainer.isTest():
                break

            logger.warning("sleeping for = %.2f seconds" % updateFreq)
            sleep_check(updateFreq, stopFileName)

        # If we are profiling, print the results...
        if profiling:
            profiler.close()
            stats = hotshot.stats.load("profile.dat")
            stats.sort_stats('time', 'calls')
            stats.print_stats()

        logger.warning(ProgramName + " stop file detected.")
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # Deliberate catch-all at the top-level boundary: report the
        # failure, then fall through to the shutdown sequence below.
        msg = ProgramName + " caught an exception:\n" + traceback.format_exc()
        print(msg)
        logger.error(msg)

    TestContainer.dumpStatistics(logger)

    # shut down the logger to make sure nothing is lost.
    logger.critical(ProgramName + " shutting down.")
    logging.shutdown()
    # try to send an email warning of the shutdown.
    if terminationAlarm is not None:
        terminationAlarm.event()

    sys.exit(1)
예제 #6
0
class SlurmProbe:
    """Gratia probe that reads job accounting data from a SLURM database.

    Constructing an instance performs the whole start-up sequence; see
    ``__init__``.
    """

    # Parsed command-line options and positional arguments.
    opts = None
    args = None
    # SlurmCheckpoint instance created during start-up.
    checkpoint = None
    # Database connection returned by get_db_conn().
    conn = None
    # Value of the 'SlurmCluster' Gratia config attribute.
    cluster = None
    # SlurmAcct instance built from the connection and cluster.
    sacct = None

    def __init__(self):
        """Set up the probe: options, Gratia, lock, checkpoint and database.

        The steps run in a fixed order: parse the command-line options,
        initialize Gratia from the configured file, check the runtime
        preconditions, optionally sleep a random number of seconds, take the
        exclusive probe lock, register with Gratia, open the checkpoint,
        connect to the database and create the accounting reader.

        Exits with status 1 if option parsing fails.

        Raises:
            Exception: if the Gratia config file is unset or does not exist.
        """
        try:
            self.opts, self.args = self.parse_opts()
        except Exception as e:
            # 'except ... as' and an explicit stderr write behave the same
            # as the Python-2-only forms and also work on Python 3.
            sys.stderr.write(str(e) + "\n")
            sys.exit(1)

        # Initialize Gratia
        if not self.opts.gratia_config or not os.path.exists(
                self.opts.gratia_config):
            raise Exception("Gratia config, %s, does not exist." %
                            self.opts.gratia_config)
        Gratia.Initialize(self.opts.gratia_config)

        if self.opts.verbose:
            Gratia.Config.set_DebugLevel(5)

        # Sanity checks for the probe's runtime environment.
        GratiaWrapper.CheckPreconditions()

        if self.opts.sleep:
            rnd = random.randint(1, int(self.opts.sleep))
            DebugPrint(2, "Sleeping for %d seconds before proceeding." % rnd)
            time.sleep(rnd)

        # Make sure we have an exclusive lock for this probe.
        GratiaWrapper.ExclusiveLock()

        self.register_gratia("slurm_meter")

        # Find the checkpoint filename (if enabled)
        if self.opts.checkpoint:
            checkpoint_file = os.path.join(Gratia.Config.get_WorkingFolder(),
                                           "checkpoint")
        else:
            checkpoint_file = None

        # Open the checkpoint file
        self.checkpoint = SlurmCheckpoint(checkpoint_file)

        # Only process DataFileExpiration days of history
        # (unless we're resuming from a checkpoint file)
        if self.checkpoint.val is None:
            self.checkpoint.val = int(time.time() -
                                      (Gratia.Config.get_DataFileExpiration() *
                                       86400))

        # Connect to database
        self.conn = self.get_db_conn()

        self.cluster = Gratia.Config.getConfigAttribute('SlurmCluster')
        self.sacct = SlurmAcct(self.conn, self.cluster)