예제 #1
0
    def register_gratia(self, name):
        """Register the reporter, the SLURM service and the batch manager.

        The reporter is registered under ``name`` first. The SLURM version is
        then obtained from ``self.get_slurm_version()``; if that call raises,
        the error is logged at debug level 0 and re-raised, so neither the
        service nor the batch manager gets registered.
        """
        Gratia.RegisterReporter(name)

        try:
            version = self.get_slurm_version()
        except Exception as err:
            failure = "Unable to get SLURM version: %s" % str(err)
            DebugPrint(0, failure)
            raise
        else:
            # Only reached when the version lookup succeeded.
            Gratia.RegisterService("SLURM", version)
            Gratia.setProbeBatchManager("slurm")
예제 #2
0
    def __init__(self):
        """Prepare the probe: options, Gratia, lock, checkpoint, DB, schema.

        The steps run in this order: parse the options (exit status 1 when
        parsing fails), initialize Gratia from the configured file, check the
        runtime preconditions, optionally sleep a random number of seconds,
        take the exclusive lock, register with Gratia, open the checkpoint,
        connect to the database and select the accounting reader that matches
        the SLURM version.

        Raises:
            Exception: if the Gratia config path is unset or does not exist.
        """
        try:
            self.opts, self.args = self.parse_opts()
        except Exception as err:
            print(err, file=sys.stderr)
            sys.exit(1)

        # Initialize Gratia
        config_path = self.opts.gratia_config
        if not (config_path and os.path.exists(config_path)):
            raise Exception("Gratia config, %s, does not exist." %
                            config_path)
        Gratia.Initialize(config_path)

        if self.opts.verbose:
            Gratia.Config.set_DebugLevel(5)

        # Sanity checks for the probe's runtime environment.
        GratiaWrapper.CheckPreconditions()

        max_sleep = self.opts.sleep
        if max_sleep:
            delay = random.randint(1, int(max_sleep))
            DebugPrint(2, "Sleeping for %d seconds before proceeding." % delay)
            time.sleep(delay)

        # Make sure we have an exclusive lock for this probe.
        GratiaWrapper.ExclusiveLock()

        self.register_gratia("slurm_meter")

        # The checkpoint lives in Gratia's working folder; None disables it.
        checkpoint_file = (
            os.path.join(Gratia.Config.get_WorkingFolder(), "checkpoint")
            if self.opts.checkpoint else None)
        self.checkpoint = SlurmCheckpoint(checkpoint_file)

        # Only process DataFileExpiration days of history
        # (unless we're resuming from a checkpoint file)
        if self.checkpoint.val is None:
            now = time.time()
            history_seconds = Gratia.Config.get_DataFileExpiration() * 86400
            self.checkpoint.val = int(now - history_seconds)

        # Connect to database
        self.conn = self.get_db_conn()
        self.cluster = Gratia.Config.getConfigAttribute('SlurmCluster')

        # SLURM made changes to the accounting database schema:
        # TRES (Trackable resources) were added in 15.08.0pre5.
        slurm_version = self.get_slurm_version()
        has_tres = not (LooseVersion(slurm_version) < LooseVersion("15.08.0"))
        acct_class = SlurmAcct_v2 if has_tres else SlurmAcct_v1
        self.sacct = acct_class(self.conn, self.cluster, slurm_version)
예제 #3
0
    def __init__(self, cp):
        """Import the Gratia modules on first use and initialize Gratia.

        ``Gratia``, ``StorageElement`` and ``StorageElementRecord`` are
        imported into module globals the first time an instance is created
        (tracked by the module-level ``has_gratia`` flag). An import failure
        propagates to the caller.

        After ``Gratia.Initialize()``, a site name or probe name containing
        "generic" (case-insensitive) is replaced by a value built from this
        host's fully qualified domain name.

        :param cp: not used by the lines of this constructor shown here;
            kept so the signature stays unchanged.
        """
        global has_gratia
        global Gratia
        global StorageElement
        global StorageElementRecord
        if not has_gratia:
            # The original wrapped these imports in ``try/except: raise``,
            # which re-raised unconditionally and made the later
            # "unable to import" exit branch unreachable. Import errors
            # therefore propagate exactly as before.
            Gratia = __import__("Gratia")
            StorageElement = __import__("StorageElement")
            StorageElementRecord = __import__("StorageElementRecord")
            has_gratia = True

        Gratia.Initialize()
        # Best effort: failing to adjust the names must not stop the probe,
        # but KeyboardInterrupt/SystemExit must not be swallowed either.
        try:
            if 'generic' in Gratia.Config.get_SiteName().lower():
                Gratia.Config.setSiteName(socket.getfqdn())
        except Exception:
            pass
        try:
            if 'generic' in Gratia.Config.get_ProbeName().lower():
                Gratia.Config.setProbeName('dCache-storage:%s' %
                                           socket.getfqdn())
        except Exception:
            pass
예제 #4
0
    def process_record(self, record):
        """Build a Gratia usage record of type "Batch" from ``record``.

        :param record: mapping holding the parsed input values. When it has a
            'gratia_logfile' entry, that file is registered as a transient
            input file of the returned usage record.
        :return: the newly created ``Gratia.UsageRecord``.
        """
        #TODO: yield the value for processing to gratia ()
        # logfile attribute (if present) is used to keep track and delete files

        DebugPrint(5, "Creating JUR for %s" % record)

        # Filter out uninteresting records (and remove their files).
        # The condition is a template placeholder: as written, no record is
        # ever filtered out.
        if False:
            if 'gratia_logfile' in record:
                DebugPrint(
                    1, 'Deleting transient record file: ' +
                    record["gratia_logfile"])
                file_utils.RemoveFile(record['gratia_logfile'])
            raise IgnoreRecordException("Ignoring record.")

        # Define the record
        # UsageRecord is defined in https://twiki.opensciencegrid.org/bin/view/Accounting/ProbeDevelopement
        # setters have the name of the attribute
        # Set resource type ( Batch, BatchPilot, GridMonitor, Storage, ActiveTape )
        resource_type = "Batch"
        r = Gratia.UsageRecord(resource_type)

        # fill r using the values in record

        # remember to specify the transient file (that will be removed if the record
        # is acquired successfully).
        # Test the same key that is read: checking 'logfile' while reading
        # 'gratia_logfile' either raised KeyError or skipped the registration.
        if 'gratia_logfile' in record:
            r.AddTransientInputFile(record['gratia_logfile'])

        return r
예제 #5
0
    def register_gratia(self, name):
        """Register the reporter ``name`` and look up the SLURM version.

        If ``self.get_slurm_version()`` raises, the error is logged at debug
        level 0 and re-raised.
        """
        Gratia.RegisterReporter(name)

        try:
            slurm_version = self.get_slurm_version()
        except Exception as e:
            # ``except Exception as e`` is accepted by Python 2.6+ and
            # Python 3; the comma form is a syntax error on Python 3.
            DebugPrint(0, "Unable to get SLURM version: %s" % str(e))
            raise
예제 #6
0
    def register_gratia(self):
        """Register the probe (Reporter) and its input (Service) in Gratia.

        The reporter is registered under ``self.probe_name``. The service is
        registered with the name reported by ``self._probeinput`` and the
        version returned by ``self.get_version()``. A failure to obtain the
        version is logged at debug level 0 and re-raised; SystemExit and
        KeyboardInterrupt are re-raised without being logged.

        :return: None
        """
        Gratia.RegisterReporter(self.probe_name)

        try:
            version = self.get_version()
        except (SystemExit, KeyboardInterrupt):
            raise
        except Exception as err:
            DebugPrint(0, "Unable to get input version: %s" % str(err))
            raise
        else:
            # TODO: check the meaning of RegisterReporter vs RegisterService
            Gratia.RegisterService(self._probeinput.get_name(), version)
    def _processDBRow(self, row):
        """
        Completely process a single DB row.  Take the row, convert it to a
        UsageRecord, and send it up to Gratia.  Process any recoverable errors
        which occurred during the process.

        Note we skip a row if it is an Intra-site transfer and we are instructed
        not to send them.  In test mode the row is handed to TestContainer
        instead of Gratia and the value of TestContainer.processRow is
        returned.

        Otherwise, we process the row in Gratia, or exit the probe with
        status 2 when Gratia answers with a fatal or internal error.

        @param row: A dictionary-like object describing the Billing DB entry.
        @return: The number of jobs in this row, regardless of whether we sent
           them successfully or not.
        """
        # Skip intra-site transfers if required
        if self._skipIntraSiteXfer(row):
            return row['njobs']

        if TestContainer.isTest():
            if self._summarize:
                TestContainer.sendInterrupt(15)
            return TestContainer.processRow(row, self._log)

        usageRecord = self._convertBillingInfoToGratiaUsageRecord(row)

        # Send to gratia, and see what it says.
        response = Gratia.Send(usageRecord)
        baseMsg = "Record: %s, %s, njobs %i" % (str(
            row['datestamp']), row['transaction'], row['njobs'])
        if response == "Fatal Error: too many pending files":
            # The server is currently not accepting record and
            # Gratia.py was not able to store the record, so it will
            # need to be resent.
            # For now take a long nap. This method only sleeps; it does
            # not resend the record itself.
            self._log.error("Error sending : too many pending files")
            longsleep = 15 * 60
            self._log.warn("sleeping for = %i seconds." % longsleep)
            sleep_check(longsleep, self._stopFileName)
        elif response.startswith(('Fatal Error', 'Internal Error')):
            self._log.critical('error sending ' + baseMsg +
                               '\ngot response ' + response)
            sys.exit(2)

        if response[:2] != 'OK':
            # Anything other than an OK answer is reported as an error.
            self._log.error('error sending ' + baseMsg +
                            '\ngot response ' + response)
        else:
            # This message used to sit after sys.exit(2) and could never
            # run; it now reports the successful send.
            self._log.debug('sent ' + baseMsg)

        return row['njobs']
예제 #8
0
def GetRecord(jobid=0):
    """
    Build a sample Gratia 'Batch' usage record filled with fixed values.

    The only field that depends on the argument is the local job id, which
    ends up as 'PBS.1234.' followed by str(jobid).
    """
    sample = Gratia.UsageRecord('Batch')

    # Who ran the job.
    sample.LocalUserId('cmsuser000')
    sample.GlobalUsername('john ainsworth')
    sample.DN('CN=john ainsworth, L=MC, OU=Manchester, O=eScience, C=UK')

    # Set a placeholder id, then overwrite the previous entry.
    sample.LocalJobId('PBS.1234.0bad')
    local_job_id = 'PBS.1234.' + str(jobid)
    sample.LocalJobId(local_job_id)

    sample.JobName('cmsreco ', 'this is not a real job name')
    sample.Charge('1240')
    # Status is set twice: once as text, once as a number.
    sample.Status('4')
    sample.Status(4)

    sample.Njobs(3, 'Aggregation over 10 days')

    sample.Network(3.5, 'Gb', 30, 'total')

    # sample.Disk(3.5, "Gb", 13891, "max")
    # sample.Memory(650000, "KB", "min")
    # sample.Swap(1.5, "GB", "max")

    sample.ServiceLevel('BottomFeeder', 'QOS')

    sample.TimeDuration(24, 'submit')
    sample.TimeInstant('2005-11-02T15:48:39Z', 'submit')

    # Durations are given both as numbers and as text.
    wall_seconds = 6000 * 3600 * 25 + 63 * 60 + 21.2
    sample.WallDuration(wall_seconds, 'Was entered in seconds')
    sample.CpuDuration('PT23H12M1.75S', 'user', 'Was entered as text')
    sample.CpuDuration('PT12M1.75S', 'sys', 'Was entered as text')
    sample.NodeCount(3)  # default to total
    sample.Processors(3, .75, 'total')
    sample.StartTime(1130946550, 'Was entered in seconds')
    sample.EndTime('2005-11-03T17:52:55Z', 'Was entered as text')

    # The same machine name is used for MachineName and Host.
    worker_node = 'flxi02.fnal.gov'
    sample.MachineName(worker_node)
    sample.SubmitHost('patlx7.fnal.gov')
    sample.Host(worker_node, True)
    sample.Queue('CepaQueue')

    sample.ProjectName('cms reco')

    sample.AdditionalInfo('RemoteWallTime', 94365)
    sample.Resource('RemoteCpuTime', 'PT23H')

    return sample
예제 #9
0
def GetRecord(jobid=0):
    """Build a sample Gratia "Batch" usage record filled with fixed values.

    The only field that depends on the argument is the local job id, which
    ends up as "PBS.1234.5." followed by str(jobid).
    """
    sample = Gratia.UsageRecord("Batch")

    # Who ran the job.
    sample.LocalUserId("cmsuser000")
    sample.GlobalUsername("john ainsworth")
    sample.DN("CN=john ainsworth, L=MC, OU=Manchester, O=eScience, C=UK")

    # Set a placeholder id, then overwrite the previous entry.
    sample.LocalJobId("PBS.1234.0bad")
    local_job_id = "PBS.1234.5." + str(jobid)
    sample.LocalJobId(local_job_id)

    sample.JobName("cmsreco", "this is not a real job name")
    sample.Charge("1240")
    # Status is set twice: once as text, once as a number.
    sample.Status("4")
    sample.Status(4)

    sample.Njobs(3, "Aggregation over 10 days")

    sample.Network(3.5, "Gb", 30, "total")
    # sample.Disk(3.5, "Gb", 13891, "max")
    # sample.Memory(650000, "KB", "min")
    # sample.Swap(1.5, "GB", "max")
    sample.ServiceLevel("BottomFeeder", "QOS")

    sample.TimeDuration(24, "submit")
    sample.TimeInstant("2005-11-02T15:48:39Z", "submit")

    # Durations are given both as numbers and as text.
    wall_seconds = 6000 * 3600 * 25 + 63 * 60 + 21.2
    sample.WallDuration(wall_seconds, "Was entered in seconds")
    sample.CpuDuration("PT23H12M1.75S", "user", "Was entered as text")
    sample.CpuDuration("PT12M1.75S", "sys", "Was entered as text")
    sample.NodeCount(3)  # default to total
    sample.Processors(3, .75, "total")
    sample.StartTime(1130946550, "Was entered in seconds")
    sample.EndTime("2005-11-03T17:52:55Z", "Was entered as text")

    # The same machine name is used for MachineName and Host.
    worker_node = "flxi02.fnal.gov"
    sample.MachineName(worker_node)
    sample.SubmitHost("patlx7.fnal.gov")
    sample.Host(worker_node, True)
    sample.Queue("CepaQueue")

    sample.ProjectName("cms reco")

    sample.AdditionalInfo("RemoteWallTime", 94365)
    sample.Resource("RemoteCpuTime", "PT23H")

    return sample
예제 #10
0
    def process_record(self, record):
        """Build a Gratia usage record of type "Batch" from ``record``.

        :param record: mapping holding the parsed input values. When it has a
            'gratia_logfile' entry, that file is registered as a transient
            input file of the returned usage record.
        :return: the newly created ``Gratia.UsageRecord``.
        """
        #TODO: yield the value for processing to gratia ()
        # logfile attribute (if present) is used to keep track and delete files

        DebugPrint(5, "Creating JUR for %s" % record)

        # Filter out uninteresting records (and remove their files).
        # The condition is a template placeholder: as written, no record is
        # ever filtered out.
        if False:
            if 'gratia_logfile' in record:
                DebugPrint(
                    1, 'Deleting transient record file: ' +
                    record["gratia_logfile"])
                file_utils.RemoveFile(record['gratia_logfile'])
            raise IgnoreRecordException("Ignoring record.")

        # Define the record
        # UsageRecord is defined in https://twiki.opensciencegrid.org/bin/view/Accounting/ProbeDevelopement
        # setters have the name of the attribute
        # Set resource type ( Batch, BatchPilot, GridMonitor, Storage, ActiveTape )
        resource_type = "Batch"
        r = Gratia.UsageRecord(resource_type)

        # fill r using the values in record

        # remember to specify the transient file (that will be removed if the record
        # is acquired successfully).
        # Test the same key that is read: checking 'logfile' while reading
        # 'gratia_logfile' either raised KeyError or skipped the registration.
        if 'gratia_logfile' in record:
            r.AddTransientInputFile(record['gratia_logfile'])

        return r


# TODO: end of part to remove
#############################################################

# Some references
# http://seann.herdejurgen.com/resume/samag.com/html/v11/i04/a6.htm
# http://stackoverflow.com/questions/14863224/efficient-reading-of-800-gb-xml-file-in-python-2-7
# http://radimrehurek.com/2014/03/data-streaming-in-python-generators-iterators-iterables/
예제 #11
0
        lines = fd.readlines()
        boincjob = False

        for var2 in lines:
            if var2.count("QDate") > 0:
                starttime = var2.split()[2]
            elif var2.count("RemoteWallClockTime") > 0:
                walltime = var2.split()[2]
            elif var2.count("CompletionDate") > 0:
                endtime = var2.split()[2]
            elif var2.count("Owner") > 0:
                if var2.split()[2] == '"boinc"':
                    boincjob = True

        if boincjob == True:
            Gratia.setProbeBatchManager("Condor")
            Gratia.Initialize()
            r = Gratia.UsageRecord("Condor")
            r.ResourceType("Backfill")

            # parsing the filenames for the hostname/localjobid.
            # the files are in the format: history.<hostname>#<localjobid>#1#<localjobid>

            host = var.partition(".")[2].partition("#")[0]
            localjobid = var.partition(".")[2].partition("#")[2].partition("#")[0]

            # print 'endtime: ' + endtime
            # print 'starttime: ' + starttime
            # print 'walltime: ' + walltime

            # Gratia likes ints, not strings, for times.
    def _convertBillingInfoToGratiaUsageRecord(self, row):
        """
        Take a record returned from the database and convert it to a Gratia
        UsageRecord

        For the 'NFS4-4.1' protocol the initiator is used directly as the
        local user id. For every other protocol the mapped UID is looked up
        in the local password database first and, if that fails, the mapped
        GID is looked up in the group map read from
        self._unix_gid_list_file_name.

        @param row: A dictionary-like object describing the Billing DB entry.
        @return: UsageRecord equivalent to the input row
        """
        # Convert date to utc. This can't be done perfectly, alas, since we
        # don't have the original timezone. We assume localtime.
        # This code is horrible, but it should work. row['datestamp'] should
        # be a datetime.datetime object.
        # make the time into a float
        fltTime = time.mktime(row['datestamp'].timetuple())
        startTime = time.strftime('%Y-%m-%dT%H:%M:%S', time.gmtime(fltTime))
        # NOTE WELL: we need sub-second accuracy, so the microseconds of the
        # datestamp are appended to the UTC time. Plain %-formatting gives
        # the same text as the removed locale.format("%06d", ...) call.
        startTime = "%s.%06dZ" % (startTime, row['datestamp'].microsecond)

        # convert the connection time in milliseconds to a decimal in seconds
        connectTime = float(row['connectiontime']) / 1000.0
        connectionTimeStr = 'PT' + str(connectTime) + 'S'

        # Check for the link to the doorinfo table being bad and log a
        # warning in the hope that somebody notices a bug has crept in.
        if row['doorlink'] == '<undefined>' and \
                   not row['protocol'].startswith('DCap'):
            self._log.warn( 'billinginfo record with datestamp ' + \
                        startTime + ' contained undefined initiator field' )

        # Work out the end points of the data transfer.
        thisHost = str(row['cellname']) + '@' + self._dCacheSvrHost
        if row['isnew']:
            srcHost = row['client']
            dstHost = thisHost
            isNew = 1
        else:
            srcHost = thisHost
            dstHost = row['client']
            isNew = 0

        rec = Gratia.UsageRecord('Storage')
        rec.Njobs(row['njobs'])
        rec.AdditionalInfo('Source', srcHost)
        rec.AdditionalInfo('Destination', dstHost)
        rec.AdditionalInfo('Protocol', row['protocol'])
        rec.AdditionalInfo('IsNew', isNew)
        rec.LocalJobId(row['transaction'])
        if row['protocol'].startswith("DCap"):
            rec.Grid("Local")
        else:
            # Set the grid name to the default in the ProbeConfig
            rec.Grid(self._grid)
        rec.StartTime(startTime)
        rec.Network(row['transfersize'], 'b', connectionTimeStr, 'total',
                    row['action'])
        rec.WallDuration(connectionTimeStr)

        # only send the initiator if it is known.
        if row['initiator'] != 'unknown':
            rec.DN(row['initiator'])
        # if the initiator host is "unknown", make it "Unknown".
        initiatorHost = row['initiatorhost']
        if initiatorHost == 'unknown':
            initiatorHost = 'Unknown'
        rec.SubmitHost(initiatorHost)
        rec.Status(row['errorcode'])
        # If we included the mapped uid as the local user id, then
        # Gratia will make a best effort to map this to the VO name.
        mappedUID = row['mappeduid']
        mappedGID = row['mappedgid']
        if row['protocol'] == 'NFS4-4.1':
            username = row['initiator']
            rec.LocalUserId(username)
            return rec
        try:
            username = '******'
            if row['initiator'] != 'unknown':
                username = row['initiator']
            if mappedUID is not None and int(mappedUID) >= 0:
                try:
                    info = pwd.getpwuid(int(mappedUID))
                    username = info[0]
                except Exception:
                    # UID unknown to the local password database: fall back
                    # to the GID map, reloading it when the file changed.
                    try:
                        mtime = os.stat(self._unix_gid_list_file_name).st_mtime
                        if self.__gid_file_mod_time != mtime:
                            self.__gid_file_mod_time = mtime
                            self.__refresh_group_map()
                        username = self.__group_map.get(str(mappedGID))
                        if not username:
                            # Arguments follow the placeholders: UID, GID,
                            # then the GID list file name.
                            self._log.warn("UID %s %s not found locally; make sure " \
                                           "/etc/passwd or %s on this host and your dCache are using " \
                                           "the same UIDs,GIDs!" % (str(int(mappedUID)),str(int(mappedGID)),self._unix_gid_list_file_name))
                    except Exception:
                        self._log.warn("UID %s not found locally in /etc/passwd and %s does not exist or "\
                                "inaccessible " % (str(int(mappedUID)),self._unix_gid_list_file_name))
            rec.LocalUserId(username)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            self._log.info("Failed to map UID %s to VO." % mappedUID)
        return rec
예제 #13
0
        if p.returncode != 0:
            raise Exception("Unable to invoke %s" % cmd)

        name, version = output.split()
        return version

    def register_gratia(self, name):
        """Register the reporter, the SLURM service and the batch manager.

        The reporter is registered under ``name``. If
        ``self.get_slurm_version()`` raises, the error is logged at debug
        level 0 and re-raised, so the service and the batch manager are not
        registered.
        """
        Gratia.RegisterReporter(name)

        try:
            slurm_version = self.get_slurm_version()
        except Exception as e:
            # ``except Exception as e`` is accepted by Python 2.6+ and
            # Python 3; the comma form is a syntax error on Python 3.
            DebugPrint(0, "Unable to get SLURM version: %s" % str(e))
            raise

        Gratia.RegisterService("SLURM", slurm_version)
        Gratia.setProbeBatchManager("slurm")


class SlurmCheckpoint(object):
    """Read and write a checkpoint file
    If class is instantiated without a filename, class works as expected but
    data is not stored to disk
    """

    _val = None
    _fp = None

    def __init__(self, target=None):
        """
        Create a checkpoint file
예제 #14
0
def main():
    """Run the dCache billing aggregator loop until the stop file appears.

    Registers the probe with Gratia, sets up file logging and the
    termination alarm, then repeatedly sends billing records to Gratia and
    sleeps for the configured update frequency. The loop ends when the stop
    file exists or when running under the test container.

    Any exception other than KeyboardInterrupt/SystemExit is printed and
    logged with its traceback. The function then dumps the test statistics,
    shuts logging down, fires the termination alarm if it was created, and
    exits with status 1.
    """
    # We need the logger variable in the exception handler.
    # So we create it here.
    logger = logging.getLogger('DCacheAggregator')

    # The alarm is created inside the try block below. Bind the name first
    # so the shutdown code can test it even when start-up failed before the
    # alarm existed (otherwise an UnboundLocalError hid the real error).
    terminationAlarm = None

    # Ignore hangup signals. We shouldn't die just because our parent
    # shell logs out.
    signal.signal(signal.SIGHUP, signal.SIG_IGN)
    # Try to catch common signals and send email before we die
    signal.signal(signal.SIGINT, warn_of_signal)
    signal.signal(signal.SIGQUIT, warn_of_signal)
    signal.signal(signal.SIGTERM, warn_of_signal)

    try:
        # Tell Gratia what versions we are using.
        # CHRIS: is there a way to automate the version extraction
        #        using the pkg_resource package?
        Gratia.RegisterReporterLibrary("psycopg2", "2.0.6")
        #Gratia.RegisterReporterLibrary( "SQLAlchemy", "0.4.1" )
        rev = Gratia.ExtractCvsRevision("$Revision: 1.13 $")
        tag = Gratia.ExtractCvsRevision("$Name:  $")
        Gratia.RegisterReporter("dCacheBillingAggregator.py",
                                str(rev) + " (tag " + str(tag) + ")")

        # BRIAN: attempt to pull the dCache version from RPM.
        version = "UNKNOWN"
        try:
            version = os.popen("rpm -q --qf '%{VERSION}-%{RELEASE}' " \
                               "dcache-server").read()
        except Exception:
            # Best effort: keep "UNKNOWN" when rpm cannot be queried.
            pass
        Gratia.RegisterService("dCache", version)

        # Initialize gratia before attempting to read its config file.
        Gratia.Initialize()
        # Extract the configuration information into local variables.
        myconf = dCacheProbeConfig()

        # Get the name of the directory where we are to store the log files.
        logDir = myconf.get_LogFolder()

        # Make sure that the logging directory is present
        if not os.path.isdir(logDir):
            # 0o755 is the octal spelling accepted by Python 2.6+ and 3.
            os.mkdir(logDir, 0o755)

        logFileName = os.path.join(logDir, "dcacheTransfer.log")

        # Set up an alarm to send an email if the program terminates.
        termSubject = "dCache-transfer probe is going down"
        termMessage = "The dCache transfer probe for Gratia has " + \
                      "terminated.\nPlease check the logfile\n\n   " + \
                      logFileName + \
                      "\n\nfor the cause.\n"

        terminationAlarm = Alarm(myconf.get_EmailServerHost(),
                                 myconf.get_EmailFromAddress(),
                                 myconf.get_EmailToList(), termSubject,
                                 termMessage, 0, 0, False)

        # Set up the logger with a suitable format
        hdlr = RotatingFileHandler(logFileName, 'a', 512000, 10)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.setLevel(myconf.get_AggrLogLevel())
        logger.info("starting " + ProgramName)

        stopFileName = myconf.get_StopFileName()
        updateFreq = float(myconf.get_UpdateFrequency())
        logger.warn("update freq = %.2f" % updateFreq)

        # Create the aggregator instance that we will use.
        dataDir = myconf.get_DataFolder()
        aggregator = DCacheAggregator(myconf, dataDir)

        # If profiling was requested, turn it on.
        profiling = sys.argv.count('-profile') > 0
        if profiling:
            profiler = hotshot.Profile("profile.dat")
            logger.info("Enabling Profiling")

        # Now aggregate new records, then sleep, until somebody creates
        # the stop file...
        while True:
            # Make sure we (still) have a connection to Gratia.
            if not TestContainer.isTest():
                # no need in that during self test
                Gratia.Maintenance()

            if profiling:
                profiler.run("aggregator.sendBillingInfoRecordsToGratia()")
            else:
                try:
                    aggregator.sendBillingInfoRecordsToGratia()
                except TestContainer.SimInterrupt:
                    logger.info("BillingRecSimulator.SimInterrupt caught, " \
                        "restarting")
                    aggregator = DCacheAggregator(myconf, dataDir)
                    continue
            # Are we are shutting down?
            if os.path.exists(stopFileName):
                break

            if TestContainer.isTest():
                break

            logger.warn("sleeping for = %.2f seconds" % updateFreq)
            sleep_check(updateFreq, stopFileName)

        # If we are profiling, print the results...
        if profiling:
            profiler.close()
            stats = hotshot.stats.load("profile.dat")
            stats.sort_stats('time', 'calls')
            stats.print_stats()

        logger.warn(ProgramName + " stop file detected.")
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # Format the traceback of the exception being handled into a
        # string. traceback.format_exc() replaces the Python 2-only
        # sys.exc_type/exc_value/exc_traceback triple.
        msg = ProgramName + " caught an exception:\n" + traceback.format_exc()
        print(msg)
        logger.error(msg)

    TestContainer.dumpStatistics(logger)

    # shut down the logger to make sure nothing is lost.
    logger.critical(ProgramName + " shutting down.")
    logging.shutdown()
    # try to send an email warning of the shutdown.
    if terminationAlarm is not None:
        terminationAlarm.event()

    sys.exit(1)
예제 #15
0
        name, version = output.split()
        return version

    def register_gratia(self, name):
        """Register the reporter, the SLURM service and the batch manager.

        The reporter is registered under ``name`` with a version string built
        from ``prog_revision`` and ``prog_version``. If
        ``self.get_slurm_version()`` raises, the error is logged at debug
        level 0 and re-raised, so the service and the batch manager are not
        registered.
        """
        Gratia.RegisterReporter(name, "%s (tag %s)" % \
            (prog_revision, prog_version))

        try:
            slurm_version = self.get_slurm_version()
        except Exception as e:
            # ``except Exception as e`` is accepted by Python 2.6+ and
            # Python 3; the comma form is a syntax error on Python 3.
            DebugPrint(0, "Unable to get SLURM version: %s" % str(e))
            raise

        Gratia.RegisterService("SLURM", slurm_version)
        Gratia.setProbeBatchManager("slurm")

class SlurmCheckpoint(object):
    """Read and write a checkpoint file
    If class is instantiated without a filename, class works as expected but
    data is not stored to disk
    """

    _val = None
    _fp  = None

    def __init__(self, target=None):
        """
        Create a checkpoint file
        target - checkpoint filename (optionally null)
        """
예제 #16
0
BASEDIR = '/home/gprobe/Data/'
flist = os.listdir(BASEDIR)

# test file
# file = 'history.COES-MCAD120-1#1256755408#1#1256755408'

# Fields that we're going to populate

starttime = ''
walltime = ''
localjobid = ''
endtime = ''
user = '******'

rev = '$Revision: 3273 $'
Gratia.RegisterReporterLibrary('myprobe.py', Gratia.ExtractSvnRevision(rev))

for var in flist:
    if var.count('history') > 0:
        fd = open('/home/gprobe/Data/' + var)
        lines = fd.readlines()
        boincjob = False

        for var2 in lines:
            if var2.count('QDate') > 0:
                starttime = var2.split()[2]
            elif var2.count('RemoteWallClockTime') > 0:
                walltime = var2.split()[2]
            elif var2.count('CompletionDate') > 0:
                endtime = var2.split()[2]
            elif var2.count('Owner') > 0:
예제 #17
0
 def send(self, record):
     """Pass ``record`` to ``Gratia.Send``; the call's result is discarded."""
     Gratia.Send(record)
예제 #18
0
BASEDIR = '/home/gprobe/Data/'
flist = os.listdir(BASEDIR)

# test file
# file = 'history.COES-MCAD120-1#1256755408#1#1256755408'

# Fields that we're going to populate

starttime = ''
walltime = ''
localjobid = ''
endtime = ''
user = '******'

Gratia.RegisterReporterLibrary('myprobe.py')

for var in flist:
    if var.count('history') > 0:
        fd = open('/home/gprobe/Data/' + var)
        lines = fd.readlines()
        boincjob = False

        for var2 in lines:
            if var2.count('QDate') > 0:
                starttime = var2.split()[2]
            elif var2.count('RemoteWallClockTime') > 0:
                walltime = var2.split()[2]
            elif var2.count('CompletionDate') > 0:
                endtime = var2.split()[2]
            elif var2.count('Owner') > 0:
예제 #19
0
    def start(self):
        """Initializes Gratia (to read the option file), does random sleep (if any), acquires the lock,
        initializes the input and registers Gratia.
        Must be invoked after options and parameters are parsed (option file name is needed)

        :raises Exception: if the options are missing, or the Gratia config
            file is not set or does not exist
        """

        ### Initialize Gratia
        # Read the path defensively: with no parsed options the error raised
        # below must still be the "does not exist" Exception, not an
        # AttributeError raised while formatting its message.
        gratia_config = self._opts.gratia_config if self._opts else None
        if not gratia_config or not os.path.exists(gratia_config):
            # TODO: print a message instead of an exception?
            raise Exception("Gratia config file (%s) does not exist." %
                            gratia_config)
        # Print options and initial conditions
        DebugPrint(5, "Initial options: %s" % self._opts)

        # Initialization parses the config file. No debug print will work before this
        Gratia.Initialize(gratia_config)

        # Set to verbose in case the config changed it
        self.set_verbose()

        # Sanity checks for the probe's runtime environment.
        if self._opts.enable:
            GratiaWrapper.CheckPreconditions(check_enabled=False)
        else:
            GratiaWrapper.CheckPreconditions()

        if self._opts.sleep:
            rnd = random.randint(1, int(self._opts.sleep))
            DebugPrint(2, "Sleeping for %d seconds before proceeding." % rnd)
            time.sleep(rnd)

        # Make sure we have an exclusive lock for this probe.
        GratiaWrapper.ExclusiveLock()

        ### Initialize input (config file must be available)
        # Input must specify which parameters it requires from the config file
        # The probe provides static information from the config file
        if not self._probeinput:
            self._probeinput = ProbeInput()
        input_parameters = self._probeinput.get_init_params()
        input_ini = self.get_config_att_list(input_parameters)
        # Check for test mode: start and other methods may change
        if 'input' in self._opts.test:
            DebugPrint(3, "Running input in test mode")
            self._probeinput.do_test()
        # Finish input initialization, including DB connection (if used)
        self._probeinput.start(input_ini)

        # get_DataFileExpiration() returns the value in the config file or 31
        # TODO: Do we want to always not consider values older than 31 days or only when checkpointing is
        # enabled?
        # data_expiration = Gratia.Config.get_DataFileExpiration()

        # Find the checkpoint filename (if enabled) - after initializing the input!
        if self._opts.checkpoint:
            checkpoint_file = self.get_config_attribute('CheckpointFile')
            full_checkpoint_name = True
            if not checkpoint_file:
                # No explicit file configured: fall back to a file named
                # "checkpoint" in Gratia's working folder.
                full_checkpoint_name = False
                checkpoint_file = os.path.join(
                    Gratia.Config.get_WorkingFolder(), "checkpoint")
            data_expiration = Gratia.Config.get_DataFileExpiration()
            # Only process DataFileExpiration days of history
            # (unless we're resuming from a checkpoint file)
            # TODO: is datafileexpiration a maximum value or a default (if no checkpoint is specified)?
            #       Do we want both?
            # Open the checkpoint file
            self._probeinput.add_checkpoint(checkpoint_file,
                                            default_val=data_expiration,
                                            fullname=full_checkpoint_name)

        ### Complete Gratia initialization
        # This uses the input version (after Input initialization)
        self.register_gratia()
예제 #20
0
class SlurmProbe:

    opts = None
    args = None
    checkpoint = None
    conn = None
    cluster = None
    sacct = None

    def __init__(self):
        """Prepare the probe: options, Gratia, lock, checkpoint and DB.

        The steps run in this order: parse the options (exit status 1 when
        parsing fails), initialize Gratia from the configured file, check the
        runtime preconditions, optionally sleep a random number of seconds,
        take the exclusive lock, register with Gratia, open the checkpoint,
        connect to the database and create the accounting reader.

        Raises:
            Exception: if the Gratia config path is unset or does not exist.
        """
        try:
            self.opts, self.args = self.parse_opts()
        except Exception as e:
            # Written without the Python 2-only ``print >>`` statement so the
            # module parses on Python 2.6+ and Python 3 alike.
            sys.stderr.write(str(e) + "\n")
            sys.exit(1)

        # Initialize Gratia
        if not self.opts.gratia_config or not os.path.exists(
                self.opts.gratia_config):
            raise Exception("Gratia config, %s, does not exist." %
                            self.opts.gratia_config)
        Gratia.Initialize(self.opts.gratia_config)

        if self.opts.verbose:
            Gratia.Config.set_DebugLevel(5)

        # Sanity checks for the probe's runtime environment.
        GratiaWrapper.CheckPreconditions()

        if self.opts.sleep:
            rnd = random.randint(1, int(self.opts.sleep))
            DebugPrint(2, "Sleeping for %d seconds before proceeding." % rnd)
            time.sleep(rnd)

        # Make sure we have an exclusive lock for this probe.
        GratiaWrapper.ExclusiveLock()

        self.register_gratia("slurm_meter")

        # Find the checkpoint filename (if enabled)
        if self.opts.checkpoint:
            checkpoint_file = os.path.join(Gratia.Config.get_WorkingFolder(),
                                           "checkpoint")
        else:
            checkpoint_file = None

        # Open the checkpoint file
        self.checkpoint = SlurmCheckpoint(checkpoint_file)

        # Only process DataFileExpiration days of history
        # (unless we're resuming from a checkpoint file)
        if self.checkpoint.val is None:
            self.checkpoint.val = int(time.time() -
                                      (Gratia.Config.get_DataFileExpiration() *
                                       86400))

        # Connect to database
        self.conn = self.get_db_conn()

        self.cluster = Gratia.Config.getConfigAttribute('SlurmCluster')
        self.sacct = SlurmAcct(self.conn, self.cluster)