def __init__(self, tests, apis=None):
    """
    Set up the executor with its test definitions and client objects.

    :Parameters:
      **tests** - `dict`
        test definitions keyed by test type; stored on the instance as given.
      **apis** - `dict`
        optional clients keyed by name. If it holds a
        'ResourceManagementClient' entry that object is used, otherwise a
        new ResourceManagementClient is instantiated.
    """
    self.log = gLogger.getSubLogger('TestExecutor')
    self.__tests = tests

    # A missing or empty `apis` is replaced by a fresh empty dictionary.
    self.apis = apis if apis else {}

    rmKey = 'ResourceManagementClient'
    # Build the default client lazily, only when none was injected.
    self.rmClient = self.apis[rmKey] if rmKey in self.apis else ResourceManagementClient()
def __init__(self, args=None, clients=None):
    """
    Initialise the command and resolve the clients it works with.

    `args` and `clients` are forwarded unchanged to the parent constructor.
    Afterwards the 'ResourceManagementClient' and 'FileCatalogDB' entries of
    `self.apis` are used when present; otherwise default instances are
    created.
    """
    super(StorageCommand, self).__init__(args, clients)

    injected = self.apis

    # Defaults are only constructed when nothing was injected for that key.
    self.rmClient = (
        injected['ResourceManagementClient']
        if 'ResourceManagementClient' in injected
        else ResourceManagementClient()
    )
    self.fcDB = injected['FileCatalogDB'] if 'FileCatalogDB' in injected else FileCatalogDB()
def __init__(self, args=None, clients=None):
    """
    Initialise the command and resolve the clients it works with.

    `args` and `clients` are forwarded unchanged to the parent constructor.
    The 'JobDB' and 'ResourceManagementClient' entries of `self.apis` take
    precedence; a default instance is created for each one that is missing.
    """
    super(WorkNodeCommand, self).__init__(args, clients)

    if "JobDB" not in self.apis:
        self.jobDB = JobDB()
    else:
        self.jobDB = self.apis["JobDB"]

    if "ResourceManagementClient" not in self.apis:
        self.rmClient = ResourceManagementClient()
    else:
        self.rmClient = self.apis["ResourceManagementClient"]
def __init__(self, args=None, clients=None):
    """
    Initialise the command and resolve the clients it works with.

    `args` and `clients` are forwarded unchanged to the parent constructor.
    The 'WMSAdministrator' entry of `self.apis` is used when present,
    otherwise an RPCClient for 'WorkloadManagement/WMSAdministrator' is
    created. The 'ResourceManagementClient' entry is used when present,
    otherwise a new ResourceManagementClient is instantiated.
    """
    super(JobCommand, self).__init__(args, clients)

    available = self.apis

    # Defaults are only constructed when nothing was injected for that key.
    self.wmsAdmin = (
        available['WMSAdministrator']
        if 'WMSAdministrator' in available
        else RPCClient('WorkloadManagement/WMSAdministrator')
    )
    self.rmClient = (
        available['ResourceManagementClient']
        if 'ResourceManagementClient' in available
        else ResourceManagementClient()
    )
def __init__(self, apis=None):
    """
    Constructor

    examples:
      >>> evaluator = StatusEvaluator( { 'ResourceManagementClient' : ResourceManagementClient() } )
      >>> evaluator1 = StatusEvaluator()

    :Parameters:
      **apis** - `dict`
        optional clients keyed by name. If it holds a
        'ResourceManagementClient' entry that object is used, otherwise a
        new ResourceManagementClient is instantiated.
    """
    # Accept a missing dictionary instead of failing on `in None`.
    apis = apis or {}

    self.log = gLogger.getSubLogger('StatusEvaluator')

    if 'ResourceManagementClient' in apis:
        self.rmClient = apis['ResourceManagementClient']
    else:
        self.rmClient = ResourceManagementClient()
class TestExecutor(object):
    """
    Runs the SAM tests that match an element and stores their results.
    """

    def __init__(self, tests, apis=None):
        """
        Constructor

        :Parameters:
          **tests** - `dict`
            test definitions keyed by test type. Each value is read for its
            'match' entry (dictionary of matching criteria) and its 'object'
            entry (the test instance providing `doTest` and `getTestResult`).
          **apis** - `dict`
            optional clients keyed by name. If it holds a
            'ResourceManagementClient' entry that object is used, otherwise
            a new ResourceManagementClient is instantiated.
        """
        self.apis = apis or {}
        self.__tests = tests
        self.log = gLogger.getSubLogger('TestExecutor')

        if 'ResourceManagementClient' in self.apis:
            self.rmClient = self.apis['ResourceManagementClient']
        else:
            self.rmClient = ResourceManagementClient()

    def __matchTests(self, matchArgs):
        """
        Select the test types whose 'match' criteria accept `matchArgs`.

        A criterion is ignored when its value in `matchArgs` is empty or when
        the test does not declare it. Otherwise at least one of the given
        values has to be listed by the test. A plain string, on either side,
        is treated as a single value.

        :return: `list` of matching test types
        """
        execTests = []

        for testType, testDict in self.__tests.items():
            testMatchArgs = testDict['match']

            matched = True
            for name, value in matchArgs.items():
                if not value or name not in testMatchArgs:
                    continue

                values = (value,) if isinstance(value, str) else value
                target = testMatchArgs[name]
                targets = (target,) if isinstance(target, str) else target

                if not any(val in targets for val in values):
                    matched = False
                    break

            if matched:
                execTests.append(testType)

        return execTests

    def __storeTestResults(self, elementName, elementType, testResults):
        """
        Store the test results through the resource management client.

        Missing or empty 'CompletionTime' / 'ApplicationTime' values are
        replaced in place by '0000-0-0' and 0 before storing.

        :return: S_OK() / the first failing result of addOrModifySAMResult
        """
        for testType, testDict in testResults.items():
            testDict['CompletionTime'] = testDict.get('CompletionTime') or '0000-0-0'
            testDict['ApplicationTime'] = testDict.get('ApplicationTime') or 0

            resQuery = self.rmClient.addOrModifySAMResult(
                elementName,
                testType,
                elementType,
                testDict.get('Status'),
                testDict.get('Log'),
                testDict.get('JobID'),
                testDict.get('SubmissionTime'),
                testDict.get('CompletionTime'),
                testDict.get('ApplicationTime'),
                testDict.get('LastCheckTime'),
            )
            if not resQuery['OK']:
                return resQuery

        return S_OK()

    def execute(self, element, lastCheckTime=None):
        """
        Main method which executes the tests and obtains the results.

        Works in two passes. The first pass launches every matching test for
        the element and queues those that have not finished yet. The second
        pass polls the queued tests until each one delivers a result. All
        results are then stored.

        :Parameters:
          **element** - `dict`
            must provide 'ElementName' and 'ElementType'; 'VO' is optional.
          **lastCheckTime** - `datetime`
            time recorded with every result; defaults to the current UTC time
            truncated to whole seconds.

        :return: S_OK( { testType : status } ) / S_ERROR
        """
        elementName = element['ElementName']
        elementType = element['ElementType']
        lastCheckTime = lastCheckTime or datetime.utcnow().replace(microsecond=0)

        matchArgs = {'ElementType': elementType, 'VO': element.get('VO')}
        execTests = self.__matchTests(matchArgs)
        if not execTests:
            return S_ERROR('No SAM test matched for %s' % elementName)

        testResults = {}
        runningTestsQueue = Queue.Queue()

        # First pass: launch the tests, remember the unfinished ones.
        for testType in execTests:
            testObj = self.__tests[testType]['object']
            result = testObj.doTest(element)
            if not result['OK']:
                self.log.error('Failed to execute %s for %s' % (testType, elementName))
                self.log.error(result['Message'])
                return S_ERROR('Failed to execute SAM tests.')
            result = result['Value']
            result['Result']['LastCheckTime'] = lastCheckTime
            testResults[testType] = result['Result']
            if not result['Finish']:
                runningTestsQueue.put(testType)

        # Second pass: poll until every queued test has produced a result.
        # NOTE(review): a test without a result is re-queued immediately, so
        # this loop polls without any delay between attempts.
        while not runningTestsQueue.empty():
            testType = runningTestsQueue.get_nowait()
            testObj = self.__tests[testType]['object']
            jobID = testResults[testType]['JobID']
            vo = testResults[testType]['VO']
            submissionTime = testResults[testType]['SubmissionTime']
            result = testObj.getTestResult(elementName, vo, jobID, submissionTime)
            if not result['OK']:
                self.log.error('Failed to get %s result for %s' % (testType, elementName))
                self.log.error(result['Message'])
                return S_ERROR('Failed to get SAM test results.')
            result = result['Value']
            if not result:
                runningTestsQueue.put(testType)
            else:
                testResults[testType].update(result)
            runningTestsQueue.task_done()

        runningTestsQueue.join()

        storeRes = self.__storeTestResults(elementName, elementType, testResults)
        if not storeRes['OK']:
            return S_ERROR('Failed to store SAM test results.')

        testsStatus = {}
        for testType, testDict in testResults.items():
            testsStatus[testType] = testDict['Status']
        return S_OK(testsStatus)
class JobCommand(Command):
    """
    Job "master" Command.
    """

    def __init__(self, args=None, clients=None):
        """
        Initialise the command and resolve the clients it works with.

        The 'WMSAdministrator' and 'ResourceManagementClient' entries of
        `self.apis` are used when present; otherwise an RPCClient for
        'WorkloadManagement/WMSAdministrator' and a new
        ResourceManagementClient are created.
        """
        super(JobCommand, self).__init__(args, clients)

        if 'WMSAdministrator' in self.apis:
            self.wmsAdmin = self.apis['WMSAdministrator']
        else:
            self.wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')

        if 'ResourceManagementClient' in self.apis:
            self.rmClient = self.apis['ResourceManagementClient']
        else:
            self.rmClient = ResourceManagementClient()

    def _storeCommand(self, result):
        """
        Stores the results of doNew method on the database.

        :return: S_OK() / the first failing result of addOrModifyJobCache
        """
        for jobDict in result:
            resQuery = self.rmClient.addOrModifyJobCache(
                jobDict['Site'],
                jobDict['MaskStatus'],
                jobDict['Efficiency'],
                jobDict['Running'],
                jobDict['Waiting'],
                jobDict['Done'],
                jobDict['Failed'],
                jobDict['Completed'],
                jobDict['Stalled'],
                jobDict['Status'],
            )
            if not resQuery['OK']:
                return resQuery

        return S_OK()

    def _prepareCommand(self):
        """
        JobCommand requires one argument:
        - name : <str>

        :return: S_OK( name ) / S_ERROR when 'name' is missing from self.args
        """
        if 'name' not in self.args:
            return S_ERROR('"name" not found in self.args')
        name = self.args['name']

        return S_OK(name)

    def doNew(self, masterParams=None):
        """
        Gets the parameters to run, either from the master method or from its
        own arguments.

        It contacts the WMSAdministrator with a list of site names, or a
        single site. One record is built and stored for every requested site.
        Sites without a summary record get zero counters, status 'Idle' and a
        mask status of 'Active' or 'Banned' depending on whether they appear
        in the site mask.

        :return: S_OK( list of record dictionaries ) / S_ERROR
        """
        if masterParams is not None:
            name = masterParams
        else:
            params = self._prepareCommand()
            if not params['OK']:
                return params
            name = params['Value']

        # A single site name must not be iterated character by character
        # when the per-site records are built below.
        siteNames = name if isinstance(name, (list, tuple, set)) else [name]

        resultMask = self.wmsAdmin.getSiteMask()
        if not resultMask['OK']:
            return resultMask
        resultMask = resultMask['Value']

        # selectDict, sortList, startItem, maxItems
        # Returns statistics of Last day !
        results = self.wmsAdmin.getSiteSummaryWeb({'Site': name}, [], 0, 0)
        if not results['OK']:
            return results
        results = results['Value']

        if 'ParameterNames' not in results:
            return S_ERROR('Wrong result dictionary, missing "ParameterNames"')
        params = results['ParameterNames']

        if 'Records' not in results:
            return S_ERROR('Wrong formed result dictionary, missing "Records"')
        records = results['Records']

        # Index the summary records by site name.
        siteJobs = {}
        for record in records:
            jobDict = dict(zip(params, record))
            siteJobs[jobDict.pop('Site')] = jobDict

        uniformResult = []
        for site in siteNames:
            recordDict = {'Site': site}

            if site in siteJobs:
                summary = siteJobs[site]
                recordDict['MaskStatus'] = summary['MaskStatus']
                recordDict['Running'] = summary['Running']
                # Jobs still being checked are counted as waiting.
                recordDict['Waiting'] = summary['Waiting'] + summary['Checking']
                recordDict['Done'] = summary['Done']
                recordDict['Failed'] = summary['Failed']
                recordDict['Completed'] = summary['Completed']
                recordDict['Stalled'] = summary['Stalled']
                recordDict['Efficiency'] = float(summary['Efficiency'])
                recordDict['Status'] = summary['Status']
            else:
                recordDict['MaskStatus'] = 'Active' if site in resultMask else 'Banned'
                recordDict['Running'] = 0
                recordDict['Waiting'] = 0
                recordDict['Done'] = 0
                recordDict['Failed'] = 0
                recordDict['Completed'] = 0
                recordDict['Stalled'] = 0
                recordDict['Efficiency'] = 0.0
                recordDict['Status'] = 'Idle'

            uniformResult.append(recordDict)

        storeRes = self._storeCommand(uniformResult)
        if not storeRes['OK']:
            return storeRes

        return S_OK(uniformResult)

    def doCache(self):
        """
        Method that reads the cache table and tries to read from it. It will
        return a list of dictionaries if there are results.
        """
        params = self._prepareCommand()
        if not params['OK']:
            return params
        name = params['Value']

        result = self.rmClient.selectJobCache(name)
        if result['OK']:
            result = S_OK([dict(zip(result['Columns'], res)) for res in result['Value']])

        return result

    def doMaster(self):
        """
        Master method. Gets all sites and calls doNew method.

        A failure of doNew is recorded in self.metrics[ 'failed' ].

        :return: S_OK( self.metrics ) / S_ERROR when the sites cannot be read
        """
        siteNames = CSHelpers.getSites()
        if not siteNames['OK']:
            return siteNames
        siteNames = siteNames['Value']

        jobsResults = self.doNew(siteNames)
        if not jobsResults['OK']:
            self.metrics['failed'].append(jobsResults['Message'])

        return S_OK(self.metrics)
class StorageCommand(Command):
    """
    Command computing the occupied and free space of storage elements.
    """

    def __init__(self, args=None, clients=None):
        """
        Initialise the command and resolve the clients it works with.

        The 'ResourceManagementClient' and 'FileCatalogDB' entries of
        `self.apis` are used when present; otherwise default instances are
        created.
        """
        super(StorageCommand, self).__init__(args, clients)

        if 'ResourceManagementClient' in self.apis:
            self.rmClient = self.apis['ResourceManagementClient']
        else:
            self.rmClient = ResourceManagementClient()

        if 'FileCatalogDB' in self.apis:
            self.fcDB = self.apis['FileCatalogDB']
        else:
            self.fcDB = FileCatalogDB()

    def _storeCommand(self, result):
        """
        Stores the results of doNew method on the database.

        :return: S_OK() / the first failing result of addOrModifyStorageCache
        """
        for storageDict in result:
            resQuery = self.rmClient.addOrModifyStorageCache(
                sE=storageDict['SE'],
                occupied=storageDict['Occupied'],
                free=storageDict['Free'],
                usage=storageDict['Usage'],
            )
            if not resQuery['OK']:
                return resQuery
        return S_OK()

    def doNew(self, masterParams=None):
        """
        It searches FileCatalogDB to find out occupied storage.

        For every storage element in `masterParams` the configured capacity
        and the summed file sizes of its replicas are combined into a record
        with the keys 'SE', 'Occupied', 'Free' and 'Usage', which is then
        stored. 'Usage' is a percentage truncated to one decimal place.

        :return: S_OK( rows of the file catalog query ) / S_ERROR
        """
        if masterParams is None:
            return S_ERROR('StorageCommand requires a list of storage elements')
        ses = masterParams

        seMaxStorage = {}
        for se in ses:
            capacity = gConfig.getValue('/Resources/StorageElements/%s/Capacity' % se, 0)
            # Scale by 2**40; presumably the capacity is configured in TiB
            # and converted to bytes here - TODO confirm.
            seMaxStorage[se] = capacity << 40

        sqlStr = """select SE.SEName, sum(F.Size) from
    FC_Replicas R, FC_Files F, FC_StorageElements SE
    where R.FileID=F.FileID and R.SEID=SE.SEID
    group by R.SEID;"""

        result = self.fcDB._query(sqlStr)
        if not result['OK']:
            return result
        result = result['Value']

        seOccupied = {}
        for se, occupied in result:
            seOccupied[se] = int(occupied)

        uniformResult = []
        for se in ses:
            maxStorage = seMaxStorage.get(se, 0)
            occupied = seOccupied.get(se, 0)
            if maxStorage == 0:
                # Without a configured capacity neither value can be derived.
                usage = 0.0
                free = 0
            else:
                usage = math.floor(float(occupied) / maxStorage * 1000) / 10
                free = maxStorage - occupied
            uniformResult.append({'SE': se, 'Occupied': occupied, 'Free': free, 'Usage': usage})

        storeRes = self._storeCommand(uniformResult)
        if not storeRes['OK']:
            return storeRes

        # NOTE(review): the raw query rows are returned, not uniformResult;
        # kept as is so the return shape seen by callers does not change.
        return S_OK(result)

    def doMaster(self):
        """
        Master method

        Gets all ses and call doNew method. A failure of doNew is recorded in
        self.metrics[ 'failed' ].

        :return: S_OK( self.metrics ) / S_ERROR when the ses cannot be read
        """
        ses = CSHelpers.getStorageElements()
        if not ses['OK']:
            return ses

        storageResults = self.doNew(ses['Value'])
        if not storageResults['OK']:
            self.metrics['failed'].append(storageResults['Message'])

        return S_OK(self.metrics)
class StatusEvaluator(object):
    """
    Derives the SAM status of resources and sites and stores it.
    """

    def __init__(self, apis=None):
        """
        Constructor

        :Parameters:
          **apis** - `dict`
            optional clients keyed by name. If it holds a
            'ResourceManagementClient' entry that object is used, otherwise
            a new ResourceManagementClient is instantiated.
        """
        # Accept a missing dictionary instead of failing on `in None`.
        apis = apis or {}

        self.log = gLogger.getSubLogger('StatusEvaluator')

        if 'ResourceManagementClient' in apis:
            self.rmClient = apis['ResourceManagementClient']
        else:
            self.rmClient = ResourceManagementClient()

    def __storeResourceStatus(self, resDict):
        """
        Store one resource status record.

        :return: S_OK() / the failing result of addOrModifyResourceSAMStatus
        """
        storeRes = self.rmClient.addOrModifyResourceSAMStatus(
            resDict['VO'],
            resDict['ElementName'],
            resDict['ElementType'],
            resDict['Tests'],
            resDict['Status'],
            resDict['LastCheckTime'],
        )
        if not storeRes['OK']:
            return storeRes
        return S_OK()

    def __storeSiteStatus(self, resDict):
        """
        Store one site status record.

        :return: S_OK() / the failing result of addOrModifySiteSAMStatus
        """
        storeRes = self.rmClient.addOrModifySiteSAMStatus(
            resDict['VO'],
            resDict['Site'],
            resDict['SiteType'],
            resDict['Status'],
            resDict['CEStatus'],
            resDict['SEStatus'],
            resDict['LastCheckTime'],
        )
        if not storeRes['OK']:
            return storeRes
        return S_OK()

    def __resourceStatusRule(self, statusList):
        """
        Reduce the test statuses of a resource to a single status.

        The worst status wins, in the order 'Bad', 'Unknown', 'Busy', 'OK'.
        An empty string is returned when none of them is present.
        """
        for status in ('Bad', 'Unknown', 'Busy', 'OK'):
            if status in statusList:
                return status
        return ''

    def __siteStatusRule(self, ceStatusList, seStatus):
        """
        Derive the site status from its CE statuses and its SE status.

        The CE status is the best one present, in the order 'OK', 'Busy',
        'Bad', 'Unknown', or None when none of them is present. The site
        status is 'Bad' when the SE status is 'Bad', otherwise it equals the
        CE status.

        :return: ( status, ceStatus, seStatus )
        """
        ceStatus = None
        for candidate in ('OK', 'Busy', 'Bad', 'Unknown'):
            if candidate in ceStatusList:
                ceStatus = candidate
                break

        status = 'Bad' if seStatus == 'Bad' else ceStatus

        return (status, ceStatus, seStatus)

    def evaluateResourceStatus(self, elementDict, testResults, vo=None, lastCheckTime=None):
        """
        Evaluate and store the SAM status of one resource.

        :Parameters:
          **elementDict** - `dict`
            must provide 'ElementName' and 'ElementType'.
          **testResults** - `dict`
            status of every executed test, keyed by test type.
          **vo** - `str`
            defaults to 'all'.
          **lastCheckTime** - `datetime`
            defaults to the current UTC time truncated to whole seconds.

        :return: S_OK( status ) / S_ERROR
        """
        vo = vo or 'all'
        lastCheckTime = lastCheckTime or datetime.utcnow().replace(microsecond=0)
        elementName = elementDict['ElementName']
        elementType = elementDict['ElementType']
        tests = ','.join(testResults.keys())
        status = self.__resourceStatusRule(testResults.values())

        resDict = {
            'ElementName': elementName,
            'VO': vo,
            'ElementType': elementType,
            'Tests': tests,
            'Status': status,
            'LastCheckTime': lastCheckTime,
        }

        storeRes = self.__storeResourceStatus(resDict)
        if not storeRes['OK']:
            return S_ERROR('Failed to store resource SAM status.')
        return S_OK(status)

    def evaluateSiteStatus(self, site, ceStatusList, seStatus=None, vo=None, lastCheckTime=None):
        """
        Evaluate and store the SAM status of one site.

        The site type is the part of the site name before its first dot.

        :Parameters:
          **site** - `str`
            site name.
          **ceStatusList** - `list`
            statuses of the site's computing elements.
          **seStatus** - `str`
            optional status of the site's storage element.
          **vo** - `str`
            defaults to 'all'.
          **lastCheckTime** - `datetime`
            defaults to the current UTC time truncated to whole seconds.

        :return: S_OK( status ) / S_ERROR
        """
        vo = vo or 'all'
        lastCheckTime = lastCheckTime or datetime.utcnow().replace(microsecond=0)
        siteType = site.split('.')[0]
        status, ceStatus, seStatus = self.__siteStatusRule(ceStatusList, seStatus)

        resDict = {
            'Site': site,
            'VO': vo,
            'SiteType': siteType,
            'Status': status,
            'CEStatus': ceStatus,
            'SEStatus': seStatus,
            'LastCheckTime': lastCheckTime,
        }

        storeRes = self.__storeSiteStatus(resDict)
        if not storeRes['OK']:
            # The error has to be returned, otherwise a failed store would be
            # reported to the caller as a success.
            return S_ERROR('Failed to store site SAM status.')
        return S_OK(status)
class WorkNodeCommand(Command):
    """
    Command computing the job efficiency of work nodes.
    """

    def __init__(self, args=None, clients=None):
        """
        Initialise the command and resolve the clients it works with.

        The 'JobDB' and 'ResourceManagementClient' entries of `self.apis` are
        used when present; otherwise default instances are created.
        """
        super(WorkNodeCommand, self).__init__(args, clients)

        if "JobDB" in self.apis:
            self.jobDB = self.apis["JobDB"]
        else:
            self.jobDB = JobDB()

        if "ResourceManagementClient" in self.apis:
            self.rmClient = self.apis["ResourceManagementClient"]
        else:
            self.rmClient = ResourceManagementClient()

    def _storeCommand(self, result):
        """
        Stores the results of doNew method on the database.

        :return: S_OK() / the first failing result of addOrModifyWorkNodeCache
        """
        for hostDict in result:
            resQuery = self.rmClient.addOrModifyWorkNodeCache(
                host=hostDict["Host"],
                site=hostDict["Site"],
                done=hostDict["Done"],
                failed=hostDict["Failed"],
                efficiency=hostDict["Efficiency"],
            )
            if not resQuery["OK"]:
                return resQuery
        return S_OK()

    def doNew(self, masterParams=None):
        """
        Compute and store the job efficiency of every host seen by the query.

        The query counts jobs per host and status whose execution ended
        within the last 24 hours; only the 'Done' and 'Failed' counts are
        used. The efficiency is the percentage of 'Done' jobs, truncated to
        one decimal place. Hosts listed in `masterParams` that no longer
        appear in the query are deleted from the cache.

        :Parameters:
          **masterParams** - `list`
            hosts currently cached; left unmodified. None is treated as an
            empty list.

        :return: S_OK( list of host dictionaries ) / S_ERROR
        """
        knownHosts = masterParams or []

        sql = """
select JP.Value, J.Status, J.Site, count(*) from Jobs J, JobParameters JP
where J.JobID = JP.JobID and JP.Name = 'HostName'
and J.EndExecTime >= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 24 HOUR)
group by JP.Value, J.Status
"""

        # Use the injected database client rather than creating a new one.
        queryRes = self.jobDB._query(sql)
        if not queryRes["OK"]:
            return queryRes
        records = queryRes["Value"]

        hostJobs = {}
        for record in records:
            hostName = record[0]
            status = record[1]
            if status not in ("Done", "Failed"):
                continue
            if hostName not in hostJobs:
                hostJobs[hostName] = {"Site": record[2], "Done": 0, "Failed": 0}
            hostJobs[hostName][status] = record[3]

        uniformResult = []
        for host, hostDict in hostJobs.items():
            hostDict["Host"] = host
            finished = hostDict["Done"] + hostDict["Failed"]
            if hostDict["Done"] == 0 and hostDict["Failed"] == 0:
                hostDict["Efficiency"] = 0.0
            else:
                hostDict["Efficiency"] = math.floor(float(hostDict["Done"]) / finished * 1000) / 10
            uniformResult.append(hostDict)

        # Cached hosts without recent jobs are stale and get removed.
        staleHosts = [host for host in knownHosts if host not in hostJobs]
        if staleHosts:
            deleteRes = self.rmClient.deleteWorkNodeCache(host=staleHosts)
            if not deleteRes["OK"]:
                return deleteRes

        storeRes = self._storeCommand(uniformResult)
        if not storeRes["OK"]:
            return storeRes

        return S_OK(uniformResult)

    def doMaster(self):
        """
        Master method.

        Reads the hosts currently held in the work node cache and calls doNew
        method with them. A failure of doNew is recorded in
        self.metrics[ 'failed' ].

        :return: S_OK( self.metrics ) / S_ERROR when the cache cannot be read
        """
        queryRes = self.rmClient.selectWorkNodeCache(meta={"columns": ["Host"]})
        if not queryRes["OK"]:
            return queryRes
        records = queryRes["Value"]

        hosts = [record[0] for record in records]
        jobsResults = self.doNew(hosts)
        if not jobsResults["OK"]:
            self.metrics["failed"].append(jobsResults["Message"])

        return S_OK(self.metrics)