示例#1
0
class JobHistoryAgent(AgentModule):
    """Agent that publishes job counters, per status and site, as monitoring marks.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                     for the agent restart
    """

    def initialize(self):
        """Create the JobDB handle, register one activity per (status, site)
        pair and reset the cached counters.

        :return: S_OK()
        """
        self.jobDB = JobDB()

        for status in MONITOR_STATUS:
            for site in MONITOR_SITES:
                activityName = "%s-%s" % (status, site)
                description = "Jobs in %s state at %s" % (status, site)
                gLogger.verbose("Registering activity %s" % activityName)
                gLogger.verbose(description)
                gMonitor.registerActivity(activityName, description,
                                          "JobHistoryAgent", "Jobs/minute",
                                          gMonitor.OP_MEAN)

        # Cached counters and the bookkeeping deciding when to refresh them
        self.reportPeriod = 60
        self.resultDB = None
        self.last_update = 0
        return S_OK()

    def execute(self):
        """Main execution method.

        Refreshes the cached counters from the JobDB when they are older than
        ``reportPeriod`` seconds, then adds one mark per monitored
        (status, site) pair and one "<status>-All sites" mark per status.

        :return: S_OK(), or S_ERROR if the counters could not be retrieved
        """
        if time.time() - self.last_update > self.reportPeriod:
            counters = self.jobDB.getCounters('Jobs', ['Status', 'Site'], {}, '')
            if not counters['OK']:
                return S_ERROR('Failed to get data from the Job Database')
            self.resultDB = counters['Value']
            self.last_update = time.time()

        # Per-status totals over every site returned by the database
        totals = dict.fromkeys(MONITOR_STATUS, 0)

        for row in self.resultDB:
            attributes = row[0]
            site = attributes['Site']
            status = attributes['Status']
            count = row[1]
            if site in MONITOR_SITES and status in MONITOR_STATUS:
                markName = "%s-%s" % (status, site)
                gLogger.verbose("Adding mark %s: %s" % (markName, count))
                gMonitor.addMark(markName, count)
            if status in totals:
                totals[status] += count

        for status in MONITOR_STATUS:
            total = totals[status]
            gLogger.verbose("Adding mark %s-All sites: %s" % (status, total))
            gMonitor.addMark("%s-All sites" % status, total)

        return S_OK()
示例#2
0
class JobHistoryAgent( AgentModule ):
  """ Agent that publishes job counters, per status and site, as monitoring marks.

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  def initialize( self ):
    """ Create the JobDB handle, register one activity per ( status, site )
        pair and reset the cached counters.

        :return: S_OK()
    """
    self.jobDB = JobDB()

    for status in MONITOR_STATUS:
      for site in MONITOR_SITES:
        activityName = "%s-%s" % ( status, site )
        description = "Jobs in %s state at %s" % ( status, site )
        gLogger.verbose( "Registering activity %s" % activityName )
        gLogger.verbose( description )
        gMonitor.registerActivity( activityName, description,
                                   "JobHistoryAgent", "Jobs/minute", gMonitor.OP_MEAN )

    # Cached counters and the bookkeeping deciding when to refresh them
    self.reportPeriod = 60
    self.resultDB = None
    self.last_update = 0
    return S_OK()

  def execute( self ):
    """ Main execution method.

        Refreshes the cached counters from the JobDB when they are older than
        reportPeriod seconds, then adds one mark per monitored ( status, site )
        pair and one "<status>-All sites" mark per status.

        :return: S_OK(), or S_ERROR if the counters could not be retrieved
    """
    if time.time() - self.last_update > self.reportPeriod:
      counters = self.jobDB.getCounters( 'Jobs', ['Status', 'Site'], {}, '' )
      if not counters['OK']:
        return S_ERROR( 'Failed to get data from the Job Database' )
      self.resultDB = counters['Value']
      self.last_update = time.time()

    # Per-status totals over every site returned by the database
    totals = dict.fromkeys( MONITOR_STATUS, 0 )

    for row in self.resultDB:
      attributes = row[0]
      site = attributes['Site']
      status = attributes['Status']
      count = row[1]
      if site in MONITOR_SITES and status in MONITOR_STATUS:
        markName = "%s-%s" % ( status, site )
        gLogger.verbose( "Adding mark %s: %s" % ( markName, count ) )
        gMonitor.addMark( markName, count )
      if status in totals:
        totals[status] += count

    for status in MONITOR_STATUS:
      total = totals[status]
      gLogger.verbose( "Adding mark %s-All sites: %s" % ( status, total ) )
      gMonitor.addMark( "%s-All sites" % status, total )

    return S_OK()
示例#3
0
class SPTCorrector(BaseCorrector):
    """Share corrector that hands the whole share to a single owner.

    The owner with the fewest jobs in "Waiting" status (within the group
    returned by ``getGroup()``) gets a share of 1, every other owner gets 0.
    """

    _GLOBAL_MAX_CORRECTION = "MaxGlobalCorrection"
    _SLICE_TIME_SPAN = "TimeSpan"
    _SLICE_WEIGHT = "Weight"
    _SLICE_MAX_CORRECTION = "MaxCorrection"

    def initialize(self):
        """Create the JobDB handle used to count waiting jobs.

        :return: S_OK()
        """
        self.__jobDB = JobDB()

        return S_OK()

    def applyCorrection(self, entitiesExpectedShare):
        """Compute the corrected shares for the given owners.

        :param entitiesExpectedShare: dict keyed by owner DN; only the keys are
                                      used by this corrector
        :return: dict { ownerDN : 0 or 1 } with exactly one owner set to 1
                 (empty when no owners are given); the input dict unchanged
                 when the JobDB query fails
        """
        # NOTE(review): the "AT >>>" prints are debug output kept from the
        # original; consider routing them through the project logger.
        print("AT >>> entitiesExpectedShare %s" % (entitiesExpectedShare,))

        ownerDNs = list(entitiesExpectedShare)

        group = self.getGroup()
        result = self.__jobDB.getCounters("Jobs", ["OwnerDN"],
                                          {"OwnerGroup": group, "Status": "Waiting"})
        if not result["OK"]:
            print("AT >>> result %s" % (result,))
            return entitiesExpectedShare

        waitingJobs = {}
        for row in result["Value"]:
            waitingJobs[row[0]["OwnerDN"]] = row[1]
        print("AT >>> ownerDict %s" % (waitingJobs,))

        resultShare = dict.fromkeys(ownerDNs, 0)
        if ownerDNs:
            # An owner missing from the counters has no waiting jobs, so it
            # counts as 0 instead of raising KeyError. min() keeps the first
            # owner on ties, as the original strict comparison did.
            minOwnerDN = min(ownerDNs, key=lambda ownerDN: waitingJobs.get(ownerDN, 0))
            resultShare[minOwnerDN] = 1

        print("AT >>> resultShare %s" % (resultShare,))

        return resultShare

    def updateHistoryKnowledge(self):
        """No history is kept by this corrector.

        :return: S_OK()
        """
        return S_OK()
示例#4
0
class Limiter(object):
  """ Builds "negative" matching conditions, i.e. job attribute values that
      must not be matched at a site, from the running limits and matching
      delays found under the JobScheduling section of the Operations helper.
  """

  # static variables shared between all instances of this class
  csDictCache = DictCache()
  condCache = DictCache()
  delayMem = {}

  def __init__(self, jobDB=None, opsHelper=None):
    """ Constructor

        :param jobDB: job database object to query; JobDB() is created when not given
        :param opsHelper: configuration helper; Operations() is created when not given
    """
    self.__runningLimitSection = "JobScheduling/RunningLimit"
    self.__matchingDelaySection = "JobScheduling/MatchingDelay"

    if jobDB:
      self.jobDB = jobDB
    else:
      self.jobDB = JobDB()

    self.log = gLogger.getSubLogger("Limiter")

    if opsHelper:
      self.__opsHelper = opsHelper
    else:
      self.__opsHelper = Operations()

  def getNegativeCond(self):
    """ Get negative condition for ALL sites

        :return: list of condition dicts, one per limited site, each carrying
                 a 'Site' key; the list is cached under "GLOBAL" in condCache
    """
    orCond = self.condCache.get("GLOBAL")
    if orCond:
      return orCond
    negCond = {}
    # Run Limit
    result = self.__opsHelper.getSections(self.__runningLimitSection)
    sites = []
    if result['OK']:
      sites = result['Value']
    for siteName in sites:
      result = self.__getRunningCondition(siteName)
      if not result['OK']:
        continue
      data = result['Value']
      if data:
        negCond[siteName] = data
    # Delay limit
    result = self.__opsHelper.getSections(self.__matchingDelaySection)
    sites = []
    if result['OK']:
      sites = result['Value']
    for siteName in sites:
      result = self.__getDelayCondition(siteName)
      if not result['OK']:
        continue
      data = result['Value']
      if not data:
        continue
      if siteName in negCond:
        negCond[siteName] = self.__mergeCond(negCond[siteName], data)
      else:
        negCond[siteName] = data
    orCond = []
    for siteName in negCond:
      negCond[siteName]['Site'] = siteName
      orCond.append(negCond[siteName])
    self.condCache.add("GLOBAL", 10, orCond)
    return orCond

  def getNegativeCondForSite(self, siteName):
    """ Generate a negative query based on the limits set on the site

        :param siteName: name of the site
        :return: dict { attribute : [ values not to be matched ] }
    """
    # Check if Limits are imposed onto the site
    negativeCond = {}
    if self.__opsHelper.getValue("JobScheduling/CheckJobLimits", True):
      result = self.__getRunningCondition(siteName)
      if result['OK']:
        negativeCond = result['Value']
      self.log.verbose('Negative conditions for site',
                       '%s after checking limits are: %s' % (siteName, str(negativeCond)))

    if self.__opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
      result = self.__getDelayCondition(siteName)
      if result['OK']:
        delayCond = result['Value']
        self.log.verbose('Negative conditions for site',
                         '%s after delay checking are: %s' % (siteName, str(delayCond)))
        negativeCond = self.__mergeCond(negativeCond, delayCond)

    if negativeCond:
      self.log.info('Negative conditions for site',
                    '%s are: %s' % (siteName, str(negativeCond)))

    return negativeCond

  def __mergeCond(self, negCond, addCond):
    """ Merge two negative dicts

        negCond is updated in place (values of addCond are appended when not
        already present) and returned.
    """
    # Merge both negative dicts
    for attr in addCond:
      if attr not in negCond:
        negCond[attr] = []
      for value in addCond[attr]:
        if value not in negCond[attr]:
          negCond[attr].append(value)
    return negCond

  def __extractCSData(self, section):
    """ Extract limiting information from the CS in the form:
        { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }

        :param section: section path handed to the Operations helper
        :return: S_OK( dict ), cached in csDictCache, or S_ERROR when the
                 section cannot be read or holds a non-integer value
    """
    stuffDict = self.csDictCache.get(section)
    if stuffDict:
      return S_OK(stuffDict)

    result = self.__opsHelper.getSections(section)
    if not result['OK']:
      return result
    attribs = result['Value']
    stuffDict = {}
    for attName in attribs:
      result = self.__opsHelper.getOptionsDict("%s/%s" % (section, attName))
      if not result['OK']:
        return result
      attLimits = result['Value']
      try:
        attLimits = dict((k, int(v)) for k, v in attLimits.items())
      except (TypeError, ValueError) as excp:
        # int() failed: the option value is not a number
        errMsg = "%s/%s has to contain numbers: %s" % (section, attName, str(excp))
        self.log.error(errMsg)
        return S_ERROR(errMsg)
      stuffDict[attName] = attLimits

    self.csDictCache.add(section, 300, stuffDict)
    return S_OK(stuffDict)

  def __getRunningCondition(self, siteName):
    """ Get extra conditions allowing site throttling

        :param siteName: name of the site
        :return: S_OK( dict ) listing, per attribute, the values whose number
                 of Running/Matched/Stalled jobs at the site reached the limit
    """
    siteSection = "%s/%s" % (self.__runningLimitSection, siteName)
    result = self.__extractCSData(siteSection)
    if not result['OK']:
      return result
    limitsDict = result['Value']
    # limitsDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    if not limitsDict:
      return S_OK({})
    # Check if the site exceeding the given limits
    negCond = {}
    for attName in limitsDict:
      if attName not in self.jobDB.jobAttributeNames:
        self.log.error("Attribute does not exist",
                       "(%s). Check the job limits" % attName)
        continue
      cK = "Running:%s:%s" % (siteName, attName)
      data = self.condCache.get(cK)
      if not data:
        result = self.jobDB.getCounters('Jobs', [attName],
                                        {'Site': siteName,
                                         'Status': ['Running', 'Matched', 'Stalled']})
        if not result['OK']:
          return result
        # Each row is ( { attName : value }, count )
        data = dict((row[0][attName], row[1]) for row in result['Value'])
        self.condCache.add(cK, 10, data)
      for attValue in limitsDict[attName]:
        limit = limitsDict[attName][attValue]
        running = data.get(attValue, 0)
        if running >= limit:
          self.log.verbose('Job Limit imposed',
                           'at %s on %s/%s=%d, %d jobs already deployed' % (siteName,
                                                                            attName, attValue, limit, running))
          if attName not in negCond:
            negCond[attName] = []
          negCond[attName].append(attValue)
    # negCond is something like : {'JobType': ['Merge']}
    return S_OK(negCond)

  def updateDelayCounters(self, siteName, jid):
    """ Record matching delays for the attribute values of a job

        :param siteName: name of the site
        :param jid: job identifier passed to jobDB.getJobAttributes
        :return: S_OK(), or the failed result of the CS / JobDB lookup
    """
    # Get the info from the CS
    siteSection = "%s/%s" % (self.__matchingDelaySection, siteName)
    result = self.__extractCSData(siteSection)
    if not result['OK']:
      return result
    delayDict = result['Value']
    # delayDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    if not delayDict:
      return S_OK()
    attNames = []
    for attName in delayDict:
      if attName not in self.jobDB.jobAttributeNames:
        self.log.error("Attribute does not exist in the JobDB. Please fix it!",
                       "(%s)" % attName)
      else:
        attNames.append(attName)
    result = self.jobDB.getJobAttributes(jid, attNames)
    if not result['OK']:
      self.log.error("Error while retrieving attributes",
                     "coming from %s: %s" % (siteSection, result['Message']))
      return result
    atts = result['Value']
    # Create the DictCache if not there
    if siteName not in self.delayMem:
      self.delayMem[siteName] = DictCache()
    # Update the counters
    delayCounter = self.delayMem[siteName]
    for attName in atts:
      attValue = atts[attName]
      # Attributes without a configured delay are skipped rather than raising
      # KeyError (presumably getJobAttributes can return more than requested,
      # e.g. for an empty attNames list - verify against JobDB)
      attDelays = delayDict.get(attName, {})
      if attValue in attDelays:
        delayTime = attDelays[attValue]
        self.log.notice("Adding delay for %s/%s=%s of %s secs" % (siteName, attName,
                                                                  attValue, delayTime))
        delayCounter.add((attName, attValue), delayTime)
    return S_OK()

  def __getDelayCondition(self, siteName):
    """ Get extra conditions allowing matching delay

        :param siteName: name of the site
        :return: S_OK( dict ) built from the ( attribute, value ) keys still
                 held in the delay cache of the site
    """
    if siteName not in self.delayMem:
      return S_OK({})
    lastRun = self.delayMem[siteName].getKeys()
    negCond = {}
    for attName, attValue in lastRun:
      if attName not in negCond:
        negCond[attName] = []
      negCond[attName].append(attValue)
    return S_OK(negCond)
示例#5
0
class JobCommand( Command ):
  '''
  Job "master" Command.

  Counts the jobs per site and status in the JobDB and stores / reads them
  through the ResourceManagementClient job cache.
  '''


  def __init__( self, args = None, clients = None ):
    '''
    Takes 'JobDB' and 'ResourceManagementClient' from self.apis when present,
    otherwise creates them.
    '''

    super( JobCommand, self ).__init__( args, clients )

    if 'JobDB' in self.apis:
      self.jobDB = self.apis[ 'JobDB' ]
    else:
      self.jobDB = JobDB()

    if 'ResourceManagementClient' in self.apis:
      self.rmClient = self.apis[ 'ResourceManagementClient' ]
    else:
      self.rmClient = ResourceManagementClient()


  def _storeCommand( self, result ):
    '''
    Stores the results of doNew method on the database.

    :param result: either the { siteName : { status : count } } dictionary
                   built by doNew, or an iterable of per-site dictionaries
    :return: S_OK(), or the first failed addOrModifyJobCache result
    '''

    if isinstance( result, dict ):
      # doNew hands over a dictionary keyed by site name: iterating it directly
      # yields the site names ( strings ), not the per-site dictionaries.
      # NOTE(review): the keyword names sent to addOrModifyJobCache ( site,
      # checking, completed, ... ) are assumed - confirm against the client.
      records = [ dict( counters, Site = siteName ) for siteName, counters in result.items() ]
    else:
      records = result

    for jobDict in records:

      # Keyword arguments of the client start with a lower case letter
      lowerCaseJobDict = dict( ( key[0].lower() + key[1:], value )
                               for key, value in jobDict.items() )

      resQuery = self.rmClient.addOrModifyJobCache( **lowerCaseJobDict )

      if not resQuery[ 'OK' ]:
        return resQuery

    return S_OK()


  def _prepareCommand( self ):
    '''
    JobCommand requires two arguments:
    - name : <str>
    - timespan : number of seconds

    :return: S_OK( ( name, timespan ) ), or S_ERROR if one of them is missing
    '''

    if 'name' not in self.args:
      return S_ERROR( '"name" not found in self.args' )
    name = self.args[ 'name' ]

    if 'timespan' not in self.args:
      return S_ERROR( '"timespan" not found in self.args' )
    timespan = self.args[ 'timespan' ]

    return S_OK( ( name, timespan ) )


  def doNew( self, masterParams = None ):
    '''
    Gets the parameters to run, either from the master method or from its
    own arguments.
    It queries the JobDB counters, grouped by site and status, for jobs
    updated within the last "timespan" seconds; for a single site when a
    name is given, for all sites otherwise.
    If there are jobs, are recorded and then returned.

    :param masterParams: when True, the site name is reset so that all sites
                         are selected
    :return: S_OK( { siteName : { status : numberOfJobs } } ) or a failed result
    '''

    if masterParams is True:
      self.args[ 'name' ] = ''

    params = self._prepareCommand()
    if not params[ 'OK' ]:
      return params

    name, timespan = params[ 'Value' ]

    condDict = {}
    if name:
      condDict = { 'Site' : name }

    startTimeWindow = datetime.utcnow() - timedelta( seconds = timespan )

    results = self.jobDB.getCounters( 'Jobs', ['Site', 'Status'],
                                      condDict, newer = startTimeWindow,
                                      timeStamp = 'LastUpdateTime' )

    if not results[ 'OK' ]:
      return results
    # Results look like this
    # [ ({'Status': 'Checking', 'Site': 'ANY'}, 6L), ...

    uniformResult = {}

    jobStatuses = ( 'Checking', 'Completed', 'Done', 'Failed', 'Killed', 'Matched',
                    'Received', 'Rescheduled', 'Running', 'Staging', 'Stalled',
                    'Waiting' )

    for selectionDict, numberOfJobs in results[ 'Value' ]:

      siteName = selectionDict[ 'Site' ]

      # Placeholder site names are not reported
      if siteName in ( 'ANY', 'Multiple' ):
        continue

      # Every reported site carries all statuses, defaulting to 0
      if siteName not in uniformResult:
        uniformResult[ siteName ] = dict.fromkeys( jobStatuses, 0 )

      uniformResult[ siteName ][ selectionDict[ 'Status' ] ] = numberOfJobs

    # Store results
    storeRes = self._storeCommand( uniformResult )
    if not storeRes[ 'OK' ]:
      return storeRes

    return S_OK( uniformResult )


  def doCache( self ):
    '''
    Method that reads the cache table and tries to read from it. It will
    return a list of dictionaries if there are results.
    '''

    params = self._prepareCommand()
    if not params[ 'OK' ]:
      return params
    # _prepareCommand returns ( name, timespan ); only the name selects the cache
    name = params[ 'Value' ][ 0 ]

    result = self.rmClient.selectJobCache( name )
    if result[ 'OK' ]:
      result = S_OK( [ dict( zip( result[ 'Columns' ], res ) ) for res in result[ 'Value' ] ] )

    return result


  def doMaster( self ):
    '''
    Master method.
    Calls doNew for all sites and records a failure message in
    self.metrics[ 'failed' ].

    :return: S_OK( self.metrics )
    '''

    jobsResults = self.doNew( True )
    if not jobsResults[ 'OK' ]:
      self.metrics[ 'failed' ].append( jobsResults[ 'Message' ] )

    return S_OK( self.metrics )
                 
################################################################################
################################################################################
################################################################################
################################################################################
################################################################################
################################################################################
################################################################################
################################################################################

#class JobsStatsCommand( Command ):
#  
#  def __init__( self, args = None, clients = None ):
#    
#    super( JobsStatsCommand, self ).__init__( args, clients )
#    
#    if 'JobsClient' in self.apis:
#      self.jClient = self.apis[ 'JobsClient' ]
#    else:
#      self.jClient = JobsClient()  
#  
#  def doCommand( self ):
#    """ 
#    Return getJobStats from Jobs Client  
#    
#   :attr:`args`: 
#     - args[0]: string: should be a ValidElement
#
#     - args[1]: string: should be the name of the ValidElement
#
#  returns:
#    {
#      'MeanProcessedJobs': X
#    }
#    """
#
#    return self.jClient.getJobsStats( self.args[0], self.args[1], self.args[2] )
    
################################################################################
################################################################################

#class JobsEffCommand( Command ):
#
#  def __init__( self, args = None, clients = None ):
#    
#    super( JobsEffCommand, self ).__init__( args, clients )
#    
#    if 'JobsClient' in self.apis:
#      self.jClient = self.apis[ 'JobsClient' ]
#    else:
#      self.jClient = JobsClient()  
#  
#  def doCommand( self ):
#    """ 
#    Return getJobsEff from Jobs Client  
#    
#   :attr:`args`: 
#       - args[0]: string: should be a ValidElement
#  
#       - args[1]: string: should be the name of the ValidElement
#
#    returns:
#      {
#        'JobsEff': X
#      }
#    """
#         
#    res = self.jClient.getJobsEff( self.args[0], self.args[1], self.args[2] )
#       
#    return S_OK( res )   

################################################################################
################################################################################

#class SystemChargeCommand( Command ):
#  
#  def __init__( self, args = None, clients = None ):
#    
#    super( SystemChargeCommand, self ).__init__( args, clients )
#    
#    if 'JobsClient' in self.apis:
#      self.jClient = self.apis[ 'JobsClient' ]
#    else:
#      self.jClient = JobsClient()  
#  
#  def doCommand(self):
#    """ Returns last hour system charge, and the system charge of an hour before
#
#        returns:
#          {
#            'LastHour': n_lastHour
#            'anHourBefore': n_anHourBefore
#          }
#    """
#    
#      
#    res = self.jClient.getSystemCharge()
#
#    return S_OK( res )   
    
################################################################################
################################################################################

#class JobsWMSCommand( Command ):
#  
#  def __init__( self, args = None, clients = None ):
#    
#    super( JobsWMSCommand, self ).__init__( args, clients )
#
#    if 'WMSAdministrator' in self.apis:
#      self.wmsAdmin = self.apis[ 'WMSAdministrator' ]
#    else:  
#      self.wmsAdmin = RPCClient( 'WorkloadManagement/WMSAdministrator' )
#  
#  def doCommand( self ):
#    """ 
#    Returns simple jobs efficiency
#
#    :attr:`args`: 
#       - args[0]: string: should be a ValidElement
#  
#       - args[1]: string should be the name of the ValidElement
#
#    returns:
#      {
#        'Result': 'Good'|'Fair'|'Poor'|'Idle'|'Bad'
#      }
#    """
#   
#    if not 'siteName' in self.args:
#      return self.returnERROR( S_ERROR( 'siteName is missing' ) )
#    siteName = self.args[ 'siteName' ]
#    
#    # If siteName is None, we take all sites
#    if siteName is None:
#      siteName = CSHelpers.getSites()      
#      if not siteName[ 'OK' ]:
#        return self.returnERROR( siteName )
#      siteName = siteName[ 'Value' ]
#    
#    results = self.wmsAdmin.getSiteSummaryWeb( { 'Site' : siteName }, [], 0, 500 )
#
#    if not results[ 'OK' ]:
#      return self.returnERROR( results )
#    results = results[ 'Value' ]
#    
#    if not 'ParameterNames' in results:
#      return self.returnERROR( S_ERROR( 'Malformed result dictionary' ) )
#    params = results[ 'ParameterNames' ]
#    
#    if not 'Records' in results:
#      return self.returnERROR( S_ERROR( 'Malformed result dictionary' ) )
#    records = results[ 'Records' ]
#    
#    jobResults = [] 
#       
#    for record in records:
#      
#      jobDict = dict( zip( params , record ))
#      try:
#        jobDict[ 'Efficiency' ] = float( jobDict[ 'Efficiency' ] )
#      except KeyError, e:
#        return self.returnERROR( S_ERROR( e ) )
#      except ValueError, e:
#        return self.returnERROR( S_ERROR( e ) )  
#      
#      jobResults.append( jobDict )
#    
#    return S_OK( jobResults )  

################################################################################
################################################################################

#class JobsEffSimpleEveryOneCommand( Command ):
#
#  #FIXME: write proper docstrings
#
#  def __init__( self, args = None, clients = None ):
#    
#    super( JobsEffSimpleEveryOneCommand, self ).__init__( args, clients )
#
#    if 'JobsClient' in self.apis:
#      self.jClient = self.apis[ 'JobsClient' ]
#    else:
#      self.jClient = JobsClient() 
#    
#  def doCommand( self ):
#    """ 
#    Returns simple jobs efficiency for all the sites in input.
#        
#    :params:
#      :attr:`sites`: list of site names (when not given, take every site)
#    
#    :returns:
#      {'SiteName': {'JE_S': 'Good'|'Fair'|'Poor'|'Idle'|'Bad'}, ...}
#    """
#
#    sites = None
#
#    if 'sites' in self.args:
#      sites = self.args[ 'sites' ] 
#
#    if sites is None:
#      #FIXME: we do not get them from RSS DB anymore, from CS now.
#      #sites = self.rsClient.selectSite( meta = { 'columns' : 'SiteName' } )
#      sites = CSHelpers.getSites()
#        
#      if not sites['OK']:
#        return sites
#      sites = sites[ 'Value' ]   
#      #sites = [ site[ 0 ] for site in sites[ 'Value' ] ]
#
#    results = self.jClient.getJobsSimpleEff( sites )
#    
#    return results
#    
##    if not results[ 'OK' ]:
##      return results
##    results = results[ 'Value' ]
#        
##    if results is None:
##      results = {}
#
##    resToReturn = {}
#
#    #for site in results:
#    #  resToReturn[ site ] = results[ site ]
#
##    return S_OK( resToReturn )   

################################################################################
################################################################################ 

#class JobsEffSimpleCachedCommand( Command ):
#  
#  def __init__( self, args = None, clients = None ):
#    
#    super( JobsEffSimpleCachedCommand, self ).__init__( args, clients )
#          
#    if 'ResourceStatusClient' in self.apis:
#      self.rsClient = self.apis[ 'ResourceStatusClient' ]
#    else:
#      self.rsClient = ResourceStatusClient()  
#  
#    if 'ResourceManagementClient' in self.apis:
#      self.rmClient = self.apis[ 'ResourceManagementClient' ]
#    else:
#      self.rmClient = ResourceManagementClient()   
#  
#  def doCommand( self ):
#    """ 
#    Returns simple jobs efficiency
#
#    :attr:`args`: 
#       - args[0]: string: should be a ValidElement
#  
#       - args[1]: string should be the name of the ValidElement
#
#    returns:
#      {
#        'Result': 'Good'|'Fair'|'Poor'|'Idle'|'Bad'
#      }
#    """
#         
#    if self.args[0] == 'Service':
#      name = self.rsClient.getGeneralName( self.args[0], self.args[1], 'Site' )
#      name        = name[ 'Value' ][ 0 ]
#      granularity = 'Site'
#    elif self.args[0] == 'Site':
#      name        = self.args[1]
#      granularity = self.args[0]
#    else:
#      return S_ERROR( '%s is not a valid granularity' % self.args[ 0 ] )
#     
#    clientDict = { 
#                  'name'        : name,
#                  'commandName' : 'JobsEffSimpleEveryOne',
#                  'value'       : 'JE_S',
#                  'opt_ID'      : 'NULL',
#                  'meta'        : { 'columns'     : 'Result' }
#                  }
#      
#    res = self.rmClient.getClientCache( **clientDict )
#      
#    if res[ 'OK' ]:
#      res = res[ 'Value' ]
#      if res == None or res == []:
#        res = S_OK( 'Idle' )
#      else:
#        res = S_OK( res[ 0 ] )
#        
#    return res

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
示例#6
0
class JobCommand(Command):
    '''
    Job "master" Command.

    Counts the jobs per site and status in the JobDB and stores / reads them
    through the ResourceManagementClient job cache.
    '''

    def __init__(self, args=None, clients=None):
        '''
        Takes 'JobDB' and 'ResourceManagementClient' from self.apis when
        present, otherwise creates them.
        '''

        super(JobCommand, self).__init__(args, clients)

        if 'JobDB' in self.apis:
            self.jobDB = self.apis['JobDB']
        else:
            self.jobDB = JobDB()

        if 'ResourceManagementClient' in self.apis:
            self.rmClient = self.apis['ResourceManagementClient']
        else:
            self.rmClient = ResourceManagementClient()

    def _storeCommand(self, result):
        '''
        Stores the results of doNew method on the database.

        :param result: either the { siteName : { status : count } } dictionary
                       built by doNew, or an iterable of per-site dictionaries
        :return: S_OK(), or the first failed addOrModifyJobCache result
        '''

        if isinstance(result, dict):
            # doNew hands over a dictionary keyed by site name: iterating it
            # directly yields the site names (strings), not the per-site
            # dictionaries.
            # NOTE(review): the keyword names sent to addOrModifyJobCache
            # (site, checking, completed, ...) are assumed - confirm against
            # the client.
            records = [dict(counters, Site=siteName)
                       for siteName, counters in result.items()]
        else:
            records = result

        for jobDict in records:

            # Keyword arguments of the client start with a lower case letter
            lowerCaseJobDict = dict((key[0].lower() + key[1:], value)
                                    for key, value in jobDict.items())

            resQuery = self.rmClient.addOrModifyJobCache(**lowerCaseJobDict)

            if not resQuery['OK']:
                return resQuery

        return S_OK()

    def _prepareCommand(self):
        '''
        JobCommand requires two arguments:
        - name : <str>
        - timespan : number of seconds

        :return: S_OK((name, timespan)), or S_ERROR if one of them is missing
        '''

        if 'name' not in self.args:
            return S_ERROR('"name" not found in self.args')
        name = self.args['name']

        if 'timespan' not in self.args:
            return S_ERROR('"timespan" not found in self.args')
        timespan = self.args['timespan']

        return S_OK((name, timespan))

    def doNew(self, masterParams=None):
        '''
        Gets the parameters to run, either from the master method or from its
        own arguments.
        It queries the JobDB counters, grouped by site and status, for jobs
        updated within the last "timespan" seconds; for a single site when a
        name is given, for all sites otherwise.
        If there are jobs, are recorded and then returned.

        :param masterParams: when True, the site name is reset so that all
                             sites are selected
        :return: S_OK({ siteName : { status : numberOfJobs } }) or a failed
                 result
        '''

        if masterParams is True:
            self.args['name'] = ''

        params = self._prepareCommand()
        if not params['OK']:
            return params

        name, timespan = params['Value']

        condDict = {}
        if name:
            condDict = {'Site': name}

        startTimeWindow = datetime.utcnow() - timedelta(seconds=timespan)

        results = self.jobDB.getCounters('Jobs', ['Site', 'Status'],
                                         condDict,
                                         newer=startTimeWindow,
                                         timeStamp='LastUpdateTime')

        if not results['OK']:
            return results
        # Results look like this
        # [ ({'Status': 'Checking', 'Site': 'ANY'}, 6L), ...

        uniformResult = {}

        jobStatuses = ('Checking', 'Completed', 'Done', 'Failed', 'Killed',
                       'Matched', 'Received', 'Rescheduled', 'Running',
                       'Staging', 'Stalled', 'Waiting')

        for selectionDict, numberOfJobs in results['Value']:

            siteName = selectionDict['Site']

            # Placeholder site names are not reported
            if siteName in ('ANY', 'Multiple'):
                continue

            # Every reported site carries all statuses, defaulting to 0
            if siteName not in uniformResult:
                uniformResult[siteName] = dict.fromkeys(jobStatuses, 0)

            uniformResult[siteName][selectionDict['Status']] = numberOfJobs

        # Store results
        storeRes = self._storeCommand(uniformResult)
        if not storeRes['OK']:
            return storeRes

        return S_OK(uniformResult)

    def doCache(self):
        '''
        Method that reads the cache table and tries to read from it. It will
        return a list of dictionaries if there are results.
        '''

        params = self._prepareCommand()
        if not params['OK']:
            return params
        # _prepareCommand returns (name, timespan); only the name selects
        # the cache
        name = params['Value'][0]

        result = self.rmClient.selectJobCache(name)
        if result['OK']:
            result = S_OK(
                [dict(zip(result['Columns'], res)) for res in result['Value']])

        return result

    def doMaster(self):
        '''
        Master method.
        Calls doNew for all sites and records a failure message in
        self.metrics['failed'].

        :return: S_OK(self.metrics)
        '''

        jobsResults = self.doNew(True)
        if not jobsResults['OK']:
            self.metrics['failed'].append(jobsResults['Message'])

        return S_OK(self.metrics)


################################################################################
################################################################################
################################################################################
################################################################################
################################################################################
################################################################################
################################################################################
################################################################################

#class JobsStatsCommand( Command ):
#
#  def __init__( self, args = None, clients = None ):
#
#    super( JobsStatsCommand, self ).__init__( args, clients )
#
#    if 'JobsClient' in self.apis:
#      self.jClient = self.apis[ 'JobsClient' ]
#    else:
#      self.jClient = JobsClient()
#
#  def doCommand( self ):
#    """
#    Return getJobStats from Jobs Client
#
#   :attr:`args`:
#     - args[0]: string: should be a ValidElement
#
#     - args[1]: string: should be the name of the ValidElement
#
#  returns:
#    {
#      'MeanProcessedJobs': X
#    }
#    """
#
#    return self.jClient.getJobsStats( self.args[0], self.args[1], self.args[2] )

################################################################################
################################################################################

#class JobsEffCommand( Command ):
#
#  def __init__( self, args = None, clients = None ):
#
#    super( JobsEffCommand, self ).__init__( args, clients )
#
#    if 'JobsClient' in self.apis:
#      self.jClient = self.apis[ 'JobsClient' ]
#    else:
#      self.jClient = JobsClient()
#
#  def doCommand( self ):
#    """
#    Return getJobsEff from Jobs Client
#
#   :attr:`args`:
#       - args[0]: string: should be a ValidElement
#
#       - args[1]: string: should be the name of the ValidElement
#
#    returns:
#      {
#        'JobsEff': X
#      }
#    """
#
#    res = self.jClient.getJobsEff( self.args[0], self.args[1], self.args[2] )
#
#    return S_OK( res )

################################################################################
################################################################################

#class SystemChargeCommand( Command ):
#
#  def __init__( self, args = None, clients = None ):
#
#    super( SystemChargeCommand, self ).__init__( args, clients )
#
#    if 'JobsClient' in self.apis:
#      self.jClient = self.apis[ 'JobsClient' ]
#    else:
#      self.jClient = JobsClient()
#
#  def doCommand(self):
#    """ Returns last hour system charge, and the system charge of an hour before
#
#        returns:
#          {
#            'LastHour': n_lastHour
#            'anHourBefore': n_anHourBefore
#          }
#    """
#
#
#    res = self.jClient.getSystemCharge()
#
#    return S_OK( res )

################################################################################
################################################################################

#class JobsWMSCommand( Command ):
#
#  def __init__( self, args = None, clients = None ):
#
#    super( JobsWMSCommand, self ).__init__( args, clients )
#
#    if 'WMSAdministrator' in self.apis:
#      self.wmsAdmin = self.apis[ 'WMSAdministrator' ]
#    else:
#      self.wmsAdmin = RPCClient( 'WorkloadManagement/WMSAdministrator' )
#
#  def doCommand( self ):
#    """
#    Returns simple jobs efficiency
#
#    :attr:`args`:
#       - args[0]: string: should be a ValidElement
#
#       - args[1]: string should be the name of the ValidElement
#
#    returns:
#      {
#        'Result': 'Good'|'Fair'|'Poor'|'Idle'|'Bad'
#      }
#    """
#
#    if not 'siteName' in self.args:
#      return self.returnERROR( S_ERROR( 'siteName is missing' ) )
#    siteName = self.args[ 'siteName' ]
#
#    # If siteName is None, we take all sites
#    if siteName is None:
#      siteName = CSHelpers.getSites()
#      if not siteName[ 'OK' ]:
#        return self.returnERROR( siteName )
#      siteName = siteName[ 'Value' ]
#
#    results = self.wmsAdmin.getSiteSummaryWeb( { 'Site' : siteName }, [], 0, 500 )
#
#    if not results[ 'OK' ]:
#      return self.returnERROR( results )
#    results = results[ 'Value' ]
#
#    if not 'ParameterNames' in results:
#      return self.returnERROR( S_ERROR( 'Malformed result dictionary' ) )
#    params = results[ 'ParameterNames' ]
#
#    if not 'Records' in results:
#      return self.returnERROR( S_ERROR( 'Malformed result dictionary' ) )
#    records = results[ 'Records' ]
#
#    jobResults = []
#
#    for record in records:
#
#      jobDict = dict( zip( params , record ))
#      try:
#        jobDict[ 'Efficiency' ] = float( jobDict[ 'Efficiency' ] )
#      except KeyError, e:
#        return self.returnERROR( S_ERROR( e ) )
#      except ValueError, e:
#        return self.returnERROR( S_ERROR( e ) )
#
#      jobResults.append( jobDict )
#
#    return S_OK( jobResults )

################################################################################
################################################################################

#class JobsEffSimpleEveryOneCommand( Command ):
#
#  #FIXME: write proper docstrings
#
#  def __init__( self, args = None, clients = None ):
#
#    super( JobsEffSimpleEveryOneCommand, self ).__init__( args, clients )
#
#    if 'JobsClient' in self.apis:
#      self.jClient = self.apis[ 'JobsClient' ]
#    else:
#      self.jClient = JobsClient()
#
#  def doCommand( self ):
#    """
#    Returns simple jobs efficiency for all the sites in input.
#
#    :params:
#      :attr:`sites`: list of site names (when not given, take every site)
#
#    :returns:
#      {'SiteName': {'JE_S': 'Good'|'Fair'|'Poor'|'Idle'|'Bad'}, ...}
#    """
#
#    sites = None
#
#    if 'sites' in self.args:
#      sites = self.args[ 'sites' ]
#
#    if sites is None:
#      #FIXME: we do not get them from RSS DB anymore, from CS now.
#      #sites = self.rsClient.selectSite( meta = { 'columns' : 'SiteName' } )
#      sites = CSHelpers.getSites()
#
#      if not sites['OK']:
#        return sites
#      sites = sites[ 'Value' ]
#      #sites = [ site[ 0 ] for site in sites[ 'Value' ] ]
#
#    results = self.jClient.getJobsSimpleEff( sites )
#
#    return results
#
##    if not results[ 'OK' ]:
##      return results
##    results = results[ 'Value' ]
#
##    if results is None:
##      results = {}
#
##    resToReturn = {}
#
#    #for site in results:
#    #  resToReturn[ site ] = results[ site ]
#
##    return S_OK( resToReturn )

################################################################################
################################################################################

#class JobsEffSimpleCachedCommand( Command ):
#
#  def __init__( self, args = None, clients = None ):
#
#    super( JobsEffSimpleCachedCommand, self ).__init__( args, clients )
#
#    if 'ResourceStatusClient' in self.apis:
#      self.rsClient = self.apis[ 'ResourceStatusClient' ]
#    else:
#      self.rsClient = ResourceStatusClient()
#
#    if 'ResourceManagementClient' in self.apis:
#      self.rmClient = self.apis[ 'ResourceManagementClient' ]
#    else:
#      self.rmClient = ResourceManagementClient()
#
#  def doCommand( self ):
#    """
#    Returns simple jobs efficiency
#
#    :attr:`args`:
#       - args[0]: string: should be a ValidElement
#
#       - args[1]: string should be the name of the ValidElement
#
#    returns:
#      {
#        'Result': 'Good'|'Fair'|'Poor'|'Idle'|'Bad'
#      }
#    """
#
#    if self.args[0] == 'Service':
#      name = self.rsClient.getGeneralName( self.args[0], self.args[1], 'Site' )
#      name        = name[ 'Value' ][ 0 ]
#      granularity = 'Site'
#    elif self.args[0] == 'Site':
#      name        = self.args[1]
#      granularity = self.args[0]
#    else:
#      return S_ERROR( '%s is not a valid granularity' % self.args[ 0 ] )
#
#    clientDict = {
#                  'name'        : name,
#                  'commandName' : 'JobsEffSimpleEveryOne',
#                  'value'       : 'JE_S',
#                  'opt_ID'      : 'NULL',
#                  'meta'        : { 'columns'     : 'Result' }
#                  }
#
#    res = self.rmClient.getClientCache( **clientDict )
#
#    if res[ 'OK' ]:
#      res = res[ 'Value' ]
#      if res == None or res == []:
#        res = S_OK( 'Idle' )
#      else:
#        res = S_OK( res[ 0 ] )
#
#    return res

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
示例#7
0
class Limiter(object):

  def __init__(self, jobDB=None, opsHelper=None):
    """ Set up section paths, caches, the JobDB handle, the logger and the Operations helper

        :param jobDB: object used as self.jobDB; a new JobDB() is created when it is falsy
        :param opsHelper: object used as the Operations helper; a new Operations() is
                          created when it is falsy
    """
    # Sections holding the per-site running limits and matching delays
    self.__runningLimitSection = "JobScheduling/RunningLimit"
    self.__matchingDelaySection = "JobScheduling/MatchingDelay"
    # Caches for the CS content and for the computed conditions
    self.csDictCache = DictCache()
    self.condCache = DictCache()
    # One DictCache per site, filled by updateDelayCounters
    self.delayMem = {}

    self.jobDB = jobDB or JobDB()

    self.log = gLogger.getSubLogger("Limiter")

    self.__opsHelper = opsHelper or Operations()

  def getNegativeCond(self):
    """ Get negative condition for ALL sites
    """
    orCond = self.condCache.get("GLOBAL")
    if orCond:
      return orCond
    negCond = {}
    # Run Limit
    result = self.__opsHelper.getSections(self.__runningLimitSection)
    sites = []
    if result['OK']:
      sites = result['Value']
    for siteName in sites:
      result = self.__getRunningCondition(siteName)
      if not result['OK']:
        continue
      data = result['Value']
      if data:
        negCond[siteName] = data
    # Delay limit
    result = self.__opsHelper.getSections(self.__matchingDelaySection)
    sites = []
    if result['OK']:
      sites = result['Value']
    for siteName in sites:
      result = self.__getDelayCondition(siteName)
      if not result['OK']:
        continue
      data = result['Value']
      if not data:
        continue
      if siteName in negCond:
        negCond[siteName] = self.__mergeCond(negCond[siteName], data)
      else:
        negCond[siteName] = data
    orCond = []
    for siteName in negCond:
      negCond[siteName]['Site'] = siteName
      orCond.append(negCond[siteName])
    self.condCache.add("GLOBAL", 10, orCond)
    return orCond

  def getNegativeCondForSite(self, siteName):
    """ Generate a negative query based on the limits set on the site
    """
    # Check if Limits are imposed onto the site
    negativeCond = {}
    if self.__opsHelper.getValue("JobScheduling/CheckJobLimits", True):
      result = self.__getRunningCondition(siteName)
      if result['OK']:
        negativeCond = result['Value']
      self.log.verbose('Negative conditions for site %s after checking limits are: %s' % (siteName, str(negativeCond)))

    if self.__opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
      result = self.__getDelayCondition(siteName)
      if result['OK']:
        delayCond = result['Value']
        self.log.verbose('Negative conditions for site %s after delay checking are: %s' % (siteName, str(delayCond)))
        negativeCond = self.__mergeCond(negativeCond, delayCond)

    if negativeCond:
      self.log.info('Negative conditions for site %s are: %s' % (siteName, str(negativeCond)))

    return negativeCond

  def __mergeCond(self, negCond, addCond):
    """ Merge two negative dicts
    """
    # Merge both negative dicts
    for attr in addCond:
      if attr not in negCond:
        negCond[attr] = []
      for value in addCond[attr]:
        if value not in negCond[attr]:
          negCond[attr].append(value)
    return negCond

  def __extractCSData(self, section):
    """ Extract limiting information from the CS in the form:
        { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    """
    stuffDict = self.csDictCache.get(section)
    if stuffDict:
      return S_OK(stuffDict)

    result = self.__opsHelper.getSections(section)
    if not result['OK']:
      return result
    attribs = result['Value']
    stuffDict = {}
    for attName in attribs:
      result = self.__opsHelper.getOptionsDict("%s/%s" % (section, attName))
      if not result['OK']:
        return result
      attLimits = result['Value']
      try:
        attLimits = dict([(k, int(attLimits[k])) for k in attLimits])
      except Exception as excp:
        errMsg = "%s/%s has to contain numbers: %s" % (section, attName, str(excp))
        self.log.error(errMsg)
        return S_ERROR(errMsg)
      stuffDict[attName] = attLimits

    self.csDictCache.add(section, 300, stuffDict)
    return S_OK(stuffDict)

  def __getRunningCondition(self, siteName):
    """ Get extra conditions allowing site throttling
    """
    siteSection = "%s/%s" % (self.__runningLimitSection, siteName)
    result = self.__extractCSData(siteSection)
    if not result['OK']:
      return result
    limitsDict = result['Value']
    # limitsDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    if not limitsDict:
      return S_OK({})
    # Check if the site exceeding the given limits
    negCond = {}
    for attName in limitsDict:
      if attName not in self.jobDB.jobAttributeNames:
        self.log.error("Attribute %s does not exist. Check the job limits" % attName)
        continue
      cK = "Running:%s:%s" % (siteName, attName)
      data = self.condCache.get(cK)
      if not data:
        result = self.jobDB.getCounters(
            'Jobs', [attName], {
                'Site': siteName, 'Status': [
                    'Running', 'Matched', 'Stalled']})
        if not result['OK']:
          return result
        data = result['Value']
        data = dict([(k[0][attName], k[1]) for k in data])
        self.condCache.add(cK, 10, data)
      for attValue in limitsDict[attName]:
        limit = limitsDict[attName][attValue]
        running = data.get(attValue, 0)
        if running >= limit:
          self.log.verbose('Job Limit imposed at %s on %s/%s=%d,'
                           ' %d jobs already deployed' % (siteName, attName, attValue, limit, running))
          if attName not in negCond:
            negCond[attName] = []
          negCond[attName].append(attValue)
    # negCond is something like : {'JobType': ['Merge']}
    return S_OK(negCond)

  def updateDelayCounters(self, siteName, jid):
    # Get the info from the CS
    siteSection = "%s/%s" % (self.__matchingDelaySection, siteName)
    result = self.__extractCSData(siteSection)
    if not result['OK']:
      return result
    delayDict = result['Value']
    # limitsDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    if not delayDict:
      return S_OK()
    attNames = []
    for attName in delayDict:
      if attName not in self.jobDB.jobAttributeNames:
        self.log.error("Attribute %s does not exist in the JobDB. Please fix it!" % attName)
      else:
        attNames.append(attName)
    result = self.jobDB.getJobAttributes(jid, attNames)
    if not result['OK']:
      self.log.error("While retrieving attributes coming from %s: %s" % (siteSection, result['Message']))
      return result
    atts = result['Value']
    # Create the DictCache if not there
    if siteName not in self.delayMem:
      self.delayMem[siteName] = DictCache()
    # Update the counters
    delayCounter = self.delayMem[siteName]
    for attName in atts:
      attValue = atts[attName]
      if attValue in delayDict[attName]:
        delayTime = delayDict[attName][attValue]
        self.log.notice("Adding delay for %s/%s=%s of %s secs" % (siteName, attName,
                                                                  attValue, delayTime))
        delayCounter.add((attName, attValue), delayTime)
    return S_OK()

  def __getDelayCondition(self, siteName):
    """ Get the extra conditions implementing the matching delay of a site

        The keys currently held in the site's delay cache are ( attribute, value ) pairs.
        They are grouped into { attribute : [ values ] }. A site without a delay cache
        yields an empty dict.

        :param siteName: name of the site
        :return: S_OK( dict )
    """
    negCond = {}
    if siteName in self.delayMem:
      for attName, attValue in self.delayMem[siteName].getKeys():
        negCond.setdefault(attName, []).append(attValue)
    return S_OK(negCond)