Пример #1
0
class MatcherHandler( RequestHandler ):

  def initialize( self ):
    self.limiter = Limiter( jobDB = gJobDB )

##############################################################################
  types_requestJob = [ list( StringTypes ) + [DictType] ]
  def export_requestJob( self, resourceDescription ):
    """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """

    resourceDescription['Setup'] = self.serviceInfoDict['clientSetup']
    credDict = self.getRemoteCredentials()

    try:
      opsHelper = Operations( group = credDict['group'] )
      matcher = Matcher( pilotAgentsDB = pilotAgentsDB,
                         jobDB = gJobDB,
                         tqDB = gTaskQueueDB,
                         jlDB = jlDB,
                         opsHelper = opsHelper )
      result = matcher.selectJob( resourceDescription, credDict )
    except RuntimeError as rte:
      self.log.error( "Error requesting job: ", rte )
      return S_ERROR( "Error requesting job" )

    # result can be empty, meaning that no job matched
    if result:
      gMonitor.addMark( "matchesDone" )
      gMonitor.addMark( "matchesOK" )
      return S_OK( result )
    else:
      # FIXME: This is correctly interpreted by the JobAgent, but DErrno should be used instead
      return S_ERROR( "No match found" )

##############################################################################
  types_getActiveTaskQueues = []
  def export_getActiveTaskQueues( self ):
    """ Return all task queues
    """
    return gTaskQueueDB.retrieveTaskQueues()

##############################################################################
  types_getMatchingTaskQueues = [ DictType ]
  def export_getMatchingTaskQueues( self, resourceDict ):
    """ Return all task queues
    """
    if 'Site' in resourceDict and type( resourceDict[ 'Site' ] ) in StringTypes:
      negativeCond = self.limiter.getNegativeCondForSite( resourceDict[ 'Site' ] )
    else:
      negativeCond = self.limiter.getNegativeCond()
    return gTaskQueueDB.retrieveTaskQueuesThatMatch( resourceDict, negativeCond = negativeCond )

##############################################################################
  types_matchAndGetTaskQueue = [ DictType ]
  def export_matchAndGetTaskQueue( self, resourceDict ):
    """ Return matching task queues
    """
    return gTaskQueueDB.matchAndGetTaskQueue( resourceDict )
Пример #2
0
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None):
        """ c'tor
    """
        if pilotAgentsDB:
            self.pilotAgentsDB = pilotAgentsDB
        else:
            self.pilotAgentsDB = PilotAgentsDB()
        if jobDB:
            self.jobDB = jobDB
        else:
            self.jobDB = JobDB()
        if tqDB:
            self.tqDB = tqDB
        else:
            self.tqDB = TaskQueueDB()
        if jlDB:
            self.jlDB = jlDB
        else:
            self.jlDB = JobLoggingDB()

        if opsHelper:
            self.opsHelper = opsHelper
        else:
            self.opsHelper = Operations()

        self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()
Пример #3
0
  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ c'tor
    """
    if pilotAgentsDB:
      self.pilotAgentsDB = pilotAgentsDB
    else:
      self.pilotAgentsDB = PilotAgentsDB()
    if jobDB:
      self.jobDB = jobDB
    else:
      self.jobDB = JobDB()
    if tqDB:
      self.tqDB = tqDB
    else:
      self.tqDB = TaskQueueDB()
    if jlDB:
      self.jlDB = jlDB
    else:
      self.jlDB = JobLoggingDB()

    if opsHelper:
      self.opsHelper = opsHelper
    else:
      self.opsHelper = Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )
Пример #4
0
    def initializeHandler(cls, serviceInfoDict):
        try:
            result = ObjectLoader().loadObject(
                "WorkloadManagementSystem.DB.JobDB", "JobDB")
            if not result["OK"]:
                return result
            cls.jobDB = result["Value"](parentLogger=cls.log)

            result = ObjectLoader().loadObject(
                "WorkloadManagementSystem.DB.JobLoggingDB", "JobLoggingDB")
            if not result["OK"]:
                return result
            cls.jobLoggingDB = result["Value"](parentLogger=cls.log)

            result = ObjectLoader().loadObject(
                "WorkloadManagementSystem.DB.TaskQueueDB", "TaskQueueDB")
            if not result["OK"]:
                return result
            cls.taskQueueDB = result["Value"](parentLogger=cls.log)

            result = ObjectLoader().loadObject(
                "WorkloadManagementSystem.DB.PilotAgentsDB", "PilotAgentsDB")
            if not result["OK"]:
                return result
            cls.pilotAgentsDB = result["Value"](parentLogger=cls.log)

        except RuntimeError as excp:
            return S_ERROR("Can't connect to DB: %s" % excp)

        cls.limiter = Limiter(jobDB=cls.jobDB)

        return S_OK()
Пример #5
0
    def initializeHandler(cls, serviceInfoDict):
        try:
            result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.JobDB", "JobDB")
            if not result["OK"]:
                return result
            cls.jobDB = result["Value"]()

            result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.JobLoggingDB", "JobLoggingDB")
            if not result["OK"]:
                return result
            cls.jobLoggingDB = result["Value"]()

            result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.TaskQueueDB", "TaskQueueDB")
            if not result["OK"]:
                return result
            cls.taskQueueDB = result["Value"]()

            result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.PilotAgentsDB", "PilotAgentsDB")
            if not result["OK"]:
                return result
            cls.pilotAgentsDB = result["Value"]()

        except RuntimeError as excp:
            return S_ERROR("Can't connect to DB: %s" % excp)

        cls.limiter = Limiter(jobDB=cls.jobDB)

        gMonitor.registerActivity("matchTime", "Job matching time", "Matching", "secs", gMonitor.OP_MEAN, 300)
        gMonitor.registerActivity("matchesDone", "Job Match Request", "Matching", "matches", gMonitor.OP_RATE, 300)
        gMonitor.registerActivity("matchesOK", "Matched jobs", "Matching", "matches", gMonitor.OP_RATE, 300)
        gMonitor.registerActivity("numTQs", "Number of Task Queues", "Matching", "tqsk queues", gMonitor.OP_MEAN, 300)

        return S_OK()
Пример #6
0
  def initializeHandler(cls, serviceInfoDict):
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.taskQueueDB = TaskQueueDB()
    cls.pilotAgentsDB = PilotAgentsDB()
    cls.limiter = Limiter(jobDB=cls.jobDB)

    cls.taskQueueDB.recalculateTQSharesForAll()

    gMonitor.registerActivity('matchTime', "Job matching time",
                              'Matching', "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity('matchesDone', "Job Match Request",
                              'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('matchesOK', "Matched jobs",
                              'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('numTQs', "Number of Task Queues",
                              'Matching', "tqsk queues", gMonitor.OP_MEAN, 300)

    gThreadScheduler.addPeriodicTask(120, cls.taskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, cls.sendNumTaskQueues)

    cls.sendNumTaskQueues()
    return S_OK()
Пример #7
0
class Matcher( object ):
  """ Logic for matching
  """

  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ c'tor
    """
    if pilotAgentsDB:
      self.pilotAgentsDB = pilotAgentsDB
    else:
      self.pilotAgentsDB = PilotAgentsDB()
    if jobDB:
      self.jobDB = jobDB
    else:
      self.jobDB = JobDB()
    if tqDB:
      self.tqDB = tqDB
    else:
      self.tqDB = TaskQueueDB()
    if jlDB:
      self.jlDB = jlDB
    else:
      self.jlDB = JobLoggingDB()

    if opsHelper:
      self.opsHelper = opsHelper
    else:
      self.opsHelper = Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )


  def selectJob( self, resourceDescription, credDict ):
    """ Main job selection function to find the highest priority job matching the resource capacity
    """

    startTime = time.time()

    resourceDict = self._getResourceDict( resourceDescription, credDict )

    negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
    result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )

    if not result['OK']:
      return result
    result = result['Value']
    if not result['matchFound']:
      self.log.info( "No match found" )
      raise RuntimeError( "No match found" )

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( "No attributes returned for job" )
    if not resAtt['Value']['Status'] == 'Waiting':
      self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      result = self.tqDB.deleteJob( jobID )
      if not result[ 'OK' ]:
        return result
      raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

    self._reportStatus( resourceDict, jobID )

    result = self.jobDB.getJobJDL( jobID )
    if not result['OK']:
      raise RuntimeError( "Failed to get the job JDL" )

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info( "Match time: [%s]" % str( matchTime ) )
    gMonitor.addMark( "matchTime", matchTime )

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters( jobID )
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( 'No attributes returned for job' )

    if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

    pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
    if not pilotInfoReportedFlag:
      self._updatePilotInfo( resourceDict )
    self._updatePilotJobMapping( resourceDict, jobID )

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict


  def _getResourceDict( self, resourceDescription, credDict ):
    """ from resourceDescription to resourceDict (just various mods)
    """
    resourceDict = self._processResourceDescription( resourceDescription )
    resourceDict = self._checkCredentials( resourceDict, credDict )
    self._checkPilotVersion( resourceDict )
    if not self._checkMask( resourceDict ):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose( "Resource description:" )
    for key in resourceDict:
      self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) )

    return resourceDict

  def _processResourceDescription( self, resourceDescription ):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
    """

    resourceDict = {}
    if isinstance( resourceDescription, basestring ):
      classAdAgent = ClassAd( resourceDescription )
      if not classAdAgent.isOK():
        raise ValueError( 'Illegal Resource JDL' )
      self.log.verbose( classAdAgent.asJDL() )

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute( 'JobID' ):
        resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' )

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ):
        if classAdAgent.lookupAttribute( k ):
          resourceDict[ k ] = classAdAgent.getAttributeString( k )

    else:
      for name in singleValueDefFields:
        if resourceDescription.has_key( name ):
          resourceDict[name] = resourceDescription[name]

      for name in multiValueMatchFields:
        if resourceDescription.has_key( name ):
          resourceDict[name] = resourceDescription[name]

      if resourceDescription.has_key( 'JobID' ):
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ):
        if k in resourceDescription:
          resourceDict[ k ] = resourceDescription[ k ]

    return resourceDict



  def _reportStatus( self, resourceDict, jobID ):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes( jobID, attNames, attValues )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Set job attributes for jobID %s" % jobID )

    result = self.jlDB.addLoggingRecord( jobID,
                                         status = 'Matched',
                                         minor = 'Assigned',
                                         source = 'Matcher' )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Added logging record for jobID %s" % jobID )


  def _checkMask( self, resourceDict ):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
    if not 'Site' in resourceDict:
      self.log.error( "Missing Site Name in Resource JDL" )
      raise RuntimeError( "Missing Site Name in Resource JDL" )

    # Get common site mask and check the agent site
    result = self.jobDB.getSiteMask( siteState = 'Active' )
    if not result['OK']:
      self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] )
      raise RuntimeError( "Internal error" )
    maskList = result['Value']

    if resourceDict['Site'] not in maskList:
      return False

    return True

  def _updatePilotInfo( self, resourceDict ):
    """ Update pilot information - do not fail if we don't manage to do it
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      gridCE = resourceDict.get( 'GridCE', 'Unknown' )
      site = resourceDict.get( 'Site', 'Unknown' )
      benchmark = resourceDict.get( 'PilotBenchmark', 0.0 )
      self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) )

      result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site,
                                                  destination = gridCE, benchmark = benchmark )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _updatePilotJobMapping( self, resourceDict, jobID ):
    """ Update pilot to job mapping information
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )
      result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _checkCredentials( self, resourceDict, credDict ):
    """ Check if we can get a job given the passed credentials
    """
    if Properties.GENERIC_PILOT in credDict[ 'properties' ]:
      # You can only match groups in the same VO
      if credDict[ 'group' ] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get( 'VirtualOrganization', '' )
      else:
        vo = Registry.getVOForGroup( credDict[ 'group' ] )
      result = Registry.getGroupsForVO( vo )
      if result[ 'OK' ]:
        resourceDict[ 'OwnerGroup' ] = result[ 'Value' ]
      else:
        raise RuntimeError( result['Message'] )
    else:
      # If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict[ 'properties' ]:
        self.log.notice( "Setting the resource DN to the credentials DN" )
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      elif Properties.JOB_SHARING in credDict[ 'properties' ]:
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]
        self.log.notice( "Setting the resource group to the credentials group" )
        if 'OwnerDN'  in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]:
          ownerDN = resourceDict[ 'OwnerDN' ]
          result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] )
          if not result[ 'OK' ]:
            raise RuntimeError( result['Message'] )
          if credDict[ 'group' ] not in result[ 'Value' ]:
            # DN is not in the same group! bad boy.
            self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN )
            resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # Nothing special, group and DN have to be the same
      else:
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]

    return resourceDict

  def _checkPilotVersion( self, resourceDict ):
    """ Check the pilot DIRAC version
    """
    if self.opsHelper.getValue( "Pilot/CheckVersion", True ):
      if 'ReleaseVersion' not in resourceDict:
        if not 'DIRACVersion' in resourceDict:
          raise RuntimeError( 'Version check requested and not provided by Pilot' )
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      validVersions = self.opsHelper.getValue( "Pilot/Version", [] )
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \
                            ( pilotVersion, ",".join( validVersions ) ) )
      # Check project if requested
      validProject = self.opsHelper.getValue( "Pilot/Project", "" )
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError( "Version check requested but expected project %s not received" % validProject )
        if resourceDict[ 'ReleaseProject' ] != validProject:
          raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject,
                                                                                                   resourceDict[ 'ReleaseProject' ] ) )
Пример #8
0
 def initialize(self):
     self.limiter = Limiter(jobDB=gJobDB)
Пример #9
0
class MatcherHandler(RequestHandler):
    def initialize(self):
        self.limiter = Limiter(jobDB=gJobDB)

##############################################################################

    types_requestJob = [[basestring, dict]]

    def export_requestJob(self, resourceDescription):
        """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """

        resourceDescription['Setup'] = self.serviceInfoDict['clientSetup']
        credDict = self.getRemoteCredentials()

        try:
            opsHelper = Operations(group=credDict['group'])
            matcher = Matcher(pilotAgentsDB=pilotAgentsDB,
                              jobDB=gJobDB,
                              tqDB=gTaskQueueDB,
                              jlDB=jlDB,
                              opsHelper=opsHelper)
            result = matcher.selectJob(resourceDescription, credDict)
        except RuntimeError as rte:
            self.log.error("Error requesting job: ", rte)
            return S_ERROR("Error requesting job")

        # result can be empty, meaning that no job matched
        if result:
            gMonitor.addMark("matchesDone")
            gMonitor.addMark("matchesOK")
            return S_OK(result)
        # FIXME: This is correctly interpreted by the JobAgent, but DErrno should be used instead
        return S_ERROR("No match found")

##############################################################################

    types_getActiveTaskQueues = []

    @staticmethod
    def export_getActiveTaskQueues():
        """ Return all task queues
    """
        return gTaskQueueDB.retrieveTaskQueues()

##############################################################################

    types_getMatchingTaskQueues = [dict]

    def export_getMatchingTaskQueues(self, resourceDict):
        """ Return all task queues
    """
        if 'Site' in resourceDict and isinstance(resourceDict['Site'],
                                                 basestring):
            negativeCond = self.limiter.getNegativeCondForSite(
                resourceDict['Site'])
        else:
            negativeCond = self.limiter.getNegativeCond()
        matcher = Matcher(pilotAgentsDB=pilotAgentsDB,
                          jobDB=gJobDB,
                          tqDB=gTaskQueueDB,
                          jlDB=jlDB)
        resourceDescriptionDict = matcher._processResourceDescription(
            resourceDict)
        return gTaskQueueDB.retrieveTaskQueuesThatMatch(
            resourceDescriptionDict, negativeCond=negativeCond)


##############################################################################

    types_matchAndGetTaskQueue = [dict]

    @staticmethod
    def export_matchAndGetTaskQueue(resourceDict):
        """ Return matching task queues
    """
        return gTaskQueueDB.matchAndGetTaskQueue(resourceDict)
Пример #10
0
class Matcher(object):
    """ Logic for matching
  """
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None):
        """ c'tor
    """
        if pilotAgentsDB:
            self.pilotAgentsDB = pilotAgentsDB
        else:
            self.pilotAgentsDB = PilotAgentsDB()
        if jobDB:
            self.jobDB = jobDB
        else:
            self.jobDB = JobDB()
        if tqDB:
            self.tqDB = tqDB
        else:
            self.tqDB = TaskQueueDB()
        if jlDB:
            self.jlDB = jlDB
        else:
            self.jlDB = JobLoggingDB()

        if opsHelper:
            self.opsHelper = opsHelper
        else:
            self.opsHelper = Operations()

        self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """ Main job selection function to find the highest priority job matching the resource capacity
    """

        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters
        toPrintDict = dict(resourceDict)
        if "MaxRAM" in resourceDescription:
            toPrintDict['MaxRAM'] = resourceDescription['MaxRAM']
        if "NumberOfProcessors" in resourceDescription:
            toPrintDict['NumberOfProcessors'] = resourceDescription[
                'NumberOfProcessors']
        toPrintDict['Tag'] = []
        if "Tag" in resourceDict:
            for tag in resourceDict['Tag']:
                if not tag.endswith('GB') and not tag.endswith('Processors'):
                    toPrintDict['Tag'].append(tag)
        if not toPrintDict['Tag']:
            toPrintDict.pop('Tag')
        gLogger.info('Resource description for matching',
                     printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(
            resourceDict['Site'])
        result = self.tqDB.matchAndGetJob(resourceDict,
                                          negativeCond=negativeCond)

        if not result['OK']:
            raise RuntimeError(result['Message'])
        result = result['Value']
        if not result['matchFound']:
            self.log.info("No match found")
            return {}

        jobID = result['jobId']
        resAtt = self.jobDB.getJobAttributes(
            jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError("No attributes returned for job")
        if not resAtt['Value']['Status'] == 'Waiting':
            self.log.error('Job matched by the TQ is not in Waiting state',
                           str(jobID))
            result = self.tqDB.deleteJob(jobID)
            if not result['OK']:
                raise RuntimeError(result['Message'])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result['OK']:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {}
        resultDict['JDL'] = result['Value']
        resultDict['JobID'] = jobID

        matchTime = time.time() - startTime
        self.log.info("Match time: [%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt['OK']:
            for key, value in resOpt['Value'].items():
                resultDict[key] = value
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError('No attributes returned for job')

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

        pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag',
                                                 False)
        if not pilotInfoReportedFlag:
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict['DN'] = resAtt['Value']['OwnerDN']
        resultDict['Group'] = resAtt['Value']['OwnerGroup']
        resultDict['PilotInfoReportedFlag'] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """ from resourceDescription to resourceDict (just various mods)
    """
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict['JobType'] = 'Test'

        self.log.verbose("Resource description:")
        for key in resourceDict:
            self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
    """

        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in tagMatchFields:
            if name in resourceDescription and resourceDescription[name]:
                resourceDict[name] = resourceDescription[name]
            rname = 'Required%s' % name
            if rname in resourceDescription:
                resourceDict[rname] = resourceDescription[rname]

        if 'JobID' in resourceDescription:
            resourceDict['JobID'] = resourceDescription['JobID']

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get('MaxRAM')
        if maxRAM:
            try:
                maxRAM = int(maxRAM) / 1000
            except ValueError:
                maxRAM = None
        nProcessors = resourceDescription.get('NumberOfProcessors')
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except ValueError:
                nProcessors = None
        for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]:
            if param and param <= 128:
                paramList = range(2, param + 1)
                paramTags = ['%d%s' % (par, key) for par in paramList]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        if 'Tag' in resourceDict:
            resourceDict['Tag'] = list(set(resourceDict['Tag']))

        for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                  'VirtualOrganization', 'PilotReference', 'PilotBenchmark',
                  'PilotInfoReportedFlag'):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
        attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
        attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if not result['OK']:
            self.log.error(
                "Problem reporting job status",
                "setJobAttributes, jobID = %s: %s" %
                (jobID, result['Message']))
        else:
            self.log.verbose("Set job attributes for jobID %s" % jobID)

        result = self.jlDB.addLoggingRecord(jobID,
                                            status='Matched',
                                            minor='Assigned',
                                            source='Matcher')
        if not result['OK']:
            self.log.error(
                "Problem reporting job status",
                "addLoggingRecord, jobID = %s: %s" %
                (jobID, result['Message']))
        else:
            self.log.verbose("Added logging record for jobID %s" % jobID)

    def _checkMask(self, resourceDict):
        """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
        if 'Site' not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict['Site'])
        if not result['OK']:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result['Message'])
            raise RuntimeError("Internal error")

        if resourceDict['Site'] not in result['Value']:
            return False

        return True

    def _updatePilotInfo(self, resourceDict):
        """ Update pilot information - do not fail if we don't manage to do it
    """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            gridCE = resourceDict.get('GridCE', 'Unknown')
            site = resourceDict.get('Site', 'Unknown')
            benchmark = resourceDict.get('PilotBenchmark', 0.0)
            self.log.verbose(
                'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f'
                % (pilotReference, gridCE, site, benchmark))

            result = self.pilotAgentsDB.setPilotStatus(pilotReference,
                                                       status='Running',
                                                       gridSite=site,
                                                       destination=gridCE,
                                                       benchmark=benchmark)
            if not result['OK']:
                self.log.warn(
                    "Problem updating pilot information",
                    "; setPilotStatus. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """ Update pilot to job mapping information
    """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result['OK']:
                self.log.error(
                    "Problem updating pilot information",
                    ";setCurrentJobID. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))
            result = self.pilotAgentsDB.setJobForPilot(jobID,
                                                       pilotReference,
                                                       updateStatus=False)
            if not result['OK']:
                self.log.error(
                    "Problem updating pilot information",
                    "; setJobForPilot. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))

    def _checkCredentials(self, resourceDict, credDict):
        """ Check if we can get a job given the passed credentials
    """
        if Properties.GENERIC_PILOT in credDict['properties']:
            # You can only match groups in the same VO
            if credDict['group'] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get('VirtualOrganization', '')
            else:
                vo = Registry.getVOForGroup(credDict['group'])
            result = Registry.getGroupsForVO(vo)
            if result['OK']:
                resourceDict['OwnerGroup'] = result['Value']
            else:
                raise RuntimeError(result['Message'])
        else:
            # If it's a private pilot, the DN has to be the same
            if Properties.PILOT in credDict['properties']:
                self.log.notice(
                    "Setting the resource DN to the credentials DN")
                resourceDict['OwnerDN'] = credDict['DN']
            # If it's a job sharing. The group has to be the same and just check that the DN (if any)
            # belongs to the same group
            elif Properties.JOB_SHARING in credDict['properties']:
                resourceDict['OwnerGroup'] = credDict['group']
                self.log.notice(
                    "Setting the resource group to the credentials group")
                if 'OwnerDN' in resourceDict and resourceDict[
                        'OwnerDN'] != credDict['DN']:
                    ownerDN = resourceDict['OwnerDN']
                    result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
                    if not result['OK']:
                        raise RuntimeError(result['Message'])
                    if credDict['group'] not in result['Value']:
                        # DN is not in the same group! bad boy.
                        self.log.notice(
                            "You cannot request jobs from DN %s. It does not belong to your group!"
                            % ownerDN)
                        resourceDict['OwnerDN'] = credDict['DN']
            # Nothing special, group and DN have to be the same
            else:
                resourceDict['OwnerDN'] = credDict['DN']
                resourceDict['OwnerGroup'] = credDict['group']

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """ Check the pilot DIRAC version
    """
        if self.opsHelper.getValue("Pilot/CheckVersion", True):
            if 'ReleaseVersion' not in resourceDict:
                if 'DIRACVersion' not in resourceDict:
                    raise RuntimeError(
                        'Version check requested and not provided by Pilot')
                else:
                    pilotVersion = resourceDict['DIRACVersion']
            else:
                pilotVersion = resourceDict['ReleaseVersion']

            validVersions = self.opsHelper.getValue("Pilot/Version", [])
            if validVersions and pilotVersion not in validVersions:
                raise RuntimeError(
                    'Pilot version does not match the production version %s not in ( %s )'
                    % (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if 'ReleaseProject' not in resourceDict:
                    raise RuntimeError(
                        "Version check requested but expected project %s not received"
                        % validProject)
                if resourceDict['ReleaseProject'] != validProject:
                    raise RuntimeError(
                        "Version check requested \
          but expected project %s != received %s" %
                        (validProject, resourceDict['ReleaseProject']))
Пример #11
0
class Matcher( object ):
  """ Logic for matching
  """

  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ c'tor
    """
    if pilotAgentsDB:
      self.pilotAgentsDB = pilotAgentsDB
    else:
      self.pilotAgentsDB = PilotAgentsDB()
    if jobDB:
      self.jobDB = jobDB
    else:
      self.jobDB = JobDB()
    if tqDB:
      self.tqDB = tqDB
    else:
      self.tqDB = TaskQueueDB()
    if jlDB:
      self.jlDB = jlDB
    else:
      self.jlDB = JobLoggingDB()

    if opsHelper:
      self.opsHelper = opsHelper
    else:
      self.opsHelper = Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )


  def selectJob( self, resourceDescription, credDict ):
    """ Main job selection function to find the highest priority job matching the resource capacity
    """

    startTime = time.time()

    resourceDict = self._getResourceDict( resourceDescription, credDict )

    negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
    result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )

    if not result['OK']:
      return result
    result = result['Value']
    if not result['matchFound']:
      self.log.info( "No match found" )
      raise RuntimeError( "No match found" )

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( "No attributes returned for job" )
    if not resAtt['Value']['Status'] == 'Waiting':
      self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      result = self.tqDB.deleteJob( jobID )
      if not result[ 'OK' ]:
        return result
      raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

    self._reportStatus( resourceDict, jobID )

    result = self.jobDB.getJobJDL( jobID )
    if not result['OK']:
      raise RuntimeError( "Failed to get the job JDL" )

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info( "Match time: [%s]" % str( matchTime ) )
    gMonitor.addMark( "matchTime", matchTime )

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters( jobID )
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( 'No attributes returned for job' )

    if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

    pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
    if not pilotInfoReportedFlag:
      self._updatePilotInfo( resourceDict )
    self._updatePilotJobMapping( resourceDict, jobID )

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict


  def _getResourceDict( self, resourceDescription, credDict ):
    """ from resourceDescription to resourceDict (just various mods)
    """
    resourceDict = self._processResourceDescription( resourceDescription )
    resourceDict = self._checkCredentials( resourceDict, credDict )
    self._checkPilotVersion( resourceDict )
    if not self._checkMask( resourceDict ):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose( "Resource description:" )
    for key in resourceDict:
      self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) )

    return resourceDict

  def _processResourceDescription( self, resourceDescription ):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
    """

    resourceDict = {}
    if type( resourceDescription ) in StringTypes:
      classAdAgent = ClassAd( resourceDescription )
      if not classAdAgent.isOK():
        raise ValueError( 'Illegal Resource JDL' )
      self.log.verbose( classAdAgent.asJDL() )

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute( 'JobID' ):
        resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' )

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ):
        if classAdAgent.lookupAttribute( k ):
          resourceDict[ k ] = classAdAgent.getAttributeString( k )

    else:
      for name in singleValueDefFields:
        if resourceDescription.has_key( name ):
          resourceDict[name] = resourceDescription[name]

      for name in multiValueMatchFields:
        if resourceDescription.has_key( name ):
          resourceDict[name] = resourceDescription[name]

      if resourceDescription.has_key( 'JobID' ):
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ):
        if k in resourceDescription:
          resourceDict[ k ] = resourceDescription[ k ]

    return resourceDict



  def _reportStatus( self, resourceDict, jobID ):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes( jobID, attNames, attValues )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Set job attributes for jobID %s" % jobID )

    result = self.jlDB.addLoggingRecord( jobID,
                                         status = 'Matched',
                                         minor = 'Assigned',
                                         source = 'Matcher' )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Added logging record for jobID %s" % jobID )


  def _checkMask( self, resourceDict ):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
    if not 'Site' in resourceDict:
      self.log.error( "Missing Site Name in Resource JDL" )
      raise RuntimeError( "Missing Site Name in Resource JDL" )

    # Get common site mask and check the agent site
    result = self.jobDB.getSiteMask( siteState = 'Active' )
    if not result['OK']:
      self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] )
      raise RuntimeError( "Internal error" )
    maskList = result['Value']

    if resourceDict['Site'] not in maskList:
      return False

    return True

  def _updatePilotInfo( self, resourceDict ):
    """ Update pilot information - do not fail if we don't manage to do it
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      gridCE = resourceDict.get( 'GridCE', 'Unknown' )
      site = resourceDict.get( 'Site', 'Unknown' )
      benchmark = resourceDict.get( 'PilotBenchmark', 0.0 )
      self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) )

      result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site,
                                                  destination = gridCE, benchmark = benchmark )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _updatePilotJobMapping( self, resourceDict, jobID ):
    """ Update pilot to job mapping information
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )
      result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _checkCredentials( self, resourceDict, credDict ):
    """ Check if we can get a job given the passed credentials
    """
    # Check credentials if not generic pilot
    if Properties.GENERIC_PILOT in credDict[ 'properties' ]:
      # You can only match groups in the same VO
      vo = Registry.getVOForGroup( credDict[ 'group' ] )
      result = Registry.getGroupsForVO( vo )
      if result[ 'OK' ]:
        resourceDict[ 'OwnerGroup' ] = result[ 'Value' ]
      else:
        raise RuntimeError( result['Message'] )
    else:
      # If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict[ 'properties' ]:
        self.log.notice( "Setting the resource DN to the credentials DN" )
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      elif Properties.JOB_SHARING in credDict[ 'properties' ]:
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]
        self.log.notice( "Setting the resource group to the credentials group" )
        if 'OwnerDN'  in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]:
          ownerDN = resourceDict[ 'OwnerDN' ]
          result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] )
          if not result[ 'OK' ]:
            raise RuntimeError( result['Message'] )
          if credDict[ 'group' ] not in result[ 'Value' ]:
            # DN is not in the same group! bad boy.
            self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN )
            resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # Nothing special, group and DN have to be the same
      else:
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]

    return resourceDict

  def _checkPilotVersion( self, resourceDict ):
    """ Check the pilot DIRAC version
    """
    if self.opsHelper.getValue( "Pilot/CheckVersion", True ):
      if 'ReleaseVersion' not in resourceDict:
        if not 'DIRACVersion' in resourceDict:
          raise RuntimeError( 'Version check requested and not provided by Pilot' )
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      validVersions = self.opsHelper.getValue( "Pilot/Version", [] )
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \
                            ( pilotVersion, ",".join( validVersions ) ) )
      # Check project if requested
      validProject = self.opsHelper.getValue( "Pilot/Project", "" )
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError( "Version check requested but expected project %s not received" % validProject )
        if resourceDict[ 'ReleaseProject' ] != validProject:
          raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject,
                                                                                                   resourceDict[ 'ReleaseProject' ] ) )
Пример #12
0
 def initialize(self):
   self.limiter = Limiter(jobDB=gJobDB)
Пример #13
0
class Matcher(object):
    """Logic for matching"""
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None,
                 pilotRef=None):
        """c'tor"""
        if pilotAgentsDB:
            self.pilotAgentsDB = pilotAgentsDB
        else:
            self.pilotAgentsDB = PilotAgentsDB()
        if jobDB:
            self.jobDB = jobDB
        else:
            self.jobDB = JobDB()
        if tqDB:
            self.tqDB = tqDB
        else:
            self.tqDB = TaskQueueDB()
        if jlDB:
            self.jlDB = jlDB
        else:
            self.jlDB = JobLoggingDB()

        if opsHelper:
            self.opsHelper = opsHelper
        else:
            self.opsHelper = Operations()

        if pilotRef:
            self.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.pilotAgentsDB.log = gLogger.getSubLogger("[%s]Matcher" %
                                                          pilotRef)
            self.jobDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.tqDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.jlDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
        else:
            self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB,
                               opsHelper=self.opsHelper,
                               pilotRef=pilotRef)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """Main job selection function to find the highest priority job matching the resource capacity"""

        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters
        toPrintDict = dict(resourceDict)
        if "MaxRAM" in resourceDescription:
            toPrintDict["MaxRAM"] = resourceDescription["MaxRAM"]
        if "NumberOfProcessors" in resourceDescription:
            toPrintDict["NumberOfProcessors"] = resourceDescription[
                "NumberOfProcessors"]
        toPrintDict["Tag"] = []
        if "Tag" in resourceDict:
            for tag in resourceDict["Tag"]:
                if not tag.endswith("GB") and not tag.endswith("Processors"):
                    toPrintDict["Tag"].append(tag)
        if not toPrintDict["Tag"]:
            toPrintDict.pop("Tag")
        self.log.info("Resource description for matching",
                      printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(
            resourceDict["Site"], resourceDict.get("GridCE"))
        result = self.tqDB.matchAndGetJob(resourceDict,
                                          negativeCond=negativeCond)

        if not result["OK"]:
            raise RuntimeError(result["Message"])
        result = result["Value"]
        if not result["matchFound"]:
            self.log.info("No match found")
            return {}

        jobID = result["jobId"]
        resAtt = self.jobDB.getJobAttributes(
            jobID, ["OwnerDN", "OwnerGroup", "Status"])
        if not resAtt["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not resAtt["Value"]:
            raise RuntimeError("No attributes returned for job")
        if not resAtt["Value"]["Status"] == "Waiting":
            self.log.error("Job matched by the TQ is not in Waiting state",
                           str(jobID))
            result = self.tqDB.deleteJob(jobID)
            if not result["OK"]:
                raise RuntimeError(result["Message"])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result["OK"]:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {}
        resultDict["JDL"] = result["Value"]
        resultDict["JobID"] = jobID

        matchTime = time.time() - startTime
        self.log.verbose("Match time", "[%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt["OK"]:
            for key, value in resOpt["Value"].items():
                resultDict[key] = value
        resAtt = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup"])
        if not resAtt["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not resAtt["Value"]:
            raise RuntimeError("No attributes returned for job")

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict["Site"], jobID)

        pilotInfoReportedFlag = resourceDict.get("PilotInfoReportedFlag",
                                                 False)
        if not pilotInfoReportedFlag:
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict["DN"] = resAtt["Value"]["OwnerDN"]
        resultDict["Group"] = resAtt["Value"]["OwnerGroup"]
        resultDict["PilotInfoReportedFlag"] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """from resourceDescription to resourceDict (just various mods)"""
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict["JobType"] = "Test"

        self.log.verbose("Resource description")
        for key in resourceDict:
            self.log.debug("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary

        :param resourceDescription: a ceDict coming from a JobAgent,
                                    for example.
        :return: updated dictionary of resource description parameters
        """

        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        if resourceDescription.get("Tag"):
            tags = resourceDescription["Tag"]
            resourceDict["Tag"] = (tags if isinstance(tags, list) else list(
                {tag.strip("\"' ")
                 for tag in tags.strip("[]").split(",")}))
            if "RequiredTag" in resourceDescription:
                requiredTagsList = (list({
                    tag.strip("\"' ")
                    for tag in resourceDescription["RequiredTag"].strip(
                        "[]").split(",")
                }) if isinstance(resourceDescription["RequiredTag"], str) else
                                    resourceDescription["RequiredTag"])
                resourceDict["RequiredTag"] = requiredTagsList

        if "JobID" in resourceDescription:
            resourceDict["JobID"] = resourceDescription["JobID"]

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get("MaxRAM")
        if maxRAM:
            try:
                maxRAM = int(maxRAM / 1000)
            except ValueError:
                maxRAM = None
        nProcessors = resourceDescription.get("NumberOfProcessors")
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except ValueError:
                nProcessors = None
        for param, key in [(maxRAM, "GB"), (nProcessors, "Processors")]:
            if param and param <= 1024:
                paramList = list(range(2, param + 1))
                paramTags = ["%d%s" % (par, key) for par in paramList]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        # Add 'MultiProcessor' to the list of tags
        if nProcessors and nProcessors > 1:
            resourceDict.setdefault("Tag", []).append("MultiProcessor")

        # Add 'WholeNode' to the list of tags
        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        if "Tag" in resourceDict:
            resourceDict["Tag"] = list(set(resourceDict["Tag"]))
        if "RequiredTag" in resourceDict:
            resourceDict["RequiredTag"] = list(set(
                resourceDict["RequiredTag"]))

        for k in (
                "DIRACVersion",
                "ReleaseVersion",
                "ReleaseProject",
                "VirtualOrganization",
                "PilotReference",
                "PilotBenchmark",
                "PilotInfoReportedFlag",
        ):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
        """
        attNames = ["Status", "MinorStatus", "ApplicationStatus", "Site"]
        attValues = ["Matched", "Assigned", "Unknown", resourceDict["Site"]]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if not result["OK"]:
            self.log.error(
                "Problem reporting job status",
                "setJobAttributes, jobID = %s: %s" %
                (jobID, result["Message"]))
        else:
            self.log.verbose("Set job attributes for jobID", jobID)

        result = self.jlDB.addLoggingRecord(jobID,
                                            status=JobStatus.MATCHED,
                                            minorStatus="Assigned",
                                            source="Matcher")
        if not result["OK"]:
            self.log.error(
                "Problem reporting job status",
                "addLoggingRecord, jobID = %s: %s" %
                (jobID, result["Message"]))
        else:
            self.log.verbose("Added logging record for jobID", jobID)

    def _checkMask(self, resourceDict):
        """Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
        """
        if "Site" not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict["Site"])
        if not result["OK"]:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result["Message"])
            raise RuntimeError("Internal error")

        if resourceDict["Site"] not in result["Value"]:
            return False

        return True

    def _updatePilotInfo(self, resourceDict):
        """Update pilot information - do not fail if we don't manage to do it"""
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            gridCE = resourceDict.get("GridCE", "Unknown")
            site = resourceDict.get("Site", "Unknown")
            benchmark = resourceDict.get("PilotBenchmark", 0.0)
            self.log.verbose(
                "Reporting pilot info",
                "for %s: gridCE=%s, site=%s, benchmark=%f" %
                (pilotReference, gridCE, site, benchmark),
            )

            result = self.pilotAgentsDB.setPilotStatus(
                pilotReference,
                status=PilotStatus.RUNNING,
                gridSite=site,
                destination=gridCE,
                benchmark=benchmark)
            if not result["OK"]:
                self.log.warn(
                    "Problem updating pilot information",
                    "; setPilotStatus. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """Update pilot to job mapping information"""
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    ";setCurrentJobID. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )
            result = self.pilotAgentsDB.setJobForPilot(jobID,
                                                       pilotReference,
                                                       updateStatus=False)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    "; setJobForPilot. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )

    def _checkCredentials(self, resourceDict, credDict):
        """Check if we can get a job given the passed credentials"""
        if Properties.GENERIC_PILOT in credDict["properties"]:
            # You can only match groups in the same VO
            if credDict["group"] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get("VirtualOrganization", "")
            else:
                vo = Registry.getVOForGroup(credDict["group"])
            if "OwnerGroup" not in resourceDict:
                result = Registry.getGroupsForVO(vo)
                if result["OK"]:
                    resourceDict["OwnerGroup"] = result["Value"]
                else:
                    raise RuntimeError(result["Message"])
        else:
            # If it's a private pilot, the DN has to be the same
            if Properties.PILOT in credDict["properties"]:
                self.log.notice(
                    "Setting the resource DN to the credentials DN")
                resourceDict["OwnerDN"] = credDict["DN"]
            # If it's a job sharing. The group has to be the same and just check that the DN (if any)
            # belongs to the same group
            elif Properties.JOB_SHARING in credDict["properties"]:
                resourceDict["OwnerGroup"] = credDict["group"]
                self.log.notice(
                    "Setting the resource group to the credentials group")
                if "OwnerDN" in resourceDict and resourceDict[
                        "OwnerDN"] != credDict["DN"]:
                    ownerDN = resourceDict["OwnerDN"]
                    result = Registry.getGroupsForDN(resourceDict["OwnerDN"])
                    if not result["OK"]:
                        raise RuntimeError(result["Message"])
                    if credDict["group"] not in result["Value"]:
                        # DN is not in the same group! bad boy.
                        self.log.warn(
                            "You cannot request jobs from this DN, as it does not belong to your group!",
                            "(%s)" % ownerDN,
                        )
                        resourceDict["OwnerDN"] = credDict["DN"]
            # Nothing special, group and DN have to be the same
            else:
                resourceDict["OwnerDN"] = credDict["DN"]
                resourceDict["OwnerGroup"] = credDict["group"]

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """Check the pilot DIRAC version"""
        if self.opsHelper.getValue("Pilot/CheckVersion", True):
            if "ReleaseVersion" not in resourceDict:
                if "DIRACVersion" not in resourceDict:
                    raise PilotVersionError(
                        "Version check requested and not provided by Pilot")
                else:
                    pilotVersion = resourceDict["DIRACVersion"]
            else:
                pilotVersion = resourceDict["ReleaseVersion"]

            validVersions = [
                convertToPy3VersionNumber(newStyleVersion)
                for newStyleVersion in self.opsHelper.getValue(
                    "Pilot/Version", [])
            ]
            if validVersions and convertToPy3VersionNumber(
                    pilotVersion) not in validVersions:
                raise PilotVersionError(
                    "Pilot version does not match the production version: %s not in ( %s )"
                    % (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if "ReleaseProject" not in resourceDict:
                    raise PilotVersionError(
                        "Version check requested but expected project %s not received"
                        % validProject)
                if resourceDict["ReleaseProject"] != validProject:
                    raise PilotVersionError(
                        "Version check requested but expected project %s != received %s"
                        % (validProject, resourceDict["ReleaseProject"]))