示例#1
0
  def __checkSitesInMask( self, job, siteCandidates ):
    """ Keep only the candidate sites that are usable for 'ComputingAccess'.

        :param job: job identifier, only used in the log messages
        :param siteCandidates: iterable of candidate site names
        :return: S_OK( candidates found in the mask, in input order ), or
                 S_ERROR when the usable sites could not be obtained
    """

    maskResult = SiteStatus().getUsableSites( 'ComputingAccess' )
    if not maskResult['OK']:
      return S_ERROR( 'Could not get site mask' )

    sitesInMask = maskResult['Value']
    accepted = []
    for site in siteCandidates:
      if site in sitesInMask:
        accepted.append( site )
        continue
      # Rejected candidates are only reported at verbose level
      self.log.verbose( '%s is a candidate site for job %s but not in mask' % ( site, job ) )

    self.log.info( 'Candidate sites in Mask are %s' % ( accepted ) )

    return S_OK( accepted )
示例#2
0
    def getSiteMask(self, printOutput=False):
        """Retrieve the current site mask: the sites usable for 'ComputingAccess'.

        The mask is obtained from ``SiteStatus().getUsableSites``.

        Example usage::

            result = diracAdmin.getSiteMask()
            if result['OK']:
                sites = result['Value']

        :param bool printOutput: when True and the query succeeded, the site
                                 list is sorted in place and printed, one site
                                 per line
        :returns: the result structure of ``getUsableSites``, passed through
                  as is (S_OK with the sites in 'Value', or S_ERROR)
        """

        result = SiteStatus().getUsableSites('ComputingAccess')
        if result['OK'] and printOutput:
            sites = result['Value']
            # In-place sort: the list held by the returned result is reordered too
            sites.sort()
            for site in sites:
                # Single-argument call form prints the same on Python 2 and 3
                print(site)

        return result
示例#3
0
  def getSiteMask( self, printOutput = False ):
    """Retrieve the current site mask: the sites usable for 'ComputingAccess'.

       The mask is obtained from ``SiteStatus().getUsableSites``.

       Example usage::

           result = diracAdmin.getSiteMask()
           if result['OK']:
               sites = result['Value']

       :param bool printOutput: when True and the query succeeded, the site
                                list is sorted in place and printed, one site
                                per line
       :returns: the result structure of ``getUsableSites``, passed through
                 as is (S_OK with the sites in 'Value', or S_ERROR)
    """

    result = SiteStatus().getUsableSites( 'ComputingAccess' )
    if result['OK'] and printOutput:
      sites = result['Value']
      # In-place sort: the list held by the returned result is reordered too
      sites.sort()
      for site in sites:
        # Single-argument call form prints the same on Python 2 and 3
        print( site )

    return result
示例#4
0
    def __checkSitesInMask(self, job, siteCandidates):
        """Keep only the candidate sites that are usable for 'ComputingAccess'.

        :param job: job identifier, only used in the log messages
        :param siteCandidates: iterable of candidate site names
        :return: S_OK(candidates found in the mask, in input order), or
                 S_ERROR when the usable sites could not be obtained
        """

        maskResult = SiteStatus().getUsableSites('ComputingAccess')
        if not maskResult['OK']:
            return S_ERROR('Could not get site mask')

        sitesInMask = maskResult['Value']
        accepted = []
        for site in siteCandidates:
            if site in sitesInMask:
                accepted.append(site)
                continue
            # Rejected candidates are only reported at verbose level
            self.log.verbose(
                '%s is a candidate site for job %s but not in mask' %
                (site, job))

        self.log.info('Candidate sites in Mask are %s' % (accepted))

        return S_OK(accepted)
示例#5
0
class Matcher(object):
    """Logic for matching a job from the task queues to a resource description.

    :meth:`selectJob` is the entry point; the other methods build the resource
    dictionary used for the match and record the outcome of a successful match.
    Failures are signalled by raising ``RuntimeError``.
    """

    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None):
        """Store the collaborators, building a default for each one not given.

        :param pilotAgentsDB: used instead of ``PilotAgentsDB()`` when truthy
        :param jobDB: used instead of ``JobDB()`` when truthy
        :param tqDB: used instead of ``TaskQueueDB()`` when truthy
        :param jlDB: used instead of ``JobLoggingDB()`` when truthy
        :param opsHelper: used instead of ``Operations()`` when truthy
        """
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """Main job selection function to find the highest priority job
        matching the resource capacity.

        :param dict resourceDescription: description of the resource asking
                                         for a job
        :param dict credDict: credentials of the requester; the keys
                              'properties', 'group' and 'DN' are read
        :return: ``{}`` when the task queue reports no match, otherwise a dict
                 holding 'JDL', 'JobID', 'DN', 'Group',
                 'PilotInfoReportedFlag' and the job's optional parameters
        :raises RuntimeError: when a mandatory step of the match fails
        """

        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters
        toPrintDict = dict(resourceDict)
        if "MaxRAM" in resourceDescription:
            toPrintDict['MaxRAM'] = resourceDescription['MaxRAM']
        if "NumberOfProcessors" in resourceDescription:
            toPrintDict['NumberOfProcessors'] = resourceDescription[
                'NumberOfProcessors']
        # The generated '<n>GB' / '<n>Processors' tags are left out of the
        # printout; MaxRAM and NumberOfProcessors are shown instead
        toPrintDict['Tag'] = [
            tag for tag in resourceDict.get('Tag', [])
            if not tag.endswith(('GB', 'Processors'))
        ]
        if not toPrintDict['Tag']:
            toPrintDict.pop('Tag')
        gLogger.info('Resource description for matching',
                     printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(
            resourceDict['Site'])
        result = self.tqDB.matchAndGetJob(resourceDict,
                                          negativeCond=negativeCond)

        if not result['OK']:
            raise RuntimeError(result['Message'])
        result = result['Value']
        if not result['matchFound']:
            self.log.info("No match found")
            return {}

        jobID = result['jobId']
        resAtt = self.jobDB.getJobAttributes(
            jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError("No attributes returned for job")
        if resAtt['Value']['Status'] != 'Waiting':
            # The job is removed from the task queue before reporting the error
            self.log.error('Job matched by the TQ is not in Waiting state',
                           str(jobID))
            result = self.tqDB.deleteJob(jobID)
            if not result['OK']:
                raise RuntimeError(result['Message'])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result['OK']:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {}
        resultDict['JDL'] = result['Value']
        resultDict['JobID'] = jobID

        matchTime = time.time() - startTime
        self.log.info("Match time: [%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt['OK']:
            for key, value in resOpt['Value'].items():
                resultDict[key] = value
        # The owner attributes are read again here, after the status update
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError('No attributes returned for job')

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

        pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag',
                                                 False)
        if not pilotInfoReportedFlag:
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict['DN'] = resAtt['Value']['OwnerDN']
        resultDict['Group'] = resAtt['Value']['OwnerGroup']
        resultDict['PilotInfoReportedFlag'] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """Build the resource dictionary from the resource description.

        Applies, in order, the description processing, the credentials check,
        the pilot version check and the site mask check.

        :param dict resourceDescription: description of the resource
        :param dict credDict: credentials of the requester
        :return: the resource dictionary used for the matching
        :raises RuntimeError: propagated from the individual checks
        """
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict['JobType'] = 'Test'

        self.log.verbose("Resource description:")
        for key in resourceDict:
            self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary.

        The input dictionary and the lists it holds are not modified.

        :param dict resourceDescription: a ceDict coming from a JobAgent, for
                                         example
        :return: new dictionary of resource description parameters
        """

        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in tagMatchFields:
            if name in resourceDescription and resourceDescription[name]:
                resourceDict[name] = resourceDescription[name]
            rname = 'Required%s' % name
            if rname in resourceDescription:
                resourceDict[rname] = resourceDescription[rname]

        if 'JobID' in resourceDescription:
            resourceDict['JobID'] = resourceDescription['JobID']

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get('MaxRAM')
        if maxRAM:
            try:
                # Floor division keeps an integer on Python 2 and 3 alike;
                # a float here would break the range() call below
                maxRAM = int(maxRAM) // 1000
            except (ValueError, TypeError):
                maxRAM = None
        nProcessors = resourceDescription.get('NumberOfProcessors')
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except (ValueError, TypeError):
                nProcessors = None

        extraTags = []
        for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]:
            # Values above 128 generate no tags at all
            if param and param <= 128:
                extraTags.extend('%d%s' % (par, key)
                                 for par in range(2, param + 1))

        if "WholeNode" in resourceDescription:
            extraTags.append("WholeNode")

        if extraTags:
            # Build a fresh list: resourceDict['Tag'] may be the very list
            # object owned by the caller's resourceDescription
            resourceDict['Tag'] = list(resourceDict.get('Tag',
                                                        [])) + extraTags

        if 'Tag' in resourceDict:
            # Remove duplicates; the resulting order is not defined
            resourceDict['Tag'] = list(set(resourceDict['Tag']))

        for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                  'VirtualOrganization', 'PilotReference', 'PilotBenchmark',
                  'PilotInfoReportedFlag'):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """Report the status of the matched job in jobDB and jobLoggingDB.

        Do not fail if errors happen here: failures are only logged.

        :param dict resourceDict: resource dictionary; 'Site' is read
        :param jobID: identifier of the matched job
        """
        attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
        attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if not result['OK']:
            self.log.error(
                "Problem reporting job status",
                "setJobAttributes, jobID = %s: %s" %
                (jobID, result['Message']))
        else:
            self.log.verbose("Set job attributes for jobID %s" % jobID)

        result = self.jlDB.addLoggingRecord(jobID,
                                            status='Matched',
                                            minor='Assigned',
                                            source='Matcher')
        if not result['OK']:
            self.log.error(
                "Problem reporting job status",
                "addLoggingRecord, jobID = %s: %s" %
                (jobID, result['Message']))
        else:
            self.log.verbose("Added logging record for jobID %s" % jobID)

    def _checkMask(self, resourceDict):
        """Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?

        :param dict resourceDict: resource dictionary; 'Site' is mandatory
        :return: True when the site is among the usable sites, else False
        :raises RuntimeError: when 'Site' is missing or the usable sites
                              cannot be obtained
        """
        if 'Site' not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict['Site'])
        if not result['OK']:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result['Message'])
            raise RuntimeError("Internal error")

        return resourceDict['Site'] in result['Value']

    def _updatePilotInfo(self, resourceDict):
        """Update pilot information - do not fail if we don't manage to do it.

        Nothing is done when the resource dictionary carries no
        'PilotReference'.

        :param dict resourceDict: resource dictionary
        """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            gridCE = resourceDict.get('GridCE', 'Unknown')
            site = resourceDict.get('Site', 'Unknown')
            benchmark = resourceDict.get('PilotBenchmark', 0.0)
            self.log.verbose(
                'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f'
                % (pilotReference, gridCE, site, benchmark))

            result = self.pilotAgentsDB.setPilotStatus(pilotReference,
                                                       status='Running',
                                                       gridSite=site,
                                                       destination=gridCE,
                                                       benchmark=benchmark)
            if not result['OK']:
                self.log.warn(
                    "Problem updating pilot information",
                    "; setPilotStatus. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """Update pilot to job mapping information.

        Nothing is done when the resource dictionary carries no
        'PilotReference'; failures of the DB calls are only logged.

        :param dict resourceDict: resource dictionary
        :param jobID: identifier of the matched job
        """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result['OK']:
                self.log.error(
                    "Problem updating pilot information",
                    ";setCurrentJobID. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))
            result = self.pilotAgentsDB.setJobForPilot(jobID,
                                                       pilotReference,
                                                       updateStatus=False)
            if not result['OK']:
                self.log.error(
                    "Problem updating pilot information",
                    "; setJobForPilot. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))

    def _checkCredentials(self, resourceDict, credDict):
        """Check if we can get a job given the passed credentials.

        The owner constraints ('OwnerDN' / 'OwnerGroup') of the resource
        dictionary are set according to the properties of the credentials.

        :param dict resourceDict: resource dictionary, updated in place
        :param dict credDict: credentials; 'properties', 'group' and 'DN'
                              are read
        :return: the updated resource dictionary
        :raises RuntimeError: when a Registry lookup fails
        """
        if Properties.GENERIC_PILOT in credDict['properties']:
            # You can only match groups in the same VO
            if credDict['group'] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get('VirtualOrganization', '')
            else:
                vo = Registry.getVOForGroup(credDict['group'])
            result = Registry.getGroupsForVO(vo)
            if result['OK']:
                resourceDict['OwnerGroup'] = result['Value']
            else:
                raise RuntimeError(result['Message'])
        else:
            # If it's a private pilot, the DN has to be the same
            if Properties.PILOT in credDict['properties']:
                self.log.notice(
                    "Setting the resource DN to the credentials DN")
                resourceDict['OwnerDN'] = credDict['DN']
            # If it's a job sharing. The group has to be the same and just check that the DN (if any)
            # belongs to the same group
            elif Properties.JOB_SHARING in credDict['properties']:
                resourceDict['OwnerGroup'] = credDict['group']
                self.log.notice(
                    "Setting the resource group to the credentials group")
                if 'OwnerDN' in resourceDict and resourceDict[
                        'OwnerDN'] != credDict['DN']:
                    ownerDN = resourceDict['OwnerDN']
                    result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
                    if not result['OK']:
                        raise RuntimeError(result['Message'])
                    if credDict['group'] not in result['Value']:
                        # DN is not in the same group! bad boy.
                        self.log.notice(
                            "You cannot request jobs from DN %s. It does not belong to your group!"
                            % ownerDN)
                        resourceDict['OwnerDN'] = credDict['DN']
            # Nothing special, group and DN have to be the same
            else:
                resourceDict['OwnerDN'] = credDict['DN']
                resourceDict['OwnerGroup'] = credDict['group']

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """Check the pilot DIRAC version.

        The check is skipped when the "Pilot/CheckVersion" option is false.
        'ReleaseVersion' takes precedence over 'DIRACVersion'.

        :param dict resourceDict: resource dictionary
        :raises RuntimeError: when no version is provided, the version is not
                              among the valid ones, or the project is missing
                              or different from the expected one
        """
        if self.opsHelper.getValue("Pilot/CheckVersion", True):
            if 'ReleaseVersion' not in resourceDict:
                if 'DIRACVersion' not in resourceDict:
                    raise RuntimeError(
                        'Version check requested and not provided by Pilot')
                else:
                    pilotVersion = resourceDict['DIRACVersion']
            else:
                pilotVersion = resourceDict['ReleaseVersion']

            # An empty list of valid versions accepts any pilot version
            validVersions = self.opsHelper.getValue("Pilot/Version", [])
            if validVersions and pilotVersion not in validVersions:
                raise RuntimeError(
                    'Pilot version does not match the production version %s not in ( %s )'
                    % (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if 'ReleaseProject' not in resourceDict:
                    raise RuntimeError(
                        "Version check requested but expected project %s not received"
                        % validProject)
                if resourceDict['ReleaseProject'] != validProject:
                    # Implicit literal concatenation keeps the source
                    # indentation out of the message
                    raise RuntimeError(
                        "Version check requested "
                        "but expected project %s != received %s" %
                        (validProject, resourceDict['ReleaseProject']))
示例#6
0
class Matcher(object):
  """ Logic for matching a job from the task queues to a resource description.

      :meth:`selectJob` is the entry point; the other methods build the resource
      dictionary used for the match and record the outcome of a successful match.
      Failures are signalled by raising ``RuntimeError``.
  """

  def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ Store the collaborators, building a default for each one not given.

        :param pilotAgentsDB: used instead of ``PilotAgentsDB()`` when truthy
        :param jobDB: used instead of ``JobDB()`` when truthy
        :param tqDB: used instead of ``TaskQueueDB()`` when truthy
        :param jlDB: used instead of ``JobLoggingDB()`` when truthy
        :param opsHelper: used instead of ``Operations()`` when truthy
    """
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger("Matcher")

    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

    self.siteClient = SiteStatus()

  def selectJob(self, resourceDescription, credDict):
    """ Main job selection function to find the highest priority job matching the resource capacity

        :param dict resourceDescription: description of the resource asking for a job
        :param dict credDict: credentials of the requester; the keys 'properties',
                              'group' and 'DN' are read
        :return: ``{}`` when the task queue reports no match, otherwise a dict holding
                 'JDL', 'JobID', 'DN', 'Group', 'PilotInfoReportedFlag' and the
                 job's optional parameters
        :raises RuntimeError: when a mandatory step of the match fails
    """

    startTime = time.time()

    resourceDict = self._getResourceDict(resourceDescription, credDict)

    # Make a nice print of the resource matching parameters
    toPrintDict = dict(resourceDict)
    if "MaxRAM" in resourceDescription:
      toPrintDict['MaxRAM'] = resourceDescription['MaxRAM']
    if "NumberOfProcessors" in resourceDescription:
      toPrintDict['NumberOfProcessors'] = resourceDescription['NumberOfProcessors']
    # The generated '<n>GB' / '<n>Processors' tags are left out of the printout;
    # MaxRAM and NumberOfProcessors are shown instead
    toPrintDict['Tag'] = [tag for tag in resourceDict.get('Tag', [])
                          if not tag.endswith(('GB', 'Processors'))]
    if not toPrintDict['Tag']:
      toPrintDict.pop('Tag')
    gLogger.info('Resource description for matching', printDict(toPrintDict))

    negativeCond = self.limiter.getNegativeCondForSite(resourceDict['Site'])
    result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)

    if not result['OK']:
      raise RuntimeError(result['Message'])
    result = result['Value']
    if not result['matchFound']:
      self.log.info("No match found")
      return {}

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
    if not resAtt['OK']:
      raise RuntimeError('Could not retrieve job attributes')
    if not resAtt['Value']:
      raise RuntimeError("No attributes returned for job")
    if resAtt['Value']['Status'] != 'Waiting':
      # The job is removed from the task queue before reporting the error
      self.log.error('Job matched by the TQ is not in Waiting state', str(jobID))
      result = self.tqDB.deleteJob(jobID)
      if not result['OK']:
        raise RuntimeError(result['Message'])
      raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

    self._reportStatus(resourceDict, jobID)

    result = self.jobDB.getJobJDL(jobID)
    if not result['OK']:
      raise RuntimeError("Failed to get the job JDL")

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info("Match time: [%s]" % str(matchTime))
    gMonitor.addMark("matchTime", matchTime)

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters(jobID)
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    # The owner attributes are read again here, after the status update
    resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
    if not resAtt['OK']:
      raise RuntimeError('Could not retrieve job attributes')
    if not resAtt['Value']:
      raise RuntimeError('No attributes returned for job')

    if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
      self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

    pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag', False)
    if not pilotInfoReportedFlag:
      self._updatePilotInfo(resourceDict)
    self._updatePilotJobMapping(resourceDict, jobID)

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict

  def _getResourceDict(self, resourceDescription, credDict):
    """ Build the resource dictionary from the resource description.

        Applies, in order, the description processing, the credentials check,
        the pilot version check and the site mask check.

        :param dict resourceDescription: description of the resource
        :param dict credDict: credentials of the requester
        :return: the resource dictionary used for the matching
        :raises RuntimeError: propagated from the individual checks
    """
    resourceDict = self._processResourceDescription(resourceDescription)
    resourceDict = self._checkCredentials(resourceDict, credDict)
    self._checkPilotVersion(resourceDict)
    if not self._checkMask(resourceDict):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose("Resource description:")
    for key in resourceDict:
      self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

    return resourceDict

  def _processResourceDescription(self, resourceDescription):
    """ Check and form the resource description dictionary

        The input dictionary and the lists it holds are not modified.

        :param resourceDescription: a ceDict coming from a JobAgent,
                                    for example.
        :return: updated dictionary of resource description parameters
    """

    resourceDict = {}
    for name in singleValueDefFields:
      if name in resourceDescription:
        resourceDict[name] = resourceDescription[name]

    for name in multiValueMatchFields:
      if name in resourceDescription:
        resourceDict[name] = resourceDescription[name]

    # 'RequiredTag' is only taken into account together with a non-empty 'Tag'
    if resourceDescription.get('Tag'):
      resourceDict['Tag'] = resourceDescription['Tag']
      if 'RequiredTag' in resourceDescription:
        resourceDict['RequiredTag'] = resourceDescription['RequiredTag']

    if 'JobID' in resourceDescription:
      resourceDict['JobID'] = resourceDescription['JobID']

    # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
    maxRAM = resourceDescription.get('MaxRAM')
    if maxRAM:
      try:
        # Floor division keeps an integer on Python 2 and 3 alike;
        # a float here would break the range() call below
        maxRAM = int(maxRAM) // 1000
      except (ValueError, TypeError):
        maxRAM = None
    nProcessors = resourceDescription.get('NumberOfProcessors')
    if nProcessors:
      try:
        nProcessors = int(nProcessors)
      except (ValueError, TypeError):
        nProcessors = None

    extraTags = []
    for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]:
      # Values above 128 generate no tags at all
      if param and param <= 128:
        extraTags.extend('%d%s' % (par, key) for par in range(2, param + 1))

    # Add 'MultiProcessor' to the list of tags
    # (nProcessors may be None or an empty value here, hence the truth test)
    if nProcessors and nProcessors > 1:
      extraTags.append("MultiProcessor")

    # Add 'WholeNode' to the list of tags
    if "WholeNode" in resourceDescription:
      extraTags.append("WholeNode")

    if extraTags:
      # Build a fresh list: resourceDict['Tag'] may be the very list object
      # owned by the caller's resourceDescription
      resourceDict['Tag'] = list(resourceDict.get('Tag', [])) + extraTags

    if 'Tag' in resourceDict:
      # Remove duplicates; the resulting order is not defined
      resourceDict['Tag'] = list(set(resourceDict['Tag']))

    for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
              'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag'):
      if k in resourceDescription:
        resourceDict[k] = resourceDescription[k]

    return resourceDict

  def _reportStatus(self, resourceDict, jobID):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here: failures are only logged.

        :param dict resourceDict: resource dictionary; 'Site' is read
        :param jobID: identifier of the matched job
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
    if not result['OK']:
      self.log.error("Problem reporting job status",
                     "setJobAttributes, jobID = %s: %s" % (jobID, result['Message']))
    else:
      self.log.verbose("Set job attributes for jobID %s" % jobID)

    result = self.jlDB.addLoggingRecord(jobID,
                                        status='Matched',
                                        minor='Assigned',
                                        source='Matcher')
    if not result['OK']:
      self.log.error("Problem reporting job status",
                     "addLoggingRecord, jobID = %s: %s" % (jobID, result['Message']))
    else:
      self.log.verbose("Added logging record for jobID %s" % jobID)

  def _checkMask(self, resourceDict):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?

        :param dict resourceDict: resource dictionary; 'Site' is mandatory
        :return: True when the site is among the usable sites, else False
        :raises RuntimeError: when 'Site' is missing or the usable sites cannot be obtained
    """
    if 'Site' not in resourceDict:
      self.log.error("Missing Site Name in Resource JDL")
      raise RuntimeError("Missing Site Name in Resource JDL")

    # Check if site is allowed
    result = self.siteClient.getUsableSites(resourceDict['Site'])
    if not result['OK']:
      self.log.error("Internal error",
                     "siteClient.getUsableSites: %s" % result['Message'])
      raise RuntimeError("Internal error")

    return resourceDict['Site'] in result['Value']

  def _updatePilotInfo(self, resourceDict):
    """ Update pilot information - do not fail if we don't manage to do it

        Nothing is done when the resource dictionary carries no 'PilotReference'.

        :param dict resourceDict: resource dictionary
    """
    pilotReference = resourceDict.get('PilotReference', '')
    if pilotReference:
      gridCE = resourceDict.get('GridCE', 'Unknown')
      site = resourceDict.get('Site', 'Unknown')
      benchmark = resourceDict.get('PilotBenchmark', 0.0)
      self.log.verbose('Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % (pilotReference,
                                                                                          gridCE,
                                                                                          site,
                                                                                          benchmark))

      result = self.pilotAgentsDB.setPilotStatus(pilotReference, status='Running', gridSite=site,
                                                 destination=gridCE, benchmark=benchmark)
      if not result['OK']:
        self.log.warn("Problem updating pilot information",
                      "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result['Message']))

  def _updatePilotJobMapping(self, resourceDict, jobID):
    """ Update pilot to job mapping information

        Nothing is done when the resource dictionary carries no 'PilotReference';
        failures of the DB calls are only logged.

        :param dict resourceDict: resource dictionary
        :param jobID: identifier of the matched job
    """
    pilotReference = resourceDict.get('PilotReference', '')
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
      if not result['OK']:
        self.log.error("Problem updating pilot information",
                       ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result['Message']))
      result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)
      if not result['OK']:
        self.log.error("Problem updating pilot information",
                       "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result['Message']))

  def _checkCredentials(self, resourceDict, credDict):
    """ Check if we can get a job given the passed credentials

        The owner constraints ('OwnerDN' / 'OwnerGroup') of the resource dictionary
        are set according to the properties of the credentials.

        :param dict resourceDict: resource dictionary, updated in place
        :param dict credDict: credentials; 'properties', 'group' and 'DN' are read
        :return: the updated resource dictionary
        :raises RuntimeError: when a Registry lookup fails
    """
    if Properties.GENERIC_PILOT in credDict['properties']:
      # You can only match groups in the same VO
      if credDict['group'] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get('VirtualOrganization', '')
      else:
        vo = Registry.getVOForGroup(credDict['group'])
      # An 'OwnerGroup' already present in the resource dictionary is kept
      if 'OwnerGroup' not in resourceDict:
        result = Registry.getGroupsForVO(vo)
        if result['OK']:
          resourceDict['OwnerGroup'] = result['Value']
        else:
          raise RuntimeError(result['Message'])
    else:
      # If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict['properties']:
        self.log.notice("Setting the resource DN to the credentials DN")
        resourceDict['OwnerDN'] = credDict['DN']
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      elif Properties.JOB_SHARING in credDict['properties']:
        resourceDict['OwnerGroup'] = credDict['group']
        self.log.notice("Setting the resource group to the credentials group")
        if 'OwnerDN' in resourceDict and resourceDict['OwnerDN'] != credDict['DN']:
          ownerDN = resourceDict['OwnerDN']
          result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
          if not result['OK']:
            raise RuntimeError(result['Message'])
          if credDict['group'] not in result['Value']:
            # DN is not in the same group! bad boy.
            self.log.notice("You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN)
            resourceDict['OwnerDN'] = credDict['DN']
      # Nothing special, group and DN have to be the same
      else:
        resourceDict['OwnerDN'] = credDict['DN']
        resourceDict['OwnerGroup'] = credDict['group']

    return resourceDict

  def _checkPilotVersion(self, resourceDict):
    """ Check the pilot DIRAC version

        The check is skipped when the "Pilot/CheckVersion" option is false.
        'ReleaseVersion' takes precedence over 'DIRACVersion'.

        :param dict resourceDict: resource dictionary
        :raises RuntimeError: when no version is provided, the version is not among
                              the valid ones, or the project is missing or different
                              from the expected one
    """
    if self.opsHelper.getValue("Pilot/CheckVersion", True):
      if 'ReleaseVersion' not in resourceDict:
        if 'DIRACVersion' not in resourceDict:
          raise RuntimeError('Version check requested and not provided by Pilot')
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      # An empty list of valid versions accepts any pilot version
      validVersions = self.opsHelper.getValue("Pilot/Version", [])
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError('Pilot version does not match the production version %s not in ( %s )' %
                           (pilotVersion, ",".join(validVersions)))
      # Check project if requested
      validProject = self.opsHelper.getValue("Pilot/Project", "")
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError("Version check requested but expected project %s not received" % validProject)
        if resourceDict['ReleaseProject'] != validProject:
          # Implicit literal concatenation keeps the source indentation out of the message
          raise RuntimeError("Version check requested "
                             "but expected project %s != received %s" % (validProject,
                                                                         resourceDict['ReleaseProject']))
示例#7
0
class CloudDirector(AgentModule):
    """The CloudDirector works like a SiteDirector for cloud sites:
    It looks at the queued jobs in the task queues and attempts to
    start VM instances to meet the current demand.
    """

    def __init__(self, *args, **kwargs):
        """Create the agent with empty caches and default flags.

        Only placeholders are set here; the option-dependent attributes are
        filled in by :meth:`beginExecution`.
        """
        super(CloudDirector, self).__init__(*args, **kwargs)

        # Community / credentials the director works for
        self.vo = self.group = ""
        self.cloudDN = self.cloudGroup = ""
        # self.voGroups contain all the eligible user groups for clouds submitted by this SiteDirector
        self.voGroups = []
        self.proxy = None

        # VM type descriptions, cached endpoint objects and failure counters
        self.vmTypeDict = {}
        self.vmTypeCECache = {}
        self.vmTypeSlots = {}
        self.failedVMTypes = defaultdict(int)

        # Sites and platforms collected from the VM type descriptions
        self.sites = []
        self.platforms = []
        self.siteClient = None

        # Behaviour flags
        self.firstPass = True
        self.updateStatus = True
        self.getOutput = False
        self.sendAccounting = True

    def initialize(self):
        """Create the SiteStatus client stored in ``self.siteClient``.

        :return: S_OK()
        """
        siteClient = SiteStatus()
        self.siteClient = siteClient
        return S_OK()

    def beginExecution(self):
        """Read the agent options and rebuild the VM type descriptions.

        Resolves the VO and the eligible user groups, looks up the generic cloud
        credentials, reads the "Site" / "CEs" selections and hands the resulting
        VM type configuration to :meth:`getEndpoints`.

        :return: S_OK(), or the failed result of the first lookup that went wrong
        """
        # The Director is for a particular user community
        self.vo = self.am_getOption("VO", "") or CSGlobals.getVO()
        # The SiteDirector is for a particular user group
        self.group = self.am_getOption("Group", "")

        # Choose the group for which clouds will be submitted. This is a hack until
        # we will be able to match clouds to VOs.
        if self.group:
            self.voGroups = [self.group]
        elif self.vo:
            result = Registry.getGroupsForVO(self.vo)
            if not result["OK"]:
                return result
            self.voGroups = [
                voGroup for voGroup in result["Value"] if "NormalUser" in Registry.getPropertiesForGroup(voGroup)
            ]

        result = findGenericCloudCredentials(vo=self.vo)
        if not result["OK"]:
            return result
        self.cloudDN, self.cloudGroup = result["Value"]
        self.maxVMsToSubmit = self.am_getOption("MaxVMsToSubmit", 1)
        self.runningPod = self.am_getOption("RunningPod", self.vo)

        # Get the site description dictionary; "Any" (or an empty selection) means no restriction
        siteNames = None
        if self.am_getOption("Site", "Any").lower() != "any":
            siteNames = self.am_getOption("Site", []) or None
        ces = None
        if self.am_getOption("CEs", "Any").lower() != "any":
            ces = self.am_getOption("CEs", []) or None

        result = getVMTypes(vo=self.vo, siteList=siteNames)
        if not result["OK"]:
            return result
        result = self.getEndpoints(result["Value"])
        if not result["OK"]:
            return result

        self.log.always("Sites:", siteNames)
        self.log.always("CEs:", ces)
        self.log.always("CloudDN:", self.cloudDN)
        self.log.always("CloudGroup:", self.cloudGroup)

        self.localhost = socket.getfqdn()
        self.proxy = ""

        # The list of served VM types is printed on the first pass only
        if self.firstPass and self.vmTypeDict:
            self.log.always("Agent will serve VM types:")
            for vmType, description in self.vmTypeDict.items():
                self.log.always(
                    "Site: %s, CE: %s, VMType: %s" % (description["Site"], description["CEName"], vmType)
                )
        self.firstPass = False
        return S_OK()

    def __generateVMTypeHash(self, vmTypeDict):
        """Generate a hash of the queue description"""
        myMD5 = hashlib.md5()
        myMD5.update(str(sorted(vmTypeDict.items())).encode())
        hexstring = myMD5.hexdigest()
        return hexstring

    def getEndpoints(self, resourceDict):
        """Build the VM type descriptions and their cloud endpoint objects.

        For every site / CE / VM type found in ``resourceDict`` an entry named
        ``<ce>_<vmType>`` is (re)created in ``self.vmTypeDict``. The endpoint object is
        taken from ``self.vmTypeCECache`` when the hash of its description did not
        change, otherwise it is created through the EndpointFactory.
        ``self.platforms`` and ``self.sites`` are extended with newly seen values.

        :param dict resourceDict: ``{site: {ce: ceDescription}}``; each CE description must
                                  contain "VMTypes" (popped from it here), "CEType" and
                                  "MaxInstances"
        :return: S_OK(), or the failed result of the bootstrap parameters lookup or of the
                 endpoint creation
        """

        self.vmTypeDict = {}
        ceFactory = EndpointFactory()

        result = getPilotBootstrapParameters(vo=self.vo, runningPod=self.runningPod)
        if not result["OK"]:
            return result
        opParameters = result["Value"]

        for site in resourceDict:
            for ce in resourceDict[site]:
                ceDict = resourceDict[site][ce]
                ceTags = ceDict.get("Tag", [])
                if isinstance(ceTags, six.string_types):
                    ceTags = fromChar(ceTags)
                ceMaxRAM = ceDict.get("MaxRAM", None)
                # VMTypes is removed so that it does not end up in the per-VM-type CE description
                qDict = ceDict.pop("VMTypes")
                for vmType in qDict:
                    vmTypeName = "%s_%s" % (ce, vmType)
                    params = qDict[vmType]
                    vmTypeDesc = {"ParametersDict": params}
                    self.vmTypeDict[vmTypeName] = vmTypeDesc
                    params["VMType"] = vmType
                    params["Site"] = site
                    params["Setup"] = gConfig.getValue("/DIRAC/Setup", "unknown")
                    params["CPUTime"] = 99999999

                    # Merge the CE level tags with the VM type level ones
                    vmTypeTags = params.get("Tag")
                    if vmTypeTags and isinstance(vmTypeTags, six.string_types):
                        vmTypeTags = fromChar(vmTypeTags)
                        params["Tag"] = vmTypeTags
                    if ceTags:
                        if vmTypeTags:
                            params["Tag"] = list(set(ceTags + vmTypeTags))
                        else:
                            # Store a copy: the tag list is extended below and the CE level
                            # list is shared by all the VM types of this CE
                            params["Tag"] = list(ceTags)

                    # The VM type level MaxRAM wins over the CE level one
                    maxRAM = params.get("MaxRAM") or ceMaxRAM
                    if maxRAM:
                        params["MaxRAM"] = maxRAM

                    ceWholeNode = ceDict.get("WholeNode", "true")
                    wholeNode = params.get("WholeNode", ceWholeNode)
                    # str() so that a non-string option value does not break the comparison
                    if str(wholeNode).lower() in ("yes", "true"):
                        vmTags = params.get("Tag") or []
                        if "WholeNode" not in vmTags:
                            vmTags.append("WholeNode")
                        params["Tag"] = vmTags

                    platform = ""
                    if "Platform" in params:
                        platform = params["Platform"]
                    elif "Platform" in ceDict:
                        platform = ceDict["Platform"]
                    if platform and platform not in self.platforms:
                        self.platforms.append(platform)

                    # Only a platform inherited from the CE is translated to the DIRAC platform
                    if "Platform" not in params and platform:
                        result = Resources.getDIRACPlatform(platform)
                        if result["OK"]:
                            params["Platform"] = result["Value"][0]

                    ceVMTypeDict = dict(ceDict)
                    ceVMTypeDict["CEName"] = ce
                    ceVMTypeDict["VO"] = self.vo
                    ceVMTypeDict["VMType"] = vmType
                    ceVMTypeDict["RunningPod"] = self.runningPod
                    ceVMTypeDict["CSServers"] = gConfig.getValue("/DIRAC/Configuration/Servers", [])
                    ceVMTypeDict.update(params)

                    # Allow a resource-specific CAPath to be set (as some clouds have their own CAs)
                    # Otherwise fall back to the system-wide default(s)
                    if "CAPath" not in ceVMTypeDict:
                        ceVMTypeDict["CAPath"] = gConfig.getValue(
                            "/DIRAC/Security/CAPath", "/opt/dirac/etc/grid-security/certificates/cas.pem"
                        )

                    # Generate the CE object for the vmType or pick the already existing one
                    # if the vmType definition did not change
                    vmTypeHash = self.__generateVMTypeHash(ceVMTypeDict)
                    cached = self.vmTypeCECache.get(vmTypeName)
                    if cached is not None and cached["Hash"] == vmTypeHash:
                        vmTypeCE = cached["CE"]
                    else:
                        result = ceFactory.getCEObject(parameters=ceVMTypeDict)
                        if not result["OK"]:
                            return result
                        vmTypeCE = result["Value"]
                        cacheEntry = self.vmTypeCECache.setdefault(vmTypeName, {})
                        cacheEntry["Hash"] = vmTypeHash
                        cacheEntry["CE"] = vmTypeCE
                        vmTypeCE.setBootstrapParameters(opParameters)

                    vmTypeDesc["CE"] = vmTypeCE
                    vmTypeDesc["CEName"] = ce
                    vmTypeDesc["CEType"] = ceDict["CEType"]
                    vmTypeDesc["Site"] = site
                    vmTypeDesc["VMType"] = vmType
                    vmTypeDesc["Platform"] = platform
                    vmTypeDesc["MaxInstances"] = ceDict["MaxInstances"]
                    if not vmTypeCE.isValid():
                        # NOTE(review): the entry stays in self.vmTypeDict although its endpoint
                        # is not valid; only the site registration is skipped - confirm intended
                        self.log.error("Failed to instantiate CloudEndpoint for %s" % vmTypeName)
                        continue

                    if site not in self.sites:
                        self.sites.append(site)

        return S_OK()

    def execute(self):
        """Main execution method

        Calls :meth:`createVMs` when at least one VM type is defined. A failure of the
        VM creation is logged only: S_OK() is returned in every case.
        """
        if self.vmTypeDict:
            outcome = self.createVMs()
            if not outcome["OK"]:
                self.log.error("Errors in the job submission: ", outcome["Message"])
        else:
            self.log.warn("No site defined, exiting the cycle")

        return S_OK()

    def createVMs(self):
        """Go through the defined VM types and create VM instances where there is demand.

        The overall demand is first checked against the task queues. Then, for every VM
        type (in random order), the number of eligible jobs, the VMs already waiting on
        the endpoint and the free slots determine how many instances are requested, up
        to ``self.maxVMsToSubmit``. Created instances are inserted in the VirtualMachineDB
        and their pilot references are attached to a task queue in the PilotAgentsDB.

        :return: S_OK() when the pass completed (problems with a single VM type are logged
                 and that VM type is skipped); the failed result of a platform or task queue
                 query; S_ERROR if the site mask cannot be obtained
        """

        vmTypeList = list(self.vmTypeDict.keys())

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {"Setup": setup, "CPUTime": 9999999}
        if self.vo:
            tqDict["VO"] = self.vo
        if self.voGroups:
            tqDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            return result
        tqDict["Platform"] = result["Value"]
        tqDict["Site"] = self.sites
        tags = []
        for vmType in vmTypeList:
            tags += self.vmTypeDict[vmType]["ParametersDict"].get("Tag", [])
        tqDict["Tag"] = list(set(tags))

        self.log.verbose("Checking overall TQ availability with requirements")
        self.log.verbose(tqDict)

        matcherClient = MatcherClient()
        result = matcherClient.getMatchingTaskQueues(tqDict)
        if not result["OK"]:
            return result
        matchedTQs = result["Value"]
        if not matchedTQs:
            self.log.verbose("No Waiting jobs suitable for the director")
            return S_OK()

        # jobSites: sites explicitly requested by waiting jobs
        # testSites: sites requested by task queues that carry a "JobTypes" entry
        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqInfo in matchedTQs.values():
            if "Sites" in tqInfo:
                for site in tqInfo["Sites"]:
                    if site.lower() != "any":
                        jobSites.add(site)
                        if "JobTypes" in tqInfo:
                            testSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            totalWaitingJobs += tqInfo["Jobs"]

        result = virtualMachineDB.getInstanceCounters("Status", {})
        totalVMs = 0
        if result["OK"]:
            for status in result["Value"]:
                if status in ["New", "Submitted", "Running"]:
                    totalVMs += result["Value"][status]
        self.log.info("Total %d jobs in %d task queues with %d VMs" % (totalWaitingJobs, len(matchedTQs), totalVMs))

        # Check if the site is allowed in the mask
        result = self.siteClient.getUsableSites()
        if not result["OK"]:
            return S_ERROR("Can not get the site mask")
        siteMaskList = result.get("Value", [])

        random.shuffle(vmTypeList)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for vmType in vmTypeList:
            vmTypeDesc = self.vmTypeDict[vmType]
            ce = vmTypeDesc["CE"]
            ceName = vmTypeDesc["CEName"]
            vmTypeName = vmTypeDesc["VMType"]
            siteName = vmTypeDesc["Site"]
            platform = vmTypeDesc["Platform"]
            vmTypeTags = vmTypeDesc["ParametersDict"].get("Tag", [])
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(vmTypeDesc["MaxInstances"])

            # vms support WholeNode naturally
            processorTags = ["WholeNode"]

            if not anySite and siteName not in jobSites:
                self.log.verbose("Skipping queue %s at %s: no workload expected" % (vmTypeName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" % (vmTypeName, siteName))
                continue

            if "CPUTime" not in vmTypeDesc["ParametersDict"]:
                self.log.warn("CPU time limit is not specified for queue %s, skipping..." % vmType)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            # Sites outside the mask are only matched against Test jobs
            if not siteMask:
                ceDict["JobType"] = "Test"
            if self.vo:
                ceDict["VO"] = self.vo
            if self.voGroups:
                ceDict["OwnerGroup"] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result["OK"]:
                continue
            ceDict["Platform"] = result["Value"]

            ceDict["Tag"] = list(set(processorTags + vmTypeTags))

            # Get the number of eligible jobs for the target site/queue
            result = matcherClient.getMatchingTaskQueues(ceDict)
            if not result["OK"]:
                self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
                return result
            taskQueueDict = result["Value"]
            if not taskQueueDict:
                self.log.verbose("No matching TQs found for %s" % vmType)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]["Jobs"]

            self.log.verbose(
                "%d job(s) from %d task queue(s) are eligible for %s queue"
                % (totalTQJobs, len(taskQueueDict), vmType)
            )

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint})
            if result["OK"]:
                for status in result["Value"]:
                    if status in ["New", "Submitted"]:
                        totalWaitingVMs += result["Value"][status]
            if totalWaitingVMs >= totalTQJobs:
                # No early exit here: vmsToSubmit computed below becomes 0 in this case
                self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

            self.log.verbose("%d VMs for the total of %d eligible jobs for %s" % (totalWaitingVMs, totalTQJobs, vmType))

            # Get proxy to be used to connect to the cloud endpoint
            authType = ce.parameters.get("Auth")
            if authType and authType.lower() in ["x509", "voms"]:
                self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName))
                result = getProxyFileForCloud(ce)
                if not result["OK"]:
                    continue
                ce.setProxy(result["Value"])

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if isinstance(totalSlots, dict):
                # A failed counters query comes back as a result structure instead of a count;
                # it must not reach the arithmetic below
                self.log.error("Could not determine the free slots for %s" % vmType, totalSlots.get("Message", ""))
                continue
            if totalSlots == 0:
                self.log.debug("%s: No slots available" % vmType)
                continue

            vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
            self.log.info(
                "%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d"
                % (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit)
            )

            # Limit the number of VM instances to create to vmsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
            if vmsToSubmit == 0:
                continue

            self.log.info("Going to submit %d VMs to %s queue" % (vmsToSubmit, vmType))
            result = ce.createInstances(vmsToSubmit)

            if not result["OK"]:
                self.log.error("Failed submission to queue %s:\n" % vmType, result["Message"])
                self.failedVMTypes.setdefault(vmType, 0)
                self.failedVMTypes[vmType] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result["Value"]
            totalSubmittedPilots += len(vmDict)
            self.log.info("Submitted %d VMs to %s@%s" % (len(vmDict), vmTypeName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]["InstanceID"]
                result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID, endpoint, self.vo)
                if not result["OK"]:
                    self.log.error(
                        "Failed to insert VM instance %s into the VirtualMachineDB:" % uuID, result["Message"]
                    )
                    continue
                pilotList.append("vm://" + ceName + "/" + diracUUID + ":00")

            stampDict = {}
            pilotsPerTQ = self._assignPilotsToTaskQueues(pilotList, taskQueueDict)
            for tqID, tqPilots in pilotsPerTQ.items():
                result = pilotAgentsDB.addPilotTQReference(tqPilots, tqID, "", "", self.localhost, "Cloud", stampDict)
                if not result["OK"]:
                    self.log.error("Failed to insert pilots into the PilotAgentsDB: %s" % result["Message"])

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues)
        )
        return S_OK()

    def _assignPilotsToTaskQueues(self, pilotList, taskQueueDict):
        """Distribute pilot references over task queues with a priority-weighted random choice.

        :param list pilotList: pilot references to distribute
        :param dict taskQueueDict: ``{tqID: {"Priority": number, ...}}``
        :return: dict ``{tqID: [pilot references]}``; empty if there is nothing to assign
        """
        # Cumulative priorities: a TQ is chosen when the random draw falls into its interval
        tqPriorityList = []
        sumPriority = 0.0
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]["Priority"]
            tqPriorityList.append((tq, sumPriority))

        pilotsPerTQ = {}
        if not tqPriorityList:
            return pilotsPerTQ

        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            # Default to the last TQ: when no interval matches (e.g. all priorities are 0)
            # the pilot must still go to one of the TQs matched for this VM type
            chosenTQ = tqPriorityList[-1][0]
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    chosenTQ = tq
                    break
            pilotsPerTQ.setdefault(chosenTQ, []).append(pilotID)
        return pilotsPerTQ

    def getVMInstances(self, endpoint, maxInstances):
        """Return the number of VM instances that can still be created on an endpoint.

        Instances in the "New", "Submitted" and "Running" states count as occupied.

        :param str endpoint: value of the "Endpoint" selection passed to the VirtualMachineDB
        :param int maxInstances: maximum number of instances allowed on the endpoint
        :return: number of free slots, never negative; 0 when the instance counters cannot be
                 retrieved, so that callers always get a number and submit nothing in that case
        """
        result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint})
        if not result["OK"]:
            self.log.error("Failed to get the VM instance counters for %s:" % endpoint, result["Message"])
            return 0

        counters = result["Value"]
        occupied = sum(int(counters[status]) for status in counters if status in ("New", "Submitted", "Running"))

        return max(0, maxInstances - occupied)
示例#8
0
  def _resolveCECandidates( self, taskQueueDict ):
    """
      Return a list of CE hosts for this TaskQueue

      CEs listed under 'GridCEs' are returned as they are, without looking at the site mask.
      Otherwise the usable sites, minus the 'BannedSites' and restricted to 'Sites' when that
      key is present, are translated into the hosts of their eligible CEs. An empty list is
      returned when the mask or the eligible CEs cannot be obtained or nothing is left.
    """
    # assume user knows what they're doing and avoid site mask e.g. sam jobs
    requestedCEs = taskQueueDict['GridCEs'] if 'GridCEs' in taskQueueDict else None
    if requestedCEs:
      self.log.info( 'CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                     ', '.join( requestedCEs ) )
      return requestedCEs

    # Get the mask
    ret = SiteStatus().getUsableSites( 'ComputingAccess' )
    if not ret['OK']:
      self.log.error( 'Can not retrieve site Mask from DB:', ret['Message'] )
      return []

    usableSites = ret['Value']
    if not usableSites:
      self.log.error( 'Site mask is empty' )
      return []

    self.log.verbose( 'Site Mask: %s' % ', '.join( usableSites ) )

    # remove banned sites from siteMask (the list is modified in place)
    if 'BannedSites' in taskQueueDict:
      for bannedSite in taskQueueDict['BannedSites']:
        if bannedSite not in usableSites:
          continue
        usableSites.remove( bannedSite )
        self.log.verbose( 'Removing banned site %s from site Mask' % bannedSite )

    # remove from the mask if a Site is given
    if 'Sites' in taskQueueDict:
      siteMask = [ site for site in usableSites if site in taskQueueDict['Sites'] ]
    else:
      siteMask = list( usableSites )

    tqID = taskQueueDict['TaskQueueID']
    if not siteMask:
      # pilot can not be submitted
      self.log.info( 'No Valid Site Candidate in Mask for TaskQueue %s' % tqID )
      return []

    self.log.info( 'Site Candidates for TaskQueue %s:' % tqID, ', '.join( siteMask ) )

    # Get CE's associates to the given site Names
    resources = Resources( vo = self.virtualOrganization )
    ceSelection = { 'Site':siteMask,
                    'SubmissionMode':'gLite',
                    'CEType':['LCG','CREAM'] }
    result = resources.getEligibleResources( 'Computing', ceSelection )
    if not result['OK']:
      self.log.error( "Failed to get eligible ce's:", result['Message'] )
      return []

    # CEs without a configured Host are left out
    ceHosts = ( resources.getComputingElementValue( ce, 'Host', 'unknown' ) for ce in result['Value'] )
    ceMask = [ ceHost for ceHost in ceHosts if ceHost != 'unknown' ]

    if not ceMask:
      self.log.info( 'No CE Candidate found for TaskQueue %s:' % tqID, ', '.join( siteMask ) )

    self.log.verbose( 'CE Candidates for TaskQueue %s:' % tqID, ', '.join( ceMask ) )

    return ceMask
示例#9
0
    def _resolveCECandidates(self, taskQueueDict):
        """
        Return a list of CE hosts for this TaskQueue

        CEs listed under 'GridCEs' are returned as they are, without looking at the site mask.
        Otherwise the usable sites, minus the 'BannedSites' and restricted to 'Sites' when that
        key is present, are translated into the hosts of their eligible CEs. An empty list is
        returned when the mask or the eligible CEs cannot be obtained or nothing is left.
        """
        # assume user knows what they're doing and avoid site mask e.g. sam jobs
        requestedCEs = taskQueueDict['GridCEs'] if 'GridCEs' in taskQueueDict else None
        if requestedCEs:
            self.log.info('CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                          ', '.join(requestedCEs))
            return requestedCEs

        # Get the mask
        ret = SiteStatus().getUsableSites('ComputingAccess')
        if not ret['OK']:
            self.log.error('Can not retrieve site Mask from DB:', ret['Message'])
            return []

        usableSites = ret['Value']
        if not usableSites:
            self.log.error('Site mask is empty')
            return []

        self.log.verbose('Site Mask: %s' % ', '.join(usableSites))

        # remove banned sites from siteMask (the list is modified in place)
        if 'BannedSites' in taskQueueDict:
            for bannedSite in taskQueueDict['BannedSites']:
                if bannedSite not in usableSites:
                    continue
                usableSites.remove(bannedSite)
                self.log.verbose('Removing banned site %s from site Mask' % bannedSite)

        # remove from the mask if a Site is given
        if 'Sites' in taskQueueDict:
            siteMask = [site for site in usableSites if site in taskQueueDict['Sites']]
        else:
            siteMask = list(usableSites)

        tqID = taskQueueDict['TaskQueueID']
        if not siteMask:
            # pilot can not be submitted
            self.log.info('No Valid Site Candidate in Mask for TaskQueue %s' % tqID)
            return []

        self.log.info('Site Candidates for TaskQueue %s:' % tqID, ', '.join(siteMask))

        # Get CE's associates to the given site Names
        resources = Resources(vo=self.virtualOrganization)
        ceSelection = {
            'Site': siteMask,
            'SubmissionMode': 'gLite',
            'CEType': ['LCG', 'CREAM']
        }
        result = resources.getEligibleResources('Computing', ceSelection)
        if not result['OK']:
            self.log.error("Failed to get eligible ce's:", result['Message'])
            return []

        # CEs without a configured Host are left out
        ceHosts = (resources.getComputingElementValue(ce, 'Host', 'unknown') for ce in result['Value'])
        ceMask = [ceHost for ceHost in ceHosts if ceHost != 'unknown']

        if not ceMask:
            self.log.info('No CE Candidate found for TaskQueue %s:' % tqID, ', '.join(siteMask))

        self.log.verbose('CE Candidates for TaskQueue %s:' % tqID, ', '.join(ceMask))

        return ceMask
示例#10
0
  def optimizeJob( self, jid, jobState ):
    """ Decide what to do next with a job: hold it, send it to the task queue or request staging

        The steps, in order:

          - hold the job while the delay configured for its reschedule counter has not elapsed
          - hold the job if the user requested sites and none of them passes the site filter
            (unless the job type is listed in 'ExcludedOnHoldJobTypes')
          - without input data, send the job to the task queue straight away
          - with input data, keep only the site candidates that pass the user site requirements
            and whose disk + tape replica counts equal the number of input files
          - send to the task queue if no staging is required, otherwise request the staging at
            the first usable staging site and set the job site

        :param jid: job identifier (not used in this method)
        :param jobState: job state object used to read the job attributes and input data
        :return: the result of the hold / send to task queue / set site call, or S_ERROR
    """
    # Reschedule delay
    result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
    if not result[ 'OK' ]:
      return result
    attDict = result[ 'Value' ]
    try:
      reschedules = int( attDict[ 'RescheduleCounter' ] )
    except ValueError:
      return S_ERROR( "RescheduleCounter has to be an integer" )
    if reschedules != 0:
      delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
      # The counter is used as index, capped at the last configured delay
      # NOTE(review): with a counter of 1 this picks delays[1], so delays[0] is never used - confirm intended
      delay = delays[ min( reschedules, len( delays ) - 1 ) ]
      waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
      if waited < delay:
        return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )

    # Get site requirements
    result = self._getSitesRequired( jobState )
    if not result[ 'OK' ]:
      return result
    userSites, userBannedSites = result[ 'Value' ]

    # Get usable and unusable sites from the SiteStatus client
    siteStatus = SiteStatus()
    result = siteStatus.getUsableSites( 'ComputingAccess' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve active sites from JobDB" )
    usableSites = result[ 'Value' ]
    result = siteStatus.getUnusableSites( 'ComputingAccess' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve banned sites from JobDB" )
    unusableSites = result[ 'Value' ]

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
      result = jobState.getAttribute( "JobType" )
      if not result[ 'OK' ]:
        return S_ERROR( "Could not retrieve job type" )
      jobType = result[ 'Value' ]
      # Job types listed in ExcludedOnHoldJobTypes are never put on hold here
      if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
        sites = self._applySiteFilter( userSites, usableSites, unusableSites )
        if not sites:
          return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( userSites ) )

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
      return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
      # No input data? Generate requirements and next
      return self.__sendToTQ( jobState, userSites, userBannedSites )

    inputData = result[ 'Value' ]

    # The site candidates come from the optimizer parameters stored under the InputDataAgent name
    self.jobLog.verbose( 'Has an input data requirement' )
    idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
    result = self.retrieveOptimizerParam( idAgent )
    if not result['OK']:
      self.jobLog.error( "Could not retrieve input data info: %s" % result[ 'Message' ] )
      return S_ERROR( "File Catalog Access Failure" )
    opData = result[ 'Value' ]
    if 'SiteCandidates' not in opData:
      return S_ERROR( "No possible site candidates" )

    # Filter input data sites with user requirement
    siteCandidates = list( opData[ 'SiteCandidates' ] )
    self.jobLog.info( "Site candidates are %s" % siteCandidates )

    siteCandidates = self._applySiteFilter( siteCandidates, userSites, userBannedSites )
    if not siteCandidates:
      return S_ERROR( "Impossible InputData * Site requirements" )

    # Per-site replica information of the remaining candidates
    idSites = {}
    for site in siteCandidates:
      idSites[ site ] = opData[ 'SiteCandidates' ][ site ]

    # Check if sites have correct count of disk+tape replicas
    numData = len( inputData )
    errorSites = set()
    for site in idSites:
      if numData != idSites[ site ][ 'disk' ] + idSites[ site ][ 'tape' ]:
        self.jobLog.error( "Site candidate %s does not have all the input data" % site )
        errorSites.add( site )
    # Sites are removed after the loop, as idSites cannot change size while it is iterated
    for site in errorSites:
      idSites.pop( site )
    if not idSites:
      return S_ERROR( "Site candidates do not have all the input data" )

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
    if not siteCandidates:
      return S_ERROR( "No destination sites available" )

    # Is any site active?
    stageSites = self._applySiteFilter( siteCandidates, usableSites, unusableSites )
    if not stageSites:
      return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )

    # If no staging is required send to TQ
    if not stageRequired:
      # Use siteCandidates and not stageSites because active and banned sites
      # will be taken into account on matching time
      return self.__sendToTQ( jobState, siteCandidates, userBannedSites )

    # Check if the user is allowed to stage
    if self.ex_getOption( "RestrictDataStage", False ):
      if not self.__checkStageAllowed( jobState ):
        return S_ERROR( "Stage not allowed" )

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
    stageData = idSites[ stageSite ]
    # Set as if everything has already been staged
    stageData[ 'disk' ] += stageData[ 'tape' ]
    stageData[ 'tape' ] = 0
    # Set the site info back to the original dict to save afterwards
    opData[ 'SiteCandidates' ][ stageSite ] = stageData

    result = self.__requestStaging( jobState, stageSite, opData )
    if not result[ 'OK' ]:
      return result
    stageLFNs = result[ 'Value' ]
    self._updateSharedSESites( stageSite, stageLFNs, opData )
    # Save the optimizer data again
    self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
    result = self.storeOptimizerParam( idAgent, opData )
    if not result[ 'OK' ]:
      return result

    return self._setJobSite( jobState, stageSites )
示例#11
0
class Matcher(object):
    """Logic for matching"""
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None,
                 pilotRef=None):
        """Set up the databases, helpers and loggers used for matching.

        Every collaborator that is not supplied (or is falsy) is replaced
        by a freshly constructed default object.

        :param pilotAgentsDB: pilot agents DB object, default ``PilotAgentsDB()``
        :param jobDB: job DB object, default ``JobDB()``
        :param tqDB: task queue DB object, default ``TaskQueueDB()``
        :param jlDB: job logging DB object, default ``JobLoggingDB()``
        :param opsHelper: operations helper, default ``Operations()``
        :param pilotRef: optional pilot reference; when given it prefixes the
                         logger name and the loggers of the DB objects are
                         replaced by sub-loggers carrying that name
        """
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        if pilotRef:
            loggerName = "[%s]Matcher" % pilotRef
            self.log = gLogger.getSubLogger(loggerName)
            # Each DB object gets its own sub-logger request, in this order
            for database in (self.pilotAgentsDB, self.jobDB, self.tqDB, self.jlDB):
                database.log = gLogger.getSubLogger(loggerName)
        else:
            self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper, pilotRef=pilotRef)
        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """Main job selection function to find the highest priority job matching the resource capacity

        :param resourceDescription: description of the resource asking for a job
        :param credDict: credentials of the requester, handed to the credentials check
        :return: empty dict when no job matches; otherwise a dict holding "JDL",
                 "JobID", the job optimizer parameters (when they can be retrieved),
                 "DN", "Group" and "PilotInfoReportedFlag"
        :raises RuntimeError: when the task queue matching fails, when the job
                              attributes or the JDL cannot be retrieved, or when
                              the matched job is not in Waiting state
        """

        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters
        toPrintDict = dict(resourceDict)
        for key in ("MaxRAM", "NumberOfProcessors"):
            if key in resourceDescription:
                toPrintDict[key] = resourceDescription[key]
        # The "<n>GB" / "<n>Processors" tags are left out of the printout
        visibleTags = [tag for tag in resourceDict.get("Tag", []) if not tag.endswith(("GB", "Processors"))]
        if visibleTags:
            toPrintDict["Tag"] = visibleTags
        else:
            toPrintDict.pop("Tag", None)
        self.log.info("Resource description for matching", printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(resourceDict["Site"], resourceDict.get("GridCE"))
        result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)

        if not result["OK"]:
            raise RuntimeError(result["Message"])
        result = result["Value"]
        if not result["matchFound"]:
            self.log.info("No match found")
            return {}

        jobID = result["jobId"]
        # Owner information is fetched together with the status, so that no
        # second query is needed once the job has been reported as matched
        resAtt = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup", "Status"])
        if not resAtt["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not resAtt["Value"]:
            raise RuntimeError("No attributes returned for job")
        jobAttributes = resAtt["Value"]
        if jobAttributes["Status"] != "Waiting":
            self.log.error("Job matched by the TQ is not in Waiting state", str(jobID))
            # Remove the job from the task queue before giving up on it
            result = self.tqDB.deleteJob(jobID)
            if not result["OK"]:
                raise RuntimeError(result["Message"])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result["OK"]:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {"JDL": result["Value"], "JobID": jobID}

        matchTime = time.time() - startTime
        self.log.verbose("Match time", "[%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt["OK"]:
            resultDict.update(resOpt["Value"])

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict["Site"], jobID)

        if not resourceDict.get("PilotInfoReportedFlag", False):
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict["DN"] = jobAttributes["OwnerDN"]
        resultDict["Group"] = jobAttributes["OwnerGroup"]
        resultDict["PilotInfoReportedFlag"] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """from resourceDescription to resourceDict (just various mods)"""
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict["JobType"] = "Test"

        self.log.verbose("Resource description")
        for key in resourceDict:
            self.log.debug("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary

        :param resourceDescription: a ceDict coming from a JobAgent,
                                    for example.
        :return: updated dictionary of resource description parameters
        """

        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        if resourceDescription.get("Tag"):
            tags = resourceDescription["Tag"]
            resourceDict["Tag"] = (tags if isinstance(tags, list) else list(
                {tag.strip("\"' ")
                 for tag in tags.strip("[]").split(",")}))
            if "RequiredTag" in resourceDescription:
                requiredTagsList = (list({
                    tag.strip("\"' ")
                    for tag in resourceDescription["RequiredTag"].strip(
                        "[]").split(",")
                }) if isinstance(resourceDescription["RequiredTag"], str) else
                                    resourceDescription["RequiredTag"])
                resourceDict["RequiredTag"] = requiredTagsList

        if "JobID" in resourceDescription:
            resourceDict["JobID"] = resourceDescription["JobID"]

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get("MaxRAM")
        if maxRAM:
            try:
                maxRAM = int(maxRAM / 1000)
            except ValueError:
                maxRAM = None
        nProcessors = resourceDescription.get("NumberOfProcessors")
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except ValueError:
                nProcessors = None
        for param, key in [(maxRAM, "GB"), (nProcessors, "Processors")]:
            if param and param <= 1024:
                paramList = list(range(2, param + 1))
                paramTags = ["%d%s" % (par, key) for par in paramList]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        # Add 'MultiProcessor' to the list of tags
        if nProcessors and nProcessors > 1:
            resourceDict.setdefault("Tag", []).append("MultiProcessor")

        # Add 'WholeNode' to the list of tags
        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        if "Tag" in resourceDict:
            resourceDict["Tag"] = list(set(resourceDict["Tag"]))
        if "RequiredTag" in resourceDict:
            resourceDict["RequiredTag"] = list(set(
                resourceDict["RequiredTag"]))

        for k in (
                "DIRACVersion",
                "ReleaseVersion",
                "ReleaseProject",
                "VirtualOrganization",
                "PilotReference",
                "PilotBenchmark",
                "PilotInfoReportedFlag",
        ):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """Record the match of the job in jobDB and in jobLoggingDB.

        Problems are logged only: a failing update does not raise.

        :param resourceDict: resource dictionary, its "Site" is stored with the job
        :param jobID: identifier of the matched job
        """
        names = ["Status", "MinorStatus", "ApplicationStatus", "Site"]
        values = ["Matched", "Assigned", "Unknown", resourceDict["Site"]]
        outcome = self.jobDB.setJobAttributes(jobID, names, values)
        if outcome["OK"]:
            self.log.verbose("Set job attributes for jobID", jobID)
        else:
            self.log.error("Problem reporting job status",
                           "setJobAttributes, jobID = %s: %s" % (jobID, outcome["Message"]))

        outcome = self.jlDB.addLoggingRecord(
            jobID, status=JobStatus.MATCHED, minorStatus="Assigned", source="Matcher")
        if outcome["OK"]:
            self.log.verbose("Added logging record for jobID", jobID)
        else:
            self.log.error("Problem reporting job status",
                           "addLoggingRecord, jobID = %s: %s" % (jobID, outcome["Message"]))

    def _checkMask(self, resourceDict):
        """Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
        """
        if "Site" not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict["Site"])
        if not result["OK"]:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result["Message"])
            raise RuntimeError("Internal error")

        if resourceDict["Site"] not in result["Value"]:
            return False

        return True

    def _updatePilotInfo(self, resourceDict):
        """Update pilot information - do not fail if we don't manage to do it"""
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            gridCE = resourceDict.get("GridCE", "Unknown")
            site = resourceDict.get("Site", "Unknown")
            benchmark = resourceDict.get("PilotBenchmark", 0.0)
            self.log.verbose(
                "Reporting pilot info",
                "for %s: gridCE=%s, site=%s, benchmark=%f" %
                (pilotReference, gridCE, site, benchmark),
            )

            result = self.pilotAgentsDB.setPilotStatus(
                pilotReference,
                status=PilotStatus.RUNNING,
                gridSite=site,
                destination=gridCE,
                benchmark=benchmark)
            if not result["OK"]:
                self.log.warn(
                    "Problem updating pilot information",
                    "; setPilotStatus. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """Update pilot to job mapping information"""
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    ";setCurrentJobID. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )
            result = self.pilotAgentsDB.setJobForPilot(jobID,
                                                       pilotReference,
                                                       updateStatus=False)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    "; setJobForPilot. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )

    def _checkCredentials(self, resourceDict, credDict):
        """Constrain the resource dictionary according to the requester's credentials.

        :param resourceDict: resource dictionary, updated in place
        :param credDict: credentials; "properties" and, depending on the
                         case, "group" and "DN" are read
        :return: the same resourceDict object
        :raises RuntimeError: when the Registry lookups of groups fail
        """
        if Properties.GENERIC_PILOT in credDict["properties"]:
            # You can only match groups in the same VO
            # for the host case the VirtualOrganization parameter
            # is mandatory in resourceDict
            vo = (resourceDict.get("VirtualOrganization", "")
                  if credDict["group"] == "hosts"
                  else Registry.getVOForGroup(credDict["group"]))
            if "OwnerGroup" not in resourceDict:
                result = Registry.getGroupsForVO(vo)
                if not result["OK"]:
                    raise RuntimeError(result["Message"])
                resourceDict["OwnerGroup"] = result["Value"]
        elif Properties.PILOT in credDict["properties"]:
            # If it's a private pilot, the DN has to be the same
            self.log.notice("Setting the resource DN to the credentials DN")
            resourceDict["OwnerDN"] = credDict["DN"]
        elif Properties.JOB_SHARING in credDict["properties"]:
            # If it's a job sharing. The group has to be the same and just
            # check that the DN (if any) belongs to the same group
            resourceDict["OwnerGroup"] = credDict["group"]
            self.log.notice("Setting the resource group to the credentials group")
            if "OwnerDN" in resourceDict and resourceDict["OwnerDN"] != credDict["DN"]:
                ownerDN = resourceDict["OwnerDN"]
                result = Registry.getGroupsForDN(resourceDict["OwnerDN"])
                if not result["OK"]:
                    raise RuntimeError(result["Message"])
                if credDict["group"] not in result["Value"]:
                    # DN is not in the same group! bad boy.
                    self.log.warn(
                        "You cannot request jobs from this DN, as it does not belong to your group!",
                        "(%s)" % ownerDN,
                    )
                    resourceDict["OwnerDN"] = credDict["DN"]
        else:
            # Nothing special, group and DN have to be the same
            resourceDict["OwnerDN"] = credDict["DN"]
            resourceDict["OwnerGroup"] = credDict["group"]

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """Check the pilot DIRAC version"""
        if self.opsHelper.getValue("Pilot/CheckVersion", True):
            if "ReleaseVersion" not in resourceDict:
                if "DIRACVersion" not in resourceDict:
                    raise PilotVersionError(
                        "Version check requested and not provided by Pilot")
                else:
                    pilotVersion = resourceDict["DIRACVersion"]
            else:
                pilotVersion = resourceDict["ReleaseVersion"]

            validVersions = [
                convertToPy3VersionNumber(newStyleVersion)
                for newStyleVersion in self.opsHelper.getValue(
                    "Pilot/Version", [])
            ]
            if validVersions and convertToPy3VersionNumber(
                    pilotVersion) not in validVersions:
                raise PilotVersionError(
                    "Pilot version does not match the production version: %s not in ( %s )"
                    % (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if "ReleaseProject" not in resourceDict:
                    raise PilotVersionError(
                        "Version check requested but expected project %s not received"
                        % validProject)
                if resourceDict["ReleaseProject"] != validProject:
                    raise PilotVersionError(
                        "Version check requested but expected project %s != received %s"
                        % (validProject, resourceDict["ReleaseProject"]))
示例#12
0
  def checkJob( self, job, classAdJob ):
    """Check the site and input data requirements of a job and route it onwards.

       The steps, in order: honour the delay after a reschedule, apply the
       site / banned site requirements of the job itself, apply the usable
       and unusable sites reported by SiteStatus and, when the job has input
       data, restrict the sites to the site candidates stored in the
       optimizer information and resolve staging.

       :param job: job identifier, used for the jobDB calls and log messages
       :param classAdJob: job description; read here via getAttributeString
                          and handed over to the helper methods
       :returns: S_OK() when the job is put on hold or a staging request was
                 set, S_ERROR (or the failed result of a helper) on failure,
                 otherwise whatever __sendJobToTaskQueue returns
    """
    self.log.verbose( 'Job %s will be processed' % ( job ) )

    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes( job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'] )
    if not result['OK']:
      self.log.error( result['Message'] )
      return S_ERROR( 'Can not get job attributes from JobDB' )
    jobDict = result['Value']
    reCounter = int( jobDict['RescheduleCounter'] )
    if reCounter != 0 :
      reTime = fromString( jobDict['RescheduleTime'] )
      # Time elapsed since the last reschedule
      delta = toEpoch() - toEpoch( reTime )
      # The delay to respect comes from rescheduleDelaysList, indexed by the
      # reschedule count; beyond the end of the list maxRescheduleDelay is used
      delay = self.maxRescheduleDelay
      if reCounter <= len( self.rescheduleDelaysList ):
        delay = self.rescheduleDelaysList[reCounter - 1]
      if delta < delay:
        # Still within the delay: leave the job alone, flagging it once
        if jobDict['ApplicationStatus'].find( 'On Hold: after rescheduling' ) == -1:
          result = self.jobDB.setJobStatus( job, application = 'On Hold: after rescheduling #%d' % reCounter )
        return S_OK()

    # First, get Site and BannedSites from the Job

    # NOTE(review): the result is indexed directly, without an 'OK' check -
    # presumably the helper always provides both keys; confirm against it
    result = self.__getJobSiteRequirement( job, classAdJob )
    userBannedSites = result['BannedSites']
    userSites = result['Sites']

    if userSites:
      # Drop the sites the user banned from the sites the user requested
      userSites = applySiteRequirements( userSites, [], userBannedSites )
      if not userSites:
        msg = 'Impossible Site Requirement'
        return S_ERROR( msg )

    # Second, get the Active and Banned sites from the RSS

    siteStatus = SiteStatus()
    
    usableSites   = siteStatus.getUsableSites( 'ComputingAccess' )
    unusableSites = siteStatus.getUnusableSites( 'ComputingAccess' )
    
    if not ( usableSites['OK'] and unusableSites['OK'] ):
      if not usableSites['OK']:
        self.log.error( usableSites['Message'] )
      if not unusableSites['OK']:
        self.log.error( unusableSites['Message'] )
      # NOTE(review): the message mentions JobDB although both lists were
      # requested from SiteStatus just above
      return S_ERROR( 'Can not get Active and Banned Sites from JobDB' )

    usableSites   = usableSites['Value']
    unusableSites = unusableSites['Value']

    if userSites:
      sites = applySiteRequirements( userSites, usableSites, unusableSites )
      if not sites:
        # Put on Hold only non-excluded job types
        # (jobs of an excluded type simply continue with the next step)
        jobType = classAdJob.getAttributeString( 'JobType' )
        if not jobType in self.excludedOnHoldJobTypes:
          msg = 'On Hold: Requested site is Banned or not Active'
          self.log.info( msg )
          result = self.jobDB.setJobStatus( job, application = msg )
          return S_OK()


    # Third, check if there is input data
    result = self.jobDB.getInputData( job )
    if not result['OK']:
      self.log.warn( 'Failed to get input data from JobDB for %s' % ( job ) )
      self.log.error( result['Message'] )
      return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
      return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    # Keep only the non-empty entries of the input data list
    hasInputData = False
    inputData = []
    for lfn in result['Value']:
      if lfn:
        inputData.append( lfn )
        hasInputData = True

    if not hasInputData:
      #With no input data requirement, job can proceed directly to task queue
      self.log.verbose( 'Job %s has no input data requirement' % ( job ) )
      return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    self.log.verbose( 'Job %s has an input data requirement ' % ( job ) )

    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo( job )
    if not result['OK']:
      return result

    optInfo = result['Value']

    #Compare site candidates with current mask
    optSites = optInfo['SiteCandidates'].keys()
    self.log.info( 'Input Data Site Candidates: %s' % ( ', '.join( optSites ) ) )
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements( optSites, userSites, userBannedSites )
    if not optSites:
      msg = 'Impossible Site + InputData Requirement'
      return S_ERROR( msg )

    sites = applySiteRequirements( optSites, usableSites, unusableSites )
    if not sites:
      # No job type exclusion here: any job is put on hold
      msg = 'On Hold: InputData Site is Banned or not Active'
      self.log.info( msg )
      result = self.jobDB.setJobStatus( job, application = msg )
      return S_OK()

    #Set stager request as necessary, optimize for smallest #files on tape if
    #more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging( job, sites, inputData, optInfo['SiteCandidates'] )
    if not checkStaging['OK']:
      return checkStaging

    # The helper result carries the chosen sites under 'SiteCandidates' and
    # the staging flag under 'Value'
    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
      return S_ERROR( 'No destination sites available' )

    stagingFlag = checkStaging['Value']
    if stagingFlag:
      #Single site candidate chosen and staging required
      self.log.verbose( 'Job %s requires staging of input data' % ( job ) )
      # set all LFN to disk for the selected site
      # (the tape count is added to the disk count, as if staging were done)
      stagingSite = destinationSites[0]
      siteDict = optInfo['SiteCandidates'][stagingSite]
      siteDict['disk'] = siteDict['disk'] + siteDict['tape']
      siteDict['tape'] = 0

      optInfo['SiteCandidates'][stagingSite] = siteDict
      self.log.verbose( 'Updating %s Optimizer Info for Job %s:' % ( self.dataAgentName, job ), optInfo )
      result = self.setOptimizerJobInfo( job, self.dataAgentName, optInfo )
      if not result['OK']:
        return result

      # Site is selected for staging, report it
      self.log.verbose( 'Staging site candidate for job %s is %s' % ( job, stagingSite ) )

      # Fall back to the staging site alone when the helper fails
      result = self.__getStagingSites( stagingSite, destinationSites )
      if not result['OK']:
        stagingSites = [stagingSite]
      else:
        stagingSites = result['Value']

      # Set the 'Site' attribute of the job: the single site, the name of the
      # site group, or 'Multiple' when no group name is available
      if len( stagingSites ) == 1:
        self.jobDB.setJobAttribute( job, 'Site', stagingSite )
      else:
        # Get the name of the site group
        result = self.__getSiteGroup( stagingSites )
        if result['OK']:
          groupName = result['Value']
          if groupName:
            self.jobDB.setJobAttribute( job, 'Site', groupName )
          else:
            self.jobDB.setJobAttribute( job, 'Site', 'Multiple' )
        else:
          self.jobDB.setJobAttribute( job, 'Site', 'Multiple' )

      stagerDict = self.__setStagingRequest( job, stagingSite, optInfo )
      if not stagerDict['OK']:
        return stagerDict
      self.__updateOtherSites( job, stagingSite, stagerDict['Value'], optInfo )
      # The job is not sent to the task queue here when staging was requested
      return S_OK()
    else:
      #No staging required, can proceed to task queue agent and then waiting status
      self.log.verbose( 'Job %s does not require staging of input data' % ( job ) )
    #Finally send job to TaskQueueAgent
    return self.__sendJobToTaskQueue( job, classAdJob, destinationSites, userBannedSites )
示例#13
0
class MatcherHandler(RequestHandler):

    __opsCache = {}

    def initialize(self):
        """Set up the operations helper, the limiter built on it and the site status client."""
        opsHelper = self.__getOpsHelper()
        self.__opsHelper = opsHelper
        self.__limiter = Limiter(opsHelper)
        self.__siteStatus = SiteStatus()

    def __getOpsHelper(self, setup=False, vo=False):
        """Return the Operations helper of a (vo, setup) pair, creating and caching it on first use.

        The cache is the class attribute ``__opsCache``.

        :param setup: setup name; when falsy it is taken from srv_getClientSetup()
        :param vo: VO name; when falsy it is looked up from the group of the
                   remote credentials
        :return: the cached Operations object
        """
        setup = setup or self.srv_getClientSetup()
        vo = vo or Registry.getVOForGroup(self.getRemoteCredentials()['group'])

        cache = MatcherHandler.__opsCache
        cacheKey = (vo, setup)
        if cacheKey not in cache:
            cache[cacheKey] = Operations.Operations(vo=vo, setup=setup)
        return cache[cacheKey]

    def __processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary.

        :param resourceDescription: either a JDL string or a dictionary-like
                                    resource description
        :return: the resource dictionary.
                 NOTE(review): for a JDL string that cannot be parsed an
                 S_ERROR structure is returned in place of the resource
                 dictionary, so callers have to tell the two apart.
        """
        resourceDict = {}
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of the string types
        if isinstance(resourceDescription, StringTypes):
            classAdAgent = ClassAd(resourceDescription)
            if not classAdAgent.isOK():
                return S_ERROR('Illegal Resource JDL')
            gLogger.verbose(classAdAgent.asJDL())

            for name in gTaskQueueDB.getSingleValueTQDefFields():
                if classAdAgent.lookupAttribute(name):
                    # CPUTime is the only single value field read as an integer
                    if name == 'CPUTime':
                        resourceDict[name] = classAdAgent.getAttributeInt(name)
                    else:
                        resourceDict[name] = classAdAgent.getAttributeString(name)

            for name in gTaskQueueDB.getMultiValueMatchFields():
                if classAdAgent.lookupAttribute(name):
                    # SubmitPool is the only multi value field read as a list
                    if name == 'SubmitPool':
                        resourceDict[name] = classAdAgent.getListFromExpression(name)
                    else:
                        resourceDict[name] = classAdAgent.getAttributeString(name)

            # Check if a JobID is requested
            if classAdAgent.lookupAttribute('JobID'):
                resourceDict['JobID'] = classAdAgent.getAttributeInt('JobID')

            for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                      'VirtualOrganization'):
                if classAdAgent.lookupAttribute(k):
                    resourceDict[k] = classAdAgent.getAttributeString(k)

        else:
            # Dictionary-like description: values are copied over unchanged.
            # The "in" operator replaces has_key(), which Python 3 dropped.
            for name in gTaskQueueDB.getSingleValueTQDefFields():
                if name in resourceDescription:
                    resourceDict[name] = resourceDescription[name]

            for name in gTaskQueueDB.getMultiValueMatchFields():
                if name in resourceDescription:
                    resourceDict[name] = resourceDescription[name]

            if 'JobID' in resourceDescription:
                resourceDict['JobID'] = resourceDescription['JobID']

            for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                      'VirtualOrganization', 'PilotReference',
                      'PilotInfoReportedFlag', 'PilotBenchmark',
                      'LHCbPlatform'):
                if k in resourceDescription:
                    resourceDict[k] = resourceDescription[k]

        return resourceDict

    def selectJob(self, resourceDescription):
        """ Main job selection function to find the highest priority job
        matching the resource capacity
    """

        startTime = time.time()
        resourceDict = self.__processResourceDescription(resourceDescription)

        credDict = self.getRemoteCredentials()
        #Check credentials if not generic pilot
        if Properties.GENERIC_PILOT in credDict['properties']:
            #You can only match groups in the same VO
            vo = Registry.getVOForGroup(credDict['group'])
            result = Registry.getGroupsForVO(vo)
            if result['OK']:
                resourceDict['OwnerGroup'] = result['Value']
        else:
            #If it's a private pilot, the DN has to be the same
            if Properties.PILOT in credDict['properties']:
                gLogger.notice("Setting the resource DN to the credentials DN")
                resourceDict['OwnerDN'] = credDict['DN']
            #If it's a job sharing. The group has to be the same and just check that the DN (if any)
            # belongs to the same group
            elif Properties.JOB_SHARING in credDict['properties']:
                resourceDict['OwnerGroup'] = credDict['group']
                gLogger.notice(
                    "Setting the resource group to the credentials group")
                if 'OwnerDN' in resourceDict and resourceDict[
                        'OwnerDN'] != credDict['DN']:
                    ownerDN = resourceDict['OwnerDN']
                    result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
                    if not result['OK'] or credDict['group'] not in result[
                            'Value']:
                        #DN is not in the same group! bad boy.
                        gLogger.notice(
                            "You cannot request jobs from DN %s. It does not belong to your group!"
                            % ownerDN)
                        resourceDict['OwnerDN'] = credDict['DN']
            #Nothing special, group and DN have to be the same
            else:
                resourceDict['OwnerDN'] = credDict['DN']
                resourceDict['OwnerGroup'] = credDict['group']

        # Check the pilot DIRAC version
        if self.__opsHelper.getValue("Pilot/CheckVersion", True):
            if 'ReleaseVersion' not in resourceDict:
                if not 'DIRACVersion' in resourceDict:
                    return S_ERROR(
                        'Version check requested and not provided by Pilot')
                else:
                    pilotVersion = resourceDict['DIRACVersion']
            else:
                pilotVersion = resourceDict['ReleaseVersion']

            validVersions = self.__opsHelper.getValue("Pilot/Version", [])
            if validVersions and pilotVersion not in validVersions:
                return S_ERROR( 'Pilot version does not match the production version %s not in ( %s )' % \
                               ( pilotVersion, ",".join( validVersions ) ) )
            #Check project if requested
            validProject = self.__opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if 'ReleaseProject' not in resourceDict:
                    return S_ERROR(
                        "Version check requested but expected project %s not received"
                        % validProject)
                if resourceDict['ReleaseProject'] != validProject:
                    return S_ERROR(
                        "Version check requested but expected project %s != received %s"
                        % (validProject, resourceDict['ReleaseProject']))

        # Update pilot information
        pilotInfoReported = False
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            if "PilotInfoReportedFlag" in resourceDict and not resourceDict[
                    'PilotInfoReportedFlag']:
                gridCE = resourceDict.get('GridCE', 'Unknown')
                site = resourceDict.get('Site', 'Unknown')
                benchmark = benchmark = resourceDict.get('PilotBenchmark', 0.0)
                gLogger.verbose(
                    'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f'
                    % (pilotReference, gridCE, site, benchmark))
                result = gPilotAgentsDB.setPilotStatus(pilotReference,
                                                       status='Running',
                                                       gridSite=site,
                                                       destination=gridCE,
                                                       benchmark=benchmark)
                if result['OK']:
                    pilotInfoReported = True

        #Check the site mask
        if not 'Site' in resourceDict:
            return S_ERROR('Missing Site Name in Resource JDL')

        # Get common site mask and check the agent site
        result = self.__siteStatus.getUsableSites('ComputingAccess')
        if not result['OK']:
            return S_ERROR('Internal error: can not get site mask')
        usableSites = result['Value']

        siteName = resourceDict['Site']
        if siteName not in usableSites:
            if 'GridCE' not in resourceDict:
                return S_ERROR('Site not in mask and GridCE not specified')
            #Even if the site is banned, if it defines a CE, it must be able to check it
            del resourceDict['Site']

        resourceDict['Setup'] = self.serviceInfoDict['clientSetup']

        gLogger.verbose("Resource description:")
        for key in resourceDict:
            gLogger.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

        negativeCond = self.__limiter.getNegativeCondForSite(siteName)
        result = gTaskQueueDB.matchAndGetJob(resourceDict,
                                             negativeCond=negativeCond)

        if DEBUG:
            print result

        if not result['OK']:
            return result
        result = result['Value']
        if not result['matchFound']:
            return S_ERROR('No match found')

        jobID = result['jobId']
        resAtt = gJobDB.getJobAttributes(jobID,
                                         ['OwnerDN', 'OwnerGroup', 'Status'])
        if not resAtt['OK']:
            return S_ERROR('Could not retrieve job attributes')
        if not resAtt['Value']:
            return S_ERROR('No attributes returned for job')
        if not resAtt['Value']['Status'] == 'Waiting':
            gLogger.error('Job matched by the TQ is not in Waiting state',
                          str(jobID))
            result = gTaskQueueDB.deleteJob(jobID)
            if not result['OK']:
                return result
            return S_ERROR("Job %s is not in Waiting state" % str(jobID))

        attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
        attValues = ['Matched', 'Assigned', 'Unknown', siteName]
        result = gJobDB.setJobAttributes(jobID, attNames, attValues)
        # result = gJobDB.setJobStatus( jobID, status = 'Matched', minor = 'Assigned' )
        result = gJobLoggingDB.addLoggingRecord(jobID,
                                                status='Matched',
                                                minor='Assigned',
                                                source='Matcher')

        result = gJobDB.getJobJDL(jobID)
        if not result['OK']:
            return S_ERROR('Failed to get the job JDL')

        resultDict = {}
        resultDict['JDL'] = result['Value']
        resultDict['JobID'] = jobID

        matchTime = time.time() - startTime
        gLogger.info("Match time: [%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = gJobDB.getJobOptParameters(jobID)
        if resOpt['OK']:
            for key, value in resOpt['Value'].items():
                resultDict[key] = value
        resAtt = gJobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not resAtt['OK']:
            return S_ERROR('Could not retrieve job attributes')
        if not resAtt['Value']:
            return S_ERROR('No attributes returned for job')

        if self.__opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.__limiter.updateDelayCounters(siteName, jobID)

        # Report pilot-job association
        if pilotReference:
            result = gPilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            result = gPilotAgentsDB.setJobForPilot(jobID,
                                                   pilotReference,
                                                   updateStatus=False)

        resultDict['DN'] = resAtt['Value']['OwnerDN']
        resultDict['Group'] = resAtt['Value']['OwnerGroup']
        resultDict['PilotInfoReportedFlag'] = pilotInfoReported
        return S_OK(resultDict)

##############################################################################

    types_requestJob = [[StringType, DictType]]

    def export_requestJob(self, resourceDescription):
        """ Match a job for the requesting agent and record monitoring marks.

        The request is delegated to ``selectJob``. A "matchesDone" mark is
        added for every request; a "matchesOK" mark is added only when the
        selection result is flagged OK.

        :param resourceDescription: resource description sent by the agent
        :returns: the result structure produced by ``selectJob``, unchanged
        """
        matchResult = self.selectJob(resourceDescription)
        # Every request counts as an attempt, whether or not it succeeded
        gMonitor.addMark("matchesDone")
        if not matchResult['OK']:
            return matchResult
        gMonitor.addMark("matchesOK")
        return matchResult

##############################################################################

    types_getActiveTaskQueues = []

    def export_getActiveTaskQueues(self):
        """ Expose the task queues known to the TaskQueueDB.

        :returns: whatever ``gTaskQueueDB.retrieveTaskQueues()`` returns
        """
        taskQueues = gTaskQueueDB.retrieveTaskQueues()
        return taskQueues

##############################################################################

    types_getMatchingTaskQueues = [DictType]

    def export_getMatchingTaskQueues(self, resourceDict):
        """ Return the task queues that match the given resource description.

        When the description holds a string value under the 'Site' key, the
        negative conditions for that site are requested from the limiter;
        otherwise the limiter's generic negative conditions are used.

        :param resourceDict: resource description dictionary
        :returns: result of ``gTaskQueueDB.retrieveTaskQueuesThatMatch``
        """
        site = resourceDict.get('Site')
        # isinstance (instead of an exact type() comparison) also accepts
        # subclasses of the string types; a missing 'Site' yields None and
        # falls through to the generic conditions.
        if isinstance(site, StringTypes):
            negativeCond = self.__limiter.getNegativeCondForSite(site)
        else:
            negativeCond = self.__limiter.getNegativeCond()
        return gTaskQueueDB.retrieveTaskQueuesThatMatch(
            resourceDict, negativeCond=negativeCond)


##############################################################################

    types_matchAndGetTaskQueue = [DictType]

    def export_matchAndGetTaskQueue(self, resourceDict):
        """ Forward the resource description to the TaskQueueDB matcher.

        :param resourceDict: resource description dictionary
        :returns: whatever ``gTaskQueueDB.matchAndGetTaskQueue`` returns
        """
        matchResult = gTaskQueueDB.matchAndGetTaskQueue(resourceDict)
        return matchResult
示例#14
0
    def checkJob(self, job, classAdJob):
        """ Decide where a job may run and hand it over to the task queue.

        Steps, in order:
          1. a job rescheduled too recently is marked 'On Hold' and left alone;
          2. the user's site / banned-site requirements are resolved;
          3. usable and unusable sites are fetched from SiteStatus and, when
             the user's sites cannot be satisfied, the job is put on hold
             (unless its JobType is excluded from holding);
          4. a job without input data goes directly to the task queue;
          5. a job with input data is restricted to the optimizer's site
             candidates and, when required, a staging request is issued.

        :param job: job identifier
        :param classAdJob: job description object (queried for 'JobType')
        :returns: S_OK / S_ERROR structure, or the result of a helper call
        """
        self.log.verbose('Job %s will be processed' % (job))

        # --- 0. Honour the rescheduling delay ------------------------------
        attrResult = self.jobDB.getJobAttributes(
            job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
        if not attrResult['OK']:
            self.log.error(attrResult['Message'])
            return S_ERROR('Can not get job attributes from JobDB')
        jobAttrs = attrResult['Value']
        rescheduleCount = int(jobAttrs['RescheduleCounter'])
        if rescheduleCount != 0:
            lastReschedule = fromString(jobAttrs['RescheduleTime'])
            elapsed = toEpoch() - toEpoch(lastReschedule)
            # Default to the maximum delay; use the per-attempt delay when
            # the list has an entry for this reschedule count
            requiredDelay = self.maxRescheduleDelay
            if rescheduleCount <= len(self.rescheduleDelaysList):
                requiredDelay = self.rescheduleDelaysList[rescheduleCount - 1]
            if elapsed < requiredDelay:
                # Only update the status if it is not already flagged on hold
                holdTag = 'On Hold: after rescheduling'
                if holdTag not in jobAttrs['ApplicationStatus']:
                    self.jobDB.setJobStatus(
                        job,
                        application='On Hold: after rescheduling #%d' %
                        rescheduleCount)
                return S_OK()

        # --- 1. Site and BannedSites requested by the job -------------------
        siteRequirement = self.__getJobSiteRequirement(job, classAdJob)
        userBannedSites = siteRequirement['BannedSites']
        userSites = siteRequirement['Sites']

        if userSites:
            userSites = applySiteRequirements(userSites, [], userBannedSites)
            if not userSites:
                return S_ERROR('Impossible Site Requirement')

        # --- 2. Usable / unusable sites from the site status ----------------
        siteStatus = SiteStatus()
        usableResult = siteStatus.getUsableSites('ComputingAccess')
        unusableResult = siteStatus.getUnusableSites('ComputingAccess')

        if not usableResult['OK'] or not unusableResult['OK']:
            for siteResult in (usableResult, unusableResult):
                if not siteResult['OK']:
                    self.log.error(siteResult['Message'])
            # NOTE: the message mentions JobDB although the sites were
            # requested from SiteStatus; text kept unchanged.
            return S_ERROR('Can not get Active and Banned Sites from JobDB')

        usableSites = usableResult['Value']
        unusableSites = unusableResult['Value']

        if userSites and not applySiteRequirements(userSites, usableSites,
                                                   unusableSites):
            # Put on Hold only non-excluded job types
            jobType = classAdJob.getAttributeString('JobType')
            if jobType not in self.excludedOnHoldJobTypes:
                msg = 'On Hold: Requested site is Banned or not Active'
                self.log.info(msg)
                self.jobDB.setJobStatus(job, application=msg)
                return S_OK()

        # --- 3. Input data ---------------------------------------------------
        inputResult = self.jobDB.getInputData(job)
        if not inputResult['OK']:
            self.log.warn('Failed to get input data from JobDB for %s' % (job))
            self.log.error(inputResult['Message'])
            return S_ERROR('Failed to get input data from JobDB')

        if not inputResult['Value']:
            return self.__sendJobToTaskQueue(job, classAdJob, userSites,
                                             userBannedSites)

        # Discard empty entries; only non-empty LFNs count as input data
        inputData = [lfn for lfn in inputResult['Value'] if lfn]
        if not inputData:
            # With no input data requirement, job can proceed directly to task queue
            self.log.verbose('Job %s has no input data requirement' % (job))
            return self.__sendJobToTaskQueue(job, classAdJob, userSites,
                                             userBannedSites)

        self.log.verbose('Job %s has an input data requirement ' % (job))

        # --- 4. Optimizer information ---------------------------------------
        optResult = self.__checkOptimizerInfo(job)
        if not optResult['OK']:
            return optResult
        optInfo = optResult['Value']

        # Site candidates from the input data, filtered by user requirements
        optSites = optInfo['SiteCandidates'].keys()
        self.log.info('Input Data Site Candidates: %s' % (', '.join(optSites)))
        optSites = applySiteRequirements(optSites, userSites, userBannedSites)
        if not optSites:
            return S_ERROR('Impossible Site + InputData Requirement')

        # ... and then by the current usable / unusable sites
        sites = applySiteRequirements(optSites, usableSites, unusableSites)
        if not sites:
            msg = 'On Hold: InputData Site is Banned or not Active'
            self.log.info(msg)
            self.jobDB.setJobStatus(job, application=msg)
            return S_OK()

        # Set stager request as necessary, optimize for smallest #files on
        # tape if more than one site candidate left at this point
        checkStaging = self.__resolveSitesForStaging(job, sites, inputData,
                                                     optInfo['SiteCandidates'])
        if not checkStaging['OK']:
            return checkStaging

        destinationSites = checkStaging['SiteCandidates']
        if not destinationSites:
            return S_ERROR('No destination sites available')

        if not checkStaging['Value']:
            # No staging required, can proceed to task queue agent and then
            # waiting status
            self.log.verbose('Job %s does not require staging of input data' %
                             (job))
            return self.__sendJobToTaskQueue(job, classAdJob, destinationSites,
                                             userBannedSites)

        # --- 5. Staging required ---------------------------------------------
        self.log.verbose('Job %s requires staging of input data' % (job))
        # The first destination site is used for staging; count all of its
        # tape files as disk files from now on
        stagingSite = destinationSites[0]
        siteDict = optInfo['SiteCandidates'][stagingSite]
        siteDict['disk'] = siteDict['disk'] + siteDict['tape']
        siteDict['tape'] = 0
        optInfo['SiteCandidates'][stagingSite] = siteDict

        self.log.verbose(
            'Updating %s Optimizer Info for Job %s:' %
            (self.dataAgentName, job), optInfo)
        updateResult = self.setOptimizerJobInfo(job, self.dataAgentName,
                                                optInfo)
        if not updateResult['OK']:
            return updateResult

        # Site is selected for staging, report it
        self.log.verbose('Staging site candidate for job %s is %s' %
                         (job, stagingSite))

        stagingSitesResult = self.__getStagingSites(stagingSite,
                                                    destinationSites)
        if stagingSitesResult['OK']:
            stagingSites = stagingSitesResult['Value']
        else:
            stagingSites = [stagingSite]

        # Work out the value for the job's 'Site' attribute: the staging site
        # itself, the name of the site group, or 'Multiple' as a fallback
        if len(stagingSites) == 1:
            siteLabel = stagingSite
        else:
            siteLabel = 'Multiple'
            groupResult = self.__getSiteGroup(stagingSites)
            if groupResult['OK'] and groupResult['Value']:
                siteLabel = groupResult['Value']
        self.jobDB.setJobAttribute(job, 'Site', siteLabel)

        stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
        if not stagerDict['OK']:
            return stagerDict
        self.__updateOtherSites(job, stagingSite, stagerDict['Value'],
                                optInfo)
        return S_OK()