示例#1
0
    def getBannedSites(self, printOutput=False):
        """Retrieve current list of banned sites.

        The sites reported as unusable for 'ComputingAccess' by SiteStatus are
        returned in sorted order.

        Example usage:

        >>> print(diracAdmin.getBannedSites())
        {'OK': True, 'Value': []}

        :param printOutput: if True, also print the sites, one per line
        :returns: S_OK with the sorted list of banned sites, or the failed
                  result of the site status lookup (S_ERROR)
        """
        result = SiteStatus().getUnusableSites('ComputingAccess')
        if not result['OK']:
            self.log.warn(result['Message'])
            return result

        # Sort a copy rather than reordering in place the list handed back by
        # SiteStatus, which this method does not own.
        bannedSites = sorted(result['Value'])
        if printOutput:
            # Single-argument parenthesised form behaves the same under
            # Python 2 and Python 3.
            print('\n'.join(bannedSites))
        return S_OK(bannedSites)
示例#2
0
  def getBannedSites( self, printOutput = False ):
    """Retrieve current list of banned sites.

       The sites reported as unusable for 'ComputingAccess' by SiteStatus are
       returned in sorted order.

       Example usage:

       >>> print( diracAdmin.getBannedSites() )
       {'OK': True, 'Value': []}

       :param printOutput: if True, also print the sites, one per line
       :returns: S_OK with the sorted list of banned sites, or the failed
                 result of the site status lookup (S_ERROR)
    """
    siteStatus = SiteStatus()

    result = siteStatus.getUnusableSites( 'ComputingAccess' )
    if not result['OK']:
      self.log.warn( result['Message'] )
      return result

    # Build a new sorted list: the list inside the result belongs to
    # SiteStatus and must not be reordered in place from here.
    bannedSites = sorted( result['Value'] )
    if printOutput:
      # Parenthesised single argument: same output on Python 2 and Python 3
      print( '\n'.join( bannedSites ) )
    return S_OK( bannedSites )
示例#3
0
  def checkJob( self, job, classAdJob ):
    """Check the site and input data requirements of a job and route it.

       Depending on the checks the job is:

       * left on hold (S_OK is returned) while its reschedule delay has not
         elapsed, or while no requested / input data site is usable;
       * rejected with S_ERROR when a lookup fails or when the site and input
         data requirements cannot be satisfied together;
       * given a staging request when its input data must be staged;
       * handed to the task queue otherwise.

       :param job: job identifier passed to the JobDB and optimizer helpers
       :param classAdJob: job description; queried here for its 'JobType'
       :returns: S_OK,S_ERROR
    """
    self.log.verbose( 'Job %s will be processed' % ( job ) )

    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes( job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'] )
    if not result['OK']:
      self.log.error( result['Message'] )
      return S_ERROR( 'Can not get job attributes from JobDB' )
    jobDict = result['Value']
    reCounter = int( jobDict['RescheduleCounter'] )
    if reCounter != 0:
      reTime = fromString( jobDict['RescheduleTime'] )
      delta = toEpoch() - toEpoch( reTime )
      # The n-th reschedule uses the n-th configured delay; past the end of
      # the list the maximum delay applies.
      delay = self.maxRescheduleDelay
      if reCounter <= len( self.rescheduleDelaysList ):
        delay = self.rescheduleDelaysList[reCounter - 1]
      if delta < delay:
        # Do not rewrite the status if the job is already flagged as on hold
        if 'On Hold: after rescheduling' not in jobDict['ApplicationStatus']:
          self.__setHoldStatus( job, 'On Hold: after rescheduling #%d' % reCounter )
        return S_OK()

    # First, get Site and BannedSites from the Job
    result = self.__getJobSiteRequirement( job, classAdJob )
    userBannedSites = result['BannedSites']
    userSites = result['Sites']

    if userSites:
      userSites = applySiteRequirements( userSites, [], userBannedSites )
      if not userSites:
        return S_ERROR( 'Impossible Site Requirement' )

    # Second, get the Active and Banned sites from the RSS
    siteStatus = SiteStatus()
    usableSites = siteStatus.getUsableSites( 'ComputingAccess' )
    unusableSites = siteStatus.getUnusableSites( 'ComputingAccess' )

    # Report every failed lookup before giving up
    failedLookups = [ lookup for lookup in ( usableSites, unusableSites ) if not lookup['OK'] ]
    if failedLookups:
      for lookup in failedLookups:
        self.log.error( lookup['Message'] )
      return S_ERROR( 'Can not get Active and Banned Sites from JobDB' )

    usableSites = usableSites['Value']
    unusableSites = unusableSites['Value']

    if userSites:
      sites = applySiteRequirements( userSites, usableSites, unusableSites )
      if not sites:
        # Put on Hold only non-excluded job types
        jobType = classAdJob.getAttributeString( 'JobType' )
        if jobType not in self.excludedOnHoldJobTypes:
          msg = 'On Hold: Requested site is Banned or not Active'
          self.log.info( msg )
          self.__setHoldStatus( job, msg )
          return S_OK()

    # Third, check if there is input data
    result = self.jobDB.getInputData( job )
    if not result['OK']:
      self.log.warn( 'Failed to get input data from JobDB for %s' % ( job ) )
      self.log.error( result['Message'] )
      return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
      return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    # Empty entries do not count as input data
    inputData = [ lfn for lfn in result['Value'] if lfn ]
    if not inputData:
      #With no input data requirement, job can proceed directly to task queue
      self.log.verbose( 'Job %s has no input data requirement' % ( job ) )
      return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    self.log.verbose( 'Job %s has an input data requirement ' % ( job ) )

    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo( job )
    if not result['OK']:
      return result

    optInfo = result['Value']

    #Compare site candidates with current mask
    optSites = list( optInfo['SiteCandidates'] )
    self.log.info( 'Input Data Site Candidates: %s' % ( ', '.join( optSites ) ) )
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements( optSites, userSites, userBannedSites )
    if not optSites:
      return S_ERROR( 'Impossible Site + InputData Requirement' )

    sites = applySiteRequirements( optSites, usableSites, unusableSites )
    if not sites:
      msg = 'On Hold: InputData Site is Banned or not Active'
      self.log.info( msg )
      self.__setHoldStatus( job, msg )
      return S_OK()

    #Set stager request as necessary, optimize for smallest #files on tape if
    #more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging( job, sites, inputData, optInfo['SiteCandidates'] )
    if not checkStaging['OK']:
      return checkStaging

    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
      return S_ERROR( 'No destination sites available' )

    if not checkStaging['Value']:
      #No staging required, can proceed to task queue agent and then waiting status
      self.log.verbose( 'Job %s does not require staging of input data' % ( job ) )
      return self.__sendJobToTaskQueue( job, classAdJob, destinationSites, userBannedSites )

    #Single site candidate chosen and staging required
    self.log.verbose( 'Job %s requires staging of input data' % ( job ) )
    # set all LFN to disk for the selected site
    stagingSite = destinationSites[0]
    siteDict = optInfo['SiteCandidates'][stagingSite]
    siteDict['disk'] = siteDict['disk'] + siteDict['tape']
    siteDict['tape'] = 0

    optInfo['SiteCandidates'][stagingSite] = siteDict
    self.log.verbose( 'Updating %s Optimizer Info for Job %s:' % ( self.dataAgentName, job ), optInfo )
    result = self.setOptimizerJobInfo( job, self.dataAgentName, optInfo )
    if not result['OK']:
      return result

    # Site is selected for staging, report it
    self.log.verbose( 'Staging site candidate for job %s is %s' % ( job, stagingSite ) )

    # Fall back to the staging site alone when the lookup fails
    result = self.__getStagingSites( stagingSite, destinationSites )
    stagingSites = result['Value'] if result['OK'] else [stagingSite]

    if len( stagingSites ) == 1:
      siteName = stagingSite
    else:
      # Get the name of the site group; 'Multiple' when the lookup fails or
      # returns no group name
      result = self.__getSiteGroup( stagingSites )
      groupName = result['Value'] if result['OK'] else ''
      siteName = groupName or 'Multiple'
    self.jobDB.setJobAttribute( job, 'Site', siteName )

    stagerDict = self.__setStagingRequest( job, stagingSite, optInfo )
    if not stagerDict['OK']:
      return stagerDict
    self.__updateOtherSites( job, stagingSite, stagerDict['Value'], optInfo )
    return S_OK()

  def __setHoldStatus( self, job, status ):
    """Set the application status of a held job, warning if the update fails.

       :param job: job identifier
       :param status: application status string to record
       :returns: the result of jobDB.setJobStatus
    """
    result = self.jobDB.setJobStatus( job, application = status )
    if not result['OK']:
      # The job is held either way, so the failure is reported and not raised
      self.log.warn( 'Failed to set application status of job %s: %s' % ( job, result['Message'] ) )
    return result
示例#4
0
  def optimizeJob( self, jid, jobState ):
    """Hold the job, request staging of its input data or send it to the task queue.

       Steps, in order:

       * hold the job while the delay following a reschedule has not elapsed;
       * hold the job when none of the sites requested by the user is usable
         (skipped for job types listed in the ExcludedOnHoldJobTypes option);
       * without input data, send the job to the task queue;
       * otherwise restrict the input data site candidates to the user
         requirements, drop candidates whose disk + tape replica count does not
         match the number of input files, then either send the job to the task
         queue or request staging at the selected site.

       :param jid: job identifier (not used inside this method)
       :param jobState: job state object queried for attributes and input data
       :returns: S_ERROR on failure, otherwise the result of the hold, task
                 queue or job site helper that handled the job
    """
    # Reschedule delay
    result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
    if not result[ 'OK' ]:
      return result
    attDict = result[ 'Value' ]
    try:
      reschedules = int( attDict[ 'RescheduleCounter' ] )
    except ( ValueError, TypeError ):
      # TypeError covers a missing (None) counter, ValueError a malformed one
      return S_ERROR( "RescheduleCounter has to be an integer" )
    if reschedules != 0:
      delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
      # An empty option means there is no delay to enforce
      if delays:
        # The n-th reschedule waits for the n-th configured delay (index n - 1);
        # reschedules past the end of the list keep using the last delay.
        delay = delays[ min( max( reschedules, 1 ), len( delays ) ) - 1 ]
        waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
        if waited < delay:
          return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )

    # Get site requirements
    result = self._getSitesRequired( jobState )
    if not result[ 'OK' ]:
      return result
    userSites, userBannedSites = result[ 'Value' ]

    # Get active and banned sites from DIRAC
    siteStatus = SiteStatus()
    result = siteStatus.getUsableSites( 'ComputingAccess' )
    if not result[ 'OK' ]:
      self.jobLog.error( "Cannot retrieve active sites: %s" % result[ 'Message' ] )
      return S_ERROR( "Cannot retrieve active sites from JobDB" )
    usableSites = result[ 'Value' ]
    result = siteStatus.getUnusableSites( 'ComputingAccess' )
    if not result[ 'OK' ]:
      self.jobLog.error( "Cannot retrieve banned sites: %s" % result[ 'Message' ] )
      return S_ERROR( "Cannot retrieve banned sites from JobDB" )
    unusableSites = result[ 'Value' ]

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
      result = jobState.getAttribute( "JobType" )
      if not result[ 'OK' ]:
        return S_ERROR( "Could not retrieve job type" )
      jobType = result[ 'Value' ]
      if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
        sites = self._applySiteFilter( userSites, usableSites, unusableSites )
        if not sites:
          return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( userSites ) )

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
      return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
      # No input data? Generate requirements and next
      return self.__sendToTQ( jobState, userSites, userBannedSites )

    inputData = result[ 'Value' ]

    self.jobLog.verbose( 'Has an input data requirement' )
    idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
    result = self.retrieveOptimizerParam( idAgent )
    if not result['OK']:
      self.jobLog.error( "Could not retrieve input data info: %s" % result[ 'Message' ] )
      return S_ERROR( "File Catalog Access Failure" )
    opData = result[ 'Value' ]
    if 'SiteCandidates' not in opData:
      return S_ERROR( "No possible site candidates" )

    # Filter input data sites with user requirement
    siteCandidates = list( opData[ 'SiteCandidates' ] )
    self.jobLog.info( "Site candidates are %s" % siteCandidates )

    siteCandidates = self._applySiteFilter( siteCandidates, userSites, userBannedSites )
    if not siteCandidates:
      return S_ERROR( "Impossible InputData * Site requirements" )

    # Keep only the candidates with the correct count of disk+tape replicas
    numData = len( inputData )
    idSites = {}
    for site in siteCandidates:
      siteData = opData[ 'SiteCandidates' ][ site ]
      if numData != siteData[ 'disk' ] + siteData[ 'tape' ]:
        self.jobLog.error( "Site candidate %s does not have all the input data" % site )
        continue
      idSites[ site ] = siteData
    if not idSites:
      return S_ERROR( "Site candidates do not have all the input data" )

    #Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
    if not siteCandidates:
      return S_ERROR( "No destination sites available" )

    # Is any site active?
    stageSites = self._applySiteFilter( siteCandidates, usableSites, unusableSites )
    if not stageSites:
      return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )

    # If no staging is required send to TQ
    if not stageRequired:
      # Use siteCandidates and not stageSites because active and banned sites
      # will be taken into account on matching time
      return self.__sendToTQ( jobState, siteCandidates, userBannedSites )

    # Check if the user is allowed to stage
    if self.ex_getOption( "RestrictDataStage", False ):
      if not self.__checkStageAllowed( jobState ):
        return S_ERROR( "Stage not allowed" )

    # Take stageSites[0]: per the original authors the list has already been
    # randomized, so the first entry is as good as any other
    stageSite = stageSites[0]
    self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
    stageData = idSites[ stageSite ]
    # Set as if everything has already been staged
    stageData[ 'disk' ] += stageData[ 'tape' ]
    stageData[ 'tape' ] = 0
    # Set the site info back to the original dict to save afterwards
    opData[ 'SiteCandidates' ][ stageSite ] = stageData

    result = self.__requestStaging( jobState, stageSite, opData )
    if not result[ 'OK' ]:
      return result
    stageLFNs = result[ 'Value' ]
    self._updateSharedSESites( stageSite, stageLFNs, opData )
    # Save the optimizer data again
    self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
    result = self.storeOptimizerParam( idAgent, opData )
    if not result[ 'OK' ]:
      return result

    return self._setJobSite( jobState, stageSites )
示例#5
0
    def checkJob(self, job, classAdJob):
        """Check the site and input data requirements of a job and route it.

        Depending on the checks the job is:

        * left on hold (S_OK is returned) while its reschedule delay has not
          elapsed, or while no requested / input data site is usable;
        * rejected with S_ERROR when a lookup fails or when the site and
          input data requirements cannot be satisfied together;
        * given a staging request when its input data must be staged;
        * handed to the task queue otherwise.

        :param job: job identifier passed to the JobDB and optimizer helpers
        :param classAdJob: job description; queried here for its 'JobType'
        :returns: S_OK,S_ERROR
        """
        self.log.verbose('Job %s will be processed' % (job))

        # Check if the job was recently rescheduled
        result = self.jobDB.getJobAttributes(
            job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
        if not result['OK']:
            self.log.error(result['Message'])
            return S_ERROR('Can not get job attributes from JobDB')
        jobDict = result['Value']
        reCounter = int(jobDict['RescheduleCounter'])
        if reCounter != 0:
            reTime = fromString(jobDict['RescheduleTime'])
            delta = toEpoch() - toEpoch(reTime)
            # The n-th reschedule uses the n-th configured delay; past the
            # end of the list the maximum delay applies.
            delay = self.maxRescheduleDelay
            if reCounter <= len(self.rescheduleDelaysList):
                delay = self.rescheduleDelaysList[reCounter - 1]
            if delta < delay:
                # Do not rewrite the status if the job is already on hold
                if 'On Hold: after rescheduling' not in jobDict['ApplicationStatus']:
                    self.__setHoldStatus(
                        job, 'On Hold: after rescheduling #%d' % reCounter)
                return S_OK()

        # First, get Site and BannedSites from the Job
        result = self.__getJobSiteRequirement(job, classAdJob)
        userBannedSites = result['BannedSites']
        userSites = result['Sites']

        if userSites:
            userSites = applySiteRequirements(userSites, [], userBannedSites)
            if not userSites:
                return S_ERROR('Impossible Site Requirement')

        # Second, get the Active and Banned sites from the RSS
        siteStatus = SiteStatus()
        usableSites = siteStatus.getUsableSites('ComputingAccess')
        unusableSites = siteStatus.getUnusableSites('ComputingAccess')

        # Report every failed lookup before giving up
        failedLookups = [lookup for lookup in (usableSites, unusableSites)
                         if not lookup['OK']]
        if failedLookups:
            for lookup in failedLookups:
                self.log.error(lookup['Message'])
            return S_ERROR('Can not get Active and Banned Sites from JobDB')

        usableSites = usableSites['Value']
        unusableSites = unusableSites['Value']

        if userSites:
            sites = applySiteRequirements(userSites, usableSites,
                                          unusableSites)
            if not sites:
                # Put on Hold only non-excluded job types
                jobType = classAdJob.getAttributeString('JobType')
                if jobType not in self.excludedOnHoldJobTypes:
                    msg = 'On Hold: Requested site is Banned or not Active'
                    self.log.info(msg)
                    self.__setHoldStatus(job, msg)
                    return S_OK()

        # Third, check if there is input data
        result = self.jobDB.getInputData(job)
        if not result['OK']:
            self.log.warn('Failed to get input data from JobDB for %s' % (job))
            self.log.error(result['Message'])
            return S_ERROR('Failed to get input data from JobDB')

        if not result['Value']:
            return self.__sendJobToTaskQueue(job, classAdJob, userSites,
                                             userBannedSites)

        # Empty entries do not count as input data
        inputData = [lfn for lfn in result['Value'] if lfn]
        if not inputData:
            #With no input data requirement, job can proceed directly to task queue
            self.log.verbose('Job %s has no input data requirement' % (job))
            return self.__sendJobToTaskQueue(job, classAdJob, userSites,
                                             userBannedSites)

        self.log.verbose('Job %s has an input data requirement ' % (job))

        # Fourth, Check all optimizer information
        result = self.__checkOptimizerInfo(job)
        if not result['OK']:
            return result

        optInfo = result['Value']

        #Compare site candidates with current mask
        optSites = list(optInfo['SiteCandidates'])
        self.log.info('Input Data Site Candidates: %s' % (', '.join(optSites)))
        # Check that it is compatible with user requirements
        optSites = applySiteRequirements(optSites, userSites, userBannedSites)
        if not optSites:
            return S_ERROR('Impossible Site + InputData Requirement')

        sites = applySiteRequirements(optSites, usableSites, unusableSites)
        if not sites:
            msg = 'On Hold: InputData Site is Banned or not Active'
            self.log.info(msg)
            self.__setHoldStatus(job, msg)
            return S_OK()

        #Set stager request as necessary, optimize for smallest #files on tape if
        #more than one site candidate left at this point
        checkStaging = self.__resolveSitesForStaging(job, sites, inputData,
                                                     optInfo['SiteCandidates'])
        if not checkStaging['OK']:
            return checkStaging

        destinationSites = checkStaging['SiteCandidates']
        if not destinationSites:
            return S_ERROR('No destination sites available')

        if not checkStaging['Value']:
            #No staging required, can proceed to task queue agent and then waiting status
            self.log.verbose('Job %s does not require staging of input data' %
                             (job))
            return self.__sendJobToTaskQueue(job, classAdJob, destinationSites,
                                             userBannedSites)

        #Single site candidate chosen and staging required
        self.log.verbose('Job %s requires staging of input data' % (job))
        # set all LFN to disk for the selected site
        stagingSite = destinationSites[0]
        siteDict = optInfo['SiteCandidates'][stagingSite]
        siteDict['disk'] = siteDict['disk'] + siteDict['tape']
        siteDict['tape'] = 0

        optInfo['SiteCandidates'][stagingSite] = siteDict
        self.log.verbose(
            'Updating %s Optimizer Info for Job %s:' %
            (self.dataAgentName, job), optInfo)
        result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
        if not result['OK']:
            return result

        # Site is selected for staging, report it
        self.log.verbose('Staging site candidate for job %s is %s' %
                         (job, stagingSite))

        # Fall back to the staging site alone when the lookup fails
        result = self.__getStagingSites(stagingSite, destinationSites)
        stagingSites = result['Value'] if result['OK'] else [stagingSite]

        if len(stagingSites) == 1:
            siteName = stagingSite
        else:
            # Get the name of the site group; 'Multiple' when the lookup
            # fails or returns no group name
            result = self.__getSiteGroup(stagingSites)
            groupName = result['Value'] if result['OK'] else ''
            siteName = groupName or 'Multiple'
        self.jobDB.setJobAttribute(job, 'Site', siteName)

        stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
        if not stagerDict['OK']:
            return stagerDict
        self.__updateOtherSites(job, stagingSite, stagerDict['Value'],
                                optInfo)
        return S_OK()

    def __setHoldStatus(self, job, status):
        """Set the application status of a held job, warning if the update fails.

        :param job: job identifier
        :param status: application status string to record
        :returns: the result of jobDB.setJobStatus
        """
        result = self.jobDB.setJobStatus(job, application=status)
        if not result['OK']:
            # The job is held either way, so the failure is reported and not raised
            self.log.warn('Failed to set application status of job %s: %s' %
                          (job, result['Message']))
        return result