示例#1
0
    def __getLatestUpdateTime(self, job):
        """Return the most recent of the job's HeartBeatTime and LastUpdateTime.

        Both attributes are read from the job DB and converted with
        ``toEpoch(fromString(...))``; an attribute that is empty or holds the
        string "None" is treated as null and skipped.

        :param job: job identifier handed to ``jobDB.getJobAttributes``
        :return: S_OK(latest epoch value), or S_ERROR when the attributes
                 cannot be retrieved or both time stamps are null
        """
        reply = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"])
        lookupFailed = not reply["OK"]
        if lookupFailed:
            self.log.error("Failed to get job attributes", reply["Message"])
        if lookupFailed or not reply["Value"]:
            self.log.error("Could not get attributes for job", "%s" % job)
            return S_ERROR("Could not get attributes for job")

        self.log.verbose(reply)
        attributes = reply["Value"]

        # Start from 0 so that "nothing usable found" stays falsy at the end
        newest = 0
        heartBeat = attributes["HeartBeatTime"]
        if heartBeat and heartBeat != "None":
            newest = toEpoch(fromString(heartBeat))
        else:
            self.log.verbose("HeartBeatTime is null for job %s" % job)

        lastUpdate = attributes["LastUpdateTime"]
        if lastUpdate and lastUpdate != "None":
            newest = max(newest, toEpoch(fromString(lastUpdate)))
        else:
            self.log.verbose("LastUpdateTime is null for job %s" % job)

        if newest:
            self.log.verbose("Latest update time from epoch for job %s is %s" % (job, newest))
            return S_OK(newest)
        return S_ERROR("LastUpdate and HeartBeat times are null for job %s" % job)
示例#2
0
    def __getLatestUpdateTime(self, job):
        """Pick the newer of HeartBeatTime and LastUpdateTime for a job.

        :param job: job identifier handed to ``jobDB.getJobAttributes``
        :return: S_OK(epoch value of the newer time stamp), or S_ERROR when
                 the attributes are unavailable or both time stamps are null
        """
        response = self.jobDB.getJobAttributes(
            job, ['HeartBeatTime', 'LastUpdateTime'])
        if response['OK']:
            attributes = response['Value']
        else:
            self.log.error(response['Message'])
            attributes = None
        if not attributes:
            return S_ERROR('Could not get attributes for job %s' % job)

        self.log.verbose(response)

        def isNull(stamp):
            # A null time stamp is either empty or the literal text 'None'
            return not stamp or stamp == 'None'

        heartBeat = attributes['HeartBeatTime']
        if isNull(heartBeat):
            self.log.verbose('HeartBeatTime is null for job %s' % job)
            latestUpdate = 0
        else:
            latestUpdate = toEpoch(fromString(heartBeat))

        lastUpdateStamp = attributes['LastUpdateTime']
        if isNull(lastUpdateStamp):
            self.log.verbose('LastUpdateTime is null for job %s' % job)
        else:
            lastUpdate = toEpoch(fromString(lastUpdateStamp))
            latestUpdate = lastUpdate if latestUpdate < lastUpdate else latestUpdate

        if latestUpdate:
            self.log.verbose('Latest update time from epoch for job %s is %s' %
                             (job, latestUpdate))
            return S_OK(latestUpdate)
        return S_ERROR(
            'LastUpdate and HeartBeat times are null for job %s' % job)
示例#3
0
  def __getLatestUpdateTime( self, job ):
    """ Work out the latest of the HeartBeatTime and LastUpdateTime job attributes.

    A time stamp that is empty or equal to the string 'None' counts as null.

    :param job: job identifier handed to jobDB.getJobAttributes
    :return: S_OK( latest epoch value ), or S_ERROR if the attributes cannot be
             retrieved or both time stamps are null
    """
    queryResult = self.jobDB.getJobAttributes( job, ['HeartBeatTime', 'LastUpdateTime'] )
    if not queryResult['OK']:
      self.log.error( 'Failed to get job attributes', queryResult['Message'] )
    if not ( queryResult['OK'] and queryResult['Value'] ):
      self.log.error( 'Could not get attributes for job', '%s' % job )
      return S_ERROR( 'Could not get attributes for job' )

    self.log.verbose( queryResult )
    timeStamps = queryResult['Value']

    heartBeatEpoch = 0
    heartBeat = timeStamps['HeartBeatTime']
    if not heartBeat or heartBeat == 'None':
      self.log.verbose( 'HeartBeatTime is null for job %s' % job )
    else:
      heartBeatEpoch = toEpoch( fromString( heartBeat ) )

    # The heart beat is the reference; the last update only wins if it is newer
    latestUpdate = heartBeatEpoch
    lastUpdate = timeStamps['LastUpdateTime']
    if not lastUpdate or lastUpdate == 'None':
      self.log.verbose( 'LastUpdateTime is null for job %s' % job )
    else:
      lastUpdateEpoch = toEpoch( fromString( lastUpdate ) )
      if lastUpdateEpoch > latestUpdate:
        latestUpdate = lastUpdateEpoch

    if not latestUpdate:
      return S_ERROR( 'LastUpdate and HeartBeat times are null for job %s' % job )

    self.log.verbose( 'Latest update time from epoch for job %s is %s' % ( job, latestUpdate ) )
    return S_OK( latestUpdate )
示例#4
0
  def _failStalledJobs(self, jobID):
    """
    Set a Stalled job to Failed when its pilot is not running, or when its
    latest update is older than ``self.failedTime``.

    Run inside thread.

    :param jobID: identifier of the job to check
    :return: the failing lookup result if the pilot status or the update time
             cannot be obtained, the result of the status update when the job
             is failed, S_OK() otherwise
    """
    pilotReply = self.__getJobPilotStatus(jobID)
    if not pilotReply['OK']:
      self.log.error('Failed to get pilot status',
                     "for job %d: %s" % (jobID, pilotReply['Message']))
      return pilotReply

    # Holds the minor status to apply; stays False when the job is left alone
    minorStatus = False
    if pilotReply['Value'] != "Running":
      # The job pilot is lost
      minorStatus = self.minorStalledStatuses[0]
    else:
      # The pilot is running: fail only after a long enough silence
      updateReply = self.__getLatestUpdateTime(jobID)
      if not updateReply['OK']:
        self.log.error('Failed to get job update time',
                       "for job %d: %s" % (jobID, updateReply['Message']))
        return updateReply
      silentFor = toEpoch() - updateReply['Value']
      if silentFor > self.failedTime:
        minorStatus = self.minorStalledStatuses[1]

    if not minorStatus:
      return S_OK()

    # Send a kill signal in case the job is not really dead, then mark it Failed
    self.__sendKillCommand(jobID)  # always returns None
    return self.__updateJobStatus(jobID, JobStatus.FAILED, minorStatus=minorStatus)
示例#5
0
    def _failStalledJobs(self, jobID):
        """
        Fail (or reschedule) a Stalled job whose pilot is not running or whose
        latest update is older than ``self.failedTime``.

        Run inside thread.

        :param jobID: identifier of the job to check
        :return: the failing lookup result when the pilot status, update time
                 or site cannot be obtained, the result of the status update
                 when the job is failed or rescheduled, S_OK() otherwise
        """
        pilotResult = self._getJobPilotStatus(jobID)
        if not pilotResult["OK"]:
            self.log.error("Failed to get pilot status",
                           "for job %d: %s" % (jobID, pilotResult["Message"]))
            return pilotResult

        # Minor status to apply; remains False when nothing has to be done
        minorStatus = False
        if pilotResult["Value"] != "Running":
            # The job pilot is lost
            minorStatus = self.minorStalledStatuses[0]
        else:
            # Verify that there was no sign of life for long enough
            updateResult = self._getLatestUpdateTime(jobID)
            if not updateResult["OK"]:
                self.log.error("Failed to get job update time",
                               "for job %d: %s" % (jobID, updateResult["Message"]))
                return updateResult
            if toEpoch() - updateResult["Value"] > self.failedTime:
                minorStatus = self.minorStalledStatuses[1]

        if not minorStatus:
            return S_OK()

        # Send a kill signal in case the job is not really dead
        self._sendKillCommand(jobID)  # always returns None

        # Jobs at the configured sites are rescheduled instead of failed
        if self.stalledJobsToRescheduleSites:
            siteResult = self.jobDB.getJobAttribute(jobID, "site")
            if not siteResult["OK"]:
                return siteResult
            if siteResult["Value"] in self.stalledJobsToRescheduleSites:
                return self._updateJobStatus(jobID,
                                             JobStatus.RESCHEDULED,
                                             minorStatus=minorStatus,
                                             force=True)

        return self._updateJobStatus(jobID,
                                     JobStatus.FAILED,
                                     minorStatus=minorStatus)
示例#6
0
    def _getLatestUpdateTime(self, job):
        """Return the most recent of HeartBeatTime and LastUpdateTime.

        :param job: job identifier handed to ``jobDB.getJobAttributes``
        :return: S_OK(latest epoch value), or S_ERROR when the attributes
                 cannot be retrieved or both time stamps are null
        """
        response = self.jobDB.getJobAttributes(
            job, ["HeartBeatTime", "LastUpdateTime"])
        if not (response["OK"] and response["Value"]):
            # A successful but empty reply carries no "Message" entry
            reason = response["Message"] if "Message" in response else "empty"
            self.log.error("Failed to get job attributes",
                           "for job %d: %s" % (job, reason))
            return S_ERROR("Could not get attributes for job")

        attributes = response["Value"]
        newest = 0

        heartBeat = attributes["HeartBeatTime"]
        if heartBeat and heartBeat != "None":
            newest = toEpoch(fromString(heartBeat))
        else:
            self.log.verbose("HeartBeatTime is null", "for job %s" % job)

        lastUpdate = attributes["LastUpdateTime"]
        if lastUpdate and lastUpdate != "None":
            newest = max(newest, toEpoch(fromString(lastUpdate)))
        else:
            self.log.verbose("LastUpdateTime is null", "for job %s" % job)

        if newest:
            self.log.verbose(
                "", "Latest update time from epoch for job %s is %s" %
                (job, newest))
            return S_OK(newest)
        return S_ERROR(
            "LastUpdate and HeartBeat times are null for job %s" % job)
示例#7
0
  def __checkJobStalled(self, job, stalledTime):
    """ Tell whether the job's latest update (the most recent of LastUpdateTime
    and HeartBeatTime) is older than the stalledTime limit.

    :param job: job identifier
    :param stalledTime: limit compared against the time elapsed since the
                        latest update
    :return: S_OK('Stalled') when the limit is exceeded; otherwise S_ERROR,
             or the failing result of the update time lookup
    """
    updateReply = self.__getLatestUpdateTime(job)
    if not updateReply['OK']:
      return updateReply

    sinceUpdate = toEpoch() - updateReply['Value']
    self.log.debug('(CurrentTime-LastUpdate) = %s secs' % sinceUpdate)
    if not sinceUpdate > stalledTime:
      # Recent enough: reported as an error so that the caller skips the job
      return S_ERROR('Job %s is running and will be ignored' % job)

    self.log.info('Job is identified as stalled',
                  ": jobID %d with last update > %s secs ago" % (job, sinceUpdate))
    return S_OK('Stalled')
示例#8
0
  def __getStalledJob( self, job, stalledTime ):
    """ Check the job's latest update (the most recent of LastUpdateTime and
    HeartBeatTime) against the stalledTime limit.

    :param job: job identifier
    :param stalledTime: limit compared against the time elapsed since the
                        latest update
    :return: S_OK( 'Stalled' ) when the limit is exceeded; otherwise S_ERROR,
             or the failing result of the update time lookup
    """
    updateResult = self.__getLatestUpdateTime( job )
    if not updateResult['OK']:
      return updateResult

    elapsedTime = toEpoch() - updateResult['Value']
    self.log.verbose( '(CurrentTime-LastUpdate) = %s secs' % elapsedTime )

    if not elapsedTime > stalledTime:
      # The job gave a recent sign of life
      return S_ERROR( 'Job %s is running and will be ignored' % job )

    self.log.info( 'Job %s is identified as stalled with last update > %s secs ago' % ( job, elapsedTime ) )
    return S_OK( 'Stalled' )
示例#9
0
  def __getStalledJob( self, job, stalledTime ):
    """ Report a job as stalled when the time elapsed since its latest update
    (the most recent of LastUpdateTime and HeartBeatTime) exceeds stalledTime.

    :param job: job identifier
    :param stalledTime: limit for the elapsed time
    :return: S_OK( 'Stalled' ) for a stalled job, S_ERROR for a job still
             considered running, or the failing update time lookup result
    """
    reply = self.__getLatestUpdateTime( job )
    if reply['OK']:
      now = toEpoch()
      sinceLastUpdate = now - reply['Value']
      self.log.verbose( '(CurrentTime-LastUpdate) = %s secs' % sinceLastUpdate )

      if sinceLastUpdate > stalledTime:
        self.log.info( 'Job %s is identified as stalled with last update > %s secs ago' % ( job, sinceLastUpdate ) )
        return S_OK( 'Stalled' )

      return S_ERROR( 'Job %s is running and will be ignored' % job )

    # The lookup failed: hand its error back unchanged
    return reply
示例#10
0
  def __failStalledJobs( self, failedTime ):
    """ Changes the Stalled status to Failed for jobs long in the Stalled status

    A Stalled job is set to Failed when its pilot is known not to be running,
    or when the time since its latest update exceeds failedTime. Accounting is
    sent for every job failed here, and afterwards for previously failed
    stalled jobs still flagged as not accounted.

    A job whose latest update time cannot be determined is logged and skipped,
    so that one broken job does not prevent the remaining jobs from being
    checked nor the accounting recovery from running.

    :param failedTime: limit for the time since the latest update; also
                       formatted with %d into the minor status
    :return: S_OK( number of jobs set to Failed ), or the failing result of a
             jobDB.selectJobs call
    """

    result = self.jobDB.selectJobs( {'Status':'Stalled'} )
    if not result['OK']:
      return result
    jobs = result['Value']

    pilotLostStatus = "Job stalled: pilot not running"
    stalledTooLongStatus = 'Stalling for more than %d sec' % failedTime

    failedCounter = 0

    if jobs:
      self.log.info( '%s Stalled jobs will be checked for failure' % ( len( jobs ) ) )

      for job in jobs:
        minorStatus = None

        # Check if the job pilot is lost
        result = self.__getJobPilotStatus( job )
        if not result['OK']:
          # Not fatal: the decision falls back on the job time stamps below
          self.log.error( 'Failed to get pilot status', result['Message'] )

        if result['OK'] and result['Value'] != "Running":
          minorStatus = pilotLostStatus
        else:
          # Pilot running or unknown: fail only after a long enough silence
          result = self.__getLatestUpdateTime( job )
          if not result['OK']:
            # Skip this job only; do not abort the whole pass
            self.log.error( 'Failed to get job update time', result['Message'] )
            continue
          if toEpoch() - result['Value'] > failedTime:
            minorStatus = stalledTooLongStatus

        if not minorStatus:
          continue

        self.__updateJobStatus( job, 'Failed', minorStatus )
        failedCounter += 1
        result = self.__sendAccounting( job )
        if not result['OK']:
          # Stop failing further jobs in this pass once accounting cannot be sent
          self.log.error( 'Failed to send accounting', result['Message'] )
          break

    recoverCounter = 0

    # Send accounting for stalled jobs failed earlier but not yet accounted
    for minor in ( pilotLostStatus, stalledTooLongStatus ):
      result = self.jobDB.selectJobs( {'Status':'Failed', 'MinorStatus': minor, 'AccountedFlag': 'False' } )
      if not result['OK']:
        return result
      if result['Value']:
        jobs = result['Value']
        self.log.info( '%s Stalled jobs will be Accounted' % ( len( jobs ) ) )
        for job in jobs:
          result = self.__sendAccounting( job )
          if not result['OK']:
            self.log.error( 'Failed to send accounting', result['Message'] )
            continue

          recoverCounter += 1
      # result now holds the last accounting attempt, if there was one:
      # give up on the remaining minor status when that attempt failed
      if not result['OK']:
        break

    if failedCounter:
      self.log.info( '%d jobs set to Failed' % failedCounter )
    if recoverCounter:
      self.log.info( '%d jobs properly Accounted' % recoverCounter )
    return S_OK( failedCounter )
示例#11
0
  def __failStalledJobs( self, failedTime ):
    """ Set to Failed the Stalled jobs whose pilot is not running or whose
    latest update is older than failedTime, then send the accounting that is
    still missing for failed stalled jobs.

    :param failedTime: limit for the time since the latest update; also
                       formatted with %d into the minor status
    :return: S_OK( number of jobs set to Failed ), or the failing result of a
             jobDB.selectJobs call
    """

    selection = self.jobDB.selectJobs( {'Status':'Stalled'} )
    if not selection['OK']:
      return selection
    stalledJobs = selection['Value']

    failedCounter = 0
    minorStalledStatuses = ( "Job stalled: pilot not running", 'Stalling for more than %d sec' % failedTime )

    if stalledJobs:
      self.log.info( '%s Stalled jobs will be checked for failure' % ( len( stalledJobs ) ) )

      for job in stalledJobs:
        # Check if the job pilot is lost
        pilotReply = self.__getJobPilotStatus( job )
        if not pilotReply['OK']:
          self.log.error( 'Failed to get pilot status', pilotReply['Message'] )
          continue

        minorStatus = False
        if pilotReply['Value'] != "Running":
          minorStatus = minorStalledStatuses[0]
        else:
          updateReply = self.__getLatestUpdateTime( job )
          if not updateReply['OK']:
            self.log.error( 'Failed to get job update time', updateReply['Message'] )
            continue
          if toEpoch() - updateReply['Value'] > failedTime:
            minorStatus = minorStalledStatuses[1]

        if not minorStatus:
          continue

        # Send a kill signal in case the job is not really dead, set it
        # Failed and send the accounting info
        self.__sendKillCommand(job)
        self.__updateJobStatus( job, 'Failed', minorStatus )
        failedCounter += 1
        accountingReply = self.__sendAccounting( job )
        if not accountingReply['OK']:
          self.log.error( 'Failed to send accounting', accountingReply['Message'] )

    recoverCounter = 0

    for minor in minorStalledStatuses:
      pending = self.jobDB.selectJobs( {'Status':'Failed', 'MinorStatus': minor, 'AccountedFlag': 'False' } )
      if not pending['OK']:
        return pending

      # Result of the last accounting attempt for this minor status, if any
      lastAccounting = None
      if pending['Value']:
        unaccounted = pending['Value']
        self.log.info( '%s Stalled jobs will be Accounted' % ( len( unaccounted ) ) )
        for job in unaccounted:
          lastAccounting = self.__sendAccounting( job )
          if lastAccounting['OK']:
            recoverCounter += 1
          else:
            self.log.error( 'Failed to send accounting', lastAccounting['Message'] )

      # Give up on the remaining minor status when the last attempt failed
      if lastAccounting is not None and not lastAccounting['OK']:
        break

    if failedCounter:
      self.log.info( '%d jobs set to Failed' % failedCounter )
    if recoverCounter:
      self.log.info( '%d jobs properly Accounted' % recoverCounter )
    return S_OK( failedCounter )
  def checkJob( self, job, classAdJob ):
    """Decide whether the job goes to the task queue, is put on hold, or needs staging.

    The checks run in this order: reschedule delay, site requirements of the
    job, Active/Banned site mask from the JobDB, input data, optimizer
    information, and finally staging resolution.

    :param job: job identifier used for all JobDB calls
    :param classAdJob: job description object; its 'JobType' attribute is read
                       here and it is forwarded to the helper methods
    :return: S_OK() when the job is put on hold or a staging request was set;
             the result of __sendJobToTaskQueue when the job can proceed;
             S_ERROR (or a failing helper result) otherwise
    """
    self.log.verbose( 'Job %s will be processed' % ( job ) )

    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes( job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'] )
    if not result['OK']:
      self.log.error( result['Message'] )
      return S_ERROR( 'Can not get job attributes from JobDB' )
    jobDict = result['Value']
    reCounter = int( jobDict['RescheduleCounter'] )
    if reCounter != 0 :
      reTime = fromString( jobDict['RescheduleTime'] )
      delta = toEpoch() - toEpoch( reTime )
      # Use the delay configured for this reschedule count, or the maximum
      # delay once the count exceeds the configured list
      delay = self.maxRescheduleDelay
      if reCounter <= len( self.rescheduleDelaysList ):
        delay = self.rescheduleDelaysList[reCounter - 1]
      if delta < delay:
        # Still within the delay: keep the job on hold, setting the
        # application status only if it does not already say so
        if jobDict['ApplicationStatus'].find( 'On Hold: after rescheduling' ) == -1:
          result = self.jobDB.setJobStatus( job, application = 'On Hold: after rescheduling #%d' % reCounter )
        return S_OK()

    # First, get Site and BannedSites from the Job

    # NOTE: the returned dictionary is read directly, without an 'OK' check
    result = self.__getJobSiteRequirement( job, classAdJob )
    userBannedSites = result['BannedSites']
    userSites = result['Sites']

    if userSites:
      # presumably removes the sites banned by the user from the requested
      # ones -- confirm against applySiteRequirements
      userSites = applySiteRequirements( userSites, [], userBannedSites )
      if not userSites:
        msg = 'Impossible Site Requirement'
        return S_ERROR( msg )

    # Second, get the Active and Banned sites from the WMS

    wmsSites = self.jobDB.getSiteMask( 'Active' )
    wmsBannedSites = self.jobDB.getSiteMask( 'Banned' )
    if not ( wmsSites['OK'] and wmsBannedSites['OK'] ):
      # Log whichever of the two queries failed before giving up
      if not wmsSites['OK']:
        self.log.error( wmsSites['Message'] )
      if not wmsBannedSites['OK']:
        self.log.error( wmsBannedSites['Message'] )
      return S_ERROR( 'Can not get Active and Banned Sites from JobDB' )

    wmsSites = wmsSites['Value']
    wmsBannedSites = wmsBannedSites['Value']

    if userSites:
      sites = applySiteRequirements( userSites, wmsSites, wmsBannedSites )
      if not sites:
        # Put on Hold only non-excluded job types; jobs of an excluded type
        # fall through to the input data check below
        jobType = classAdJob.getAttributeString( 'JobType' )
        if not jobType in self.excludedOnHoldJobTypes:
          msg = 'On Hold: Requested site is Banned or not Active'
          self.log.info( msg )
          result = self.jobDB.setJobStatus( job, application = msg )
          return S_OK()


    # Third, check if there is input data
    result = self.jobDB.getInputData( job )
    if not result['OK']:
      self.log.warn( 'Failed to get input data from JobDB for %s' % ( job ) )
      self.log.error( result['Message'] )
      return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
      return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    # Keep only the non-empty entries of the input data list
    hasInputData = False
    inputData = []
    for lfn in result['Value']:
      if lfn:
        inputData.append( lfn )
        hasInputData = True

    if not hasInputData:
      # With no input data requirement, job can proceed directly to task queue
      self.log.verbose( 'Job %s has no input data requirement' % ( job ) )
      return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    self.log.verbose( 'Job %s has an input data requirement ' % ( job ) )

    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo( job )
    if not result['OK']:
      return result

    optInfo = result['Value']

    # Compare site candidates with current mask
    optSites = optInfo['SiteCandidates'].keys()
    self.log.info( 'Input Data Site Candidates: %s' % ( ', '.join( optSites ) ) )
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements( optSites, userSites, userBannedSites )
    if not optSites:
      msg = 'Impossible Site + InputData Requirement'
      return S_ERROR( msg )

    sites = applySiteRequirements( optSites, wmsSites, wmsBannedSites )
    if not sites:
      msg = 'On Hold: InputData Site is Banned or not Active'
      self.log.info( msg )
      result = self.jobDB.setJobStatus( job, application = msg )
      return S_OK()

    # Set stager request as necessary, optimize for smallest #files on tape if
    # more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging( job, sites, inputData, optInfo['SiteCandidates'] )
    if not checkStaging['OK']:
      return checkStaging

    # The result carries a 'SiteCandidates' entry next to 'OK' and 'Value'
    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
      return S_ERROR( 'No destination sites available' )

    stagingFlag = checkStaging['Value']
    if stagingFlag:
      # Single site candidate chosen and staging required
      self.log.verbose( 'Job %s requires staging of input data' % ( job ) )
      # set all LFN to disk for the selected site: the 'tape' entry is added
      # to the 'disk' entry and then reset to 0
      stagingSite = destinationSites[0]
      siteDict = optInfo['SiteCandidates'][stagingSite]
      siteDict['disk'] = siteDict['disk'] + siteDict['tape']
      siteDict['tape'] = 0

      optInfo['SiteCandidates'][stagingSite] = siteDict
      self.log.verbose( 'Updating %s Optimizer Info for Job %s:' % ( self.dataAgentName, job ), optInfo )
      result = self.setOptimizerJobInfo( job, self.dataAgentName, optInfo )
      if not result['OK']:
        return result

      # Site is selected for staging, report it
      self.log.verbose( 'Staging site candidate for job %s is %s' % ( job, stagingSite ) )

      # If the staging sites cannot be resolved, fall back to the selected
      # site alone
      result = self.__getStagingSites(stagingSite,destinationSites)
      if not result['OK']:
        stagingSites = [stagingSite]
      else:
        stagingSites = result['Value']

      if len( stagingSites ) == 1:
        self.jobDB.setJobAttribute( job, 'Site', stagingSite )
      else:
        # Get the name of the site group; 'Multiple' is used when the group
        # lookup fails or returns no name
        result = self.__getSiteGroup(stagingSites)
        if result['OK']:
          groupName = result['Value']
          if groupName:
            self.jobDB.setJobAttribute( job, 'Site', groupName )
          else:
            self.jobDB.setJobAttribute( job, 'Site', 'Multiple' )
        else:
          self.jobDB.setJobAttribute( job, 'Site', 'Multiple' )

      stagerDict = self.__setStagingRequest( job, stagingSite, optInfo )
      if not stagerDict['OK']:
        return stagerDict
      self.__updateOtherSites( job, stagingSite, stagerDict['Value'], optInfo )
      return S_OK()
    else:
      # No staging required, can proceed to task queue agent and then waiting status
      self.log.verbose( 'Job %s does not require staging of input data' % ( job ) )
    # Finally send job to TaskQueueAgent
    return self.__sendJobToTaskQueue( job, classAdJob, destinationSites, userBannedSites )
示例#13
0
    def _failStalledJobs(self, failedTime):
        """ Set to Failed the jobs that have been Stalled for long enough, then
        send the accounting still missing for failed stalled jobs.

        Only jobs selected with ``older=dateTime() - failedTime * second`` are
        examined. A job is failed when its pilot is not running, or when the
        time since its latest update exceeds failedTime.

        :param failedTime: limit for the time since the latest update; also
                           used to build the selection time and, formatted
                           with %d, the minor status
        :return: S_OK(number of jobs set to Failed), or the failing result of
                 a jobDB.selectJobs call
        """
        # Only get jobs that have been Stalled for long enough
        checkTime = dateTime() - failedTime * second
        selection = self.jobDB.selectJobs({'Status': JobStatus.STALLED},
                                          older=checkTime)
        if not selection['OK']:
            return selection
        stalledJobs = selection['Value']

        failedCounter = 0
        minorStalledStatuses = ("Job stalled: pilot not running",
                                'Stalling for more than %d sec' % failedTime)

        if stalledJobs:
            self.log.info(
                '%d jobs Stalled before %s will be checked for failure' %
                (len(stalledJobs), str(checkTime)))

            for job in stalledJobs:
                # Check if the job pilot is lost
                pilotReply = self.__getJobPilotStatus(job)
                if not pilotReply['OK']:
                    self.log.error('Failed to get pilot status',
                                   pilotReply['Message'])
                    continue

                minorStatus = False
                if pilotReply['Value'] != "Running":
                    minorStatus = minorStalledStatuses[0]
                else:
                    # Verify that there was no sign of life for long enough
                    updateReply = self.__getLatestUpdateTime(job)
                    if not updateReply['OK']:
                        self.log.error('Failed to get job update time',
                                       updateReply['Message'])
                        continue
                    if toEpoch() - updateReply['Value'] > failedTime:
                        minorStatus = minorStalledStatuses[1]

                if not minorStatus:
                    continue

                # Send a kill signal in case the job is not really dead,
                # set it Failed and send the accounting info
                self.__sendKillCommand(job)
                self.__updateJobStatus(job, JobStatus.FAILED, minorStatus)
                failedCounter += 1
                accountingReply = self.__sendAccounting(job)
                if not accountingReply['OK']:
                    self.log.error('Failed to send accounting',
                                   accountingReply['Message'])

        recoverCounter = 0

        for minor in minorStalledStatuses:
            pending = self.jobDB.selectJobs({
                'Status': JobStatus.FAILED,
                'MinorStatus': minor,
                'AccountedFlag': 'False'
            })
            if not pending['OK']:
                return pending

            # Outcome of the last accounting attempt for this minor status;
            # stays True when there was nothing to account
            lastSendOK = True
            if pending['Value']:
                unaccounted = pending['Value']
                self.log.info('%s Stalled jobs will be Accounted' %
                              (len(unaccounted)))
                for job in unaccounted:
                    sendReply = self.__sendAccounting(job)
                    lastSendOK = sendReply['OK']
                    if not lastSendOK:
                        self.log.error('Failed to send accounting',
                                       sendReply['Message'])
                        continue

                    recoverCounter += 1

            # Give up on the remaining minor status when the last attempt failed
            if not lastSendOK:
                break

        if failedCounter:
            self.log.info('%d jobs set to Failed' % failedCounter)
        if recoverCounter:
            self.log.info('%d jobs properly Accounted' % recoverCounter)
        return S_OK(failedCounter)
示例#14
0
    def checkJob(self, job, classAdJob):
        """This method controls the checking of the job.
    """
        self.log.verbose('Job %s will be processed' % (job))

        # Check if the job was recently rescheduled
        result = self.jobDB.getJobAttributes(
            job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
        if not result['OK']:
            self.log.error(result['Message'])
            return S_ERROR('Can not get job attributes from JobDB')
        jobDict = result['Value']
        reCounter = int(jobDict['RescheduleCounter'])
        if reCounter != 0:
            reTime = fromString(jobDict['RescheduleTime'])
            delta = toEpoch() - toEpoch(reTime)
            delay = self.maxRescheduleDelay
            if reCounter <= len(self.rescheduleDelaysList):
                delay = self.rescheduleDelaysList[reCounter - 1]
            if delta < delay:
                if jobDict['ApplicationStatus'].find(
                        'On Hold: after rescheduling') == -1:
                    result = self.jobDB.setJobStatus(
                        job,
                        application='On Hold: after rescheduling #%d' %
                        reCounter)
                return S_OK()

        # First, get Site and BannedSites from the Job

        result = self.__getJobSiteRequirement(job, classAdJob)
        userBannedSites = result['BannedSites']
        userSites = result['Sites']

        if userSites:
            userSites = applySiteRequirements(userSites, [], userBannedSites)
            if not userSites:
                msg = 'Impossible Site Requirement'
                return S_ERROR(msg)

        # Second, get the Active and Banned sites from the WMS

        wmsSites = self.jobDB.getSiteMask('Active')
        wmsBannedSites = self.jobDB.getSiteMask('Banned')
        if not (wmsSites['OK'] and wmsBannedSites['OK']):
            if not wmsSites['OK']:
                self.log.error(wmsSites['Message'])
            if not wmsBannedSites['OK']:
                self.log.error(wmsBannedSites['Message'])
            return S_ERROR('Can not get Active and Banned Sites from JobDB')

        wmsSites = wmsSites['Value']
        wmsBannedSites = wmsBannedSites['Value']

        if userSites:
            sites = applySiteRequirements(userSites, wmsSites, wmsBannedSites)
            if not sites:
                # Put on Hold only non-excluded job types
                jobType = classAdJob.getAttributeString('JobType')
                if not jobType in self.excludedOnHoldJobTypes:
                    msg = 'On Hold: Requested site is Banned or not Active'
                    self.log.info(msg)
                    result = self.jobDB.setJobStatus(job, application=msg)
                    return S_OK()

        # Third, check if there is input data
        result = self.jobDB.getInputData(job)
        if not result['OK']:
            self.log.warn('Failed to get input data from JobDB for %s' % (job))
            self.log.error(result['Message'])
            return S_ERROR('Failed to get input data from JobDB')

        if not result['Value']:
            return self.__sendJobToTaskQueue(job, classAdJob, userSites,
                                             userBannedSites)

        hasInputData = False
        inputData = []
        for lfn in result['Value']:
            if lfn:
                inputData.append(lfn)
                hasInputData = True

        if not hasInputData:
            #With no input data requirement, job can proceed directly to task queue
            self.log.verbose('Job %s has no input data requirement' % (job))
            return self.__sendJobToTaskQueue(job, classAdJob, userSites,
                                             userBannedSites)

        self.log.verbose('Job %s has an input data requirement ' % (job))

        # Fourth, Check all optimizer information
        result = self.__checkOptimizerInfo(job)
        if not result['OK']:
            return result

        optInfo = result['Value']

        #Compare site candidates with current mask
        optSites = optInfo['SiteCandidates'].keys()
        self.log.info('Input Data Site Candidates: %s' % (', '.join(optSites)))
        # Check that it is compatible with user requirements
        optSites = applySiteRequirements(optSites, userSites, userBannedSites)
        if not optSites:
            msg = 'Impossible Site + InputData Requirement'
            return S_ERROR(msg)

        sites = applySiteRequirements(optSites, wmsSites, wmsBannedSites)
        if not sites:
            msg = 'On Hold: InputData Site is Banned or not Active'
            self.log.info(msg)
            result = self.jobDB.setJobStatus(job, application=msg)
            return S_OK()

        #Set stager request as necessary, optimize for smallest #files on tape if
        #more than one site candidate left at this point
        checkStaging = self.__resolveSitesForStaging(job, sites, inputData,
                                                     optInfo['SiteCandidates'])
        if not checkStaging['OK']:
            return checkStaging

        destinationSites = checkStaging['SiteCandidates']
        if not destinationSites:
            return S_ERROR('No destination sites available')

        stagingFlag = checkStaging['Value']
        if stagingFlag:
            #Single site candidate chosen and staging required
            self.log.verbose('Job %s requires staging of input data' % (job))
            # set all LFN to disk for the selected site
            stagingSite = destinationSites[0]
            siteDict = optInfo['SiteCandidates'][stagingSite]
            siteDict['disk'] = siteDict['disk'] + siteDict['tape']
            siteDict['tape'] = 0

            optInfo['SiteCandidates'][stagingSite] = siteDict
            self.log.verbose(
                'Updating %s Optimizer Info for Job %s:' %
                (self.dataAgentName, job), optInfo)
            result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
            if not result['OK']:
                return result

            # Site is selected for staging, report it
            self.log.verbose('Staging site candidate for job %s is %s' %
                             (job, stagingSite))

            result = self.__getStagingSites(stagingSite, destinationSites)
            if not result['OK']:
                stagingSites = [stagingSite]
            else:
                stagingSites = result['Value']

            if len(stagingSites) == 1:
                self.jobDB.setJobAttribute(job, 'Site', stagingSite)
            else:
                # Get the name of the site group
                result = self.__getSiteGroup(stagingSites)
                if result['OK']:
                    groupName = result['Value']
                    if groupName:
                        self.jobDB.setJobAttribute(job, 'Site', groupName)
                    else:
                        self.jobDB.setJobAttribute(job, 'Site', 'Multiple')
                else:
                    self.jobDB.setJobAttribute(job, 'Site', 'Multiple')

            stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
            if not stagerDict['OK']:
                return stagerDict
            self.__updateOtherSites(job, stagingSite, stagerDict['Value'],
                                    optInfo)
            return S_OK()
        else:
            #No staging required, can proceed to task queue agent and then waiting status
            self.log.verbose('Job %s does not require staging of input data' %
                             (job))
        #Finally send job to TaskQueueAgent
        return self.__sendJobToTaskQueue(job, classAdJob, destinationSites,
                                         userBannedSites)
示例#15
0
    def checkJob(self, job, classAdJob):
        """Run the scheduling checks for one job and route it accordingly.

        The job is held while a reschedule delay is pending or while its sites
        are banned/inactive, gets a staging request when its input data
        requires one, and is otherwise handed to the task queue.

        :param job: job identifier used for the JobDB calls and the log messages
        :param classAdJob: job description, queried for ``JobType`` and
                           forwarded to the site-requirement and task-queue helpers
        :return: S_OK() when the job is put on hold or staging was requested,
                 an S_ERROR / failed result when a check fails, otherwise the
                 result of ``__sendJobToTaskQueue``
        """
        self.log.verbose("Job %s will be processed" % (job))

        # Hold the job while the delay after its latest reschedule has not elapsed
        attrResult = self.jobDB.getJobAttributes(job, ["RescheduleCounter", "RescheduleTime", "ApplicationStatus"])
        if not attrResult["OK"]:
            self.log.error(attrResult["Message"])
            return S_ERROR("Can not get job attributes from JobDB")
        attributes = attrResult["Value"]
        rescheduleCount = int(attributes["RescheduleCounter"])
        if rescheduleCount != 0:
            rescheduledAt = fromString(attributes["RescheduleTime"])
            elapsed = toEpoch() - toEpoch(rescheduledAt)
            requiredDelay = self.maxRescheduleDelay
            if rescheduleCount <= len(self.rescheduleDelaysList):
                # The N-th reschedule uses the N-th configured delay
                requiredDelay = self.rescheduleDelaysList[rescheduleCount - 1]
            if elapsed < requiredDelay:
                # Only update the application status if it does not say so already
                if "On Hold: after rescheduling" not in attributes["ApplicationStatus"]:
                    self.jobDB.setJobStatus(job, application="On Hold: after rescheduling #%d" % rescheduleCount)
                return S_OK()

        # Step 1: sites requested and banned by the job itself
        siteRequirement = self.__getJobSiteRequirement(job, classAdJob)
        userBannedSites = siteRequirement["BannedSites"]
        userSites = siteRequirement["Sites"]

        if userSites:
            userSites = applySiteRequirements(userSites, [], userBannedSites)
            if not userSites:
                return S_ERROR("Impossible Site Requirement")

        # Step 2: Active and Banned site masks known to the WMS
        activeMask = self.jobDB.getSiteMask("Active")
        bannedMask = self.jobDB.getSiteMask("Banned")
        if not activeMask["OK"] or not bannedMask["OK"]:
            for mask in (activeMask, bannedMask):
                if not mask["OK"]:
                    self.log.error(mask["Message"])
            return S_ERROR("Can not get Active and Banned Sites from JobDB")

        wmsSites = activeMask["Value"]
        wmsBannedSites = bannedMask["Value"]

        if userSites and not applySiteRequirements(userSites, wmsSites, wmsBannedSites):
            # Only job types that are not excluded are put on hold
            jobType = classAdJob.getAttributeString("JobType")
            if jobType not in self.excludedOnHoldJobTypes:
                holdMessage = "On Hold: Requested site is Banned or not Active"
                self.log.info(holdMessage)
                self.jobDB.setJobStatus(job, application=holdMessage)
                return S_OK()

        # Step 3: input data of the job
        dataResult = self.jobDB.getInputData(job)
        if not dataResult["OK"]:
            self.log.warn("Failed to get input data from JobDB for %s" % (job))
            self.log.error(dataResult["Message"])
            return S_ERROR("Failed to get input data from JobDB")

        if not dataResult["Value"]:
            return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

        # Empty entries do not count as input data
        inputData = [lfn for lfn in dataResult["Value"] if lfn]
        if not inputData:
            # Without input data the job can go directly to the task queue
            self.log.verbose("Job %s has no input data requirement" % (job))
            return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

        self.log.verbose("Job %s has an input data requirement " % (job))

        # Step 4: optimizer information
        infoResult = self.__checkOptimizerInfo(job)
        if not infoResult["OK"]:
            return infoResult

        optInfo = infoResult["Value"]
        candidates = optInfo["SiteCandidates"]

        # Site candidates must be compatible with the user requirements ...
        optSites = candidates.keys()
        self.log.info("Input Data Site Candidates: %s" % (", ".join(optSites)))
        optSites = applySiteRequirements(optSites, userSites, userBannedSites)
        if not optSites:
            return S_ERROR("Impossible Site + InputData Requirement")

        # ... and with the current WMS site mask
        sites = applySiteRequirements(optSites, wmsSites, wmsBannedSites)
        if not sites:
            holdMessage = "On Hold: InputData Site is Banned or not Active"
            self.log.info(holdMessage)
            self.jobDB.setJobStatus(job, application=holdMessage)
            return S_OK()

        # Set stager request as necessary, optimize for smallest #files on tape if
        # more than one site candidate left at this point
        checkStaging = self.__resolveSitesForStaging(job, sites, inputData, candidates)
        if not checkStaging["OK"]:
            return checkStaging

        destinationSites = checkStaging["SiteCandidates"]
        if not destinationSites:
            return S_ERROR("No destination sites available")

        if not checkStaging["Value"]:
            # No staging required: proceed to the task queue and then waiting status
            self.log.verbose("Job %s does not require staging of input data" % (job))
            return self.__sendJobToTaskQueue(job, classAdJob, destinationSites, userBannedSites)

        # Staging required: the first destination site is the staging site
        self.log.verbose("Job %s requires staging of input data" % (job))
        stagingSite = destinationSites[0]

        # Record all LFNs as being on disk for the selected site
        siteDict = candidates[stagingSite]
        siteDict["disk"] = siteDict["disk"] + siteDict["tape"]
        siteDict["tape"] = 0
        candidates[stagingSite] = siteDict

        stored = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
        if not stored["OK"]:
            return stored

        # Report the site selected for staging
        self.log.verbose("Staging site candidate for job %s is %s" % (job, stagingSite))
        siteLabel = stagingSite if len(destinationSites) == 1 else "Multiple"
        self.jobDB.setJobAttribute(job, "Site", siteLabel)

        stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
        if not stagerDict["OK"]:
            return stagerDict
        self.__updateOtherSites(job, stagingSite, stagerDict["Value"], optInfo)
        return S_OK()
示例#16
0
  def optimizeJob( self, jid, jobState ):
    """ Decide whether a job is held, has to be staged or is sent to the task queue

        1. Jobs rescheduled recently are held until the reschedule delay has elapsed.
        2. Site requirements are resolved; user banned sites are removed from the destination list.
        3. Get input files; jobs without input data are sent directly to TQ.
        4. Production jobs are sent directly to TQ, unless some of their files are offline,
           in which case staging is requested.
        5. For the other jobs, check if staging is necessary at the sites holding the input data.

        :param jid: job ID (not used in this method)
        :param jobState: job state object used to read the job attributes and input data
        :return: S_OK / S_ERROR structure
    """
    # Reschedule delay
    result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
    if not result[ 'OK' ]:
      return result
    attDict = result[ 'Value' ]
    try:
      reschedules = int( attDict[ 'RescheduleCounter' ] )
    except ( ValueError, KeyError, TypeError ):
      # TypeError covers a counter that is present but None
      return S_ERROR( "RescheduleCounter has to be an integer" )
    if reschedules != 0:
      delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
      # NOTE(review): the index is the raw counter, so delays[0] is not used for the
      # first reschedule - confirm whether "reschedules - 1" was intended
      delay = delays[ min( reschedules, len( delays ) - 1 ) ]
      waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
      if waited < delay:
        return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )

    # Get site requirements
    result = self.__getSitesRequired( jobState )
    if not result[ 'OK' ]:
      return result
    userSites, userBannedSites = result[ 'Value' ]

    # Get job type
    result = jobState.getAttribute( "JobType" )
    if not result[ 'OK' ]:
      return S_ERROR( "Could not retrieve job type" )
    jobType = result[ 'Value' ]

    # Get banned sites from DIRAC
    result = self.__jobDB.getSiteMask( 'Banned' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve banned sites from JobDB" )
    wmsBannedSites = result[ 'Value' ]

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
      if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
        result = self.__jobDB.getUserSitesTuple( userSites )
        if not result[ 'OK' ]:
          return S_ERROR( "Problem checking userSites for tuple of active/banned/invalid sites" )

        userSites, bannedSites, invalidSites = result['Value']
        if invalidSites:
          self.jobLog.debug( "Invalid site(s) requested: %s" % ','.join( invalidSites ) )
          if not self.ex_getOption( 'AllowInvalidSites', True ):
            return self.__holdJob( jobState, "Requested site(s) %s are invalid" % ",".join( invalidSites ) )
        if bannedSites:
          self.jobLog.debug( "Banned site(s) %s ignored" % ",".join( bannedSites ) )
          if not userSites:
            return self.__holdJob( jobState, "Requested site(s) %s are inactive" % ",".join( bannedSites ) )

        if not userSites:
          return self.__holdJob( jobState, "No requested site(s) are active/valid" )
        userSites = list(userSites)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
      return S_ERROR( "Failed to get input data from JobDB" )

    if not result['Value']:
      # No input data? Just send to TQ
      return self.__sendToTQ( jobState, userSites, userBannedSites )

    self.jobLog.verbose( "Has an input data requirement" )
    inputData = result[ 'Value' ]

    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    if jobType in Operations().getValue( 'Transformations/DataProcessing', [] ):
      self.jobLog.info( "Production job: sending to TQ, but first checking if staging is requested" )

      userName = jobState.getAttribute( 'Owner' )
      if not userName[ 'OK' ]:
        return userName
      userName = userName['Value']

      userGroup = jobState.getAttribute( 'OwnerGroup' )
      if not userGroup[ 'OK' ]:
        return userGroup
      userGroup = userGroup['Value']

      res = getFilesToStage( inputData, proxyUserName = userName, proxyUserGroup = userGroup ) #pylint: disable=unexpected-keyword-arg

      if not res['OK']:
        return self.__holdJob( jobState, res['Message'] )
      stageLFNs = res['Value']['offlineLFNs']
      if stageLFNs:
        res = self.__checkStageAllowed( jobState )
        if not res['OK']:
          return res
        if not res['Value']:
          return S_ERROR( "Stage not allowed" )
        # Do not report success when the staging request could not be made,
        # same as for the staging request of user jobs further down
        result = self.__requestStaging( jobState, stageLFNs )
        if not result[ 'OK' ]:
          return result
        return S_OK()
      else:
        return self.__sendToTQ( jobState, userSites, userBannedSites )

    # From now on we know it's a user job with input data

    idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
    result = self.retrieveOptimizerParam( idAgent )
    if not result['OK']:
      self.jobLog.error( "Could not retrieve input data info", result[ 'Message' ] )
      return S_ERROR( "Could not retrieve input data info" )
    opData = result[ 'Value' ]

    if 'SiteCandidates' not in opData:
      return S_ERROR( "No possible site candidates" )

    # Filter input data sites with user requirement
    siteCandidates = list( opData[ 'SiteCandidates' ] )
    self.jobLog.info( "Site candidates are %s" % siteCandidates )

    if userSites:
      siteCandidates = list( set( siteCandidates ) & set( userSites ) )

    siteCandidates = self._applySiteFilter( siteCandidates, banned = userBannedSites )
    if not siteCandidates:
      return S_ERROR( "Impossible InputData * Site requirements" )

    idSites = {}
    for site in siteCandidates:
      idSites[ site ] = opData[ 'SiteCandidates' ][ site ]

    # Check if sites have correct count of disk+tape replicas
    numData = len( inputData )
    errorSites = set()
    for site in idSites:
      if numData != idSites[ site ][ 'disk' ] + idSites[ site ][ 'tape' ]:
        self.jobLog.error( "Site candidate %s does not have all the input data" % site )
        errorSites.add( site )
    # Removed in a second loop: idSites cannot change size while being iterated
    for site in errorSites:
      idSites.pop( site )
    if not idSites:
      return S_ERROR( "Site candidates do not have all the input data" )

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
    if not siteCandidates:
      return S_ERROR( "No destination sites available" )

    # Is any site active?
    stageSites = self._applySiteFilter( siteCandidates, banned = wmsBannedSites )
    if not stageSites:
      return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )

    # If no staging is required send to TQ
    if not stageRequired:
      # Use siteCandidates and not stageSites because active and banned sites
      # will be taken into account on matching time
      return self.__sendToTQ( jobState, siteCandidates, userBannedSites )

    # Check if the user is allowed to stage
    if self.ex_getOption( "RestrictDataStage", False ):
      res = self.__checkStageAllowed( jobState )
      if not res['OK']:
        return res
      if not res['Value']:
        return S_ERROR( "Stage not allowed" )

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
    stageData = idSites[ stageSite ]
    # Set as if everything has already been staged
    stageData[ 'disk' ] += stageData[ 'tape' ]
    stageData[ 'tape' ] = 0
    # Set the site info back to the original dict to save afterwards
    opData[ 'SiteCandidates' ][ stageSite ] = stageData

    stageRequest = self.__preRequestStaging( jobState, stageSite, opData )
    if not stageRequest['OK']:
      return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging( jobState, stageLFNs )
    if not result[ 'OK' ]:
      return result
    stageLFNs = result[ 'Value' ]
    self.__updateSharedSESites( jobState, stageSite, stageLFNs, opData )
    # Save the optimizer data again
    self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
    result = self.storeOptimizerParam( idAgent, opData )
    if not result[ 'OK' ]:
      return result

    return self.__setJobSite( jobState, stageSites )
示例#17
0
    def optimizeJob(self, jid, jobState):
        """Decide whether a job is held, has to be staged or is sent to the task queue.

        1. Jobs rescheduled recently are held until the reschedule delay has elapsed.
        2. If the user requested sites and none of them passes the active/banned
           site masks, the job is held (unless its type is excluded from holding).
        3. Jobs without input data are sent directly to the task queue.
        4. Otherwise the site candidates holding all the input data are selected
           and staging is requested when required.

        :param jid: job ID (not used in this method)
        :param jobState: job state object used to read the job attributes and input data
        :return: S_OK / S_ERROR structure
        """
        # Reschedule delay
        result = jobState.getAttributes(
            ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
        if not result['OK']:
            return result
        attDict = result['Value']
        try:
            reschedules = int(attDict['RescheduleCounter'])
        except (ValueError, KeyError, TypeError):
            # A missing counter (KeyError) or a None value (TypeError) is
            # reported the same way as a non-numeric one
            return S_ERROR("RescheduleCounter has to be an integer")
        if reschedules != 0:
            delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
            # NOTE(review): the index is the raw counter, so delays[0] is not
            # used for the first reschedule - confirm whether
            # "reschedules - 1" was intended
            delay = delays[min(reschedules, len(delays) - 1)]
            waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
            if waited < delay:
                return self.__holdJob(
                    jobState, 'On Hold: after rescheduling %s' % reschedules,
                    delay)

        # Get site requirements
        result = self._getSitesRequired(jobState)
        if not result['OK']:
            return result
        userSites, userBannedSites = result['Value']

        # Get active and banned sites from DIRAC
        result = self.__jobDB.getSiteMask('Active')
        if not result['OK']:
            return S_ERROR("Cannot retrieve active sites from JobDB")
        wmsActiveSites = result['Value']
        result = self.__jobDB.getSiteMask('Banned')
        if not result['OK']:
            return S_ERROR("Cannot retrieve banned sites from JobDB")
        wmsBannedSites = result['Value']

        # If the user has selected any site, filter them and hold the job if not able to run
        if userSites:
            result = jobState.getAttribute("JobType")
            if not result['OK']:
                return S_ERROR("Could not retrieve job type")
            jobType = result['Value']
            if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
                sites = self._applySiteFilter(userSites, wmsActiveSites,
                                              wmsBannedSites)
                if not sites:
                    return self.__holdJob(
                        jobState, "Sites %s are inactive or banned" %
                        ", ".join(userSites))

        # Check if there is input data
        result = jobState.getInputData()
        if not result['OK']:
            self.jobLog.error("Cannot get input data %s" % (result['Message']))
            return S_ERROR('Failed to get input data from JobDB')

        if not result['Value']:
            # No input data? Generate requirements and next
            return self.__sendToTQ(jobState, userSites, userBannedSites)

        inputData = result['Value']

        self.jobLog.verbose('Has an input data requirement')
        idAgent = self.ex_getOption('InputDataAgent', 'InputData')
        result = self.retrieveOptimizerParam(idAgent)
        if not result['OK']:
            self.jobLog.error("Could not retrieve input data info: %s" %
                              result['Message'])
            return S_ERROR("File Catalog Access Failure")
        opData = result['Value']
        if 'SiteCandidates' not in opData:
            return S_ERROR("No possible site candidates")

        # Filter input data sites with user requirement
        siteCandidates = list(opData['SiteCandidates'])
        self.jobLog.info("Site candidates are %s" % siteCandidates)

        siteCandidates = self._applySiteFilter(siteCandidates, userSites,
                                               userBannedSites)
        if not siteCandidates:
            return S_ERROR("Impossible InputData * Site requirements")

        idSites = {}
        for site in siteCandidates:
            idSites[site] = opData['SiteCandidates'][site]

        # Check if sites have correct count of disk+tape replicas
        numData = len(inputData)
        errorSites = set()
        for site in idSites:
            if numData != idSites[site]['disk'] + idSites[site]['tape']:
                self.jobLog.error(
                    "Site candidate %s does not have all the input data" %
                    site)
                errorSites.add(site)
        # Removed in a second loop: idSites cannot change size while being iterated
        for site in errorSites:
            idSites.pop(site)
        if not idSites:
            return S_ERROR("Site candidates do not have all the input data")

        # Check if staging is required
        stageRequired, siteCandidates = self.__resolveStaging(
            jobState, inputData, idSites)
        if not siteCandidates:
            return S_ERROR("No destination sites available")

        # Is any site active?
        stageSites = self._applySiteFilter(siteCandidates, wmsActiveSites,
                                           wmsBannedSites)
        if not stageSites:
            return self.__holdJob(
                jobState,
                "Sites %s are inactive or banned" % ", ".join(siteCandidates))

        # If no staging is required send to TQ
        if not stageRequired:
            # Use siteCandidates and not stageSites because active and banned sites
            # will be taken into account on matching time
            return self.__sendToTQ(jobState, siteCandidates, userBannedSites)

        # Check if the user is allowed to stage
        if self.ex_getOption("RestrictDataStage", False):
            # NOTE(review): this relies on __checkStageAllowed returning a plain
            # boolean; if it returns an S_OK/S_ERROR dictionary the test below
            # can never be true - confirm against the helper
            if not self.__checkStageAllowed(jobState):
                return S_ERROR("Stage not allowed")

        # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
        stageSite = stageSites[0]
        self.jobLog.verbose(" Staging site will be %s" % (stageSite))
        stageData = idSites[stageSite]
        # Set as if everything has already been staged
        stageData['disk'] += stageData['tape']
        stageData['tape'] = 0
        # Set the site info back to the original dict to save afterwards
        opData['SiteCandidates'][stageSite] = stageData

        result = self.__requestStaging(jobState, stageSite, opData)
        if not result['OK']:
            return result
        stageLFNs = result['Value']
        self._updateSharedSESites(stageSite, stageLFNs, opData)
        # Save the optimizer data again
        self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
        result = self.storeOptimizerParam(idAgent, opData)
        if not result['OK']:
            return result

        return self._setJobSite(jobState, stageSites)
示例#18
0
    def __failStalledJobs(self, failedTime):
        """ Changes the Stalled status to Failed for jobs long in the Stalled status

        A Stalled job is set to Failed when its pilot status is not "Running",
        or when its latest update is more than ``failedTime`` seconds old.
        Accounting is sent for every job failed here and, in a second pass,
        for Failed jobs with the same minor statuses whose AccountedFlag is
        still 'False'.

        :param failedTime: maximum number of seconds since the latest update
        :return: S_OK(number of jobs set to Failed), or the failed result of a
                 JobDB job selection
        """

        result = self.jobDB.selectJobs({'Status': 'Stalled'})
        if not result['OK']:
            return result

        failedCounter = 0

        if result['Value']:
            jobs = result['Value']
            self.log.info('%s Stalled jobs will be checked for failure' %
                          (len(jobs)))

            for job in jobs:

                # Check if the job pilot is lost
                result = self.__getJobPilotStatus(job)
                if result['OK']:
                    pilotStatus = result['Value']
                    if pilotStatus != "Running":
                        self.__updateJobStatus(
                            job, 'Failed', "Job stalled: pilot not running")
                        failedCounter += 1
                        result = self.__sendAccounting(job)
                        if not result['OK']:
                            self.log.error(result['Message'])
                            # Stop checking jobs; the pass below retries the
                            # accounting of jobs that were left unaccounted
                            break
                        continue

                # Pilot running or pilot status unknown: look at the update time
                result = self.__getLatestUpdateTime(job)
                if not result['OK']:
                    # One job without a usable update time must not prevent
                    # the remaining jobs and the accounting pass from running
                    self.log.error(
                        'Failed to get update time for job %s: %s' %
                        (job, result['Message']))
                    continue
                currentTime = toEpoch()
                lastUpdate = result['Value']
                elapsedTime = currentTime - lastUpdate
                if elapsedTime > failedTime:
                    self.__updateJobStatus(
                        job, 'Failed',
                        'Stalling for more than %d sec' % failedTime)
                    failedCounter += 1
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error(result['Message'])
                        break

        recoverCounter = 0

        # Send accounting for the jobs failed with one of the statuses set above
        # that are still flagged as not accounted
        for minor in [
                "Job stalled: pilot not running",
                'Stalling for more than %d sec' % failedTime
        ]:
            result = self.jobDB.selectJobs({
                'Status': 'Failed',
                'MinorStatus': minor,
                'AccountedFlag': 'False'
            })
            if not result['OK']:
                return result
            if result['Value']:
                jobs = result['Value']
                self.log.info('%s Stalled jobs will be Accounted' %
                              (len(jobs)))
                for job in jobs:
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error(result['Message'])
                        continue

                    recoverCounter += 1
            # result is now the last accounting attempt (or the selection when
            # no job was found); stop if that last attempt failed
            if not result['OK']:
                break

        if failedCounter:
            self.log.info('%d jobs set to Failed' % failedCounter)
        if recoverCounter:
            self.log.info('%d jobs properly Accounted' % recoverCounter)
        return S_OK(failedCounter)
示例#19
0
    def __failStalledJobs(self, failedTime):
        """ Set to Failed the Stalled jobs whose pilot is lost or that stalled too long

        A Stalled job is failed when its pilot status is not "Running", or when
        its latest update is more than ``failedTime`` seconds old. Each failed
        job is sent a kill signal and its accounting is sent. A second pass
        sends the accounting of Failed jobs with the same minor statuses whose
        AccountedFlag is still 'False'.

        :param failedTime: maximum number of seconds since the latest update
        :return: S_OK(number of jobs set to Failed), or the failed result of a
                 JobDB job selection
        """
        selection = self.jobDB.selectJobs({'Status': 'Stalled'})
        if not selection['OK']:
            return selection
        stalledJobs = selection['Value']

        failedCounter = 0
        pilotLostStatus = "Job stalled: pilot not running"
        stalledTooLongStatus = 'Stalling for more than %d sec' % failedTime

        if stalledJobs:
            self.log.info('%s Stalled jobs will be checked for failure' %
                          (len(stalledJobs)))

            for job in stalledJobs:
                # Jobs whose pilot status cannot be obtained are skipped
                pilotResult = self.__getJobPilotStatus(job)
                if not pilotResult['OK']:
                    self.log.error('Failed to get pilot status',
                                   pilotResult['Message'])
                    continue

                failureReason = None
                if pilotResult['Value'] != "Running":
                    # The job pilot is lost
                    failureReason = pilotLostStatus
                else:
                    updateResult = self.__getLatestUpdateTime(job)
                    if not updateResult['OK']:
                        self.log.error('Failed to get job update time',
                                       updateResult['Message'])
                        continue
                    if toEpoch() - updateResult['Value'] > failedTime:
                        failureReason = stalledTooLongStatus

                if not failureReason:
                    continue

                # Kill the job in case it is not really dead, so that it cannot
                # continue running, then set it Failed and send the accounting
                WMSClient().killJob(job)
                self.__updateJobStatus(job, 'Failed', failureReason)
                failedCounter += 1
                accounting = self.__sendAccounting(job)
                if not accounting['OK']:
                    self.log.error('Failed to send accounting',
                                   accounting['Message'])

        recoverCounter = 0

        for minor in (pilotLostStatus, stalledTooLongStatus):
            pending = self.jobDB.selectJobs({
                'Status': 'Failed',
                'MinorStatus': minor,
                'AccountedFlag': 'False'
            })
            if not pending['OK']:
                return pending

            # Outcome of the latest call made for this minor status
            lastOutcome = pending
            unaccountedJobs = pending['Value']
            if unaccountedJobs:
                self.log.info('%s Stalled jobs will be Accounted' %
                              (len(unaccountedJobs)))
                for job in unaccountedJobs:
                    lastOutcome = self.__sendAccounting(job)
                    if lastOutcome['OK']:
                        recoverCounter += 1
                    else:
                        self.log.error('Failed to send accounting',
                                       lastOutcome['Message'])
            # Stop when the last accounting attempt failed
            if not lastOutcome['OK']:
                break

        if failedCounter:
            self.log.info('%d jobs set to Failed' % failedCounter)
        if recoverCounter:
            self.log.info('%d jobs properly Accounted' % recoverCounter)
        return S_OK(failedCounter)
示例#20
0
  def optimizeJob( self, jid, jobState ):
    #Reschedule delay
    result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
    if not result[ 'OK' ]:
      return result
    attDict = result[ 'Value' ]
    try:
      reschedules = int( attDict[ 'RescheduleCounter' ] )
    except ValueError:
      return S_ERROR( "RescheduleCounter has to be an integer" )
    if reschedules != 0:
      delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
      delay = delays[ min( reschedules, len( delays ) - 1 ) ]
      waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
      if waited < delay:
        return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )

    #Get site requirements
    result = self.__getSitesRequired( jobState )
    if not result[ 'OK' ]:
      return result
    userSites, userBannedSites = result[ 'Value' ]

    #Get active and banned sites from DIRAC
    result = self.__jobDB.getSiteMask( 'Active' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve active sites from JobDB" )
    wmsActiveSites = result[ 'Value' ]
    result = self.__jobDB.getSiteMask( 'Banned' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve banned sites from JobDB" )
    wmsBannedSites = result[ 'Value' ]

    #If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
      result = jobState.getAttribute( "JobType" )
      if not result[ 'OK' ]:
        return S_ERROR( "Could not retrieve job type" )
      jobType = result[ 'Value' ]
      if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
        sites = self.__applySiteFilter( userSites, wmsActiveSites, wmsBannedSites )
        if not sites:
          if len( userSites ) > 1:
            return self.__holdJob( jobState, "Requested sites %s are inactive" % ",".join( userSites ) )
          else:
            return self.__holdJob( jobState, "Requested site %s is inactive" % userSites[0] )

    #Get the Input data
    # Third, check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
      return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
      #No input data? Generate requirements and next
      return self.__sendToTQ( jobState, userSites, userBannedSites )

    inputData = result[ 'Value' ]

    self.jobLog.verbose( 'Has an input data requirement' )
    idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
    result = self.retrieveOptimizerParam( idAgent )
    if not result['OK']:
      self.jobLog.error( "Could not retrieve input data info: %s" % result[ 'Message' ] )
      return S_ERROR( "File Catalog Access Failure" )
    opData = result[ 'Value' ]
    if 'SiteCandidates' not in opData:
      return S_ERROR( "No possible site candidates" )

    #Filter input data sites with user requirement
    siteCandidates = list( opData[ 'SiteCandidates' ] )
    self.jobLog.info( "Site candidates are %s" % siteCandidates )

    siteCandidates = self.__applySiteFilter( siteCandidates, userSites, userBannedSites )
    if not siteCandidates:
      return S_ERROR( "Impossible InputData * Site requirements" )

    idSites = {}
    for site in siteCandidates:
      idSites[ site ] = opData[ 'SiteCandidates' ][ site ]

    #Check if sites have correct count of disk+tape replicas
    numData = len( inputData )
    errorSites = set()
    for site in idSites:
      if numData != idSites[ site ][ 'disk' ] + idSites[ site ][ 'tape' ]:
        self.jobLog.error( "Site candidate %s does not have all the input data" % site )
        errorSites.add( site )
    for site in errorSites:
      idSites.pop( site )
    if not idSites:
      return S_ERROR( "Site candidates do not have all the input data" )

    #Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
    if not siteCandidates:
      return S_ERROR( "No destination sites available" )

    #Is any site active?
    stageSites = self.__applySiteFilter( siteCandidates, wmsActiveSites, wmsBannedSites )
    if not stageSites:
      return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )

    #If no staging is required send to TQ
    if not stageRequired:
      #Use siteCandidates and not stageSites because active and banned sites
      #will be taken into account on matching time
      return self.__sendToTQ( jobState, siteCandidates, userBannedSites )

    #Check if the user is allowed to stage
    if self.ex_getOption( "RestrictDataStage", False ):
      if not self.__checkStageAllowed( jobState ):
        return S_ERROR( "Stage not allowed" )

    #Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
    stageData = idSites[ stageSite ]
    #Set as if everything has already been staged
    stageData[ 'disk' ] += stageData[ 'tape' ]
    stageData[ 'tape' ] = 0
    #Set the site info back to the original dict to save afterwards
    opData[ 'SiteCandidates' ][ stageSite ] = stageData

    result = self.__requestStaging( jobState, stageSite, opData )
    if not result[ 'OK' ]:
      return result
    stageLFNs = result[ 'Value' ]
    self.__updateSharedSESites( jobState, stageSite, stageLFNs, opData )
    #Save the optimizer data again
    self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
    result = self.storeOptimizerParam( idAgent, opData )
    if not result[ 'OK' ]:
      return result

    return self.__setJobSite( jobState, stageSites )
示例#21
0
  def optimizeJob(self, jid, jobState):
    """ Decide whether a job is held, sent for staging or sent to the task queue (TQ).

        Steps, in order:

        1. Hold the job if it was rescheduled and the configured delay has not elapsed yet.
        2. Resolve the sites requested by the user against the usable and banned sites
           (skipped for job types listed in the ``ExcludedOnHoldJobTypes`` option).
        3. If the ``CheckPlatform`` option is set, validate the requested platform and
           filter the user sites by it.
        4. Jobs without input data are sent straight to the TQ.
        5. Job types listed in the ``Transformations/DataProcessing`` Operations value are
           sent to the TQ, unless some input files are offline, in which case staging is requested.
        6. Any other job with input data is matched against the site candidates retrieved
           for the optimizer named by the ``InputDataAgent`` option; staging is requested
           when it is required.

        :param jid: job identifier (not used in this method)
        :param jobState: job state object giving access to the job attributes, manifest and input data
        :return: S_OK/S_ERROR structure
    """
    # Reschedule delay
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
      return result
    attDict = result['Value']
    try:
      reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, KeyError):
      return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
      delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
      # The last configured delay applies to every reschedule count beyond the end of the list
      delay = delays[min(reschedules, len(delays) - 1)]
      waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
      if waited < delay:
        return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % reschedules, delay)

    # Get the job manifest for the later checks
    result = jobState.getManifest()
    if not result['OK']:
      return S_ERROR("Could not retrieve job manifest: %s" % result['Message'])
    jobManifest = result['Value']

    # Get site requirements
    result = self.__getSitesRequired(jobManifest)
    if not result['OK']:
      return result
    userSites, userBannedSites = result['Value']

    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
      return S_ERROR("Could not retrieve job type")
    jobType = result['Value']

    # Get banned sites from DIRAC
    result = self.siteClient.getSites('Banned')
    if not result['OK']:
      return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
      if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):

        result = self.siteClient.getUsableSites(userSites)
        if not result['OK']:
          return S_ERROR("Problem checking userSites for tuple of active/banned/invalid sites")
        usableSites = set(result['Value'])
        # Classify the requested sites that cannot be used: banned ones first,
        # anything else that is not usable is considered invalid
        bannedSites = []
        invalidSites = []
        for site in userSites:
          if site in wmsBannedSites:
            bannedSites.append(site)
          elif site not in usableSites:
            invalidSites.append(site)

        if invalidSites:
          self.jobLog.debug("Invalid site(s) requested: %s" % ','.join(invalidSites))
          if not self.ex_getOption('AllowInvalidSites', True):
            return self.__holdJob(jobState, "Requested site(s) %s are invalid" % ",".join(invalidSites))
        if bannedSites:
          self.jobLog.debug("Banned site(s) %s ignored" % ",".join(bannedSites))
          if not usableSites:
            return self.__holdJob(jobState, "Requested site(s) %s are inactive" % ",".join(bannedSites))

        if not usableSites:
          return self.__holdJob(jobState, "No requested site(s) are active/valid")
        userSites = list(usableSites)

    checkPlatform = self.ex_getOption('CheckPlatform', False)
    jobPlatform = jobManifest.getOption("Platform", None)
    # First check that the platform is valid (in OSCompatibility list)
    if checkPlatform and jobPlatform:
      result = gConfig.getOptionsDict('/Resources/Computing/OSCompatibility')
      if not result['OK']:
        return S_ERROR("Unable to get OSCompatibility list")
      allPlatforms = result['Value']
      if jobPlatform not in allPlatforms:
        self.jobLog.error("Platform %s is not supported" % jobPlatform)
        return S_ERROR("Platform %s is not supported" % jobPlatform)

    # Filter the userSites by the platform selection (if there is one)
    if checkPlatform and userSites:
      if jobPlatform:
        result = self.__filterByPlatform(jobPlatform, userSites)
        if not result['OK']:
          self.jobLog.error("Failed to filter job sites by platform: %s" % result['Message'])
          return S_ERROR("Failed to filter job sites by platform")
        userSites = result['Value']
        if not userSites:
          # No sites left after filtering -> Invalid platform/sites combination
          self.jobLog.error("No selected sites match platform '%s'" % jobPlatform)
          return S_ERROR("No selected sites match platform '%s'" % jobPlatform)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error("Cannot get input data %s" % (result['Message']))
      return S_ERROR("Failed to get input data from JobDB")

    if not result['Value']:
      # No input data? Just send to TQ
      return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites)

    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']

    # ===================================================================================
    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    # ===================================================================================
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
      self.jobLog.info("Production job: sending to TQ, but first checking if staging is requested")

      res = getFilesToStage(inputData,
                            jobState=jobState,
                            checkOnlyTapeSEs=self.ex_getOption('CheckOnlyTapeSEs', True),
                            jobLog=self.jobLog)

      if not res['OK']:
        return self.__holdJob(jobState, res['Message'])
      if res['Value']['absentLFNs']:
        # Some files do not exist at all... set the job Failed
        # Reverse errors: group the LFNs by failure reason
        reasons = {}
        # items() instead of the Python-2-only iteritems()
        for lfn, reason in res['Value']['absentLFNs'].items():
          reasons.setdefault(reason, []).append(lfn)
        for reason, lfns in reasons.items():
          # Some files are missing in the FC or in SEs, fail the job
          self.jobLog.error(reason, ','.join(lfns))
        error = ','.join(reasons)
        return S_ERROR(error)

      if res['Value']['failedLFNs']:
        return self.__holdJob(jobState, "Couldn't get storage metadata of some files")
      stageLFNs = res['Value']['offlineLFNs']
      if stageLFNs:
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
          return res
        if not res['Value']:
          return S_ERROR("Stage not allowed")
        # Propagate a failed stage request instead of reporting success,
        # as is done for user jobs further down
        res = self.__requestStaging(jobState, stageLFNs)
        if not res['OK']:
          return res
        return S_OK()
      else:
        # No staging required
        onlineSites = res['Value']['onlineSites']
        if onlineSites:
          # Set the online site(s) first
          userSites = set(userSites)
          onlineSites &= userSites
          userSites = list(onlineSites) + list(userSites - onlineSites)
        return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites, onlineSites=onlineSites)

    # ===================================================
    # From now on we know it's a user job with input data
    # ===================================================

    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
      self.jobLog.error("Could not retrieve input data info", result['Message'])
      return S_ERROR("Could not retrieve input data info")
    opData = result['Value']

    if 'SiteCandidates' not in opData:
      return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)

    if userSites:
      siteCandidates = list(set(siteCandidates) & set(userSites))

    siteCandidates = self._applySiteFilter(siteCandidates, banned=userBannedSites)
    if not siteCandidates:
      return S_ERROR("Impossible InputData * Site requirements")

    idSites = {}
    for site in siteCandidates:
      idSites[site] = opData['SiteCandidates'][site]

    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
      if numData != idSites[site]['disk'] + idSites[site]['tape']:
        self.jobLog.error("Site candidate %s does not have all the input data" % site)
        errorSites.add(site)
    # Removal is done in a second pass so that idSites is not modified while iterating over it
    for site in errorSites:
      idSites.pop(site)
    if not idSites:
      return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(inputData, idSites)
    if not siteCandidates:
      return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
      return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
      # Use siteCandidates and not stageSites because active and banned sites
      # will be taken into account on matching time
      return self.__sendToTQ(jobState, jobManifest, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
      res = self.__checkStageAllowed(jobState)
      if not res['OK']:
        return res
      if not res['Value']:
        return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
    if not stageRequest['OK']:
      return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
      return result
    stageLFNs = result['Value']
    self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
      return result

    return self.__setJobSite(jobState, stageSites)
示例#22
0
    def optimizeJob(self, jid, jobState):
        """ Decide whether a job is held, sent for staging or sent to the task queue (TQ).

        Steps, in order:

        1. Hold the job if it was rescheduled and the configured delay has not elapsed yet.
        2. Resolve the sites requested by the user against the usable and banned sites
           (skipped for job types listed in the ``ExcludedOnHoldJobTypes`` option).
        3. If the ``CheckPlatform`` option is set, validate the requested platform and
           filter the user sites by it.
        4. Jobs without input data are sent straight to the TQ.
        5. Job types listed in the ``Transformations/DataProcessing`` Operations value are
           sent to the TQ, unless some input files are offline, in which case staging
           is requested.
        6. Any other job with input data is matched against the site candidates retrieved
           for the optimizer named by the ``InputDataAgent`` option; staging is requested
           when it is required.

        :param jid: job identifier (not used in this method)
        :param jobState: job state object giving access to the job attributes, manifest
                         and input data
        :return: S_OK/S_ERROR structure
        """
        # Reschedule delay
        result = jobState.getAttributes(
            ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
        if not result['OK']:
            return result
        attDict = result['Value']
        try:
            reschedules = int(attDict['RescheduleCounter'])
        except (ValueError, KeyError):
            return S_ERROR("RescheduleCounter has to be an integer")
        if reschedules != 0:
            delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
            # The last configured delay applies to every reschedule count
            # beyond the end of the list
            delay = delays[min(reschedules, len(delays) - 1)]
            waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
            if waited < delay:
                return self.__holdJob(
                    jobState, 'On Hold: after rescheduling %s' % reschedules,
                    delay)

        # Get the job manifest for the later checks
        result = jobState.getManifest()
        if not result['OK']:
            return S_ERROR("Could not retrieve job manifest: %s" %
                           result['Message'])
        jobManifest = result['Value']

        # Get site requirements
        result = self.__getSitesRequired(jobManifest)
        if not result['OK']:
            return result
        userSites, userBannedSites = result['Value']

        # Get job type
        result = jobState.getAttribute("JobType")
        if not result['OK']:
            return S_ERROR("Could not retrieve job type")
        jobType = result['Value']

        # Get banned sites from DIRAC
        result = self.siteClient.getSites('Banned')
        if not result['OK']:
            return S_ERROR("Cannot retrieve banned sites from JobDB")
        wmsBannedSites = result['Value']

        # If the user has selected any site, filter them and hold the job if not able to run
        if userSites:
            if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):

                result = self.siteClient.getUsableSites(userSites)
                if not result['OK']:
                    return S_ERROR(
                        "Problem checking userSites for tuple of active/banned/invalid sites"
                    )
                usableSites = set(result['Value'])
                # Classify the requested sites that cannot be used: banned ones first,
                # anything else that is not usable is considered invalid
                bannedSites = []
                invalidSites = []
                for site in userSites:
                    if site in wmsBannedSites:
                        bannedSites.append(site)
                    elif site not in usableSites:
                        invalidSites.append(site)

                if invalidSites:
                    self.jobLog.debug("Invalid site(s) requested: %s" %
                                      ','.join(invalidSites))
                    if not self.ex_getOption('AllowInvalidSites', True):
                        return self.__holdJob(
                            jobState, "Requested site(s) %s are invalid" %
                            ",".join(invalidSites))
                if bannedSites:
                    self.jobLog.debug("Banned site(s) %s ignored" %
                                      ",".join(bannedSites))
                    if not usableSites:
                        return self.__holdJob(
                            jobState, "Requested site(s) %s are inactive" %
                            ",".join(bannedSites))

                if not usableSites:
                    return self.__holdJob(
                        jobState, "No requested site(s) are active/valid")
                userSites = list(usableSites)

        checkPlatform = self.ex_getOption('CheckPlatform', False)
        jobPlatform = jobManifest.getOption("Platform", None)
        # First check that the platform is valid (in OSCompatibility list)
        if checkPlatform and jobPlatform:
            result = gConfig.getOptionsDict(
                '/Resources/Computing/OSCompatibility')
            if not result['OK']:
                return S_ERROR("Unable to get OSCompatibility list")
            allPlatforms = result['Value']
            if jobPlatform not in allPlatforms:
                self.jobLog.error("Platform not supported", jobPlatform)
                return S_ERROR("Platform %s is not supported" % jobPlatform)

        # Filter the userSites by the platform selection (if there is one)
        if checkPlatform and userSites:
            if jobPlatform:
                result = self.__filterByPlatform(jobPlatform, userSites)
                if not result['OK']:
                    self.jobLog.error("Failed to filter job sites by platform",
                                      result['Message'])
                    return S_ERROR("Failed to filter job sites by platform")
                userSites = result['Value']
                if not userSites:
                    # No sites left after filtering -> Invalid platform/sites combination
                    self.jobLog.error("No selected sites match platform",
                                      jobPlatform)
                    return S_ERROR("No selected sites match platform '%s'" %
                                   jobPlatform)

        # Check if there is input data
        result = jobState.getInputData()
        if not result['OK']:
            self.jobLog.error("Cannot get input data", result['Message'])
            return S_ERROR("Failed to get input data from JobDB")

        if not result['Value']:
            # No input data? Just send to TQ
            return self.__sendToTQ(jobState, jobManifest, userSites,
                                   userBannedSites)

        self.jobLog.verbose("Has an input data requirement")
        inputData = result['Value']

        # ===================================================================================
        # Production jobs are sent to TQ, but first we have to verify if staging is necessary
        # ===================================================================================
        if jobType in Operations().getValue('Transformations/DataProcessing',
                                            []):
            self.jobLog.info(
                "Production job: sending to TQ, but first checking if staging is requested"
            )

            res = getFilesToStage(inputData,
                                  jobState=jobState,
                                  checkOnlyTapeSEs=self.ex_getOption(
                                      'CheckOnlyTapeSEs', True),
                                  jobLog=self.jobLog)

            if not res['OK']:
                return self.__holdJob(jobState, res['Message'])
            if res['Value']['absentLFNs']:
                # Some files do not exist at all... set the job Failed
                # Reverse errors: group the LFNs by failure reason
                reasons = {}
                # items() instead of the Python-2-only iteritems()
                for lfn, reason in res['Value']['absentLFNs'].items():
                    reasons.setdefault(reason, []).append(lfn)
                for reason, lfns in reasons.items():
                    # Some files are missing in the FC or in SEs, fail the job
                    self.jobLog.error(reason, ','.join(lfns))
                error = ','.join(reasons)
                return S_ERROR(error)

            if res['Value']['failedLFNs']:
                return self.__holdJob(
                    jobState, "Couldn't get storage metadata of some files")
            stageLFNs = res['Value']['offlineLFNs']
            if stageLFNs:
                res = self.__checkStageAllowed(jobState)
                if not res['OK']:
                    return res
                if not res['Value']:
                    return S_ERROR("Stage not allowed")
                # Propagate a failed stage request instead of reporting success,
                # as is done for user jobs further down
                res = self.__requestStaging(jobState, stageLFNs)
                if not res['OK']:
                    return res
                return S_OK()
            else:
                # No staging required
                onlineSites = res['Value']['onlineSites']
                if onlineSites:
                    # Set the online site(s) first
                    userSites = set(userSites)
                    onlineSites &= userSites
                    userSites = list(onlineSites) + list(userSites -
                                                         onlineSites)
                return self.__sendToTQ(jobState,
                                       jobManifest,
                                       userSites,
                                       userBannedSites,
                                       onlineSites=onlineSites)

        # ===================================================
        # From now on we know it's a user job with input data
        # ===================================================

        idAgent = self.ex_getOption('InputDataAgent', 'InputData')
        result = self.retrieveOptimizerParam(idAgent)
        if not result['OK']:
            self.jobLog.error("Could not retrieve input data info",
                              result['Message'])
            return S_ERROR("Could not retrieve input data info")
        opData = result['Value']

        if 'SiteCandidates' not in opData:
            return S_ERROR("No possible site candidates")

        # Filter input data sites with user requirement
        siteCandidates = list(opData['SiteCandidates'])
        self.jobLog.info("Site candidates are %s" % siteCandidates)

        if userSites:
            siteCandidates = list(set(siteCandidates) & set(userSites))

        siteCandidates = self._applySiteFilter(siteCandidates,
                                               banned=userBannedSites)
        if not siteCandidates:
            return S_ERROR("Impossible InputData * Site requirements")

        idSites = {}
        for site in siteCandidates:
            idSites[site] = opData['SiteCandidates'][site]

        # Check if sites have correct count of disk+tape replicas
        numData = len(inputData)
        errorSites = set()
        for site in idSites:
            if numData != idSites[site]['disk'] + idSites[site]['tape']:
                self.jobLog.error(
                    "Site candidate %s does not have all the input data" %
                    site)
                errorSites.add(site)
        # Removal is done in a second pass so that idSites is not modified
        # while iterating over it
        for site in errorSites:
            idSites.pop(site)
        if not idSites:
            return S_ERROR("Site candidates do not have all the input data")

        # Check if staging is required
        stageRequired, siteCandidates = self.__resolveStaging(
            inputData, idSites)
        if not siteCandidates:
            return S_ERROR("No destination sites available")

        # Is any site active?
        stageSites = self._applySiteFilter(siteCandidates,
                                           banned=wmsBannedSites)
        if not stageSites:
            return self.__holdJob(
                jobState,
                "Sites %s are inactive or banned" % ", ".join(siteCandidates))

        # If no staging is required send to TQ
        if not stageRequired:
            # Use siteCandidates and not stageSites because active and banned sites
            # will be taken into account on matching time
            return self.__sendToTQ(jobState, jobManifest, siteCandidates,
                                   userBannedSites)

        # Check if the user is allowed to stage
        if self.ex_getOption("RestrictDataStage", False):
            res = self.__checkStageAllowed(jobState)
            if not res['OK']:
                return res
            if not res['Value']:
                return S_ERROR("Stage not allowed")

        # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
        stageSite = stageSites[0]
        self.jobLog.verbose(" Staging site will be %s" % (stageSite))
        stageData = idSites[stageSite]
        # Set as if everything has already been staged
        stageData['disk'] += stageData['tape']
        stageData['tape'] = 0
        # Set the site info back to the original dict to save afterwards
        opData['SiteCandidates'][stageSite] = stageData

        stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
        if not stageRequest['OK']:
            return stageRequest
        stageLFNs = stageRequest['Value']
        result = self.__requestStaging(jobState, stageLFNs)
        if not result['OK']:
            return result
        stageLFNs = result['Value']
        self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
        # Save the optimizer data again
        self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
        result = self.storeOptimizerParam(idAgent, opData)
        if not result['OK']:
            return result

        return self.__setJobSite(jobState, stageSites)
示例#23
0
    def __failStalledJobs(self, failedTime):
        """ Set to Failed the jobs that have been Stalled for too long and account for them.

        A Stalled job is failed when its pilot status is not "Running", or when its
        latest update is older than ``failedTime`` seconds. Jobs that get failed are
        sent a kill signal and their accounting is sent. Afterwards, accounting is
        also sent for the jobs that are already Failed with one of the stalled minor
        statuses but whose ``AccountedFlag`` is still "False".

        :param failedTime: time in seconds without update after which a Stalled job is failed
        :return: S_OK(number of jobs set to Failed), or the error result of a failed job selection
        """

        result = self.jobDB.selectJobs({"Status": "Stalled"})
        if not result["OK"]:
            return result
        jobs = result["Value"]

        failedCounter = 0
        minorStalledStatuses = ("Job stalled: pilot not running", "Stalling for more than %d sec" % failedTime)

        if jobs:
            self.log.info("%s Stalled jobs will be checked for failure" % (len(jobs)))

            for job in jobs:
                # Minor status to set when failing the job; None leaves the job Stalled
                failedMinorStatus = None
                # Check if the job pilot is lost
                result = self.__getJobPilotStatus(job)
                if not result["OK"]:
                    self.log.error("Failed to get pilot status", result["Message"])
                    continue
                pilotStatus = result["Value"]
                if pilotStatus != "Running":
                    failedMinorStatus = minorStalledStatuses[0]
                else:
                    # The pilot is still running: only fail the job if it has been silent for too long
                    result = self.__getLatestUpdateTime(job)
                    if not result["OK"]:
                        self.log.error("Failed to get job update time", result["Message"])
                        continue
                    elapsedTime = toEpoch() - result["Value"]
                    if elapsedTime > failedTime:
                        failedMinorStatus = minorStalledStatuses[1]

                # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info
                if failedMinorStatus:
                    # Send a kill signal to the job such that it cannot continue running
                    WMSClient().killJob(job)
                    self.__updateJobStatus(job, "Failed", failedMinorStatus)
                    failedCounter += 1
                    result = self.__sendAccounting(job)
                    if not result["OK"]:
                        self.log.error("Failed to send accounting", result["Message"])

        recoverCounter = 0

        for minor in minorStalledStatuses:
            result = self.jobDB.selectJobs({"Status": "Failed", "MinorStatus": minor, "AccountedFlag": "False"})
            if not result["OK"]:
                return result
            jobs = result["Value"]
            if not jobs:
                continue
            self.log.info("%s Stalled jobs will be Accounted" % (len(jobs)))
            for job in jobs:
                # Keep the accounting result in its own variable: a failure for one job is
                # logged and skipped, and must not stop the processing of the other minor status
                accountingResult = self.__sendAccounting(job)
                if not accountingResult["OK"]:
                    self.log.error("Failed to send accounting", accountingResult["Message"])
                    continue

                recoverCounter += 1

        if failedCounter:
            self.log.info("%d jobs set to Failed" % failedCounter)
        if recoverCounter:
            self.log.info("%d jobs properly Accounted" % recoverCounter)
        return S_OK(failedCounter)