def __getLatestUpdateTime(self, job):
    """Return the epoch of the freshest of HeartBeatTime and LastUpdateTime.

    :param job: job identifier, passed straight through to JobDB
    :return: S_OK(epoch seconds) or S_ERROR when both timestamps are unset
    """
    result = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"])
    if not result["OK"]:
        self.log.error("Failed to get job attributes", result["Message"])
    if not result["OK"] or not result["Value"]:
        self.log.error("Could not get attributes for job", "%s" % job)
        return S_ERROR("Could not get attributes for job")
    self.log.verbose(result)

    attributes = result["Value"]
    latestUpdate = 0

    # The DB may hold an empty value or the literal string "None" for either field
    heartBeatRaw = attributes["HeartBeatTime"]
    if not heartBeatRaw or heartBeatRaw == "None":
        self.log.verbose("HeartBeatTime is null for job %s" % job)
    else:
        latestUpdate = toEpoch(fromString(heartBeatRaw))

    lastUpdateRaw = attributes["LastUpdateTime"]
    if not lastUpdateRaw or lastUpdateRaw == "None":
        self.log.verbose("LastUpdateTime is null for job %s" % job)
    else:
        # Keep whichever of the two timestamps is more recent
        latestUpdate = max(latestUpdate, toEpoch(fromString(lastUpdateRaw)))

    if not latestUpdate:
        return S_ERROR("LastUpdate and HeartBeat times are null for job %s" % job)
    self.log.verbose("Latest update time from epoch for job %s is %s" % (job, latestUpdate))
    return S_OK(latestUpdate)
def __getLatestUpdateTime(self, job):
    """Return the most recent of HeartBeatTime and LastUpdateTime as epoch seconds.

    :param job: job identifier forwarded to JobDB
    :return: S_OK(epoch) or S_ERROR when the attributes are unavailable or both null
    """
    result = self.jobDB.getJobAttributes(job, ['HeartBeatTime', 'LastUpdateTime'])
    if not result['OK']:
        self.log.error(result['Message'])
    if not result['OK'] or not result['Value']:
        return S_ERROR('Could not get attributes for job %s' % job)
    self.log.verbose(result)

    attributes = result['Value']

    # Convert HeartBeatTime first; an unset or literal "None" value counts as absent
    heartBeatEpoch = 0
    rawHeartBeat = attributes['HeartBeatTime']
    if rawHeartBeat and rawHeartBeat != 'None':
        heartBeatEpoch = toEpoch(fromString(rawHeartBeat))
    else:
        self.log.verbose('HeartBeatTime is null for job %s' % job)

    # Then take LastUpdateTime if it is fresher
    latestUpdate = heartBeatEpoch
    rawLastUpdate = attributes['LastUpdateTime']
    if rawLastUpdate and rawLastUpdate != 'None':
        lastUpdateEpoch = toEpoch(fromString(rawLastUpdate))
        if lastUpdateEpoch > latestUpdate:
            latestUpdate = lastUpdateEpoch
    else:
        self.log.verbose('LastUpdateTime is null for job %s' % job)

    if not latestUpdate:
        return S_ERROR('LastUpdate and HeartBeat times are null for job %s' % job)
    self.log.verbose('Latest update time from epoch for job %s is %s' % (job, latestUpdate))
    return S_OK(latestUpdate)
def __getLatestUpdateTime( self, job ):
    """ Returns the most recent of HeartBeatTime and LastUpdateTime

        :param job: job identifier forwarded to JobDB
        :return: S_OK( epoch seconds ) or S_ERROR when both timestamps are null
    """
    result = self.jobDB.getJobAttributes( job, ['HeartBeatTime', 'LastUpdateTime'] )
    if not result['OK']:
        self.log.error( 'Failed to get job attributes', result['Message'] )
    if not result['OK'] or not result['Value']:
        self.log.error( 'Could not get attributes for job', '%s' % job )
        return S_ERROR( 'Could not get attributes for job' )
    self.log.verbose( result )
    jobAttrs = result['Value']

    def _epochOrZero( attrName ):
        # Convert the named timestamp to epoch seconds; 0 when unset or "None"
        raw = jobAttrs[attrName]
        if not raw or raw == 'None':
            self.log.verbose( '%s is null for job %s' % ( attrName, job ) )
            return 0
        return toEpoch( fromString( raw ) )

    latestUpdate = _epochOrZero( 'HeartBeatTime' )
    lastUpdate = _epochOrZero( 'LastUpdateTime' )
    if lastUpdate > latestUpdate:
        latestUpdate = lastUpdate

    if not latestUpdate:
        return S_ERROR( 'LastUpdate and HeartBeat times are null for job %s' % job )
    self.log.verbose( 'Latest update time from epoch for job %s is %s' % ( job, latestUpdate ) )
    return S_OK( latestUpdate )
def _failStalledJobs(self, jobID):
    """Move a long-Stalled job to Failed. Run inside thread.

    :param jobID: job to examine
    :return: S_OK / S_ERROR result dictionary
    """
    # First find out whether the pilot is still around
    result = self.__getJobPilotStatus(jobID)
    if not result['OK']:
        self.log.error('Failed to get pilot status', "for job %d: %s" % (jobID, result['Message']))
        return result

    if result['Value'] != "Running":
        # Pilot is gone: the job can never report back
        setFailed = self.minorStalledStatuses[0]
    else:
        # Pilot alive: only fail after a long enough silence
        setFailed = False
        result = self.__getLatestUpdateTime(jobID)
        if not result['OK']:
            self.log.error('Failed to get job update time', "for job %d: %s" % (jobID, result['Message']))
            return result
        if toEpoch() - result['Value'] > self.failedTime:
            setFailed = self.minorStalledStatuses[1]

    if not setFailed:
        return S_OK()

    # Kill in case the payload is not really dead, then mark the job Failed
    self.__sendKillCommand(jobID)  # always returns None
    return self.__updateJobStatus(jobID, JobStatus.FAILED, minorStatus=setFailed)
def _failStalledJobs(self, jobID):
    """
    Changes the Stalled status to Failed for jobs long in the Stalled status.
    Run inside thread.

    :param jobID: job to examine
    :return: S_OK / S_ERROR result dictionary
    """
    # Minor status to apply; stays False when the job still shows signs of life
    setFailed = False
    # Check if the job pilot is lost
    result = self._getJobPilotStatus(jobID)
    if not result["OK"]:
        self.log.error("Failed to get pilot status", "for job %d: %s" % (jobID, result["Message"]))
        return result
    pilotStatus = result["Value"]
    if pilotStatus != "Running":
        setFailed = self.minorStalledStatuses[0]
    else:
        # Verify that there was no sign of life for long enough
        result = self._getLatestUpdateTime(jobID)
        if not result["OK"]:
            self.log.error("Failed to get job update time", "for job %d: %s" % (jobID, result["Message"]))
            return result
        elapsedTime = toEpoch() - result["Value"]
        if elapsedTime > self.failedTime:
            setFailed = self.minorStalledStatuses[1]

    # Set the jobs Failed, send them a kill signal in case they are not really dead
    # and send accounting info
    if setFailed:
        self._sendKillCommand(jobID)  # always returns None

        # For some sites we might want to reschedule rather than fail the jobs
        if self.stalledJobsToRescheduleSites:
            # NOTE(review): attribute name is lower-case "site" here — confirm it
            # matches the JobDB attribute naming used by other callers
            result = self.jobDB.getJobAttribute(jobID, "site")
            if not result["OK"]:
                return result
            site = result["Value"]
            if site in self.stalledJobsToRescheduleSites:
                return self._updateJobStatus(jobID, JobStatus.RESCHEDULED, minorStatus=setFailed, force=True)

        return self._updateJobStatus(jobID, JobStatus.FAILED, minorStatus=setFailed)

    return S_OK()
def _getLatestUpdateTime(self, job):
    """Return the most recent of HeartBeatTime and LastUpdateTime as epoch seconds.

    :param job: job ID
    :return: S_OK(epoch seconds) or S_ERROR when the attributes cannot be read
             or both timestamps are null
    """
    result = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"])
    if not result["OK"] or not result["Value"]:
        # Use %s for the job ID: sibling methods format it with %s, and %d would
        # raise TypeError inside this error path if the ID arrives as a string.
        # dict.get replaces the manual "in" check for the optional Message key.
        self.log.error(
            "Failed to get job attributes",
            "for job %s: %s" % (job, result.get("Message", "empty")),
        )
        return S_ERROR("Could not get attributes for job")

    latestUpdate = 0
    # The DB may hold an empty value or the literal string "None" for either field
    if not result["Value"]["HeartBeatTime"] or result["Value"]["HeartBeatTime"] == "None":
        self.log.verbose("HeartBeatTime is null", "for job %s" % job)
    else:
        latestUpdate = toEpoch(fromString(result["Value"]["HeartBeatTime"]))
    if not result["Value"]["LastUpdateTime"] or result["Value"]["LastUpdateTime"] == "None":
        self.log.verbose("LastUpdateTime is null", "for job %s" % job)
    else:
        # Keep whichever timestamp is fresher
        latestUpdate = max(latestUpdate, toEpoch(fromString(result["Value"]["LastUpdateTime"])))

    if not latestUpdate:
        return S_ERROR("LastUpdate and HeartBeat times are null for job %s" % job)
    self.log.verbose("", "Latest update time from epoch for job %s is %s" % (job, latestUpdate))
    return S_OK(latestUpdate)
def __checkJobStalled(self, job, stalledTime):
    """Compare the freshest of LastUpdateTime/HeartBeatTime against stalledTime.

    :param job: job identifier
    :param stalledTime: silence threshold in seconds
    :return: S_OK('Stalled') when the job exceeded the threshold, S_ERROR otherwise
    """
    result = self.__getLatestUpdateTime(job)
    if not result['OK']:
        return result
    sinceLastUpdate = toEpoch() - result['Value']
    self.log.debug('(CurrentTime-LastUpdate) = %s secs' % (sinceLastUpdate))
    if sinceLastUpdate <= stalledTime:
        # Recent enough activity: not stalled
        return S_ERROR('Job %s is running and will be ignored' % job)
    self.log.info('Job is identified as stalled',
                  ": jobID %d with last update > %s secs ago" % (job, sinceLastUpdate))
    return S_OK('Stalled')
def __getStalledJob( self, job, stalledTime ):
    """ Compares the most recent of LastUpdateTime and HeartBeatTime against
        the stalledTime limit.

        :param job: job identifier
        :param stalledTime: silence threshold in seconds
        :return: S_OK( 'Stalled' ) when stalled, S_ERROR when still active
    """
    result = self.__getLatestUpdateTime( job )
    if not result['OK']:
        return result
    # Seconds since the job last showed a sign of life
    idleSeconds = toEpoch() - result['Value']
    self.log.verbose( '(CurrentTime-LastUpdate) = %s secs' % ( idleSeconds ) )
    if idleSeconds > stalledTime:
        self.log.info( 'Job %s is identified as stalled with last update > %s secs ago' % ( job, idleSeconds ) )
        return S_OK( 'Stalled' )
    return S_ERROR( 'Job %s is running and will be ignored' % job )
def __failStalledJobs( self, failedTime ):
    """ Changes the Stalled status to Failed for jobs long in the Stalled status

        :param failedTime: seconds of silence after which a Stalled job is Failed
        :return: S_OK( number of jobs set to Failed ) or S_ERROR
    """
    result = self.jobDB.selectJobs( {'Status':'Stalled'} )
    if not result['OK']:
        return result
    failedCounter = 0
    if result['Value']:
        jobs = result['Value']
        self.log.info( '%s Stalled jobs will be checked for failure' % ( len( jobs ) ) )
        for job in jobs:
            # Check if the job pilot is lost
            result = self.__getJobPilotStatus( job )
            if result['OK']:
                pilotStatus = result['Value']
                if pilotStatus != "Running":
                    # Pilot gone: fail the job immediately and account it
                    result = self.__updateJobStatus( job, 'Failed', "Job stalled: pilot not running" )
                    failedCounter += 1
                    result = self.__sendAccounting( job )
                    if not result['OK']:
                        self.log.error( 'Failed to send accounting', result['Message'] )
                        # NOTE: a single accounting failure aborts the whole loop
                        break
                    continue
            # Pilot still running (or its status unknown): fail only after a
            # long enough silence since the last heartbeat/update
            result = self.__getLatestUpdateTime( job )
            if not result['OK']:
                return result
            currentTime = toEpoch()
            lastUpdate = result['Value']
            elapsedTime = currentTime - lastUpdate
            if elapsedTime > failedTime:
                self.__updateJobStatus( job, 'Failed', 'Stalling for more than %d sec' % failedTime )
                failedCounter += 1
                result = self.__sendAccounting( job )
                if not result['OK']:
                    self.log.error( 'Failed to send accounting', result['Message'] )
                    break
    # Second pass: send accounting for Failed-stalled jobs not yet accounted
    recoverCounter = 0
    for minor in ["Job stalled: pilot not running", 'Stalling for more than %d sec' % failedTime]:
        result = self.jobDB.selectJobs( {'Status':'Failed', 'MinorStatus': minor, 'AccountedFlag': 'False' } )
        if not result['OK']:
            return result
        if result['Value']:
            jobs = result['Value']
            self.log.info( '%s Stalled jobs will be Accounted' % ( len( jobs ) ) )
            for job in jobs:
                result = self.__sendAccounting( job )
                if not result['OK']:
                    self.log.error( 'Failed to send accounting', result['Message'] )
                    continue
                recoverCounter += 1
        # result here is the outcome of the last accounting call above
        if not result['OK']:
            break
    if failedCounter:
        self.log.info( '%d jobs set to Failed' % failedCounter )
    if recoverCounter:
        self.log.info( '%d jobs properly Accounted' % recoverCounter )
    return S_OK( failedCounter )
def __failStalledJobs( self, failedTime ):
    """ Changes the Stalled status to Failed for jobs long in the Stalled status

        :param failedTime: seconds of silence after which a Stalled job is Failed
        :return: S_OK( number of jobs set to Failed ) or S_ERROR
    """
    result = self.jobDB.selectJobs( {'Status':'Stalled'} )
    if not result['OK']:
        return result
    jobs = result['Value']
    failedCounter = 0
    # Minor statuses used both when failing jobs and when selecting them for accounting
    minorStalledStatuses = ( "Job stalled: pilot not running", 'Stalling for more than %d sec' % failedTime )
    if jobs:
        self.log.info( '%s Stalled jobs will be checked for failure' % ( len( jobs ) ) )
        for job in jobs:
            setFailed = False
            # Check if the job pilot is lost
            result = self.__getJobPilotStatus( job )
            if not result['OK']:
                self.log.error( 'Failed to get pilot status', result['Message'] )
                continue
            pilotStatus = result['Value']
            if pilotStatus != "Running":
                setFailed = minorStalledStatuses[0]
            else:
                # Pilot alive: fail only after a long enough silence
                result = self.__getLatestUpdateTime( job )
                if not result['OK']:
                    self.log.error( 'Failed to get job update time', result['Message'] )
                    continue
                elapsedTime = toEpoch() - result['Value']
                if elapsedTime > failedTime:
                    setFailed = minorStalledStatuses[1]
            # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info
            if setFailed:
                self.__sendKillCommand(job)
                self.__updateJobStatus( job, 'Failed', setFailed )
                failedCounter += 1
                result = self.__sendAccounting( job )
                if not result['OK']:
                    self.log.error( 'Failed to send accounting', result['Message'] )
    # Second pass: account Failed-stalled jobs not yet flagged as accounted
    recoverCounter = 0
    for minor in minorStalledStatuses:
        result = self.jobDB.selectJobs( {'Status':'Failed', 'MinorStatus': minor, 'AccountedFlag': 'False' } )
        if not result['OK']:
            return result
        if result['Value']:
            jobs = result['Value']
            self.log.info( '%s Stalled jobs will be Accounted' % ( len( jobs ) ) )
            for job in jobs:
                result = self.__sendAccounting( job )
                if not result['OK']:
                    self.log.error( 'Failed to send accounting', result['Message'] )
                    continue
                recoverCounter += 1
        # result here is the outcome of the last accounting call above
        if not result['OK']:
            break
    if failedCounter:
        self.log.info( '%d jobs set to Failed' % failedCounter )
    if recoverCounter:
        self.log.info( '%d jobs properly Accounted' % recoverCounter )
    return S_OK( failedCounter )
def checkJob( self, job, classAdJob ):
    """This method controls the checking of the job.

       Steps: (0) skip recently rescheduled jobs, (1) apply user site
       requirements, (2) apply the WMS site mask, (3) inspect input data,
       (4) use optimizer site candidates and trigger staging when needed.

       :param job: job identifier
       :param classAdJob: job description as a ClassAd object
       :return: S_OK / S_ERROR result dictionary
    """
    self.log.verbose( 'Job %s will be processed' % ( job ) )
    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes( job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'] )
    if not result['OK']:
        self.log.error( result['Message'] )
        return S_ERROR( 'Can not get job attributes from JobDB' )
    jobDict = result['Value']
    reCounter = int( jobDict['RescheduleCounter'] )
    if reCounter != 0:
        reTime = fromString( jobDict['RescheduleTime'] )
        delta = toEpoch() - toEpoch( reTime )
        # Delay grows with the reschedule counter, capped at maxRescheduleDelay
        delay = self.maxRescheduleDelay
        if reCounter <= len( self.rescheduleDelaysList ):
            delay = self.rescheduleDelaysList[reCounter - 1]
        if delta < delay:
            # Hold the job until the delay expires; set the status only once
            if jobDict['ApplicationStatus'].find( 'On Hold: after rescheduling' ) == -1:
                result = self.jobDB.setJobStatus( job, application = 'On Hold: after rescheduling #%d' % reCounter )
            return S_OK()
    # First, get Site and BannedSites from the Job
    result = self.__getJobSiteRequirement( job, classAdJob )
    userBannedSites = result['BannedSites']
    userSites = result['Sites']
    if userSites:
        userSites = applySiteRequirements( userSites, [], userBannedSites )
        if not userSites:
            msg = 'Impossible Site Requirement'
            return S_ERROR( msg )
    # Second, get the Active and Banned sites from the WMS
    wmsSites = self.jobDB.getSiteMask( 'Active' )
    wmsBannedSites = self.jobDB.getSiteMask( 'Banned' )
    if not ( wmsSites['OK'] and wmsBannedSites['OK'] ):
        if not wmsSites['OK']:
            self.log.error( wmsSites['Message'] )
        if not wmsBannedSites['OK']:
            self.log.error( wmsBannedSites['Message'] )
        return S_ERROR( 'Can not get Active and Banned Sites from JobDB' )
    wmsSites = wmsSites['Value']
    wmsBannedSites = wmsBannedSites['Value']
    if userSites:
        sites = applySiteRequirements( userSites, wmsSites, wmsBannedSites )
        if not sites:
            # Put on Hold only non-excluded job types
            jobType = classAdJob.getAttributeString( 'JobType' )
            if not jobType in self.excludedOnHoldJobTypes:
                msg = 'On Hold: Requested site is Banned or not Active'
                self.log.info( msg )
                result = self.jobDB.setJobStatus( job, application = msg )
            return S_OK()
    # Third, check if there is input data
    result = self.jobDB.getInputData( job )
    if not result['OK']:
        self.log.warn( 'Failed to get input data from JobDB for %s' % ( job ) )
        self.log.error( result['Message'] )
        return S_ERROR( 'Failed to get input data from JobDB' )
    if not result['Value']:
        return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )
    # Filter out empty LFN entries
    hasInputData = False
    inputData = []
    for lfn in result['Value']:
        if lfn:
            inputData.append( lfn )
            hasInputData = True
    if not hasInputData:
        #With no input data requirement, job can proceed directly to task queue
        self.log.verbose( 'Job %s has no input data requirement' % ( job ) )
        return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )
    self.log.verbose( 'Job %s has an input data requirement ' % ( job ) )
    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo( job )
    if not result['OK']:
        return result
    optInfo = result['Value']
    #Compare site candidates with current mask
    optSites = optInfo['SiteCandidates'].keys()
    self.log.info( 'Input Data Site Candidates: %s' % ( ', '.join( optSites ) ) )
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements( optSites, userSites, userBannedSites )
    if not optSites:
        msg = 'Impossible Site + InputData Requirement'
        return S_ERROR( msg )
    sites = applySiteRequirements( optSites, wmsSites, wmsBannedSites )
    if not sites:
        msg = 'On Hold: InputData Site is Banned or not Active'
        self.log.info( msg )
        result = self.jobDB.setJobStatus( job, application = msg )
        return S_OK()
    #Set stager request as necessary, optimize for smallest #files on tape if
    #more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging( job, sites, inputData, optInfo['SiteCandidates'] )
    if not checkStaging['OK']:
        return checkStaging
    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
        return S_ERROR( 'No destination sites available' )
    stagingFlag = checkStaging['Value']
    if stagingFlag:
        #Single site candidate chosen and staging required
        self.log.verbose( 'Job %s requires staging of input data' % ( job ) )
        # set all LFN to disk for the selected site
        stagingSite = destinationSites[0]
        siteDict = optInfo['SiteCandidates'][stagingSite]
        siteDict['disk'] = siteDict['disk'] + siteDict['tape']
        siteDict['tape'] = 0
        optInfo['SiteCandidates'][stagingSite] = siteDict
        self.log.verbose( 'Updating %s Optimizer Info for Job %s:' % ( self.dataAgentName, job ), optInfo )
        result = self.setOptimizerJobInfo( job, self.dataAgentName, optInfo )
        if not result['OK']:
            return result
        # Site is selected for staging, report it
        self.log.verbose( 'Staging site candidate for job %s is %s' % ( job, stagingSite ) )
        result = self.__getStagingSites( stagingSite, destinationSites )
        if not result['OK']:
            stagingSites = [stagingSite]
        else:
            stagingSites = result['Value']
        if len( stagingSites ) == 1:
            self.jobDB.setJobAttribute( job, 'Site', stagingSite )
        else:
            # Get the name of the site group
            result = self.__getSiteGroup( stagingSites )
            if result['OK']:
                groupName = result['Value']
                if groupName:
                    self.jobDB.setJobAttribute( job, 'Site', groupName )
                else:
                    self.jobDB.setJobAttribute( job, 'Site', 'Multiple' )
            else:
                self.jobDB.setJobAttribute( job, 'Site', 'Multiple' )
        stagerDict = self.__setStagingRequest( job, stagingSite, optInfo )
        if not stagerDict['OK']:
            return stagerDict
        self.__updateOtherSites( job, stagingSite, stagerDict['Value'], optInfo )
        return S_OK()
    else:
        #No staging required, can proceed to task queue agent and then waiting status
        self.log.verbose( 'Job %s does not require staging of input data' % ( job ) )
        #Finally send job to TaskQueueAgent
        return self.__sendJobToTaskQueue( job, classAdJob, destinationSites, userBannedSites )
def _failStalledJobs(self, failedTime):
    """ Changes the Stalled status to Failed for jobs long in the Stalled status

        :param failedTime: seconds of silence after which a Stalled job is Failed
        :return: S_OK(number of jobs set to Failed) or S_ERROR
    """
    # Only get jobs that have been Stalled for long enough
    checkTime = dateTime() - failedTime * second
    result = self.jobDB.selectJobs({'Status': JobStatus.STALLED}, older=checkTime)
    if not result['OK']:
        return result
    jobs = result['Value']
    failedCounter = 0
    # Minor statuses used both when failing jobs and when selecting them for accounting
    minorStalledStatuses = ("Job stalled: pilot not running", 'Stalling for more than %d sec' % failedTime)
    if jobs:
        self.log.info('%d jobs Stalled before %s will be checked for failure' % (len(jobs), str(checkTime)))
        for job in jobs:
            setFailed = False
            # Check if the job pilot is lost
            result = self.__getJobPilotStatus(job)
            if not result['OK']:
                self.log.error('Failed to get pilot status', result['Message'])
                continue
            pilotStatus = result['Value']
            if pilotStatus != "Running":
                setFailed = minorStalledStatuses[0]
            else:
                # Verify that there was no sign of life for long enough
                result = self.__getLatestUpdateTime(job)
                if not result['OK']:
                    self.log.error('Failed to get job update time', result['Message'])
                    continue
                elapsedTime = toEpoch() - result['Value']
                if elapsedTime > failedTime:
                    setFailed = minorStalledStatuses[1]
            # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info
            if setFailed:
                self.__sendKillCommand(job)
                self.__updateJobStatus(job, JobStatus.FAILED, setFailed)
                failedCounter += 1
                result = self.__sendAccounting(job)
                if not result['OK']:
                    self.log.error('Failed to send accounting', result['Message'])
    # Second pass: account Failed-stalled jobs not yet flagged as accounted
    recoverCounter = 0
    for minor in minorStalledStatuses:
        result = self.jobDB.selectJobs({
            'Status': JobStatus.FAILED,
            'MinorStatus': minor,
            'AccountedFlag': 'False'
        })
        if not result['OK']:
            return result
        if result['Value']:
            jobs = result['Value']
            self.log.info('%s Stalled jobs will be Accounted' % (len(jobs)))
            for job in jobs:
                result = self.__sendAccounting(job)
                if not result['OK']:
                    self.log.error('Failed to send accounting', result['Message'])
                    continue
                recoverCounter += 1
        # result here is the outcome of the last accounting call above
        if not result['OK']:
            break
    if failedCounter:
        self.log.info('%d jobs set to Failed' % failedCounter)
    if recoverCounter:
        self.log.info('%d jobs properly Accounted' % recoverCounter)
    return S_OK(failedCounter)
def checkJob(self, job, classAdJob):
    """This method controls the checking of the job.

    Steps: (0) skip recently rescheduled jobs, (1) apply user site
    requirements, (2) apply the WMS site mask, (3) inspect input data,
    (4) use optimizer site candidates and trigger staging when needed.

    :param job: job identifier
    :param classAdJob: job description as a ClassAd object
    :return: S_OK / S_ERROR result dictionary
    """
    self.log.verbose('Job %s will be processed' % (job))
    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes(job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        self.log.error(result['Message'])
        return S_ERROR('Can not get job attributes from JobDB')
    jobDict = result['Value']
    reCounter = int(jobDict['RescheduleCounter'])
    if reCounter != 0:
        reTime = fromString(jobDict['RescheduleTime'])
        delta = toEpoch() - toEpoch(reTime)
        # Delay grows with the reschedule counter, capped at maxRescheduleDelay
        delay = self.maxRescheduleDelay
        if reCounter <= len(self.rescheduleDelaysList):
            delay = self.rescheduleDelaysList[reCounter - 1]
        if delta < delay:
            # Hold the job until the delay expires; set the status only once
            if jobDict['ApplicationStatus'].find('On Hold: after rescheduling') == -1:
                result = self.jobDB.setJobStatus(job, application='On Hold: after rescheduling #%d' % reCounter)
            return S_OK()
    # First, get Site and BannedSites from the Job
    result = self.__getJobSiteRequirement(job, classAdJob)
    userBannedSites = result['BannedSites']
    userSites = result['Sites']
    if userSites:
        userSites = applySiteRequirements(userSites, [], userBannedSites)
        if not userSites:
            msg = 'Impossible Site Requirement'
            return S_ERROR(msg)
    # Second, get the Active and Banned sites from the WMS
    wmsSites = self.jobDB.getSiteMask('Active')
    wmsBannedSites = self.jobDB.getSiteMask('Banned')
    if not (wmsSites['OK'] and wmsBannedSites['OK']):
        if not wmsSites['OK']:
            self.log.error(wmsSites['Message'])
        if not wmsBannedSites['OK']:
            self.log.error(wmsBannedSites['Message'])
        return S_ERROR('Can not get Active and Banned Sites from JobDB')
    wmsSites = wmsSites['Value']
    wmsBannedSites = wmsBannedSites['Value']
    if userSites:
        sites = applySiteRequirements(userSites, wmsSites, wmsBannedSites)
        if not sites:
            # Put on Hold only non-excluded job types
            jobType = classAdJob.getAttributeString('JobType')
            if not jobType in self.excludedOnHoldJobTypes:
                msg = 'On Hold: Requested site is Banned or not Active'
                self.log.info(msg)
                result = self.jobDB.setJobStatus(job, application=msg)
            return S_OK()
    # Third, check if there is input data
    result = self.jobDB.getInputData(job)
    if not result['OK']:
        self.log.warn('Failed to get input data from JobDB for %s' % (job))
        self.log.error(result['Message'])
        return S_ERROR('Failed to get input data from JobDB')
    if not result['Value']:
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)
    # Filter out empty LFN entries
    hasInputData = False
    inputData = []
    for lfn in result['Value']:
        if lfn:
            inputData.append(lfn)
            hasInputData = True
    if not hasInputData:
        #With no input data requirement, job can proceed directly to task queue
        self.log.verbose('Job %s has no input data requirement' % (job))
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)
    self.log.verbose('Job %s has an input data requirement ' % (job))
    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo(job)
    if not result['OK']:
        return result
    optInfo = result['Value']
    #Compare site candidates with current mask
    optSites = optInfo['SiteCandidates'].keys()
    self.log.info('Input Data Site Candidates: %s' % (', '.join(optSites)))
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements(optSites, userSites, userBannedSites)
    if not optSites:
        msg = 'Impossible Site + InputData Requirement'
        return S_ERROR(msg)
    sites = applySiteRequirements(optSites, wmsSites, wmsBannedSites)
    if not sites:
        msg = 'On Hold: InputData Site is Banned or not Active'
        self.log.info(msg)
        result = self.jobDB.setJobStatus(job, application=msg)
        return S_OK()
    #Set stager request as necessary, optimize for smallest #files on tape if
    #more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging(job, sites, inputData, optInfo['SiteCandidates'])
    if not checkStaging['OK']:
        return checkStaging
    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
        return S_ERROR('No destination sites available')
    stagingFlag = checkStaging['Value']
    if stagingFlag:
        #Single site candidate chosen and staging required
        self.log.verbose('Job %s requires staging of input data' % (job))
        # set all LFN to disk for the selected site
        stagingSite = destinationSites[0]
        siteDict = optInfo['SiteCandidates'][stagingSite]
        siteDict['disk'] = siteDict['disk'] + siteDict['tape']
        siteDict['tape'] = 0
        optInfo['SiteCandidates'][stagingSite] = siteDict
        self.log.verbose('Updating %s Optimizer Info for Job %s:' % (self.dataAgentName, job), optInfo)
        result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
        if not result['OK']:
            return result
        # Site is selected for staging, report it
        self.log.verbose('Staging site candidate for job %s is %s' % (job, stagingSite))
        result = self.__getStagingSites(stagingSite, destinationSites)
        if not result['OK']:
            stagingSites = [stagingSite]
        else:
            stagingSites = result['Value']
        if len(stagingSites) == 1:
            self.jobDB.setJobAttribute(job, 'Site', stagingSite)
        else:
            # Get the name of the site group
            result = self.__getSiteGroup(stagingSites)
            if result['OK']:
                groupName = result['Value']
                if groupName:
                    self.jobDB.setJobAttribute(job, 'Site', groupName)
                else:
                    self.jobDB.setJobAttribute(job, 'Site', 'Multiple')
            else:
                self.jobDB.setJobAttribute(job, 'Site', 'Multiple')
        stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
        if not stagerDict['OK']:
            return stagerDict
        self.__updateOtherSites(job, stagingSite, stagerDict['Value'], optInfo)
        return S_OK()
    else:
        #No staging required, can proceed to task queue agent and then waiting status
        self.log.verbose('Job %s does not require staging of input data' % (job))
        #Finally send job to TaskQueueAgent
        return self.__sendJobToTaskQueue(job, classAdJob, destinationSites, userBannedSites)
def checkJob(self, job, classAdJob):
    """This method controls the checking of the job.

    Steps: (0) skip recently rescheduled jobs, (1) apply user site
    requirements, (2) apply the WMS site mask, (3) inspect input data,
    (4) use optimizer site candidates and trigger staging when needed.

    :param job: job identifier
    :param classAdJob: job description as a ClassAd object
    :return: S_OK / S_ERROR result dictionary
    """
    self.log.verbose("Job %s will be processed" % (job))
    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes(job, ["RescheduleCounter", "RescheduleTime", "ApplicationStatus"])
    if not result["OK"]:
        self.log.error(result["Message"])
        return S_ERROR("Can not get job attributes from JobDB")
    jobDict = result["Value"]
    reCounter = int(jobDict["RescheduleCounter"])
    if reCounter != 0:
        reTime = fromString(jobDict["RescheduleTime"])
        delta = toEpoch() - toEpoch(reTime)
        # Delay grows with the reschedule counter, capped at maxRescheduleDelay
        delay = self.maxRescheduleDelay
        if reCounter <= len(self.rescheduleDelaysList):
            delay = self.rescheduleDelaysList[reCounter - 1]
        if delta < delay:
            # Hold the job until the delay expires; set the status only once
            if jobDict["ApplicationStatus"].find("On Hold: after rescheduling") == -1:
                result = self.jobDB.setJobStatus(job, application="On Hold: after rescheduling #%d" % reCounter)
            return S_OK()
    # First, get Site and BannedSites from the Job
    result = self.__getJobSiteRequirement(job, classAdJob)
    userBannedSites = result["BannedSites"]
    userSites = result["Sites"]
    if userSites:
        userSites = applySiteRequirements(userSites, [], userBannedSites)
        if not userSites:
            msg = "Impossible Site Requirement"
            return S_ERROR(msg)
    # Second, get the Active and Banned sites from the WMS
    wmsSites = self.jobDB.getSiteMask("Active")
    wmsBannedSites = self.jobDB.getSiteMask("Banned")
    if not (wmsSites["OK"] and wmsBannedSites["OK"]):
        if not wmsSites["OK"]:
            self.log.error(wmsSites["Message"])
        if not wmsBannedSites["OK"]:
            self.log.error(wmsBannedSites["Message"])
        return S_ERROR("Can not get Active and Banned Sites from JobDB")
    wmsSites = wmsSites["Value"]
    wmsBannedSites = wmsBannedSites["Value"]
    if userSites:
        sites = applySiteRequirements(userSites, wmsSites, wmsBannedSites)
        if not sites:
            # Put on Hold only non-excluded job types
            jobType = classAdJob.getAttributeString("JobType")
            if not jobType in self.excludedOnHoldJobTypes:
                msg = "On Hold: Requested site is Banned or not Active"
                self.log.info(msg)
                result = self.jobDB.setJobStatus(job, application=msg)
            return S_OK()
    # Third, check if there is input data
    result = self.jobDB.getInputData(job)
    if not result["OK"]:
        self.log.warn("Failed to get input data from JobDB for %s" % (job))
        self.log.error(result["Message"])
        return S_ERROR("Failed to get input data from JobDB")
    if not result["Value"]:
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)
    # Filter out empty LFN entries
    hasInputData = False
    inputData = []
    for lfn in result["Value"]:
        if lfn:
            inputData.append(lfn)
            hasInputData = True
    if not hasInputData:
        # With no input data requirement, job can proceed directly to task queue
        self.log.verbose("Job %s has no input data requirement" % (job))
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)
    self.log.verbose("Job %s has an input data requirement " % (job))
    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo(job)
    if not result["OK"]:
        return result
    optInfo = result["Value"]
    # Compare site candidates with current mask
    optSites = optInfo["SiteCandidates"].keys()
    self.log.info("Input Data Site Candidates: %s" % (", ".join(optSites)))
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements(optSites, userSites, userBannedSites)
    if not optSites:
        msg = "Impossible Site + InputData Requirement"
        return S_ERROR(msg)
    sites = applySiteRequirements(optSites, wmsSites, wmsBannedSites)
    if not sites:
        msg = "On Hold: InputData Site is Banned or not Active"
        self.log.info(msg)
        result = self.jobDB.setJobStatus(job, application=msg)
        return S_OK()
    # Set stager request as necessary, optimize for smallest #files on tape if
    # more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging(job, sites, inputData, optInfo["SiteCandidates"])
    if not checkStaging["OK"]:
        return checkStaging
    destinationSites = checkStaging["SiteCandidates"]
    if not destinationSites:
        return S_ERROR("No destination sites available")
    stagingFlag = checkStaging["Value"]
    if stagingFlag:
        # Single site candidate chosen and staging required
        self.log.verbose("Job %s requires staging of input data" % (job))
        # set all LFN to disk for the selected site
        stagingSite = destinationSites[0]
        siteDict = optInfo["SiteCandidates"][stagingSite]
        siteDict["disk"] = siteDict["disk"] + siteDict["tape"]
        siteDict["tape"] = 0
        optInfo["SiteCandidates"][stagingSite] = siteDict
        result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
        if not result["OK"]:
            return result
        # Site is selected for staging, report it
        self.log.verbose("Staging site candidate for job %s is %s" % (job, stagingSite))
        if len(destinationSites) == 1:
            self.jobDB.setJobAttribute(job, "Site", stagingSite)
        else:
            self.jobDB.setJobAttribute(job, "Site", "Multiple")
        stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
        if not stagerDict["OK"]:
            return stagerDict
        self.__updateOtherSites(job, stagingSite, stagerDict["Value"], optInfo)
        return S_OK()
    else:
        # No staging required, can proceed to task queue agent and then waiting status
        self.log.verbose("Job %s does not require staging of input data" % (job))
        # Finally send job to TaskQueueAgent
        return self.__sendJobToTaskQueue(job, classAdJob, destinationSites, userBannedSites)
def optimizeJob( self, jid, jobState ):
  """ Optimize the destination of a single job.

      1. Banned sites are removed from the destination list.
      2. Get input files
      3. Production jobs are sent directly to TQ
      4. Check if staging is necessary

      :param jid: job identifier
      :param jobState: JobState-like accessor for the job's attributes/manifest
      :return: S_OK / S_ERROR result structure
  """
  # Reschedule delay: a recently rescheduled job is kept on hold for a
  # back-off period that grows with the reschedule count
  result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
  if not result[ 'OK' ]:
    return result
  attDict = result[ 'Value' ]
  try:
    reschedules = int( attDict[ 'RescheduleCounter' ] )
  except ( ValueError, KeyError ):
    return S_ERROR( "RescheduleCounter has to be an integer" )
  if reschedules != 0:
    delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
    # Delay is capped at the last entry of the configured list
    delay = delays[ min( reschedules, len( delays ) - 1 ) ]
    waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
    if waited < delay:
      return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )
  # Get site requirements
  result = self.__getSitesRequired( jobState )
  if not result[ 'OK' ]:
    return result
  userSites, userBannedSites = result[ 'Value' ]
  # Get job type
  result = jobState.getAttribute( "JobType" )
  if not result[ 'OK' ]:
    return S_ERROR( "Could not retrieve job type" )
  jobType = result[ 'Value' ]
  # Get banned sites from DIRAC
  result = self.__jobDB.getSiteMask( 'Banned' )
  if not result[ 'OK' ]:
    return S_ERROR( "Cannot retrieve banned sites from JobDB" )
  wmsBannedSites = result[ 'Value' ]
  # If the user has selected any site, filter them and hold the job if not able to run
  if userSites:
    if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
      result = self.__jobDB.getUserSitesTuple( userSites )
      if not result[ 'OK' ]:
        return S_ERROR( "Problem checking userSites for tuple of active/banned/invalid sites" )
      userSites, bannedSites, invalidSites = result['Value']
      if invalidSites:
        self.jobLog.debug( "Invalid site(s) requested: %s" % ','.join( invalidSites ) )
        if not self.ex_getOption( 'AllowInvalidSites', True ):
          return self.__holdJob( jobState, "Requested site(s) %s are invalid" % ",".join( invalidSites ) )
      if bannedSites:
        self.jobLog.debug( "Banned site(s) %s ignored" % ",".join( bannedSites ) )
        if not userSites:
          return self.__holdJob( jobState, "Requested site(s) %s are inactive" % ",".join( bannedSites ) )
      if not userSites:
        return self.__holdJob( jobState, "No requested site(s) are active/valid" )
      userSites = list(userSites)
  # Check if there is input data
  result = jobState.getInputData()
  if not result['OK']:
    self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
    return S_ERROR( "Failed to get input data from JobDB" )
  if not result['Value']:
    # No input data? Just send to TQ
    return self.__sendToTQ( jobState, userSites, userBannedSites )
  self.jobLog.verbose( "Has an input data requirement" )
  inputData = result[ 'Value' ]
  # Production jobs are sent to TQ, but first we have to verify if staging is necessary
  if jobType in Operations().getValue( 'Transformations/DataProcessing', [] ):
    self.jobLog.info( "Production job: sending to TQ, but first checking if staging is requested" )
    userName = jobState.getAttribute( 'Owner' )
    if not userName[ 'OK' ]:
      return userName
    userName = userName['Value']
    userGroup = jobState.getAttribute( 'OwnerGroup' )
    if not userGroup[ 'OK' ]:
      return userGroup
    userGroup = userGroup['Value']
    # getFilesToStage is queried with the owner's proxy identity
    res = getFilesToStage( inputData, proxyUserName = userName, proxyUserGroup = userGroup ) #pylint: disable=unexpected-keyword-arg
    if not res['OK']:
      return self.__holdJob( jobState, res['Message'] )
    stageLFNs = res['Value']['offlineLFNs']
    if stageLFNs:
      res = self.__checkStageAllowed( jobState )
      if not res['OK']:
        return res
      if not res['Value']:
        return S_ERROR( "Stage not allowed" )
      # NOTE(review): the result of __requestStaging is not checked here — confirm intentional
      self.__requestStaging( jobState, stageLFNs )
      return S_OK()
    else:
      return self.__sendToTQ( jobState, userSites, userBannedSites )
  # From now on we know it's a user job with input data
  idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
  result = self.retrieveOptimizerParam( idAgent )
  if not result['OK']:
    self.jobLog.error( "Could not retrieve input data info",
                       result[ 'Message' ] )
    return S_ERROR( "Could not retrieve input data info" )
  opData = result[ 'Value' ]
  if 'SiteCandidates' not in opData:
    return S_ERROR( "No possible site candidates" )
  # Filter input data sites with user requirement
  siteCandidates = list( opData[ 'SiteCandidates' ] )
  self.jobLog.info( "Site candidates are %s" % siteCandidates )
  if userSites:
    siteCandidates = list( set( siteCandidates ) & set( userSites ) )
  siteCandidates = self._applySiteFilter( siteCandidates, banned = userBannedSites )
  if not siteCandidates:
    return S_ERROR( "Impossible InputData * Site requirements" )
  idSites = {}
  for site in siteCandidates:
    idSites[ site ] = opData[ 'SiteCandidates' ][ site ]
  # Check if sites have correct count of disk+tape replicas
  numData = len( inputData )
  errorSites = set()
  for site in idSites:
    if numData != idSites[ site ][ 'disk' ] + idSites[ site ][ 'tape' ]:
      self.jobLog.error( "Site candidate %s does not have all the input data" % site )
      errorSites.add( site )
  for site in errorSites:
    idSites.pop( site )
  if not idSites:
    return S_ERROR( "Site candidates do not have all the input data" )
  # Check if staging is required
  stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
  if not siteCandidates:
    return S_ERROR( "No destination sites available" )
  # Is any site active?
  stageSites = self._applySiteFilter( siteCandidates, banned = wmsBannedSites )
  if not stageSites:
    return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )
  # If no staging is required send to TQ
  if not stageRequired:
    # Use siteCandidates and not stageSites because active and banned sites
    # will be taken into account on matching time
    return self.__sendToTQ( jobState, siteCandidates, userBannedSites )
  # Check if the user is allowed to stage
  if self.ex_getOption( "RestrictDataStage", False ):
    res = self.__checkStageAllowed( jobState )
    if not res['OK']:
      return res
    if not res['Value']:
      return S_ERROR( "Stage not allowed" )
  # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
  stageSite = stageSites[0]
  self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
  stageData = idSites[ stageSite ]
  # Set as if everything has already been staged
  stageData[ 'disk' ] += stageData[ 'tape' ]
  stageData[ 'tape' ] = 0
  # Set the site info back to the original dict to save afterwards
  opData[ 'SiteCandidates' ][ stageSite ] = stageData
  stageRequest = self.__preRequestStaging( jobState, stageSite, opData )
  if not stageRequest['OK']:
    return stageRequest
  stageLFNs = stageRequest['Value']
  result = self.__requestStaging( jobState, stageLFNs )
  if not result[ 'OK' ]:
    return result
  stageLFNs = result[ 'Value' ]
  self.__updateSharedSESites( jobState, stageSite, stageLFNs, opData )
  # Save the optimizer data again
  self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
  result = self.storeOptimizerParam( idAgent, opData )
  if not result[ 'OK' ]:
    return result
  return self.__setJobSite( jobState, stageSites )
def optimizeJob(self, jid, jobState):
    """ Optimize the destination of a single job.

        Applies the reschedule back-off, filters the user-requested sites
        against the WMS Active/Banned masks, and routes the job either
        straight to the Task Queue (no input data) or through the input-data
        site-candidate / staging resolution.

        :param jid: job identifier
        :param jobState: JobState-like accessor for the job's attributes
        :return: S_OK / S_ERROR result structure
    """
    # Reschedule delay: keep a freshly rescheduled job on hold for a while
    result = jobState.getAttributes(
        ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    # Fix: also catch KeyError (missing attribute), consistent with the
    # sibling optimizeJob implementations; a bare ValueError let a missing
    # 'RescheduleCounter' key escape as an unhandled exception
    except (ValueError, KeyError):
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # Delay grows with the reschedule count, capped at the last entry
        delay = delays[min(reschedules, len(delays) - 1)]
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(
                jobState, 'On Hold: after rescheduling %s' % reschedules,
                delay)
    # Get site requirements
    result = self._getSitesRequired(jobState)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']
    # Get active and banned sites from DIRAC
    result = self.__jobDB.getSiteMask('Active')
    if not result['OK']:
        return S_ERROR("Cannot retrieve active sites from JobDB")
    wmsActiveSites = result['Value']
    result = self.__jobDB.getSiteMask('Banned')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']
    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        result = jobState.getAttribute("JobType")
        if not result['OK']:
            return S_ERROR("Could not retrieve job type")
        jobType = result['Value']
        if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
            sites = self._applySiteFilter(userSites, wmsActiveSites,
                                          wmsBannedSites)
            if not sites:
                return self.__holdJob(
                    jobState,
                    "Sites %s are inactive or banned" % ", ".join(userSites))
    # Get the Input data
    # Third, check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data %s" % (result['Message']))
        return S_ERROR('Failed to get input data from JobDB')
    if not result['Value']:
        # No input data? Generate requirements and next
        return self.__sendToTQ(jobState, userSites, userBannedSites)
    inputData = result['Value']
    self.jobLog.verbose('Has an input data requirement')
    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info: %s" %
                          result['Message'])
        return S_ERROR("File Catalog Access Failure")
    opData = result['Value']
    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")
    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)
    siteCandidates = self._applySiteFilter(siteCandidates, userSites,
                                           userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")
    idSites = {}
    for site in siteCandidates:
        idSites[site] = opData['SiteCandidates'][site]
    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
        if numData != idSites[site]['disk'] + idSites[site]['tape']:
            self.jobLog.error(
                "Site candidate %s does not have all the input data" % site)
            errorSites.add(site)
    for site in errorSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")
    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(
        jobState, inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")
    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, wmsActiveSites,
                                       wmsBannedSites)
    if not stageSites:
        return self.__holdJob(
            jobState,
            "Sites %s are inactive or banned" % ", ".join(siteCandidates))
    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, siteCandidates, userBannedSites)
    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        if not self.__checkStageAllowed(jobState):
            return S_ERROR("Stage not allowed")
    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData
    result = self.__requestStaging(jobState, stageSite, opData)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    self._updateSharedSESites(stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result
    return self._setJobSite(jobState, stageSites)
def __failStalledJobs(self, failedTime):
    """ Changes the Stalled status to Failed for jobs long in the Stalled status.

        :param failedTime: seconds a job may stay without updates before being failed
        :return: S_OK(number of jobs set to Failed) or S_ERROR
    """
    result = self.jobDB.selectJobs({'Status': 'Stalled'})
    if not result['OK']:
        return result
    failedCounter = 0
    if result['Value']:
        jobs = result['Value']
        self.log.info('%s Stalled jobs will be checked for failure' %
                      (len(jobs)))
        for job in jobs:
            # Check if the job pilot is lost
            result = self.__getJobPilotStatus(job)
            if result['OK']:
                pilotStatus = result['Value']
                if pilotStatus != "Running":
                    # Pilot gone: fail the job immediately and account it
                    result = self.__updateJobStatus(
                        job, 'Failed', "Job stalled: pilot not running")
                    failedCounter += 1
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error(result['Message'])
                        # NOTE(review): a single accounting failure aborts the
                        # whole sweep — confirm this is intentional
                        break
                    continue
            # Pilot is running (or its status could not be fetched):
            # fall back to the last-update timestamp check
            result = self.__getLatestUpdateTime(job)
            if not result['OK']:
                # NOTE(review): returning here aborts the loop for all
                # remaining jobs; the newer implementation logs and continues
                return result
            currentTime = toEpoch()
            lastUpdate = result['Value']
            elapsedTime = currentTime - lastUpdate
            if elapsedTime > failedTime:
                self.__updateJobStatus(
                    job, 'Failed',
                    'Stalling for more than %d sec' % failedTime)
                failedCounter += 1
                result = self.__sendAccounting(job)
                if not result['OK']:
                    self.log.error(result['Message'])
                    break
    # Second pass: account Failed-but-unaccounted jobs for both minor statuses
    recoverCounter = 0
    for minor in [
            "Job stalled: pilot not running",
            'Stalling for more than %d sec' % failedTime
    ]:
        result = self.jobDB.selectJobs({
            'Status': 'Failed',
            'MinorStatus': minor,
            'AccountedFlag': 'False'
        })
        if not result['OK']:
            return result
        if result['Value']:
            jobs = result['Value']
            self.log.info('%s Stalled jobs will be Accounted' % (len(jobs)))
            for job in jobs:
                result = self.__sendAccounting(job)
                if not result['OK']:
                    self.log.error(result['Message'])
                    continue
                recoverCounter += 1
            # NOTE(review): this re-checks only the LAST job's accounting
            # result; earlier failures were already handled by the continue
            if not result['OK']:
                break
    if failedCounter:
        self.log.info('%d jobs set to Failed' % failedCounter)
    if recoverCounter:
        self.log.info('%d jobs properly Accounted' % recoverCounter)
    return S_OK(failedCounter)
def __failStalledJobs(self, failedTime):
    """ Changes the Stalled status to Failed for jobs long in the Stalled status.

        A job is failed either because its pilot is no longer running or
        because it has not been updated for more than failedTime seconds.
        Failed jobs get a kill signal and are sent to accounting; a second
        pass accounts any Failed-but-unaccounted stalled jobs.

        :param failedTime: seconds a job may stay without updates before being failed
        :return: S_OK(number of jobs set to Failed) or S_ERROR
    """
    result = self.jobDB.selectJobs({'Status': 'Stalled'})
    if not result['OK']:
        return result
    jobs = result['Value']
    failedCounter = 0
    minorStalledStatuses = ("Job stalled: pilot not running",
                            'Stalling for more than %d sec' % failedTime)
    if jobs:
        self.log.info('%s Stalled jobs will be checked for failure' %
                      (len(jobs)))
        for job in jobs:
            setFailed = False
            # Check if the job pilot is lost
            result = self.__getJobPilotStatus(job)
            if not result['OK']:
                self.log.error('Failed to get pilot status',
                               result['Message'])
                continue
            pilotStatus = result['Value']
            if pilotStatus != "Running":
                setFailed = minorStalledStatuses[0]
            else:
                # Pilot alive: check how long the job has been silent
                result = self.__getLatestUpdateTime(job)
                if not result['OK']:
                    self.log.error('Failed to get job update time',
                                   result['Message'])
                    continue
                elapsedTime = toEpoch() - result['Value']
                if elapsedTime > failedTime:
                    setFailed = minorStalledStatuses[1]
            # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info
            if setFailed:
                # Send a kill signal to the job such that it cannot continue running
                WMSClient().killJob(job)
                self.__updateJobStatus(job, 'Failed', setFailed)
                failedCounter += 1
                # Accounting failures are logged but do not stop the sweep
                result = self.__sendAccounting(job)
                if not result['OK']:
                    self.log.error('Failed to send accounting',
                                   result['Message'])
    recoverCounter = 0
    for minor in minorStalledStatuses:
        result = self.jobDB.selectJobs({
            'Status': 'Failed',
            'MinorStatus': minor,
            'AccountedFlag': 'False'
        })
        if not result['OK']:
            return result
        if result['Value']:
            jobs = result['Value']
            self.log.info('%s Stalled jobs will be Accounted' % (len(jobs)))
            for job in jobs:
                result = self.__sendAccounting(job)
                if not result['OK']:
                    self.log.error('Failed to send accounting',
                                   result['Message'])
                    continue
                recoverCounter += 1
            # Fix: removed a stale "if not result['OK']: break" here — the
            # continue above already handles accounting failures, and the
            # break aborted the remaining minor-status pass based solely on
            # the outcome of the LAST job, inconsistently with the tolerant
            # handling of earlier failures.
    if failedCounter:
        self.log.info('%d jobs set to Failed' % failedCounter)
    if recoverCounter:
        self.log.info('%d jobs properly Accounted' % recoverCounter)
    return S_OK(failedCounter)
def optimizeJob( self, jid, jobState ):
  """ Optimize the destination of a single job.

      Applies the reschedule back-off, filters the user-requested sites
      against the WMS Active/Banned masks, and routes the job either
      straight to the Task Queue (no input data) or through the
      input-data site-candidate / staging resolution.

      :param jid: job identifier
      :param jobState: JobState-like accessor for the job's attributes
      :return: S_OK / S_ERROR result structure
  """
  #Reschedule delay
  result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
  if not result[ 'OK' ]:
    return result
  attDict = result[ 'Value' ]
  try:
    reschedules = int( attDict[ 'RescheduleCounter' ] )
  # NOTE(review): a missing 'RescheduleCounter' key would raise an uncaught
  # KeyError here; sibling implementations catch (ValueError, KeyError)
  except ValueError:
    return S_ERROR( "RescheduleCounter has to be an integer" )
  if reschedules != 0:
    delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
    # Delay grows with the reschedule count, capped at the last list entry
    delay = delays[ min( reschedules, len( delays ) - 1 ) ]
    waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
    if waited < delay:
      return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )
  #Get site requirements
  result = self.__getSitesRequired( jobState )
  if not result[ 'OK' ]:
    return result
  userSites, userBannedSites = result[ 'Value' ]
  #Get active and banned sites from DIRAC
  result = self.__jobDB.getSiteMask( 'Active' )
  if not result[ 'OK' ]:
    return S_ERROR( "Cannot retrieve active sites from JobDB" )
  wmsActiveSites = result[ 'Value' ]
  result = self.__jobDB.getSiteMask( 'Banned' )
  if not result[ 'OK' ]:
    return S_ERROR( "Cannot retrieve banned sites from JobDB" )
  wmsBannedSites = result[ 'Value' ]
  #If the user has selected any site, filter them and hold the job if not able to run
  if userSites:
    result = jobState.getAttribute( "JobType" )
    if not result[ 'OK' ]:
      return S_ERROR( "Could not retrieve job type" )
    jobType = result[ 'Value' ]
    if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
      sites = self.__applySiteFilter( userSites, wmsActiveSites, wmsBannedSites )
      if not sites:
        # Message wording differs for single vs multiple requested sites
        if len( userSites ) > 1:
          return self.__holdJob( jobState, "Requested sites %s are inactive" % ",".join( userSites ) )
        else:
          return self.__holdJob( jobState, "Requested site %s is inactive" % userSites[0] )
  #Get the Input data
  # Third, check if there is input data
  result = jobState.getInputData()
  if not result['OK']:
    self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
    return S_ERROR( 'Failed to get input data from JobDB' )
  if not result['Value']:
    #No input data? Generate requirements and next
    return self.__sendToTQ( jobState, userSites, userBannedSites )
  inputData = result[ 'Value' ]
  self.jobLog.verbose( 'Has an input data requirement' )
  idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
  result = self.retrieveOptimizerParam( idAgent )
  if not result['OK']:
    self.jobLog.error( "Could not retrieve input data info: %s" % result[ 'Message' ] )
    return S_ERROR( "File Catalog Access Failure" )
  opData = result[ 'Value' ]
  if 'SiteCandidates' not in opData:
    return S_ERROR( "No possible site candidates" )
  #Filter input data sites with user requirement
  siteCandidates = list( opData[ 'SiteCandidates' ] )
  self.jobLog.info( "Site candidates are %s" % siteCandidates )
  siteCandidates = self.__applySiteFilter( siteCandidates, userSites, userBannedSites )
  if not siteCandidates:
    return S_ERROR( "Impossible InputData * Site requirements" )
  idSites = {}
  for site in siteCandidates:
    idSites[ site ] = opData[ 'SiteCandidates' ][ site ]
  #Check if sites have correct count of disk+tape replicas
  numData = len( inputData )
  errorSites = set()
  for site in idSites:
    if numData != idSites[ site ][ 'disk' ] + idSites[ site ][ 'tape' ]:
      self.jobLog.error( "Site candidate %s does not have all the input data" % site )
      errorSites.add( site )
  for site in errorSites:
    idSites.pop( site )
  if not idSites:
    return S_ERROR( "Site candidates do not have all the input data" )
  #Check if staging is required
  stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
  if not siteCandidates:
    return S_ERROR( "No destination sites available" )
  #Is any site active?
  stageSites = self.__applySiteFilter( siteCandidates, wmsActiveSites, wmsBannedSites )
  if not stageSites:
    return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )
  #If no staging is required send to TQ
  if not stageRequired:
    #Use siteCandidates and not stageSites because active and banned sites
    #will be taken into account on matching time
    return self.__sendToTQ( jobState, siteCandidates, userBannedSites )
  #Check if the user is allowed to stage
  if self.ex_getOption( "RestrictDataStage", False ):
    if not self.__checkStageAllowed( jobState ):
      return S_ERROR( "Stage not allowed" )
  #Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
  stageSite = stageSites[0]
  self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
  stageData = idSites[ stageSite ]
  #Set as if everything has already been staged
  stageData[ 'disk' ] += stageData[ 'tape' ]
  stageData[ 'tape' ] = 0
  #Set the site info back to the original dict to save afterwards
  opData[ 'SiteCandidates' ][ stageSite ] = stageData
  result = self.__requestStaging( jobState, stageSite, opData )
  if not result[ 'OK' ]:
    return result
  stageLFNs = result[ 'Value' ]
  self.__updateSharedSESites( jobState, stageSite, stageLFNs, opData )
  #Save the optimizer data again
  self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
  result = self.storeOptimizerParam( idAgent, opData )
  if not result[ 'OK' ]:
    return result
  return self.__setJobSite( jobState, stageSites )
def optimizeJob(self, jid, jobState):
    """ 1. Banned sites are removed from the destination list.
        2. Get input files
        3. Production jobs are sent directly to TQ
        4. Check if staging is necessary

        :param jid: job identifier
        :param jobState: JobState-like accessor for the job's attributes/manifest
        :return: S_OK / S_ERROR result structure
    """
    # Reschedule delay: keep a freshly rescheduled job on hold for a while
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, KeyError):
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # Delay grows with the reschedule count, capped at the last list entry
        delay = delays[min(reschedules, len(delays) - 1)]
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % reschedules, delay)
    # Get the job manifest for the later checks
    result = jobState.getManifest()
    if not result['OK']:
        return S_ERROR("Could not retrieve job manifest: %s" % result['Message'])
    jobManifest = result['Value']
    # Get site requirements
    result = self.__getSitesRequired(jobManifest)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']
    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
        return S_ERROR("Could not retrieve job type")
    jobType = result['Value']
    # Get banned sites from DIRAC
    result = self.siteClient.getSites('Banned')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']
    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
            result = self.siteClient.getUsableSites(userSites)
            if not result['OK']:
                return S_ERROR("Problem checking userSites for tuple of active/banned/invalid sites")
            usableSites = set(result['Value'])
            # Partition the requested sites into banned / invalid buckets
            bannedSites = []
            invalidSites = []
            for site in userSites:
                if site in wmsBannedSites:
                    bannedSites.append(site)
                elif site not in usableSites:
                    invalidSites.append(site)
            if invalidSites:
                self.jobLog.debug("Invalid site(s) requested: %s" % ','.join(invalidSites))
                if not self.ex_getOption('AllowInvalidSites', True):
                    return self.__holdJob(jobState, "Requested site(s) %s are invalid" % ",".join(invalidSites))
            if bannedSites:
                self.jobLog.debug("Banned site(s) %s ignored" % ",".join(bannedSites))
                if not usableSites:
                    return self.__holdJob(jobState, "Requested site(s) %s are inactive" % ",".join(bannedSites))
            if not usableSites:
                return self.__holdJob(jobState, "No requested site(s) are active/valid")
            userSites = list(usableSites)
    checkPlatform = self.ex_getOption('CheckPlatform', False)
    jobPlatform = jobManifest.getOption("Platform", None)
    # First check that the platform is valid (in OSCompatibility list)
    if checkPlatform and jobPlatform:
        result = gConfig.getOptionsDict('/Resources/Computing/OSCompatibility')
        if not result['OK']:
            return S_ERROR("Unable to get OSCompatibility list")
        allPlatforms = result['Value']
        if jobPlatform not in allPlatforms:
            self.jobLog.error("Platform %s is not supported" % jobPlatform)
            return S_ERROR("Platform %s is not supported" % jobPlatform)
    # Filter the userSites by the platform selection (if there is one)
    if checkPlatform and userSites:
        if jobPlatform:
            result = self.__filterByPlatform(jobPlatform, userSites)
            if not result['OK']:
                self.jobLog.error("Failed to filter job sites by platform: %s" % result['Message'])
                return S_ERROR("Failed to filter job sites by platform")
            userSites = result['Value']
            if not userSites:
                # No sites left after filtering -> Invalid platform/sites combination
                self.jobLog.error("No selected sites match platform '%s'" % jobPlatform)
                return S_ERROR("No selected sites match platform '%s'" % jobPlatform)
    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data %s" % (result['Message']))
        return S_ERROR("Failed to get input data from JobDB")
    if not result['Value']:
        # No input data? Just send to TQ
        return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites)
    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']
    # ===================================================================================
    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    # ===================================================================================
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
        self.jobLog.info("Production job: sending to TQ, but first checking if staging is requested")
        res = getFilesToStage(inputData,
                              jobState=jobState,
                              checkOnlyTapeSEs=self.ex_getOption('CheckOnlyTapeSEs', True),
                              jobLog=self.jobLog)
        if not res['OK']:
            return self.__holdJob(jobState, res['Message'])
        if res['Value']['absentLFNs']:
            # Some files do not exist at all... set the job Failed
            # Reverse errors: group the absent LFNs by their failure reason
            reasons = {}
            # .iteritems() — this module is Python 2 code
            for lfn, reason in res['Value']['absentLFNs'].iteritems():
                reasons.setdefault(reason, []).append(lfn)
            for reason, lfns in reasons.iteritems():
                # Some files are missing in the FC or in SEs, fail the job
                self.jobLog.error(reason, ','.join(lfns))
            error = ','.join(reasons)
            return S_ERROR(error)
        if res['Value']['failedLFNs']:
            return self.__holdJob(jobState, "Couldn't get storage metadata of some files")
        stageLFNs = res['Value']['offlineLFNs']
        if stageLFNs:
            res = self.__checkStageAllowed(jobState)
            if not res['OK']:
                return res
            if not res['Value']:
                return S_ERROR("Stage not allowed")
            # NOTE(review): __requestStaging result is not checked here — confirm intentional
            self.__requestStaging(jobState, stageLFNs)
            return S_OK()
        else:
            # No staging required
            onlineSites = res['Value']['onlineSites']
            if onlineSites:
                # Set the online site(s) first
                userSites = set(userSites)
                onlineSites &= userSites
                userSites = list(onlineSites) + list(userSites - onlineSites)
            return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites, onlineSites=onlineSites)
    # ===================================================
    # From now on we know it's a user job with input data
    # ===================================================
    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info", result['Message'])
        return S_ERROR("Could not retrieve input data info")
    opData = result['Value']
    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")
    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)
    if userSites:
        siteCandidates = list(set(siteCandidates) & set(userSites))
    siteCandidates = self._applySiteFilter(siteCandidates, banned=userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")
    idSites = {}
    for site in siteCandidates:
        idSites[site] = opData['SiteCandidates'][site]
    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
        if numData != idSites[site]['disk'] + idSites[site]['tape']:
            self.jobLog.error("Site candidate %s does not have all the input data" % site)
            errorSites.add(site)
    for site in errorSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")
    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")
    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
        return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))
    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, jobManifest, siteCandidates, userBannedSites)
    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR("Stage not allowed")
    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData
    stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
    if not stageRequest['OK']:
        return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result
    return self.__setJobSite(jobState, stageSites)
def optimizeJob(self, jid, jobState):
    """ Decide the destination of a job and whether staging is needed.

        Workflow:
          1. Banned sites are removed from the destination list.
          2. Get input files
          3. Production jobs are sent directly to TQ
          4. Check if staging is necessary

        :param jid: job identifier (unused here; jobState carries the job)
        :param jobState: job state object giving access to the job's
                         attributes, manifest and optimizer parameters
        :return: S_OK / S_ERROR structure; on hold/error paths the job is
                 left in the corresponding WMS state by the helpers called
    """
    # --- Reschedule delay: recently rescheduled jobs are held back for a
    # --- configurable, escalating delay before being optimized again
    result = jobState.getAttributes(
        ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, KeyError):
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # Delay grows with the reschedule count, capped at the last entry
        delay = delays[min(reschedules, len(delays) - 1)]
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(
                jobState, 'On Hold: after rescheduling %s' % reschedules,
                delay)
    # Get the job manifest for the later checks
    result = jobState.getManifest()
    if not result['OK']:
        return S_ERROR("Could not retrieve job manifest: %s" %
                       result['Message'])
    jobManifest = result['Value']
    # Get site requirements (requested and banned sites from the manifest)
    result = self.__getSitesRequired(jobManifest)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']
    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
        return S_ERROR("Could not retrieve job type")
    jobType = result['Value']
    # Get banned sites from DIRAC
    result = self.siteClient.getSites('Banned')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']
    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
            result = self.siteClient.getUsableSites(userSites)
            if not result['OK']:
                return S_ERROR(
                    "Problem checking userSites for tuple of active/banned/invalid sites"
                )
            usableSites = set(result['Value'])
            bannedSites = []
            invalidSites = []
            # Classify each requested site: WMS-banned vs unknown/invalid
            for site in userSites:
                if site in wmsBannedSites:
                    bannedSites.append(site)
                elif site not in usableSites:
                    invalidSites.append(site)
            if invalidSites:
                self.jobLog.debug("Invalid site(s) requested: %s" %
                                  ','.join(invalidSites))
                if not self.ex_getOption('AllowInvalidSites', True):
                    return self.__holdJob(
                        jobState, "Requested site(s) %s are invalid" %
                        ",".join(invalidSites))
            if bannedSites:
                self.jobLog.debug("Banned site(s) %s ignored" %
                                  ",".join(bannedSites))
                if not usableSites:
                    return self.__holdJob(
                        jobState, "Requested site(s) %s are inactive" %
                        ",".join(bannedSites))
            if not usableSites:
                return self.__holdJob(jobState,
                                      "No requested site(s) are active/valid")
            # From here on only the usable subset of the user's sites is kept
            userSites = list(usableSites)
    checkPlatform = self.ex_getOption('CheckPlatform', False)
    jobPlatform = jobManifest.getOption("Platform", None)
    # First check that the platform is valid (in OSCompatibility list)
    if checkPlatform and jobPlatform:
        result = gConfig.getOptionsDict('/Resources/Computing/OSCompatibility')
        if not result['OK']:
            return S_ERROR("Unable to get OSCompatibility list")
        allPlatforms = result['Value']
        if jobPlatform not in allPlatforms:
            self.jobLog.error("Platform not supported", jobPlatform)
            return S_ERROR("Platform %s is not supported" % jobPlatform)
    # Filter the userSites by the platform selection (if there is one)
    if checkPlatform and userSites:
        if jobPlatform:
            result = self.__filterByPlatform(jobPlatform, userSites)
            if not result['OK']:
                self.jobLog.error("Failed to filter job sites by platform",
                                  result['Message'])
                return S_ERROR("Failed to filter job sites by platform")
            userSites = result['Value']
            if not userSites:
                # No sites left after filtering -> Invalid platform/sites combination
                self.jobLog.error("No selected sites match platform",
                                  jobPlatform)
                return S_ERROR("No selected sites match platform '%s'" %
                               jobPlatform)
    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data", result['Message'])
        return S_ERROR("Failed to get input data from JobDB")
    if not result['Value']:
        # No input data? Just send to TQ
        return self.__sendToTQ(jobState, jobManifest, userSites,
                               userBannedSites)
    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']
    # ===================================================================================
    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    # ===================================================================================
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
        self.jobLog.info(
            "Production job: sending to TQ, but first checking if staging is requested"
        )
        res = getFilesToStage(inputData,
                              jobState=jobState,
                              checkOnlyTapeSEs=self.ex_getOption(
                                  'CheckOnlyTapeSEs', True),
                              jobLog=self.jobLog)
        if not res['OK']:
            return self.__holdJob(jobState, res['Message'])
        if res['Value']['absentLFNs']:
            # Some files do not exist at all... set the job Failed
            # Reverse errors: group the missing LFNs by their failure reason
            reasons = {}
            for lfn, reason in res['Value']['absentLFNs'].iteritems():
                reasons.setdefault(reason, []).append(lfn)
            for reason, lfns in reasons.iteritems():
                # Some files are missing in the FC or in SEs, fail the job
                self.jobLog.error(reason, ','.join(lfns))
            error = ','.join(reasons)
            return S_ERROR(error)
        if res['Value']['failedLFNs']:
            return self.__holdJob(
                jobState, "Couldn't get storage metadata of some files")
        stageLFNs = res['Value']['offlineLFNs']
        if stageLFNs:
            # Staging needed: check permission, request it, and stop here --
            # the job will be re-optimized once staging completes
            res = self.__checkStageAllowed(jobState)
            if not res['OK']:
                return res
            if not res['Value']:
                return S_ERROR("Stage not allowed")
            self.__requestStaging(jobState, stageLFNs)
            return S_OK()
        else:
            # No staging required
            onlineSites = res['Value']['onlineSites']
            if onlineSites:
                # Set the online site(s) first
                userSites = set(userSites)
                onlineSites &= userSites
                userSites = list(onlineSites) + list(userSites - onlineSites)
            return self.__sendToTQ(jobState,
                                   jobManifest,
                                   userSites,
                                   userBannedSites,
                                   onlineSites=onlineSites)
    # ===================================================
    # From now on we know it's a user job with input data
    # ===================================================
    # Site candidates were pre-computed by the InputData optimizer and
    # stored as an optimizer parameter
    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info",
                          result['Message'])
        return S_ERROR("Could not retrieve input data info")
    opData = result['Value']
    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")
    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)
    if userSites:
        siteCandidates = list(set(siteCandidates) & set(userSites))
    siteCandidates = self._applySiteFilter(siteCandidates,
                                           banned=userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")
    idSites = {}
    for site in siteCandidates:
        idSites[site] = opData['SiteCandidates'][site]
    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
        if numData != idSites[site]['disk'] + idSites[site]['tape']:
            self.jobLog.error(
                "Site candidate %s does not have all the input data" % site)
            errorSites.add(site)
    for site in errorSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")
    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")
    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
        return self.__holdJob(
            jobState,
            "Sites %s are inactive or banned" % ", ".join(siteCandidates))
    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, jobManifest, siteCandidates,
                               userBannedSites)
    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR("Stage not allowed")
    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData
    stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
    if not stageRequest['OK']:
        return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    # NOTE(review): presumably accounts for replicas on shared SEs -- confirm
    # against __updateSharedSESites' definition
    self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result
    return self.__setJobSite(jobState, stageSites)
def __failStalledJobs(self, failedTime):
    """ Changes the Stalled status to Failed for jobs long in the Stalled status.

        A Stalled job is failed when its pilot is no longer Running, or when
        its latest update is older than failedTime seconds.  Jobs failed this
        way are killed and accounted; a second pass re-sends accounting for
        previously failed-stalled jobs whose accounting did not go through.

        :param failedTime: stall age threshold in seconds
        :return: S_OK(failedCounter) with the number of jobs set to Failed,
                 or an S_ERROR from the job selection queries
    """
    result = self.jobDB.selectJobs({"Status": "Stalled"})
    if not result["OK"]:
        return result
    jobs = result["Value"]
    failedCounter = 0
    # Minor statuses used when failing: [0] pilot lost, [1] stalled too long.
    # Also reused below to find previously failed jobs not yet accounted.
    minorStalledStatuses = ("Job stalled: pilot not running",
                           "Stalling for more than %d sec" % failedTime)
    if jobs:
        self.log.info("%s Stalled jobs will be checked for failure" %
                      (len(jobs)))
        for job in jobs:
            setFailed = False
            # Check if the job pilot is lost
            result = self.__getJobPilotStatus(job)
            if not result["OK"]:
                self.log.error("Failed to get pilot status",
                               result["Message"])
                continue
            pilotStatus = result["Value"]
            if pilotStatus != "Running":
                setFailed = minorStalledStatuses[0]
            else:
                # Pilot alive: fail only if the job stopped updating too long ago
                result = self.__getLatestUpdateTime(job)
                if not result["OK"]:
                    self.log.error("Failed to get job update time",
                                   result["Message"])
                    continue
                elapsedTime = toEpoch() - result["Value"]
                if elapsedTime > failedTime:
                    setFailed = minorStalledStatuses[1]
            # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info
            if setFailed:
                # Send a kill signal to the job such that it cannot continue running
                WMSClient().killJob(job)
                # setFailed holds the minor status string selected above
                self.__updateJobStatus(job, "Failed", setFailed)
                failedCounter += 1
                result = self.__sendAccounting(job)
                if not result["OK"]:
                    self.log.error("Failed to send accounting",
                                   result["Message"])
    # Second pass: retry accounting for jobs already Failed with one of the
    # stalled minor statuses but still flagged as not accounted
    recoverCounter = 0
    for minor in minorStalledStatuses:
        result = self.jobDB.selectJobs({"Status": "Failed",
                                        "MinorStatus": minor,
                                        "AccountedFlag": "False"})
        if not result["OK"]:
            return result
        if result["Value"]:
            jobs = result["Value"]
            self.log.info("%s Stalled jobs will be Accounted" % (len(jobs)))
            for job in jobs:
                result = self.__sendAccounting(job)
                if not result["OK"]:
                    self.log.error("Failed to send accounting",
                                   result["Message"])
                    continue
                recoverCounter += 1
        # NOTE(review): checks the result left over from the loops above; it
        # can only be not-OK if the last __sendAccounting of a batch failed
        # after a continue -- confirm this early-exit is intended
        if not result["OK"]:
            break
    if failedCounter:
        self.log.info("%d jobs set to Failed" % failedCounter)
    if recoverCounter:
        self.log.info("%d jobs properly Accounted" % recoverCounter)
    return S_OK(failedCounter)