def getBannedSites(self, printOutput=False):
    """Retrieve the current list of banned sites.

    Asks ``SiteStatus`` for the sites that are unusable for
    ``'ComputingAccess'`` and returns them in sorted order.

    Example usage:

    >>> print(diracAdmin.getBannedSites())
    {'OK': True, 'Value': []}

    :param bool printOutput: when true, also print the site names, one per line
    :returns: S_OK with the sorted list of sites; when the lookup fails, the
              result returned by ``getUnusableSites`` is logged and handed
              back unchanged
    """
    result = SiteStatus().getUnusableSites('ComputingAccess')
    if not result['OK']:
        self.log.warn(result['Message'])
        return result

    # sorted() builds a new list, so the container handed out by SiteStatus
    # is never reordered in place.
    bannedSites = sorted(result['Value'])

    if printOutput:
        # The parenthesised single-argument form prints the same text under
        # both Python 2 and Python 3.
        print('\n'.join(bannedSites))

    return S_OK(bannedSites)
def getBannedSites(self, printOutput=False):
    """Retrieve the current list of banned sites.

    The sites reported by ``SiteStatus`` as unusable for
    ``'ComputingAccess'`` are returned in sorted order.

    Example usage:

    >>> print(diracAdmin.getBannedSites())
    {'OK': True, 'Value': []}

    :param bool printOutput: when true, the site names are also printed,
                             one per line
    :returns: S_OK carrying the sorted site list; if ``getUnusableSites``
              fails, its result is logged as a warning and returned as is
    """
    siteStatus = SiteStatus()
    result = siteStatus.getUnusableSites('ComputingAccess')
    if not result['OK']:
        self.log.warn(result['Message'])
        return result

    # Work on a sorted copy instead of sorting the returned object in place,
    # so whatever SiteStatus gave us keeps its original order.
    bannedSites = sorted(result['Value'])

    if printOutput:
        # print(...) with one argument behaves identically on Python 2 and 3,
        # unlike the Python 2-only print statement.
        print('\n'.join(bannedSites))

    return S_OK(bannedSites)
def checkJob(self, job, classAdJob):
    """Decide where a job can run and route it to staging or the task queue.

    Steps, in order:

    1. A rescheduled job is kept on hold until its reschedule delay has passed.
    2. The job's site requirements are filtered against its banned sites.
    3. Usable and unusable sites for 'ComputingAccess' are read from
       ``SiteStatus``.
    4. A job without input data is sent straight to the task queue.
    5. A job with input data is matched against the optimizer's site
       candidates; a staging request is issued when staging is required,
       otherwise the job is sent to the task queue.

    :param job: job identifier, passed through to JobDB and the helper methods
    :param classAdJob: job description, read via ``getAttributeString``
    :returns: S_OK / S_ERROR structure
    """
    self.log.verbose('Job %s will be processed' % (job))

    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes(
        job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        self.log.error(result['Message'])
        return S_ERROR('Can not get job attributes from JobDB')
    jobDict = result['Value']
    try:
        reCounter = int(jobDict['RescheduleCounter'])
    except (ValueError, TypeError):
        # A malformed counter is reported as an error result instead of
        # escaping as an exception.
        return S_ERROR('RescheduleCounter has to be an integer')
    if reCounter != 0:
        reTime = fromString(jobDict['RescheduleTime'])
        delta = toEpoch() - toEpoch(reTime)
        # The n-th reschedule uses the n-th configured delay; beyond the end
        # of the list the maximum delay applies.
        delay = self.maxRescheduleDelay
        if reCounter <= len(self.rescheduleDelaysList):
            delay = self.rescheduleDelaysList[reCounter - 1]
        if delta < delay:
            # Only touch the status when it is not already an "on hold" one.
            if 'On Hold: after rescheduling' not in jobDict['ApplicationStatus']:
                result = self.jobDB.setJobStatus(
                    job, application='On Hold: after rescheduling #%d' % reCounter)
                if not result['OK']:
                    # Best effort: the job stays on hold either way.
                    self.log.warn(result['Message'])
            return S_OK()

    # First, get Site and BannedSites from the Job
    # NOTE(review): the result is indexed without an 'OK' check; presumably
    # __getJobSiteRequirement always fills both keys - confirm.
    result = self.__getJobSiteRequirement(job, classAdJob)
    userBannedSites = result['BannedSites']
    userSites = result['Sites']

    if userSites:
        userSites = applySiteRequirements(userSites, [], userBannedSites)
        if not userSites:
            return S_ERROR('Impossible Site Requirement')

    # Second, get the Active and Banned sites from the RSS
    siteStatus = SiteStatus()
    usableSites = siteStatus.getUsableSites('ComputingAccess')
    unusableSites = siteStatus.getUnusableSites('ComputingAccess')
    if not (usableSites['OK'] and unusableSites['OK']):
        if not usableSites['OK']:
            self.log.error(usableSites['Message'])
        if not unusableSites['OK']:
            self.log.error(unusableSites['Message'])
        # NOTE: the text still names JobDB although the lists come from
        # SiteStatus; it is left untouched in case callers match on it.
        return S_ERROR('Can not get Active and Banned Sites from JobDB')
    usableSites = usableSites['Value']
    unusableSites = unusableSites['Value']

    if userSites:
        sites = applySiteRequirements(userSites, usableSites, unusableSites)
        if not sites:
            # Put on Hold only non-excluded job types
            jobType = classAdJob.getAttributeString('JobType')
            if jobType not in self.excludedOnHoldJobTypes:
                msg = 'On Hold: Requested site is Banned or not Active'
                self.log.info(msg)
                result = self.jobDB.setJobStatus(job, application=msg)
                if not result['OK']:
                    self.log.warn(result['Message'])
                return S_OK()

    # Third, check if there is input data
    result = self.jobDB.getInputData(job)
    if not result['OK']:
        self.log.warn('Failed to get input data from JobDB for %s' % (job))
        self.log.error(result['Message'])
        return S_ERROR('Failed to get input data from JobDB')

    if not result['Value']:
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    # Empty entries do not count as input data.
    inputData = [lfn for lfn in result['Value'] if lfn]
    if not inputData:
        # With no input data requirement, job can proceed directly to task queue
        self.log.verbose('Job %s has no input data requirement' % (job))
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    self.log.verbose('Job %s has an input data requirement ' % (job))

    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo(job)
    if not result['OK']:
        return result
    optInfo = result['Value']

    # Compare site candidates with current mask.
    # list() gives a real list on Python 3 as well, where keys() is a view.
    optSites = list(optInfo['SiteCandidates'])
    self.log.info('Input Data Site Candidates: %s' % (', '.join(optSites)))
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements(optSites, userSites, userBannedSites)
    if not optSites:
        return S_ERROR('Impossible Site + InputData Requirement')

    sites = applySiteRequirements(optSites, usableSites, unusableSites)
    if not sites:
        msg = 'On Hold: InputData Site is Banned or not Active'
        self.log.info(msg)
        result = self.jobDB.setJobStatus(job, application=msg)
        if not result['OK']:
            self.log.warn(result['Message'])
        return S_OK()

    # Set stager request as necessary, optimize for smallest #files on tape if
    # more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging(job, sites, inputData,
                                                 optInfo['SiteCandidates'])
    if not checkStaging['OK']:
        return checkStaging

    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
        return S_ERROR('No destination sites available')

    if not checkStaging['Value']:
        # No staging required, can proceed to task queue agent and then waiting status
        self.log.verbose('Job %s does not require staging of input data' % (job))
        # Finally send job to TaskQueueAgent
        return self.__sendJobToTaskQueue(job, classAdJob, destinationSites, userBannedSites)

    # Single site candidate chosen and staging required
    self.log.verbose('Job %s requires staging of input data' % (job))
    # set all LFN to disk for the selected site
    stagingSite = destinationSites[0]
    siteDict = optInfo['SiteCandidates'][stagingSite]
    siteDict['disk'] = siteDict['disk'] + siteDict['tape']
    siteDict['tape'] = 0
    optInfo['SiteCandidates'][stagingSite] = siteDict

    self.log.verbose('Updating %s Optimizer Info for Job %s:' % (self.dataAgentName, job),
                     optInfo)
    result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
    if not result['OK']:
        return result

    # Site is selected for staging, report it
    self.log.verbose('Staging site candidate for job %s is %s' % (job, stagingSite))

    result = self.__getStagingSites(stagingSite, destinationSites)
    if result['OK']:
        stagingSites = result['Value']
    else:
        stagingSites = [stagingSite]

    if len(stagingSites) == 1:
        siteName = stagingSite
    else:
        # Get the name of the site group; fall back to 'Multiple' when the
        # lookup fails or yields no name.
        siteName = 'Multiple'
        result = self.__getSiteGroup(stagingSites)
        if result['OK'] and result['Value']:
            siteName = result['Value']
    self.jobDB.setJobAttribute(job, 'Site', siteName)

    stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
    if not stagerDict['OK']:
        return stagerDict
    self.__updateOtherSites(job, stagingSite, stagerDict['Value'], optInfo)
    return S_OK()
def optimizeJob(self, jid, jobState):
    """Schedule one job: hold it, send it to the task queue, or request staging.

    Steps, in order:

    1. A rescheduled job is held until the delay configured for its
       reschedule count has passed.
    2. Site requirements of the job and the usable / unusable sites for
       'ComputingAccess' are collected.
    3. A job without input data is sent to the task queue.
    4. For a job with input data, the site candidates stored by the input
       data optimizer are filtered; the job is sent to the task queue when
       no staging is needed, otherwise staging is requested and the
       optimizer data is stored again.

    :param jid: job identifier (not used inside this method)
    :param jobState: job state accessor; ``getAttributes``, ``getAttribute``
                     and ``getInputData`` are called on it
    :returns: S_OK / S_ERROR structure
    """
    # Reschedule delay
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime',
                                     'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, TypeError):
        # TypeError covers non-string, non-numeric values such as None.
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        if delays:
            # The n-th reschedule uses the n-th delay (index n - 1); counts
            # beyond the end of the list reuse the last delay. Indexing with
            # the raw count would skip the first entry altogether.
            delay = delays[min(reschedules, len(delays)) - 1]
        else:
            # An empty delay list means there is nothing to wait for.
            delay = 0
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(jobState,
                                  'On Hold: after rescheduling %s' % reschedules,
                                  delay)

    # Get site requirements
    result = self._getSitesRequired(jobState)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']

    # Get active and banned sites from DIRAC
    siteStatus = SiteStatus()
    result = siteStatus.getUsableSites('ComputingAccess')
    if not result['OK']:
        return S_ERROR("Cannot retrieve active sites from JobDB")
    usableSites = result['Value']
    result = siteStatus.getUnusableSites('ComputingAccess')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    unusableSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        result = jobState.getAttribute("JobType")
        if not result['OK']:
            return S_ERROR("Could not retrieve job type")
        jobType = result['Value']
        if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
            sites = self._applySiteFilter(userSites, usableSites, unusableSites)
            if not sites:
                return self.__holdJob(jobState,
                                      "Sites %s are inactive or banned" % ", ".join(userSites))

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data %s" % (result['Message']))
        return S_ERROR('Failed to get input data from JobDB')
    inputData = result['Value']
    if not inputData:
        # No input data? Generate requirements and next
        return self.__sendToTQ(jobState, userSites, userBannedSites)

    self.jobLog.verbose('Has an input data requirement')
    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info: %s" % result['Message'])
        return S_ERROR("File Catalog Access Failure")
    opData = result['Value']
    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)
    siteCandidates = self._applySiteFilter(siteCandidates, userSites, userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")

    # Keep only the candidates whose disk + tape replica count covers every
    # input file.
    numData = len(inputData)
    idSites = {}
    for site in siteCandidates:
        siteInfo = opData['SiteCandidates'][site]
        if numData != siteInfo['disk'] + siteInfo['tape']:
            self.jobLog.error("Site candidate %s does not have all the input data" % site)
            continue
        idSites[site] = siteInfo
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(jobState, inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, usableSites, unusableSites)
    if not stageSites:
        return self.__holdJob(jobState,
                              "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        if not self.__checkStageAllowed(jobState):
            return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as
    # good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    result = self.__requestStaging(jobState, stageSite, opData)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    self._updateSharedSESites(stageSite, stageLFNs, opData)

    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result

    return self._setJobSite(jobState, stageSites)
def checkJob(self, job, classAdJob):
    """Decide where a job can run and route it to staging or the task queue.

    Steps, in order:

    1. A rescheduled job is kept on hold until its reschedule delay has passed.
    2. The job's site requirements are filtered against its banned sites.
    3. Usable and unusable sites for 'ComputingAccess' are read from
       ``SiteStatus``.
    4. A job without input data is sent straight to the task queue.
    5. A job with input data is matched against the optimizer's site
       candidates; a staging request is issued when staging is required,
       otherwise the job is sent to the task queue.

    :param job: job identifier, passed through to JobDB and the helper methods
    :param classAdJob: job description, read via ``getAttributeString``
    :returns: S_OK / S_ERROR structure
    """
    self.log.verbose('Job %s will be processed' % (job))

    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes(
        job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        self.log.error(result['Message'])
        return S_ERROR('Can not get job attributes from JobDB')
    jobDict = result['Value']
    try:
        reCounter = int(jobDict['RescheduleCounter'])
    except (ValueError, TypeError):
        # A malformed counter is reported as an error result instead of
        # escaping as an exception.
        return S_ERROR('RescheduleCounter has to be an integer')
    if reCounter != 0:
        reTime = fromString(jobDict['RescheduleTime'])
        delta = toEpoch() - toEpoch(reTime)
        # The n-th reschedule uses the n-th configured delay; beyond the end
        # of the list the maximum delay applies.
        delay = self.maxRescheduleDelay
        if reCounter <= len(self.rescheduleDelaysList):
            delay = self.rescheduleDelaysList[reCounter - 1]
        if delta < delay:
            # Only touch the status when it is not already an "on hold" one.
            if 'On Hold: after rescheduling' not in jobDict['ApplicationStatus']:
                result = self.jobDB.setJobStatus(
                    job, application='On Hold: after rescheduling #%d' % reCounter)
                if not result['OK']:
                    # Best effort: the job stays on hold either way.
                    self.log.warn(result['Message'])
            return S_OK()

    # First, get Site and BannedSites from the Job
    # NOTE(review): the result is indexed without an 'OK' check; presumably
    # __getJobSiteRequirement always fills both keys - confirm.
    result = self.__getJobSiteRequirement(job, classAdJob)
    userBannedSites = result['BannedSites']
    userSites = result['Sites']

    if userSites:
        userSites = applySiteRequirements(userSites, [], userBannedSites)
        if not userSites:
            return S_ERROR('Impossible Site Requirement')

    # Second, get the Active and Banned sites from the RSS
    siteStatus = SiteStatus()
    usableSites = siteStatus.getUsableSites('ComputingAccess')
    unusableSites = siteStatus.getUnusableSites('ComputingAccess')
    if not (usableSites['OK'] and unusableSites['OK']):
        if not usableSites['OK']:
            self.log.error(usableSites['Message'])
        if not unusableSites['OK']:
            self.log.error(unusableSites['Message'])
        # NOTE: the text still names JobDB although the lists come from
        # SiteStatus; it is left untouched in case callers match on it.
        return S_ERROR('Can not get Active and Banned Sites from JobDB')
    usableSites = usableSites['Value']
    unusableSites = unusableSites['Value']

    if userSites:
        sites = applySiteRequirements(userSites, usableSites, unusableSites)
        if not sites:
            # Put on Hold only non-excluded job types
            jobType = classAdJob.getAttributeString('JobType')
            if jobType not in self.excludedOnHoldJobTypes:
                msg = 'On Hold: Requested site is Banned or not Active'
                self.log.info(msg)
                result = self.jobDB.setJobStatus(job, application=msg)
                if not result['OK']:
                    self.log.warn(result['Message'])
                return S_OK()

    # Third, check if there is input data
    result = self.jobDB.getInputData(job)
    if not result['OK']:
        self.log.warn('Failed to get input data from JobDB for %s' % (job))
        self.log.error(result['Message'])
        return S_ERROR('Failed to get input data from JobDB')

    if not result['Value']:
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    # Empty entries do not count as input data.
    inputData = [lfn for lfn in result['Value'] if lfn]
    if not inputData:
        # With no input data requirement, job can proceed directly to task queue
        self.log.verbose('Job %s has no input data requirement' % (job))
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    self.log.verbose('Job %s has an input data requirement ' % (job))

    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo(job)
    if not result['OK']:
        return result
    optInfo = result['Value']

    # Compare site candidates with current mask.
    # list() gives a real list on Python 3 as well, where keys() is a view.
    optSites = list(optInfo['SiteCandidates'])
    self.log.info('Input Data Site Candidates: %s' % (', '.join(optSites)))
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements(optSites, userSites, userBannedSites)
    if not optSites:
        return S_ERROR('Impossible Site + InputData Requirement')

    sites = applySiteRequirements(optSites, usableSites, unusableSites)
    if not sites:
        msg = 'On Hold: InputData Site is Banned or not Active'
        self.log.info(msg)
        result = self.jobDB.setJobStatus(job, application=msg)
        if not result['OK']:
            self.log.warn(result['Message'])
        return S_OK()

    # Set stager request as necessary, optimize for smallest #files on tape if
    # more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging(job, sites, inputData,
                                                 optInfo['SiteCandidates'])
    if not checkStaging['OK']:
        return checkStaging

    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
        return S_ERROR('No destination sites available')

    if not checkStaging['Value']:
        # No staging required, can proceed to task queue agent and then waiting status
        self.log.verbose('Job %s does not require staging of input data' % (job))
        # Finally send job to TaskQueueAgent
        return self.__sendJobToTaskQueue(job, classAdJob, destinationSites, userBannedSites)

    # Single site candidate chosen and staging required
    self.log.verbose('Job %s requires staging of input data' % (job))
    # set all LFN to disk for the selected site
    stagingSite = destinationSites[0]
    siteDict = optInfo['SiteCandidates'][stagingSite]
    siteDict['disk'] = siteDict['disk'] + siteDict['tape']
    siteDict['tape'] = 0
    optInfo['SiteCandidates'][stagingSite] = siteDict

    self.log.verbose('Updating %s Optimizer Info for Job %s:' % (self.dataAgentName, job),
                     optInfo)
    result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
    if not result['OK']:
        return result

    # Site is selected for staging, report it
    self.log.verbose('Staging site candidate for job %s is %s' % (job, stagingSite))

    result = self.__getStagingSites(stagingSite, destinationSites)
    if result['OK']:
        stagingSites = result['Value']
    else:
        stagingSites = [stagingSite]

    if len(stagingSites) == 1:
        siteName = stagingSite
    else:
        # Get the name of the site group; fall back to 'Multiple' when the
        # lookup fails or yields no name.
        siteName = 'Multiple'
        result = self.__getSiteGroup(stagingSites)
        if result['OK'] and result['Value']:
            siteName = result['Value']
    self.jobDB.setJobAttribute(job, 'Site', siteName)

    stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
    if not stagerDict['OK']:
        return stagerDict
    self.__updateOtherSites(job, stagingSite, stagerDict['Value'], optInfo)
    return S_OK()