def getAuthAPI():
    """Look up the Auth REST API URL in the configuration.

    The option is read from the ``URLs`` section of the Framework system
    instance reported by ``getSystemInstance("Framework")``.

    :return: the value ``gConfig.getValue`` yields for the ``AuthAPI`` option
             (presumably a str URL -- confirm against the configuration)
    """
    frameworkInstance = getSystemInstance("Framework")
    optionPath = "/Systems/Framework/%s/URLs/AuthAPI" % frameworkInstance
    return gConfig.getValue(optionPath)
def initialize(self):
    """Prepare the agent: computing element, helpers and JobWrapper settings.

    Runs the parent ``initialize``, sets up a "Pool" computing element, loads
    the Resources helper module and sets every Watchdog check flag to 0 in the
    JobWrapper section of the WorkloadManagement configuration.

    :return: S_OK() on success; the failed result of the computing element
             initialization; or S_ERROR when the DIRAC setup or the
             WorkloadManagement instance cannot be resolved.
             The process exits through ``sys.exit`` when the Resources module
             cannot be loaded.
    """
    super(PushJobAgent, self).initialize()

    ceResult = self._initializeComputingElement("Pool")
    if not ceResult["OK"]:
        return ceResult

    # The Resources helper module is loaded on the fly through the ObjectLoader
    loader = ObjectLoader()
    loadResult = loader.loadModule("ConfigurationSystem.Client.Helpers.Resources")
    if not loadResult["OK"]:
        sys.exit(loadResult["Message"])
    self.resourcesModule = loadResult["Value"]
    self.opsHelper = Operations()

    # Disable the Watchdog checks: they are not needed because pre/post
    # processing occurs locally
    if not gConfig.getValue("/DIRAC/Setup", ""):
        return S_ERROR("Cannot get the DIRAC Setup value")

    wmsInstance = getSystemInstance("WorkloadManagement")
    if not wmsInstance:
        return S_ERROR("Cannot get the WorkloadManagement system instance")

    jobWrapperSection = "/Systems/WorkloadManagement/%s/JobWrapper" % wmsInstance
    watchdogFlags = (
        "CheckWallClockFlag",
        "CheckDiskSpaceFlag",
        "CheckLoadAvgFlag",
        "CheckCPUConsumedFlag",
        "CheckCPULimitFlag",
        "CheckMemoryLimitFlag",
        "CheckTimeLeftFlag",
    )
    for flagName in watchdogFlags:
        self._updateConfiguration(flagName, 0, path=jobWrapperSection)

    return S_OK()
def execute(self):
    """Run one cycle of the Stalled Job Agent.

    Refreshes the agent options, derives the stalled / failed thresholds from
    the JobWrapper checking time, then runs the processing steps one after the
    other. A failing step is logged and does not stop the following ones.

    :return: S_OK with a completion message, or S_ERROR when the
             WorkloadManagement system instance cannot be resolved
    """
    self.log.verbose('Waking up Stalled Job Agent')

    wmsInstance = getSystemInstance('WorkloadManagement')
    if not wmsInstance:
        return S_ERROR('Can not get the WorkloadManagement system instance')
    jobWrapperSection = cfgPath('Systems', 'WorkloadManagement', wmsInstance, 'JobWrapper')

    # Options are re-read on every cycle
    stalledCycles = self.am_getOption('StalledTimeHours', 2)
    failedCycles = self.am_getOption('FailedTimeHours', 6)
    self.stalledJobsToleranceTime = self.am_getOption('StalledJobsToleranceTime', 0)
    self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
    self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime)
    self.completedTime = self.am_getOption('CompletedTime', self.completedTime)

    self.log.verbose('StalledTime = %s cycles' % stalledCycles)
    self.log.verbose('FailedTime = %s cycles' % failedCycles)

    # The watchdog cycle is the larger of CheckingTime and MinCheckingTime
    checkingTime = gConfig.getValue(cfgPath(jobWrapperSection, 'CheckingTime'), 30 * 60)
    minCheckingTime = gConfig.getValue(cfgPath(jobWrapperSection, 'MinCheckingTime'), 20 * 60)
    watchdogCycle = max(checkingTime, minCheckingTime)

    # Half a cycle is added to avoid race conditions
    stalledThreshold = watchdogCycle * (stalledCycles + 0.5)
    failedThreshold = watchdogCycle * (failedCycles + 0.5)

    markResult = self.__markStalledJobs(stalledThreshold)
    if not markResult['OK']:
        self.log.error('Failed to detect stalled jobs', markResult['Message'])

    # Note: jobs are revived automatically during the heartbeat signal phase,
    # and subsequent status changes keep them from being selected by the
    # stalled job agent.
    failResult = self.__failStalledJobs(failedThreshold)
    if not failResult['OK']:
        self.log.error('Failed to process stalled jobs', failResult['Message'])

    completedResult = self.__failCompletedJobs()
    if not completedResult['OK']:
        self.log.error('Failed to process completed jobs', completedResult['Message'])

    kickResult = self.__kickStuckJobs()
    if not kickResult['OK']:
        self.log.error('Failed to kick stuck jobs', kickResult['Message'])

    return S_OK('Stalled Job Agent cycle complete')
def isDownloadProxyAllowed():
    """Read the ``allowProxyDownload`` flag of the Auth API.

    The option lives in the ``APIs/Auth`` section of the Framework system
    instance reported by ``getSystemInstance("Framework")``.

    :return: the value ``gConfig.getValue`` yields for the option, with
             ``True`` as the default. The value is returned as-is; it is not
             wrapped in S_OK / S_ERROR.
    """
    frameworkInstance = getSystemInstance("Framework")
    authSection = "/Systems/Framework/%s/APIs/Auth" % frameworkInstance
    return gConfig.getValue(authSection + "/allowProxyDownload", True)
def initialize(self):
    """Set up databases, thresholds and the thread pool of the agent.

    Creates the JobDB and JobLoggingDB handles, reads the agent options,
    derives the stalled / failed thresholds from the JobWrapper checking time
    and creates the thread pool executor.

    :return: S_OK() on success, or S_ERROR when the WorkloadManagement system
             instance cannot be resolved
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()

    # Reading the agent parameters
    enabled = self.am_getOption("Enable", True)
    if not enabled:
        self.log.info("Stalled Job Agent running in disabled mode")

    wmsInstance = getSystemInstance("WorkloadManagement")
    if not wmsInstance:
        return S_ERROR("Can not get the WorkloadManagement system instance")

    self.stalledJobsTolerantSites = self.am_getOption("StalledJobsTolerantSites", [])
    self.stalledJobsToleranceTime = self.am_getOption("StalledJobsToleranceTime", 0)
    self.stalledJobsToRescheduleSites = self.am_getOption("StalledJobsToRescheduleSites", [])
    self.submittingTime = self.am_getOption("SubmittingTime", self.submittingTime)
    self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
    self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime)

    jobWrapperSection = cfgPath("Systems", "WorkloadManagement", wmsInstance, "JobWrapper")

    failedCycles = self.am_getOption("FailedTimeHours", 6)

    # The watchdog cycle is the larger of CheckingTime and MinCheckingTime
    checkingTime = gConfig.getValue(cfgPath(jobWrapperSection, "CheckingTime"), 30 * 60)
    minCheckingTime = gConfig.getValue(cfgPath(jobWrapperSection, "MinCheckingTime"), 20 * 60)
    watchdogCycle = max(checkingTime, minCheckingTime)

    # Half a cycle is added to both thresholds to avoid race conditions
    stalledCycles = self.am_getOption("StalledTimeHours", 2)
    self.log.verbose("", "StalledTime = %s cycles" % stalledCycles)
    self.stalledTime = int(watchdogCycle * (stalledCycles + 0.5))

    self.log.verbose("", "FailedTime = %s cycles" % failedCycles)
    self.failedTime = int(watchdogCycle * (failedCycles + 0.5))

    stallingMinorStatus = "Stalling for more than %d sec" % self.failedTime
    self.minorStalledStatuses = (
        JobMinorStatus.STALLED_PILOT_NOT_RUNNING,
        stallingMinorStatus,
    )

    # Setting up the threading
    threadCount = self.am_getOption("MaxNumberOfThreads", 15)
    self.log.verbose("Multithreaded with %d threads" % threadCount)
    self.threadPoolExecutor = concurrent.futures.ThreadPoolExecutor(max_workers=threadCount)

    return S_OK()
def initialize(self, loops=0):
    """Initialize the Watchdog from the JobWrapper configuration section.

    Does nothing but log when the Watchdog is already initialized. Otherwise
    reads the control flags and monitoring parameters and resets the counters.

    :param loops: value stored as ``self.maxcount``
    :return: S_OK() on success or when already initialized; S_ERROR when the
             DIRAC setup or the WorkloadManagement instance cannot be resolved
    """
    if self.initialized:
        self.log.info('Watchdog already initialized')
        return S_OK()

    # NOTE(review): the flag is raised before the checks below, so a failed
    # initialization still reports "already initialized" on the next call --
    # confirm this is intended.
    self.initialized = True

    if not gConfig.getValue('/DIRAC/Setup', ''):
        return S_ERROR('Can not get the DIRAC Setup value')

    wmsInstance = getSystemInstance("WorkloadManagement")
    if not wmsInstance:
        return S_ERROR('Can not get the WorkloadManagement system instance')

    self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wmsInstance
    section = self.section

    self.maxcount = loops
    self.log.verbose('Watchdog initialization')
    self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag))

    # Test control flags
    self.testWallClock = gConfig.getValue(section + '/CheckWallClockFlag', 1)
    self.testDiskSpace = gConfig.getValue(section + '/CheckDiskSpaceFlag', 1)
    self.testLoadAvg = gConfig.getValue(section + '/CheckLoadAvgFlag', 1)
    self.testCPUConsumed = gConfig.getValue(section + '/CheckCPUConsumedFlag', 1)
    self.testCPULimit = gConfig.getValue(section + '/CheckCPULimitFlag', 0)
    self.testMemoryLimit = gConfig.getValue(section + '/CheckMemoryLimitFlag', 0)
    self.testTimeLeft = gConfig.getValue(section + '/CheckTimeLeftFlag', 1)

    # Other parameters
    # Polling period, default 10 seconds
    self.pollingTime = gConfig.getValue(section + '/PollingTime', 10)
    # Checking period, default 30 minutes
    self.checkingTime = gConfig.getValue(section + '/CheckingTime', 30 * 60)
    # Lower bound for the checking period, default 20 minutes
    self.minCheckingTime = gConfig.getValue(section + '/MinCheckingTime', 20 * 60)
    # Default is 3 * 24 * 60 * 60, i.e. 3 days' worth of seconds
    self.maxWallClockTime = gConfig.getValue(section + '/MaxWallClockTime', 3 * 24 * 60 * 60)
    # On / off switch
    self.jobPeekFlag = gConfig.getValue(section + '/JobPeekFlag', 1)
    # In MB
    self.minDiskSpace = gConfig.getValue(section + '/MinDiskSpace', 10)
    # Jobs are killed above this load average
    self.loadAvgLimit = gConfig.getValue(section + '/LoadAverageLimit', 1000)
    # Default is 30 * 60, i.e. a 30 minute sample
    self.sampleCPUTime = gConfig.getValue(section + '/CPUSampleTime', 30 * 60)
    # Percentage buffer before killing the job
    self.jobCPUMargin = gConfig.getValue(section + '/JobCPULimitMargin', 20)
    # Ratio, as a percentage
    self.minCPUWallClockRatio = gConfig.getValue(section + '/MinCPUWallClockRatio', 5)
    # The job is killed after this many samples returning null CPU consumption
    self.nullCPULimit = gConfig.getValue(section + '/NullCPUCountLimit', 5)

    self.checkCount = 0
    self.nullCPUCount = 0

    # The checking period may not go below the configured minimum
    if self.checkingTime < self.minCheckingTime:
        message = 'Requested CheckingTime of %s setting to %s seconds (minimum)' % (
            self.checkingTime,
            self.minCheckingTime,
        )
        self.log.info(message)
        self.checkingTime = self.minCheckingTime

    # The time left is returned in seconds @ 250 SI00 = 1 HS06, while
    # checkingTime and pollingTime are in seconds, so they are multiplied
    # by a large enough factor
    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.fineTimeLeftLimit = gConfig.getValue(section + '/TimeLeftLimit', 150 * self.pollingTime)

    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    return S_OK()
def initialize(self, loops=0):
    """Initialize the Watchdog from the JobWrapper configuration section.

    Does nothing but log when the Watchdog is already initialized. Otherwise
    reads the control flags, the monitoring parameters and the CPU scaling
    factor from the configuration.

    :param loops: value stored as ``self.maxcount``
    :return: S_OK() on success or when already initialized; S_ERROR when the
             DIRAC setup or the WorkloadManagement instance cannot be resolved
    """
    if self.initialized:
        self.log.info('Watchdog already initialized')
        return S_OK()

    # NOTE(review): the flag is raised before the checks below, so a failed
    # initialization still reports "already initialized" on the next call --
    # confirm this is intended.
    self.initialized = True

    if not gConfig.getValue('/DIRAC/Setup', ''):
        return S_ERROR('Can not get the DIRAC Setup value')

    wmsInstance = getSystemInstance("WorkloadManagement")
    if not wmsInstance:
        return S_ERROR('Can not get the WorkloadManagement system instance')

    self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wmsInstance
    section = self.section

    self.maxcount = loops
    self.log.verbose('Watchdog initialization')
    self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag))

    # Test control flags
    self.testWallClock = gConfig.getValue(section + '/CheckWallClockFlag', 1)
    self.testDiskSpace = gConfig.getValue(section + '/CheckDiskSpaceFlag', 1)
    self.testLoadAvg = gConfig.getValue(section + '/CheckLoadAvgFlag', 1)
    self.testCPUConsumed = gConfig.getValue(section + '/CheckCPUConsumedFlag', 1)
    self.testCPULimit = gConfig.getValue(section + '/CheckCPULimitFlag', 0)
    self.testMemoryLimit = gConfig.getValue(section + '/CheckMemoryLimitFlag', 0)
    self.testTimeLeft = gConfig.getValue(section + '/CheckTimeLeftFlag', 1)

    # Other parameters
    # Polling period, default 10 seconds
    self.pollingTime = gConfig.getValue(section + '/PollingTime', 10)
    # Checking period, default 30 minutes
    self.checkingTime = gConfig.getValue(section + '/CheckingTime', 30 * 60)
    # Lower bound for the checking period, default 20 minutes
    self.minCheckingTime = gConfig.getValue(section + '/MinCheckingTime', 20 * 60)
    # Default is 3 * 24 * 60 * 60, i.e. 3 days' worth of seconds
    self.maxWallClockTime = gConfig.getValue(section + '/MaxWallClockTime', 3 * 24 * 60 * 60)
    # On / off switch
    self.jobPeekFlag = gConfig.getValue(section + '/JobPeekFlag', 1)
    # In MB
    self.minDiskSpace = gConfig.getValue(section + '/MinDiskSpace', 10)
    # Jobs are killed above this load average
    self.loadAvgLimit = gConfig.getValue(section + '/LoadAverageLimit', 1000)
    # Default is 30 * 60, i.e. a 30 minute sample
    self.sampleCPUTime = gConfig.getValue(section + '/CPUSampleTime', 30 * 60)
    # Percentage buffer before killing the job
    self.jobCPUMargin = gConfig.getValue(section + '/JobCPULimitMargin', 20)
    # Ratio, as a percentage
    self.minCPUWallClockRatio = gConfig.getValue(section + '/MinCPUWallClockRatio', 5)
    # The job is killed after this many samples returning null CPU consumption
    self.nullCPULimit = gConfig.getValue(section + '/NullCPUCountLimit', 5)

    # The checking period may not go below the configured minimum
    if self.checkingTime < self.minCheckingTime:
        message = 'Requested CheckingTime of %s setting to %s seconds (minimum)' % (
            self.checkingTime,
            self.minCheckingTime,
        )
        self.log.info(message)
        self.checkingTime = self.minCheckingTime

    # The time left is returned in seconds @ 250 SI00 = 1 HS06, while
    # checkingTime and pollingTime are in seconds, so they are multiplied
    # by a large enough factor
    self.fineTimeLeftLimit = gConfig.getValue(section + '/TimeLeftLimit', 150 * self.pollingTime)
    self.scaleFactor = gConfig.getValue('/LocalSite/CPUScalingFactor', 1.0)

    return S_OK()
def execute(self):
    """Run one cycle of the Stalled Job Agent.

    Refreshes the agent options, derives the stalled / failed thresholds from
    the JobWrapper checking time, then runs the processing steps one after the
    other. A failing step is logged and does not stop the following ones.

    :return: S_OK with a completion message, or S_ERROR when the
             WorkloadManagement system instance cannot be resolved
    """
    self.log.verbose('Waking up Stalled Job Agent')

    wmsInstance = getSystemInstance('WorkloadManagement')
    if not wmsInstance:
        return S_ERROR('Can not get the WorkloadManagement system instance')
    jobWrapperSection = cfgPath('Systems', 'WorkloadManagement', wmsInstance, 'JobWrapper')

    # Options are re-read on every cycle
    stalledCycles = self.am_getOption('StalledTimeHours', 2)
    failedCycles = self.am_getOption('FailedTimeHours', 6)
    self.stalledJobsToleranceTime = self.am_getOption('StalledJobsToleranceTime', 0)
    self.submittingTime = self.am_getOption('SubmittingTime', self.submittingTime)
    self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
    self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime)
    self.completedTime = self.am_getOption('CompletedTime', self.completedTime)

    self.log.verbose('StalledTime = %s cycles' % stalledCycles)
    self.log.verbose('FailedTime = %s cycles' % failedCycles)

    # The watchdog cycle is the larger of CheckingTime and MinCheckingTime
    checkingTime = gConfig.getValue(cfgPath(jobWrapperSection, 'CheckingTime'), 30 * 60)
    minCheckingTime = gConfig.getValue(cfgPath(jobWrapperSection, 'MinCheckingTime'), 20 * 60)
    watchdogCycle = max(checkingTime, minCheckingTime)

    # Half a cycle is added to avoid race conditions
    stalledThreshold = watchdogCycle * (stalledCycles + 0.5)
    failedThreshold = watchdogCycle * (failedCycles + 0.5)

    markResult = self.__markStalledJobs(stalledThreshold)
    if not markResult['OK']:
        self.log.error('Failed to detect stalled jobs', markResult['Message'])

    # Note: jobs are revived automatically during the heartbeat signal phase,
    # and subsequent status changes keep them from being selected by the
    # stalled job agent.
    failResult = self.__failStalledJobs(failedThreshold)
    if not failResult['OK']:
        self.log.error('Failed to process stalled jobs', failResult['Message'])

    completedResult = self.__failCompletedJobs()
    if not completedResult['OK']:
        self.log.error('Failed to process completed jobs', completedResult['Message'])

    submittingResult = self.__failSubmittingJobs()
    if not submittingResult['OK']:
        self.log.error('Failed to process jobs being submitted', submittingResult['Message'])

    kickResult = self.__kickStuckJobs()
    if not kickResult['OK']:
        self.log.error('Failed to kick stuck jobs', kickResult['Message'])

    return S_OK('Stalled Job Agent cycle complete')
def execute(self):
    """Run one cycle of the Stalled Job Agent.

    Refreshes the agent options, derives the stalled / failed thresholds from
    the JobWrapper checking time, then runs the processing steps one after the
    other. A failing step is logged and does not stop the following ones.

    :return: S_OK with a completion message, or S_ERROR when the
             WorkloadManagement system instance cannot be resolved
    """
    self.log.verbose("Waking up Stalled Job Agent")

    wmsInstance = getSystemInstance("WorkloadManagement")
    if not wmsInstance:
        return S_ERROR("Can not get the WorkloadManagement system instance")
    jobWrapperSection = cfgPath("Systems", "WorkloadManagement", wmsInstance, "JobWrapper")

    # Options are re-read on every cycle
    stalledCycles = self.am_getOption("StalledTimeHours", 2)
    failedCycles = self.am_getOption("FailedTimeHours", 6)
    self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
    self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime)
    self.completedTime = self.am_getOption("CompletedTime", self.completedTime)

    self.log.verbose("StalledTime = %s cycles" % stalledCycles)
    self.log.verbose("FailedTime = %s cycles" % failedCycles)

    # The watchdog cycle is the larger of CheckingTime and MinCheckingTime
    checkingTime = gConfig.getValue(cfgPath(jobWrapperSection, "CheckingTime"), 30 * 60)
    minCheckingTime = gConfig.getValue(cfgPath(jobWrapperSection, "MinCheckingTime"), 20 * 60)
    watchdogCycle = max(checkingTime, minCheckingTime)

    # Half a cycle is added to avoid race conditions
    stalledThreshold = watchdogCycle * (stalledCycles + 0.5)
    failedThreshold = watchdogCycle * (failedCycles + 0.5)

    markResult = self.__markStalledJobs(stalledThreshold)
    if not markResult["OK"]:
        self.log.error("Failed to detect stalled jobs", markResult["Message"])

    # Note: jobs are revived automatically during the heartbeat signal phase,
    # and subsequent status changes keep them from being selected by the
    # stalled job agent.
    failResult = self.__failStalledJobs(failedThreshold)
    if not failResult["OK"]:
        self.log.error("Failed to process stalled jobs", failResult["Message"])

    completedResult = self.__failCompletedJobs()
    if not completedResult["OK"]:
        self.log.error("Failed to process completed jobs", completedResult["Message"])

    kickResult = self.__kickStuckJobs()
    if not kickResult["OK"]:
        self.log.error("Failed to kick stuck jobs", kickResult["Message"])

    return S_OK("Stalled Job Agent cycle complete")
def execute(self):
    """Queue the jobs to check in this cycle and run the non-threaded steps.

    The method refreshes the agent options and thresholds, then:

    1. queues Running / Completing jobs with an old heartbeat for
       ``_markStalledJobs``;
    2. queues Stalled jobs for ``_failStalledJobs``;
    3. queues failed, not yet accounted stalled jobs for ``__sendAccounting``;
    4. fails jobs stuck in submission;
    5. kicks stuck jobs.

    A failing selection or step is logged and the cycle carries on with the
    next one.

    :return: S_OK(), or S_ERROR when the WorkloadManagement system instance
             cannot be resolved
    """
    self.log.debug('Waking up Stalled Job Agent')

    # getting parameters
    wms_instance = getSystemInstance('WorkloadManagement')
    if not wms_instance:
        return S_ERROR('Can not get the WorkloadManagement system instance')
    wrapperSection = cfgPath('Systems', 'WorkloadManagement', wms_instance, 'JobWrapper')

    stalledTime = self.am_getOption('StalledTimeHours', 2)
    failedTime = self.am_getOption('FailedTimeHours', 6)
    self.stalledJobsTolerantSites = self.am_getOption('StalledJobsTolerantSites', [])
    self.stalledJobsToleranceTime = self.am_getOption('StalledJobsToleranceTime', 0)

    self.submittingTime = self.am_getOption('SubmittingTime', self.submittingTime)
    self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
    self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime)

    self.log.verbose('', 'StalledTime = %s cycles' % (stalledTime))
    self.log.verbose('', 'FailedTime = %s cycles' % (failedTime))

    # The watchdog cycle is the larger of CheckingTime and MinCheckingTime
    watchdogCycle = gConfig.getValue(cfgPath(wrapperSection, 'CheckingTime'), 30 * 60)
    watchdogCycle = max(watchdogCycle, gConfig.getValue(cfgPath(wrapperSection, 'MinCheckingTime'), 20 * 60))

    # Add half cycle to avoid race conditions
    self.stalledTime = int(watchdogCycle * (stalledTime + 0.5))
    self.failedTime = int(watchdogCycle * (failedTime + 0.5))

    self.minorStalledStatuses = (
        JobMinorStatus.STALLED_PILOT_NOT_RUNNING,
        'Stalling for more than %d sec' % self.failedTime)

    # Now we are getting what's going to be checked

    # 1) Queueing the jobs that might be marked Stalled
    # This is the minimum time we wait for declaring a job Stalled, therefore it is safe
    checkTime = dateTime() - self.stalledTime * second
    checkedStatuses = [JobStatus.RUNNING, JobStatus.COMPLETING]
    # Only get jobs whose HeartBeat is older than the stalledTime
    result = self.jobDB.selectJobs({'Status': checkedStatuses},
                                   older=checkTime, timeStamp='HeartBeatTime')
    if not result['OK']:
        self.log.error("Issue selecting %s jobs" % ' & '.join(checkedStatuses), result['Message'])
    # 'Value' is only read on success: a failed result carries no 'Value'
    # entry and reading it would abort the whole cycle with a KeyError
    elif result['Value']:
        jobs = sorted(result['Value'])
        self.log.info('%s jobs will be checked for being stalled' % ' & '.join(checkedStatuses),
                      '(n=%d, heartbeat before %s)' % (len(jobs), str(checkTime)))
        for job in jobs:
            self.jobsQueue.put('%s:_markStalledJobs' % job)

    # 2) Queueing the Stalled jobs that might be marked Failed
    result = self.jobDB.selectJobs({'Status': JobStatus.STALLED})
    if not result['OK']:
        self.log.error("Issue selecting Stalled jobs", result['Message'])
    elif result['Value']:
        jobs = sorted(result['Value'])
        self.log.info('Jobs Stalled will be checked for failure', '(n=%d)' % len(jobs))
        for job in jobs:
            self.jobsQueue.put('%s:_failStalledJobs' % job)

    # 3) Send accounting
    for minor in self.minorStalledStatuses:
        result = self.jobDB.selectJobs({'Status': JobStatus.FAILED,
                                        'MinorStatus': minor,
                                        'AccountedFlag': 'False'})
        if not result['OK']:
            self.log.error("Issue selecting jobs for accounting", result['Message'])
        elif result['Value']:
            jobs = result['Value']
            self.log.info('Stalled jobs will be Accounted', '(n=%d)' % (len(jobs)))
            for job in jobs:
                self.jobsQueue.put('%s:__sendAccounting' % job)

    # From here on we don't use the threads

    # 4) Fail submitting jobs
    result = self._failSubmittingJobs()
    if not result['OK']:
        self.log.error('Failed to process jobs being submitted', result['Message'])

    # 5) Kick stuck jobs
    result = self._kickStuckJobs()
    if not result['OK']:
        self.log.error('Failed to kick stuck jobs', result['Message'])

    return S_OK()