示例#1
0
def getAuthAPI():
    """Return the Auth REST API URL configured for the Framework system.

    The option read is ``/Systems/Framework/<instance>/URLs/AuthAPI`` where
    ``<instance>`` is the value returned by ``getSystemInstance("Framework")``.
    No default is passed to ``gConfig.getValue``.

    :return: str
    """
    frameworkInstance = getSystemInstance("Framework")
    optionPath = "/Systems/Framework/%s/URLs/AuthAPI" % frameworkInstance
    return gConfig.getValue(optionPath)
示例#2
0
    def initialize(self):
        """Set the default parameters and create the CE instance.

        Steps, in order: run the parent ``initialize``, initialize the "Pool"
        computing element, load the Resources helper module, create the
        Operations helper and finally switch every JobWrapper Watchdog check
        flag to 0 in the configuration.

        :return: S_OK() on success; the failing result of
                 ``_initializeComputingElement`` or an S_ERROR when the DIRAC
                 Setup / WorkloadManagement instance cannot be obtained.
                 The process exits (``sys.exit``) if the Resources module
                 cannot be loaded.
        """
        super(PushJobAgent, self).initialize()

        ceResult = self._initializeComputingElement("Pool")
        if not ceResult["OK"]:
            return ceResult

        # on-the-fly import of the Resources helper module
        loaded = ObjectLoader().loadModule("ConfigurationSystem.Client.Helpers.Resources")
        if not loaded["OK"]:
            sys.exit(loaded["Message"])
        self.resourcesModule = loaded["Value"]
        self.opsHelper = Operations()

        if not gConfig.getValue("/DIRAC/Setup", ""):
            return S_ERROR("Cannot get the DIRAC Setup value")
        wmsInstance = getSystemInstance("WorkloadManagement")
        if not wmsInstance:
            return S_ERROR("Cannot get the WorkloadManagement system instance")

        # Disable Watchdog: we don't need it as pre/post processing occurs locally
        jobWrapperSection = "/Systems/WorkloadManagement/%s/JobWrapper" % wmsInstance
        watchdogFlags = (
            "CheckWallClockFlag",
            "CheckDiskSpaceFlag",
            "CheckLoadAvgFlag",
            "CheckCPUConsumedFlag",
            "CheckCPULimitFlag",
            "CheckMemoryLimitFlag",
            "CheckTimeLeftFlag",
        )
        for flagName in watchdogFlags:
            self._updateConfiguration(flagName, 0, path=jobWrapperSection)

        return S_OK()
示例#3
0
    def execute(self):
        """Run one cycle of the Stalled Job Agent.

        Refreshes the agent options, derives the stalled/failed thresholds
        from the JobWrapper checking time and then runs, in order, the
        stalled-marking, stalled-failing, completed-failing and
        stuck-kicking steps. A failing step is only logged; it does not
        stop the following steps.

        :return: S_OK with a completion message, or S_ERROR when the
                 WorkloadManagement system instance cannot be obtained
        """
        self.log.verbose('Waking up Stalled Job Agent')

        wmsInstance = getSystemInstance('WorkloadManagement')
        if not wmsInstance:
            return S_ERROR('Can not get the WorkloadManagement system instance')
        jobWrapperPath = cfgPath('Systems', 'WorkloadManagement', wmsInstance, 'JobWrapper')

        stalledCycles = self.am_getOption('StalledTimeHours', 2)
        failedCycles = self.am_getOption('FailedTimeHours', 6)
        self.stalledJobsToleranceTime = self.am_getOption('StalledJobsToleranceTime', 0)

        self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
        self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime)
        self.completedTime = self.am_getOption('CompletedTime', self.completedTime)

        self.log.verbose('StalledTime = %s cycles' % (stalledCycles))
        self.log.verbose('FailedTime = %s cycles' % (failedCycles))

        # The cycle length is the configured checking time, but never less
        # than the configured minimum checking time.
        checkingTime = gConfig.getValue(cfgPath(jobWrapperPath, 'CheckingTime'), 30 * 60)
        minCheckingTime = gConfig.getValue(cfgPath(jobWrapperPath, 'MinCheckingTime'), 20 * 60)
        cycleLength = max(checkingTime, minCheckingTime)

        # Add half cycle to avoid race conditions
        stalledTime = cycleLength * (stalledCycles + 0.5)
        failedTime = cycleLength * (failedCycles + 0.5)

        # Each step is wrapped in a lambda so that it is only looked up and
        # called when its turn comes, exactly as in a sequential listing.
        steps = (
            (lambda: self.__markStalledJobs(stalledTime), 'Failed to detect stalled jobs'),
            # Note, jobs will be revived automatically during the heartbeat signal phase and
            # subsequent status changes will result in jobs not being selected by the
            # stalled job agent.
            (lambda: self.__failStalledJobs(failedTime), 'Failed to process stalled jobs'),
            (lambda: self.__failCompletedJobs(), 'Failed to process completed jobs'),
            (lambda: self.__kickStuckJobs(), 'Failed to kick stuck jobs'),
        )
        for runStep, failureMessage in steps:
            outcome = runStep()
            if not outcome['OK']:
                self.log.error(failureMessage, outcome['Message'])

        return S_OK('Stalled Job Agent cycle complete')
示例#4
0
def isDownloadProxyAllowed():
    """Get the allowProxyDownload flag of the Framework Auth API.

    The option read is
    ``/Systems/Framework/<instance>/APIs/Auth/allowProxyDownload`` where
    ``<instance>`` comes from ``getSystemInstance("Framework")``; ``True`` is
    passed to ``gConfig.getValue`` as the default.

    :return: the result of ``gConfig.getValue`` for that option, returned
             as-is (this function does not wrap it in S_OK/S_ERROR)
    """
    frameworkInstance = getSystemInstance("Framework")
    authSection = "/Systems/Framework/%s/APIs/Auth" % frameworkInstance
    optionPath = authSection + "/allowProxyDownload"
    return gConfig.getValue(optionPath, True)
示例#5
0
    def initialize(self):
        """Set up DB handles, timing thresholds and the thread pool.

        Agent options are read with ``am_getOption``; the watchdog cycle
        length comes from the JobWrapper section of the WorkloadManagement
        configuration. ``self.stalledTime`` and ``self.failedTime`` are stored
        as integers computed from the cycle length.

        :return: S_OK() on success, S_ERROR when the WorkloadManagement
                 system instance cannot be obtained
        """
        self.jobDB = JobDB()
        self.logDB = JobLoggingDB()

        getOption = self.am_getOption

        # The "Enable" option is only reported here
        if not getOption("Enable", True):
            self.log.info("Stalled Job Agent running in disabled mode")

        wmsInstance = getSystemInstance("WorkloadManagement")
        if not wmsInstance:
            return S_ERROR("Can not get the WorkloadManagement system instance")

        self.stalledJobsTolerantSites = getOption("StalledJobsTolerantSites", [])
        self.stalledJobsToleranceTime = getOption("StalledJobsToleranceTime", 0)
        self.stalledJobsToRescheduleSites = getOption("StalledJobsToRescheduleSites", [])

        self.submittingTime = getOption("SubmittingTime", self.submittingTime)
        self.matchedTime = getOption("MatchedTime", self.matchedTime)
        self.rescheduledTime = getOption("RescheduledTime", self.rescheduledTime)

        jobWrapperPath = cfgPath("Systems", "WorkloadManagement", wmsInstance, "JobWrapper")

        failedCycles = getOption("FailedTimeHours", 6)
        # The cycle length is the configured checking time, but never less
        # than the configured minimum checking time.
        checkingTime = gConfig.getValue(cfgPath(jobWrapperPath, "CheckingTime"), 30 * 60)
        minCheckingTime = gConfig.getValue(cfgPath(jobWrapperPath, "MinCheckingTime"), 20 * 60)
        cycleLength = max(checkingTime, minCheckingTime)
        stalledCycles = getOption("StalledTimeHours", 2)

        # Add half cycle to both thresholds to avoid race conditions
        self.log.verbose("", "StalledTime = %s cycles" % (stalledCycles))
        self.stalledTime = int(cycleLength * (stalledCycles + 0.5))
        self.log.verbose("", "FailedTime = %s cycles" % (failedCycles))
        self.failedTime = int(cycleLength * (failedCycles + 0.5))

        stallingMessage = "Stalling for more than %d sec" % self.failedTime
        self.minorStalledStatuses = (JobMinorStatus.STALLED_PILOT_NOT_RUNNING, stallingMessage)

        # setting up the threading
        workerCount = getOption("MaxNumberOfThreads", 15)
        self.log.verbose("Multithreaded with %d threads" % workerCount)
        self.threadPoolExecutor = concurrent.futures.ThreadPoolExecutor(max_workers=workerCount)

        return S_OK()
示例#6
0
  def initialize( self, loops = 0 ):
    """ Watchdog initialization.

    Does nothing but log when the Watchdog is already initialized. Otherwise
    reads all test flags and limits from the JobWrapper section of the
    WorkloadManagement configuration and resets the internal counters.

    :param loops: value stored in ``self.maxcount``
    :return: S_OK() on success, S_ERROR when the DIRAC Setup or the
             WorkloadManagement system instance cannot be obtained
    """
    if self.initialized:
      self.log.info( 'Watchdog already initialized' )
      return S_OK()
    self.initialized = True

    if not gConfig.getValue( '/DIRAC/Setup', '' ):
      return S_ERROR( 'Can not get the DIRAC Setup value' )
    wmsInstance = getSystemInstance( "WorkloadManagement" )
    if not wmsInstance:
      return S_ERROR( 'Can not get the WorkloadManagement system instance' )
    self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wmsInstance

    self.maxcount = loops
    self.log.verbose( 'Watchdog initialization' )
    self.log.info( 'Attempting to Initialize Watchdog for: %s' % ( self.systemFlag ) )

    # ( attribute name, option path below self.section, default value )
    jobWrapperOptions = (
      # Test control flags
      ( 'testWallClock', '/CheckWallClockFlag', 1 ),
      ( 'testDiskSpace', '/CheckDiskSpaceFlag', 1 ),
      ( 'testLoadAvg', '/CheckLoadAvgFlag', 1 ),
      ( 'testCPUConsumed', '/CheckCPUConsumedFlag', 1 ),
      ( 'testCPULimit', '/CheckCPULimitFlag', 0 ),
      ( 'testMemoryLimit', '/CheckMemoryLimitFlag', 0 ),
      ( 'testTimeLeft', '/CheckTimeLeftFlag', 1 ),
      # Other parameters
      ( 'pollingTime', '/PollingTime', 10 ),  # 10 seconds
      ( 'checkingTime', '/CheckingTime', 30 * 60 ),  # 30 minute period
      ( 'minCheckingTime', '/MinCheckingTime', 20 * 60 ),  # 20 mins
      ( 'maxWallClockTime', '/MaxWallClockTime', 3 * 24 * 60 * 60 ),  # default is 3 days
      ( 'jobPeekFlag', '/JobPeekFlag', 1 ),  # on / off
      ( 'minDiskSpace', '/MinDiskSpace', 10 ),  # MB
      ( 'loadAvgLimit', '/LoadAverageLimit', 1000 ),  # > 1000 and jobs killed
      ( 'sampleCPUTime', '/CPUSampleTime', 30 * 60 ),  # default is a 30 minute sample
      ( 'jobCPUMargin', '/JobCPULimitMargin', 20 ),  # %age buffer before killing job
      ( 'minCPUWallClockRatio', '/MinCPUWallClockRatio', 5 ),  # ratio %age
      # After 5 sample times return null CPU consumption kill job
      ( 'nullCPULimit', '/NullCPUCountLimit', 5 ),
    )
    for attribute, optionPath, default in jobWrapperOptions:
      setattr( self, attribute, gConfig.getValue( self.section + optionPath, default ) )

    self.checkCount = 0
    self.nullCPUCount = 0

    # Never check more often than the configured minimum
    if self.checkingTime < self.minCheckingTime:
      self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % ( self.checkingTime,
                                                                                        self.minCheckingTime ) )
      self.checkingTime = self.minCheckingTime

    # The time left is returned in seconds @ 250 SI00 = 1 HS06,
    # the self.checkingTime and self.pollingTime are in seconds,
    # thus they need to be multiplied by a large enough factor
    self.grossTimeLeftLimit = 10 * self.checkingTime
    defaultFineLimit = 150 * self.pollingTime
    self.fineTimeLeftLimit = gConfig.getValue( self.section + '/TimeLeftLimit', defaultFineLimit )

    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    return S_OK()
示例#7
0
文件: Watchdog.py 项目: cserf/DIRAC
  def initialize(self, loops=0):
    """ Watchdog initialization.

    Does nothing but log when the Watchdog is already initialized. Otherwise
    reads all test flags and limits from the JobWrapper section of the
    WorkloadManagement configuration, plus the local CPU scaling factor.

    :param loops: value stored in ``self.maxcount``
    :return: S_OK() on success, S_ERROR when the DIRAC Setup or the
             WorkloadManagement system instance cannot be obtained
    """
    if self.initialized:
      self.log.info('Watchdog already initialized')
      return S_OK()
    self.initialized = True

    if not gConfig.getValue('/DIRAC/Setup', ''):
      return S_ERROR('Can not get the DIRAC Setup value')
    wmsInstance = getSystemInstance("WorkloadManagement")
    if not wmsInstance:
      return S_ERROR('Can not get the WorkloadManagement system instance')
    self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wmsInstance

    self.maxcount = loops
    self.log.verbose('Watchdog initialization')
    self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag))

    def fromSection(optionPath, default):
      """Read one option located below self.section."""
      return gConfig.getValue(self.section + optionPath, default)

    # Test control flags
    self.testWallClock = fromSection('/CheckWallClockFlag', 1)
    self.testDiskSpace = fromSection('/CheckDiskSpaceFlag', 1)
    self.testLoadAvg = fromSection('/CheckLoadAvgFlag', 1)
    self.testCPUConsumed = fromSection('/CheckCPUConsumedFlag', 1)
    self.testCPULimit = fromSection('/CheckCPULimitFlag', 0)
    self.testMemoryLimit = fromSection('/CheckMemoryLimitFlag', 0)
    self.testTimeLeft = fromSection('/CheckTimeLeftFlag', 1)

    # Other parameters
    self.pollingTime = fromSection('/PollingTime', 10)  # 10 seconds
    self.checkingTime = fromSection('/CheckingTime', 30 * 60)  # 30 minute period
    self.minCheckingTime = fromSection('/MinCheckingTime', 20 * 60)  # 20 mins
    self.maxWallClockTime = fromSection('/MaxWallClockTime', 3 * 24 * 60 * 60)  # default is 3 days
    self.jobPeekFlag = fromSection('/JobPeekFlag', 1)  # on / off
    self.minDiskSpace = fromSection('/MinDiskSpace', 10)  # MB
    self.loadAvgLimit = fromSection('/LoadAverageLimit', 1000)  # > 1000 and jobs killed
    self.sampleCPUTime = fromSection('/CPUSampleTime', 30 * 60)  # default is a 30 minute sample
    self.jobCPUMargin = fromSection('/JobCPULimitMargin', 20)  # %age buffer before killing job
    self.minCPUWallClockRatio = fromSection('/MinCPUWallClockRatio', 5)  # ratio %age
    # After 5 sample times return null CPU consumption kill job
    self.nullCPULimit = fromSection('/NullCPUCountLimit', 5)

    # Never check more often than the configured minimum
    if self.checkingTime < self.minCheckingTime:
      clampMessage = 'Requested CheckingTime of %s setting to %s seconds (minimum)' % (
          self.checkingTime, self.minCheckingTime)
      self.log.info(clampMessage)
      self.checkingTime = self.minCheckingTime

    # The time left is returned in seconds @ 250 SI00 = 1 HS06,
    # the self.checkingTime and self.pollingTime are in seconds,
    # thus they need to be multiplied by a large enough factor
    self.fineTimeLeftLimit = fromSection('/TimeLeftLimit', 150 * self.pollingTime)
    self.scaleFactor = gConfig.getValue('/LocalSite/CPUScalingFactor', 1.0)

    return S_OK()
示例#8
0
  def execute( self ):
    """ The main agent execution method.

    Refreshes the agent options, derives the stalled/failed thresholds from
    the JobWrapper checking time and then runs, in order, the
    stalled-marking, stalled-failing, completed-failing, submitting-failing
    and stuck-kicking steps. A failing step is only logged; it does not stop
    the following steps.

    :return: S_OK with a completion message, or S_ERROR when the
             WorkloadManagement system instance cannot be obtained
    """
    self.log.verbose( 'Waking up Stalled Job Agent' )

    wmsInstance = getSystemInstance( 'WorkloadManagement' )
    if not wmsInstance:
      return S_ERROR( 'Can not get the WorkloadManagement system instance' )
    jobWrapperPath = cfgPath( 'Systems', 'WorkloadManagement', wmsInstance, 'JobWrapper' )

    stalledCycles = self.am_getOption( 'StalledTimeHours', 2 )
    failedCycles = self.am_getOption( 'FailedTimeHours', 6 )
    self.stalledJobsToleranceTime = self.am_getOption( 'StalledJobsToleranceTime', 0 )

    self.submittingTime = self.am_getOption( 'SubmittingTime', self.submittingTime )
    self.matchedTime = self.am_getOption( 'MatchedTime', self.matchedTime )
    self.rescheduledTime = self.am_getOption( 'RescheduledTime', self.rescheduledTime )
    self.completedTime = self.am_getOption( 'CompletedTime', self.completedTime )

    self.log.verbose( 'StalledTime = %s cycles' % ( stalledCycles ) )
    self.log.verbose( 'FailedTime = %s cycles' % ( failedCycles ) )

    # The cycle length is the configured checking time, but never less than
    # the configured minimum checking time.
    checkingTime = gConfig.getValue( cfgPath( jobWrapperPath, 'CheckingTime' ), 30 * 60 )
    minCheckingTime = gConfig.getValue( cfgPath( jobWrapperPath, 'MinCheckingTime' ), 20 * 60 )
    cycleLength = max( checkingTime, minCheckingTime )

    # Add half cycle to avoid race conditions
    stalledTime = cycleLength * ( stalledCycles + 0.5 )
    failedTime = cycleLength * ( failedCycles + 0.5 )

    # Each step is wrapped in a lambda so that it is only looked up and
    # called when its turn comes, exactly as in a sequential listing.
    steps = (
      ( lambda: self.__markStalledJobs( stalledTime ), 'Failed to detect stalled jobs' ),
      # Note, jobs will be revived automatically during the heartbeat signal phase and
      # subsequent status changes will result in jobs not being selected by the
      # stalled job agent.
      ( lambda: self.__failStalledJobs( failedTime ), 'Failed to process stalled jobs' ),
      ( lambda: self.__failCompletedJobs(), 'Failed to process completed jobs' ),
      ( lambda: self.__failSubmittingJobs(), 'Failed to process jobs being submitted' ),
      ( lambda: self.__kickStuckJobs(), 'Failed to kick stuck jobs' ),
    )
    for runStep, failureMessage in steps:
      outcome = runStep()
      if not outcome['OK']:
        self.log.error( failureMessage, outcome['Message'] )

    return S_OK( 'Stalled Job Agent cycle complete' )
示例#9
0
    def execute(self):
        """Run one cycle of the Stalled Job Agent.

        Refreshes the agent options, derives the stalled/failed thresholds
        from the JobWrapper checking time and then runs, in order, the
        stalled-marking, stalled-failing, completed-failing and
        stuck-kicking steps. A failing step is only logged; it does not
        stop the following steps.

        :return: S_OK with a completion message, or S_ERROR when the
                 WorkloadManagement system instance cannot be obtained
        """
        self.log.verbose("Waking up Stalled Job Agent")

        wmsInstance = getSystemInstance("WorkloadManagement")
        if not wmsInstance:
            return S_ERROR("Can not get the WorkloadManagement system instance")
        jobWrapperPath = cfgPath("Systems", "WorkloadManagement", wmsInstance, "JobWrapper")

        stalledCycles = self.am_getOption("StalledTimeHours", 2)
        failedCycles = self.am_getOption("FailedTimeHours", 6)

        self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
        self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime)
        self.completedTime = self.am_getOption("CompletedTime", self.completedTime)

        self.log.verbose("StalledTime = %s cycles" % (stalledCycles))
        self.log.verbose("FailedTime = %s cycles" % (failedCycles))

        # The cycle length is the configured checking time, but never less
        # than the configured minimum checking time.
        checkingTime = gConfig.getValue(cfgPath(jobWrapperPath, "CheckingTime"), 30 * 60)
        minCheckingTime = gConfig.getValue(cfgPath(jobWrapperPath, "MinCheckingTime"), 20 * 60)
        cycleLength = max(checkingTime, minCheckingTime)

        # Add half cycle to avoid race conditions
        stalledTime = cycleLength * (stalledCycles + 0.5)
        failedTime = cycleLength * (failedCycles + 0.5)

        # Each step is wrapped in a lambda so that it is only looked up and
        # called when its turn comes, exactly as in a sequential listing.
        steps = (
            (lambda: self.__markStalledJobs(stalledTime), "Failed to detect stalled jobs"),
            # Note, jobs will be revived automatically during the heartbeat signal phase and
            # subsequent status changes will result in jobs not being selected by the
            # stalled job agent.
            (lambda: self.__failStalledJobs(failedTime), "Failed to process stalled jobs"),
            (lambda: self.__failCompletedJobs(), "Failed to process completed jobs"),
            (lambda: self.__kickStuckJobs(), "Failed to kick stuck jobs"),
        )
        for runStep, failureMessage in steps:
            outcome = runStep()
            if not outcome["OK"]:
                self.log.error(failureMessage, outcome["Message"])

        return S_OK("Stalled Job Agent cycle complete")
示例#10
0
  def execute(self):
    """ The main agent execution method.

    Refreshes the agent options, recomputes ``self.stalledTime`` and
    ``self.failedTime`` from the JobWrapper checking time, then:

    1. queues Running/Completing jobs with an old heartbeat for the
       ``_markStalledJobs`` check,
    2. queues Stalled jobs for the ``_failStalledJobs`` check,
    3. queues Failed, not yet accounted stalled jobs for accounting,
    4. fails jobs stuck in submission (not threaded),
    5. kicks stuck jobs (not threaded).

    A failing job selection or step is logged and the cycle continues with
    the next step.

    :return: S_OK() at the end of the cycle, S_ERROR when the
             WorkloadManagement system instance cannot be obtained
    """
    self.log.debug('Waking up Stalled Job Agent')

    # getting parameters
    wms_instance = getSystemInstance('WorkloadManagement')
    if not wms_instance:
      return S_ERROR('Can not get the WorkloadManagement system instance')
    wrapperSection = cfgPath('Systems', 'WorkloadManagement', wms_instance, 'JobWrapper')

    stalledTime = self.am_getOption('StalledTimeHours', 2)
    failedTime = self.am_getOption('FailedTimeHours', 6)
    self.stalledJobsTolerantSites = self.am_getOption('StalledJobsTolerantSites', [])
    self.stalledJobsToleranceTime = self.am_getOption('StalledJobsToleranceTime', 0)

    self.submittingTime = self.am_getOption('SubmittingTime', self.submittingTime)
    self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
    self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime)

    self.log.verbose('', 'StalledTime = %s cycles' % (stalledTime))
    self.log.verbose('', 'FailedTime = %s cycles' % (failedTime))

    # The cycle length is the configured checking time, but never less than
    # the configured minimum checking time.
    watchdogCycle = gConfig.getValue(cfgPath(wrapperSection, 'CheckingTime'), 30 * 60)
    watchdogCycle = max(watchdogCycle, gConfig.getValue(cfgPath(wrapperSection, 'MinCheckingTime'), 20 * 60))

    # Add half cycle to avoid race conditions
    self.stalledTime = int(watchdogCycle * (stalledTime + 0.5))
    self.failedTime = int(watchdogCycle * (failedTime + 0.5))

    self.minorStalledStatuses = (
        JobMinorStatus.STALLED_PILOT_NOT_RUNNING,
        'Stalling for more than %d sec' % self.failedTime)

    # Now we are getting what's going to be checked

    # 1) Queueing the jobs that might be marked Stalled
    # This is the minimum time we wait for declaring a job Stalled, therefore it is safe
    checkTime = dateTime() - self.stalledTime * second
    checkedStatuses = [JobStatus.RUNNING, JobStatus.COMPLETING]
    # Only get jobs whose HeartBeat is older than the stalledTime
    result = self.jobDB.selectJobs({'Status': checkedStatuses},
                                   older=checkTime, timeStamp='HeartBeatTime')
    if not result['OK']:
      self.log.error("Issue selecting %s jobs" % ' & '.join(checkedStatuses), result['Message'])
    # 'Value' is only read on success: a failed result carries no 'Value'
    # key, so reading it unconditionally would raise and abort the cycle.
    elif result['Value']:
      jobs = sorted(result['Value'])
      self.log.info('%s jobs will be checked for being stalled' % ' & '.join(checkedStatuses),
                    '(n=%d, heartbeat before %s)' % (len(jobs), str(checkTime)))
      for job in jobs:
        self.jobsQueue.put('%s:_markStalledJobs' % job)

    # 2) Queueing the Stalled jobs that might be marked Failed
    result = self.jobDB.selectJobs({'Status': JobStatus.STALLED})
    if not result['OK']:
      self.log.error("Issue selecting Stalled jobs", result['Message'])
    elif result['Value']:
      jobs = sorted(result['Value'])
      self.log.info('Jobs Stalled will be checked for failure', '(n=%d)' % len(jobs))
      for job in jobs:
        self.jobsQueue.put('%s:_failStalledJobs' % job)

    # 3) Send accounting
    for minor in self.minorStalledStatuses:
      result = self.jobDB.selectJobs({'Status': JobStatus.FAILED, 'MinorStatus': minor, 'AccountedFlag': 'False'})
      if not result['OK']:
        self.log.error("Issue selecting jobs for accounting", result['Message'])
      elif result['Value']:
        jobs = result['Value']
        self.log.info('Stalled jobs will be Accounted', '(n=%d)' % (len(jobs)))
        for job in jobs:
          self.jobsQueue.put('%s:__sendAccounting' % job)

    # From here on we don't use the threads

    # 4) Fail submitting jobs
    result = self._failSubmittingJobs()
    if not result['OK']:
      self.log.error('Failed to process jobs being submitted', result['Message'])

    # 5) Kick stuck jobs
    result = self._kickStuckJobs()
    if not result['OK']:
      self.log.error('Failed to kick stuck jobs', result['Message'])

    return S_OK()