Пример #1
0
  def test_getTimeLeft( self ):
#     for batch, retValue in [( 'LSF', LSF_ReturnValue ), ( 'SGE', SGE_ReturnValue )]:

    for batch, retValue in [( 'LSF', LSF_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      timeMock = MagicMock()

      tl = TimeLeft()
#      res = tl.getTimeLeft()
#      self.assertEqual( res['OK'], True )

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' %
                                batchSystemName, globals(), locals(), [batchSystemName] )
      batchStr = 'batchPlugin.%s()' % ( batchSystemName )
      tl.batchPlugin = eval( batchStr )

      tl.scaleFactor = 10.0
      tl.normFactor = 10.0
      tl.batchPlugin.bin = '/usr/bin'
      tl.batchPlugin.hostNorm = 10.0
      tl.batchPlugin.cpuLimit = 1000
      tl.batchPlugin.wallClockLimit = 1000

      with patch( "DIRAC.Core.Utilities.TimeLeft.LSFTimeLeft.runCommand", new=rcMock ):
        with patch( "DIRAC.Core.Utilities.TimeLeft.LSFTimeLeft.time", new=timeMock ):
          res = tl.getTimeLeft()
          self.assertEqual( res['OK'], True, res.get('Message', '') )

    for batch, retValue in [( 'SGE', SGE_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      tl = TimeLeft()
#       res = tl.getTimeLeft()
#       self.assertFalse( res['OK'] )

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' %
                                batchSystemName, globals(), locals(), [batchSystemName] )
      batchStr = 'batchPlugin.%s()' % ( batchSystemName )
      tl.batchPlugin = eval( batchStr )

      tl.scaleFactor = 10.0
      tl.normFactor = 10.0
      tl.batchPlugin.bin = '/usr/bin'
      tl.batchPlugin.hostNorm = 10.0
      tl.batchPlugin.cpuLimit = 1000
      tl.batchPlugin.wallClockLimit = 1000

      res = tl.getTimeLeft()
      self.assert_( res['OK'] )
      self.assertEqual( res['Value'], 9400.0 )
Пример #2
0
    def test_getTimeLeft(self):
        #     for batch, retValue in [( 'LSF', LSF_ReturnValue ), ( 'SGE', SGE_ReturnValue )]:

        for batch, retValue in [('LSF', LSF_ReturnValue)]:
            self.tl = importlib.import_module(
                "DIRAC.Core.Utilities.TimeLeft.TimeLeft")
            rcMock = MagicMock()
            rcMock.return_value = S_OK(retValue)
            self.tl.runCommand = rcMock

            tl = TimeLeft()
            #      res = tl.getTimeLeft()
            #      self.assertEqual( res['OK'], True )

            batchSystemName = '%sTimeLeft' % batch
            batchPlugin = __import__(
                'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName,
                globals(), locals(), [batchSystemName])
            batchStr = 'batchPlugin.%s()' % (batchSystemName)
            tl.batchPlugin = eval(batchStr)

            tl.scaleFactor = 10.0
            tl.normFactor = 10.0
            tl.batchPlugin.bin = '/usr/bin'
            tl.batchPlugin.hostNorm = 10.0
            tl.batchPlugin.cpuLimit = 1000
            tl.batchPlugin.wallClockLimit = 1000

            res = tl.getTimeLeft()
            self.assertEqual(res['OK'], True)

        for batch, retValue in [('SGE', SGE_ReturnValue)]:
            self.tl = importlib.import_module(
                "DIRAC.Core.Utilities.TimeLeft.TimeLeft")
            rcMock = MagicMock()
            rcMock.return_value = S_OK(retValue)
            self.tl.runCommand = rcMock

            tl = TimeLeft()
            #       res = tl.getTimeLeft()
            #       self.assertFalse( res['OK'] )

            batchSystemName = '%sTimeLeft' % batch
            batchPlugin = __import__(
                'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName,
                globals(), locals(), [batchSystemName])
            batchStr = 'batchPlugin.%s()' % (batchSystemName)
            tl.batchPlugin = eval(batchStr)

            tl.scaleFactor = 10.0
            tl.normFactor = 10.0
            tl.batchPlugin.bin = '/usr/bin'
            tl.batchPlugin.hostNorm = 10.0
            tl.batchPlugin.cpuLimit = 1000
            tl.batchPlugin.wallClockLimit = 1000

            res = tl.getTimeLeft()
            self.assert_(res['OK'])
            self.assertEqual(res['Value'], 9400.0)
Пример #3
0
class Watchdog(object):

  #############################################################################
  def __init__(self, pid, exeThread, spObject, jobCPUTime,
               memoryLimit=0, processors=1, systemFlag='linux', jobArgs={}):
    """ Constructor, takes system flag as argument.
    """
    self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800))  # 30 minutes
    self.stopSigFinishSeconds = int(jobArgs.get('StopSigFinishSeconds', 1800))  # 30 minutes
    self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2))  # SIGINT
    self.stopSigRegex = jobArgs.get('StopSigRegex', None)
    self.stopSigSent = False

    self.log = gLogger.getSubLogger("Watchdog")
    self.systemFlag = systemFlag
    self.exeThread = exeThread
    self.wrapperPID = pid
    self.appPID = self.exeThread.getCurrentPID()
    self.spObject = spObject
    self.jobCPUTime = jobCPUTime
    self.memoryLimit = memoryLimit
    self.calibration = 0
    self.initialValues = {}
    self.parameters = {}
    self.peekFailCount = 0
    self.peekRetry = 5
    self.profiler = Profiler(pid)
    self.checkError = ''
    self.currentStats = {}
    self.initialized = False
    self.count = 0

    # defaults
    self.testWallClock = 1
    self.testDiskSpace = 1
    self.testLoadAvg = 1
    self.maxWallClockTime = 3 * 24 * 60 * 60
    self.testCPUConsumed = 1
    self.testCPULimit = 0
    self.testMemoryLimit = 0
    self.testTimeLeft = 1
    self.pollingTime = 10  # 10 seconds
    self.checkingTime = 30 * 60  # 30 minute period
    self.minCheckingTime = 20 * 60  # 20 mins
    self.wallClockCheckSeconds = 5 * 60  # 5 minutes
    self.maxWallClockTime = 3 * 24 * 60 * 60  # e.g. 4 days
    self.jobPeekFlag = 1  # on / off
    self.minDiskSpace = 10  # MB
    self.loadAvgLimit = 1000  # > 1000 and jobs killed
    self.sampleCPUTime = 30 * 60  # e.g. up to 20mins sample
    self.jobCPUMargin = 20  # %age buffer before killing job
    self.minCPUWallClockRatio = 5  # ratio %age
    self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
    self.checkCount = 0
    self.wallClockCheckCount = 0
    self.nullCPUCount = 0

    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    self.scaleFactor = 1.0
    self.processors = processors

  #############################################################################
  def initialize(self, loops=0):
    """ Watchdog initialization.
    """
    if self.initialized:
      self.log.info('Watchdog already initialized')
      return S_OK()
    else:
      self.initialized = True

    setup = gConfig.getValue('/DIRAC/Setup', '')
    if not setup:
      return S_ERROR('Can not get the DIRAC Setup value')
    wms_instance = getSystemInstance("WorkloadManagement")
    if not wms_instance:
      return S_ERROR('Can not get the WorkloadManagement system instance')
    self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance

    self.maxcount = loops
    self.log.verbose('Watchdog initialization')
    self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag))
    # Test control flags
    self.testWallClock = gConfig.getValue(self.section + '/CheckWallClockFlag', 1)
    self.testDiskSpace = gConfig.getValue(self.section + '/CheckDiskSpaceFlag', 1)
    self.testLoadAvg = gConfig.getValue(self.section + '/CheckLoadAvgFlag', 1)
    self.testCPUConsumed = gConfig.getValue(self.section + '/CheckCPUConsumedFlag', 1)
    self.testCPULimit = gConfig.getValue(self.section + '/CheckCPULimitFlag', 0)
    self.testMemoryLimit = gConfig.getValue(self.section + '/CheckMemoryLimitFlag', 0)
    self.testTimeLeft = gConfig.getValue(self.section + '/CheckTimeLeftFlag', 1)
    # Other parameters
    self.pollingTime = gConfig.getValue(self.section + '/PollingTime', 10)  # 10 seconds
    self.checkingTime = gConfig.getValue(self.section + '/CheckingTime', 30 * 60)  # 30 minute period
    self.minCheckingTime = gConfig.getValue(self.section + '/MinCheckingTime', 20 * 60)  # 20 mins
    self.maxWallClockTime = gConfig.getValue(self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60)  # e.g. 4 days
    self.jobPeekFlag = gConfig.getValue(self.section + '/JobPeekFlag', 1)  # on / off
    self.minDiskSpace = gConfig.getValue(self.section + '/MinDiskSpace', 10)  # MB
    self.loadAvgLimit = gConfig.getValue(self.section + '/LoadAverageLimit', 1000)  # > 1000 and jobs killed
    self.sampleCPUTime = gConfig.getValue(self.section + '/CPUSampleTime', 30 * 60)  # e.g. up to 20mins sample
    self.jobCPUMargin = gConfig.getValue(self.section + '/JobCPULimitMargin', 20)  # %age buffer before killing job
    self.minCPUWallClockRatio = gConfig.getValue(self.section + '/MinCPUWallClockRatio', 5)  # ratio %age
    # After 5 sample times return null CPU consumption kill job
    self.nullCPULimit = gConfig.getValue(self.section + '/NullCPUCountLimit', 5)
    if self.checkingTime < self.minCheckingTime:
      self.log.info(
          'Requested CheckingTime of %s setting to %s seconds (minimum)' %
          (self.checkingTime, self.minCheckingTime))
      self.checkingTime = self.minCheckingTime

    # The time left is returned in seconds @ 250 SI00 = 1 HS06,
    # the self.checkingTime and self.pollingTime are in seconds,
    # thus they need to be multiplied by a large enough factor
    self.fineTimeLeftLimit = gConfig.getValue(self.section + '/TimeLeftLimit', 150 * self.pollingTime)
    self.scaleFactor = gConfig.getValue('/LocalSite/CPUScalingFactor', 1.0)

    return S_OK()

  def run(self):
    """ The main watchdog execution method
    """

    result = self.initialize()
    if not result['OK']:
      self.log.always('Can not start watchdog for the following reason')
      self.log.always(result['Message'])
      return result

    try:
      while True:
        self.log.debug('Starting watchdog loop # %d' % self.count)
        start_cycle_time = time.time()
        result = self.execute()
        exec_cycle_time = time.time() - start_cycle_time
        if not result['OK']:
          self.log.error("Watchdog error during execution", result['Message'])
          break
        elif result['Value'] == "Ended":
          break
        self.count += 1
        if exec_cycle_time < self.pollingTime:
          time.sleep(self.pollingTime - exec_cycle_time)
      return S_OK()
    except Exception:
      self.log.exception()
      return S_ERROR('Exception')

  #############################################################################
  def execute(self):
    """ The main agent execution method of the Watchdog.
    """

    if not self.exeThread.isAlive():
      self.__getUsageSummary()
      self.log.info('Process to monitor has completed, Watchdog will exit.')
      return S_OK("Ended")

    # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in JDL
    if not self.stopSigSent and self.stopSigRegex is not None and (
            time.time() - self.initialValues['StartTime']) > self.wallClockCheckSeconds * self.wallClockCheckCount:
      self.wallClockCheckCount += 1
      self._performWallClockChecks()

    if self.littleTimeLeft:
      # if we have gone over enough iterations query again
      if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1:
        self.checkError = 'Job has reached the CPU limit of the queue'
        self.log.error(self.checkError, self.timeLeft)
        self.__killRunningThread()
        return S_OK()
      else:
        self.littleTimeLeftCount -= 1

    # Note: need to poll regularly to see if the thread is alive
    #      but only perform checks with a certain frequency
    if (time.time() - self.initialValues['StartTime']) > self.checkingTime * self.checkCount:
      self.checkCount += 1
      result = self._performChecks()
      if not result['OK']:
        self.log.warn('Problem during recent checks')
        self.log.warn(result['Message'])
      return S_OK()
    else:
      # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount))
      return S_OK()

  #############################################################################
  def _performWallClockChecks(self):
    """Watchdog performs the wall clock checks based on MJF. Signals are sent
       to processes if we need to stop, but function always returns S_OK()
    """
    mjf = MJF.MJF()

    try:
      wallClockSecondsLeft = mjf.getWallClockSecondsLeft()
    except Exception as e:
      # Just stop if we can't get the wall clock seconds left
      return S_OK()

    jobstartSeconds = mjf.getIntJobFeature('jobstart_secs')
    if jobstartSeconds is None:
      # Just stop if we don't know when the job started
      return S_OK()

    if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and \
       (wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds):
      # Need to send the signal! Assume it works to avoid sending the signal more than once
      self.log.info('Sending signal %d to JobWrapper children' % self.stopSigNumber)
      self.stopSigSent = True

      try:
        for childPid in getChildrenPIDs(self.wrapperPID):
          try:
            cmdline = open('/proc/%d/cmdline' % childPid, 'r').read().replace('\0', ' ').strip()
          except IOError:
            # Process gone away? Not running on Linux? Skip anyway
            continue

          if re.search(self.stopSigRegex, cmdline) is not None:
            self.log.info(
                'Sending signal %d to process ID %d, cmdline = "%s"' %
                (self.stopSigNumber, childPid, cmdline))
            os.kill(childPid, self.stopSigNumber)

      except Exception as e:
        self.log.error('Failed to send signals to JobWrapper children! (%s)' % str(e))

    return S_OK()

  #############################################################################
  def _performChecks(self):
    """The Watchdog checks are performed at a different period to the checking of the
       application thread and correspond to the checkingTime.
    """
    self.log.verbose('------------------------------------')
    self.log.verbose('Checking loop starts for Watchdog')
    heartBeatDict = {}

    msg = ''

    loadAvg = float(os.getloadavg()[0])
    msg += 'LoadAvg: %d ' % loadAvg
    heartBeatDict['LoadAverage'] = loadAvg
    if 'LoadAverage' not in self.parameters:
      self.parameters['LoadAverage'] = []
    self.parameters['LoadAverage'].append(loadAvg)

    memoryUsed = self.getMemoryUsed()
    msg += 'MemUsed: %.1f kb ' % (memoryUsed)
    heartBeatDict['MemoryUsed'] = memoryUsed
    if 'MemoryUsed' not in self.parameters:
      self.parameters['MemoryUsed'] = []
    self.parameters['MemoryUsed'].append(memoryUsed)

    result = self.profiler.getAllProcessData(withChildren=True)
    if result['OK']:
      vsize = result['Value']['stats']['vSizeUsage'] * 1024.
      rss = result['Value']['stats']['memoryUsage'] * 1024.
      heartBeatDict['Vsize'] = vsize
      heartBeatDict['RSS'] = rss
      self.parameters.setdefault('Vsize', [])
      self.parameters['Vsize'].append(vsize)
      self.parameters.setdefault('RSS', [])
      self.parameters['RSS'].append(rss)
      msg += "Job Vsize: %.1f kb " % vsize
      msg += "Job RSS: %.1f kb " % rss
    result = self.getDiskSpace()
    if not result['OK']:
      self.log.warn("Could not establish DiskSpace", result['Message'])
    else:
      msg += 'DiskSpace: %.1f MB ' % (result['Value'])
    if 'DiskSpace' not in self.parameters:
      self.parameters['DiskSpace'] = []
    if result['OK']:
      self.parameters['DiskSpace'].append(result['Value'])
      heartBeatDict['AvailableDiskSpace'] = result['Value']

    cpu = self.__getCPU()
    if not cpu['OK']:
      msg += 'CPU: ERROR '
      hmsCPU = 0
    else:
      cpu = cpu['Value']
      msg += 'CPU: %s (h:m:s) ' % (cpu)
      if 'CPUConsumed' not in self.parameters:
        self.parameters['CPUConsumed'] = []
      self.parameters['CPUConsumed'].append(cpu)
      hmsCPU = cpu
      rawCPU = self.__convertCPUTime(hmsCPU)
      if rawCPU['OK']:
        heartBeatDict['CPUConsumed'] = rawCPU['Value']

    result = self.__getWallClockTime()
    if not result['OK']:
      self.log.warn("Failed determining wall clock time", result['Message'])
    else:
      msg += 'WallClock: %.2f s ' % (result['Value'])
      self.parameters.setdefault('WallClockTime', list()).append(result['Value'])
      heartBeatDict['WallClockTime'] = result['Value']
    self.log.info(msg)

    result = self._checkProgress()
    if not result['OK']:
      self.checkError = result['Message']
      self.log.warn(self.checkError)

      if self.jobPeekFlag:
        result = self.__peek()
        if result['OK']:
          outputList = result['Value']
          size = len(outputList)
          self.log.info('Last %s lines of available application output:' % (size))
          self.log.info('================START================')
          for line in outputList:
            self.log.info(line)

          self.log.info('=================END=================')

      self.__killRunningThread()
      return S_OK()

    recentStdOut = 'None'
    if self.jobPeekFlag:
      result = self.__peek()
      if result['OK']:
        outputList = result['Value']
        size = len(outputList)
        recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % (size, Time.dateTime())
        border = '=' * len(recentStdOut)
        cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % (hmsCPU)
        if self.timeLeft:
          cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft
        recentStdOut = '\n%s\n%s\n%s\n%s\n' % (border, recentStdOut, cpuTotal, border)
        self.log.info(recentStdOut)
        for line in outputList:
          self.log.info(line)
          recentStdOut += line + '\n'
      else:
        recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread'
        self.log.info(recentStdOut)
        self.peekFailCount += 1
        if self.peekFailCount > self.peekRetry:
          self.jobPeekFlag = 0
          self.log.warn('Turning off job peeking for remainder of execution')

    if 'JOBID' not in os.environ:
      self.log.info('Running without JOBID so parameters will not be reported')
      return S_OK()

    jobID = os.environ['JOBID']
    staticParamDict = {'StandardOutput': recentStdOut}
    self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict)
    return S_OK('Watchdog checking cycle complete')

  #############################################################################
  def __getCPU(self):
    """Uses os.times() to get CPU time and returns HH:MM:SS after conversion.
    """
    try:
      result = self.profiler.getAllProcessData(withChildren=True)
      if not result['OK']:
        self.log.warn('Problem while checking consumed CPU')
        return result
      cpuTime = result['Value']
      if cpuTime:
        cpuTimeTotal = cpuTime['stats']['cpuUsageSystem'] + cpuTime['stats']['cpuUsageUser']
        self.log.verbose("Raw CPU time consumed (s) = %s" % (cpuTimeTotal))
        return self.__getCPUHMS(cpuTimeTotal)
      else:
        self.log.error("CPU time consumed found to be 0")
        return S_ERROR()

    except Exception as e:
      self.log.warn('Could not determine CPU time consumed with exception')
      self.log.exception(e)
      return S_ERROR("Could not determine CPU time consumed with exception")

  #############################################################################
  def __getCPUHMS(self, cpuTime):
    mins, secs = divmod(cpuTime, 60)
    hours, mins = divmod(mins, 60)
    humanTime = '%02d:%02d:%02d' % (hours, mins, secs)
    self.log.verbose('Human readable CPU time is: %s' % humanTime)
    return S_OK(humanTime)

  #############################################################################
  def __interpretControlSignal(self, signalDict):
    """This method is called whenever a signal is sent via the result of
       sending a sign of life.
    """
    self.log.info('Received control signal')
    if isinstance(signalDict, dict):
      if 'Kill' in signalDict:
        self.log.info('Received Kill signal, stopping job via control signal')
        self.checkError = 'Received Kill signal'
        self.__killRunningThread()
      else:
        self.log.info('The following control signal was sent but not understood by the watchdog:')
        self.log.info(signalDict)
    else:
      self.log.info('Expected dictionary for control signal, received:\n%s' % (signalDict))

    return S_OK()

  #############################################################################
  def _checkProgress(self):
    """This method calls specific tests to determine whether the job execution
       is proceeding normally.  CS flags can easily be added to add or remove
       tests via central configuration.
    """
    report = ''

    if self.testWallClock:
      result = self.__checkWallClockTime()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'WallClock: OK, '
    else:
      report += 'WallClock: NA,'

    if self.testDiskSpace:
      result = self.__checkDiskSpace()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'DiskSpace: OK, '
    else:
      report += 'DiskSpace: NA,'

    if self.testLoadAvg:
      result = self.__checkLoadAverage()
      if not result['OK']:
        self.log.warn("Check of load average failed, but won't fail because of that: %s" % result['Message'])
        report += 'LoadAverage: ERROR, '
        return S_OK()
      report += 'LoadAverage: OK, '
    else:
      report += 'LoadAverage: NA,'

    if self.testCPUConsumed:
      result = self.__checkCPUConsumed()
      if not result['OK']:
        return result
      report += 'CPUConsumed: OK, '
    else:
      report += 'CPUConsumed: NA, '

    if self.testCPULimit:
      result = self.__checkCPULimit()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'CPULimit OK, '
    else:
      report += 'CPULimit: NA, '

    if self.testTimeLeft:
      self.__timeLeft()
      if self.timeLeft:
        report += 'TimeLeft: OK'
    else:
      report += 'TimeLeft: NA'

    if self.testMemoryLimit:
      result = self.__checkMemoryLimit()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'MemoryLimit OK, '
    else:
      report += 'MemoryLimit: NA, '

    self.log.info(report)
    return S_OK('All enabled checks passed')

  #############################################################################
  def __checkCPUConsumed(self):
    """ Checks whether the CPU consumed by application process is reasonable. This
        method will report stalled jobs to be killed.
    """
    self.log.info("Checking CPU Consumed")

    if 'WallClockTime' not in self.parameters:
      return S_ERROR('Missing WallClockTime info')
    if 'CPUConsumed' not in self.parameters:
      return S_ERROR('Missing CPUConsumed info')

    wallClockTime = self.parameters['WallClockTime'][-1]
    if wallClockTime < self.sampleCPUTime:
      self.log.info("Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % (
          wallClockTime,
          self.sampleCPUTime))
      return S_OK()

    intervals = max(1, int(self.sampleCPUTime / self.checkingTime))
    if len(self.parameters['CPUConsumed']) < intervals + 1:
      self.log.info("Not enough snapshots to calculate, there are %s and we need %s" %
                    (len(self.parameters['CPUConsumed']), intervals + 1))
      return S_OK()

    wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters['WallClockTime'][-1 - intervals]
    try:
      cpuTime = self.__convertCPUTime(self.parameters['CPUConsumed'][-1])['Value']
      # For some reason, some times the CPU consumed estimation returns 0
      # if cpuTime == 0:
      #   return S_OK()
      cpuTime -= self.__convertCPUTime(self.parameters['CPUConsumed'][-1 - intervals])['Value']
      if cpuTime < 0:
        self.log.warn('Consumed CPU time negative, something wrong may have happened, ignore')
        return S_OK()
      if wallClockTime <= 0:
        self.log.warn('Wallclock time should not be negative or zero, Ignore')
        return S_OK()

      ratio = (cpuTime / wallClockTime) * 100.

      self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio)
      # in case of error cpuTime might be 0, exclude this
      if ratio < self.minCPUWallClockRatio:
        if os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK'):
          self.log.info('N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload')
          return S_OK()
        self.log.info("Job is stalled!")
        return S_ERROR('Watchdog identified this job as stalled')
    except Exception as e:
      self.log.error("Cannot convert CPU consumed from string to int", str(e))

    return S_OK()

  #############################################################################

  def __convertCPUTime(self, cputime):
    """ Method to convert the CPU time as returned from the Watchdog
        instances to the equivalent DIRAC normalized CPU time to be compared
        to the Job CPU requirement.
    """
    cpuValue = 0
    cpuHMS = cputime.split(':')
    # for i in xrange( len( cpuHMS ) ):
    #   cpuHMS[i] = cpuHMS[i].replace( '00', '0' )

    try:
      hours = float(cpuHMS[0]) * 60 * 60
      mins = float(cpuHMS[1]) * 60
      secs = float(cpuHMS[2])
      cpuValue = float(hours + mins + secs)
    except Exception as x:
      self.log.warn(str(x))
      return S_ERROR('Could not calculate CPU time')

    # Normalization to be implemented
    normalizedCPUValue = cpuValue

    result = S_OK()
    result['Value'] = normalizedCPUValue
    self.log.debug('CPU value %s converted to %s' % (cputime, normalizedCPUValue))
    return result

  #############################################################################

  def __checkCPULimit(self):
    """ Checks that the job has consumed more than the job CPU requirement
        (plus a configurable margin) and kills them as necessary.
    """
    consumedCPU = 0
    if 'CPUConsumed' in self.parameters:
      consumedCPU = self.parameters['CPUConsumed'][-1]

    consumedCPUDict = self.__convertCPUTime(consumedCPU)
    if consumedCPUDict['OK']:
      currentCPU = consumedCPUDict['Value']
    else:
      return S_OK('Not possible to determine current CPU consumed')

    if consumedCPU:
      limit = self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100)
      cpuConsumed = float(currentCPU)
      if cpuConsumed > limit:
        self.log.info(
            'Job has consumed more than the specified CPU limit with an additional %s%% margin' %
            (self.jobCPUMargin))
        return S_ERROR('Job has exceeded maximum CPU time limit')
      else:
        return S_OK('Job within CPU limit')
    elif not currentCPU:
      self.log.verbose('Both initial and current CPU consumed are null')
      return S_OK('CPU consumed is not measurable yet')
    else:
      return S_OK('Not possible to determine CPU consumed')

  def __checkMemoryLimit(self):
    """ Checks that the job memory consumption is within a limit
    """
    if 'Vsize' in self.parameters:
      vsize = self.parameters['Vsize'][-1]

    if vsize and self.memoryLimit:
      if vsize > self.memoryLimit:
        vsize = vsize
        # Just a warning for the moment
        self.log.warn("Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % (vsize, self.memoryLimit))

    return S_OK()

  #############################################################################
  def __checkDiskSpace(self):
    """Checks whether the CS defined minimum disk space is available.
    """
    if 'DiskSpace' in self.parameters:
      availSpace = self.parameters['DiskSpace'][-1]
      if availSpace >= 0 and availSpace < self.minDiskSpace:
        self.log.info('Not enough local disk space for job to continue, defined in CS as %s MB' % (self.minDiskSpace))
        return S_ERROR('Job has insufficient disk space to continue')
      else:
        return S_OK('Job has enough disk space available')
    else:
      return S_ERROR('Available disk space could not be established')

  #############################################################################
  def __checkWallClockTime(self):
    """Checks whether the job has been running for the CS defined maximum
       wall clock time.
    """
    if 'StartTime' in self.initialValues:
      startTime = self.initialValues['StartTime']
      if time.time() - startTime > self.maxWallClockTime:
        self.log.info('Job has exceeded maximum wall clock time of %s seconds' % (self.maxWallClockTime))
        return S_ERROR('Job has exceeded maximum wall clock time')
      else:
        return S_OK('Job within maximum wall clock time')
    else:
      return S_ERROR('Job start time could not be established')

  #############################################################################
  def __checkLoadAverage(self):
    """Checks whether the CS defined maximum load average is exceeded.
    """
    if 'LoadAverage' in self.parameters:
      loadAvg = self.parameters['LoadAverage'][-1]
      if loadAvg > float(self.loadAvgLimit):
        self.log.info('Maximum load average exceeded, defined in CS as %s ' % (self.loadAvgLimit))
        return S_ERROR('Job exceeded maximum load average')
      else:
        return S_OK('Job running with normal load average')
    else:
      return S_ERROR('Job load average not established')

  #############################################################################
  def __peek(self):
    """ Uses ExecutionThread.getOutput() method to obtain standard output
        from running thread via subprocess callback function.
    """
    result = self.exeThread.getOutput()
    if not result['OK']:
      self.log.warn('Could not obtain output from running application thread')
      self.log.warn(result['Message'])

    return result

  #############################################################################
  def calibrate(self):
    """ The calibrate method obtains the initial values for system memory and load
        and calculates the margin for error for the rest of the Watchdog cycle.
    """
    self.__getWallClockTime()
    self.parameters['WallClockTime'] = []

    cpuConsumed = self.__getCPU()
    if not cpuConsumed['OK']:
      self.log.warn("Could not establish CPU consumed, setting to 0.0")
      cpuConsumed = 0.0
    else:
      cpuConsumed = cpuConsumed['Value']

    self.initialValues['CPUConsumed'] = cpuConsumed
    self.parameters['CPUConsumed'] = []

    self.initialValues['LoadAverage'] = float(os.getloadavg()[0])
    self.parameters['LoadAverage'] = []

    memUsed = self.getMemoryUsed()

    self.initialValues['MemoryUsed'] = memUsed
    self.parameters['MemoryUsed'] = []

    result = self.profiler.getAllProcessData(withChildren=True)
    self.log.verbose('Job Memory: %s' % (result['Value']))
    if not result['OK']:
      self.log.warn('Could not get job memory usage')

    self.initialValues['Vsize'] = result['Value']['stats']['vSizeUsage'] * 1024.
    self.initialValues['RSS'] = result['Value']['stats']['memoryUsage'] * 1024.
    self.parameters['Vsize'] = []
    self.parameters['RSS'] = []

    result = self.getDiskSpace()
    self.log.verbose('DiskSpace: %s' % (result))
    if not result['OK']:
      self.log.warn("Could not establish DiskSpace")

    self.initialValues['DiskSpace'] = result['Value']
    self.parameters['DiskSpace'] = []

    result = self.getNodeInformation()
    self.log.verbose('NodeInfo: %s' % (result))
    if not result['OK']:
      self.log.warn("Could not establish static system information")

    if 'LSB_JOBID' in os.environ:
      result['LocalJobID'] = os.environ['LSB_JOBID']
    if 'PBS_JOBID' in os.environ:
      result['LocalJobID'] = os.environ['PBS_JOBID']
    if 'QSUB_REQNAME' in os.environ:
      result['LocalJobID'] = os.environ['QSUB_REQNAME']
    if 'JOB_ID' in os.environ:
      result['LocalJobID'] = os.environ['JOB_ID']

    self.__reportParameters(result, 'NodeInformation', True)
    self.__reportParameters(self.initialValues, 'InitialValues')
    return S_OK()

  def __timeLeft(self):
    """
      return Normalized CPU time left in the batch system
      0 if not available
      update self.timeLeft and self.littleTimeLeft
    """
    # Get CPU time left in the batch system
    result = self.timeLeftUtil.getTimeLeft(0.0)
    if not result['OK']:
      # Could not get CPU time left, we might need to wait for the first loop
      # or the Utility is not working properly for this batch system
      # or we are in a batch system
      timeLeft = 0
    else:
      timeLeft = result['Value']

    self.timeLeft = timeLeft
    if not self.littleTimeLeft:
      if timeLeft and timeLeft < self.grossTimeLeftLimit:
        self.log.info('TimeLeft bellow %s, now checking with higher frequency' % timeLeft)
        self.littleTimeLeft = True
        # TODO: better configurable way of doing this to be coded
        self.littleTimeLeftCount = 15
    else:
      if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit:
        timeLeft = -1

    return timeLeft

  #############################################################################
  def __getUsageSummary(self):
    """ Returns average load, memory etc. over execution of job thread
    """
    summary = {}
    # CPUConsumed
    if 'CPUConsumed' in self.parameters:
      cpuList = self.parameters['CPUConsumed']
      if cpuList:
        hmsCPU = cpuList[-1]
        rawCPU = self.__convertCPUTime(hmsCPU)
        if rawCPU['OK']:
          summary['LastUpdateCPU(s)'] = rawCPU['Value']
      else:
        summary['LastUpdateCPU(s)'] = 'Could not be estimated'
    # DiskSpace
    if 'DiskSpace' in self.parameters:
      space = self.parameters['DiskSpace']
      if space:
        value = abs(float(space[-1]) - float(self.initialValues['DiskSpace']))
        if value < 0:
          value = 0
        summary['DiskSpace(MB)'] = value
      else:
        summary['DiskSpace(MB)'] = 'Could not be estimated'
    # MemoryUsed
    if 'MemoryUsed' in self.parameters:
      memory = self.parameters['MemoryUsed']
      if memory:
        summary['MemoryUsed(kb)'] = abs(float(memory[-1]) - float(self.initialValues['MemoryUsed']))
      else:
        summary['MemoryUsed(kb)'] = 'Could not be estimated'
    # LoadAverage
    if 'LoadAverage' in self.parameters:
      laList = self.parameters['LoadAverage']
      if laList:
        summary['LoadAverage'] = float(sum(laList)) / float(len(laList))
      else:
        summary['LoadAverage'] = 'Could not be estimated'

    result = self.__getWallClockTime()
    if not result['OK']:
      self.log.warn("Failed determining wall clock time", result['Message'])
      summary['WallClockTime(s)'] = 0
      summary['ScaledCPUTime(s)'] = 0
    else:
      wallClock = result['Value']
      summary['WallClockTime(s)'] = wallClock
      summary['ScaledCPUTime(s)'] = wallClock * self.scaleFactor * self.processors

    self.__reportParameters(summary, 'UsageSummary', True)
    self.currentStats = summary

  #############################################################################
  def __reportParameters(self, params, title=None, report=False):
    """Will report parameters for job.
    """
    try:
      parameters = []
      self.log.info('==========================================================')
      if title:
        self.log.info('Watchdog will report %s' % (title))
      else:
        self.log.info('Watchdog will report parameters')
      self.log.info('==========================================================')
      vals = params
      if 'Value' in params:
        if vals['Value']:
          vals = params['Value']
      for k, v in vals.items():
        if v:
          self.log.info(str(k) + ' = ' + str(v))
          parameters.append((k, v))
      if report:
        self.__setJobParamList(parameters)

      self.log.info('==========================================================')
    except Exception as x:
      self.log.warn('Problem while reporting parameters')
      self.log.warn(str(x))

  #############################################################################
  def __getWallClockTime(self):
    """ Establishes the Wall Clock time spent since the Watchdog initialization"""
    result = S_OK()
    if 'StartTime' in self.initialValues:
      currentTime = time.time()
      wallClock = currentTime - self.initialValues['StartTime']
      result['Value'] = wallClock
    else:
      self.initialValues['StartTime'] = time.time()
      result['Value'] = 0.0

    return result

  #############################################################################
  def __killRunningThread(self):
    """ Will kill the running thread process and any child processes."""
    self.log.info('Sending kill signal to application PID %s' % (self.spObject.getChildPID()))
    result = self.spObject.killChild()
    self.applicationKilled = True
    self.log.info('Subprocess.killChild() returned:%s ' % (result))
    return S_OK('Thread killed')

  #############################################################################
  def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict):
    """ Sends sign of life 'heartbeat' signal and triggers control signal
        interpretation.
    """
    result = JobStateUpdateClient().sendHeartBeat(jobID, heartBeatDict, staticParamDict)
    if not result['OK']:
      self.log.warn('Problem sending sign of life')
      self.log.warn(result)

    if result['OK'] and result['Value']:
      self.__interpretControlSignal(result['Value'])

    return result

  #############################################################################
  def __setJobParamList(self, value):
    """Wraps around setJobParameters of state update client
    """
    # job wrapper template sets the jobID variable
    if 'JOBID' not in os.environ:
      self.log.info('Running without JOBID so parameters will not be reported')
      return S_OK()
    jobID = os.environ['JOBID']
    jobParam = JobStateUpdateClient().setJobParameters(int(jobID), value)
    self.log.verbose('setJobParameters(%s,%s)' % (jobID, value))
    if not jobParam['OK']:
      self.log.warn(jobParam['Message'])

    return jobParam

  #############################################################################
  def getNodeInformation(self):
    """ Attempts to retrieve all static system information, should be overridden in a subclass"""
    methodName = 'getNodeInformation'
    self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass')
    return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass')

  #############################################################################
  def getMemoryUsed(self):
    """Obtains the memory used.
    """
    mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \
        resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    return float(mem)

  #############################################################################
  def getDiskSpace(self):
    """ Attempts to get the available disk space, should be overridden in a subclass"""
    methodName = 'getDiskSpace'
    self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass')
    return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass')
Пример #4
0
class Watchdog( object ):

  #############################################################################
  def __init__( self, pid, exeThread, spObject, jobCPUtime, memoryLimit = 0, systemFlag = 'linux2.4' ):
    """ Constructor, takes system flag as argument.
    """
    self.log = gLogger.getSubLogger( "Watchdog" )
    self.systemFlag = systemFlag
    self.exeThread = exeThread
    self.wrapperPID = pid
    self.appPID = self.exeThread.getCurrentPID()
    self.spObject = spObject
    self.jobCPUtime = jobCPUtime
    self.memoryLimit = memoryLimit
    self.calibration = 0
    self.initialValues = {}
    self.parameters = {}
    self.peekFailCount = 0
    self.peekRetry = 5
    self.processMonitor = ProcessMonitor()
    self.checkError = ''
    self.currentStats = {}
    self.initialized = False
    self.count = 0

    #defaults
    self.testWallClock = 1
    self.testDiskSpace = 1
    self.testLoadAvg = 1
    self.maxWallClockTime = 3 * 24 * 60 * 60
    self.testCPUConsumed = 1
    self.testCPULimit = 0
    self.testMemoryLimit = 0
    self.testTimeLeft = 1
    self.pollingTime = 10  # 10 seconds
    self.checkingTime = 30 * 60  # 30 minute period
    self.minCheckingTime = 20 * 60  # 20 mins
    self.maxWallClockTime = 3 * 24 * 60 * 60  # e.g. 4 days
    self.jobPeekFlag = 1  # on / off
    self.minDiskSpace = 10  # MB
    self.loadAvgLimit = 1000  # > 1000 and jobs killed
    self.sampleCPUTime = 30 * 60  # e.g. up to 20mins sample
    self.jobCPUMargin = 20  # %age buffer before killing job
    self.minCPUWallClockRatio = 5  # ratio %age
    self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
    self.checkCount = 0
    self.nullCPUCount = 0

    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False


  #############################################################################
  def initialize( self, loops = 0 ):
    """ Watchdog initialization.
    """
    if self.initialized:
      self.log.info( 'Watchdog already initialized' )
      return S_OK()
    else:
      self.initialized = True

    setup = gConfig.getValue( '/DIRAC/Setup', '' )
    if not setup:
      return S_ERROR( 'Can not get the DIRAC Setup value' )
    wms_instance = getSystemInstance( "WorkloadManagement" )
    if not wms_instance:
      return S_ERROR( 'Can not get the WorkloadManagement system instance' )
    self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance

    self.maxcount = loops
    self.log.verbose( 'Watchdog initialization' )
    self.log.info( 'Attempting to Initialize Watchdog for: %s' % ( self.systemFlag ) )
    # Test control flags
    self.testWallClock = gConfig.getValue( self.section + '/CheckWallClockFlag', 1 )
    self.testDiskSpace = gConfig.getValue( self.section + '/CheckDiskSpaceFlag', 1 )
    self.testLoadAvg = gConfig.getValue( self.section + '/CheckLoadAvgFlag', 1 )
    self.testCPUConsumed = gConfig.getValue( self.section + '/CheckCPUConsumedFlag', 1 )
    self.testCPULimit = gConfig.getValue( self.section + '/CheckCPULimitFlag', 0 )
    self.testMemoryLimit = gConfig.getValue( self.section + '/CheckMemoryLimitFlag', 0 )
    self.testTimeLeft = gConfig.getValue( self.section + '/CheckTimeLeftFlag', 1 )
    # Other parameters
    self.pollingTime = gConfig.getValue( self.section + '/PollingTime', 10 )  # 10 seconds
    self.checkingTime = gConfig.getValue( self.section + '/CheckingTime', 30 * 60 )  # 30 minute period
    self.minCheckingTime = gConfig.getValue( self.section + '/MinCheckingTime', 20 * 60 )  # 20 mins
    self.maxWallClockTime = gConfig.getValue( self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60 )  # e.g. 4 days
    self.jobPeekFlag = gConfig.getValue( self.section + '/JobPeekFlag', 1 )  # on / off
    self.minDiskSpace = gConfig.getValue( self.section + '/MinDiskSpace', 10 )  # MB
    self.loadAvgLimit = gConfig.getValue( self.section + '/LoadAverageLimit', 1000 )  # > 1000 and jobs killed
    self.sampleCPUTime = gConfig.getValue( self.section + '/CPUSampleTime', 30 * 60 )  # e.g. up to 20mins sample
    self.jobCPUMargin = gConfig.getValue( self.section + '/JobCPULimitMargin', 20 )  # %age buffer before killing job
    self.minCPUWallClockRatio = gConfig.getValue( self.section + '/MinCPUWallClockRatio', 5 )  # ratio %age
    self.nullCPULimit = gConfig.getValue( self.section + '/NullCPUCountLimit', 5 )  # After 5 sample times return null CPU consumption kill job
    if self.checkingTime < self.minCheckingTime:
      self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % ( self.checkingTime, self.minCheckingTime ) )
      self.checkingTime = self.minCheckingTime

    # The time left is returned in seconds @ 250 SI00 = 1 HS06,
    # the self.checkingTime and self.pollingTime are in seconds,
    # thus they need to be multiplied by a large enough factor
    self.fineTimeLeftLimit = gConfig.getValue( self.section + '/TimeLeftLimit', 150 * self.pollingTime )

    return S_OK()

  def run( self ):
    """ The main watchdog execution method
    """

    result = self.initialize()
    if not result['OK']:
      self.log.always( 'Can not start watchdog for the following reason' )
      self.log.always( result['Message'] )
      return result

    try:
      while True:
        self.log.debug( 'Starting watchdog loop # %d' % self.count )
        start_cycle_time = time.time()
        result = self.execute()
        exec_cycle_time = time.time() - start_cycle_time
        if not result[ 'OK' ]:
          self.log.error( "Watchdog error during execution", result[ 'Message' ] )
          break
        elif result['Value'] == "Ended":
          break
        self.count += 1
        if exec_cycle_time < self.pollingTime:
          time.sleep( self.pollingTime - exec_cycle_time )
      return S_OK()
    except Exception:
      self.log.exception()
      return S_ERROR( 'Exception' )

  #############################################################################
  def execute( self ):
    """ The main agent execution method of the Watchdog.
    """
    if not self.exeThread.isAlive():
      self.__getUsageSummary()
      self.log.info( 'Process to monitor has completed, Watchdog will exit.' )
      return S_OK( "Ended" )

    if self.littleTimeLeft:
      # if we have gone over enough iterations query again
      if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1:
        self.checkError = 'Job has reached the CPU limit of the queue'
        self.log.error( self.checkError, self.timeLeft )
        self.__killRunningThread()
        return S_OK()
      else:
        self.littleTimeLeftCount -= 1


    # Note: need to poll regularly to see if the thread is alive
    #      but only perform checks with a certain frequency
    if ( time.time() - self.initialValues['StartTime'] ) > self.checkingTime * self.checkCount:
      self.checkCount += 1
      result = self._performChecks()
      if not result['OK']:
        self.log.warn( 'Problem during recent checks' )
        self.log.warn( result['Message'] )
      return S_OK()
    else:
      # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount))
      return S_OK()


  #############################################################################
  def _performChecks( self ):
    """The Watchdog checks are performed at a different period to the checking of the
       application thread and correspond to the checkingTime.
    """
    self.log.verbose( '------------------------------------' )
    self.log.verbose( 'Checking loop starts for Watchdog' )
    heartBeatDict = {}

    msg = ''

    loadAvg = self.getLoadAverage()
    if not loadAvg['OK']:
      msg += 'LoadAvg: ERROR'
    else:
      loadAvg = loadAvg['Value']
      msg += 'LoadAvg: %d ' % loadAvg
      heartBeatDict['LoadAverage'] = loadAvg
      if not self.parameters.has_key( 'LoadAverage' ):
        self.parameters['LoadAverage'] = []
      self.parameters['LoadAverage'].append( loadAvg )

    memoryUsed = self.getMemoryUsed()
    if not memoryUsed['OK']:
      msg += 'MemUsed: ERROR '
    else:
      memoryUsed = memoryUsed['Value']
      msg += 'MemUsed: %.1f kb ' % ( memoryUsed )
      heartBeatDict['MemoryUsed'] = memoryUsed
      if not self.parameters.has_key( 'MemoryUsed' ):
        self.parameters['MemoryUsed'] = []
      self.parameters['MemoryUsed'].append( memoryUsed )

    result = self.processMonitor.getMemoryConsumed( self.wrapperPID )
    if result['OK']:
      vsize = result['Value']['Vsize']/1024.
      rss = result['Value']['RSS']/1024.
      heartBeatDict['Vsize'] = vsize
      heartBeatDict['RSS'] = rss
      self.parameters.setdefault( 'Vsize', [] )
      self.parameters['Vsize'].append( vsize )
      self.parameters.setdefault( 'RSS', [] )
      self.parameters['RSS'].append( rss )
      msg += "Job Vsize: %.1f kb " % vsize
      msg += "Job RSS: %.1f kb " % rss
    result = self.getDiskSpace()
    if not result['OK']:
      self.log.warn( "Could not establish DiskSpace", result['Message'] )
    else:
      msg += 'DiskSpace: %.1f MB ' % ( result['Value'] )
    if not self.parameters.has_key( 'DiskSpace' ):
      self.parameters['DiskSpace'] = []
    if result['OK']:
      self.parameters['DiskSpace'].append( result['Value'] )
      heartBeatDict['AvailableDiskSpace'] = result['Value']
    
    cpu = self.__getCPU()
    if not cpu['OK']:
      msg += 'CPU: ERROR '
      hmsCPU = 0
    else:
      cpu = cpu['Value']
      msg += 'CPU: %s (h:m:s) ' % ( cpu )
      if not self.parameters.has_key( 'CPUConsumed' ):
        self.parameters['CPUConsumed'] = []
      self.parameters['CPUConsumed'].append( cpu )
      hmsCPU = cpu
      rawCPU = self.__convertCPUTime( hmsCPU )
      if rawCPU['OK']:
        heartBeatDict['CPUConsumed'] = rawCPU['Value']
    
    result = self.__getWallClockTime()
    if not result['OK']:
      self.log.warn( "Failed determining wall clock time", result['Message'] )
    else:
      msg += 'WallClock: %.2f s ' % ( result['Value'] )
      self.parameters.setdefault( 'WallClockTime', list() ).append( result['Value'] )
      heartBeatDict['WallClockTime'] = result['Value']
    self.log.info( msg )

    result = self._checkProgress()
    if not result['OK']:
      self.checkError = result['Message']
      self.log.warn( self.checkError )

      if self.jobPeekFlag:
        result = self.__peek()
        if result['OK']:
          outputList = result['Value']
          size = len( outputList )
          self.log.info( 'Last %s lines of available application output:' % ( size ) )
          self.log.info( '================START================' )
          for line in outputList:
            self.log.info( line )

          self.log.info( '=================END=================' )

      self.__killRunningThread()
      return S_OK()

    recentStdOut = 'None'
    if self.jobPeekFlag:
      result = self.__peek()
      if result['OK']:
        outputList = result['Value']
        size = len( outputList )
        recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % ( size, Time.dateTime() )
        border = '=' * len( recentStdOut )
        cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % ( hmsCPU )
        if self.timeLeft:
          cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft
        recentStdOut = '\n%s\n%s\n%s\n%s\n' % ( border, recentStdOut, cpuTotal, border )
        self.log.info( recentStdOut )
        for line in outputList:
          self.log.info( line )
          recentStdOut += line + '\n'
      else:
        recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread'
        self.log.info( recentStdOut )
        self.peekFailCount += 1
        if self.peekFailCount > self.peekRetry:
          self.jobPeekFlag = 0
          self.log.warn( 'Turning off job peeking for remainder of execution' )

    if not os.environ.has_key( 'JOBID' ):
      self.log.info( 'Running without JOBID so parameters will not be reported' )
      return S_OK()

    jobID = os.environ['JOBID']
    staticParamDict = {'StandardOutput':recentStdOut}
    self.__sendSignOfLife( int( jobID ), heartBeatDict, staticParamDict )
    return S_OK( 'Watchdog checking cycle complete' )

  #############################################################################
  def __getCPU( self ):
    """Uses os.times() to get CPU time and returns HH:MM:SS after conversion.
    """
    try:
      cpuTime = self.processMonitor.getCPUConsumed( self.wrapperPID )
      if not cpuTime['OK']:
        self.log.warn( 'Problem while checking consumed CPU' )
        return cpuTime
      cpuTime = cpuTime['Value']
      if cpuTime:
        self.log.verbose( "Raw CPU time consumed (s) = %s" % ( cpuTime ) )
        return self.__getCPUHMS( cpuTime )
      else:
        self.log.error( "CPU time consumed found to be 0" )
        return S_ERROR()

    except Exception as e:
      self.log.warn( 'Could not determine CPU time consumed with exception' )
      self.log.exception( e )
      return S_ERROR( "Could not determine CPU time consumed with exception" )



  #############################################################################
  def __getCPUHMS( self, cpuTime ):
    mins, secs = divmod( cpuTime, 60 )
    hours, mins = divmod( mins, 60 )
    humanTime = '%02d:%02d:%02d' % ( hours, mins, secs )
    self.log.verbose( 'Human readable CPU time is: %s' % humanTime )
    return S_OK( humanTime )

  #############################################################################
  def __interpretControlSignal( self, signalDict ):
    """This method is called whenever a signal is sent via the result of
       sending a sign of life.
    """
    self.log.info( 'Received control signal' )
    if type( signalDict ) == type( {} ):
      if signalDict.has_key( 'Kill' ):
        self.log.info( 'Received Kill signal, stopping job via control signal' )
        self.checkError = 'Received Kill signal'
        self.__killRunningThread()
      else:
        self.log.info( 'The following control signal was sent but not understood by the watchdog:' )
        self.log.info( signalDict )
    else:
      self.log.info( 'Expected dictionary for control signal, received:\n%s' % ( signalDict ) )

    return S_OK()

  #############################################################################
  def _checkProgress( self ):
    """This method calls specific tests to determine whether the job execution
       is proceeding normally.  CS flags can easily be added to add or remove
       tests via central configuration.
    """
    report = ''

    if self.testWallClock:
      result = self.__checkWallClockTime()
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result
      report += 'WallClock: OK, '
    else:
      report += 'WallClock: NA,'

    if self.testDiskSpace:
      result = self.__checkDiskSpace()
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result
      report += 'DiskSpace: OK, '
    else:
      report += 'DiskSpace: NA,'

    if self.testLoadAvg:
      result = self.__checkLoadAverage()
      if not result['OK']:
        self.log.warn( "Check of load average failed, but won't fail because of that: %s" % result['Message'] )
        report += 'LoadAverage: ERROR, '
        return S_OK()
      report += 'LoadAverage: OK, '
    else:
      report += 'LoadAverage: NA,'

    if self.testCPUConsumed:
      result = self.__checkCPUConsumed()
      if not result['OK']:
        return result
      report += 'CPUConsumed: OK, '
    else:
      report += 'CPUConsumed: NA, '

    if self.testCPULimit:
      result = self.__checkCPULimit()
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result
      report += 'CPULimit OK, '
    else:
      report += 'CPULimit: NA, '
        
    if self.testTimeLeft:
      self.__timeLeft()
      if self.timeLeft:
        report += 'TimeLeft: OK'
    else:
      report += 'TimeLeft: NA'

    if self.testMemoryLimit:
      result = self.__checkMemoryLimit()
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result
      report += 'MemoryLimit OK, '
    else:
      report += 'MemoryLimit: NA, '

    self.log.info( report )
    return S_OK( 'All enabled checks passed' )

  #############################################################################
  def __checkCPUConsumed( self ):
    """ Checks whether the CPU consumed by application process is reasonable. This
        method will report stalled jobs to be killed.
    """
    self.log.info( "Checking CPU Consumed" )

    if 'WallClockTime' not in self.parameters:
      return S_ERROR( 'Missing WallClockTime info' )
    if 'CPUConsumed' not in self.parameters:
      return S_ERROR( 'Missing CPUConsumed info' )

    wallClockTime = self.parameters['WallClockTime'][-1]
    if wallClockTime < self.sampleCPUTime:
      self.log.info( "Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % ( wallClockTime,
                                                                                            self.sampleCPUTime ) )
      return S_OK()

    intervals = max( 1, int( self.sampleCPUTime / self.checkingTime ) )
    if len( self.parameters['CPUConsumed'] ) < intervals + 1:
      self.log.info( "Not enough snapshots to calculate, there are %s and we need %s" % ( len( self.parameters['CPUConsumed'] ),
                                                                                          intervals + 1 ) )
      return S_OK()

    wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters['WallClockTime'][-1 - intervals ]
    try:
      cpuTime = self.__convertCPUTime( self.parameters['CPUConsumed'][-1] )['Value']
      # For some reason, some times the CPU consumed estimation returns 0
      # if cpuTime == 0:
      #   return S_OK()
      cpuTime -= self.__convertCPUTime( self.parameters['CPUConsumed'][-1 - intervals ] )['Value']
      if cpuTime < 0:
        self.log.warn( 'Consumed CPU time negative, something wrong may have happened, ignore' )
        return S_OK()
      if wallClockTime <= 0:
        self.log.warn( 'Wallclock time should not be negative or zero, Ignore' )
        return S_OK()

      ratio = ( cpuTime / wallClockTime ) * 100.

      self.log.info( "CPU/Wallclock ratio is %.2f%%" % ratio )
      # in case of error cpuTime might be 0, exclude this
      if ratio < self.minCPUWallClockRatio:
        if os.path.exists( 'DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK' ):
          self.log.info( 'N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload' )
          return S_OK()
        self.log.info( "Job is stalled!" )
        return S_ERROR( 'Watchdog identified this job as stalled' )
    except Exception as e:
      self.log.error( "Cannot convert CPU consumed from string to int", str( e ) )

    return S_OK()

  #############################################################################

  def __convertCPUTime( self, cputime ):
    """ Method to convert the CPU time as returned from the Watchdog
        instances to the equivalent DIRAC normalized CPU time to be compared
        to the Job CPU requirement.
    """
    cpuValue = 0
    cpuHMS = cputime.split( ':' )
    # for i in xrange( len( cpuHMS ) ):
    #   cpuHMS[i] = cpuHMS[i].replace( '00', '0' )

    try:
      hours = float( cpuHMS[0] ) * 60 * 60
      mins = float( cpuHMS[1] ) * 60
      secs = float( cpuHMS[2] )
      cpuValue = float( hours + mins + secs )
    except Exception as x:
      self.log.warn( str( x ) )
      return S_ERROR( 'Could not calculate CPU time' )

    # Normalization to be implemented
    normalizedCPUValue = cpuValue

    result = S_OK()
    result['Value'] = normalizedCPUValue
    self.log.debug( 'CPU value %s converted to %s' % ( cputime, normalizedCPUValue ) )
    return result

  #############################################################################

  def __checkCPULimit( self ):
    """ Checks that the job has consumed more than the job CPU requirement
        (plus a configurable margin) and kills them as necessary.
    """
    consumedCPU = 0
    if self.parameters.has_key( 'CPUConsumed' ):
      consumedCPU = self.parameters['CPUConsumed'][-1]

    consumedCPUDict = self.__convertCPUTime( consumedCPU )
    if consumedCPUDict['OK']:
      currentCPU = consumedCPUDict['Value']
    else:
      return S_OK( 'Not possible to determine current CPU consumed' )

    if consumedCPU:
      limit = self.jobCPUtime + self.jobCPUtime * ( self.jobCPUMargin / 100 )
      cpuConsumed = float( currentCPU )
      if cpuConsumed > limit:
        self.log.info( 'Job has consumed more than the specified CPU limit with an additional %s%% margin' % ( self.jobCPUMargin ) )
        return S_ERROR( 'Job has exceeded maximum CPU time limit' )
      else:
        return S_OK( 'Job within CPU limit' )
    elif not currentCPU:
      self.log.verbose( 'Both initial and current CPU consumed are null' )
      return S_OK( 'CPU consumed is not measurable yet' )
    else:
      return S_OK( 'Not possible to determine CPU consumed' )

  def __checkMemoryLimit( self ):
    """ Checks that the job memory consumption is within a limit
    """
    if self.parameters.has_key( 'Vsize' ):
      vsize = self.parameters['Vsize'][-1]
      
    if vsize and self.memoryLimit:
      if vsize > self.memoryLimit:
        vsize = vsize
        # Just a warning for the moment
        self.log.warn( "Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % ( vsize, self.memoryLimit ) )  
    
    return S_OK()

  #############################################################################
  def __checkDiskSpace( self ):
    """Checks whether the CS defined minimum disk space is available.
    """
    if self.parameters.has_key( 'DiskSpace' ):
      availSpace = self.parameters['DiskSpace'][-1]
      if availSpace >= 0 and availSpace < self.minDiskSpace:
        self.log.info( 'Not enough local disk space for job to continue, defined in CS as %s MB' % ( self.minDiskSpace ) )
        return S_ERROR( 'Job has insufficient disk space to continue' )
      else:
        return S_OK( 'Job has enough disk space available' )
    else:
      return S_ERROR( 'Available disk space could not be established' )

  #############################################################################
  def __checkWallClockTime( self ):
    """Checks whether the job has been running for the CS defined maximum
       wall clock time.
    """
    if self.initialValues.has_key( 'StartTime' ):
      startTime = self.initialValues['StartTime']
      if time.time() - startTime > self.maxWallClockTime:
        self.log.info( 'Job has exceeded maximum wall clock time of %s seconds' % ( self.maxWallClockTime ) )
        return S_ERROR( 'Job has exceeded maximum wall clock time' )
      else:
        return S_OK( 'Job within maximum wall clock time' )
    else:
      return S_ERROR( 'Job start time could not be established' )

  #############################################################################
  def __checkLoadAverage( self ):
    """Checks whether the CS defined maximum load average is exceeded.
    """
    if self.parameters.has_key( 'LoadAverage' ):
      loadAvg = self.parameters['LoadAverage'][-1]
      if loadAvg > float( self.loadAvgLimit ):
        self.log.info( 'Maximum load average exceeded, defined in CS as %s ' % ( self.loadAvgLimit ) )
        return S_ERROR( 'Job exceeded maximum load average' )
      else:
        return S_OK( 'Job running with normal load average' )
    else:
      return S_ERROR( 'Job load average not established' )

  #############################################################################
  def __peek( self ):
    """ Uses ExecutionThread.getOutput() method to obtain standard output
        from running thread via subprocess callback function.
    """
    result = self.exeThread.getOutput()
    if not result['OK']:
      self.log.warn( 'Could not obtain output from running application thread' )
      self.log.warn( result['Message'] )

    return result

  #############################################################################
  def calibrate( self ):
    """ The calibrate method obtains the initial values for system memory and load
        and calculates the margin for error for the rest of the Watchdog cycle.
    """
    self.__getWallClockTime()
    self.parameters['WallClockTime'] = []

    cpuConsumed = self.__getCPU()
    if not cpuConsumed['OK']:
      self.log.warn( "Could not establish CPU consumed, setting to 0.0" )
      cpuConsumed = 0.0
    else:
      cpuConsumed = cpuConsumed['Value']

    self.initialValues['CPUConsumed'] = cpuConsumed
    self.parameters['CPUConsumed'] = []

    loadAvg = self.getLoadAverage()
    if not loadAvg['OK']:
      self.log.warn( "Could not establish LoadAverage, setting to 0" )
      loadAvg = 0
    else:
      loadAvg = loadAvg['Value']

    self.initialValues['LoadAverage'] = loadAvg
    self.parameters['LoadAverage'] = []

    memUsed = self.getMemoryUsed()
    if not memUsed['OK']:
      self.log.warn( "Could not establish MemoryUsed, setting to 0" )
      memUsed = 0
    else:
      memUsed = memUsed['Value']

    self.initialValues['MemoryUsed'] = memUsed
    self.parameters['MemoryUsed'] = []
    
    result = self.processMonitor.getMemoryConsumed( self.wrapperPID )
    self.log.verbose( 'Job Memory: %s' % ( result['Value'] ) )
    if not result['OK']:
      self.log.warn( 'Could not get job memory usage' )

    self.initialValues['Vsize'] = result['Value']['Vsize']/1024.
    self.initialValues['RSS'] = result['Value']['RSS']/1024.
    self.parameters['Vsize'] = []
    self.parameters['RSS'] = []

    result = self.getDiskSpace()
    self.log.verbose( 'DiskSpace: %s' % ( result ) )
    if not result['OK']:
      self.log.warn( "Could not establish DiskSpace" )

    self.initialValues['DiskSpace'] = result['Value']
    self.parameters['DiskSpace'] = []

    result = self.getNodeInformation()
    self.log.verbose( 'NodeInfo: %s' % ( result ) )
    if not result['OK']:
      self.log.warn( "Could not establish static system information" )

    if os.environ.has_key( 'LSB_JOBID' ):
      result['LocalJobID'] = os.environ['LSB_JOBID']
    if os.environ.has_key( 'PBS_JOBID' ):
      result['LocalJobID'] = os.environ['PBS_JOBID']
    if os.environ.has_key( 'QSUB_REQNAME' ):
      result['LocalJobID'] = os.environ['QSUB_REQNAME']
    if os.environ.has_key( 'JOB_ID' ):
      result['LocalJobID'] = os.environ['JOB_ID']

    self.__reportParameters( result, 'NodeInformation', True )
    self.__reportParameters( self.initialValues, 'InitialValues' )
    return S_OK()

  def __timeLeft( self ):
    """
      return Normalized CPU time left in the batch system
      0 if not available
      update self.timeLeft and self.littleTimeLeft
    """
    # Get CPU time left in the batch system
    result = self.timeLeftUtil.getTimeLeft( 0.0 )
    if not result['OK']:
      # Could not get CPU time left, we might need to wait for the first loop
      # or the Utility is not working properly for this batch system
      # or we are in a batch system
      timeLeft = 0
    else:
      timeLeft = result['Value']

    self.timeLeft = timeLeft
    if not self.littleTimeLeft:
      if timeLeft and timeLeft < self.grossTimeLeftLimit:
        self.log.info( 'TimeLeft bellow %s, now checking with higher frequency' % timeLeft )
        self.littleTimeLeft = True
        # TODO: better configurable way of doing this to be coded
        self.littleTimeLeftCount = 15
    else:
      if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit:
        timeLeft = -1

    return timeLeft

  #############################################################################
  def __getUsageSummary( self ):
    """ Returns average load, memory etc. over execution of job thread
    """
    summary = {}
    # CPUConsumed
    if self.parameters.has_key( 'CPUConsumed' ):
      cpuList = self.parameters['CPUConsumed']
      if cpuList:
        hmsCPU = cpuList[-1]
        rawCPU = self.__convertCPUTime( hmsCPU )
        if rawCPU['OK']:
          summary['LastUpdateCPU(s)'] = rawCPU['Value']
      else:
        summary['LastUpdateCPU(s)'] = 'Could not be estimated'
    # DiskSpace
    if self.parameters.has_key( 'DiskSpace' ):
      space = self.parameters['DiskSpace']
      if space:
        value = abs( float( space[-1] ) - float( self.initialValues['DiskSpace'] ) )
        if value < 0:
          value = 0
        summary['DiskSpace(MB)'] = value
      else:
        summary['DiskSpace(MB)'] = 'Could not be estimated'
    # MemoryUsed
    if self.parameters.has_key( 'MemoryUsed' ):
      memory = self.parameters['MemoryUsed']
      if memory:
        summary['MemoryUsed(kb)'] = abs( float( memory[-1] ) - float( self.initialValues['MemoryUsed'] ) )
      else:
        summary['MemoryUsed(kb)'] = 'Could not be estimated'
    # LoadAverage
    if self.parameters.has_key( 'LoadAverage' ):
      laList = self.parameters['LoadAverage']
      if laList:
        summary['LoadAverage'] = float( sum( laList ) ) / float( len( laList ) )
      else:
        summary['LoadAverage'] = 'Could not be estimated'

    result = self.__getWallClockTime()
    if not result['OK']:
      self.log.warn( "Failed determining wall clock time", result['Message'] )
      summary['WallClockTime(s)'] = 0
    else:
      wallClock = result['Value']
      summary['WallClockTime(s)'] = wallClock

    self.__reportParameters( summary, 'UsageSummary', True )
    self.currentStats = summary


  #############################################################################
  def __reportParameters( self, params, title = None, report = False ):
    """Will report parameters for job.
    """
    try:
      parameters = []
      self.log.info( '==========================================================' )
      if title:
        self.log.info( 'Watchdog will report %s' % ( title ) )
      else:
        self.log.info( 'Watchdog will report parameters' )
      self.log.info( '==========================================================' )
      vals = params
      if params.has_key( 'Value' ):
        if vals['Value']:
          vals = params['Value']
      for k, v in vals.items():
        if v:
          self.log.info( str( k ) + ' = ' + str( v ) )
          parameters.append( ( k, v ) )
      if report:
        self.__setJobParamList( parameters )

      self.log.info( '==========================================================' )
    except Exception as x:
      self.log.warn( 'Problem while reporting parameters' )
      self.log.warn( str( x ) )

  #############################################################################
  def __getWallClockTime( self ):
    """ Establishes the Wall Clock time spent since the Watchdog initialization"""
    result = S_OK()
    if self.initialValues.has_key( 'StartTime' ):
      currentTime = time.time()
      wallClock = currentTime - self.initialValues['StartTime']
      result['Value'] = wallClock
    else:
      self.initialValues['StartTime'] = time.time()
      result['Value'] = 0.0

    return result

  #############################################################################
  def __killRunningThread( self ):
    """ Will kill the running thread process and any child processes."""
    self.log.info( 'Sending kill signal to application PID %s' % ( self.spObject.getChildPID() ) )
    result = self.spObject.killChild()
    self.applicationKilled = True
    self.log.info( 'Subprocess.killChild() returned:%s ' % ( result ) )
    return S_OK( 'Thread killed' )

  #############################################################################
  def __sendSignOfLife( self, jobID, heartBeatDict, staticParamDict ):
    """ Sends sign of life 'heartbeat' signal and triggers control signal
        interpretation.
    """
    jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate', timeout = 120 )
    result = jobReport.sendHeartBeat( jobID, heartBeatDict, staticParamDict )
    if not result['OK']:
      self.log.warn( 'Problem sending sign of life' )
      self.log.warn( result )

    if result['OK'] and result['Value']:
      self.__interpretControlSignal( result['Value'] )

    return result

  #############################################################################
  def __setJobParamList( self, value ):
    """Wraps around setJobParameters of state update client
    """
    # job wrapper template sets the jobID variable
    if not os.environ.has_key( 'JOBID' ):
      self.log.info( 'Running without JOBID so parameters will not be reported' )
      return S_OK()
    jobID = os.environ['JOBID']
    jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate', timeout = 120 )
    jobParam = jobReport.setJobParameters( int( jobID ), value )
    self.log.verbose( 'setJobParameters(%s,%s)' % ( jobID, value ) )
    if not jobParam['OK']:
      self.log.warn( jobParam['Message'] )

    return jobParam

  #############################################################################
  def getNodeInformation( self ):
    """ Attempts to retrieve all static system information, should be overridden in a subclass"""
    methodName = 'getNodeInformation'
    self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
    return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )

  #############################################################################
  def getLoadAverage( self ):
    """ Attempts to get the load average, should be overridden in a subclass"""
    methodName = 'getLoadAverage'
    self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
    return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )

  #############################################################################
  def getMemoryUsed( self ):
    """ Attempts to get the memory used, should be overridden in a subclass"""
    methodName = 'getMemoryUsed'
    self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
    return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )

  #############################################################################
  def getDiskSpace( self ):
    """ Attempts to get the available disk space, should be overridden in a subclass"""
    methodName = 'getDiskSpace'
    self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
    return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
Пример #5
0
class JobAgent( AgentModule ):
  """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

  #############################################################################
  def initialize( self, loops = 0 ):
    """Sets default parameters and creates CE instance
    """
    # Disable monitoring
    self.am_setOption( 'MonitoringEnabled', False )
    # self.log.setLevel('debug') #temporary for debugging
    self.am_setOption( 'MaxCycles', loops )

    ceType = self.am_getOption( 'CEType', 'InProcess' )
    localCE = gConfig.getValue( '/LocalSite/LocalCE', '' )
    if localCE:
      self.log.info( 'Defining CE from local configuration = %s' % localCE )
      ceType = localCE

    ceFactory = ComputingElementFactory()
    self.ceName = ceType
    ceInstance = ceFactory.getCE( ceType )
    if not ceInstance['OK']:
      self.log.warn( ceInstance['Message'] )
      return ceInstance

    self.initTimes = os.times()

    self.computingElement = ceInstance['Value']
    #Localsite options
    self.siteName = gConfig.getValue( '/LocalSite/Site', 'Unknown' )
    self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', 'Unknown' )
    self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5 )
    #Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 )
    self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', 10 )
    self.fillingMode = self.am_getOption( 'FillingModeFlag', False )
    self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', True )
    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', 10 )
    self.jobCount = 0
    self.matchFailedCount = 0
    self.extraOptions = gConfig.getValue( '/AgentJobRequirements/ExtraOptions', '' )
    #Timeleft
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0 )
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()

  #############################################################################
  def execute( self ):
    """The JobAgent execution method.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )
        
        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join( '.', self.extraOptions )
        else:
          localConfigFile = os.path.join( rootPath, "etc", "dirac.cfg" )
        localCfg.loadFromFile( localConfigFile )
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption( '/LocalSite/CPUTimeLeft', self.timeLeft )
        localCfg.writeToFile( localConfigFile )
        
      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' )
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if not 'PilotReference' in ceDict:
      ceDict['PilotReference'] = str( self.pilotReference )
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict( '/AgentJobRequirements' )
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update( requirementsDict )

    self.log.verbose( ceDict )
    start = time.time()
    jobRequest = self.__requestJob( ceDict )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )

    if not jobRequest['OK']:
      if re.search( 'No match found', jobRequest['Message'] ):
        self.log.notice( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.notice( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False )
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo.keys():
      if not key in matcherParams:
        value = matcherInfo[key]
        optimizerParams[key] = value

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'CPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    if self.extraOptions:
      params['Arguments'] = params['Arguments'] + ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s' % ( jobID, jobType ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName )
      jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False )

      if os.environ.has_key( 'BOINC_JOB_ID' ):
        # Report BOINC environment 
        for p in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
          jobReport.setJobParameter( p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown' ), sendFlag = False )

      jobReport.setJobStatus( 'Matched', 'Job Received by Agent' )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure )
      if 'Value' in result and result[ 'Value' ]:
        proxyChain = result[ 'Value' ]

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest( jobID, jobJDL )

      software = self.__checkInstallSoftware( jobID, params, ceDict )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure )

      self.log.debug( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, ceDict, optimizerParams, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'],
                              self.stopOnApplicationFailure )

      self.log.debug( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure )

    currentTimes = list( os.times() )
    for i in range( len( currentTimes ) ):
      currentTimes[i] -= self.initTimes[i]

    utime, stime, cutime, cstime, _elapsed = currentTimes
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft( cpuTime )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # if the batch system is not defined used the CPUNormalizationFactor 
          # defined locally
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )

  #############################################################################
  def __saveJobJDLRequest( self, jobID, jobJDL ):
    """Save job JDL local to JobAgent.
    """
    classAdJob = ClassAd( jobJDL )
    classAdJob.insertAttributeString( 'LocalCE', self.ceName )
    jdlFileName = jobID + '.jdl'
    jdlFile = open( jdlFileName, 'w' )
    jdl = classAdJob.asJDL()
    jdlFile.write( jdl )
    jdlFile.close()

  #############################################################################
  def __getCPUTimeLeft( self ):
    """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
    utime, stime, cutime, _cstime, _elapsed = os.times()
    cpuTime = utime + stime + cutime
    self.log.info( 'Current raw CPU time consumed is %s' % cpuTime )
    timeleft = self.timeLeft - cpuTime * self.cpuFactor
    return timeleft

  #############################################################################
  def __changeProxy( self, oldProxy, newProxy ):
    """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
    self.log.verbose( 'Log proxy change (to be instrumented)' )
    return S_OK()

  #############################################################################
  def __setupProxy( self, ownerDN, ownerGroup ):
    """
    Retrieve a proxy for the execution of the job
    """
    if gConfig.getValue( '/DIRAC/Security/UseServerCertificate' , False ):
      proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
      if not proxyResult['OK']:
        self.log.error( 'Invalid Proxy', proxyResult['Message'] )
        return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
      return S_OK( proxyResult['Value'] )
    else:
      ret = getProxyInfo( disableVOMS = True )
      if not ret['OK']:
        self.log.error( 'Invalid Proxy', ret['Message'] )
        return S_ERROR( 'Invalid Proxy' )

      proxyChain = ret['Value']['chain']
      if not 'groupProperties' in ret['Value']:
        print ret['Value']
        print proxyChain.dumpAllToString()
        self.log.error( 'Invalid Proxy', 'Group has no properties defined' )
        return S_ERROR( 'Proxy has no group properties defined' )

      groupProps = ret['Value']['groupProperties']
      if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
        proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
        if not proxyResult['OK']:
          self.log.error( 'Invalid Proxy', proxyResult['Message'] )
          return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
        proxyChain = proxyResult['Value']

    return S_OK( proxyChain )

  #############################################################################
  def __requestProxyFromProxyManager( self, ownerDN, ownerGroup ):
    """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

    self.log.info( "Requesting proxy for %s@%s" % ( ownerDN, ownerGroup ) )
    token = gConfig.getValue( "/Security/ProxyToken", "" )
    if not token:
      self.log.info( "No token defined. Trying to download proxy without token" )
      token = False
    retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup,
                                                          self.defaultProxyLength, token )
    if not retVal[ 'OK' ]:
      self.log.error( 'Could not retrieve proxy' )
      self.log.warn( retVal )
      os.system( 'dirac-proxy-info' )
      sys.stdout.flush()
      return S_ERROR( 'Error retrieving proxy' )

    chain = retVal[ 'Value' ]
    return S_OK( chain )

  #############################################################################
  def __checkInstallSoftware( self, jobID, jobParams, resourceParams ):
    """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
    if not jobParams.has_key( 'SoftwareDistModule' ):
      msg = 'Job has no software installation requirement'
      self.log.verbose( msg )
      return S_OK( msg )

    self.__report( jobID, 'Matched', 'Installing Software' )
    softwareDist = jobParams['SoftwareDistModule']
    self.log.verbose( 'Found VO Software Distribution module: %s' % ( softwareDist ) )
    argumentsDict = {'Job':jobParams, 'CE':resourceParams}
    moduleFactory = ModuleFactory()
    moduleInstance = moduleFactory.getModule( softwareDist, argumentsDict )
    if not moduleInstance['OK']:
      return moduleInstance

    module = moduleInstance['Value']
    return module.execute()

  #############################################################################
  def __submitJob( self, jobID, jobParams, resourceParams, optimizerParams, proxyChain ):
    """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
    logLevel = self.am_getOption( 'DefaultLogLevel', 'INFO' )
    defaultWrapperLocation = self.am_getOption( 'JobWrapperTemplate',
                                                'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py' )
    result = createJobWrapper( jobID, jobParams, resourceParams, optimizerParams,
                               extraOptions = self.extraOptions, defaultWrapperLocation = defaultWrapperLocation,
                               log = self.log, logLevel = logLevel )
    if not result['OK']:
      return result

    wrapperFile = result['Value']
    self.__report( jobID, 'Matched', 'Submitted To CE' )

    self.log.info( 'Submitting %s to %sCE' % ( os.path.basename( wrapperFile ), self.ceName ) )

    #Pass proxy to the CE
    proxy = proxyChain.dumpAllToString()
    if not proxy['OK']:
      self.log.error( proxy )
      return S_ERROR( 'Payload Proxy Not Found' )

    payloadProxy = proxy['Value']
    # FIXME: how can we set the batchID before we submit, this makes no sense
    batchID = 'dc%s' % ( jobID )
    submission = self.computingElement.submitJob( wrapperFile, payloadProxy )

    ret = S_OK( 'Job submitted' )

    if submission['OK']:
      batchID = submission['Value']
      self.log.info( 'Job %s submitted as %s' % ( jobID, batchID ) )
      self.log.verbose( 'Set JobParameter: Local batch ID %s' % ( batchID ) )
      self.__setJobParam( jobID, 'LocalBatchID', str( batchID ) )
      if 'PayloadFailed' in submission:
        ret['PayloadFailed'] = submission['PayloadFailed']
        return ret
      time.sleep( self.jobSubmissionDelay )
    else:
      self.log.error( 'Job submission failed', jobID )
      self.__setJobParam( jobID, 'ErrorMessage', '%s CE Submission Error' % ( self.ceName ) )
      if 'ReschedulePayload' in submission:
        rescheduleFailedJob( jobID, submission['Message'], self.__report )
      else:
        if 'Value' in submission:
          self.log.error( 'Error in DIRAC JobWrapper:', 'exit code = %s' % ( str( submission['Value'] ) ) )
        # make sure the Job is declared Failed
        self.__report( jobID, 'Failed', submission['Message'] )
      return S_ERROR( '%s CE Submission Error: %s' % ( self.ceName, submission['Message'] ) )

    return ret

  #############################################################################
  def __requestJob( self, ceDict ):
    """Request a single job from the matcher service.
    """
    try:
      matcher = RPCClient( 'WorkloadManagement/Matcher', timeout = 600 )
      result = matcher.requestJob( ceDict )
      return result
    except Exception, x:
      self.log.exception( lException = x )
      return S_ERROR( 'Job request to matcher service failed with exception' )
Пример #6
0
class JobAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self, loops = 0 ):
    """Sets default parameters and creates CE instance
    """
    #Disable monitoring
    self.am_setOption( 'MonitoringEnabled', False )
   # self.log.setLevel('debug') #temporary for debugging
    self.am_setOption( 'MaxCycles', loops )

    ceUniqueID = self.am_getOption( 'CEUniqueID', 'InProcess' )
    localCE = gConfig.getValue( '/LocalSite/LocalCE', '' )
    if localCE:
      self.log.info( 'Defining CE from local configuration = %s' % localCE )
      ceUniqueID = localCE

    ceFactory = ComputingElementFactory()
    self.ceName = ceUniqueID
    ceInstance = ceFactory.getCE( ceUniqueID )
    if not ceInstance['OK']:
      self.log.warn( ceInstance['Message'] )
      return ceInstance

    self.computingElement = ceInstance['Value']
    self.diracRoot = os.path.dirname( os.path.dirname( os.path.dirname( os.path.dirname( __file__ ) ) ) )
    #Localsite options
    self.siteRoot = gConfig.getValue( '/LocalSite/Root', os.getcwd() )
    self.siteName = gConfig.getValue( '/LocalSite/Site', 'Unknown' )
    self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', 'Unknown' )
    self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5 )
    #Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 )
    defaultWrapperLocation = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py'
    self.jobWrapperTemplate = os.path.join( self.diracRoot,
                                            self.am_getOption( 'JobWrapperTemplate',
                                                                defaultWrapperLocation ) )
    self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', 10 )
    self.defaultLogLevel = self.am_getOption( 'DefaultLogLevel', 'info' )
    self.fillingMode = self.am_getOption( 'FillingModeFlag', False )
    self.jobCount = 0
    #Timeleft
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0 )
    self.gridCEQueue = gConfig.getValue( '/Resources/Computing/CEDefaults/GridCEQueue', '' )
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    return S_OK()

  #############################################################################
  def execute( self ):
    """The JobAgent execution method.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )
        ceJDL = self.computingElement.getJDL()
        resourceJDL = ceJDL['Value']
      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    ceJDL = self.computingElement.getJDL()
    resourceJDL = ceJDL['Value']
    self.log.verbose( resourceJDL )
    start = time.time()
    jobRequest = self.__requestJob( resourceJDL )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    if not jobRequest['OK']:
      if re.search( 'No work available', jobRequest['Message'] ):
        self.log.info( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.info( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        return S_OK( jobRequest['Message'] )

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo.keys():
      if not key in matcherParams:
        value = matcherInfo[key]
        optimizerParams[key] = value

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'SystemConfig' ):
      self.log.warn( 'Job has no system configuration defined in JDL parameters' )
      systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
      self.log.info( 'Setting system config to /LocalSite/Architecture = %s since it was not specified' % systemConfig )
      if not systemConfig:
        self.log.warn( '/LocalSite/Architecture is not defined' )
      params['SystemConfig'] = systemConfig
    else:
      systemConfig = params['SystemConfig']
      if systemConfig.lower() == 'any':
        systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
        self.log.info( 'Setting SystemConfig = /LocalSite/Architecture =',
                       '"%s" since it was set to "ANY" in the job description' % systemConfig )
        if not systemConfig:
          self.log.warn( '/LocalSite/Architecture is not defined' )
        params['SystemConfig'] = systemConfig

    if not params.has_key( 'MaxCPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s, SystemConfig=%s' % ( jobID, jobType, systemConfig ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      self.__setJobParam( jobID, 'MatcherServiceTime', str( matchTime ) )
      if self.gridCEQueue:
        self.__setJobParam( jobID, 'GridCEQueue', self.gridCEQueue )
      self.__report( jobID, 'Matched', 'Job Received by Agent' )
      self.__setJobSite( jobID, self.siteName )
      self.__reportPilotInfo( jobID )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ] )
      if 'Value' in result and result[ 'Value' ]:
        proxyChain = result[ 'Value' ]

      # Is this necessary at all?
      saveJDL = self.__saveJobJDLRequest( jobID, jobJDL )
      #self.__report(jobID,'Matched','Job Prepared to Submit')

      resourceParameters = self.__getJDLParameters( resourceJDL )
      if not resourceParameters['OK']:
        return resourceParameters
      resourceParams = resourceParameters['Value']

      software = self.__checkInstallSoftware( jobID, params, resourceParams )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg )

      self.log.verbose( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, resourceParams, optimizerParams, jobJDL, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'] )

      self.log.verbose( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception' )

    result = self.timeLeftUtil.getTimeLeft( 0.0 )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # if the batch system is not defined used the CPUNormalizationFactor 
          # defined locally
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )

  #############################################################################
  def __getCPUTimeLeft( self ):
    """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
    utime, stime, cutime, cstime, elapsed = os.times()
    cpuTime = utime + stime + cutime
    self.log.info( 'Current raw CPU time consumed is %s' % cpuTime )
    timeleft = self.timeLeft - cpuTime * self.cpuFactor
    return timeleft

  #############################################################################
  def __changeProxy( self, oldProxy, newProxy ):
    """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
    self.log.verbose( 'Log proxy change (to be instrumented)' )
    return S_OK()

  #############################################################################
  def __setupProxy( self, ownerDN, ownerGroup ):
    """
    Retrieve a proxy for the execution of the job
    """
    if gConfig.getValue( '/DIRAC/Security/UseServerCertificate' , False ):
      proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
      if not proxyResult['OK']:
        self.log.error( 'Invalid Proxy', proxyResult['Message'] )
        return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
      return S_OK( proxyResult['Value'] )
    else:
      ret = getProxyInfo( disableVOMS = True )
      if not ret['OK']:
        self.log.error( 'Invalid Proxy', ret['Message'] )
        return S_ERROR( 'Invalid Proxy' )

      proxyChain = ret['Value']['chain']
      if not 'groupProperties' in ret['Value']:
        print ret['Value']
        print proxyChain.dumpAllToString()
        self.log.error( 'Invalid Proxy', 'Group has no properties defined' )
        return S_ERROR( 'Proxy has no group properties defined' )

      if Properties.GENERIC_PILOT in ret['Value']['groupProperties']:
        proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
        if not proxyResult['OK']:
          self.log.error( 'Invalid Proxy', proxyResult['Message'] )
          return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
        proxyChain = proxyResult['Value']

    return S_OK( proxyChain )

  #############################################################################
  def __requestProxyFromProxyManager( self, ownerDN, ownerGroup ):
    """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

    self.log.info( "Requesting proxy for %s@%s" % ( ownerDN, ownerGroup ) )
    token = gConfig.getValue( "/Security/ProxyToken", "" )
    if not token:
      self.log.info( "No token defined. Trying to download proxy without token" )
      token = False
    retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup,
                                                          self.defaultProxyLength, token )
    if not retVal[ 'OK' ]:
      self.log.error( 'Could not retrieve proxy' )
      self.log.warn( retVal )
      os.system( 'dirac-proxy-info' )
      sys.stdout.flush()
      return S_ERROR( 'Error retrieving proxy' )

    chain = retVal[ 'Value' ]
    return S_OK( chain )

  #############################################################################
  def __checkInstallSoftware( self, jobID, jobParams, resourceParams ):
    """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
    if not jobParams.has_key( 'SoftwareDistModule' ):
      msg = 'Job has no software installation requirement'
      self.log.verbose( msg )
      return S_OK( msg )

    self.__report( jobID, 'Matched', 'Installing Software' )
    softwareDist = jobParams['SoftwareDistModule']
    #HACK: Delete when svn repo is in production!
    softwareDist = softwareDist.replace( "DIRAC.LHCbSystem.", "LHCbDIRAC.Core." )
    #END OF HACK
    self.log.verbose( 'Found VO Software Distribution module: %s' % ( softwareDist ) )
    argumentsDict = {'Job':jobParams, 'CE':resourceParams}
    moduleFactory = ModuleFactory()
    moduleInstance = moduleFactory.getModule( softwareDist, argumentsDict )
    if not moduleInstance['OK']:
      return moduleInstance

    module = moduleInstance['Value']
    result = module.execute()
    return result

  #############################################################################
  def __submitJob( self, jobID, jobParams, resourceParams, optimizerParams, jobJDL, proxyChain ):
    """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
    result = self.__createJobWrapper( jobID, jobParams, resourceParams, optimizerParams )

    if not result['OK']:
      return result

    wrapperFile = result['Value']
    self.__report( jobID, 'Matched', 'Submitted To CE' )

    wrapperName = os.path.basename( wrapperFile )
    self.log.info( 'Submitting %s to %sCE' % ( wrapperName, self.ceName ) )

    #Pass proxy to the CE
    proxy = proxyChain.dumpAllToString()
    if not proxy['OK']:
      self.log.error( proxy )
      return S_ERROR( 'Payload Proxy Not Found' )

    payloadProxy = proxy['Value']
    # FIXME: how can we set the batchID before we submit, this makes no sense
    batchID = 'dc%s' % ( jobID )
    submission = self.computingElement.submitJob( wrapperFile, payloadProxy )

    ret = S_OK( 'Job submitted' )

    if submission['OK']:
      batchID = submission['Value']
      self.log.info( 'Job %s submitted as %s' % ( jobID, batchID ) )
      self.log.verbose( 'Set JobParameter: Local batch ID %s' % ( batchID ) )
      self.__setJobParam( jobID, 'LocalBatchID', str( batchID ) )
      if 'PayloadFailed' in submission:
        ret['PayloadFailed'] = submission['PayloadFailed']
        return ret
      time.sleep( self.jobSubmissionDelay )
    else:
      self.log.error( 'Job submission failed', jobID )
      self.__setJobParam( jobID, 'ErrorMessage', '%s CE Submission Error' % ( self.ceName ) )
      if 'ReschedulePayload' in submission:
        rescheduleFailedJob( jobID, submission['Message'], self.__report )
      return S_ERROR( '%s CE Submission Error: %s' % ( self.ceName, submission['Message'] ) )

    return ret

  #############################################################################
  def __createJobWrapper( self, jobID, jobParams, resourceParams, optimizerParams ):
    """This method creates a job wrapper filled with the CE and Job parameters
       to executed the job.
    """
    arguments = {'Job':jobParams,
                 'CE':resourceParams,
                 'Optimizer':optimizerParams}
    self.log.verbose( 'Job arguments are: \n %s' % ( arguments ) )

    workingDir = gConfig.getValue( '/LocalSite/WorkingDirectory', self.siteRoot )
    if not os.path.exists( '%s/job/Wrapper' % ( workingDir ) ):
      try:
        os.makedirs( '%s/job/Wrapper' % ( workingDir ) )
      except Exception:
        self.log.exception()
        return S_ERROR( 'Could not create directory for wrapper script' )

    jobWrapperFile = '%s/job/Wrapper/Wrapper_%s' % ( workingDir, jobID )
    if os.path.exists( jobWrapperFile ):
      self.log.verbose( 'Removing existing Job Wrapper for %s' % ( jobID ) )
      os.remove( jobWrapperFile )
    fd = open( self.jobWrapperTemplate, 'r' )
    wrapperTemplate = fd.read()
    fd.close()

    dateStr = time.strftime( "%Y-%m-%d", time.localtime( time.time() ) )
    timeStr = time.strftime( "%H:%M", time.localtime( time.time() ) )
    date_time = '%s %s' % ( dateStr, timeStr )
    signature = __RCSID__
    dPython = sys.executable

    systemConfig = ''
    if jobParams.has_key( 'SystemConfig' ):
      systemConfig = jobParams['SystemConfig']
      self.log.verbose( 'Job system configuration requirement is %s' % ( systemConfig ) )
      if resourceParams.has_key( 'Root' ):
        jobPython = '%s/%s/bin/python' % ( resourceParams['Root'], systemConfig )
        if os.path.exists( jobPython ):
          self.log.verbose( 'Found local python for job:\n%s' % ( jobPython ) )
          dPython = jobPython
        else:
          if systemConfig == 'ANY':
            self.log.verbose( 'Using standard available python %s for job' % ( dPython ) )
          else:
            self.log.warn( 'Job requested python \n%s\n but this is not available locally' % ( jobPython ) )
      else:
        self.log.warn( 'No LocalSite/Root defined' )
    else:
      self.log.warn( 'Job has no system configuration requirement' )

    if not systemConfig or systemConfig.lower() == 'any':
      systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
      if not systemConfig:
        self.log.warn( 'Could not establish SystemConfig' )

    logLevel = self.defaultLogLevel
    if jobParams.has_key( 'LogLevel' ):
      logLevel = jobParams['LogLevel']
      self.log.info( 'Found Job LogLevel JDL parameter with value: %s' % ( logLevel ) )
    else:
      self.log.info( 'Applying default LogLevel JDL parameter with value: %s' % ( logLevel ) )

    realPythonPath = os.path.realpath( dPython )
    self.log.debug( 'Real python path after resolving links is:' )
    self.log.debug( realPythonPath )
    dPython = realPythonPath

    siteRootPython = self.siteRoot
    self.log.debug( 'DIRACPython is:\n%s' % dPython )
    self.log.debug( 'SiteRootPythonDir is:\n%s' % siteRootPython )
    wrapperTemplate = wrapperTemplate.replace( "@SIGNATURE@", str( signature ) )
    wrapperTemplate = wrapperTemplate.replace( "@JOBID@", str( jobID ) )
    wrapperTemplate = wrapperTemplate.replace( "@DATESTRING@", str( date_time ) )
    wrapperTemplate = wrapperTemplate.replace( "@JOBARGS@", str( arguments ) )
    wrapperTemplate = wrapperTemplate.replace( "@SITEPYTHON@", str( siteRootPython ) )
    wrapper = open ( jobWrapperFile, "w" )
    wrapper.write( wrapperTemplate )
    wrapper.close ()
    jobExeFile = '%s/job/Wrapper/Job%s' % ( workingDir, jobID )
    jobFileContents = \
"""#!/bin/sh
%s %s -o LogLevel=%s
""" % ( dPython, jobWrapperFile, logLevel )
    jobFile = open( jobExeFile, 'w' )
    jobFile.write( jobFileContents )
    jobFile.close()
    return S_OK( jobExeFile )

  #############################################################################
  def __saveJobJDLRequest( self, jobID, jobJDL ):
    """Save job JDL local to JobAgent.
    """
    classAdJob = ClassAd( jobJDL )
    classAdJob.insertAttributeString( 'LocalCE', self.ceName )
    jdlFileName = jobID + '.jdl'
    jdlFile = open( jdlFileName, 'w' )
    jdl = classAdJob.asJDL()
    jdlFile.write( jdl )
    jdlFile.close()
    return S_OK( jdlFileName )

  #############################################################################
  def __requestJob( self, resourceJDL ):
    """Request a single job from the matcher service.
    """
    try:
      matcher = RPCClient( 'WorkloadManagement/Matcher', timeout = 600 )
      result = matcher.requestJob( resourceJDL )
      return result
    except Exception, x:
      self.log.exception( lException = x )
      return S_ERROR( 'Job request to matcher service failed with exception' )
Пример #7
0
class JobAgent(AgentModule):
    """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance
    """
        # Disable monitoring
        self.am_setOption('MonitoringEnabled', False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption('MaxCycles', loops)

        ceType = self.am_getOption('CEType', 'InProcess')
        localCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if localCE:
            self.log.info('Defining CE from local configuration = %s' %
                          localCE)
            ceType = localCE

        # Create backend Computing Element
        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance['OK']:
            self.log.warn(ceInstance['Message'])
            return ceInstance
        self.computingElement = ceInstance['Value']

        result = self.computingElement.getDescription()
        if not result['OK']:
            self.log.warn("Can not get the CE description")
            return result
        ceDict = result['Value']
        self.timeLeft = ceDict.get('CPUTime', 0.0)
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft)

        self.initTimes = os.times()
        # Localsite options
        self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               'Unknown')
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', 86400 * 5)
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          0.0)
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
        self.fillingMode = self.am_getOption('FillingModeFlag', False)
        self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', 1000)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', True)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        self.extraOptions = gConfig.getValue(
            '/AgentJobRequirements/ExtraOptions', '')
        # Timeleft
        self.timeLeftUtil = TimeLeft()
        self.timeLeftError = ''
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        if self.extraOptions:
            params['Arguments'] += ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for p in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                          'BoincHostName'):
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % submission[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save job JDL local to JobAgent.
    """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString('LocalCE', self.ceName)
        jdlFileName = jobID + '.jdl'
        jdlFile = open(jdlFileName, 'w')
        jdl = classAdJob.asJDL()
        jdlFile.write(jdl)
        jdlFile.close()

    #############################################################################
    def __getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
        cpuTime = sum(os.times()[:-1])
        self.log.info('Current raw CPU time consumed is %s' % cpuTime)
        timeleft = self.timeLeft
        if self.cpuFactor:
            timeleft -= cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
        self.log.verbose('Log proxy change (to be instrumented)')
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job
    """
        if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False):
            proxyResult = self.__requestProxyFromProxyManager(
                ownerDN, ownerGroup)
            if not proxyResult['OK']:
                self.log.error('Failed to setup proxy', proxyResult['Message'])
                return S_ERROR('Failed to setup proxy: %s' %
                               proxyResult['Message'])
            return S_OK(proxyResult['Value'])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret['OK']:
                self.log.error('Invalid Proxy', ret['Message'])
                return S_ERROR('Invalid Proxy')

            proxyChain = ret['Value']['chain']
            if not 'groupProperties' in ret['Value']:
                print ret['Value']
                print proxyChain.dumpAllToString()
                self.log.error('Invalid Proxy',
                               'Group has no properties defined')
                return S_ERROR('Proxy has no group properties defined')

            groupProps = ret['Value']['groupProperties']
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self.__requestProxyFromProxyManager(
                    ownerDN, ownerGroup)
                if not proxyResult['OK']:
                    self.log.error('Invalid Proxy', proxyResult['Message'])
                    return S_ERROR('Failed to setup proxy: %s' %
                                   proxyResult['Message'])
                proxyChain = proxyResult['Value']

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info(
                "No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(
            ownerDN, ownerGroup, self.defaultProxyLength, token)
        if not retVal['OK']:
            self.log.error('Could not retrieve payload proxy',
                           retVal['Message'])
            self.log.warn(retVal)
            os.system('dirac-proxy-info')
            sys.stdout.flush()
            return S_ERROR('Error retrieving proxy')

        chain = retVal['Value']
        return S_OK(chain)

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
        if 'SoftwareDistModule' not in jobParams:
            msg = 'Job has no software installation requirement'
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, 'Matched', 'Installing Software')
        softwareDist = jobParams['SoftwareDistModule']
        self.log.verbose('Found VO Software Distribution module: %s' %
                         (softwareDist))
        argumentsDict = {'Job': jobParams, 'CE': resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance['OK']:
            return moduleInstance

        module = moduleInstance['Value']
        return module.execute()

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams,
                    proxyChain):
        """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
        logLevel = self.am_getOption('DefaultLogLevel', 'INFO')
        defaultWrapperLocation = self.am_getOption(
            'JobWrapperTemplate',
            'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py')
        result = createJobWrapper(
            jobID,
            jobParams,
            resourceParams,
            optimizerParams,
            extraOptions=self.extraOptions,
            defaultWrapperLocation=defaultWrapperLocation,
            log=self.log,
            logLevel=logLevel)
        if not result['OK']:
            return result

        wrapperFile = result['Value']
        self.__report(jobID, 'Matched', 'Submitted To CE')

        self.log.info('Submitting %s to %sCE' %
                      (os.path.basename(wrapperFile), self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy['OK']:
            self.log.error('Invalid proxy', proxy)
            return S_ERROR('Payload Proxy Not Found')

        payloadProxy = proxy['Value']
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = 'dc%s' % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy)

        ret = S_OK('Job submitted')

        if submission['OK']:
            batchID = submission['Value']
            self.log.info('Job %s submitted as %s' % (jobID, batchID))
            self.log.verbose('Set JobParameter: Local batch ID %s' % (batchID))
            self.__setJobParam(jobID, 'LocalBatchID', str(batchID))
            if 'PayloadFailed' in submission:
                ret['PayloadFailed'] = submission['PayloadFailed']
                return ret
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error('Job submission failed', jobID)
            self.__setJobParam(jobID, 'ErrorMessage',
                               '%s CE Submission Error' % (self.ceName))
            if 'ReschedulePayload' in submission:
                rescheduleFailedJob(jobID, submission['Message'])
                return S_OK(
                )  # Without this job is marked as failed at line 265 above
            else:
                if 'Value' in submission:
                    self.log.error(
                        'Error in DIRAC JobWrapper:',
                        'exit code = %s' % (str(submission['Value'])))
                # make sure the Job is declared Failed
                self.__report(jobID, 'Failed', submission['Message'])
            return S_ERROR('%s CE Submission Error: %s' %
                           (self.ceName, submission['Message']))

        return ret

    #############################################################################
    def __requestJob(self, ceDict):
        """Request a single job from the matcher service.
    """
        try:
            matcher = RPCClient('WorkloadManagement/Matcher', timeout=600)
            result = matcher.requestJob(ceDict)
            return result
        except Exception as x:
            self.log.exception(lException=x)
            return S_ERROR(
                "Job request to matcher service failed with exception")

    #############################################################################
    def __getJDLParameters(self, jdl):
        """Returns a dictionary of JDL parameters.
    """
        try:
            parameters = {}
            #      print jdl
            if not re.search('\[', jdl):
                jdl = '[' + jdl + ']'
            classAdJob = ClassAd(jdl)
            paramsDict = classAdJob.contents
            for param, value in paramsDict.items():
                if value.strip().startswith('{'):
                    self.log.debug('Found list type parameter %s' % (param))
                    rawValues = value.replace('{',
                                              '').replace('}', '').replace(
                                                  '"', '').split()
                    valueList = []
                    for val in rawValues:
                        if re.search(',$', val):
                            valueList.append(val[:-1])
                        else:
                            valueList.append(val)
                    parameters[param] = valueList
                else:
                    parameters[param] = value.replace('"', '').replace(
                        '{', '"{').replace('}', '}"')
                    self.log.debug('Found standard parameter %s: %s' %
                                   (param, parameters[param]))
            return S_OK(parameters)
        except Exception, x:
            self.log.exception(lException=x)
            return S_ERROR('Exception while extracting JDL parameters for job')
Пример #8
0
class JobAgent(AgentModule):
  """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

  def __init__(self, agentName, loadName, baseAgentName=False, properties=None):
    """ Just defines some default parameters
    """
    if not properties:
      properties = {}
    super(JobAgent, self).__init__(agentName, loadName, baseAgentName, properties)

    self.ceName = 'InProcess'
    self.computingElement = None
    self.timeLeft = 0.0

    self.initTimes = os.times()
    # Localsite options
    self.siteName = 'Unknown'
    self.pilotReference = 'Unknown'
    self.defaultProxyLength = 86400 * 5
    # Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = 0.0
    self.jobSubmissionDelay = 10
    self.fillingMode = False
    self.minimumTimeLeft = 1000
    self.stopOnApplicationFailure = True
    self.stopAfterFailedMatches = 10
    self.jobCount = 0
    self.matchFailedCount = 0
    self.extraOptions = ''
    # Timeleft
    self.timeLeftUtil = None
    self.timeLeftError = ''
    self.pilotInfoReportedFlag = False

  #############################################################################
  def initialize(self, loops=0):
    """Sets default parameters and creates CE instance
    """
    # Disable monitoring, logLevel INFO, limited cycles
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    ceType = self.am_getOption('CEType', 'InProcess')
    localCE = gConfig.getValue('/LocalSite/LocalCE', '')
    if localCE:
      self.log.info('Defining CE from local configuration = %s' % localCE)
      ceType = localCE

    # Create backend Computing Element
    ceFactory = ComputingElementFactory()
    self.ceName = ceType
    ceInstance = ceFactory.getCE(ceType)
    if not ceInstance['OK']:
      self.log.warn(ceInstance['Message'])
      return ceInstance
    self.computingElement = ceInstance['Value']

    result = self.computingElement.getDescription()
    if not result['OK']:
      self.log.warn("Can not get the CE description")
      return result
    ceDict = result['Value']
    self.timeLeft = ceDict.get('CPUTime', self.timeLeft)
    self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft)

    self.initTimes = os.times()
    # Localsite options
    self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName)
    self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', self.pilotReference)
    self.defaultProxyLength = gConfig.getValue('/Registry/DefaultProxyLifeTime', self.defaultProxyLength)
    # Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', self.cpuFactor)
    self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', self.jobSubmissionDelay)
    self.fillingMode = self.am_getOption('FillingModeFlag', self.fillingMode)
    self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', self.minimumTimeLeft)
    self.stopOnApplicationFailure = self.am_getOption('StopOnApplicationFailure', self.stopOnApplicationFailure)
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)
    self.extraOptions = gConfig.getValue('/AgentJobRequirements/ExtraOptions', self.extraOptions)
    # Timeleft
    self.timeLeftUtil = TimeLeft()
    return S_OK()

  #############################################################################
  def execute(self):
    """The JobAgent execution method.
    """
    if self.jobCount:
      # Temporary mechanism to pass a shutdown message to the agent
      if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish('Node is being drained by an operator')
      # Only call timeLeft utility after a job has been picked up
      self.log.info('Attempting to check CPU time left for filling mode')
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn(self.timeLeftError)
          return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        if self.timeLeft <= self.minimumTimeLeft:
          return self.__finish('No more time left')
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
          return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join('.', self.extraOptions)
        else:
          localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

      else:
        return self.__finish('Filling Mode is Disabled')

    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
      self.log.info('Resource is not available')
      self.log.info(result['Message'])
      return self.__finish('CE Not Available')

    self.log.info(result['Message'])

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
      if runningJobs:
        self.log.info('No available slots with %d running jobs' % runningJobs)
        return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
      else:
        self.log.info('CE is not available')
        return self.__finish('CE Not Available')

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']
    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
      ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update(requirementsDict)
      self.log.info('Requirements:', requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = MatcherClient().requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
      if re.search('No match found', jobRequest['Message']):
        self.log.notice('Job request OK: %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("seconds timeout") != -1:
        self.log.error('Timeout while requesting job', jobRequest['Message'])
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("Pilot version does not match") != -1:
        errorMsg = 'Pilot version does not match the production version'
        self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, ''))
        return S_ERROR(jobRequest['Message'])
      else:
        self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if param not in matcherInfo:
        self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
        return self.__finish('Matcher Failed')
      elif not matcherInfo[param]:
        self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
        return self.__finish('Matcher Failed')
      else:
        self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo:
      if key not in matcherParams:
        optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
      self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
      self.log.warn(parameters['Message'])
      return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report(jobID, 'Failed', msg)
      self.log.warn(msg)
      return self.__finish('JDL Problem')
    else:
      jobID = params['JobID']

    if 'JobType' not in params:
      self.log.warn('Job has no JobType defined in JDL parameters')
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if 'CPUTime' not in params:
      self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirement for a number of processors
    processors = int(params.get('NumberOfProcessors', 1))
    wholeNode = 'WholeNode' in params

    if self.extraOptions:
      params['Arguments'] += ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
      jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
      jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

      if 'BOINC_JOB_ID' in os.environ:
        # Report BOINC environment
        for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
          jobReport.setJobParameter(thisp, gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False)

      jobReport.setJobStatus('Matched', 'Job Received by Agent')
      result = self.__setupProxy(ownerDN, jobGroup)
      if not result['OK']:
        return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
      proxyChain = result.get('Value')

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest(jobID, jobJDL)

      software = self.__checkInstallSoftware(jobID, params, ceDict)
      if not software['OK']:
        self.log.error('Failed to install software for job', '%s' % (jobID))
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

      self.log.debug('Before %sCE submitJob()' % (self.ceName))
      result = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode)
      if not result['OK']:
        self.__report(jobID, 'Failed', result['Message'])
        return self.__finish(result['Message'])
      elif 'PayloadFailed' in result:
        # Do not keep running and do not overwrite the Payload error
        message = 'Payload execution failed with error code %s' % result['PayloadFailed']
        if self.stopOnApplicationFailure:
          return self.__finish(message, self.stopOnApplicationFailure)
        else:
          self.log.info(message)

      self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
      self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
      return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        # if the batch system is not defined, use the process time and the CPU normalization defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')

  #############################################################################
  def __saveJobJDLRequest(self, jobID, jobJDL):
    """Save job JDL local to JobAgent.
    """
    classAdJob = ClassAd(jobJDL)
    classAdJob.insertAttributeString('LocalCE', self.ceName)
    jdlFileName = jobID + '.jdl'
    jdlFile = open(jdlFileName, 'w')
    jdl = classAdJob.asJDL()
    jdlFile.write(jdl)
    jdlFile.close()

  #############################################################################
  def __getCPUTimeLeft(self):
    """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
    cpuTime = sum(os.times()[:-1])
    self.log.info('Current raw CPU time consumed is %s' % cpuTime)
    timeleft = self.timeLeft
    if self.cpuFactor:
      timeleft -= cpuTime * self.cpuFactor
    return timeleft

  #############################################################################
  def __setupProxy(self, ownerDN, ownerGroup):
    """
    Retrieve a proxy for the execution of the job
    """
    if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False):
      proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
      if not proxyResult['OK']:
        self.log.error('Failed to setup proxy', proxyResult['Message'])
        return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message'])
      return S_OK(proxyResult['Value'])
    else:
      ret = getProxyInfo(disableVOMS=True)
      if not ret['OK']:
        self.log.error('Invalid Proxy', ret['Message'])
        return S_ERROR('Invalid Proxy')

      proxyChain = ret['Value']['chain']
      if 'groupProperties' not in ret['Value']:
        print ret['Value']
        print proxyChain.dumpAllToString()
        self.log.error('Invalid Proxy', 'Group has no properties defined')
        return S_ERROR('Proxy has no group properties defined')

      groupProps = ret['Value']['groupProperties']
      if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
        proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
        if not proxyResult['OK']:
          self.log.error('Invalid Proxy', proxyResult['Message'])
          return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message'])
        proxyChain = proxyResult['Value']

    return S_OK(proxyChain)

  #############################################################################
  def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
    """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

    self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
    token = gConfig.getValue("/Security/ProxyToken", "")
    if not token:
      self.log.info("No token defined. Trying to download proxy without token")
      token = False
    retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup,
                                                         self.defaultProxyLength, token)
    if not retVal['OK']:
      self.log.error('Could not retrieve payload proxy', retVal['Message'])
      self.log.warn(retVal)
      os.system('dirac-proxy-info')
      sys.stdout.flush()
      return S_ERROR('Error retrieving proxy')

    chain = retVal['Value']
    return S_OK(chain)

  #############################################################################
  def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
    """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
    if 'SoftwareDistModule' not in jobParams:
      msg = 'Job has no software installation requirement'
      self.log.verbose(msg)
      return S_OK(msg)

    self.__report(jobID, 'Matched', 'Installing Software')
    softwareDist = jobParams['SoftwareDistModule']
    self.log.verbose('Found VO Software Distribution module: %s' % (softwareDist))
    argumentsDict = {'Job': jobParams, 'CE': resourceParams}
    moduleFactory = ModuleFactory()
    moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
    if not moduleInstance['OK']:
      return moduleInstance

    module = moduleInstance['Value']
    return module.execute()

  #############################################################################
  def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams,
                  proxyChain, processors, wholeNode=False):
    """ Submit job to the Computing Element instance after creating a custom
        Job Wrapper with the available job parameters.
    """
    logLevel = self.am_getOption('DefaultLogLevel', 'INFO')
    defaultWrapperLocation = self.am_getOption('JobWrapperTemplate',
                                               'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py')
    jobDesc = {"jobID": jobID,
               "jobParams": jobParams,
               "resourceParams": resourceParams,
               "optimizerParams": optimizerParams,
               "extraOptions": self.extraOptions,
               "defaultWrapperLocation": defaultWrapperLocation}
    result = createJobWrapper(log=self.log, logLevel=logLevel, **jobDesc)
    if not result['OK']:
      return result

    wrapperFile = result['Value']
    self.__report(jobID, 'Matched', 'Submitted To CE')

    self.log.info('Submitting JobWrapper %s to %sCE' % (os.path.basename(wrapperFile), self.ceName))

    # Pass proxy to the CE
    proxy = proxyChain.dumpAllToString()
    if not proxy['OK']:
      self.log.error('Invalid proxy', proxy)
      return S_ERROR('Payload Proxy Not Found')

    payloadProxy = proxy['Value']
    submission = self.computingElement.submitJob(wrapperFile, payloadProxy,
                                                 numberOfProcessors=processors,
                                                 wholeNode=wholeNode,
                                                 jobDesc=jobDesc,
                                                 log=self.log,
                                                 logLevel=logLevel)
    ret = S_OK('Job submitted')

    if submission['OK']:
      batchID = submission['Value']
      self.log.info('Job %s submitted as %s' % (jobID, batchID))
      if 'PayloadFailed' in submission:
        ret['PayloadFailed'] = submission['PayloadFailed']
        return ret
      time.sleep(self.jobSubmissionDelay)
    else:
      self.log.error('Job submission failed', jobID)
      self.__setJobParam(jobID, 'ErrorMessage', '%s CE Submission Error' % (self.ceName))
      if 'ReschedulePayload' in submission:
        rescheduleFailedJob(jobID, submission['Message'])
        return S_OK()  # Without this, the job is marked as failed
      else:
        if 'Value' in submission:
          self.log.error('Error in DIRAC JobWrapper:', 'exit code = %s' % (str(submission['Value'])))
      return S_ERROR('%s CE Error: %s' % (self.ceName, submission['Message']))

    return ret

  #############################################################################
  def __getJDLParameters(self, jdl):
    """Returns a dictionary of JDL parameters.
    """
    try:
      parameters = {}
#      print jdl
      if not re.search(r'\[', jdl):
        jdl = '[' + jdl + ']'
      classAdJob = ClassAd(jdl)
      paramsDict = classAdJob.contents
      for param, value in paramsDict.items():
        if value.strip().startswith('{'):
          self.log.debug('Found list type parameter %s' % (param))
          rawValues = value.replace('{', '').replace('}', '').replace('"', '').split()
          valueList = []
          for val in rawValues:
            if re.search(',$', val):
              valueList.append(val[:-1])
            else:
              valueList.append(val)
          parameters[param] = valueList
        else:
          parameters[param] = value.replace('"', '').replace('{', '"{').replace('}', '}"')
          self.log.debug('Found standard parameter %s: %s' % (param, parameters[param]))
      return S_OK(parameters)
    except Exception as x:
      self.log.exception(lException=x)
      return S_ERROR('Exception while extracting JDL parameters for job')

  #############################################################################
  def __report(self, jobID, status, minorStatus):
    """Wraps around setJobStatus of state update client
    """
    jobStatus = JobStateUpdateClient().setJobStatus(int(jobID), status, minorStatus, 'JobAgent@%s' % self.siteName)
    self.log.verbose('setJobStatus(%s,%s,%s,%s)' % (jobID, status, minorStatus, 'JobAgent@%s' % self.siteName))
    if not jobStatus['OK']:
      self.log.warn(jobStatus['Message'])

    return jobStatus

  #############################################################################
  def __setJobParam(self, jobID, name, value):
    """Wraps around setJobParameter of state update client
    """
    jobParam = JobStateUpdateClient().setJobParameter(int(jobID), str(name), str(value))
    self.log.verbose('setJobParameter(%s,%s,%s)' % (jobID, name, value))
    if not jobParam['OK']:
      self.log.warn(jobParam['Message'])

    return jobParam

  #############################################################################
  def __finish(self, message, stop=True):
    """Force the JobAgent to complete gracefully.
    """
    if stop:
      self.log.info('JobAgent will stop with message "%s", execution complete.' % message)
      self.am_stopExecution()
      return S_ERROR(message)
    else:
      return S_OK(message)

  #############################################################################
  def __rescheduleFailedJob(self, jobID, message, stop=True):
    """
    Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager
    """

    self.log.warn('Failure during %s' % (message))

    jobReport = JobReport(int(jobID), 'JobAgent@%s' % self.siteName)

    # Setting a job parameter does not help since the job will be rescheduled,
    # instead set the status with the cause and then another status showing the
    # reschedule operation.

    jobReport.setJobStatus(status='Rescheduled',
                           application=message,
                           sendFlag=True)

    self.log.info('Job will be rescheduled')
    result = JobManagerClient().rescheduleJob(jobID)
    if not result['OK']:
      self.log.error('Failed to reschedule job', result['Message'])
      return self.__finish('Problem Rescheduling Job', stop)

    self.log.info('Job Rescheduled %s' % (jobID))
    return self.__finish('Job Rescheduled', stop)

  #############################################################################
  def finalize(self):
    """ Job Agent finalization method
    """

    gridCE = gConfig.getValue('/LocalSite/GridCE', '')
    queue = gConfig.getValue('/LocalSite/CEQueue', '')
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.setPilotStatus(str(self.pilotReference), 'Done', gridCE,
                                     'Report from JobAgent', self.siteName, queue)
    if not result['OK']:
      self.log.warn(result['Message'])

    return S_OK()
Пример #9
0
class JobAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance
    """
        #Disable monitoring
        self.am_setOption('MonitoringEnabled', False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption('MaxCycles', loops)

        ceType = self.am_getOption('CEType', 'InProcess')
        localCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if localCE:
            self.log.info('Defining CE from local configuration = %s' %
                          localCE)
            ceType = localCE

        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance['OK']:
            self.log.warn(ceInstance['Message'])
            return ceInstance

        self.initTimes = os.times()

        self.computingElement = ceInstance['Value']
        self.diracRoot = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
        #Localsite options
        self.siteRoot = gConfig.getValue('/LocalSite/Root', os.getcwd())
        self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               'Unknown')
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', 86400 * 5)
        #Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          0.0)
        defaultWrapperLocation = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py'
        self.jobWrapperTemplate = os.path.join(
            self.diracRoot,
            self.am_getOption('JobWrapperTemplate', defaultWrapperLocation))
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
        self.defaultLogLevel = self.am_getOption('DefaultLogLevel', 'info')
        self.fillingMode = self.am_getOption('FillingModeFlag', False)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', True)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        #Timeleft
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0)
        self.gridCEQueue = gConfig.getValue(
            '/Resources/Computing/CEDefaults/GridCEQueue', '')
        self.timeLeftError = ''
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            #Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])
            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error(jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                self.log.error(jobRequest['Message'])
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        jobID = matcherInfo['JobID']
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag',
                                                     False)
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if not params.has_key('JobID'):
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if not params.has_key('JobType'):
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if not params.has_key('SystemConfig'):
            self.log.warn(
                'Job has no system configuration defined in JDL parameters')
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info(
                'Setting system config to /LocalSite/Architecture = %s since it was not specified'
                % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig
        else:
            systemConfig = params['SystemConfig']
            if systemConfig.lower() == 'any':
                systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
                self.log.info(
                    'Setting SystemConfig = /LocalSite/Architecture =',
                    '"%s" since it was set to "ANY" in the job description' %
                    systemConfig)
                if not systemConfig:
                    self.log.warn('/LocalSite/Architecture is not defined')
                params['SystemConfig'] = systemConfig

        if not params.has_key('CPUTime'):
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        self.log.verbose('Job request successful: \n %s' %
                         (jobRequest['Value']))
        self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' %
                      (jobID, jobType, systemConfig))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter('GridCEQueue',
                                          self.gridCEQueue,
                                          sendFlag=False)

            if os.environ.has_key('BOINC_JOB_ID'):
                # Report BOINC environment
                for p in [
                        'BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                        'BoincHostName'
                ]:
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            if 'Value' in result and result['Value']:
                proxyChain = result['Value']

            # Is this necessary at all?
            saveJDL = self.__saveJobJDLRequest(jobID, jobJDL)
            #self.__report(jobID,'Matched','Job Prepared to Submit')

            #resourceParameters = self.__getJDLParameters( resourceJDL )
            #if not resourceParameters['OK']:
            #  return resourceParameters
            #resourceParams = resourceParameters['Value']

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job %s' %
                               (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.verbose('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, jobJDL, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    'Payload execution failed with error code %s' %
                    submission['PayloadFailed'], self.stopOnApplicationFailure)

            self.log.verbose('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                if self.cpuFactor:
                    # if the batch system is not defined used the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')

    #############################################################################
    def __getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
        utime, stime, cutime, cstime, elapsed = os.times()
        cpuTime = utime + stime + cutime
        self.log.info('Current raw CPU time consumed is %s' % cpuTime)
        timeleft = self.timeLeft - cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
        self.log.verbose('Log proxy change (to be instrumented)')
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job
    """
        if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False):
            proxyResult = self.__requestProxyFromProxyManager(
                ownerDN, ownerGroup)
            if not proxyResult['OK']:
                self.log.error('Invalid Proxy', proxyResult['Message'])
                return S_ERROR('Failed to setup proxy: %s' %
                               proxyResult['Message'])
            return S_OK(proxyResult['Value'])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret['OK']:
                self.log.error('Invalid Proxy', ret['Message'])
                return S_ERROR('Invalid Proxy')

            proxyChain = ret['Value']['chain']
            if not 'groupProperties' in ret['Value']:
                print ret['Value']
                print proxyChain.dumpAllToString()
                self.log.error('Invalid Proxy',
                               'Group has no properties defined')
                return S_ERROR('Proxy has no group properties defined')

            groupProps = ret['Value']['groupProperties']
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self.__requestProxyFromProxyManager(
                    ownerDN, ownerGroup)
                if not proxyResult['OK']:
                    self.log.error('Invalid Proxy', proxyResult['Message'])
                    return S_ERROR('Failed to setup proxy: %s' %
                                   proxyResult['Message'])
                proxyChain = proxyResult['Value']

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info(
                "No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(
            ownerDN, ownerGroup, self.defaultProxyLength, token)
        if not retVal['OK']:
            self.log.error('Could not retrieve proxy')
            self.log.warn(retVal)
            os.system('dirac-proxy-info')
            sys.stdout.flush()
            return S_ERROR('Error retrieving proxy')

        chain = retVal['Value']
        return S_OK(chain)

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
        if not jobParams.has_key('SoftwareDistModule'):
            msg = 'Job has no software installation requirement'
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, 'Matched', 'Installing Software')
        softwareDist = jobParams['SoftwareDistModule']
        self.log.verbose('Found VO Software Distribution module: %s' %
                         (softwareDist))
        argumentsDict = {'Job': jobParams, 'CE': resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance['OK']:
            return moduleInstance

        module = moduleInstance['Value']
        result = module.execute()
        return result

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams,
                    jobJDL, proxyChain):
        """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
        result = self.__createJobWrapper(jobID, jobParams, resourceParams,
                                         optimizerParams)

        if not result['OK']:
            return result

        wrapperData = result['Value']
        wrapperFile = wrapperData['execFile']
        self.__report(jobID, 'Matched', 'Submitted To CE')

        wrapperName = os.path.basename(wrapperFile)
        self.log.info('Submitting %s to %sCE' % (wrapperName, self.ceName))

        #Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy['OK']:
            self.log.error(proxy)
            return S_ERROR('Payload Proxy Not Found')

        payloadProxy = proxy['Value']
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = 'dc%s' % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy,
                                                     wrapperData)

        ret = S_OK('Job submitted')

        if submission['OK']:
            batchID = submission['Value']
            self.log.info('Job %s submitted as %s' % (jobID, batchID))
            self.log.verbose('Set JobParameter: Local batch ID %s' % (batchID))
            self.__setJobParam(jobID, 'LocalBatchID', str(batchID))
            if 'PayloadFailed' in submission:
                ret['PayloadFailed'] = submission['PayloadFailed']
                return ret
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error('Job submission failed', jobID)
            self.__setJobParam(jobID, 'ErrorMessage',
                               '%s CE Submission Error' % (self.ceName))
            if 'ReschedulePayload' in submission:
                rescheduleFailedJob(jobID, submission['Message'],
                                    self.__report)
            else:
                if 'Value' in submission:
                    self.log.error(
                        'Error in DIRAC JobWrapper:',
                        'exit code = %s' % (str(submission['Value'])))
                # make sure the Job is declared Failed
                self.__report(jobID, 'Failed', submission['Message'])
            return S_ERROR('%s CE Submission Error: %s' %
                           (self.ceName, submission['Message']))

        return ret

    #############################################################################
    def __createJobWrapper(self, jobID, jobParams, resourceParams,
                           optimizerParams):
        """This method creates a job wrapper filled with the CE and Job parameters
       to executed the job.
    """
        arguments = {
            'Job': jobParams,
            'CE': resourceParams,
            'Optimizer': optimizerParams
        }
        self.log.verbose('Job arguments are: \n %s' % (arguments))

        workingDir = gConfig.getValue('/LocalSite/WorkingDirectory',
                                      self.siteRoot)
        if not os.path.exists('%s/job/Wrapper' % (workingDir)):
            try:
                os.makedirs('%s/job/Wrapper' % (workingDir))
            except Exception:
                self.log.exception()
                return S_ERROR('Could not create directory for wrapper script')

        jobWrapperFile = '%s/job/Wrapper/Wrapper_%s' % (workingDir, jobID)
        if os.path.exists(jobWrapperFile):
            self.log.verbose('Removing existing Job Wrapper for %s' % (jobID))
            os.remove(jobWrapperFile)
        fd = open(self.jobWrapperTemplate, 'r')
        wrapperTemplate = fd.read()
        fd.close()

        dateStr = time.strftime("%Y-%m-%d", time.localtime(time.time()))
        timeStr = time.strftime("%H:%M", time.localtime(time.time()))
        date_time = '%s %s' % (dateStr, timeStr)
        signature = __RCSID__
        dPython = sys.executable

        systemConfig = ''
        if jobParams.has_key('SystemConfig'):
            systemConfig = jobParams['SystemConfig']
            self.log.verbose('Job system configuration requirement is %s' %
                             (systemConfig))
            if resourceParams.has_key('Root'):
                jobPython = '%s/%s/bin/python' % (resourceParams['Root'],
                                                  systemConfig)
                if os.path.exists(jobPython):
                    self.log.verbose('Found local python for job:\n%s' %
                                     (jobPython))
                    dPython = jobPython
                else:
                    if systemConfig == 'ANY':
                        self.log.verbose(
                            'Using standard available python %s for job' %
                            (dPython))
                    else:
                        self.log.warn(
                            'Job requested python \n%s\n but this is not available locally'
                            % (jobPython))
            else:
                self.log.warn('No LocalSite/Root defined')
        else:
            self.log.warn('Job has no system configuration requirement')

        if not systemConfig or systemConfig.lower() == 'any':
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            if not systemConfig:
                self.log.warn('Could not establish SystemConfig')

        logLevel = self.defaultLogLevel
        if jobParams.has_key('LogLevel'):
            logLevel = jobParams['LogLevel']
            self.log.info('Found Job LogLevel JDL parameter with value: %s' %
                          (logLevel))
        else:
            self.log.info(
                'Applying default LogLevel JDL parameter with value: %s' %
                (logLevel))

        realPythonPath = os.path.realpath(dPython)
        self.log.debug('Real python path after resolving links is:')
        self.log.debug(realPythonPath)
        dPython = realPythonPath

        siteRootPython = self.siteRoot
        self.log.debug('DIRACPython is:\n%s' % dPython)
        self.log.debug('SiteRootPythonDir is:\n%s' % siteRootPython)
        wrapperTemplate = wrapperTemplate.replace("@SIGNATURE@",
                                                  str(signature))
        wrapperTemplate = wrapperTemplate.replace("@JOBID@", str(jobID))
        wrapperTemplate = wrapperTemplate.replace("@DATESTRING@",
                                                  str(date_time))
        wrapperTemplate = wrapperTemplate.replace("@JOBARGS@", str(arguments))
        wrapperTemplate = wrapperTemplate.replace("@SITEPYTHON@",
                                                  str(siteRootPython))
        wrapper = open(jobWrapperFile, "w")
        wrapper.write(wrapperTemplate)
        wrapper.close()
        jobExeFile = '%s/job/Wrapper/Job%s' % (workingDir, jobID)
        jobFileContents = \
    """#!/bin/sh
%s %s -o LogLevel=%s -o /DIRAC/Security/UseServerCertificate=no
""" % ( dPython, jobWrapperFile, logLevel )
        jobFile = open(jobExeFile, 'w')
        jobFile.write(jobFileContents)
        jobFile.close()

        data = {
            'execFile': jobExeFile,
            'wrapperTemplate': self.jobWrapperTemplate,
            'signature': signature,
            'wrapperFile': jobWrapperFile,
            'logLevel': logLevel,
            'jid': jobID,
            'datetime': date_time,
            'jobArgs': arguments,
            'sitePython': siteRootPython
        }
        return S_OK(data)

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save job JDL local to JobAgent.
    """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString('LocalCE', self.ceName)
        jdlFileName = jobID + '.jdl'
        jdlFile = open(jdlFileName, 'w')
        jdl = classAdJob.asJDL()
        jdlFile.write(jdl)
        jdlFile.close()
        return S_OK(jdlFileName)

    #############################################################################
    def __requestJob(self, ceDict):
        """Request a single job from the matcher service.
    """
        try:
            matcher = RPCClient('WorkloadManagement/Matcher', timeout=600)
            result = matcher.requestJob(ceDict)
            return result
        except Exception, x:
            self.log.exception(lException=x)
            return S_ERROR(
                'Job request to matcher service failed with exception')
Пример #10
0
class JobAgent(AgentModule):
    """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """
    def __init__(self,
                 agentName,
                 loadName,
                 baseAgentName=False,
                 properties=None):
        """ Just defines some default parameters
    """
        if not properties:
            properties = {}
        super(JobAgent, self).__init__(agentName, loadName, baseAgentName,
                                       properties)

        self.ceName = 'InProcess'
        self.computingElement = None
        self.timeLeft = 0.0

        self.initTimes = os.times()
        # Localsite options
        self.siteName = 'Unknown'
        self.pilotReference = 'Unknown'
        self.defaultProxyLength = 86400 * 5
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = 0.0
        self.jobSubmissionDelay = 10
        self.fillingMode = False
        self.minimumTimeLeft = 1000
        self.stopOnApplicationFailure = True
        self.stopAfterFailedMatches = 10
        self.jobCount = 0
        self.matchFailedCount = 0
        self.extraOptions = ''
        # Timeleft
        self.timeLeftUtil = None
        self.timeLeftError = ''
        self.pilotInfoReportedFlag = False

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance
    """
        # Disable monitoring, logLevel INFO, limited cycles
        self.am_setOption('MonitoringEnabled', False)
        self.am_setOption('MaxCycles', loops)

        ceType = self.am_getOption('CEType', self.ceName)
        localCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if localCE:
            self.log.info('Defining CE from local configuration',
                          '= %s' % localCE)
            ceType = localCE

        # Create backend Computing Element
        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance['OK']:
            self.log.warn("Can't instantiate a CE", ceInstance['Message'])
            return ceInstance
        self.computingElement = ceInstance['Value']

        result = self.computingElement.getDescription()
        if not result['OK']:
            self.log.warn("Can not get the CE description")
            return result
        if isinstance(result['Value'], list):
            ceDict = result['Value'][0]
        else:
            ceDict = result['Value']
        self.timeLeft = ceDict.get('CPUTime', self.timeLeft)
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft)

        self.initTimes = os.times()
        # Localsite options
        self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName)
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               self.pilotReference)
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', self.defaultProxyLength)
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          self.cpuFactor)
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay',
                                                    self.jobSubmissionDelay)
        self.fillingMode = self.am_getOption('FillingModeFlag',
                                             self.fillingMode)
        self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft',
                                                 self.minimumTimeLeft)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', self.stopOnApplicationFailure)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)
        self.extraOptions = gConfig.getValue(
            '/AgentJobRequirements/ExtraOptions', self.extraOptions)
        # Timeleft
        self.timeLeftUtil = TimeLeft()
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Temporary mechanism to pass a shutdown message to the agent
            if os.path.exists('/var/lib/dirac_drain'):
                return self.__finish('Node is being drained by an operator')
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(
                        "Disabling filling mode as errors calculating time left",
                        self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('normalized CPU units remaining in slot',
                              self.timeLeft)
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        result = self.computingElement.available()
        if not result['OK']:
            self.log.info('Resource is not available', result['Message'])
            return self.__finish('CE Not Available')

        ceInfoDict = result['CEInfoDict']
        runningJobs = ceInfoDict.get("RunningJobs")
        availableSlots = result['Value']

        if not availableSlots:
            if runningJobs:
                self.log.info('No available slots',
                              '%d running jobs' % runningJobs)
                return S_OK('Job Agent cycle complete with %d running jobs' %
                            runningJobs)
            else:
                self.log.info('CE is not available')
                return self.__finish('CE Not Available')

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result

        # We can have several prioritized job retrieval strategies
        if isinstance(result['Value'], dict):
            ceDictList = [result['Value']]
        elif isinstance(result['Value'], list):
            ceDictList = result['Value']

        for ceDict in ceDictList:

            # Add pilot information
            gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
            if gridCE != 'Unknown':
                ceDict['GridCE'] = gridCE
            if 'PilotReference' not in ceDict:
                ceDict['PilotReference'] = str(self.pilotReference)
            ceDict['PilotBenchmark'] = self.cpuFactor
            ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

            # Add possible job requirements
            result = gConfig.getOptionsDict('/AgentJobRequirements')
            if result['OK']:
                requirementsDict = result['Value']
                ceDict.update(requirementsDict)
                self.log.info('Requirements:', requirementsDict)

            self.log.verbose('CE dict', ceDict)
            start = time.time()
            jobRequest = MatcherClient().requestJob(ceDict)
            matchTime = time.time() - start
            self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
            if jobRequest['OK']:
                break

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK, but no match found',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned',
                                 '%s = %s ' % (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self._getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn('Could Not Extract JDL Parameters',
                          parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        # Job requirement for a number of processors
        processors = int(params.get('NumberOfProcessors', 1))
        wholeNode = 'WholeNode' in params

        if self.extraOptions:
            params['Arguments'] += ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info(
            'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' %
            (jobID, jobType, ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for thisp in ('BoincUserID', 'BoincHostID',
                              'BoincHostPlatform', 'BoincHostName'):
                    jobReport.setJobParameter(thisp,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % thisp,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self._setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self._rescheduleFailedJob(jobID, result['Message'],
                                                 self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self._checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self._rescheduleFailedJob(jobID, errorMsg,
                                                 self.stopOnApplicationFailure)

            self.log.debug('Before %sCE submitJob()' % (self.ceName))
            result = self._submitJob(jobID, params, ceDict, optimizerParams,
                                     proxyChain, processors, wholeNode)
            if not result['OK']:
                self.__report(jobID, 'Failed', result['Message'])
                return self.__finish(result['Message'])
            elif 'PayloadFailed' in result:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % result[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception as subExcept:  # pylint: disable=broad-except
            self.log.exception("Exception in submission",
                               "",
                               lException=subExcept,
                               lExcInfo=True)
            return self._rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self._getCPUTimeLeft()

        return S_OK('Job Agent cycle complete')

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save job JDL local to JobAgent.
    """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString('LocalCE', self.ceName)
        jdlFileName = jobID + '.jdl'
        jdlFile = open(jdlFileName, 'w')
        jdl = classAdJob.asJDL()
        jdlFile.write(jdl)
        jdlFile.close()

    #############################################################################
    def _getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
        cpuTime = sum(os.times()[:-1])
        self.log.info('Current raw CPU time consumed is %s' % cpuTime)
        timeleft = self.timeLeft
        if self.cpuFactor:
            timeleft -= cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def _setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job
    """
        if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False):
            proxyResult = self._requestProxyFromProxyManager(
                ownerDN, ownerGroup)
            if not proxyResult['OK']:
                self.log.error('Failed to setup proxy', proxyResult['Message'])
                return S_ERROR('Failed to setup proxy: %s' %
                               proxyResult['Message'])
            return S_OK(proxyResult['Value'])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret['OK']:
                self.log.error('Invalid Proxy', ret['Message'])
                return S_ERROR('Invalid Proxy')

            proxyChain = ret['Value']['chain']
            if 'groupProperties' not in ret['Value']:
                print(ret['Value'])
                print(proxyChain.dumpAllToString())
                self.log.error('Invalid Proxy',
                               'Group has no properties defined')
                return S_ERROR('Proxy has no group properties defined')

            groupProps = ret['Value']['groupProperties']
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self._requestProxyFromProxyManager(
                    ownerDN, ownerGroup)
                if not proxyResult['OK']:
                    self.log.error('Invalid Proxy', proxyResult['Message'])
                    return S_ERROR('Failed to setup proxy: %s' %
                                   proxyResult['Message'])
                proxyChain = proxyResult['Value']

        return S_OK(proxyChain)

    #############################################################################
    def _requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

        self.log.info("Requesting proxy', 'for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info(
                "No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(
            ownerDN, ownerGroup, self.defaultProxyLength, token)
        if not retVal['OK']:
            self.log.error('Could not retrieve payload proxy',
                           retVal['Message'])
            os.system('dirac-proxy-info')
            sys.stdout.flush()
            return S_ERROR('Error retrieving proxy')

        chain = retVal['Value']
        return S_OK(chain)

    #############################################################################
    def _checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
        if 'SoftwareDistModule' not in jobParams:
            msg = 'Job has no software installation requirement'
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, 'Matched', 'Installing Software')
        softwareDist = jobParams['SoftwareDistModule']
        self.log.verbose('Found VO Software Distribution module',
                         ': %s' % (softwareDist))
        argumentsDict = {'Job': jobParams, 'CE': resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance['OK']:
            return moduleInstance

        module = moduleInstance['Value']
        return module.execute()

    #############################################################################
    def _submitJob(self,
                   jobID,
                   jobParams,
                   resourceParams,
                   optimizerParams,
                   proxyChain,
                   processors,
                   wholeNode=False):
        """ Submit job to the Computing Element instance after creating a custom
        Job Wrapper with the available job parameters.
    """
        logLevel = self.am_getOption('DefaultLogLevel', 'INFO')
        defaultWrapperLocation = self.am_getOption(
            'JobWrapperTemplate',
            'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py')
        jobDesc = {
            "jobID": jobID,
            "jobParams": jobParams,
            "resourceParams": resourceParams,
            "optimizerParams": optimizerParams,
            "extraOptions": self.extraOptions,
            "defaultWrapperLocation": defaultWrapperLocation
        }
        result = createJobWrapper(log=self.log, logLevel=logLevel, **jobDesc)
        if not result['OK']:
            return result

        wrapperFile = result['Value']
        self.__report(jobID, 'Matched', 'Submitted To CE')

        self.log.info(
            'Submitting JobWrapper',
            '%s to %sCE' % (os.path.basename(wrapperFile), self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy['OK']:
            self.log.error('Invalid proxy', proxy)
            return S_ERROR('Payload Proxy Not Found')

        payloadProxy = proxy['Value']
        submission = self.computingElement.submitJob(
            wrapperFile,
            payloadProxy,
            numberOfProcessors=processors,
            wholeNode=wholeNode,
            jobDesc=jobDesc,
            log=self.log,
            logLevel=logLevel)
        ret = S_OK('Job submitted')

        if submission['OK']:
            batchID = submission['Value']
            self.log.info('Job submitted', '%s as %s' % (jobID, batchID))
            if 'PayloadFailed' in submission:
                ret['PayloadFailed'] = submission['PayloadFailed']
                return ret
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error('Job submission failed', jobID)
            self.__setJobParam(jobID, 'ErrorMessage',
                               '%s CE Submission Error' % (self.ceName))
            if 'ReschedulePayload' in submission:
                rescheduleFailedJob(jobID, submission['Message'])
                return S_OK()  # Without this, the job is marked as failed
            else:
                if 'Value' in submission:
                    self.log.error(
                        'Error in DIRAC JobWrapper:',
                        'exit code = %s' % (str(submission['Value'])))
            return S_ERROR('%s CE Error: %s' %
                           (self.ceName, submission['Message']))

        return ret

    #############################################################################
    def _getJDLParameters(self, jdl):
        """Returns a dictionary of JDL parameters.
    """
        try:
            parameters = {}
            #      print jdl
            if not re.search(r'\[', jdl):
                jdl = '[' + jdl + ']'
            classAdJob = ClassAd(jdl)
            paramsDict = classAdJob.contents
            for param, value in paramsDict.items():
                if value.strip().startswith('{'):
                    self.log.debug('Found list type parameter %s' % (param))
                    rawValues = value.replace('{',
                                              '').replace('}', '').replace(
                                                  '"', '').split()
                    valueList = []
                    for val in rawValues:
                        if re.search(',$', val):
                            valueList.append(val[:-1])
                        else:
                            valueList.append(val)
                    parameters[param] = valueList
                else:
                    parameters[param] = value.replace('"', '').replace(
                        '{', '"{').replace('}', '}"')
                    self.log.debug('Found standard parameter %s: %s' %
                                   (param, parameters[param]))
            return S_OK(parameters)
        except Exception as x:
            self.log.exception(lException=x)
            return S_ERROR('Exception while extracting JDL parameters for job')

    #############################################################################
    def __report(self, jobID, status, minorStatus):
        """Wraps around setJobStatus of state update client
    """
        jobStatus = JobStateUpdateClient().setJobStatus(
            int(jobID), status, minorStatus, 'JobAgent@%s' % self.siteName)
        self.log.verbose(
            'Setting job status', 'setJobStatus(%s,%s,%s,%s)' %
            (jobID, status, minorStatus, 'JobAgent@%s' % self.siteName))
        if not jobStatus['OK']:
            self.log.warn('Issue setting the job status', jobStatus['Message'])

        return jobStatus

    #############################################################################
    def __setJobParam(self, jobID, name, value):
        """Wraps around setJobParameter of state update client
    """
        jobParam = JobStateUpdateClient().setJobParameter(
            int(jobID), str(name), str(value))
        self.log.verbose('Setting job parameter',
                         'setJobParameter(%s,%s,%s)' % (jobID, name, value))
        if not jobParam['OK']:
            self.log.warn('Issue setting the job parameter',
                          jobParam['Message'])

        return jobParam

    #############################################################################
    def __finish(self, message, stop=True):
        """Force the JobAgent to complete gracefully.
    """
        if stop:
            self.log.info('JobAgent will stop',
                          'with message "%s", execution complete.' % message)
            self.am_stopExecution()
            return S_ERROR(message)
        else:
            return S_OK(message)

    #############################################################################
    def _rescheduleFailedJob(self, jobID, message, stop=True):
        """
    Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager
    """

        self.log.warn('Failure ==> rescheduling', '(during %s)' % (message))

        jobReport = JobReport(int(jobID), 'JobAgent@%s' % self.siteName)

        # Setting a job parameter does not help since the job will be rescheduled,
        # instead set the status with the cause and then another status showing the
        # reschedule operation.

        jobReport.setJobStatus(status='Rescheduled',
                               application=message,
                               sendFlag=True)

        self.log.info('Job will be rescheduled')
        result = JobManagerClient().rescheduleJob(jobID)
        if not result['OK']:
            self.log.error('Failed to reschedule job', result['Message'])
            return self.__finish('Problem Rescheduling Job', stop)

        self.log.info('Job Rescheduled', jobID)
        return self.__finish('Job Rescheduled', stop)

    #############################################################################
    def finalize(self):
        """ Job Agent finalization method
    """

        gridCE = gConfig.getValue('/LocalSite/GridCE', '')
        queue = gConfig.getValue('/LocalSite/CEQueue', '')
        result = PilotManagerClient().setPilotStatus(str(self.pilotReference),
                                                     'Done', gridCE,
                                                     'Report from JobAgent',
                                                     self.siteName, queue)
        if not result['OK']:
            self.log.warn('Issue setting the pilot status', result['Message'])

        return S_OK()
Пример #11
0
class JobAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance
    """
        # Disable monitoring
        self.am_setOption("MonitoringEnabled", False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption("MaxCycles", loops)

        ceType = self.am_getOption("CEType", "InProcess")
        localCE = gConfig.getValue("/LocalSite/LocalCE", "")
        if localCE:
            self.log.info("Defining CE from local configuration = %s" % localCE)
            ceType = localCE

        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance["OK"]:
            self.log.warn(ceInstance["Message"])
            return ceInstance

        self.initTimes = os.times()

        self.computingElement = ceInstance["Value"]
        self.diracRoot = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
        # Localsite options
        self.siteRoot = gConfig.getValue("/LocalSite/Root", os.getcwd())
        self.siteName = gConfig.getValue("/LocalSite/Site", "Unknown")
        self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", "Unknown")
        self.defaultProxyLength = gConfig.getValue("/Registry/DefaultProxyLifeTime", 86400 * 5)
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0)
        defaultWrapperLocation = "DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py"
        self.jobWrapperTemplate = os.path.join(
            self.diracRoot, self.am_getOption("JobWrapperTemplate", defaultWrapperLocation)
        )
        self.jobSubmissionDelay = self.am_getOption("SubmissionDelay", 10)
        self.defaultLogLevel = self.am_getOption("DefaultLogLevel", "info")
        self.fillingMode = self.am_getOption("FillingModeFlag", False)
        self.stopOnApplicationFailure = self.am_getOption("StopOnApplicationFailure", True)
        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        # Timeleft
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", 0.0)
        self.gridCEQueue = gConfig.getValue("/Resources/Computing/CEDefaults/GridCEQueue", "")
        self.timeLeftError = ""
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])
            else:
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error(jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                self.log.error(jobRequest["Message"])
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        jobID = matcherInfo["JobID"]
        self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if not params.has_key("JobID"):
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if not params.has_key("JobType"):
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if not params.has_key("SystemConfig"):
            self.log.warn("Job has no system configuration defined in JDL parameters")
            systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
            self.log.info(
                "Setting system config to /LocalSite/Architecture = %s since it was not specified" % systemConfig
            )
            if not systemConfig:
                self.log.warn("/LocalSite/Architecture is not defined")
            params["SystemConfig"] = systemConfig
        else:
            systemConfig = params["SystemConfig"]
            if systemConfig.lower() == "any":
                systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
                self.log.info(
                    "Setting SystemConfig = /LocalSite/Architecture =",
                    '"%s" since it was set to "ANY" in the job description' % systemConfig,
                )
                if not systemConfig:
                    self.log.warn("/LocalSite/Architecture is not defined")
                params["SystemConfig"] = systemConfig

        if not params.has_key("CPUTime"):
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        self.log.verbose("Job request successful: \n %s" % (jobRequest["Value"]))
        self.log.info("Received JobID=%s, JobType=%s, SystemConfig=%s" % (jobID, jobType, systemConfig))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter("GridCEQueue", self.gridCEQueue, sendFlag=False)

            if os.environ.has_key("BOINC_JOB_ID"):
                # Report BOINC environment
                for p in ["BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"]:
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], params, self.stopOnApplicationFailure)
            if "Value" in result and result["Value"]:
                proxyChain = result["Value"]

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job %s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, params, self.stopOnApplicationFailure)

            self.log.verbose("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    "Payload execution failed with error code %s" % submission["PayloadFailed"],
                    self.stopOnApplicationFailure,
                )

            self.log.verbose("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", params, self.stopOnApplicationFailure
            )

        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                if self.cpuFactor:
                    # if the batch system is not defined used the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()["Value"]

        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")

    #############################################################################
    def __getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
        utime, stime, cutime, cstime, elapsed = os.times()
        cpuTime = utime + stime + cutime
        self.log.info("Current raw CPU time consumed is %s" % cpuTime)
        timeleft = self.timeLeft - cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
        self.log.verbose("Log proxy change (to be instrumented)")
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job
    """
        if gConfig.getValue("/DIRAC/Security/UseServerCertificate", False):
            proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
            if not proxyResult["OK"]:
                self.log.error("Invalid Proxy", proxyResult["Message"])
                return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
            return S_OK(proxyResult["Value"])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret["OK"]:
                self.log.error("Invalid Proxy", ret["Message"])
                return S_ERROR("Invalid Proxy")

            proxyChain = ret["Value"]["chain"]
            if not "groupProperties" in ret["Value"]:
                print ret["Value"]
                print proxyChain.dumpAllToString()
                self.log.error("Invalid Proxy", "Group has no properties defined")
                return S_ERROR("Proxy has no group properties defined")

            groupProps = ret["Value"]["groupProperties"]
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
                if not proxyResult["OK"]:
                    self.log.error("Invalid Proxy", proxyResult["Message"])
                    return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
                proxyChain = proxyResult["Value"]

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info("No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup, self.defaultProxyLength, token)
        if not retVal["OK"]:
            self.log.error("Could not retrieve proxy")
            self.log.warn(retVal)
            os.system("dirac-proxy-info")
            sys.stdout.flush()
            return S_ERROR("Error retrieving proxy")

        chain = retVal["Value"]
        return S_OK(chain)

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
        if not jobParams.has_key("SoftwareDistModule"):
            msg = "Job has no software installation requirement"
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, "Matched", "Installing Software")
        softwareDist = jobParams["SoftwareDistModule"]
        self.log.verbose("Found VO Software Distribution module: %s" % (softwareDist))
        argumentsDict = {"Job": jobParams, "CE": resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance["OK"]:
            return moduleInstance

        module = moduleInstance["Value"]
        result = module.execute()
        return result

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, jobJDL, proxyChain):
        """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
        result = self.__createJobWrapper(jobID, jobParams, resourceParams, optimizerParams)

        if not result["OK"]:
            return result

        wrapperData = result["Value"]
        wrapperFile = wrapperData["execFile"]
        self.__report(jobID, "Matched", "Submitted To CE")

        wrapperName = os.path.basename(wrapperFile)
        self.log.info("Submitting %s to %sCE" % (wrapperName, self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy["OK"]:
            self.log.error(proxy)
            return S_ERROR("Payload Proxy Not Found")

        payloadProxy = proxy["Value"]
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = "dc%s" % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy, wrapperData)

        ret = S_OK("Job submitted")

        if submission["OK"]:
            batchID = submission["Value"]
            self.log.info("Job %s submitted as %s" % (jobID, batchID))
            self.log.verbose("Set JobParameter: Local batch ID %s" % (batchID))
            self.__setJobParam(jobID, "LocalBatchID", str(batchID))
            if "PayloadFailed" in submission:
                ret["PayloadFailed"] = submission["PayloadFailed"]
                return ret
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error("Job submission failed", jobID)
            self.__setJobParam(jobID, "ErrorMessage", "%s CE Submission Error" % (self.ceName))
            if "ReschedulePayload" in submission:
                rescheduleFailedJob(jobID, submission["Message"], self.__report)
            else:
                if "Value" in submission:
                    self.log.error("Error in DIRAC JobWrapper:", "exit code = %s" % (str(submission["Value"])))
                # make sure the Job is declared Failed
                self.__report(jobID, "Failed", submission["Message"])
            return S_ERROR("%s CE Submission Error: %s" % (self.ceName, submission["Message"]))

        return ret

    #############################################################################
    def __createJobWrapper(self, jobID, jobParams, resourceParams, optimizerParams):
        """This method creates a job wrapper filled with the CE and Job parameters
       to executed the job.
    """
        arguments = {"Job": jobParams, "CE": resourceParams, "Optimizer": optimizerParams}
        self.log.verbose("Job arguments are: \n %s" % (arguments))

        workingDir = gConfig.getValue("/LocalSite/WorkingDirectory", self.siteRoot)
        if not os.path.exists("%s/job/Wrapper" % (workingDir)):
            try:
                os.makedirs("%s/job/Wrapper" % (workingDir))
            except Exception:
                self.log.exception()
                return S_ERROR("Could not create directory for wrapper script")

        jobWrapperFile = "%s/job/Wrapper/Wrapper_%s" % (workingDir, jobID)
        if os.path.exists(jobWrapperFile):
            self.log.verbose("Removing existing Job Wrapper for %s" % (jobID))
            os.remove(jobWrapperFile)
        fd = open(self.jobWrapperTemplate, "r")
        wrapperTemplate = fd.read()
        fd.close()

        dateStr = time.strftime("%Y-%m-%d", time.localtime(time.time()))
        timeStr = time.strftime("%H:%M", time.localtime(time.time()))
        date_time = "%s %s" % (dateStr, timeStr)
        signature = __RCSID__
        dPython = sys.executable

        systemConfig = ""
        if jobParams.has_key("SystemConfig"):
            systemConfig = jobParams["SystemConfig"]
            self.log.verbose("Job system configuration requirement is %s" % (systemConfig))
            if resourceParams.has_key("Root"):
                jobPython = "%s/%s/bin/python" % (resourceParams["Root"], systemConfig)
                if os.path.exists(jobPython):
                    self.log.verbose("Found local python for job:\n%s" % (jobPython))
                    dPython = jobPython
                else:
                    if systemConfig == "ANY":
                        self.log.verbose("Using standard available python %s for job" % (dPython))
                    else:
                        self.log.warn("Job requested python \n%s\n but this is not available locally" % (jobPython))
            else:
                self.log.warn("No LocalSite/Root defined")
        else:
            self.log.warn("Job has no system configuration requirement")

        if not systemConfig or systemConfig.lower() == "any":
            systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
            if not systemConfig:
                self.log.warn("Could not establish SystemConfig")

        logLevel = self.defaultLogLevel
        if jobParams.has_key("LogLevel"):
            logLevel = jobParams["LogLevel"]
            self.log.info("Found Job LogLevel JDL parameter with value: %s" % (logLevel))
        else:
            self.log.info("Applying default LogLevel JDL parameter with value: %s" % (logLevel))

        realPythonPath = os.path.realpath(dPython)
        self.log.debug("Real python path after resolving links is:")
        self.log.debug(realPythonPath)
        dPython = realPythonPath

        siteRootPython = self.siteRoot
        self.log.debug("DIRACPython is:\n%s" % dPython)
        self.log.debug("SiteRootPythonDir is:\n%s" % siteRootPython)
        wrapperTemplate = wrapperTemplate.replace("@SIGNATURE@", str(signature))
        wrapperTemplate = wrapperTemplate.replace("@JOBID@", str(jobID))
        wrapperTemplate = wrapperTemplate.replace("@DATESTRING@", str(date_time))
        wrapperTemplate = wrapperTemplate.replace("@JOBARGS@", str(arguments))
        wrapperTemplate = wrapperTemplate.replace("@SITEPYTHON@", str(siteRootPython))
        wrapper = open(jobWrapperFile, "w")
        wrapper.write(wrapperTemplate)
        wrapper.close()
        jobExeFile = "%s/job/Wrapper/Job%s" % (workingDir, jobID)
        jobFileContents = """#!/bin/sh
%s %s -o LogLevel=%s -o /DIRAC/Security/UseServerCertificate=no
""" % (
            dPython,
            jobWrapperFile,
            logLevel,
        )
        jobFile = open(jobExeFile, "w")
        jobFile.write(jobFileContents)
        jobFile.close()

        data = {
            "execFile": jobExeFile,
            "wrapperTemplate": self.jobWrapperTemplate,
            "signature": signature,
            "wrapperFile": jobWrapperFile,
            "logLevel": logLevel,
            "jid": jobID,
            "datetime": date_time,
            "jobArgs": arguments,
            "sitePython": siteRootPython,
        }
        return S_OK(data)

    #############################################################################
    def __requestJob(self, ceDict):
        """Request a single job from the matcher service.
    """
        try:
            matcher = RPCClient("WorkloadManagement/Matcher", timeout=600)
            result = matcher.requestJob(ceDict)
            return result
        except Exception, x:
            self.log.exception(lException=x)
            return S_ERROR("Job request to matcher service failed with exception")
Пример #12
0
class JobAgent(AgentModule):
    """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance
    """
        # Disable monitoring
        self.am_setOption("MonitoringEnabled", False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption("MaxCycles", loops)

        ceType = self.am_getOption("CEType", "InProcess")
        localCE = gConfig.getValue("/LocalSite/LocalCE", "")
        if localCE:
            self.log.info("Defining CE from local configuration = %s" % localCE)
            ceType = localCE

        # Create backend Computing Element
        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance["OK"]:
            self.log.warn(ceInstance["Message"])
            return ceInstance
        self.computingElement = ceInstance["Value"]

        result = self.computingElement.getDescription()
        if not result["OK"]:
            self.log.warn("Can not get the CE description")
            return result
        ceDict = result["Value"]
        self.timeLeft = ceDict.get("CPUTime", 0.0)
        self.timeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", self.timeLeft)

        self.initTimes = os.times()
        # Localsite options
        self.siteName = gConfig.getValue("/LocalSite/Site", "Unknown")
        self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", "Unknown")
        self.defaultProxyLength = gConfig.getValue("/Registry/DefaultProxyLifeTime", 86400 * 5)
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0)
        self.jobSubmissionDelay = self.am_getOption("SubmissionDelay", 10)
        self.fillingMode = self.am_getOption("FillingModeFlag", False)
        self.minimumTimeLeft = self.am_getOption("MinimumTimeLeft", 1000)
        self.stopOnApplicationFailure = self.am_getOption("StopOnApplicationFailure", True)
        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        self.extraOptions = gConfig.getValue("/AgentJobRequirements/ExtraOptions", "")
        # Timeleft
        self.timeLeftUtil = TimeLeft()
        self.timeLeftError = ""
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish("No more time left")
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join(".", self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection("/LocalSite"):
                    localCfg.createNewSection("/LocalSite")
                localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error("Timeout while requesting job", jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                errorMsg = "Pilot version does not match the production version"
                self.log.error(errorMsg, jobRequest["Message"].replace(errorMsg, ""))
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        jobID = matcherInfo["JobID"]
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if "JobID" not in params:
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if "JobType" not in params:
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if "CPUTime" not in params:
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        if self.extraOptions:
            params["Arguments"] += " " + self.extraOptions
            params["ExtraOptions"] = self.extraOptions

        self.log.verbose("Job request successful: \n", jobRequest["Value"])
        self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)

            if "BOINC_JOB_ID" in os.environ:
                # Report BOINC environment
                for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure)
            proxyChain = result.get("Value")

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job", "%s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                message = "Payload execution failed with error code %s" % submission["PayloadFailed"]
                if self.stopOnApplicationFailure:
                    return self.__finish(message, self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", self.stopOnApplicationFailure
            )

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save job JDL local to JobAgent.
    """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString("LocalCE", self.ceName)
        jdlFileName = jobID + ".jdl"
        jdlFile = open(jdlFileName, "w")
        jdl = classAdJob.asJDL()
        jdlFile.write(jdl)
        jdlFile.close()

    #############################################################################
    def __getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
        cpuTime = sum(os.times()[:-1])
        self.log.info("Current raw CPU time consumed is %s" % cpuTime)
        timeleft = self.timeLeft
        if self.cpuFactor:
            timeleft -= cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
        self.log.verbose("Log proxy change (to be instrumented)")
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job
    """
        if gConfig.getValue("/DIRAC/Security/UseServerCertificate", False):
            proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
            if not proxyResult["OK"]:
                self.log.error("Failed to setup proxy", proxyResult["Message"])
                return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
            return S_OK(proxyResult["Value"])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret["OK"]:
                self.log.error("Invalid Proxy", ret["Message"])
                return S_ERROR("Invalid Proxy")

            proxyChain = ret["Value"]["chain"]
            if not "groupProperties" in ret["Value"]:
                print ret["Value"]
                print proxyChain.dumpAllToString()
                self.log.error("Invalid Proxy", "Group has no properties defined")
                return S_ERROR("Proxy has no group properties defined")

            groupProps = ret["Value"]["groupProperties"]
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
                if not proxyResult["OK"]:
                    self.log.error("Invalid Proxy", proxyResult["Message"])
                    return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
                proxyChain = proxyResult["Value"]

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info("No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup, self.defaultProxyLength, token)
        if not retVal["OK"]:
            self.log.error("Could not retrieve payload proxy", retVal["Message"])
            self.log.warn(retVal)
            os.system("dirac-proxy-info")
            sys.stdout.flush()
            return S_ERROR("Error retrieving proxy")

        chain = retVal["Value"]
        return S_OK(chain)

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
        if "SoftwareDistModule" not in jobParams:
            msg = "Job has no software installation requirement"
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, "Matched", "Installing Software")
        softwareDist = jobParams["SoftwareDistModule"]
        self.log.verbose("Found VO Software Distribution module: %s" % (softwareDist))
        argumentsDict = {"Job": jobParams, "CE": resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance["OK"]:
            return moduleInstance

        module = moduleInstance["Value"]
        return module.execute()

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, proxyChain):
        """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
        logLevel = self.am_getOption("DefaultLogLevel", "INFO")
        defaultWrapperLocation = self.am_getOption(
            "JobWrapperTemplate", "DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py"
        )
        result = createJobWrapper(
            jobID,
            jobParams,
            resourceParams,
            optimizerParams,
            extraOptions=self.extraOptions,
            defaultWrapperLocation=defaultWrapperLocation,
            log=self.log,
            logLevel=logLevel,
        )
        if not result["OK"]:
            return result

        wrapperFile = result["Value"]
        self.__report(jobID, "Matched", "Submitted To CE")

        self.log.info("Submitting %s to %sCE" % (os.path.basename(wrapperFile), self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy["OK"]:
            self.log.error("Invalid proxy", proxy)
            return S_ERROR("Payload Proxy Not Found")

        payloadProxy = proxy["Value"]
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = "dc%s" % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy)

        ret = S_OK("Job submitted")

        if submission["OK"]:
            batchID = submission["Value"]
            self.log.info("Job %s submitted as %s" % (jobID, batchID))
            self.log.verbose("Set JobParameter: Local batch ID %s" % (batchID))
            self.__setJobParam(jobID, "LocalBatchID", str(batchID))
            if "PayloadFailed" in submission:
                ret["PayloadFailed"] = submission["PayloadFailed"]
                return ret
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error("Job submission failed", jobID)
            self.__setJobParam(jobID, "ErrorMessage", "%s CE Submission Error" % (self.ceName))
            if "ReschedulePayload" in submission:
                rescheduleFailedJob(jobID, submission["Message"])
                return S_OK()  # Without this job is marked as failed at line 265 above
            else:
                if "Value" in submission:
                    self.log.error("Error in DIRAC JobWrapper:", "exit code = %s" % (str(submission["Value"])))
                # make sure the Job is declared Failed
                self.__report(jobID, "Failed", submission["Message"])
            return S_ERROR("%s CE Submission Error: %s" % (self.ceName, submission["Message"]))

        return ret

    #############################################################################
    def __requestJob(self, ceDict):
        """Request a single job from the matcher service.
    """
        try:
            matcher = RPCClient("WorkloadManagement/Matcher", timeout=600)
            result = matcher.requestJob(ceDict)
            return result
        except Exception as x:
            self.log.exception(lException=x)
            return S_ERROR("Job request to matcher service failed with exception")

    #############################################################################
    def __getJDLParameters(self, jdl):
        """Returns a dictionary of JDL parameters.
    """
        try:
            parameters = {}
            #      print jdl
            if not re.search("\[", jdl):
                jdl = "[" + jdl + "]"
            classAdJob = ClassAd(jdl)
            paramsDict = classAdJob.contents
            for param, value in paramsDict.items():
                if value.strip().startswith("{"):
                    self.log.debug("Found list type parameter %s" % (param))
                    rawValues = value.replace("{", "").replace("}", "").replace('"', "").split()
                    valueList = []
                    for val in rawValues:
                        if re.search(",$", val):
                            valueList.append(val[:-1])
                        else:
                            valueList.append(val)
                    parameters[param] = valueList
                else:
                    parameters[param] = value.replace('"', "").replace("{", '"{').replace("}", '}"')
                    self.log.debug("Found standard parameter %s: %s" % (param, parameters[param]))
            return S_OK(parameters)
        except Exception, x:
            self.log.exception(lException=x)
            return S_ERROR("Exception while extracting JDL parameters for job")