示例#1
0
class TaskCallbacksTests(unittest.TestCase):
    """
    .. class:: TaskCallbacksTests

    Test case for ProcessPool: queues a callable class and a plain function
    as tasks, each with per-task result and exception callbacks attached.
    """

    def setUp(self):
        """Enable log headers, create the sub-logger and set up a ProcessPool(4, 8, 8)."""
        gLogger.showHeaders(True)
        self.log = gLogger.getSubLogger(self.__class__.__name__)
        self.processPool = ProcessPool(4, 8, 8)
        self.processPool.daemonize()

    def _enqueueTasks(self, taskCallable, label, nTasks=10):
        """Queue ``nTasks`` tasks running ``taskCallable``, then finalize the pool.

        :param taskCallable: callable (class or function) handed to createAndQueueTask
        :param str label: name of the callable, used in the log message
        :param int nTasks: number of tasks that must be queued successfully
        """
        taskID = 0
        # The loop spins (no sleep) while the pool reports no free slot and
        # retries the same taskID when queueing fails.
        while taskID < nTasks:
            if self.processPool.getFreeSlots() > 0:
                timeWait = random.randint(0, 5)
                # a zero wait time doubles as the "raise an exception" flag
                raiseException = not timeWait
                result = self.processPool.createAndQueueTask(
                    taskCallable,
                    taskID=taskID,
                    args=(taskID, timeWait, raiseException),
                    callback=ResultCallback,
                    exceptionCallback=ExceptionCallback,
                    blocking=True)
                if result["OK"]:
                    self.log.always("%s enqueued to task %s" % (label, taskID))
                    taskID += 1
        self.processPool.finalize(2)

    def testCallableClass(self):
        """ CallableClass and task callbacks test """
        self._enqueueTasks(CallableClass, "CallableClass")

    def testCallableFunc(self):
        """ CallableFunc and task callbacks test """
        # the log line now names CallableFunc instead of the copy-pasted CallableClass
        self._enqueueTasks(CallableFunc, "CallableFunc")
示例#2
0
 def processPool(self):
     """Return the shared ProcessPool, building it on the first call.

     On first use the pool is created with sizes derived from the private
     ``__minProcess``, ``__maxProcess`` and ``__queueSize`` attributes, with
     ``self.resultCallback`` / ``self.exceptionCallback`` attached as
     pool-level callbacks, and ``daemonize()`` is called on it.

     :param self: self reference
     :return: new ProcessPool instance on first call, the same instance on
              subsequent calls
     """
     if not self.__processPool:
         minProcess = max(1, self.__minProcess)
         # Clamp against the already-clamped minimum: using the raw
         # __minProcess could give maxProcess < minProcess (e.g. 0 and 0).
         maxProcess = max(minProcess, self.__maxProcess)
         queueSize = abs(self.__queueSize)
         self.log.info(
             "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" %
             (minProcess, maxProcess, queueSize))
         self.log.info(
             "ProcessPool: tasks will use callbacks attached to ProcessPool"
         )
         self.__processPool = ProcessPool(
             minProcess,
             maxProcess,
             queueSize,
             poolCallback=self.resultCallback,
             poolExceptionCallback=self.exceptionCallback)
         self.__processPool.daemonize()
         self.log.info("ProcessPool: daemonized and ready")
     return self.__processPool
示例#3
0
  def getCEStatus(self):
    """ Report job counts and processor usage of this computing element.

    The process pool is created on demand and processResults() is called on it
    before anything is counted.

    :return: S_OK structure extended with SubmittedJobs, RunningJobs, WaitingJobs,
             UsedProcessors and AvailableProcessors
    """
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)
    self.pPool.processResults()

    status = S_OK()

    # a task is counted as a job while it has a positive processor count
    activeJobs = sum(1 for _taskID, nProcs in self.processorsPerTask.iteritems() if nProcs > 0)
    status['SubmittedJobs'] = activeJobs
    status['RunningJobs'] = activeJobs
    status['WaitingJobs'] = 0

    usedProcessors = self.getProcessorsInUse()
    status['UsedProcessors'] = usedProcessors
    status['AvailableProcessors'] = self.processors - usedProcessors
    return status
class TaskCallbacksTests(unittest.TestCase):
  """
  .. class:: TaskCallbacksTests

  Test case for ProcessPool: queues a callable class and a plain function
  as tasks, each with per-task result and exception callbacks attached.
  """

  def setUp( self ):
    """ Enable log headers, create the sub-logger and set up a ProcessPool( 4, 8, 8 ). """
    gLogger.showHeaders( True )
    self.log = gLogger.getSubLogger( self.__class__.__name__ )
    self.processPool = ProcessPool( 4, 8, 8 )
    self.processPool.daemonize()

  def _enqueueTasks( self, taskCallable, label, nTasks = 10 ):
    """ Queue :nTasks: tasks running :taskCallable:, then finalize the pool.

    :param taskCallable: callable (class or function) handed to createAndQueueTask
    :param str label: name of the callable, used in the log message
    :param int nTasks: number of tasks that must be queued successfully
    """
    taskID = 0
    # The loop spins (no sleep) while the pool reports no free slot and
    # retries the same taskID when queueing fails.
    while taskID < nTasks:
      if self.processPool.getFreeSlots() > 0:
        timeWait = random.randint( 0, 5 )
        # a zero wait time doubles as the "raise an exception" flag
        raiseException = not timeWait
        result = self.processPool.createAndQueueTask( taskCallable,
                                                      taskID = taskID,
                                                      args = ( taskID, timeWait, raiseException ),
                                                      callback = ResultCallback,
                                                      exceptionCallback = ExceptionCallback,
                                                      blocking = True )
        if result["OK"]:
          self.log.always( "%s enqueued to task %s" % ( label, taskID ) )
          taskID += 1
    self.processPool.finalize( 2 )

  def testCallableClass( self ):
    """ CallableClass and task callbacks test """
    self._enqueueTasks( CallableClass, "CallableClass" )

  def testCallableFunc( self ):
    """ CallableFunc and task callbacks test """
    # the log line now names CallableFunc instead of the copy-pasted CallableClass
    self._enqueueTasks( CallableFunc, "CallableFunc" )
示例#5
0
  def submitJob(self, executableFile, proxy, **kwargs):
    """ Queue a job in the process pool.

    Before queueing, the number of processors granted to the job is written to
    pilot.cfg under /Resources/Computing/JobLimits/<jobID>/NumberOfProcessors.

    :param str executableFile: location of the executable file
    :param str proxy: payload proxy
    :param kwargs: handed to _getProcessorsForJobs(); the 'jobDesc' entry (a dict
                   with an optional 'jobID') is read here as well

    :return: S_ERROR when no processors can be granted, otherwise the result of
             createAndQueueTask()
    """
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)
    self.pPool.processResults()

    processorsForJob = self._getProcessorsForJobs(kwargs)
    if not processorsForJob:
      return S_ERROR('Not enough processors for the job')

    # Persist the job limits in pilot.cfg for later use (pilot 3 default).
    # Failures to load or dump the file are logged but do not stop the submission.
    cfgData = ConfigurationData(loadDefaultCFG=False)
    loaded = cfgData.loadFile('pilot.cfg')
    if not loaded['OK']:
      self.log.error("Could not load pilot.cfg", loaded['Message'])
    # only NumberOfProcessors for now, RAM (or other limits) could be added too
    jobID = int(kwargs.get('jobDesc', {}).get('jobID', 0))
    optionPath = '/Resources/Computing/JobLimits/%d/NumberOfProcessors' % jobID
    cfgData.setOptionInCFG(optionPath, processorsForJob)
    dumped = cfgData.dumpLocalCFGToFile('pilot.cfg')
    if not dumped['OK']:
      self.log.error("Could not dump cfg to pilot.cfg", dumped['Message'])

    proxyInfo = getProxyInfo()
    pilotProxy = proxyInfo['Value']['path'] if proxyInfo['OK'] else None
    self.log.notice('Pilot Proxy:', pilotProxy)

    taskKwargs = {'UseSudo': False}
    if self.useSudo:
      # pick the first user number not assigned to a task; when every number is
      # taken the loop falls through and the last number is used
      for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
        if nUser not in self.userNumberPerTask.values():
          break
      taskKwargs.update({'NUser': nUser,
                         'PayloadUser': os.environ['USER'] + 'p%s' % str(nUser).zfill(2),
                         'UseSudo': True})

    queued = self.pPool.createAndQueueTask(executeJob,
                                           args=(executableFile, proxy, self.taskID),
                                           kwargs=taskKwargs,
                                           taskID=self.taskID,
                                           usePoolCallbacks=True)
    # the processors are booked and the task counter advanced whatever `queued` says
    self.processorsPerTask[self.taskID] = processorsForJob
    self.taskID += 1

    self.pPool.processResults()

    return queued
示例#6
0
 def setUp(self):
     """Build the fixture: parse the command line, set up logging, create the pool.

     The imports are done here, with Script.parseCommandLine() called before
     gLogger is imported.
     """
     from DIRAC.Core.Base import Script
     Script.parseCommandLine()
     from DIRAC.FrameworkSystem.Client.Logger import gLogger
     gLogger.showHeaders(True)
     loggerName = self.__class__.__name__
     self.log = gLogger.getSubLogger(loggerName)
     self.processPool = pool = ProcessPool(4, 8, 8)
     pool.daemonize()
示例#7
0
  def setUp( self ):
    """ Build the test fixture.

    Enables log headers, creates a sub-logger named after the test class and
    sets up a ProcessPool( 4, 8, 8 ) with this test case's poolCallback and
    poolExceptionCallback attached, then calls daemonize() on it.

    :param self: self reference
    """
    gLogger.showHeaders( True )
    loggerName = self.__class__.__name__
    self.log = gLogger.getSubLogger( loggerName )
    poolCallbacks = { "poolCallback" : self.poolCallback,
                      "poolExceptionCallback" : self.poolExceptionCallback }
    self.processPool = pool = ProcessPool( 4, 8, 8, **poolCallbacks )
    pool.daemonize()
示例#8
0
def processPoolWithCallbacks2():
    """Yield a ProcessPool(2, 4, 8) whose pool-level callbacks log every task outcome.

    Written as a generator with a single yield and no teardown code;
    presumably used as a pytest fixture (the decorator is not visible here).
    """
    gLogger.showHeaders(True)
    log = gLogger.getSubLogger("TaskTimeOutTests")

    def logResult(taskID, taskResult):
        """Log the result delivered for a task."""
        return log.always("callback result for %s is %s" % (taskID, taskResult))

    def logException(taskID, taskException):
        """Log the exception delivered for a task."""
        return log.always("callback exception for %s is %s" % (taskID, taskException))

    pool = ProcessPool(2, 4, 8,
                       poolCallback=logResult,
                       poolExceptionCallback=logException)
    pool.daemonize()
    yield pool
示例#9
0
 def __init__(self, ceUniqueID, cores=0):
     """ Standard constructor.

     :param ceUniqueID: identifier handed on to the ComputingElement constructor
     :param int cores: number of cores to use; when not positive,
                       getNumberOfCores() decides
     """
     ComputingElement.__init__(self, ceUniqueID)
     self.ceType = "Pool"
     self.submittedJobs = 0
     self.taskID = 0
     self.coresPerTask = {}
     self.cores = cores if cores > 0 else getNumberOfCores()
     # minimum and maximum pool size are both set to the number of cores
     poolSize = self.cores
     self.pPool = ProcessPool(poolSize, poolSize, poolCallback=self.finalizeJob)
示例#10
0
 def processPool( self ):
   """ facade for ProcessPool

   Creates the pool on the first call (sizes taken from the private
   __minProcess, __maxProcess and __queueSize attributes, resultCallback and
   exceptionCallback attached as pool callbacks) and calls daemonize() on it.

   :return: the ProcessPool instance, the same one on every call
   """
   if not self.__processPool:
     minProcess = max( 1, self.__minProcess )
     # Clamp against the already-clamped minimum: using the raw __minProcess
     # could give maxProcess < minProcess (e.g. 0 and 0).
     maxProcess = max( minProcess, self.__maxProcess )
     queueSize = abs( self.__queueSize )
     self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                      maxProcess,
                                                                                      queueSize ) )
     self.__processPool = ProcessPool( minProcess,
                                       maxProcess,
                                       queueSize,
                                       poolCallback = self.resultCallback,
                                       poolExceptionCallback = self.exceptionCallback )
     self.__processPool.daemonize()
   return self.__processPool
示例#11
0
    def setUp(self):
        """Build the test fixture.

        Parses the command line, enables log headers, creates a sub-logger
        named after the test class and sets up a ProcessPool(2, 4, 8) with
        this test case's poolCallback and poolExceptionCallback attached.

        :param self: self reference
        """
        from DIRAC.Core.Base import Script
        Script.parseCommandLine()
        from DIRAC.FrameworkSystem.Client.Logger import gLogger
        gLogger.showHeaders(True)
        loggerName = self.__class__.__name__
        self.log = gLogger.getSubLogger(loggerName)
        poolCallbacks = {
            "poolCallback": self.poolCallback,
            "poolExceptionCallback": self.poolExceptionCallback,
        }
        self.processPool = pool = ProcessPool(2, 4, 8, **poolCallbacks)
        pool.daemonize()
示例#12
0
 def setUp( self ):
   """ Build the fixture: parse the command line, set up logging, create the pool.

   The imports are done here, with Script.parseCommandLine() called before
   gLogger is imported.
   """
   from DIRAC.Core.Base import Script
   Script.parseCommandLine()
   from DIRAC.FrameworkSystem.Client.Logger import gLogger
   gLogger.showHeaders( True )
   loggerName = self.__class__.__name__
   self.log = gLogger.getSubLogger( loggerName )
   self.processPool = pool = ProcessPool( 4, 8, 8 )
   pool.daemonize()
示例#13
0
 def __init__(self, ceUniqueID, processors=0):
     """ Standard constructor.

     :param ceUniqueID: identifier handed on to the ComputingElement constructor
     :param int processors: number of processors to use; when not positive,
                            multiprocessing.cpu_count() decides
     """
     ComputingElement.__init__(self, ceUniqueID)
     self.ceType = "Pool"
     self.log = gLogger.getSubLogger('Pool')
     self.submittedJobs = 0
     self.taskID = 0
     # per-task bookkeeping, both start empty
     self.processorsPerTask = {}
     self.userNumberPerTask = {}
     self.useSudo = False
     self.processors = processors if processors > 0 else multiprocessing.cpu_count()
     # minimum and maximum pool size are both set to the number of processors
     poolSize = self.processors
     self.pPool = ProcessPool(minSize=poolSize,
                              maxSize=poolSize,
                              poolCallback=self.finalizeJob)
示例#14
0
  def submitJob(self, executableFile, proxy, **kwargs):
    """ Queue a job in the process pool if enough processors are free.

    :param str executableFile: location of the executable file
    :param str proxy: payload proxy
    :param kwargs: 'wholeNode' requests all processors, 'numberOfProcessors'
                   requests that many; without either, one processor is requested

    :return: S_ERROR when the request does not fit, otherwise the result of
             createAndQueueTask()
    """
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)
    self.pPool.processResults()

    processorsInUse = self.getProcessorsInUse()
    freeProcessors = self.processors - processorsInUse

    if kwargs.get('wholeNode'):
      # a whole-node job is only accepted when nothing else is running
      if processorsInUse > 0:
        return S_ERROR('Can not take WholeNode job')
      requestedProcessors = self.processors
    elif "numberOfProcessors" in kwargs:
      requestedProcessors = int(kwargs['numberOfProcessors'])
    else:
      requestedProcessors = 1

    # single capacity check covering all three ways of requesting processors
    if freeProcessors < requestedProcessors:
      return S_ERROR('Not enough slots: requested %d, available %d' % (requestedProcessors,
                                                                       freeProcessors))

    proxyInfo = getProxyInfo()
    pilotProxy = proxyInfo['Value']['path'] if proxyInfo['OK'] else None
    self.log.notice('Pilot Proxy:', pilotProxy)

    taskKwargs = {'UseSudo': False}
    if self.useSudo:
      # pick the first user number not assigned to a task; when every number is
      # taken the loop falls through and the last number is used
      for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
        if nUser not in self.userNumberPerTask.values():
          break
      taskKwargs.update({'NUser': nUser,
                         'PayloadUser': os.environ['USER'] + 'p%s' % str(nUser).zfill(2),
                         'UseSudo': True})

    queued = self.pPool.createAndQueueTask(executeJob,
                                           args=(executableFile, proxy, self.taskID),
                                           kwargs=taskKwargs,
                                           taskID=self.taskID,
                                           usePoolCallbacks=True)
    # the processors are booked and the task counter advanced whatever `queued` says
    self.processorsPerTask[self.taskID] = requestedProcessors
    self.taskID += 1

    self.pPool.processResults()

    return queued
示例#15
0
  def setUp( self ):
    """ Build the test fixture.

    Enables log headers, creates a sub-logger named after the test class and
    sets up a ProcessPool( 4, 8, 8 ) with this test case's poolCallback and
    poolExceptionCallback attached, then calls daemonize() on it.

    :param self: self reference
    """
    gLogger.showHeaders( True )
    loggerName = self.__class__.__name__
    self.log = gLogger.getSubLogger( loggerName )
    poolCallbacks = { "poolCallback" : self.poolCallback,
                      "poolExceptionCallback" : self.poolExceptionCallback }
    self.processPool = pool = ProcessPool( 4, 8, 8, **poolCallbacks )
    pool.daemonize()
示例#16
0
 def __init__( self, ceUniqueID, cores = 0 ):
   """ Standard constructor.

   :param ceUniqueID: identifier handed on to the ComputingElement constructor
   :param int cores: number of cores to use; when not positive,
                     getNumberOfCores() decides
   """
   ComputingElement.__init__( self, ceUniqueID )
   self.ceType = "Pool"
   self.submittedJobs = 0
   self.taskID = 0
   self.coresPerTask = {}
   self.cores = cores if cores > 0 else getNumberOfCores()
   # minimum and maximum pool size are both set to the number of cores
   poolSize = self.cores
   self.pPool = ProcessPool( poolSize, poolSize, poolCallback = self.finalizeJob )
示例#17
0
  def setUp( self ):
    """ Build the test fixture.

    Parses the command line, enables log headers, creates a sub-logger named
    after the test class and sets up a ProcessPool( 4, 8, 8 ) with this test
    case's poolCallback and poolExceptionCallback attached.

    :param self: self reference
    """
    from DIRAC.Core.Base import Script
    Script.parseCommandLine()
    from DIRAC.FrameworkSystem.Client.Logger import gLogger
    gLogger.showHeaders( True )
    loggerName = self.__class__.__name__
    self.log = gLogger.getSubLogger( loggerName )
    poolCallbacks = { "poolCallback" : self.poolCallback,
                      "poolExceptionCallback" : self.poolExceptionCallback }
    self.processPool = pool = ProcessPool( 4, 8, 8, **poolCallbacks )
    pool.daemonize()
示例#18
0
    def getCEStatus(self, jobIDList=None):
        """ Report running-job count and processor usage of this computing element.

        The process pool is created on demand and processResults() is called on
        it before anything is counted.

        :param jobIDList: accepted but not used by this implementation
        :return: S_OK structure extended with SubmittedJobs (always 0), RunningJobs,
                 WaitingJobs (always 0), UsedProcessors and AvailableProcessors
        """
        if self.pPool is None:
            self.pPool = ProcessPool(minSize=self.processors,
                                     maxSize=self.processors,
                                     poolCallback=self.finalizeJob)
        self.pPool.processResults()

        status = S_OK()
        status['SubmittedJobs'] = 0
        # a task is counted as running while it has a positive processor count
        status['RunningJobs'] = sum(
            1 for _taskID, nProcs in self.processorsPerTask.iteritems() if nProcs > 0)
        status['WaitingJobs'] = 0

        usedProcessors = self.getProcessorsInUse()
        status['UsedProcessors'] = usedProcessors
        status['AvailableProcessors'] = self.processors - usedProcessors
        return status
示例#19
0
 def processPool( self ):
   """ facade for ProcessPool

   Creates the pool on the first call (sizes taken from the private
   __minProcess, __maxProcess and __queueSize attributes, resultCallback and
   exceptionCallback attached as pool callbacks) and calls daemonize() on it.

   :return: the ProcessPool instance, the same one on every call
   """
   if not self.__processPool:
     minProcess = max( 1, self.__minProcess )
     # Clamp against the already-clamped minimum: using the raw __minProcess
     # could give maxProcess < minProcess (e.g. 0 and 0).
     maxProcess = max( minProcess, self.__maxProcess )
     queueSize = abs( self.__queueSize )
     self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                      maxProcess,
                                                                                      queueSize ) )
     self.__processPool = ProcessPool( minProcess,
                                       maxProcess,
                                       queueSize,
                                       poolCallback = self.resultCallback,
                                       poolExceptionCallback = self.exceptionCallback )
     self.__processPool.daemonize()
   return self.__processPool
示例#20
0
def runTest():
    """Run the configured test function in ``nClients`` pool tasks and print statistics.

    Reads the module globals ``nClients``, ``nQueries``, ``testType``,
    ``testDir`` and ``lfnListFile``; resets the module global ``resultTest``,
    which is expected to be filled with ``(testTime, success, failure)``
    entries while the tasks run (presumably by the ``finalize`` callback -
    confirm against its definition).

    :return: ((averageTime, errorTime), (averageRate, errorRate))
    """
    # only resultTest is rebound here; the other module globals are just read
    global resultTest

    resultTest = []

    pp = ProcessPool(nClients)

    # NOTE(review): eval() of a module-level string. This is only safe while
    # testType never comes from untrusted input; a name -> function dict would
    # be a safer lookup.
    testFunction = eval(testType)

    for _client in range(nClients):
        pp.createAndQueueTask(testFunction, [nQueries],
                              callback=finalize,
                              exceptionCallback=doException)

    pp.processAllResults(3600)
    pp.finalize(0)

    # each entry contributes a sequence of timings; the success/failure
    # parts are not used for the statistics
    timeResult = []
    for testTime, _success, _failure in resultTest:
        timeResult.extend(testTime)

    averageTime, errorTime = doStats(timeResult)
    rateResult = [nClients / t for t in timeResult]
    averageRate, errorRate = doStats(rateResult)

    # single-argument print(...) behaves the same as the print statement on
    # Python 2 and also parses on Python 3
    if testDir:
        print("\nTest results for clients %d, %s" % (nClients, testDir))
    else:
        print("\nTest results for clients %d, %s" % (nClients, lfnListFile))

    print("Query time: %.2f +/- %.2f" % (averageTime, errorTime))
    print("Query rate: %.2f +/- %.2f" % (averageRate, errorRate))

    return ((averageTime, errorTime), (averageRate, errorRate))
示例#21
0
 def processPool( self ):
   """ Return the shared ProcessPool, building it on the first call.

   On first use the pool is created with sizes derived from the private
   __minProcess, __maxProcess and __queueSize attributes, with resultCallback
   and exceptionCallback attached as pool-level callbacks, and daemonize() is
   called on it.

   :param self: self reference
   :return: new ProcessPool instance on first call, the same instance
            on subsequent calls
   """
   if not self.__processPool:
     minProcess = max( 1, self.__minProcess )
     # Clamp against the already-clamped minimum: using the raw __minProcess
     # could give maxProcess < minProcess (e.g. 0 and 0).
     maxProcess = max( minProcess, self.__maxProcess )
     queueSize = abs( self.__queueSize )
     self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                      maxProcess,
                                                                                      queueSize ) )
     self.log.info( "ProcessPool: tasks will use callbacks attached to ProcessPool" )
     self.__processPool = ProcessPool( minProcess,
                                       maxProcess,
                                       queueSize,
                                       poolCallback = self.resultCallback,
                                       poolExceptionCallback = self.exceptionCallback )
     self.__processPool.daemonize()
     self.log.info( "ProcessPool: daemonized and ready" )
   return self.__processPool
示例#22
0
  def getCEStatus(self, jobIDList=None):
    """ Report running-job count and processor usage of this computing element.

    The process pool is created on demand and processResults() is called on it
    before anything is counted.

    :param jobIDList: accepted but not used by this implementation
    :return: S_OK structure extended with SubmittedJobs (always 0), RunningJobs,
             WaitingJobs (always 0), UsedProcessors and AvailableProcessors
    """
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)
    self.pPool.processResults()

    status = S_OK()
    status['SubmittedJobs'] = 0
    # a task is counted as running while it has a positive processor count
    status['RunningJobs'] = sum(
        1 for _taskID, nProcs in self.processorsPerTask.iteritems() if nProcs > 0)
    status['WaitingJobs'] = 0

    usedProcessors = self.getProcessorsInUse()
    status['UsedProcessors'] = usedProcessors
    status['AvailableProcessors'] = self.processors - usedProcessors
    return status
示例#23
0
def runTest():
  """ Run the configured test function in nClients pool tasks and print statistics.

  Reads the module globals nClients, nQueries, testType, testDir and
  lfnListFile; resets the module global resultTest, which is expected to be
  filled with (testTime, success, failure) entries while the tasks run
  (presumably by the finalize callback - confirm against its definition).

  :return: ((averageTime, errorTime), (averageRate, errorRate))
  """
  # only resultTest is rebound here; the other module globals are just read
  global resultTest

  resultTest = []

  pp = ProcessPool(nClients)

  # NOTE(review): eval() of a module-level string. This is only safe while
  # testType never comes from untrusted input; a name -> function dict would
  # be a safer lookup.
  testFunction = eval(testType)

  for _client in range(nClients):
    pp.createAndQueueTask(testFunction, [nQueries],
                          callback=finalize,
                          exceptionCallback=doException)

  pp.processAllResults(3600)
  pp.finalize(0)

  # each entry contributes a sequence of timings; the success/failure
  # parts are not used for the statistics
  timeResult = []
  for testTime, _success, _failure in resultTest:
    timeResult.extend(testTime)

  averageTime, errorTime = doStats(timeResult)
  rateResult = [nClients / t for t in timeResult]
  averageRate, errorRate = doStats(rateResult)

  # single-argument print(...) behaves the same as the print statement on
  # Python 2 and also parses on Python 3
  if testDir:
    print("\nTest results for clients %d, %s" % (nClients, testDir))
  else:
    print("\nTest results for clients %d, %s" % (nClients, lfnListFile))

  print("Query time: %.2f +/- %.2f" % (averageTime, errorTime))
  print("Query rate: %.2f +/- %.2f" % (averageRate, errorRate))

  return ((averageTime, errorTime), (averageRate, errorRate))
示例#24
0
class RequestExecutingAgent( AgentModule ):
  """
  .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation handlers and RequestTask
  """
  # # process pool
  __processPool = None
  # # request cache
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None
  # # Size of the bulk if use of getRequests. If 0, use getRequest
  __bulkRequest = 0

  def __init__( self, *args, **kwargs ):
    """ c'tor

    Reads the agent options (falling back to the class-level defaults), loads
    the operation handlers and their timeouts from the OperationHandlers
    section of the agent configuration and registers the monitoring activities.

    :raises AgentConfigError: when the OperationHandlers section cannot be read
    """
    # # call base class ctor
    AgentModule.__init__( self, *args, **kwargs )
    # # ProcessPool related stuff, every option defaults to its class-level value
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", self.__requestsPerCycle )
    self.log.info( "Requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", self.__minProcess )
    self.log.info( "ProcessPool min process = %d" % self.__minProcess )
    self.__maxProcess = self.am_getOption( "MaxProcess", self.__maxProcess )
    self.log.info( "ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", self.__queueSize )
    self.log.info( "ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", self.__poolTimeout ) )
    self.log.info( "ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__poolSleep = int( self.am_getOption( "ProcessPoolSleep", self.__poolSleep ) )
    self.log.info( "ProcessPool sleep time = %d seconds" % self.__poolSleep )
    self.__bulkRequest = self.am_getOption( "BulkRequest", self.__bulkRequest )
    self.log.info( "Bulk request size = %d" % self.__bulkRequest )

    # # keep config path and agent name
    self.agentName = self.am_getModuleParam( "fullName" )
    self.__configPath = PathFinder.getAgentSection( self.agentName )

    # # operation handlers over here
    opHandlersPath = "%s/%s" % ( self.__configPath, "OperationHandlers" )
    opHandlers = gConfig.getSections( opHandlersPath )
    if not opHandlers["OK"]:
      self.log.error( opHandlers["Message"] )
      raise AgentConfigError( "OperationHandlers section not found in CS under %s" % self.__configPath )
    opHandlers = opHandlers["Value"]

    # # per-handler timeouts and handler locations, keyed by handler name
    self.timeOuts = dict()
    self.handlersDict = dict()
    for opHandler in opHandlers:
      opHandlerPath = "%s/%s/Location" % ( opHandlersPath, opHandler )
      opLocation = gConfig.getValue( opHandlerPath, "" )
      if not opLocation:
        # a handler without Location is reported and left out
        self.log.error( "%s not set for %s operation handler" % ( opHandlerPath, opHandler ) )
        continue

      # a missing or zero timeout in the configuration keeps the class default
      opTimeout = gConfig.getValue( "%s/%s/TimeOut" % ( opHandlersPath, opHandler ), 0 )
      fileTimeout = gConfig.getValue( "%s/%s/TimeOutPerFile" % ( opHandlersPath, opHandler ), 0 )
      self.timeOuts[opHandler] = { "PerFile": fileTimeout or self.__fileTimeout,
                                   "PerOperation": opTimeout or self.__operationTimeout }

      self.handlersDict[opHandler] = opLocation

    self.log.info( "Operation handlers:" )
    for index, ( opHandler, opLocation ) in enumerate( self.handlersDict.items() ):
      self.log.info( "[%s] %s: %s (timeout: %d s + %d s per file)" % ( index, opHandler, opLocation,
                                                                       self.timeOuts[opHandler]['PerOperation'],
                                                                       self.timeOuts[opHandler]['PerFile'] ) )

    # # common monitor activity
    gMonitor.registerActivity( "Iteration", "Agent Loops",
                               "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Processed", "Request Processed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    # # create request dict (per instance, instead of the shared class-level dict)
    self.__requestCache = dict()

    # ?? Probably should be removed
    self.FTSMode = self.am_getOption( "FTSMode", False )



  def processPool( self ):
    """ facade for ProcessPool

    Creates the pool on the first call (sizes taken from __minProcess,
    __maxProcess and __queueSize, resultCallback and exceptionCallback
    attached as pool callbacks) and calls daemonize() on it.

    :return: the ProcessPool instance, the same one on every call
    """
    if not self.__processPool:
      minProcess = max( 1, self.__minProcess )
      # Clamp against the already-clamped minimum: using the raw __minProcess
      # could give maxProcess < minProcess (e.g. 0 and 0).
      maxProcess = max( minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                       maxProcess,
                                                                                       queueSize ) )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient( self ):
    """ RequestClient getter

    :return: the cached ReqClient instance, created on the first call
    """
    client = self.__requestClient
    if not client:
      client = self.__requestClient = ReqClient()
    return client

  def cacheRequest( self, request ):
    """ store :request: in the request cache, keyed by its RequestID

    :param ~Request.Request request: Request instance
    :return: S_OK() when stored, S_ERROR( errno.EALREADY, ... ) when a request
             with the same RequestID is already cached
    """
    cache = self.__requestCache
    cacheLimit = max( self.__minProcess, self.__maxProcess ) + 50
    if len( cache ) > cacheLimit:
      # Only a warning for the time being: with a well-behaved ProcessPool the
      # limit should not be hit, and the acceptable size depends on many
      # factors, so the request is still cached (no S_ERROR is returned).
      self.log.warn( "Too many requests in cache", ': %d' % len( cache ) )

    requestID = request.RequestID
    if requestID in cache:
      # The request came back while it is still being executed (it lasted longer
      # than the kick time of CleanReqAgent): keep the cached one and do not
      # call putRequest.
      self.log.warn( "Duplicate request, keep it but don't execute", ': %d/%s' % ( requestID, request.RequestName ) )
      return S_ERROR( errno.EALREADY, 'Request already in cache' )

    cache[requestID] = request
    return S_OK()

  def putRequest( self, requestID, taskResult = None ):
    """ remove :requestID: from the cache and put the request back to RequestClient

    :param str requestID: request's id
    :param dict taskResult: optional S_OK/S_ERROR structure of the task; on
                            success its 'Value' replaces the cached request
    :return: S_OK() on success (or when the request is already 'Done'),
             S_ERROR when the request is not cached or cannot be put back
    """
    if requestID not in self.__requestCache:
      return S_ERROR( 'Not in cache' )

    request = self.__requestCache.pop( requestID )
    if taskResult:
      if taskResult['OK']:
        request = taskResult['Value']
        # The RequestTask is putting back the Done tasks, no need to redo it
        if request.Status == 'Done':
          return S_OK()
      elif cmpError( taskResult, errno.ETIME ):
        # In case of timeout the attempts have to be incremented here
        for rmsFile in request.getWaiting().get( 'Value', [] ):
          rmsFile.Attempt += 1

    reset = self.requestClient().putRequest( request, useFailoverProxy = False, retryMainService = 2 )
    if reset["OK"]:
      return S_OK()
    return S_ERROR( "putRequest: unable to reset request %s: %s" % ( requestID, reset["Message"] ) )

  def putAllRequests( self ):
    """ put every request still held in the cache back through putRequest

    Failures are logged and do not stop the loop.

    :param self: self reference
    :return: S_OK(), always
    """
    self.log.info( "putAllRequests: will put %s back requests" % len( self.__requestCache ) )
    # iterate over a snapshot of the keys: putRequest pops entries from the cache
    for requestID in list( self.__requestCache ):
      reset = self.putRequest( requestID )
      if reset["OK"]:
        self.log.debug( "putAllRequests: request %s has been put back with its initial state" % requestID )
      else:
        self.log.error( 'Failed to put request', reset["Message"] )
    return S_OK()

  def initialize( self ):
    """ initialize agent

    Nothing is set up here.

    :return: S_OK()
    """
    return S_OK()

  def execute( self ):
    """ read requests from RequestClient and enqueue them into ProcessPool

    One agent cycle: requests are fetched (one at a time, or in bulk when
    ``__bulkRequest`` is non-zero) until ``__requestsPerCycle`` tasks have been
    enqueued or the client returns an error or nothing. Each request is cached,
    serialized to JSON and submitted to the process pool as a RequestTask.
    Pending pool results are processed at the end of the cycle.

    The process exits with status 1 when ``processResults()`` returns a negative value.

    :return: S_OK(), or the error returned by ``cacheRequest`` when it is not EALREADY
    """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )

      requestsToExecute = []

      if not self.__bulkRequest:
        self.log.info( "execute: ask for a single request" )
        getRequest = self.requestClient().getRequest()
        if not getRequest["OK"]:
          self.log.error( "execute: %s" % getRequest["Message"] )
          break
        if not getRequest["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        requestsToExecute = [getRequest["Value"] ]
      else:
        # # never ask for more than what is left to do in this cycle
        numberOfRequest = min( self.__bulkRequest, self.__requestsPerCycle - taskCounter )
        self.log.info( "execute: ask for %s requests" % numberOfRequest )
        getRequests = self.requestClient().getBulkRequests( numberOfRequest )
        if not getRequests["OK"]:
          self.log.error( "execute: %s" % getRequests["Message"] )
          break
        if not getRequests["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        for rId in getRequests["Value"]["Failed"]:
          self.log.error( "execute: %s" % getRequests["Value"]["Failed"][rId] )

        requestsToExecute = getRequests["Value"]["Successful"].values()

      self.log.info( "execute: will execute %s requests " % len( requestsToExecute ) )

      for request in requestsToExecute:
        # # set task id
        taskID = request.RequestID

        self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(),
                                                                      self.processPool().getNumWorkingProcesses() ) )

        # # number of sleep periods spent waiting for a free slot
        looping = 0
        while True:
          if not self.processPool().getFreeSlots():
            if not looping:
              self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
            time.sleep( self.__poolSleep )
            looping += 1
          else:
            if looping:
              # # the product must be parenthesised: '%' and '*' share the same precedence,
              # # so without parentheses the formatted string itself gets repeated
              self.log.info( "Free slot found after %d seconds" % ( looping * self.__poolSleep ) )
            looping = 0
            # # save current request in cache
            res = self.cacheRequest( request )
            if not res['OK']:
              if cmpError( res, errno.EALREADY ):
                # The request is already in the cache, skip it. break out of the while loop to get next request
                break
              # There are too many requests in the cache, commit suicide
              self.log.error( res['Message'], '(%d requests): put back all requests and exit cycle' % len( self.__requestCache ) )
              self.putAllRequests()
              return res
            # # serialize to JSON
            result = request.toJSON()
            if not result['OK']:
              # NOTE(review): the request was cached just above, so the retry presumably ends in the
              # EALREADY branch and the request stays in the cache unexecuted - confirm this is intended
              continue
            requestJSON = result['Value']
            self.log.info( "spawning task for request '%s/%s'" % ( request.RequestID, request.RequestName ) )
            timeOut = self.getTimeout( request )
            enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                             kwargs = { "requestJSON" : requestJSON,
                                                                        "handlersDict" : self.handlersDict,
                                                                        "csPath" : self.__configPath,
                                                                        "agentName": self.agentName },
                                                             taskID = taskID,
                                                             blocking = True,
                                                             usePoolCallbacks = True,
                                                             timeOut = timeOut )
            if not enqueue["OK"]:
              # # not enqueued: stay in the while loop and try again
              self.log.error( enqueue["Message"] )
            else:
              self.log.debug( "successfully enqueued task '%s'" % taskID )
              # # update monitor
              gMonitor.addMark( "Processed", 1 )
              # # update request counter
              taskCounter += 1
              # # task created, a little time kick to proceed
              time.sleep( 0.1 )
              break

    self.log.info( 'Flushing callbacks (%d requests still in cache)' % len( self.__requestCache ) )
    processed = self.processPool().processResults()
    # This happens when the result queue is screwed up.
    # Returning S_ERROR proved not to be sufficient,
    # and when in this situation, there is nothing we can do.
    # So we just exit. runit will restart from scratch.
    if processed < 0:
      self.log.fatal("Results queue is screwed up")
      sys.exit(1)
    # # clean return
    return S_OK()

  def getTimeout( self, request ):
    """ estimate the timeout of a request as the sum over its pending operations

    Only operations in 'Waiting', 'Scheduled' or 'Queued' status contribute. An
    operation whose type has an entry in ``self.timeOuts`` counts its per-operation
    value plus its per-file value times ``len( operation )``; any other type counts
    the default operation timeout.

    :param request: iterable of operations, exposing RequestID and RequestName
    :return: the accumulated timeout
    """
    pendingStates = ( "Waiting", "Scheduled", 'Queued' )
    total = 0
    for operation in request:
      if operation.Status not in pendingStates:
        continue
      if operation.Type in self.timeOuts:
        settings = self.timeOuts[operation.Type]
        operationPart = settings.get( "PerOperation", self.__operationTimeout )
        filesPart = settings.get( "PerFile", self.__fileTimeout ) * len( operation )
        total += operationPart + filesPart
      else:
        # # no dedicated settings for this operation type
        total += self.__operationTimeout
    summary = ( request.RequestID, request.RequestName, total )
    self.log.info( "estimated timeOut for request (%s/%s) is %s" % summary )
    return total

  def finalize( self ):
    """ agent finalization

    Finalizes the process pool, if one was created, using the pool timeout, then
    puts every request still in the cache back through ``putAllRequests``.

    :return: S_OK()
    """
    poolExists = bool( self.__processPool )
    if poolExists:
      pool = self.processPool()
      pool.finalize( timeout = self.__poolTimeout )
    # # whatever is still cached goes back
    self.putAllRequests()
    return S_OK()

  def resultCallback( self, taskID, taskResult ):
    """ pool callback invoked with the result of a task

    Passes the task result to ``putRequest`` and logs both the task outcome and
    the outcome of putting the request back.

    :param str taskID: Request.RequestID
    :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
    """
    # # clean cache
    putBack = self.putRequest( taskID, taskResult )

    # # outcome of the task itself
    if taskResult["OK"]:
      taskState = "S_OK"
      taskDetail = taskResult["Value"].Status
    else:
      taskState = "S_ERROR"
      taskDetail = taskResult["Message"]

    # # outcome of putting the request back
    if putBack['OK']:
      putState = "S_OK"
      putDetail = ''
    else:
      putState = 'S_ERROR'
      putDetail = putBack['Message']

    report = ( taskID, taskState, taskDetail, putState, putDetail )
    self.log.info( "callback: %s result is %s(%s), put %s(%s)" % report )


  def exceptionCallback( self, taskID, taskException ):
    """ pool callback invoked when a task ended with an exception

    Logs the exception, then calls ``putRequest`` for the task ID without a task result.

    :param str taskID: Request.RequestID
    :param Exception taskException: Exception instance
    """
    report = "exceptionCallback: %s was hit by exception %s" % ( taskID, taskException )
    self.log.error( report )
    self.putRequest( taskID )
示例#25
0
def processPool():
    """Yield a daemonized ``ProcessPool(4, 8, 8)``.

    Logger headers are switched on and the "TaskCallbacksTests" sub-logger is
    requested before the pool is created. Nothing runs until the generator is
    advanced.
    """
    gLogger.showHeaders(True)
    # the sub-logger is still requested, but its handle is not needed here
    gLogger.getSubLogger("TaskCallbacksTests")
    pool = ProcessPool(4, 8, 8)
    pool.daemonize()
    yield pool
示例#26
0
class RequestExecutingAgent(AgentModule):
  """
  .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation handlers and RequestTask

  Each cycle fetches requests from the ReqClient, keeps them in a local cache and
  enqueues one RequestTask per request into the ProcessPool. The pool callbacks
  take the request out of the cache and put it back through the ReqClient.
  """
  # # process pool
  __processPool = None
  # # request cache
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None
  # # Size of the bulk if use of getRequests. If 0, use getRequest
  __bulkRequest = 0

  def __init__(self, *args, **kwargs):
    """ c'tor

    Reads the agent options (pool sizing, timeouts, bulk size), builds the
    operation handler and timeout dictionaries from the OperationHandlers
    section of the agent configuration and registers the monitoring activities.

    :raises AgentConfigError: when the OperationHandlers section cannot be read
    """
    # # call base class ctor
    AgentModule.__init__(self, *args, **kwargs)
    # # ProcessPool related stuff
    self.__requestsPerCycle = self.am_getOption("RequestsPerCycle", self.__requestsPerCycle)
    self.log.info("Requests/cycle = %d" % self.__requestsPerCycle)
    self.__minProcess = self.am_getOption("MinProcess", self.__minProcess)
    self.log.info("ProcessPool min process = %d" % self.__minProcess)
    self.__maxProcess = self.am_getOption("MaxProcess", self.__maxProcess)
    self.log.info("ProcessPool max process = %d" % self.__maxProcess)
    self.__queueSize = self.am_getOption("ProcessPoolQueueSize", self.__queueSize)
    self.log.info("ProcessPool queue size = %d" % self.__queueSize)
    self.__poolTimeout = int(self.am_getOption("ProcessPoolTimeout", self.__poolTimeout))
    self.log.info("ProcessPool timeout = %d seconds" % self.__poolTimeout)
    self.__poolSleep = int(self.am_getOption("ProcessPoolSleep", self.__poolSleep))
    self.log.info("ProcessPool sleep time = %d seconds" % self.__poolSleep)
    self.__bulkRequest = self.am_getOption("BulkRequest", self.__bulkRequest)
    self.log.info("Bulk request size = %d" % self.__bulkRequest)

    # # keep config path and agent name
    self.agentName = self.am_getModuleParam("fullName")
    self.__configPath = PathFinder.getAgentSection(self.agentName)

    # # operation handlers over here
    opHandlersPath = "%s/%s" % (self.__configPath, "OperationHandlers")
    opHandlers = gConfig.getSections(opHandlersPath)
    if not opHandlers["OK"]:
      self.log.error(opHandlers["Message"])
      raise AgentConfigError("OperationHandlers section not found in CS under %s" % self.__configPath)
    opHandlers = opHandlers["Value"]

    # # per operation type timeouts: { type : { "PerFile" : .., "PerOperation" : .. } }
    self.timeOuts = dict()

    # # handlers dict: { type : location }
    self.handlersDict = dict()
    for opHandler in opHandlers:
      opHandlerPath = "%s/%s/Location" % (opHandlersPath, opHandler)
      opLocation = gConfig.getValue(opHandlerPath, "")
      if not opLocation:
        # # a handler without location is ignored
        self.log.error("%s not set for %s operation handler" % (opHandlerPath, opHandler))
        continue
      self.timeOuts[opHandler] = {"PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout}

      # # a value of 0 (also the default) keeps the agent-wide timeout
      opTimeout = gConfig.getValue("%s/%s/TimeOut" % (opHandlersPath, opHandler), 0)
      if opTimeout:
        self.timeOuts[opHandler]["PerOperation"] = opTimeout
      fileTimeout = gConfig.getValue("%s/%s/TimeOutPerFile" % (opHandlersPath, opHandler), 0)
      if fileTimeout:
        self.timeOuts[opHandler]["PerFile"] = fileTimeout

      self.handlersDict[opHandler] = opLocation

    self.log.info("Operation handlers:")
    for index, (opHandler, opLocation) in enumerate(self.handlersDict.items()):
      self.log.info("[%s] %s: %s (timeout: %d s + %d s per file)" % (index, opHandler, opLocation,
                                                                     self.timeOuts[opHandler]['PerOperation'],
                                                                     self.timeOuts[opHandler]['PerFile']))

    # # common monitor activity
    gMonitor.registerActivity("Iteration", "Agent Loops",
                              "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM)
    gMonitor.registerActivity("Processed", "Request Processed",
                              "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM)
    gMonitor.registerActivity("Done", "Request Completed",
                              "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM)
    # # create request dict (per instance, replaces the class level one)
    self.__requestCache = dict()

    # ?? Probably should be removed
    self.FTSMode = self.am_getOption("FTSMode", False)

  def processPool(self):
    """ facade for ProcessPool

    The pool is created on first use, with the agent callbacks attached, and daemonized.

    :return: the ProcessPool instance
    """
    if not self.__processPool:
      minProcess = max(1, self.__minProcess)
      maxProcess = max(self.__minProcess, self.__maxProcess)
      queueSize = abs(self.__queueSize)
      self.log.info("REA ProcessPool configuration", "minProcess = %d maxProcess = %d queueSize = %d" % (minProcess,
                                                                                                         maxProcess,
                                                                                                         queueSize))
      self.__processPool = ProcessPool(minProcess,
                                       maxProcess,
                                       queueSize,
                                       poolCallback=self.resultCallback,
                                       poolExceptionCallback=self.exceptionCallback)
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient(self):
    """ RequestClient getter

    :return: the ReqClient instance, created on first use
    """
    if not self.__requestClient:
      self.__requestClient = ReqClient()
    return self.__requestClient

  def cacheRequest(self, request):
    """ put request into requestCache

    :param ~Request.Request request: Request instance
    :return: S_OK(), or S_ERROR(errno.EALREADY, ...) when the request ID is already cached
    """
    maxProcess = max(self.__minProcess, self.__maxProcess)
    if len(self.__requestCache) > maxProcess + 50:
      # For the time being we just print a warning... If the ProcessPool is working well, this is not needed
      # We don't know how much is acceptable as it depends on many factors
      # (returning an error here is deliberately disabled)
      self.log.warn("Too many requests in cache", ': %d' % len(self.__requestCache))
    if request.RequestID in self.__requestCache:
      # We don't call  putRequest as we have got back the request that is still being executed. Better keep it
      # The main reason for this is that it lasted longer than the kick time of CleanReqAgent
      self.log.warn("Duplicate request, keep it but don't execute",
                    ': %d/%s' % (request.RequestID, request.RequestName))
      return S_ERROR(errno.EALREADY, 'Request already in cache')
    self.__requestCache[request.RequestID] = request
    return S_OK()

  def putRequest(self, requestID, taskResult=None):
    """ put back :requestID: to RequestClient

    The request is removed from the cache. When a successful task result is
    given, the request it carries replaces the cached one, and nothing is put
    back if that request is 'Done'. When the task result is an ETIME error, the
    Attempt counter of the files returned by ``request.getWaiting()`` is incremented.

    :param str requestID: request's id
    :param dict taskResult: optional task result S_OK(Request)/S_ERROR(Message)
    :return: S_OK(), or S_ERROR when the request is not in the cache or cannot be put back
    """
    if requestID in self.__requestCache:
      request = self.__requestCache.pop(requestID)
      if taskResult:
        if taskResult['OK']:
          request = taskResult['Value']
          # The RequestTask is putting back the Done tasks, no need to redo it
          if request.Status == 'Done':
            return S_OK()
        # In case of timeout, we need to increment ourselves all the attempts
        elif cmpError(taskResult, errno.ETIME):
          waitingOp = request.getWaiting()
          for rmsFile in waitingOp.get('Value', []):
            rmsFile.Attempt += 1

      reset = self.requestClient().putRequest(request, useFailoverProxy=False, retryMainService=2)
      if not reset["OK"]:
        return S_ERROR("putRequest: unable to reset request %s: %s" % (requestID, reset["Message"]))
    else:
      return S_ERROR('Not in cache')
    return S_OK()

  def putAllRequests(self):
    """ put back all requests without callback called into requestClient

    A failure for one request is logged and does not stop the loop.

    :param self: self reference
    :return: S_OK() in all cases
    """
    self.log.info("putAllRequests: will put back requests", "%s" % len(self.__requestCache))
    # putRequest() pops the entry from the cache, so loop over a snapshot of the keys:
    # a dict must not change size while it is being iterated
    for requestID in list(self.__requestCache):
      reset = self.putRequest(requestID)
      if not reset["OK"]:
        self.log.error('Failed to put request', reset["Message"])
      else:
        self.log.debug("putAllRequests: request %s has been put back with its initial state" % requestID)
    return S_OK()

  def initialize(self):
    """ initialize agent

    :return: S_OK()
    """
    return S_OK()

  def execute(self):
    """ read requests from RequestClient and enqueue them into ProcessPool

    One agent cycle: requests are fetched (one at a time, or in bulk when
    ``__bulkRequest`` is non-zero) until ``__requestsPerCycle`` tasks have been
    enqueued or the client returns an error or nothing. Each request is cached,
    serialized to JSON and submitted to the process pool as a RequestTask.
    Pending pool results are processed at the end of the cycle.

    The process exits with status 1 when ``processResults()`` returns a negative value.

    :return: S_OK(), or the error returned by :meth:`cacheRequest` when it is not EALREADY
    """
    gMonitor.addMark("Iteration", 1)
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug("execute: executing %d request in this cycle" % taskCounter)

      requestsToExecute = []

      if not self.__bulkRequest:
        self.log.info("execute: ask for a single request")
        getRequest = self.requestClient().getRequest()
        if not getRequest["OK"]:
          self.log.error("execute:", "%s" % getRequest["Message"])
          break
        if not getRequest["Value"]:
          self.log.info("execute: no more 'Waiting' requests to process")
          break
        requestsToExecute = [getRequest["Value"]]
      else:
        # # never ask for more than what is left to do in this cycle
        numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
        self.log.info("execute: ask for requests", "%s" % numberOfRequest)
        getRequests = self.requestClient().getBulkRequests(numberOfRequest)
        if not getRequests["OK"]:
          self.log.error("execute:", "%s" % getRequests["Message"])
          break
        if not getRequests["Value"]:
          self.log.info("execute: no more 'Waiting' requests to process")
          break
        for rId in getRequests["Value"]["Failed"]:
          self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])

        requestsToExecute = getRequests["Value"]["Successful"].values()

      self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute))

      for request in requestsToExecute:
        # # set task id
        taskID = request.RequestID

        self.log.info("processPool status", "tasks idle = %s working = %s" %
                      (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()))

        # # number of sleep periods spent waiting for a free slot
        looping = 0
        while True:
          if not self.processPool().getFreeSlots():
            if not looping:
              self.log.info("No free slots available in processPool",
                            "will wait %d seconds to proceed" % self.__poolSleep)
            time.sleep(self.__poolSleep)
            looping += 1
          else:
            if looping:
              # # the product must be parenthesised: '%' and '*' share the same precedence,
              # # so without parentheses the formatted string itself gets repeated
              self.log.info("Free slot found", "after %d seconds" % (looping * self.__poolSleep))
            looping = 0
            # # save current request in cache
            res = self.cacheRequest(request)
            if not res['OK']:
              if cmpError(res, errno.EALREADY):
                # The request is already in the cache, skip it. break out of the while loop to get next request
                break
              # There are too many requests in the cache, commit suicide
              self.log.error("Too many requests in cache",
                             '(%d requests): put back all requests and exit cycle. Error %s' %
                             (len(self.__requestCache), res['Message']))
              self.putAllRequests()
              return res
            # # serialize to JSON
            result = request.toJSON()
            if not result['OK']:
              # NOTE(review): the request was cached just above, so the retry is answered with
              # EALREADY by cacheRequest and the request stays in the cache unexecuted -
              # confirm this is intended
              continue
            requestJSON = result['Value']
            self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName))
            timeOut = self.getTimeout(request)
            enqueue = self.processPool().createAndQueueTask(RequestTask,
                                                            kwargs={"requestJSON": requestJSON,
                                                                    "handlersDict": self.handlersDict,
                                                                    "csPath": self.__configPath,
                                                                    "agentName": self.agentName},
                                                            taskID=taskID,
                                                            blocking=True,
                                                            usePoolCallbacks=True,
                                                            timeOut=timeOut)
            if not enqueue["OK"]:
              # # not enqueued: stay in the while loop and try again
              self.log.error("Could not enqueue task", enqueue["Message"])
            else:
              self.log.debug("successfully enqueued task", "'%s'" % taskID)
              # # update monitor
              gMonitor.addMark("Processed", 1)
              # # update request counter
              taskCounter += 1
              # # task created, a little time kick to proceed
              time.sleep(0.1)
              break

    self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
    processed = self.processPool().processResults()
    # This happens when the result queue is screwed up.
    # Returning S_ERROR proved not to be sufficient,
    # and when in this situation, there is nothing we can do.
    # So we just exit. runit will restart from scratch.
    if processed < 0:
      self.log.fatal("Results queue is screwed up")
      sys.exit(1)
    # # clean return
    return S_OK()

  def getTimeout(self, request):
    """ get timeout for request

    Sum over the operations in 'Waiting', 'Scheduled' or 'Queued' status. A type
    known to ``self.timeOuts`` counts its per-operation value plus its per-file
    value times ``len(op)``; any other type counts the default operation timeout.

    :param ~Request.Request request: Request instance
    :return: the accumulated timeout
    """
    timeout = 0
    for op in request:
      if op.Status not in ("Waiting", "Scheduled", 'Queued'):
        continue
      if op.Type not in self.timeOuts:
        timeout += self.__operationTimeout
      else:
        perOp = self.timeOuts[op.Type].get("PerOperation", self.__operationTimeout)
        perFiles = self.timeOuts[op.Type].get("PerFile", self.__fileTimeout) * len(op)
        timeout += perOp + perFiles
    self.log.info("estimated timeOut for request", "(%s/%s) is %s" % (request.RequestID, request.RequestName, timeout))
    return timeout

  def finalize(self):
    """ agent finalization

    Finalizes the process pool, if one was created, then puts back every
    request still in the cache.

    :return: S_OK()
    """
    if self.__processPool:
      self.processPool().finalize(timeout=self.__poolTimeout)
    self.putAllRequests()
    return S_OK()

  def resultCallback(self, taskID, taskResult):
    """ definition of request callback function

    Passes the task result to :meth:`putRequest` and logs both outcomes.

    :param str taskID: Request.RequestID
    :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
    """
    # # clean cache
    res = self.putRequest(taskID, taskResult)
    self.log.info(
        "callback:",
        "%s result is %s(%s), put %s(%s)" %
        (taskID,
         "S_OK" if taskResult["OK"] else "S_ERROR",
         taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"],
         "S_OK" if res['OK'] else 'S_ERROR',
         '' if res['OK'] else res['Message']))

  def exceptionCallback(self, taskID, taskException):
    """ definition of exception callback function

    Logs the exception and puts the request back without a task result.

    :param str taskID: Request.RequestID
    :param Exception taskException: Exception instance
    """
    self.log.error("exceptionCallback:", "%s was hit by exception %s" % (taskID, taskException))
    self.putRequest(taskID)
示例#27
0
class PoolComputingElement(ComputingElement):
    """ Computing element running jobs as tasks of a local ProcessPool.

    The number of cores allocated to each queued task is kept in
    ``coresPerTask`` (task ID -> cores) and released by the pool callback
    :meth:`finalizeJob`.
    """

    mandatoryParameters = MandatoryParameters

    #############################################################################
    def __init__(self, ceUniqueID, cores=0):
        """ Standard constructor.

        :param ceUniqueID: identifier passed to the ComputingElement constructor
        :param int cores: number of cores to use; when not positive,
                          getNumberOfCores() is used instead
        """
        ComputingElement.__init__(self, ceUniqueID)
        self.ceType = "Pool"
        self.submittedJobs = 0
        if cores > 0:
            self.cores = cores
        else:
            self.cores = getNumberOfCores()
        self.pPool = ProcessPool(self.cores,
                                 self.cores,
                                 poolCallback=self.finalizeJob)
        # ID given to the next submitted task
        self.taskID = 0
        # task ID -> number of cores allocated to that task
        self.coresPerTask = {}

    #############################################################################
    def _addCEConfigDefaults(self):
        """ Method to make sure all necessary Configuration Parameters are defined
        """
        # First assure that any global parameters are loaded
        ComputingElement._addCEConfigDefaults(self)

    def getCoresInUse(self):
        """ Return the total number of cores allocated to the registered tasks.
        """
        return sum(self.coresPerTask.values())

    #############################################################################
    def submitJob(self, executableFile, proxy, **kwargs):
        """ Method to submit job.

        :param executableFile: passed on to executeJob
        :param proxy: passed on to executeJob
        :param kwargs: ``WholeNode`` (true value: take all the cores, only when none
                       is in use) or ``NumberOfCores``; one core is requested otherwise
        :return: the result of createAndQueueTask, or S_ERROR when there are not
                 enough free cores
        """

        self.pPool.processResults()

        coresInUse = self.getCoresInUse()
        if "WholeNode" in kwargs and kwargs['WholeNode']:
            if coresInUse > 0:
                return S_ERROR('Can not take WholeNode job')
            requestedCores = self.cores
        elif "NumberOfCores" in kwargs:
            requestedCores = int(kwargs['NumberOfCores'])
        else:
            requestedCores = 1
        if self.cores - coresInUse < requestedCores:
            return S_ERROR('Not enough slots: requested %d, available %d' %
                           (requestedCores, self.cores - coresInUse))

        ret = getProxyInfo()
        if not ret['OK']:
            pilotProxy = None
        else:
            pilotProxy = ret['Value']['path']
        self.log.notice('Pilot Proxy:', pilotProxy)

        # The cores must be recorded under the very ID the task is queued with:
        # finalizeJob() receives that ID back and releases the matching entry
        taskID = self.taskID
        self.taskID += 1
        self.coresPerTask[taskID] = requestedCores
        result = self.pPool.createAndQueueTask(
            executeJob, [executableFile, proxy, taskID],
            None,
            taskID,
            usePoolCallbacks=True)
        if not result['OK']:
            # nothing was queued, so no callback will ever release these cores
            self.coresPerTask.pop(taskID, None)

        self.pPool.processResults()

        return result

    def finalizeJob(self, taskID, result):
        """ Finalize the job: release the cores recorded for the task.

        :param taskID: ID the task was queued with
        :param result: task result (not used)
        """
        # tolerate an unknown ID rather than raising from inside a pool callback
        self.coresPerTask.pop(taskID, None)

    #############################################################################
    def getCEStatus(self):
        """ Method to return information on running and pending jobs.

        :return: S_OK() with SubmittedJobs, RunningJobs, WaitingJobs, UsedCores
                 and AvailableCores keys added
        """
        self.pPool.processResults()
        result = S_OK()
        result['SubmittedJobs'] = 0
        # a task counts as running when it holds at least one core
        result['RunningJobs'] = sum(1 for value in self.coresPerTask.values() if value > 0)
        result['WaitingJobs'] = 0
        coresInUse = self.getCoresInUse()
        result['UsedCores'] = coresInUse
        result['AvailableCores'] = self.cores - coresInUse
        return result

    #############################################################################
    def monitorProxy(self, pilotProxy, payloadProxy):
        """ Monitor the payload proxy and renew as necessary.
        """
        return self._monitorProxy(pilotProxy, payloadProxy)
示例#28
0
class RequestExecutingAgent( AgentModule ):
  """
  .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation handlers and RequestTask
  """
  # # process pool
  __processPool = None
  # # request cache
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None
  # # Size of the bulk if use of getRequests. If 0, use getRequest
  __bulkRequest = 0

  def __init__( self, *args, **kwargs ):
    """ Set up the agent from its configuration.

    Reads the ProcessPool tuning options, collects the operation handlers (location and
    timeouts) declared under <agent section>/OperationHandlers, registers the monitoring
    activities and starts with an empty request cache.

    :param args: positional arguments handed to AgentModule.__init__
    :param kwargs: keyword arguments handed to AgentModule.__init__
    :raises AgentConfigError: when the OperationHandlers section cannot be listed
    """
    AgentModule.__init__( self, *args, **kwargs )

    # # ProcessPool tuning: every option is logged as soon as it has been read
    requestsPerCycle = self.am_getOption( "RequestsPerCycle", self.__requestsPerCycle )
    self.__requestsPerCycle = requestsPerCycle
    self.log.info( "Requests/cycle = %d" % requestsPerCycle )

    minProcess = self.am_getOption( "MinProcess", self.__minProcess )
    self.__minProcess = minProcess
    self.log.info( "ProcessPool min process = %d" % minProcess )

    maxProcess = self.am_getOption( "MaxProcess", 4 )
    self.__maxProcess = maxProcess
    self.log.info( "ProcessPool max process = %d" % maxProcess )

    queueSize = self.am_getOption( "ProcessPoolQueueSize", self.__queueSize )
    self.__queueSize = queueSize
    self.log.info( "ProcessPool queue size = %d" % queueSize )

    poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", self.__poolTimeout ) )
    self.__poolTimeout = poolTimeout
    self.log.info( "ProcessPool timeout = %d seconds" % poolTimeout )

    poolSleep = int( self.am_getOption( "ProcessPoolSleep", self.__poolSleep ) )
    self.__poolSleep = poolSleep
    self.log.info( "ProcessPool sleep time = %d seconds" % poolSleep )

    taskTimeout = int( self.am_getOption( "ProcessTaskTimeout", self.__taskTimeout ) )
    self.__taskTimeout = taskTimeout
    self.log.info( "ProcessTask timeout = %d seconds" % taskTimeout )

    bulkRequest = self.am_getOption( "BulkRequest", 0 )
    self.__bulkRequest = bulkRequest
    self.log.info( "Bulk request size = %d" % bulkRequest )

    # # remember who we are and where our configuration lives
    self.agentName = self.am_getModuleParam( "fullName" )
    self.__configPath = PathFinder.getAgentSection( self.agentName )

    # # list the operation handlers declared in the CS
    handlersSection = "%s/%s" % ( self.__configPath, "OperationHandlers" )
    sections = gConfig.getSections( handlersSection )
    if not sections["OK"]:
      self.log.error( sections["Message" ] )
      raise AgentConfigError( "OperationHandlers section not found in CS under %s" % self.__configPath )

    # # handler name -> { "PerFile": ..., "PerOperation": ... }
    self.timeOuts = {}
    # # handler name -> handler location
    self.handlersDict = {}
    for handlerName in sections["Value"]:
      locationOption = "%s/%s/Location" % ( handlersSection, handlerName )
      location = gConfig.getValue( locationOption, "" )
      if not location:
        # a handler without a location is reported and left out
        self.log.error( "%s not set for %s operation handler" % ( locationOption, handlerName ) )
        continue

      # start from the class defaults, then apply per-handler overrides when they are non-zero
      limits = { "PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout }
      self.timeOuts[handlerName] = limits
      perOperation = gConfig.getValue( "%s/%s/TimeOut" % ( handlersSection, handlerName ), 0 )
      if perOperation:
        limits["PerOperation"] = perOperation
      perFile = gConfig.getValue( "%s/%s/TimeOutPerFile" % ( handlersSection, handlerName ), 0 )
      if perFile:
        limits["PerFile"] = perFile

      self.handlersDict[handlerName] = location

    self.log.info( "Operation handlers:" )
    for index, ( handlerName, location ) in enumerate( self.handlersDict.items() ):
      self.log.info( "[%s] %s: %s (timeout: %d s + %d s per file)" % ( index, handlerName, location,
                                                                   self.timeOuts[handlerName]['PerOperation'],
                                                                   self.timeOuts[handlerName]['PerFile'] ) )

    # # common monitor activity
    for activity, description, unit in ( ( "Iteration", "Agent Loops", "Loops/min" ),
                                         ( "Processed", "Request Processed", "Requests/min" ),
                                         ( "Done", "Request Completed", "Requests/min" ) ):
      gMonitor.registerActivity( activity, description,
                                 "RequestExecutingAgent", unit, gMonitor.OP_SUM )

    # # per-instance request cache, replacing the class level placeholder
    self.__requestCache = {}

    self.FTSMode = self.am_getOption( "FTSMode", False )



  def processPool( self ):
    """ Lazily create the ProcessPool and return it.

    On first use the pool is built with this agent's result and exception callbacks
    attached and daemonized right away; later calls return the same instance.

    :return: ProcessPool instance
    """
    if not self.__processPool:
      minProcess = max( 1, self.__minProcess )
      # derive the upper bound from the clamped lower bound so it can never fall below it
      maxProcess = max( minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                       maxProcess,
                                                                                       queueSize ) )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient( self ):
    """ Return the ReqClient of this agent, instantiating it on first access.

    :return: ReqClient instance
    """
    client = self.__requestClient
    if not client:
      client = self.__requestClient = ReqClient()
    return client

  def cacheRequest( self, request ):
    """ Store a request in the request cache, keyed by its RequestID.

    When the ID is already cached the method checks up to five times, sleeping one
    second between checks, before giving up; on giving up the request is handed back
    to the RequestClient.

    :param Request request: Request instance
    :return: S_OK() once cached, S_ERROR for a duplicate that never cleared
    """
    attemptsLeft = 5
    while True:
      if request.RequestID not in self.__requestCache:
        self.__requestCache[ request.RequestID ] = request
        return S_OK()
      # There may be a race between RequestTask putting the request back and the
      # callback clearing the cache, hence the short wait before declaring a duplicate
      attemptsLeft -= 1
      if attemptsLeft == 0:
        self.requestClient().putRequest( request, useFailoverProxy = False, retryMainService = 2 )
        return S_ERROR( "Duplicate request, ignore: %s" % request.RequestID )
      time.sleep( 1 )

  def putRequest( self, requestID, taskResult = None ):
    """ Remove :requestID: from the cache and put the request back to the RequestClient.

    :param str requestID: request's id
    :param dict taskResult: optional task result; when it is OK its Value is sent back
                            instead of the cached request
    :return: S_OK(), or S_ERROR when the id is not cached or the put fails
    """
    if requestID not in self.__requestCache:
      return S_ERROR( 'Not in cache' )

    cached = self.__requestCache.pop( requestID )
    useTaskValue = taskResult and taskResult['OK']
    toPut = taskResult['Value'] if useTaskValue else cached

    reset = self.requestClient().putRequest( toPut, useFailoverProxy = False, retryMainService = 2 )
    if reset["OK"]:
      return S_OK()
    return S_ERROR( "putRequest: unable to reset request %s: %s" % ( requestID, reset["Message"] ) )

  def putAllRequests( self ):
    """ Put every request still held in the cache back to the RequestClient.

    Failures are logged per request and do not stop the loop.

    :param self: self reference
    :return: S_OK()
    """
    self.log.info( "putAllRequests: will put %s back requests" % len( self.__requestCache ) )
    # putRequest() pops the entry from the cache, so walk a snapshot of the keys:
    # iterating the live dictionary while it shrinks raises RuntimeError on Python 3
    for requestID in list( self.__requestCache ):
      reset = self.putRequest( requestID )
      if not reset["OK"]:
        self.log.error( 'Failed to put request', reset["Message"] )
      else:
        self.log.debug( "putAllRequests: request %s has been put back with its initial state" % requestID )
    return S_OK()

  def initialize( self ):
    """ Agent initialization hook; there is nothing to prepare here.

    :return: S_OK()
    """
    outcome = S_OK()
    return outcome

  def execute( self ):
    """ Read requests from the RequestClient and enqueue them into the ProcessPool.

    Requests are fetched one at a time, or in bulk when the bulk request size is set,
    until the per-cycle limit of queued tasks is reached or nothing is left to fetch.
    Every request is cached, serialized to JSON and submitted as a RequestTask; while
    the pool reports no free slot the agent sleeps between checks.

    :return: S_OK()
    """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )

      requestsToExecute = []

      if not self.__bulkRequest:
        self.log.info( "execute: ask for a single request" )
        getRequest = self.requestClient().getRequest()
        if not getRequest["OK"]:
          self.log.error( "execute: %s" % getRequest["Message"] )
          break
        if not getRequest["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        requestsToExecute = [getRequest["Value"] ]
      else:
        # never ask for more than what is left in this cycle
        numberOfRequest = min( self.__bulkRequest, self.__requestsPerCycle - taskCounter )
        self.log.info( "execute: ask for %s requests" % numberOfRequest )
        getRequests = self.requestClient().getBulkRequests( numberOfRequest )
        if not getRequests["OK"]:
          self.log.error( "execute: %s" % getRequests["Message"] )
          break
        if not getRequests["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        for rId in getRequests["Value"]["Failed"]:
          self.log.error( "execute: %s" % getRequests["Value"]["Failed"][rId] )

        requestsToExecute = getRequests["Value"]["Successful"].values()

      self.log.info( "execute: will execute %s requests " % len( requestsToExecute ) )

      for request in requestsToExecute:
        # # set task id
        taskID = request.RequestID
        # # save current request in cache; a duplicate has already been handed back to
        # # the RequestClient by cacheRequest, so it must not be executed again here
        cached = self.cacheRequest( request )
        if not cached["OK"]:
          self.log.error( "execute: %s" % cached["Message"] )
          continue
        # # serialize to JSON
        result = request.toJSON()
        if not result['OK']:
          self.log.error( "execute: unable to serialize request %s: %s" % ( taskID, result["Message"] ) )
          continue
        requestJSON = result['Value']

        self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(),
                                                                      self.processPool().getNumWorkingProcesses() ) )

        # # number of sleeps spent waiting for a free slot
        looping = 0
        while True:
          if not self.processPool().getFreeSlots():
            if not looping:
              self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
            time.sleep( self.__poolSleep )
            looping += 1
          else:
            if looping:
              # parenthesized: '%' and '*' share precedence, so without the brackets the
              # formatted string would be repeated instead of the seconds being multiplied
              self.log.info( "Free slot found after %d seconds" % ( looping * self.__poolSleep ) )
            looping = 0
            self.log.info( "spawning task for request '%s/%s'" % ( request.RequestID, request.RequestName ) )
            timeOut = self.getTimeout( request )
            enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                             kwargs = { "requestJSON" : requestJSON,
                                                                        "handlersDict" : self.handlersDict,
                                                                        "csPath" : self.__configPath,
                                                                        "agentName": self.agentName },
                                                             taskID = taskID,
                                                             blocking = True,
                                                             usePoolCallbacks = True,
                                                             timeOut = timeOut )
            if not enqueue["OK"]:
              # not queued: stay in the loop and try again
              self.log.error( enqueue["Message"] )
            else:
              self.log.debug( "successfully enqueued task '%s'" % taskID )
              # # update monitor
              gMonitor.addMark( "Processed", 1 )
              # # update request counter
              taskCounter += 1
              # # task created, a little time kick to proceed
              time.sleep( 0.1 )
              break

    # # clean return
    return S_OK()

  def getTimeout( self, request ):
    """ Estimate the timeout of a request from its pending operations.

    Only operations in status Waiting, Scheduled or Queued count. A type without an
    entry in self.timeOuts adds the default operation timeout; otherwise the per
    operation timeout plus the per file timeout times len( operation ) is added.

    :param request: iterable of operations exposing Status and Type
    :return: accumulated timeout
    """
    estimate = 0
    for operation in request:
      if operation.Status in ( "Waiting", "Scheduled", 'Queued' ):
        if operation.Type in self.timeOuts:
          limits = self.timeOuts[operation.Type]
          operationPart = limits.get( "PerOperation", self.__operationTimeout )
          filesPart = limits.get( "PerFile", self.__fileTimeout ) * len( operation )
          estimate += operationPart + filesPart
        else:
          estimate += self.__operationTimeout
    self.log.info( "estimated timeOut for request (%s/%s) is %s" % ( request.RequestID, request.RequestName, estimate ) )
    return estimate

  def finalize( self ):
    """ Agent finalization: stop the pool if one was created, then put back cached requests.

    :return: S_OK()
    """
    if self.__processPool:
      pool = self.processPool()
      pool.finalize( timeout = self.__poolTimeout )
    self.putAllRequests()
    return S_OK()

  def resultCallback( self, taskID, taskResult ):
    """ Request callback: put the request back and log both outcomes.

    :param str taskID: Request.RequestID
    :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
    """
    # # clean cache
    putResult = self.putRequest( taskID, taskResult )

    if taskResult["OK"]:
      taskState, taskDetail = "S_OK", taskResult["Value"].Status
    else:
      taskState, taskDetail = "S_ERROR", taskResult["Message"]

    if putResult['OK']:
      putState, putDetail = "S_OK", ''
    else:
      putState, putDetail = 'S_ERROR', putResult['Message']

    self.log.info( "callback: %s result is %s(%s), put %s(%s)" % ( taskID, taskState, taskDetail,
                                                                   putState, putDetail ) )


  def exceptionCallback( self, taskID, taskException ):
    """ Exception callback: report the failure and put the cached request back.

    :param str taskID: Request.RequestID
    :param Exception taskException: Exception instance
    """
    report = "exceptionCallback: %s was hit by exception %s" % ( taskID, taskException )
    self.log.error( report )
    self.putRequest( taskID )
示例#29
0
      for path in result['Value']['Directories']:
        random.shuffle(lfcHosts)
        #print pPool.getNumWorkingProcesses(), pPool.hasPendingTasks()
        print "Queueing task for directory %s, lfc %s" % ( path, lfcHosts[0] )
        result = pPool.createAndQueueTask( processDir, [path , writerQueue, False, lfcHosts[0]], callback = finalizeDirectory )
        if not result['OK']:
          print "Failed queueing %s" % path
  else:
    print "Task failed: %s" % result['Message']
    if 'Path' in result:
      random.shuffle(lfcHosts)
      print "Requeueing task for directory %s, lfc %s" % ( result['Path'], lfcHosts[0] )

#########################################################################

# Process pool shared by the directory tasks.
# NOTE(review): arguments assumed to be ( minSize, maxSize, queueSize ) - confirm against ProcessPool
pPool = ProcessPool(30,40,0)

# Manager-backed queue; it is handed to every processDir task as its second argument
manager = Manager()
writerQueue = manager.Queue()
# Shared flag created with value 0 (presumably multiprocessing.Value - confirm against the imports)
stopFlag = Value( 'i', 0 )

# Daemonizing the pool is currently disabled
#pPool.daemonize()

# Alternative host list, currently disabled; only the single host assigned below is used
# lfcHosts = ['lfc-lhcb-ro.cern.ch',
#             'lfc-lhcb-ro.cr.cnaf.infn.it',
#             'lhcb-lfc-fzk.gridka.de',
#             'lfc-lhcb-ro.in2p3.fr',
#             'lfc-lhcb.grid.sara.nl',
#             'lfclhcb.pic.es',
#             'lhcb-lfc.gridpp.rl.ac.uk']
lfcHosts = ['prod-lfc-lhcb-ro.cern.ch']
示例#30
0
 def setUp( self ):
   """ Test fixture: enable log headers, create a class-named sub-logger and a daemonized ProcessPool( 4, 8, 8 ). """
   gLogger.showHeaders( True )
   loggerName = self.__class__.__name__
   self.log = gLogger.getSubLogger( loggerName )
   self.processPool = pool = ProcessPool( 4, 8, 8 )
   pool.daemonize()
示例#31
0
class TaskTimeOutTests( unittest.TestCase ):
  """
  .. class:: TaskTimeOutTests

  test case for ProcessPool
  """

  def setUp( self ):
    """ Test fixture: parse the command line, set up logging and a daemonized ProcessPool.

    The pool is created as ProcessPool( 2, 4, 8 ) with this test case's pool-level
    result and exception callbacks attached.

    :param self: self reference
    """
    from DIRAC.Core.Base import Script
    Script.parseCommandLine()
    from DIRAC.FrameworkSystem.Client.Logger import gLogger
    gLogger.showHeaders( True )
    loggerName = self.__class__.__name__
    self.log = gLogger.getSubLogger( loggerName )
    poolCallbacks = { "poolCallback" : self.poolCallback,
                      "poolExceptionCallback" : self.poolExceptionCallback }
    self.processPool = pool = ProcessPool( 2, 4, 8, **poolCallbacks )
    pool.daemonize()
    
  def poolCallback( self, taskID, taskResult ):
    """ Pool-level result callback: log the task id together with its result.

    :param taskID: id of the task
    :param taskResult: result of the task
    """
    message = "callback result for %s is %s" % ( taskID, taskResult )
    self.log.always( message )

  def poolExceptionCallback( self, taskID, taskException ):
    """ Pool-level exception callback: log the task id together with the exception.

    :param taskID: id of the task
    :param taskException: exception hit by the task
    """
    message = "callback exception for %s is %s" % ( taskID, taskException )
    self.log.always( message )


  def testCallableClass( self ):
    """ CallableClass and task time out test

    Queues 16 CallableClass tasks with a 15 second task timeout; each task gets a
    random wait of 0 to 50 seconds in steps of 10, and a zero wait asks the task to raise.
    """
    queued = 0
    while queued < 16:
      freeSlots = self.processPool.getFreeSlots()
      if not ( freeSlots > 0 ):
        # pool is full, poll again
        continue
      timeWait = random.randint( 0, 5 ) * 10
      raiseException = not timeWait
      result = self.processPool.createAndQueueTask( CallableClass,
                                                    taskID = queued,
                                                    args = ( queued, timeWait, raiseException ),
                                                    timeOut = 15,
                                                    usePoolCallbacks = True,
                                                    blocking = True )
      if result["OK"]:
        self.log.always("CallableClass enqueued to task %s timeWait=%s exception=%s" % ( queued, timeWait, raiseException ) )
        queued += 1
    self.processPool.finalize( 2 )
    
  def testCallableFunc( self ):
    """ CallableFunc and task timeout test

    Queues 16 CallableFunc tasks with a 15 second task timeout; each task gets a
    random wait of 0 to 25 seconds in steps of 5, and a zero wait asks the task to raise.
    """
    queued = 0
    while queued < 16:
      freeSlots = self.processPool.getFreeSlots()
      if not ( freeSlots > 0 ):
        # pool is full, poll again
        continue
      timeWait = random.randint(0, 5) * 5
      raiseException = not timeWait
      result = self.processPool.createAndQueueTask( CallableFunc,
                                                    taskID = queued,
                                                    args = ( queued, timeWait, raiseException ),
                                                    timeOut = 15,
                                                    usePoolCallbacks = True,
                                                    blocking = True )
      if result["OK"]:
        self.log.always("CallableFunc enqueued to task %s timeWait=%s exception=%s" % ( queued, timeWait, raiseException ) )
        queued += 1
    self.processPool.finalize( 2 )

  def testLockedClass( self ):
    """ LockedCallableClass and task time out test

    Runs two rounds of 16 tasks with a 15 second task timeout. Waits of 20 seconds or
    more use LockedCallableClass, shorter ones CallableClass; a wait of exactly
    5 seconds asks the task to raise. After each round the test spins for a while,
    and gLock is released once the pool has been finalized.
    """
    for loop in range(2):
      self.log.always( "loop %s" % loop )
      queued = 0
      while queued < 16:
        freeSlots = self.processPool.getFreeSlots()
        if not ( freeSlots > 0 ):
          # pool is full, poll again
          continue
        timeWait = random.randint(0, 5) * 5
        raiseException = ( timeWait == 5 )
        klass = LockedCallableClass if timeWait >= 20 else CallableClass
        result = self.processPool.createAndQueueTask( klass,
                                                      taskID = queued,
                                                      args = ( queued, timeWait, raiseException ),
                                                      timeOut = 15,
                                                      usePoolCallbacks = True,
                                                      blocking = True )
        if result["OK"]:
          self.log.always("%s enqueued to task %s timeWait=%s exception=%s" % ( klass.__name__ , queued, timeWait, raiseException ) )
          queued += 1
      self.log.always("being idle for a while")
      # busy spin, no sleep involved
      for _outer in range(100000):
        for _inner in range(1000):
          pass

    self.log.always("finalizing...")
    self.processPool.finalize( 10 )
    ## unlock
    gLock.release()
示例#32
0
  def submitJob(self, executableFile, proxy, **kwargs):
    """ Method to submit job.

    :param str executableFile: location of the executable file
    :param str proxy: payload proxy
    :param kwargs: optional ``wholeNode`` (truthy to claim every processor) or
                   ``numberOfProcessors`` (converted with int()); one processor is
                   requested when neither is given

    :return: S_OK/S_ERROR of the result of the job submission
    """

    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    # Process pending pool results before counting what is in use
    self.pPool.processResults()

    processorsInUse = self.getProcessorsInUse()
    if kwargs.get('wholeNode'):
      if processorsInUse > 0:
        return S_ERROR('Can not take WholeNode job')
      requestedProcessors = self.processors
    elif "numberOfProcessors" in kwargs:
      requestedProcessors = int(kwargs['numberOfProcessors'])
    else:
      requestedProcessors = 1

    # Single capacity check for every branch above
    availableProcessors = self.processors - processorsInUse
    if availableProcessors < requestedProcessors:
      return S_ERROR('Not enough slots: requested %d, available %d' % (requestedProcessors,
                                                                       availableProcessors))

    ret = getProxyInfo()
    if not ret['OK']:
      pilotProxy = None
    else:
      pilotProxy = ret['Value']['path']
    self.log.notice('Pilot Proxy:', pilotProxy)

    # Keyword arguments of the task itself; they replace the submission options from here on
    kwargs = {'UseSudo': False}
    if self.useSudo:
      # NOTE(review): the chosen number is not recorded in userNumberPerTask by this
      # method, and the last candidate is reused when every number is taken - confirm intent
      for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
        if nUser not in self.userNumberPerTask.values():
          break
      kwargs['NUser'] = nUser
      kwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
      kwargs['UseSudo'] = True

    result = self.pPool.createAndQueueTask(executeJob,
                                           args=(executableFile, proxy, self.taskID),
                                           kwargs=kwargs,
                                           taskID=self.taskID,
                                           usePoolCallbacks=True)
    # Book the processors only for a task that was really queued: a task that never
    # entered the pool cannot trigger the pool callback, so its booking would never
    # be released
    if result['OK']:
      self.processorsPerTask[self.taskID] = requestedProcessors
    self.taskID += 1

    self.pPool.processResults()

    return result
示例#33
0
                result = pPool.createAndQueueTask(
                    processDir, [path, writerQueue, False, lfcHosts[0]],
                    callback=finalizeDirectory)
                if not result['OK']:
                    print("Failed queueing %s" % path)
    else:
        print("Task failed: %s" % result['Message'])
        if 'Path' in result:
            random.shuffle(lfcHosts)
            print("Requeueing task for directory %s, lfc %s" %
                  (result['Path'], lfcHosts[0]))


#########################################################################

# Process pool shared by the directory tasks.
# NOTE(review): arguments assumed to be (minSize, maxSize, queueSize) - confirm against ProcessPool
pPool = ProcessPool(30, 40, 0)

# Manager-backed queue; it is handed to every processDir task as its second argument
manager = Manager()
writerQueue = manager.Queue()
# Shared flag created with value 0 (presumably multiprocessing.Value - confirm against the imports)
stopFlag = Value('i', 0)

# Daemonizing the pool is currently disabled
# pPool.daemonize()

# Alternative host list, currently disabled; only the single host assigned below is used
# lfcHosts = ['lfc-lhcb-ro.cern.ch',
#             'lfc-lhcb-ro.cr.cnaf.infn.it',
#             'lhcb-lfc-fzk.gridka.de',
#             'lfc-lhcb-ro.in2p3.fr',
#             'lfc-lhcb.grid.sara.nl',
#             'lfclhcb.pic.es',
#             'lhcb-lfc.gridpp.rl.ac.uk']
lfcHosts = ['prod-lfc-lhcb-ro.cern.ch']
示例#34
0
class PoolComputingElement(ComputingElement):

  mandatoryParameters = MandatoryParameters

  #############################################################################
  def __init__(self, ceUniqueID):
    """ Standard constructor.

    :param ceUniqueID: identifier handed to ComputingElement.__init__
    """
    ComputingElement.__init__(self, ceUniqueID)
    self.ceType = "Pool"
    self.log = gLogger.getSubLogger('Pool')
    # no pool yet; submitJob and getCEStatus create it when they find None
    self.pPool = None
    self.useSudo = False
    self.processors = 1
    self.submittedJobs = 0
    # local task counter plus the per-task bookkeeping dictionaries
    self.taskID = 0
    self.processorsPerTask = {}
    self.userNumberPerTask = {}

  #############################################################################
  def _addCEConfigDefaults(self):
    """ Make sure the necessary configuration parameters are defined.

    Nothing is added on top of the base class: the call is forwarded to
    ComputingElement._addCEConfigDefaults so that global parameters get loaded.
    """
    ComputingElement._addCEConfigDefaults(self)

  def _reset(self):
    """ Refresh the internal variables from ``self.ceParameters``.

    NumberOfProcessors (default: the current value) becomes the processor count and is
    mirrored into MaxTotalJobs; SudoExecution (default False) sets the sudo flag.

    :return: None
    """
    parameters = self.ceParameters
    nProcessors = int(parameters.get('NumberOfProcessors', self.processors))
    self.processors = nProcessors
    parameters['MaxTotalJobs'] = nProcessors
    self.useSudo = parameters.get('SudoExecution', False)

  def getProcessorsInUse(self):
    """ Add up the processors booked by all tasks in ``self.processorsPerTask``.

    :return: number of processor cores
    """
    return sum(self.processorsPerTask.values())

  #############################################################################
  def submitJob(self, executableFile, proxy, **kwargs):
    """ Method to submit job.

    :param str executableFile: location of the executable file
    :param str proxy: payload proxy
    :param kwargs: optional ``wholeNode`` (truthy to claim every processor) or
                   ``numberOfProcessors`` (converted with int()); one processor is
                   requested when neither is given

    :return: S_OK/S_ERROR of the result of the job submission
    """

    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    # Process pending pool results before counting what is in use
    self.pPool.processResults()

    processorsInUse = self.getProcessorsInUse()
    if kwargs.get('wholeNode'):
      if processorsInUse > 0:
        return S_ERROR('Can not take WholeNode job')
      requestedProcessors = self.processors
    elif "numberOfProcessors" in kwargs:
      requestedProcessors = int(kwargs['numberOfProcessors'])
    else:
      requestedProcessors = 1

    # Single capacity check for every branch above
    availableProcessors = self.processors - processorsInUse
    if availableProcessors < requestedProcessors:
      return S_ERROR('Not enough slots: requested %d, available %d' % (requestedProcessors,
                                                                       availableProcessors))

    ret = getProxyInfo()
    if not ret['OK']:
      pilotProxy = None
    else:
      pilotProxy = ret['Value']['path']
    self.log.notice('Pilot Proxy:', pilotProxy)

    # Keyword arguments of the task itself; they replace the submission options from here on
    kwargs = {'UseSudo': False}
    if self.useSudo:
      # NOTE(review): the chosen number is not recorded in userNumberPerTask by this
      # method, and the last candidate is reused when every number is taken - confirm intent
      for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
        if nUser not in self.userNumberPerTask.values():
          break
      kwargs['NUser'] = nUser
      kwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
      kwargs['UseSudo'] = True

    result = self.pPool.createAndQueueTask(executeJob,
                                           args=(executableFile, proxy, self.taskID),
                                           kwargs=kwargs,
                                           taskID=self.taskID,
                                           usePoolCallbacks=True)
    # Book the processors only for a task that was really queued: a task that never
    # entered the pool cannot trigger the pool callback, so its booking would never
    # be released
    if result['OK']:
      self.processorsPerTask[self.taskID] = requestedProcessors
    self.taskID += 1

    self.pPool.processResults()

    return result

  def finalizeJob(self, taskID, result):
    """ Release the processors booked for a task and log how the task ended.

    :param int taskID: local PoolCE task ID
    :param dict result: result of the job execution
    :raises KeyError: when ``taskID`` has no entry in ``self.processorsPerTask``
    """
    freed = self.processorsPerTask.pop(taskID)
    if not result['OK']:
      self.log.error("Task failed submission", "%d, message: %s" % (taskID, result['Message']))
      return
    self.log.info('Task %d finished successfully, %d processor(s) freed' % (taskID, freed))

  #############################################################################
  def getCEStatus(self, jobIDList=None):
    """ Method to return information on running and pending jobs.

    The pool is created here when it does not exist yet, and pending pool results are
    processed before anything is counted.

    :param jobIDList: not used by this implementation
    :return: dictionary of numbers of jobs per status
    """

    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    self.pPool.processResults()
    result = S_OK()
    result['SubmittedJobs'] = 0
    # values() works on Python 2 and 3 alike, whereas dict.iteritems() is Python 2 only;
    # a task counts as running while it holds at least one processor
    result['RunningJobs'] = sum(1 for nProc in self.processorsPerTask.values() if nProc > 0)
    result['WaitingJobs'] = 0
    processorsInUse = self.getProcessorsInUse()
    result['UsedProcessors'] = processorsInUse
    result['AvailableProcessors'] = self.processors - processorsInUse
    return result

  #############################################################################
  def monitorProxy(self, pilotProxy, payloadProxy):
    """ Monitor the payload proxy and renew as necessary, by delegating to ``_monitorProxy``.

    :param str pilotProxy: location of the pilotProxy
    :param str payloadProxy: location of the payloadProxy
    :return: whatever ``self._monitorProxy`` returns
    """
    outcome = self._monitorProxy(pilotProxy, payloadProxy)
    return outcome
示例#35
0
class RequestExecutingAgent( AgentModule ):
  """
  .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation handlers and RequestTask

  In every cycle requests are read from the request client, kept in a local cache and
  enqueued as RequestTask into a lazily created ProcessPool. The cached copy is what is
  sent back through the request client when a task times out, raises an exception, or
  when the agent is finalized.
  """
  # # process pool
  __processPool = None
  # # request cache (class-level default only, __init__ installs a per-instance dict)
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None

  def __init__( self, *args, **kwargs ):
    """ c'tor

    Reads the ProcessPool related options, loads the operation handlers (location and
    timeouts) from the 'OperationHandlers' section of the agent configuration and
    registers the monitoring activities.

    :raises AgentConfigError: when the 'OperationHandlers' section cannot be read
    """
    # # call base class ctor
    AgentModule.__init__( self, *args, **kwargs )
    # # ProcessPool related stuff
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", self.__requestsPerCycle )
    self.log.info( "Requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", self.__minProcess )
    self.log.info( "ProcessPool min process = %d" % self.__minProcess )
    # # default taken from the class attribute, like every other option here
    self.__maxProcess = self.am_getOption( "MaxProcess", self.__maxProcess )
    self.log.info( "ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", self.__queueSize )
    self.log.info( "ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", self.__poolTimeout ) )
    self.log.info( "ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__poolSleep = int( self.am_getOption( "ProcessPoolSleep", self.__poolSleep ) )
    self.log.info( "ProcessPool sleep time = %d seconds" % self.__poolSleep )
    self.__taskTimeout = int( self.am_getOption( "ProcessTaskTimeout", self.__taskTimeout ) )
    self.log.info( "ProcessTask timeout = %d seconds" % self.__taskTimeout )

    # # keep config path and agent name
    self.agentName = self.am_getModuleParam( "fullName" )
    self.__configPath = PathFinder.getAgentSection( self.agentName )

    # # operation handlers over here
    opHandlersPath = "%s/%s" % ( self.__configPath, "OperationHandlers" )
    opHandlers = gConfig.getSections( opHandlersPath )
    if not opHandlers["OK"]:
      self.log.error( opHandlers["Message" ] )
      raise AgentConfigError( "OperationHandlers section not found in CS under %s" % self.__configPath )
    opHandlers = opHandlers["Value"]

    # # per handler timeouts: { handler: { "PerFile": ..., "PerOperation": ... } }
    self.timeOuts = dict()

    self.operationHandlers = []
    for opHandler in opHandlers:
      opHandlerPath = "%s/%s/Location" % ( opHandlersPath, opHandler )
      opLocation = gConfig.getValue( opHandlerPath, "" )
      if not opLocation:
        # # a handler without location cannot be loaded, skip it
        self.log.error( "%s not set for %s operation handler" % ( opHandlerPath, opHandler ) )
        continue
      self.timeOuts[opHandler] = { "PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout }

      # # a value of 0 (or a missing option) keeps the default timeouts
      opTimeout = gConfig.getValue( "%s/%s/TimeOut" % ( opHandlersPath, opHandler ), 0 )
      if opTimeout:
        self.timeOuts[opHandler]["PerOperation"] = opTimeout
      fileTimeout = gConfig.getValue( "%s/%s/TimeOutPerFile" % ( opHandlersPath, opHandler ), 0 )
      if fileTimeout:
        self.timeOuts[opHandler]["PerFile"] = fileTimeout

      self.operationHandlers.append( opLocation )

    self.log.info( "Operation handlers:" )
    for itemTuple in enumerate ( self.operationHandlers ):
      self.log.info( "[%s] %s" % itemTuple )

    # # handlers dict
    self.handlersDict = dict()
    # # common monitor activity
    gMonitor.registerActivity( "Iteration", "Agent Loops",
                               "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Processed", "Request Processed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    # # create request dict
    self.__requestCache = dict()

    self.FTSMode = self.am_getOption( "FTSMode", False )

  def processPool( self ):
    """ facade for ProcessPool

    The pool is created and daemonized on the first call, with the agent callbacks
    attached as pool callbacks.

    :return: the ProcessPool instance shared by all calls
    """
    if not self.__processPool:
      # # at least one process, and never a maximum below the configured minimum
      minProcess = max( 1, self.__minProcess )
      maxProcess = max( self.__minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                       maxProcess,
                                                                                       queueSize ) )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient( self ):
    """ RequestClient getter, the ReqClient is created on first use """
    if not self.__requestClient:
      self.__requestClient = ReqClient()
    return self.__requestClient

  def cleanCache( self, requestName = None ):
    """ delete request from requestCache

    :param str requestName: Request.RequestName
    :return: S_OK(), also when the request was not cached
    """
    if requestName in self.__requestCache:
      del self.__requestCache[requestName]
    return S_OK()

  def cacheRequest( self, request ):
    """ put request into requestCache

    An already cached request of the same name is kept (setdefault).

    :param Request request: Request instance
    :return: S_OK()
    """
    self.__requestCache.setdefault( request.RequestName, request )
    return S_OK()

  def resetRequest( self, requestName ):
    """ put back :requestName: to RequestClient

    :param str requestName: request's name
    :return: S_ERROR when the cached request could not be sent back, S_OK() otherwise
             (including the case of a request that is not cached)
    """
    if requestName in self.__requestCache:
      reset = self.requestClient().updateRequest( self.__requestCache[requestName] )
      if not reset["OK"]:
        return S_ERROR( "resetRequest: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
    return S_OK()

  def resetAllRequests( self ):
    """ put back all requests without callback called into requestClient

    :param self: self reference
    :return: S_OK(), failures are only logged
    """
    self.log.info( "resetAllRequests: will put %s back requests" % len( self.__requestCache ) )
    # # items() instead of iteritems(), which does not exist on Python 3
    for requestName, request in self.__requestCache.items():
      reset = self.requestClient().updateRequest( request )
      if not reset["OK"]:
        self.log.error( "resetAllRequests: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
        continue
      self.log.debug( "resetAllRequests: request %s has been put back with its initial state" % requestName )
    return S_OK()

  def initialize( self ):
    """ initialize agent

    at the moment creates handlers dictionary, keyed by the last component of the
    handler location

    :return: S_ERROR when no operation handler is available, S_OK() otherwise
    """
    for opHandler in self.operationHandlers:
      handlerName = opHandler.split( "/" )[-1]
      self.handlersDict[ handlerName ] = opHandler
      self.log.debug( "handler '%s' for operation '%s' registered" % ( opHandler, handlerName ) )
    if not self.handlersDict:
      # # the handlers are read from the 'OperationHandlers' section in __init__
      self.log.error( "operation handlers not set, check configuration section 'OperationHandlers'!" )
      return S_ERROR( "Operation handlers not set!" )
    return S_OK()

  def execute( self ):
    """ read requests from RequestClient and enqueue them into ProcessPool

    At most 'RequestsPerCycle' requests are enqueued in one call. The cycle stops early
    when the client reports an error, has no more requests, or a request cannot be
    serialized.

    :return: S_OK()
    """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )
      getRequest = self.requestClient().getRequest()
      if not getRequest["OK"]:
        self.log.error( "execute: %s" % getRequest["Message"] )
        break
      if not getRequest["Value"]:
        self.log.info( "execute: no more 'Waiting' requests to process" )
        break
      # # OK, we've got you
      request = getRequest["Value"]
      # # set task id
      taskID = request.RequestName
      # # save current request in cache
      self.cacheRequest( request )
      # # serialize to JSON
      requestJSON = request.toJSON()
      if not requestJSON["OK"]:
        # # the request stays in the cache, so finalize() puts it back
        self.log.error( "JSON serialization error: %s" % requestJSON["Message"] )
        break
      requestJSON = requestJSON["Value"]

      self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(),
                                                                    self.processPool().getNumWorkingProcesses() ) )

      # # keep trying until this request has been enqueued
      while True:
        if not self.processPool().getFreeSlots():
          self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
          time.sleep( self.__poolSleep )
        else:
          self.log.info( "spawning task for request '%s'" % ( request.RequestName ) )
          timeOut = self.getTimeout( request )
          enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                           kwargs = { "requestJSON" : requestJSON,
                                                                      "handlersDict" : self.handlersDict,
                                                                      "csPath" : self.__configPath,
                                                                      "agentName": self.agentName },
                                                           taskID = taskID,
                                                           blocking = True,
                                                           usePoolCallbacks = True,
                                                           timeOut = timeOut )
          if not enqueue["OK"]:
            self.log.error( enqueue["Message"] )
          else:
            self.log.debug( "successfully enqueued task '%s'" % taskID )
            # # update monitor
            gMonitor.addMark( "Processed", 1 )
            # # update request counter
            taskCounter += 1
            # # task created, a little time kick to proceed
            time.sleep( 0.1 )
            break

    # # clean return
    return S_OK()

  def getTimeout( self, request ):
    """ get timeout for request

    Sums, over the 'Waiting' and 'Scheduled' operations of the request, the
    per-operation timeout plus the per-file timeout times ``len( op )``. Operations of
    a type without configured timeouts only add the default operation timeout.

    :param Request request: Request instance
    :return: estimated timeout
    """
    timeout = 0
    for op in request:
      if op.Status not in ( "Waiting", "Scheduled" ):
        continue
      if op.Type not in self.timeOuts:
        timeout += self.__operationTimeout
      else:
        perOp = self.timeOuts[op.Type].get( "PerOperation", self.__operationTimeout )
        perFiles = self.timeOuts[op.Type].get( "PerFile", self.__fileTimeout ) * len( op )
        timeout += perOp + perFiles
    self.log.info( "estimated timeOut for request %s is %s" % ( request.RequestName, timeout ) )
    return timeout

  def finalize( self ):
    """ agent finalization

    Finalizes the ProcessPool if one was created, then puts back every request that
    is still in the cache.

    :return: S_OK()
    """
    if self.__processPool:
      self.processPool().finalize( timeout = self.__poolTimeout )
    self.resetAllRequests()
    return S_OK()

  def resultCallback( self, taskID, taskResult ):
    """ definition of request callback function

    A task that ended with the message "Timed out" has its request put back; in every
    case the request is removed from the cache afterwards.

    :param str taskID: Request.RequestName
    :param dict taskResult: task result S_OK/S_ERROR
    """
    self.log.info( "callback: %s result is %s(%s)" % ( taskID,
                                                      "S_OK" if taskResult["OK"] else "S_ERROR",
                                                      taskResult["Value"] if taskResult["OK"] else taskResult["Message"] ) )

    if not taskResult["OK"] and taskResult["Message"] == "Timed out":
      reset = self.resetRequest( taskID )
      if not reset["OK"]:
        # # do not lose the failure silently
        self.log.error( "callback: %s" % reset["Message"] )
    # # clean cache
    self.cleanCache( taskID )

  def exceptionCallback( self, taskID, taskException ):
    """ definition of exception callback function

    The request is put back and, once that succeeded, removed from the cache so that a
    stale copy is neither kept by cacheRequest nor re-sent by finalize.

    :param str taskID: Request.RequestName
    :param Exception taskException: Exception instance
    """
    self.log.error( "exceptionCallback: %s was hit by exception %s" % ( taskID, taskException ) )
    reset = self.resetRequest( taskID )
    if not reset["OK"]:
      # # keep the cached copy, resetAllRequests() retries at finalization
      self.log.error( "exceptionCallback: %s" % reset["Message"] )
      return
    self.cleanCache( taskID )
示例#36
0
class PoolComputingElement(ComputingElement):
  """ Computing element that runs payloads through a local ProcessPool.

  The number of processors booked by every queued task is kept in ``processorsPerTask``
  (taskID -> processors); with sudo execution the unix user number given to a task is
  kept in ``userNumberPerTask`` (taskID -> user number). Both are released in finalizeJob.
  """

  mandatoryParameters = MandatoryParameters

  #############################################################################
  def __init__(self, ceUniqueID):
    """ Standard constructor.

    :param ceUniqueID: identifier passed on to ComputingElement.__init__
    """
    ComputingElement.__init__(self, ceUniqueID)
    self.ceType = "Pool"
    self.log = gLogger.getSubLogger('Pool')
    self.submittedJobs = 0
    self.processors = 1
    # The pool is created lazily, see _getProcessPool
    self.pPool = None
    self.taskID = 0
    self.processorsPerTask = {}
    self.userNumberPerTask = {}
    self.useSudo = False

  #############################################################################
  def _addCEConfigDefaults(self):
    """Method to make sure all necessary Configuration Parameters are defined
    """
    # First assure that any global parameters are loaded
    ComputingElement._addCEConfigDefaults(self)

  def _reset(self):
    """ Refresh the processor count and the sudo flag from ``self.ceParameters``.
    """
    self.processors = int(self.ceParameters.get('NumberOfProcessors', self.processors))
    self.ceParameters['MaxTotalJobs'] = self.processors
    # NOTE(review): the value is used as-is; if ceParameters holds strings,
    # 'False' would be truthy - confirm against the configuration loader
    self.useSudo = self.ceParameters.get('SudoExecution', False)

  def _getProcessPool(self):
    """ Return the ProcessPool, creating it on first use.

    Shared by submitJob and getCEStatus so that neither depends on the other
    having been called first.
    """
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)
    return self.pPool

  def getProcessorsInUse(self):
    """ Return the total number of processors booked by the registered tasks.
    """
    return sum(self.processorsPerTask.values())

  #############################################################################
  def submitJob(self, executableFile, proxy, **kwargs):
    """ Method to submit job.

    :param executableFile: passed as first argument to executeJob
    :param proxy: passed as second argument to executeJob
    :param kwargs: 'wholeNode' (truthy) books all processors and is refused while any
                   processor is in use; 'numberOfProcessors' books that many processors;
                   without either one processor is booked
    :return: S_ERROR when the slots or the sudo users are exhausted, otherwise the
             result of ProcessPool.createAndQueueTask
    """
    pool = self._getProcessPool()
    pool.processResults()

    processorsInUse = self.getProcessorsInUse()
    availableProcessors = self.processors - processorsInUse
    if kwargs.get('wholeNode'):
      if processorsInUse > 0:
        return S_ERROR('Can not take WholeNode job')
      requestedProcessors = self.processors
    elif "numberOfProcessors" in kwargs:
      requestedProcessors = int(kwargs['numberOfProcessors'])
    else:
      requestedProcessors = 1
    # Single capacity check for all three cases
    if availableProcessors < requestedProcessors:
      return S_ERROR('Not enough slots: requested %d, available %d' % (requestedProcessors,
                                                                       availableProcessors))

    ret = getProxyInfo()
    pilotProxy = ret['Value']['path'] if ret['OK'] else None
    self.log.notice('Pilot Proxy:', pilotProxy)

    taskKwargs = {'UseSudo': False}
    nUser = None
    if self.useSudo:
      # Pick the lowest user number not held by a task that is still registered
      usersInUse = set(self.userNumberPerTask.values())
      for candidate in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
        if candidate not in usersInUse:
          nUser = candidate
          break
      if nUser is None:
        # Handing out a busy account would let two payloads share one unix user
        return S_ERROR('No free sudo unix user available')
      taskKwargs['NUser'] = nUser
      taskKwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
      taskKwargs['UseSudo'] = True

    taskID = self.taskID
    self.taskID += 1
    result = pool.createAndQueueTask(executeJob,
                                     args=(executableFile, proxy, taskID),
                                     kwargs=taskKwargs,
                                     taskID=taskID,
                                     usePoolCallbacks=True)
    # Book the resources only for a task that was really queued: a task that is
    # not queued never reaches finalizeJob, so its booking would never be released
    if result['OK']:
      self.processorsPerTask[taskID] = requestedProcessors
      if nUser is not None:
        self.userNumberPerTask[taskID] = nUser

    pool.processResults()

    return result

  def finalizeJob(self, taskID, result):
    """ Finalize the job by releasing its processors and its sudo user number.

    :param int taskID: local task ID, must be a key of ``self.processorsPerTask``
    :param dict result: result of the job execution
    """
    nProc = self.processorsPerTask.pop(taskID)
    # Tasks run without sudo have no entry here
    self.userNumberPerTask.pop(taskID, None)
    if result['OK']:
      self.log.info('Task %d finished successfully, %d processor(s) freed' % (taskID, nProc))
    else:
      self.log.error("Task failed submission", "%d, message: %s" % (taskID, result['Message']))

  #############################################################################
  def getCEStatus(self, jobIDList=None):
    """ Method to return information on running and pending jobs.

    :param jobIDList: not used by this implementation
    :return: S_OK structure extended with the keys 'SubmittedJobs', 'RunningJobs',
             'WaitingJobs', 'UsedProcessors' and 'AvailableProcessors'
    """
    # The pool may not exist yet when the status is requested before any submission
    self._getProcessPool().processResults()
    result = S_OK()
    result['SubmittedJobs'] = 0
    # A task counts as running while it holds at least one processor
    result['RunningJobs'] = sum(1 for nProc in self.processorsPerTask.values() if nProc > 0)
    result['WaitingJobs'] = 0
    processorsInUse = self.getProcessorsInUse()
    result['UsedProcessors'] = processorsInUse
    result['AvailableProcessors'] = self.processors - processorsInUse
    return result

  #############################################################################
  def monitorProxy(self, pilotProxy, payloadProxy):
    """ Monitor the payload proxy and renew as necessary.

    :param pilotProxy: location of the pilot proxy, passed on to _monitorProxy
    :param payloadProxy: location of the payload proxy, passed on to _monitorProxy
    :return: result of ``self._monitorProxy``
    """
    return self._monitorProxy(pilotProxy, payloadProxy)
示例#37
0
class ProcessPoolCallbacksTests( unittest.TestCase ):
  """
  .. class:: ProcessPoolCallbacksTests
  test case for ProcessPool using the callbacks attached to the pool itself
  """

  def setUp( self ):
    """ parse the command line, set up logging and create a daemonized ProcessPool

    :param self: self reference
    """
    from DIRAC.Core.Base import Script
    Script.parseCommandLine()
    from DIRAC.FrameworkSystem.Client.Logger import gLogger
    gLogger.showHeaders( True )
    self.log = gLogger.getSubLogger( self.__class__.__name__ )
    self.processPool = ProcessPool( 4, 8, 8,
                                    poolCallback = self.poolCallback,
                                    poolExceptionCallback = self.poolExceptionCallback )
    self.processPool.daemonize()

  def poolCallback( self, taskID, taskResult ):
    """ pool level result callback, only logs what it got """
    message = "callback for %s result is %s" % ( taskID, taskResult )
    self.log.always( message )

  def poolExceptionCallback( self, taskID, taskException ):
    """ pool level exception callback, only logs what it got """
    message = "callback for %s exception is %s" % ( taskID, taskException )
    self.log.always( message )

  def _enqueueTenTasks( self, taskCallable, enqueuedMessage ):
    """ enqueue ten tasks running :taskCallable: and finalize the pool

    A task is only created while the pool reports free slots; a failed enqueue is
    retried with freshly drawn arguments. A task drawing a zero wait time is asked
    to raise an exception.

    :param taskCallable: callable handed over to createAndQueueTask
    :param str enqueuedMessage: log template taking the task id
    """
    enqueued = 0
    while enqueued < 10:
      if self.processPool.getFreeSlots() > 0:
        timeWait = random.randint( 0, 5 )
        raiseException = not timeWait
        result = self.processPool.createAndQueueTask( taskCallable,
                                                      taskID = enqueued,
                                                      args = ( enqueued, timeWait, raiseException ),
                                                      usePoolCallbacks = True,
                                                      blocking = True )
        if result["OK"]:
          self.log.always( enqueuedMessage % enqueued )
          enqueued += 1
    self.processPool.finalize( 2 )

  def testCallableClass( self ):
    """ CallableClass and pool callbacks test """
    self._enqueueTenTasks( CallableClass, "CallableClass enqueued to task %s" )

  def testCallableFunc( self ):
    """ CallableFunc and pool callbacks test """
    self._enqueueTenTasks( CallableFunc, "CallableFunc enqueued to task %s" )
示例#38
0
class RequestAgentBase( AgentModule ):
  """
  .. class:: RequestAgentBase

  Helper class for DIRAC agents dealing with RequestContainers and Requests.

  It owns a lazily created ProcessPool and RequestClient, and a request holder that
  keeps the serialised requests so that they can be put back through the
  RequestClient (resetRequest, resetAllRequests).
  """

  ## placeholder for thread pool
  __processPool = None
  ## requests/cycle
  __requestsPerCycle = 50
  ## minimal nb of subprocess running
  __minProcess = 2
  ## maximal nb of subprocess executed same time
  __maxProcess = 4
  ## ProcessPool queue size
  __queueSize = 10
  ## ProcessTask default timeout in seconds
  __taskTimeout = 300
  ## ProcessPool finalisation timeout
  __poolTimeout = 300
  ## placeholder for RequestClient instance
  __requestClient = None
  ## request type
  __requestType = ""
  ## placeholder for request task class definition
  __requestTask = None
  ## placeholder for request callback function
  __requestCallback = None
  ## placeholder for exception callback function
  __exceptionCallback = None
  ## config path in CS
  __configPath = None
  ## read request holder (class-level default only, __init__ installs a per-instance dict)
  __requestHolder = dict()

  def __init__( self, *args, **kwargs ):
    """ c'tor

    :param self: self reference
    :param str agentName: name of agent, read from the first positional argument
    :param str loadName: name of module
    :param bool baseAgentName: whatever
    :param dict properties: whatever else
    """

    AgentModule.__init__( self, *args, **kwargs )

    agentName = args[0]

    ## save config path
    self.__configPath = PathFinder.getAgentSection( agentName )
    self.log.info( "Will use %s config path" % self.__configPath )

    ## ProcessPool related stuff
    ## NOTE(review): the option defaults below differ from the class-level defaults
    ## (e.g. RequestsPerCycle 10 vs 50, MinProcess 1 vs 2) - kept as they are, confirm which is intended
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", 10 )
    self.log.info( "requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", 1 )
    self.log.info( "ProcessPool min process = %d" % self.__minProcess )
    self.__maxProcess = self.am_getOption( "MaxProcess", 4 )
    self.log.info( "ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", 10 )
    self.log.info( "ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", 300 ) )
    self.log.info( "ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__taskTimeout = int( self.am_getOption( "ProcessTaskTimeout", 300 ) )
    self.log.info( "ProcessTask timeout = %d seconds" % self.__taskTimeout )
    ## request type
    self.__requestType = self.am_getOption( "RequestType", self.__requestType )
    self.log.info( "Will process '%s' request type." % str( self.__requestType ) )
    ## shifter proxy
    self.am_setOption( "shifterProxy", "DataManager" )
    self.log.info( "Will use DataManager proxy by default." )

    ## common monitor activity
    self.monitor.registerActivity( "Iteration", "Agent Loops",
                                   self.__class__.__name__, "Loops/min", gMonitor.OP_SUM )
    self.monitor.registerActivity( "Execute", "Request Processed",
                                   self.__class__.__name__, "Requests/min", gMonitor.OP_SUM )
    self.monitor.registerActivity( "Done", "Request Completed",
                                   self.__class__.__name__, "Requests/min", gMonitor.OP_SUM )

    ## create request dict
    self.__requestHolder = dict()

  def poolTimeout( self ):
    """ poolTimeout getter

    :param self: self reference
    """
    return self.__poolTimeout

  def setPoolTimeout( self, timeout=300 ):
    """ poolTimeout setter

    :param self: self reference
    :param int timeout: PP finalisation timeout in seconds
    """
    self.__poolTimeout = int( timeout )

  def taskTimeout( self ):
    """ taskTimeout getter

    :param self: self reference
    """
    return self.__taskTimeout

  def setTaskTimeout( self, timeout=300 ):
    """ taskTimeout setter

    :param self: self reference
    :param int timeout: task timeout in seconds
    """
    self.__taskTimeout = int( timeout )

  def requestHolder( self ):
    """ get request holder dict

    :param self: self reference
    """
    return self.__requestHolder

  def deleteRequest( self, requestName ):
    """ delete request from requestHolder

    :param self: self reference
    :param str requestName: request name
    :return: S_OK() when the request was removed, S_ERROR when it was not in requestHolder
    """
    if requestName in self.__requestHolder:
      del self.__requestHolder[requestName]
      return S_OK()
    return S_ERROR( "%s not found in requestHolder" % requestName )

  def saveRequest( self, requestName, requestString ):
    """ put request into requestHolder

    :param self: self reference
    :param str requestName: request name
    :param str requestString: XML-serialised request
    :return: S_OK() when saved, S_ERROR when a request of that name is already held
    """
    if requestName not in self.__requestHolder:
      self.__requestHolder[requestName] = requestString
      return S_OK()
    ## the request name used to be missing from this message (unformatted %s)
    return S_ERROR( "saveRequest: request %s cannot be saved, it's already in requestHolder" % requestName )

  def resetRequest( self, requestName ):
    """ put back :requestName: to RequestClient

    :param self: self reference
    :param str requestName: request's name
    """
    if requestName not in self.__requestHolder:
      self.log.error( "resetRequest: unable to reset request %s: request not found in requestHolder" % requestName )
      return
    requestString = self.__requestHolder[requestName]
    reset = self.requestClient().updateRequest( requestName, requestString )
    if not reset["OK"]:
      self.log.error( "resetRequest: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
      ## do not report a success after a failed update
      return
    self.log.debug( "resetRequest: request %s has been put back with its initial state" % requestName )

  def resetAllRequests( self ):
    """ put back all requests without callback called into requestClient

    :param self: self reference
    """
    self.log.info( "resetAllRequests: will put %s back requests" % len( self.__requestHolder ) )
    for requestName, requestString in self.__requestHolder.items():
      reset = self.requestClient().updateRequest( requestName, requestString )
      if not reset["OK"]:
        self.log.error( "resetAllRequests: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
        continue
      self.log.debug( "resetAllRequests: request %s has been put back with its initial state" % requestName )

  def configPath( self ):
    """ config path getter

    :param self: self reference
    """
    return self.__configPath

  def requestsPerCycle( self ):
    """ get number of request to be processed in one cycle

    :param self: self reference
    """
    return self.__requestsPerCycle

  def requestClient( self ):
    """ RequestClient getter, the client is created on first use

    :param self: self reference
    """
    if not self.__requestClient:
      self.__requestClient = RequestClient()
    return self.__requestClient

  def processPool( self ):
    """ 'Live long and prosper, my dear ProcessPool'
                                        - Mr. Spock
    :param self: self reference
    :return: brand new shiny ProcessPool instance on first call, the same instance
             on subsequent calls
    """
    if not self.__processPool:
      ## at least one process, and never a maximum below the configured minimum
      minProcess = max( 1, self.__minProcess )
      maxProcess = max( self.__minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                       maxProcess,
                                                                                       queueSize ) )
      self.log.info( "ProcessPool: tasks will use callbacks attached to ProcessPool" )
      ## NOTE(review): exceptionCallback is not defined in this part of the class,
      ## presumably it is provided further down or by the subclass - confirm
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
      self.log.info( "ProcessPool: daemonized and ready" )
    return self.__processPool

  def hasProcessPool( self ):
    """ check if ProcessPool exist to speed up finalization

    :param self: self reference
    """
    return bool( self.__processPool )

  def resultCallback( self, taskID, taskResult ):
    """ definition of request callback function

    A failed task is logged, its request is put back when the failure message is
    "Timed out", and the request is dropped from requestHolder. For a successful task
    the request is dropped and the marks found under the "monitor" key of the task
    value are sent to gMonitor.

    :param self: self reference
    :param taskID: task ID, used as the request name
    :param dict taskResult: task result S_OK/S_ERROR
    """
    self.log.info( "%s result callback" % taskID )

    if not taskResult["OK"]:
      self.log.error( "%s result callback: %s" % ( taskID, taskResult["Message"] ) )
      if taskResult["Message"] == "Timed out":
        self.resetRequest( taskID )
      self.deleteRequest( taskID )
      return

    self.deleteRequest( taskID )
    taskResult = taskResult["Value"]
    ## add monitoring info
    monitor = taskResult["monitor"] if "monitor" in taskResult else {}
    for mark, value in monitor.items():
      try:
        gMonitor.addMark( mark, value )
      except Exception as error:
        ## a failing mark must not break the callback, log it and go on
        self.log.exception( str( error ) )
示例#39
0
class RequestExecutingAgent( AgentModule ):
  """
  .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation handlers and RequestTask

  Every cycle reads requests from the ReqClient, keeps them in a local cache and
  enqueues one RequestTask per request into the ProcessPool. The pool callbacks remove
  the request from the cache, after putting it back through the ReqClient when the
  task timed out or was hit by an exception. Requests still cached at finalization
  are put back as well.
  """
  # # process pool
  __processPool = None
  # # request cache
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None
  # # FTS scheduling flag
  __FTSMode = False

  def __init__( self, *args, **kwargs ):
    """ c'tor

    Reads the agent options, the operation handlers with their timeouts from the
    "OperationHandlers" section and registers the monitoring activities.

    :raises AgentConfigError: when the OperationHandlers section cannot be read
    """
    # # call base class ctor
    AgentModule.__init__( self, *args, **kwargs )
    # # ProcessPool related stuff
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", self.__requestsPerCycle )
    self.log.info( "Requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", self.__minProcess )
    self.log.info( "ProcessPool min process = %d" % self.__minProcess )
    # # default taken from the class attribute, like every other option here
    self.__maxProcess = self.am_getOption( "MaxProcess", self.__maxProcess )
    self.log.info( "ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", self.__queueSize )
    self.log.info( "ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", self.__poolTimeout ) )
    self.log.info( "ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__poolSleep = int( self.am_getOption( "ProcessPoolSleep", self.__poolSleep ) )
    self.log.info( "ProcessPool sleep time = %d seconds" % self.__poolSleep )
    self.__taskTimeout = int( self.am_getOption( "ProcessTaskTimeout", self.__taskTimeout ) )
    self.log.info( "ProcessTask timeout = %d seconds" % self.__taskTimeout )

    # # keep config path and agent name
    self.agentName = self.am_getModuleParam( "fullName" )
    self.__configPath = PathFinder.getAgentSection( self.agentName )

    # # operation handlers over here
    opHandlersPath = "%s/%s" % ( self.__configPath, "OperationHandlers" )
    opHandlers = gConfig.getSections( opHandlersPath )
    if not opHandlers["OK"]:
      self.log.error( opHandlers["Message" ] )
      raise AgentConfigError( "OperationHandlers section not found in CS under %s" % self.__configPath )
    opHandlers = opHandlers["Value"]

    # # per handler timeouts: { handler : { "PerFile" : seconds, "PerOperation" : seconds } }
    self.timeOuts = dict()

    # # locations of all properly configured handlers
    self.operationHandlers = []
    for opHandler in opHandlers:
      opHandlerPath = "%s/%s/Location" % ( opHandlersPath, opHandler )
      opLocation = gConfig.getValue( opHandlerPath, "" )
      if not opLocation:
        # # a handler without Location cannot be used, skip it
        self.log.error( "%s not set for %s operation handler" % ( opHandlerPath, opHandler ) )
        continue
      self.timeOuts[opHandler] = { "PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout }

      # # a zero (or missing) value keeps the default timeout
      opTimeout = gConfig.getValue( "%s/%s/TimeOut" % ( opHandlersPath, opHandler ), 0 )
      if opTimeout:
        self.timeOuts[opHandler]["PerOperation"] = opTimeout
      fileTimeout = gConfig.getValue( "%s/%s/TimeOutPerFile" % ( opHandlersPath, opHandler ), 0 )
      if fileTimeout:
        self.timeOuts[opHandler]["PerFile"] = fileTimeout

      self.operationHandlers.append( opLocation )

    self.log.info( "Operation handlers:" )
    for itemTuple in enumerate ( self.operationHandlers ):
      self.log.info( "[%s] %s" % itemTuple )

    # # handlers dict
    self.handlersDict = dict()
    # # common monitor activity
    gMonitor.registerActivity( "Iteration", "Agent Loops",
                               "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Processed", "Request Processed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    # # create request dict
    self.__requestCache = dict()

  def processPool( self ):
    """ facade for ProcessPool

    The pool is created and daemonized on the first call, the same instance is
    returned on subsequent calls.

    :return: ProcessPool instance
    """
    if not self.__processPool:
      # # sanitize the configured sizes: at least one process, max >= min, positive queue size
      minProcess = max( 1, self.__minProcess )
      maxProcess = max( self.__minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                       maxProcess,
                                                                                       queueSize ) )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient( self ):
    """ RequestClient getter, the ReqClient is created on first use

    :return: ReqClient instance
    """
    if not self.__requestClient:
      self.__requestClient = ReqClient()
    return self.__requestClient

  def cleanCache( self, requestName = None ):
    """ delete request from requestCache

    :param str requestName: Request.RequestName
    :return: S_OK(), also when the request was not cached
    """
    if requestName in self.__requestCache:
      del self.__requestCache[requestName]
    return S_OK()

  def cacheRequest( self, request ):
    """ put request into requestCache

    An already cached request of the same name is kept untouched.

    :param Request request: Request instance
    :return: S_OK()
    """
    self.__requestCache.setdefault( request.RequestName, request )
    return S_OK()

  def resetRequest( self, requestName ):
    """ put back :requestName: to RequestClient

    The cached copy of the request is sent through ReqClient.updateRequest. The cache
    itself is not modified.

    :param str requestName: request's name
    :return: S_OK() on success or when the request is not cached, S_ERROR otherwise
    """
    if requestName in self.__requestCache:
      reset = self.requestClient().updateRequest( self.__requestCache[requestName] )
      if not reset["OK"]:
        return S_ERROR( "resetRequest: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
    return S_OK()

  def resetAllRequests( self ):
    """ put back all requests without callback called into requestClient

    :param self: self reference
    :return: S_OK(), failures are only logged
    """
    self.log.info( "resetAllRequests: will put %s back requests" % len( self.__requestCache ) )
    for requestName, request in self.__requestCache.iteritems():
      reset = self.requestClient().updateRequest( request )
      if not reset["OK"]:
        self.log.error( "resetAllRequests: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
        continue
      self.log.debug( "resetAllRequests: request %s has been put back with its initial state" % requestName )
    return S_OK()

  def initialize( self ):
    """ initialize agent

    at the moment creates handlers dictionary, keyed by the last element of the
    handler location

    :return: S_OK() or S_ERROR when no operation handler is available
    """
    for opHandler in self.operationHandlers:
      handlerName = opHandler.split( "/" )[-1]
      self.handlersDict[ handlerName ] = opHandler
      self.log.debug( "handler '%s' for operation '%s' registered" % ( opHandler, handlerName ) )
    if not self.handlersDict:
      # # handlers are read from the OperationHandlers section in the c'tor
      self.log.error( "operation handlers not set, check configuration section 'OperationHandlers'!" )
      return S_ERROR( "Operation handlers not set!" )
    return S_OK()

  def execute( self ):
    """ read requests from RequestClient and enqueue them into ProcessPool

    Stops when RequestsPerCycle tasks were enqueued, when no request is returned,
    or on the first ReqClient/serialization error.

    :return: S_OK()
    """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )
      getRequest = self.requestClient().getRequest()
      if not getRequest["OK"]:
        self.log.error( "execute: %s" % getRequest["Message"] )
        break
      if not getRequest["Value"]:
        self.log.info( "execute: not more 'Waiting' requests to process" )
        break
      # # OK, we've got you
      request = getRequest["Value"]
      # # set task id
      taskID = request.RequestName
      # # save current request in cache
      self.cacheRequest( request )
      # # serialize to JSON
      requestJSON = request.toJSON()
      if not requestJSON["OK"]:
        # # the request stays in the cache, so it is put back by resetAllRequests at finalization
        self.log.error( "JSON serialization error: %s" % requestJSON["Message"] )
        break
      requestJSON = requestJSON["Value"]

      self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(),
                                                                    self.processPool().getNumWorkingProcesses() ) )

      # # keep trying until this request is enqueued
      while True:
        if not self.processPool().getFreeSlots():
          self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
          time.sleep( self.__poolSleep )
        else:
          self.log.info( "spawning task for request '%s'" % ( request.RequestName ) )
          timeOut = self.getTimeout( request )
          enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                           kwargs = { "requestJSON" : requestJSON,
                                                                      "handlersDict" : self.handlersDict,
                                                                      "csPath" : self.__configPath,
                                                                      "agentName": self.agentName },
                                                           taskID = taskID,
                                                           blocking = True,
                                                           usePoolCallbacks = True,
                                                           timeOut = timeOut )
          if not enqueue["OK"]:
            self.log.error( enqueue["Message"] )
          else:
            self.log.debug( "successfully enqueued task '%s'" % taskID )
            # # update monitor
            gMonitor.addMark( "Processed", 1 )
            # # update request counter
            taskCounter += 1
            # # task created, a little time kick to proceed
            time.sleep( 0.1 )
            break

    # # clean return
    return S_OK()

  def getTimeout( self, request ):
    """ get timeout for request

    Sums, over the operations in 'Waiting' or 'Scheduled' status, the per operation
    timeout plus the per file timeout multiplied by len( operation ); operation types
    without configured timeouts count for the default operation timeout only.

    :param Request request: Request instance
    :return: estimated timeout, presumably in seconds like the configured timeouts
    """
    timeout = 0
    for op in request:
      if op.Status not in ( "Waiting", "Scheduled" ):
        continue
      if op.Type not in self.timeOuts:
        timeout += self.__operationTimeout
      else:
        perOp = self.timeOuts[op.Type].get( "PerOperation", self.__operationTimeout )
        perFiles = self.timeOuts[op.Type].get( "PerFile", self.__fileTimeout ) * len( op )
        timeout += perOp + perFiles
    self.log.info( "estimated timeOut for request %s is %s" % ( request.RequestName, timeout ) )
    return timeout

  def finalize( self ):
    """ agent finalization

    Finalizes the ProcessPool (only if it was ever created) and puts back all the
    requests still present in the cache.

    :return: S_OK()
    """
    if self.__processPool:
      self.processPool().finalize( timeout = self.__poolTimeout )
    self.resetAllRequests()
    return S_OK()

  def resultCallback( self, taskID, taskResult ):
    """ definition of request callback function

    A timed out request is put back before being removed from the cache.

    :param str taskID: Request.RequestName
    :param dict taskResult: task result S_OK/S_ERROR
    """
    self.log.info( "callback: %s result is %s(%s)" % ( taskID,
                                                      "S_OK" if taskResult["OK"] else "S_ERROR",
                                                      taskResult["Value"] if taskResult["OK"] else taskResult["Message"] ) )

    if not taskResult["OK"] and taskResult["Message"] == "Timed out":
      reset = self.resetRequest( taskID )
      if not reset["OK"]:
        # # do not lose the failure silently
        self.log.error( "callback: %s" % reset["Message"] )
    # # clean cache
    self.cleanCache( taskID )

  def exceptionCallback( self, taskID, taskException ):
    """ definition of exception callback function

    The request is put back and, once that succeeded, removed from the cache so that
    it is not put back a second time at finalization.

    :param str taskID: Request.RequestName
    :param Exception taskException: Exception instance
    """
    self.log.error( "exceptionCallback: %s was hit by exception %s" % ( taskID, taskException ) )
    reset = self.resetRequest( taskID )
    if not reset["OK"]:
      # # keep the request cached, resetAllRequests will retry at finalization
      self.log.error( "exceptionCallback: %s" % reset["Message"] )
      return
    self.cleanCache( taskID )
示例#40
0
class RequestExecutingAgent( AgentModule ):
  """
  .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation handlers and RequestTask

  Every cycle reads requests from the ReqClient (one by one or in bulk), keeps them
  in a local cache and enqueues one RequestTask per request into the ProcessPool.
  The pool callbacks take the request out of the cache and put it back through the
  ReqClient; requests still cached at finalization are put back as well.
  """
  # # process pool
  __processPool = None
  # # request cache
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None
  # # Size of the bulk if use of getRequests. If 0, use getRequest
  __bulkRequest = 0

  def __init__( self, *args, **kwargs ):
    """ c'tor

    Reads the agent options, the operation handlers with their timeouts from the
    "OperationHandlers" section and registers the monitoring activities.

    :raises AgentConfigError: when the OperationHandlers section cannot be read
    """
    # # call base class ctor
    AgentModule.__init__( self, *args, **kwargs )
    # # ProcessPool related stuff
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", self.__requestsPerCycle )
    self.log.info( "Requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", self.__minProcess )
    self.log.info( "ProcessPool min process = %d" % self.__minProcess )
    # # defaults taken from the class attributes, like every other option here
    self.__maxProcess = self.am_getOption( "MaxProcess", self.__maxProcess )
    self.log.info( "ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", self.__queueSize )
    self.log.info( "ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", self.__poolTimeout ) )
    self.log.info( "ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__poolSleep = int( self.am_getOption( "ProcessPoolSleep", self.__poolSleep ) )
    self.log.info( "ProcessPool sleep time = %d seconds" % self.__poolSleep )
    self.__taskTimeout = int( self.am_getOption( "ProcessTaskTimeout", self.__taskTimeout ) )
    self.log.info( "ProcessTask timeout = %d seconds" % self.__taskTimeout )
    self.__bulkRequest = self.am_getOption( "BulkRequest", self.__bulkRequest )
    self.log.info( "Bulk request size = %d" % self.__bulkRequest )

    # # keep config path and agent name
    self.agentName = self.am_getModuleParam( "fullName" )
    self.__configPath = PathFinder.getAgentSection( self.agentName )

    # # operation handlers over here
    opHandlersPath = "%s/%s" % ( self.__configPath, "OperationHandlers" )
    opHandlers = gConfig.getSections( opHandlersPath )
    if not opHandlers["OK"]:
      self.log.error( opHandlers["Message" ] )
      raise AgentConfigError( "OperationHandlers section not found in CS under %s" % self.__configPath )
    opHandlers = opHandlers["Value"]

    # # per handler timeouts: { handler : { "PerFile" : seconds, "PerOperation" : seconds } }
    self.timeOuts = dict()

    # # handlers dict: { handler : location }
    self.handlersDict = dict()
    for opHandler in opHandlers:
      opHandlerPath = "%s/%s/Location" % ( opHandlersPath, opHandler )
      opLocation = gConfig.getValue( opHandlerPath, "" )
      if not opLocation:
        # # a handler without Location cannot be used, skip it
        self.log.error( "%s not set for %s operation handler" % ( opHandlerPath, opHandler ) )
        continue
      self.timeOuts[opHandler] = { "PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout }

      # # a zero (or missing) value keeps the default timeout
      opTimeout = gConfig.getValue( "%s/%s/TimeOut" % ( opHandlersPath, opHandler ), 0 )
      if opTimeout:
        self.timeOuts[opHandler]["PerOperation"] = opTimeout
      fileTimeout = gConfig.getValue( "%s/%s/TimeOutPerFile" % ( opHandlersPath, opHandler ), 0 )
      if fileTimeout:
        self.timeOuts[opHandler]["PerFile"] = fileTimeout

      self.handlersDict[opHandler] = opLocation

    self.log.info( "Operation handlers:" )
    for index, ( opHandler, opLocation ) in enumerate( self.handlersDict.items() ):
      self.log.info( "[%s] %s: %s (timeout: %d s + %d s per file)" % ( index, opHandler, opLocation,
                                                                   self.timeOuts[opHandler]['PerOperation'],
                                                                   self.timeOuts[opHandler]['PerFile'] ) )

    # # common monitor activity
    gMonitor.registerActivity( "Iteration", "Agent Loops",
                               "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Processed", "Request Processed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    # # create request dict
    self.__requestCache = dict()

    self.FTSMode = self.am_getOption( "FTSMode", False )



  def processPool( self ):
    """ facade for ProcessPool

    The pool is created and daemonized on the first call, the same instance is
    returned on subsequent calls.

    :return: ProcessPool instance
    """
    if not self.__processPool:
      # # sanitize the configured sizes: at least one process, max >= min, positive queue size
      minProcess = max( 1, self.__minProcess )
      maxProcess = max( self.__minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                       maxProcess,
                                                                                       queueSize ) )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient( self ):
    """ RequestClient getter, the ReqClient is created on first use

    :return: ReqClient instance
    """
    if not self.__requestClient:
      self.__requestClient = ReqClient()
    return self.__requestClient

  def cacheRequest( self, request ):
    """ put request into requestCache

    When a request of the same name is already cached, waits one second at a time
    for it to disappear; if it is still there after the last attempt, the new request
    is put back through the ReqClient and an error is returned so that the caller
    can skip it.

    :param Request request: Request instance
    :return: S_OK() when cached, S_ERROR for a duplicate request
    """
    count = 5
    # Wait a bit as there may be a race condition between RequestTask putting back the request and the callback clearing the cache
    while request.RequestName in self.__requestCache:
      count -= 1
      if not count:
        self.requestClient().putRequest( request )
        return S_ERROR( "Duplicate request, ignore: %s" % request.RequestName )
      time.sleep( 1 )
    self.__requestCache[ request.RequestName ] = request
    return S_OK()

  def putRequest( self, requestName, taskResult = None ):
    """ put back :requestName: to RequestClient

    The request is removed from the cache; the request returned by the task is put
    back when :taskResult: is a successful result, the cached copy otherwise.

    :param str requestName: request's name
    :param dict taskResult: optional task result S_OK(Request)/S_ERROR(Message)
    :return: S_OK() or S_ERROR when the request is not cached or cannot be put back
    """
    if requestName in self.__requestCache:
      request = self.__requestCache.pop( requestName )
      if taskResult and taskResult['OK']:
        request = taskResult['Value']

      reset = self.requestClient().putRequest( request )
      if not reset["OK"]:
        return S_ERROR( "putRequest: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
    else:
      return S_ERROR( 'Not in cache' )
    return S_OK()

  def putAllRequests( self ):
    """ put back all requests without callback called into requestClient

    :param self: self reference
    :return: S_OK(), failures are only logged
    """
    self.log.info( "putAllRequests: will put %s back requests" % len( self.__requestCache ) )
    # # iterate over a snapshot: putRequest removes entries from the cache
    for requestName in list( self.__requestCache.keys() ):
      reset = self.putRequest( requestName )
      if not reset["OK"]:
        self.log.error( reset["Message"] )
      else:
        self.log.debug( "putAllRequests: request %s has been put back with its initial state" % requestName )
    return S_OK()

  def initialize( self ):
    """ initialize agent, nothing to be done here

    :return: S_OK()
    """
    return S_OK()

  def execute( self ):
    """ read requests from RequestClient and enqueue them into ProcessPool

    Requests are read one by one, or in bulks of BulkRequest size when that option
    is set. The cycle stops when RequestsPerCycle tasks were enqueued, when no request
    is returned or on a ReqClient error.

    :return: S_OK()
    """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )

      requestsToExecute = []

      if not self.__bulkRequest:
        self.log.info( "execute: ask for a single request" )
        getRequest = self.requestClient().getRequest()
        if not getRequest["OK"]:
          self.log.error( "execute: %s" % getRequest["Message"] )
          break
        if not getRequest["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        requestsToExecute = [getRequest["Value"] ]
      else:
        # # never ask for more than what is left in this cycle
        numberOfRequest = min( self.__bulkRequest, self.__requestsPerCycle - taskCounter )
        self.log.info( "execute: ask for %s requests" % numberOfRequest )
        getRequests = self.requestClient().getBulkRequests( numberOfRequest )
        if not getRequests["OK"]:
          self.log.error( "execute: %s" % getRequests["Message"] )
          break
        if not getRequests["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        for rId in getRequests["Value"]["Failed"]:
          self.log.error( "execute: %s" % getRequests["Value"]["Failed"][rId] )

        requestsToExecute = getRequests["Value"]["Successful"].values()

      self.log.info( "execute: will execute %s requests " % len( requestsToExecute ) )

      for request in requestsToExecute:
        # # set task id
        taskID = request.RequestName
        # # save current request in cache
        cached = self.cacheRequest( request )
        if not cached["OK"]:
          # # duplicate: cacheRequest has already put it back, do not execute it twice
          self.log.error( "execute: %s" % cached["Message"] )
          continue
        # # serialize to JSON
        requestJSON = request.toJSON()
        if not requestJSON["OK"]:
          # # the request stays in the cache, so it is put back by putAllRequests at finalization
          self.log.error( "JSON serialization error: %s" % requestJSON["Message"] )
          break
        requestJSON = requestJSON["Value"]

        self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(),
                                                                      self.processPool().getNumWorkingProcesses() ) )

        # # number of sleeps spent waiting for a free slot
        looping = 0
        # # keep trying until this request is enqueued
        while True:
          if not self.processPool().getFreeSlots():
            if not looping:
              self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
            time.sleep( self.__poolSleep )
            looping += 1
          else:
            if looping:
              # # parentheses are required: % and * share the same precedence
              self.log.info( "Free slot found after %d seconds" % ( looping * self.__poolSleep ) )
            looping = 0
            self.log.info( "spawning task for request '%s'" % ( request.RequestName ) )
            timeOut = self.getTimeout( request )
            enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                             kwargs = { "requestJSON" : requestJSON,
                                                                        "handlersDict" : self.handlersDict,
                                                                        "csPath" : self.__configPath,
                                                                        "agentName": self.agentName },
                                                             taskID = taskID,
                                                             blocking = True,
                                                             usePoolCallbacks = True,
                                                             timeOut = timeOut )
            if not enqueue["OK"]:
              self.log.error( enqueue["Message"] )
            else:
              self.log.debug( "successfully enqueued task '%s'" % taskID )
              # # update monitor
              gMonitor.addMark( "Processed", 1 )
              # # update request counter
              taskCounter += 1
              # # task created, a little time kick to proceed
              time.sleep( 0.1 )
              break

    # # clean return
    return S_OK()

  def getTimeout( self, request ):
    """ get timeout for request

    Sums, over the operations in 'Waiting', 'Scheduled' or 'Queued' status, the per
    operation timeout plus the per file timeout multiplied by len( operation );
    operation types without configured timeouts count for the default operation
    timeout only.

    :param Request request: Request instance
    :return: estimated timeout, presumably in seconds like the configured timeouts
    """
    timeout = 0
    for op in request:
      if op.Status not in ( "Waiting", "Scheduled", 'Queued' ):
        continue
      if op.Type not in self.timeOuts:
        timeout += self.__operationTimeout
      else:
        perOp = self.timeOuts[op.Type].get( "PerOperation", self.__operationTimeout )
        perFiles = self.timeOuts[op.Type].get( "PerFile", self.__fileTimeout ) * len( op )
        timeout += perOp + perFiles
    self.log.info( "estimated timeOut for request %s is %s" % ( request.RequestName, timeout ) )
    return timeout

  def finalize( self ):
    """ agent finalization

    Finalizes the ProcessPool (only if it was ever created) and puts back all the
    requests still present in the cache.

    :return: S_OK()
    """
    if self.__processPool:
      self.processPool().finalize( timeout = self.__poolTimeout )
    self.putAllRequests()
    return S_OK()

  def resultCallback( self, taskID, taskResult ):
    """ definition of request callback function

    Puts the request back (which also removes it from the cache) and logs the outcome.

    :param str taskID: Request.RequestName
    :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
    """
    # # clean cache
    res = self.putRequest( taskID, taskResult )
    self.log.info( "callback: %s result is %s(%s), put %s(%s)" % ( taskID,
                                                      "S_OK" if taskResult["OK"] else "S_ERROR",
                                                      taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"],
                                                      "S_OK" if res['OK'] else 'S_ERROR',
                                                      '' if res['OK'] else res['Message'] ) )


  def exceptionCallback( self, taskID, taskException ):
    """ definition of exception callback function

    Logs the exception and puts the cached request back.

    :param str taskID: Request.RequestName
    :param Exception taskException: Exception instance
    """
    self.log.error( "exceptionCallback: %s was hit by exception %s" % ( taskID, taskException ) )
    self.putRequest( taskID )
示例#41
0
 def setUp( self ):
   """ test fixture: switch log headers on, create the test logger and a daemonized ProcessPool( 4, 8, 8 ) """
   gLogger.showHeaders( True )
   loggerName = self.__class__.__name__
   self.log = gLogger.getSubLogger( loggerName )
   pool = ProcessPool( 4, 8, 8 )
   self.processPool = pool
   pool.daemonize()
示例#42
0
class ProcessPoolCallbacksTests( unittest.TestCase ):
  """
  .. class:: ProcessPoolCallbacksTests
  test case for ProcessPool using the callbacks attached to the pool itself
  """

  def setUp( self ):
    """ test fixture: create the logger and a daemonized ProcessPool wired to the pool callbacks

    :param self: self reference
    """
    gLogger.showHeaders( True )
    self.log = gLogger.getSubLogger( self.__class__.__name__ )
    self.processPool = ProcessPool( 4, 8, 8,
                                    poolCallback = self.poolCallback,
                                    poolExceptionCallback = self.poolExceptionCallback )
    self.processPool.daemonize()

  def poolCallback( self, taskID, taskResult ):
    """ pool level result callback, only logs the task result """
    self.log.always( "callback for %s result is %s" % ( taskID, taskResult ) )

  def poolExceptionCallback( self, taskID, taskException ):
    """ pool level exception callback, only logs the task exception """
    self.log.always( "callback for %s exception is %s" % ( taskID, taskException ) )

  def _enqueueTasks( self, taskCallable, label, nbTasks = 10 ):
    """ enqueue :nbTasks: tasks running :taskCallable: using the pool callbacks

    Each task gets a random waiting time between 0 and 5; a waiting time of 0 asks the
    task to raise an exception. A task that could not be enqueued is retried.

    :param taskCallable: callable (class or function) executed by the task
    :param str label: name of the callable used in the log messages
    :param int nbTasks: number of tasks to enqueue
    """
    queued = 0
    while queued < nbTasks:
      if self.processPool.getFreeSlots() > 0:
        timeWait = random.randint( 0, 5 )
        raiseException = not timeWait
        result = self.processPool.createAndQueueTask( taskCallable,
                                                      taskID = queued,
                                                      args = ( queued, timeWait, raiseException ),
                                                      usePoolCallbacks = True,
                                                      blocking = True )
        if result["OK"]:
          self.log.always( "%s enqueued to task %s" % ( label, queued ) )
          queued += 1

  def testCallableClass( self ):
    """ CallableClass and pool callbacks test """
    self._enqueueTasks( CallableClass, "CallableClass" )
    self.processPool.finalize( 2 )

  def testCallableFunc( self ):
    """ CallableFunc and pool callbacks test """
    self._enqueueTasks( CallableFunc, "CallableFunc" )
    self.processPool.finalize( 2 )
示例#43
0
#########################################################################

# Command line handling: exactly one argument, the file holding the directory list.
argvs = sys.argv
if len( argvs ) != 2:
  print( 'Usage: LFC_to_DFC.py [dirlist_file]' )
  print( '[dirlist_file] should contain the directory list.' )
  # quit() is the interactive helper from the site module and reports success;
  # a usage error must end the script with a non-zero status
  sys.exit( 1 )

dirlist_file = argvs[1]
if not os.path.exists( dirlist_file ):
  print( dirlist_file + " does not exist" )
  sys.exit( 1 )

# NOTE(review): execfile runs the whole file as Python code in this namespace, so only
# trusted files must be given here. Presumably the file defines `dirlist` (see the
# commented examples below) - confirm against the rest of the script.
execfile( dirlist_file )

# Pool created with arguments ( 10, 50, 50 ) and daemonized before use
pPool = ProcessPool( 10, 50, 50 )
pPool.daemonize()

# dirlist = ['prod/ilc/mc-dbd/generated','prod/ilc/mc-dbd/ild']
# dirlist= ['prod/ilc/mc-dbd/generated/500-TDR_ws/higgs']
# dirlist= ['prod/ilc/mc-dbd/generated/250-TDR_ws/higgs','prod/ilc/mc-dbd/generated/350-TDR_ws/higgs']
#dirlist= ['prod/ilc/mc-dbd/generated/250-TDR_ws']
#dirlist= ['prod/ilc/mc-dbd/generated/250-TDR_ws/1f',
#          'prod/ilc/mc-dbd/generated/250-TDR_ws/3f',
#          'prod/ilc/mc-dbd/generated/250-TDR_ws/aa_lowpt',
#          'prod/ilc/mc-dbd/generated/250-TDR_ws/aa_minijet']
#dirlist= ['prod/ilc/mc-dbd/generated/250-TDR_ws/aa_2f',
#          'prod/ilc/mc-dbd/generated/350-TDR_ws/3f',
#          'prod/ilc/mc-dbd/generated/350-TDR_ws/1f',
#          'prod/ilc/mc-dbd/generated/350-TDR_ws/aa_minijet']
示例#44
0
class PoolComputingElement( ComputingElement ):
  """ Computing element executing jobs through a local ProcessPool

  The number of cores reserved by every submitted task is kept in coresPerTask and
  released by the pool callback once the task is over.
  """

  mandatoryParameters = MandatoryParameters

  #############################################################################
  def __init__( self, ceUniqueID, cores = 0 ):
    """ Standard constructor.

    :param ceUniqueID: identifier passed to the ComputingElement constructor
    :param int cores: number of cores to use; getNumberOfCores() is used when not positive
    """
    ComputingElement.__init__( self, ceUniqueID )
    self.ceType = "Pool"
    self.submittedJobs = 0
    if cores > 0:
      self.cores = cores
    else:
      self.cores = getNumberOfCores()
    self.pPool = ProcessPool( self.cores, self.cores, poolCallback = self.finalizeJob )
    # next task identifier
    self.taskID = 0
    # { taskID : number of cores reserved by the task }
    self.coresPerTask = {}

  #############################################################################
  def _addCEConfigDefaults( self ):
    """Method to make sure all necessary Configuration Parameters are defined
    """
    # First assure that any global parameters are loaded
    ComputingElement._addCEConfigDefaults( self )

  def getCoresInUse( self ):
    """ Get the number of cores reserved by the tasks currently registered

    :return: sum of the cores of all entries in coresPerTask
    """
    return sum( self.coresPerTask.values() )

  #############################################################################
  def submitJob( self, executableFile, proxy, **kwargs ):
    """ Method to submit job, should be overridden in sub-class.

    The job is refused when the requested cores (all of them for a WholeNode job,
    NumberOfCores when given, one otherwise) are not available.

    :param executableFile: executable passed to executeJob
    :param proxy: payload proxy passed to executeJob
    :param kwargs: optional WholeNode flag and NumberOfCores value
    :return: S_ERROR when there are not enough cores, result of createAndQueueTask otherwise
    """

    self.pPool.processResults()

    coresInUse = self.getCoresInUse()
    availableCores = self.cores - coresInUse
    if kwargs.get( 'WholeNode' ):
      if coresInUse > 0:
        return S_ERROR( 'Can not take WholeNode job, %d/%d slots used' % ( coresInUse, self.cores ) )
      requestedCores = self.cores
    elif "NumberOfCores" in kwargs:
      requestedCores = int( kwargs['NumberOfCores'] )
    else:
      requestedCores = 1
    if availableCores < requestedCores:
      return S_ERROR( 'Not enough slots: requested %d, available %d' % ( requestedCores, availableCores ) )

    ret = getProxyInfo()
    if not ret['OK']:
      pilotProxy = None
    else:
      pilotProxy = ret['Value']['path']
    self.log.notice( 'Pilot Proxy:', pilotProxy )

    # reserve the cores under the ID the task is queued with, so that finalizeJob
    # releases exactly this reservation
    taskID = self.taskID
    self.taskID += 1
    self.coresPerTask[taskID] = requestedCores
    result = self.pPool.createAndQueueTask( executeJob,
                                            [executableFile, proxy, taskID], None,
                                            taskID,
                                            usePoolCallbacks = True )
    if not result['OK']:
      # the task was never queued, no callback will release its cores
      self.coresPerTask.pop( taskID, None )

    self.pPool.processResults()

    return result

  def finalizeJob( self, taskID, result ):
    """ Finalize the job: pool callback releasing the cores reserved by the task

    :param taskID: identifier of the finished task
    :param result: task result, not used
    """
    # an unknown task ID is ignored instead of raising inside the pool callback
    self.coresPerTask.pop( taskID, None )

  #############################################################################
  def getCEStatus( self ):
    """ Method to return information on running and pending jobs.

    :return: S_OK structure extended with SubmittedJobs, RunningJobs, WaitingJobs,
             UsedCores and AvailableCores
    """
    self.pPool.processResults()
    result = S_OK()
    result['SubmittedJobs'] = 0
    # a task counts as running as long as it holds at least one core
    result['RunningJobs'] = len( [ cores for cores in self.coresPerTask.values() if cores > 0 ] )
    result['WaitingJobs'] = 0
    coresInUse = self.getCoresInUse()
    result['UsedCores'] = coresInUse
    result['AvailableCores'] = self.cores - coresInUse
    return result

  #############################################################################
  def monitorProxy( self, pilotProxy, payloadProxy ):
    """ Monitor the payload proxy and renew as necessary.

    :return: result of ComputingElement._monitorProxy
    """
    return self._monitorProxy( pilotProxy, payloadProxy )
示例#45
0
class PoolComputingElement(ComputingElement):
    """Computing element that runs payloads as tasks of a local ProcessPool.

    ``processorsPerTask`` maps the local task ID to the number of processors
    booked for that task; entries are added in :meth:`submitJob` and removed
    in :meth:`finalizeJob`.
    """

    mandatoryParameters = MandatoryParameters

    #############################################################################
    def __init__(self, ceUniqueID):
        """Standard constructor.

        :param ceUniqueID: identifier handed to the base class constructor
        """
        super(PoolComputingElement, self).__init__(ceUniqueID)

        self.ceType = "Pool"
        self.log = gLogger.getSubLogger('Pool')
        self.submittedJobs = 0
        self.processors = 1
        # the pool is only created on first use, see submitJob() / getCEStatus()
        self.pPool = None
        self.taskID = 0
        self.processorsPerTask = {}
        self.userNumberPerTask = {}
        self.useSudo = False

    #############################################################################
    def _addCEConfigDefaults(self):
        """Method to make sure all necessary Configuration Parameters are defined"""
        # First assure that any global parameters are loaded
        ComputingElement._addCEConfigDefaults(self)

    def _reset(self):
        """Update internal variables after some extra parameters are added

        :return: None
        """
        self.processors = int(
            self.ceParameters.get('NumberOfProcessors', self.processors))
        self.ceParameters['MaxTotalJobs'] = self.processors
        self.useSudo = self.ceParameters.get('SudoExecution', False)

    def getProcessorsInUse(self):
        """Get the number of currently allocated processor cores

        :return: number of processor cores
        """
        return sum(self.processorsPerTask.values())

    #############################################################################
    def submitJob(self, executableFile, proxy, **kwargs):
        """Method to submit job.

        :param str executableFile: location of the executable file
        :param str proxy: payload proxy
        :param kwargs: job requirements; ``mpTag``, ``wholeNode``,
                       ``numberOfProcessors``, ``maxNumberOfProcessors`` and
                       ``jobDesc`` are the keys read here

        :return: S_OK/S_ERROR of the result of the job submission
        """
        if self.pPool is None:
            self.pPool = ProcessPool(minSize=self.processors,
                                     maxSize=self.processors,
                                     poolCallback=self.finalizeJob)

        self.pPool.processResults()

        processorsForJob = self._getProcessorsForJobs(kwargs)
        if not processorsForJob:
            return S_ERROR('Not enough processors for the job')

        # Now persisting the job limits for later use in pilot.cfg file (pilot 3 default)
        cd = ConfigurationData(loadDefaultCFG=False)
        res = cd.loadFile('pilot.cfg')
        if not res['OK']:
            self.log.error("Could not load pilot.cfg", res['Message'])
        # only NumberOfProcessors for now, but RAM (or other stuff) can also be added
        jobID = int(kwargs.get('jobDesc', {}).get('jobID', 0))
        cd.setOptionInCFG(
            '/Resources/Computing/JobLimits/%d/NumberOfProcessors' % jobID,
            processorsForJob)
        res = cd.dumpLocalCFGToFile('pilot.cfg')
        if not res['OK']:
            self.log.error("Could not dump cfg to pilot.cfg", res['Message'])

        ret = getProxyInfo()
        if not ret['OK']:
            pilotProxy = None
        else:
            pilotProxy = ret['Value']['path']
        self.log.notice('Pilot Proxy:', pilotProxy)

        # keyword arguments of the pool task, kept apart from the job
        # requirements received in kwargs
        taskKwargs = {'UseSudo': False}
        if self.useSudo:
            # pick the first user number not present in userNumberPerTask
            for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
                if nUser not in self.userNumberPerTask.values():
                    break
            taskKwargs['NUser'] = nUser
            taskKwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(
                nUser).zfill(2)
            taskKwargs['UseSudo'] = True

        result = self.pPool.createAndQueueTask(executeJob,
                                               args=(executableFile, proxy,
                                                     self.taskID),
                                               kwargs=taskKwargs,
                                               taskID=self.taskID,
                                               usePoolCallbacks=True)
        self.processorsPerTask[self.taskID] = processorsForJob
        self.taskID += 1

        self.pPool.processResults()

        return result

    def _getProcessorsForJobs(self, kwargs):
        """Work out how many processors can be given to a job.

        :param dict kwargs: job requirements (``mpTag``, ``wholeNode``,
                            ``numberOfProcessors``, ``maxNumberOfProcessors``)

        :return: number of processors to allocate, 0 if the job can not be
                 accommodated
        """
        processorsInUse = self.getProcessorsInUse()
        availableProcessors = self.processors - processorsInUse

        # Does this ask for MP?
        if not kwargs.get('mpTag', False):
            if availableProcessors:
                return 1
            return 0

        # From here we assume the job is asking for MP
        if kwargs.get('wholeNode', False):
            # A whole-node job gets every processor or nothing. Return right
            # away so that the numberOfProcessors / maxNumberOfProcessors
            # handling below can not shrink or reject the allocation.
            if processorsInUse > 0:
                return 0
            return self.processors

        if "numberOfProcessors" in kwargs:
            requestedProcessors = int(kwargs['numberOfProcessors'])
        else:
            requestedProcessors = 1

        if availableProcessors < requestedProcessors:
            return 0

        # If there's a maximum number of processors allowed for the job, use that as maximum,
        # otherwise it will use all the remaining processors
        if 'maxNumberOfProcessors' in kwargs and kwargs[
                'maxNumberOfProcessors']:
            maxNumberOfProcessors = min(int(kwargs['maxNumberOfProcessors']),
                                        availableProcessors)
        else:
            maxNumberOfProcessors = availableProcessors

        return maxNumberOfProcessors

    def finalizeJob(self, taskID, result):
        """Finalize the job by updating the process utilisation counters

        :param int taskID: local PoolCE task ID
        :param dict result: result of the job execution
        """
        nProc = self.processorsPerTask.pop(taskID)
        if result['OK']:
            self.log.info(
                'Task %d finished successfully, %d processor(s) freed' %
                (taskID, nProc))
        else:
            self.log.error("Task failed submission",
                           "%d, message: %s" % (taskID, result['Message']))

    #############################################################################
    def getCEStatus(self, jobIDList=None):
        """Method to return information on running and pending jobs.

        :param jobIDList: not used by this implementation

        :return: dictionary of numbers of jobs per status
        """
        if self.pPool is None:
            self.pPool = ProcessPool(minSize=self.processors,
                                     maxSize=self.processors,
                                     poolCallback=self.finalizeJob)

        self.pPool.processResults()
        result = S_OK()
        result['SubmittedJobs'] = 0
        # values() exists on both Python 2 and 3 (iteritems() is Python 2 only);
        # a task counts as running when it holds at least one processor
        result['RunningJobs'] = sum(
            1 for value in self.processorsPerTask.values() if value > 0)
        result['WaitingJobs'] = 0
        processorsInUse = self.getProcessorsInUse()
        result['UsedProcessors'] = processorsInUse
        result['AvailableProcessors'] = self.processors - processorsInUse
        return result

    def getDescription(self):
        """Get a list of CEs descriptions (each is a dict)

        This is called by the JobAgent.

        :return: S_OK(list of dicts), or the error returned by the base class
        """
        result = super(PoolComputingElement, self).getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        ceDictList = []
        if self.ceParameters.get('MultiProcessorStrategy'):
            strategyRequiredTags = []
            if not ceDict.get("ProcessorsInUse", 0):
                # We are starting from a clean page, try to get the most demanding
                # jobs first
                strategyRequiredTags.append(['WholeNode'])
            processors = ceDict.get('NumberOfProcessors', 0)
            if processors > 1:
                # We have several processors at hand, try to use most of them
                strategyRequiredTags.append(['%dProcessors' % processors])
                # Well, at least jobs with some processors requirement
                strategyRequiredTags.append(['MultiProcessor'])

            for strat in strategyRequiredTags:
                newCEDict = dict(ceDict)
                newCEDict.setdefault("RequiredTag", []).extend(strat)
                ceDictList.append(newCEDict)

        # Do not require anything special if nothing else was lucky
        ceDictList.append(dict(ceDict))

        return S_OK(ceDictList)

    #############################################################################
    def monitorProxy(self, pilotProxy, payloadProxy):
        """Monitor the payload proxy and renew as necessary.

        :param str pilotProxy: location of the pilotProxy
        :param str payloadProxy: location of the payloadProxy
        """
        return self._monitorProxy(pilotProxy, payloadProxy)
示例#46
0
class TaskTimeOutTests( unittest.TestCase ):
  """
  .. class:: TaskTimeOutTests

  ProcessPool test case: tasks are queued with a time out and report through
  the pool-level callbacks.
  """

  def setUp( self ):
    """ Create the logger and a daemonized ProcessPool using the pool callbacks

    :param self: self reference
    """
    gLogger.showHeaders( True )
    self.log = gLogger.getSubLogger( self.__class__.__name__ )
    self.processPool = ProcessPool( 2, 4, 8,
                                    poolCallback = self.poolCallback,
                                    poolExceptionCallback = self.poolExceptionCallback )
    self.processPool.daemonize()

  def poolCallback( self, taskID, taskResult ):
    """ Log the result reported for a task """
    message = "callback result for %s is %s" % ( taskID, taskResult )
    self.log.always( message )

  def poolExceptionCallback( self, taskID, taskException ):
    """ Log the exception reported for a task """
    message = "callback exception for %s is %s" % ( taskID, taskException )
    self.log.always( message )

  def _enqueue( self, taskCallable, taskID, timeWait, raiseException ):
    """ Queue one task with the settings shared by all tests of this class
    (15 s time out, pool callbacks, blocking) and return the pool's answer
    """
    return self.processPool.createAndQueueTask( taskCallable,
                                                taskID = taskID,
                                                args = ( taskID, timeWait, raiseException ),
                                                timeOut = 15,
                                                usePoolCallbacks = True,
                                                blocking = True )

  def testCallableClass( self ):
    """ CallableClass and task time out test """
    queued = 0
    while queued < 16:
      if self.processPool.getFreeSlots() > 0:
        timeWait = random.randint( 0, 5 ) * 10
        # a zero wait time is the signal to raise inside the task
        raiseException = not timeWait
        result = self._enqueue( CallableClass, queued, timeWait, raiseException )
        if result["OK"]:
          self.log.always("CallableClass enqueued to task %s timeWait=%s exception=%s" % ( queued, timeWait, raiseException ) )
          queued += 1
    self.processPool.finalize( 2 )

  def testCallableFunc( self ):
    """ CallableFunc and task timeout test """
    queued = 0
    while queued < 16:
      if self.processPool.getFreeSlots() > 0:
        timeWait = random.randint(0, 5) * 5
        # a zero wait time is the signal to raise inside the task
        raiseException = not timeWait
        result = self._enqueue( CallableFunc, queued, timeWait, raiseException )
        if result["OK"]:
          self.log.always("CallableFunc enqueued to task %s timeWait=%s exception=%s" % ( queued, timeWait, raiseException ) )
          queued += 1
    self.processPool.finalize( 2 )

  def testLockedClass( self ):
    """ LockedCallableClass and task time out test """

    for loop in range(2):
      self.log.always( "loop %s" % loop )
      queued = 0
      while queued < 16:
        if self.processPool.getFreeSlots() > 0:
          timeWait = random.randint(0, 5) * 5
          raiseException = ( timeWait == 5 )
          # waits of 20 s and more are given to the locked callable
          klass = LockedCallableClass if timeWait >= 20 else CallableClass
          result = self._enqueue( klass, queued, timeWait, raiseException )
          if result["OK"]:
            self.log.always("%s enqueued to task %s timeWait=%s exception=%s" % ( klass.__name__ , queued, timeWait, raiseException ) )
            queued += 1
      self.log.always("being idle for a while")
      # plain busy loop, spends some time without queueing anything
      for _outer in range( 100000 ):
        for _inner in range( 1000 ):
          pass

    self.log.always("finalizing...")
    self.processPool.finalize( 10 )
    ## unlock
    gLock.release()
示例#47
0
class PoolComputingElement(ComputingElement):
  """ Computing element that runs payloads as tasks of a local ProcessPool.

  ``processorsPerTask`` maps the local task ID to the number of processors
  booked for that task; entries are added in :meth:`submitJob` and removed in
  :meth:`finalizeJob`.
  """

  mandatoryParameters = MandatoryParameters

  #############################################################################
  def __init__(self, ceUniqueID):
    """ Standard constructor.

    :param ceUniqueID: identifier handed to the base class constructor
    """
    ComputingElement.__init__(self, ceUniqueID)
    self.ceType = "Pool"
    self.log = gLogger.getSubLogger('Pool')
    self.submittedJobs = 0
    self.processors = 1
    # the pool is only created on first use, see submitJob() / getCEStatus()
    self.pPool = None
    self.taskID = 0
    self.processorsPerTask = {}
    self.userNumberPerTask = {}
    self.useSudo = False

  #############################################################################
  def _addCEConfigDefaults(self):
    """Method to make sure all necessary Configuration Parameters are defined
    """
    # First assure that any global parameters are loaded
    ComputingElement._addCEConfigDefaults(self)

  def _reset(self):
    """ Update internal variables after some extra parameters are added

    :return: None
    """
    self.processors = int(self.ceParameters.get('NumberOfProcessors', self.processors))
    self.ceParameters['MaxTotalJobs'] = self.processors
    self.useSudo = self.ceParameters.get('SudoExecution', False)

  def getProcessorsInUse(self):
    """ Get the number of currently allocated processor cores

    :return: number of processor cores
    """
    return sum(self.processorsPerTask.values())

  #############################################################################
  def submitJob(self, executableFile, proxy, **kwargs):
    """ Method to submit job.

    :param str executableFile: location of the executable file
    :param str proxy: payload proxy
    :param kwargs: job requirements; ``wholeNode`` and ``numberOfProcessors``
                   are the keys read here

    :return: S_OK/S_ERROR of the result of the job submission
    """

    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    self.pPool.processResults()

    processorsInUse = self.getProcessorsInUse()
    if kwargs.get('wholeNode'):
      # a whole-node job is only accepted when nothing else is allocated
      if processorsInUse > 0:
        return S_ERROR('Can not take WholeNode job')
      requestedProcessors = self.processors
    elif "numberOfProcessors" in kwargs:
      requestedProcessors = int(kwargs['numberOfProcessors'])
    else:
      requestedProcessors = 1

    # single capacity check for every kind of request
    availableProcessors = self.processors - processorsInUse
    if availableProcessors < requestedProcessors:
      return S_ERROR('Not enough slots: requested %d, available %d' % (requestedProcessors,
                                                                       availableProcessors))

    ret = getProxyInfo()
    if not ret['OK']:
      pilotProxy = None
    else:
      pilotProxy = ret['Value']['path']
    self.log.notice('Pilot Proxy:', pilotProxy)

    # keyword arguments of the pool task, kept apart from the job requirements
    # received in kwargs
    taskKwargs = {'UseSudo': False}
    if self.useSudo:
      # pick the first user number not present in userNumberPerTask
      for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
        if nUser not in self.userNumberPerTask.values():
          break
      taskKwargs['NUser'] = nUser
      taskKwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
      taskKwargs['UseSudo'] = True

    result = self.pPool.createAndQueueTask(executeJob,
                                           args=(executableFile, proxy, self.taskID),
                                           kwargs=taskKwargs,
                                           taskID=self.taskID,
                                           usePoolCallbacks=True)
    self.processorsPerTask[self.taskID] = requestedProcessors
    self.taskID += 1

    self.pPool.processResults()

    return result

  def finalizeJob(self, taskID, result):
    """ Finalize the job by updating the process utilisation counters

    :param int taskID: local PoolCE task ID
    :param dict result: result of the job execution

    """
    nProc = self.processorsPerTask.pop(taskID)
    if result['OK']:
      self.log.info('Task %d finished successfully, %d processor(s) freed' % (taskID, nProc))
    else:
      self.log.error("Task failed submission", "%d, message: %s" % (taskID, result['Message']))

  #############################################################################
  def getCEStatus(self, jobIDList=None):
    """ Method to return information on running and pending jobs.

    :param jobIDList: not used by this implementation

    :return: dictionary of numbers of jobs per status
    """

    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    self.pPool.processResults()
    result = S_OK()
    result['SubmittedJobs'] = 0
    # values() exists on both Python 2 and 3 (iteritems() is Python 2 only);
    # a task counts as running when it holds at least one processor
    result['RunningJobs'] = sum(1 for value in self.processorsPerTask.values() if value > 0)
    result['WaitingJobs'] = 0
    processorsInUse = self.getProcessorsInUse()
    result['UsedProcessors'] = processorsInUse
    result['AvailableProcessors'] = self.processors - processorsInUse
    return result

  def getDescription(self):
    """ Get a list of CE descriptions (each is a dict)

    :return: S_OK(list of dicts), or the error returned by the base class
    """
    result = super(PoolComputingElement, self).getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']

    ceDictList = []
    if self.ceParameters.get('MultiProcessorStrategy'):
      strategyRequiredTags = []
      if not ceDict.get("ProcessorsInUse", 0):
        # We are starting from a clean page, try to get the most demanding
        # jobs first
        strategyRequiredTags.append(['WholeNode'])
      processors = ceDict.get('NumberOfProcessors', 0)
      if processors > 1:
        # We have several processors at hand, try to use most of them
        strategyRequiredTags.append(['%dProcessors' % processors])
        # Well, at least jobs with some processors requirement
        strategyRequiredTags.append(['MultiProcessor'])

      for strat in strategyRequiredTags:
        newCEDict = dict(ceDict)
        newCEDict.setdefault("RequiredTag", []).extend(strat)
        ceDictList.append(newCEDict)

    # Do not require anything special if nothing else was lucky
    ceDictList.append(dict(ceDict))

    return S_OK(ceDictList)

  #############################################################################
  def monitorProxy(self, pilotProxy, payloadProxy):
    """ Monitor the payload proxy and renew as necessary.

    :param str pilotProxy: location of the pilotProxy
    :param str payloadProxy: location of the payloadProxy
    """
    return self._monitorProxy(pilotProxy, payloadProxy)
示例#48
0
class RequestExecutingAgent(AgentModule):
    """
    .. class:: RequestExecutingAgent

    request processing agent using ProcessPool, Operation handlers and RequestTask
    """

    def __init__(self, *args, **kwargs):
        """Set every agent attribute to its built-in default.

        Positional and keyword arguments are forwarded untouched to the base
        class constructor.
        """
        super().__init__(*args, **kwargs)

        # helpers created on first use by processPool() / requestClient()
        self.__processPool = None
        self.__requestClient = None

        # requests currently cached, keyed by RequestID (see cacheRequest)
        self.__requestCache = {}

        # process pool sizing and pacing
        self.__requestsPerCycle = REQUESTSPERCYCLE
        self.__minProcess = MINPROCESS
        self.__maxProcess = MAXPROCESS
        self.__queueSize = QUEUESIZE
        self.__poolTimeout = POOLTIMEOUT
        self.__poolSleep = POOLSLEEP

        # default time outs
        self.__fileTimeout = FILETIMEOUT
        self.__operationTimeout = OPERATIONTIMEOUT

        # Size of the bulk if use of getRequests. If 0, use getRequest
        self.__bulkRequest = 0
        self.__rmsMonitoring = False

    def processPool(self):
        """Facade for ProcessPool.

        The pool is created and daemonized on the first call and reused
        afterwards.

        :return: the ProcessPool instance of this agent
        """
        if not self.__processPool:
            minProcess = max(1, self.__minProcess)
            # take the clamped minimum as lower bound, so that the maximum can
            # never end up below the minimum actually given to the pool
            maxProcess = max(minProcess, self.__maxProcess)
            queueSize = abs(self.__queueSize)
            self.log.info(
                "REA ProcessPool configuration",
                "minProcess = %d maxProcess = %d queueSize = %d" % (minProcess, maxProcess, queueSize),
            )
            self.__processPool = ProcessPool(
                minProcess,
                maxProcess,
                queueSize,
                poolCallback=self.resultCallback,
                poolExceptionCallback=self.exceptionCallback,
            )
            self.__processPool.daemonize()
        return self.__processPool

    def requestClient(self):
        """Return the ReqClient of this agent, creating it on first use."""
        if self.__requestClient:
            return self.__requestClient
        self.__requestClient = ReqClient()
        return self.__requestClient

    def cacheRequest(self, request):
        """Store a request in the request cache.

        :param ~Request.Request request: Request instance
        :return: S_OK() once stored, S_ERROR(errno.EALREADY, ...) if a request
                 with the same RequestID is already cached
        """
        cache = self.__requestCache
        cacheLimit = max(self.__minProcess, self.__maxProcess) + 50
        if len(cache) > cacheLimit:
            # For the time being this is only a warning. If the ProcessPool is working well it is
            # not needed, and how much is acceptable depends on many factors.
            self.log.warn("Too many requests in cache", ": %d" % len(cache))

        if request.RequestID not in cache:
            cache[request.RequestID] = request
            return S_OK()

        # putRequest is not called here: we got back a request that is still being executed, so
        # better keep it. The main reason is that it lasted longer than the kick time of CleanReqAgent.
        duplicateInfo = ": %d/%s" % (request.RequestID, request.RequestName)
        self.log.warn("Duplicate request, keep it but don't execute", duplicateInfo)
        return S_ERROR(errno.EALREADY, "Request already in cache")

    def putRequest(self, requestID, taskResult=None):
        """Remove :requestID: from the cache and put it back to the RequestClient.

        :param str requestID: request's id
        :param taskResult: optional result of the task that processed the request
        :return: S_OK(), or S_ERROR if the request is not cached or could not be put back
        """
        if requestID not in self.__requestCache:
            return S_ERROR("Not in cache")

        request = self.__requestCache.pop(requestID)
        if taskResult:
            if taskResult["OK"]:
                request = taskResult["Value"]
                # The RequestTask is putting back the Done tasks, no need to redo it
                if request.Status == "Done":
                    return S_OK()
            elif cmpError(taskResult, errno.ETIME):
                # In case of timeout, we need to increment ourselves all the attempts
                for rmsFile in request.getWaiting().get("Value", []):
                    rmsFile.Attempt += 1

        reset = self.requestClient().putRequest(request, useFailoverProxy=False, retryMainService=2)
        if reset["OK"]:
            return S_OK()
        return S_ERROR("putRequest: unable to reset request %s: %s" % (requestID, reset["Message"]))

    def putAllRequests(self):
        """Put back all requests without callback called into requestClient.

        Failures for individual requests are logged and do not stop the loop.

        :param self: self reference
        :return: S_OK()
        """
        self.log.info("putAllRequests: will put back requests", "%s" % len(self.__requestCache))
        # putRequest() pops the entry from the cache: iterate over a snapshot of
        # the keys, changing the size of a dict while iterating it is an error
        for requestID in list(self.__requestCache):
            reset = self.putRequest(requestID)
            if not reset["OK"]:
                self.log.error("Failed to put request", reset["Message"])
            else:
                self.log.debug("putAllRequests: request %s has been put back with its initial state" % requestID)
        return S_OK()

    def initialize(self):
        """Read the agent options and the operation handlers configuration.

        :return: S_OK()
        :raises AgentConfigError: if the OperationHandlers sections can not be read
        """
        # # ProcessPool related options, each one logged right after being read
        self.__requestsPerCycle = self.am_getOption("RequestsPerCycle", self.__requestsPerCycle)
        self.log.info("Requests/cycle = %d" % self.__requestsPerCycle)

        self.__minProcess = self.am_getOption("MinProcess", self.__minProcess)
        self.log.info("ProcessPool min process = %d" % self.__minProcess)

        self.__maxProcess = self.am_getOption("MaxProcess", self.__maxProcess)
        self.log.info("ProcessPool max process = %d" % self.__maxProcess)

        self.__queueSize = self.am_getOption("ProcessPoolQueueSize", self.__queueSize)
        self.log.info("ProcessPool queue size = %d" % self.__queueSize)

        self.__poolTimeout = int(self.am_getOption("ProcessPoolTimeout", self.__poolTimeout))
        self.log.info("ProcessPool timeout = %d seconds" % self.__poolTimeout)

        self.__poolSleep = int(self.am_getOption("ProcessPoolSleep", self.__poolSleep))
        self.log.info("ProcessPool sleep time = %d seconds" % self.__poolSleep)

        self.__bulkRequest = self.am_getOption("BulkRequest", self.__bulkRequest)
        self.log.info("Bulk request size = %d" % self.__bulkRequest)

        # RMS monitoring is switched on when "Monitoring" is one of its backends
        monitoringBackends = Operations().getMonitoringBackends(monitoringType="RMSMonitoring")
        if "Monitoring" in monitoringBackends:
            self.__rmsMonitoring = True
        self.log.info("Enable ES RMS Monitoring = %s" % self.__rmsMonitoring)

        # # keep config path and agent name
        self.agentName = self.am_getModuleParam("fullName")
        self.__configPath = PathFinder.getAgentSection(self.agentName)

        # # operation handlers over here
        opHandlersPath = "%s/%s" % (self.__configPath, "OperationHandlers")
        sections = gConfig.getSections(opHandlersPath)
        if not sections["OK"]:
            self.log.error(sections["Message"])
            raise AgentConfigError("OperationHandlers section not found in CS under %s" % self.__configPath)

        self.timeOuts = dict()

        # # handlers dict
        self.handlersDict = dict()
        for opHandler in sections["Value"]:
            handlerSection = "%s/%s" % (opHandlersPath, opHandler)
            opHandlerPath = "%s/Location" % handlerSection
            opLocation = gConfig.getValue(opHandlerPath, "")
            if not opLocation:
                self.log.error("%s not set for %s operation handler" % (opHandlerPath, opHandler))
                continue

            # a time out configured for the handler wins over the agent default
            opTimeout = gConfig.getValue("%s/TimeOut" % handlerSection, 0)
            fileTimeout = gConfig.getValue("%s/TimeOutPerFile" % handlerSection, 0)
            self.timeOuts[opHandler] = {
                "PerFile": fileTimeout or self.__fileTimeout,
                "PerOperation": opTimeout or self.__operationTimeout,
            }

            self.handlersDict[opHandler] = opLocation

        self.log.info("Operation handlers:")
        for index, (opHandler, opLocation) in enumerate(self.handlersDict.items()):
            handlerTimeOuts = self.timeOuts[opHandler]
            self.log.info(
                "[%s] %s: %s (timeout: %d s + %d s per file)"
                % (
                    index,
                    opHandler,
                    opLocation,
                    handlerTimeOuts["PerOperation"],
                    handlerTimeOuts["PerFile"],
                )
            )

        if self.__rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
            gThreadScheduler.addPeriodicTask(100, self.__rmsMonitoringReporting)

        # # create request dict
        self.__requestCache = dict()

        return S_OK()

    def execute(self):
        """read requests from RequestClient and enqueue them into ProcessPool"""
        # # requests (and so tasks) counter
        taskCounter = 0
        while taskCounter < self.__requestsPerCycle:
            self.log.debug("execute: executing %d request in this cycle" % taskCounter)

            requestsToExecute = []

            if not self.__bulkRequest:
                self.log.info("execute: ask for a single request")
                getRequest = self.requestClient().getRequest()
                if not getRequest["OK"]:
                    self.log.error("execute:", "%s" % getRequest["Message"])
                    break
                if not getRequest["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                requestsToExecute = [getRequest["Value"]]
            else:
                numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
                self.log.info("execute: ask for requests", "%s" % numberOfRequest)
                getRequests = self.requestClient().getBulkRequests(numberOfRequest)
                if not getRequests["OK"]:
                    self.log.error("execute:", "%s" % getRequests["Message"])
                    break
                if not getRequests["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                for rId in getRequests["Value"]["Failed"]:
                    self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])

                requestsToExecute = list(getRequests["Value"]["Successful"].values())

            self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute))

            for request in requestsToExecute:
                # # set task id
                taskID = request.RequestID

                self.log.info(
                    "processPool status",
                    "tasks idle = %s working = %s"
                    % (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()),
                )

                looping = 0
                while True:
                    if not self.processPool().getFreeSlots():
                        if not looping:
                            self.log.info(
                                "No free slots available in processPool",
                                "will wait %d seconds to proceed" % self.__poolSleep,
                            )
                        time.sleep(self.__poolSleep)
                        looping += 1
                    else:
                        if looping:
                            self.log.info("Free slot found", "after %d seconds" % looping * self.__poolSleep)
                        looping = 0
                        # # save current request in cache
                        res = self.cacheRequest(request)
                        if not res["OK"]:
                            if cmpError(res, errno.EALREADY):
                                # The request is already in the cache, skip it. break out of the while loop to get next request
                                break
                            # There are too many requests in the cache, commit suicide
                            self.log.error(
                                "Too many requests in cache",
                                "(%d requests): put back all requests and exit cycle. Error %s"
                                % (len(self.__requestCache), res["Message"]),
                            )
                            self.putAllRequests()
                            return res
                        # # serialize to JSON
                        result = request.toJSON()
                        if not result["OK"]:
                            continue
                        requestJSON = result["Value"]
                        self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName))
                        timeOut = self.getTimeout(request)
                        enqueue = self.processPool().createAndQueueTask(
                            RequestTask,
                            kwargs={
                                "requestJSON": requestJSON,
                                "handlersDict": self.handlersDict,
                                "csPath": self.__configPath,
                                "agentName": self.agentName,
                                "rmsMonitoring": self.__rmsMonitoring,
                            },
                            taskID=taskID,
                            blocking=True,
                            usePoolCallbacks=True,
                            timeOut=timeOut,
                        )
                        if not enqueue["OK"]:
                            self.log.error("Could not enqueue task", enqueue["Message"])
                        else:
                            self.log.debug("successfully enqueued task", "'%s'" % taskID)
                            # # update monitor
                            if self.__rmsMonitoring:
                                self.rmsMonitoringReporter.addRecord(
                                    {
                                        "timestamp": int(TimeUtilities.toEpoch()),
                                        "host": Network.getFQDN(),
                                        "objectType": "Request",
                                        "status": "Attempted",
                                        "objectID": request.RequestID,
                                        "nbObject": 1,
                                    }
                                )

                            # # update request counter
                            taskCounter += 1
                            # # task created, a little time kick to proceed
                            time.sleep(0.1)
                            break

        self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
        processed = self.processPool().processResults()
        # This happens when the result queue is screwed up.
        # Returning S_ERROR proved not to be sufficient,
        # and when in this situation, there is nothing we can do.
        # So we just exit. runit will restart from scratch.
        if processed < 0:
            self.log.fatal("Results queue is screwed up")
            sys.exit(1)
        # # clean return
        return S_OK()

    def getTimeout(self, request):
        """Estimate the processing timeout for a request.

        Only operations whose Status is "Waiting", "Scheduled" or "Queued"
        contribute. An operation whose Type has no entry in ``self.timeOuts``
        adds the default per-operation timeout; otherwise it adds the configured
        "PerOperation" value plus "PerFile" multiplied by ``len(op)`` (missing
        keys fall back to the agent-wide defaults).

        :param request: iterable of operations, also exposing RequestID and RequestName
        :return: accumulated timeout estimate
        """
        pendingStates = ("Waiting", "Scheduled", "Queued")
        estimate = 0
        for operation in request:
            if operation.Status in pendingStates:
                opType = operation.Type
                if opType in self.timeOuts:
                    typeSettings = self.timeOuts[opType]
                    operationShare = typeSettings.get("PerOperation", self.__operationTimeout)
                    # len(operation) is presumably the number of files attached to it
                    filesShare = typeSettings.get("PerFile", self.__fileTimeout) * len(operation)
                    estimate += operationShare + filesShare
                else:
                    estimate += self.__operationTimeout
        details = "(%s/%s) is %s" % (request.RequestID, request.RequestName, estimate)
        self.log.info("estimated timeOut for request", details)
        return estimate

    def finalize(self):
        """Finalize the agent.

        If a process pool has been set up, finalize it with the configured pool
        timeout; afterwards call ``putAllRequests`` (used elsewhere in this class
        to put requests back) and report success.

        :return: S_OK()
        """
        poolIsSet = bool(self.__processPool)
        if poolIsSet:
            activePool = self.processPool()
            activePool.finalize(timeout=self.__poolTimeout)
        # requests are put back whether or not a pool existed
        self.putAllRequests()
        return S_OK()

    def resultCallback(self, taskID, taskResult):
        """Callback invoked with the result of a finished task.

        The request is first handed to ``putRequest`` together with the task
        result, then a single info line summarises both the task outcome and
        the outcome of that put operation.

        :param str taskID: Request.RequestID
        :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
        """
        # # clean cache
        putOutcome = self.putRequest(taskID, taskResult)

        # # describe the task outcome
        if taskResult["OK"]:
            taskFlag = "S_OK"
            taskInfo = taskResult["Value"].Status
        else:
            taskFlag = "S_ERROR"
            taskInfo = taskResult["Message"]

        # # describe the put outcome
        if putOutcome["OK"]:
            putFlag = "S_OK"
            putInfo = ""
        else:
            putFlag = "S_ERROR"
            putInfo = putOutcome["Message"]

        summary = "%s result is %s(%s), put %s(%s)" % (taskID, taskFlag, taskInfo, putFlag, putInfo)
        self.log.info("callback:", summary)

    def exceptionCallback(self, taskID, taskException):
        """definition of exception callback function

        Logs the exception that hit the task, then puts the request back via
        ``putRequest`` (without a task result). A failure to put the request
        back is logged as well rather than silently dropped.

        :param str taskID: Request.RequestID
        :param Exception taskException: Exception instance
        """
        self.log.error("exceptionCallback:", "%s was hit by exception %s" % (taskID, taskException))
        # putRequest is expected to return an S_OK/S_ERROR-style dict, as used by resultCallback
        res = self.putRequest(taskID)
        if not res["OK"]:
            self.log.error(
                "exceptionCallback:",
                "%s could not be put back: %s" % (taskID, res["Message"]),
            )

    def __rmsMonitoringReporting(self):
        """Commit the monitoring records collected by ``self.rmsMonitoringReporter``.

        Meant to be run as a periodic task (by the ThreadScheduler) so that the
        data gathered by the MonitoringReporter is sent to the 'RMSMonitoring' type.

        :return: True / False, the "OK" flag of the commit result
        """
        return self.rmsMonitoringReporter.commit()["OK"]