class TaskCallbacksTests(unittest.TestCase):
    """ .. class:: TaskCallbacksTests
    test case for ProcessPool
    """

    def setUp(self):
        """Create a scoped logger and a daemonized 4-8 ProcessPool (queue size 8)."""
        gLogger.showHeaders(True)
        self.log = gLogger.getSubLogger(self.__class__.__name__)
        self.processPool = ProcessPool(4, 8, 8)
        self.processPool.daemonize()

    def _enqueueTasks(self, taskCallable, label, nTasks=10):
        """Queue *nTasks* tasks executing *taskCallable* with task-level callbacks, then finalize the pool.

        :param taskCallable: callable (class or function) handed to createAndQueueTask
        :param str label: name used in the "enqueued" log message
        :param int nTasks: number of tasks to enqueue (10, as in the original tests)
        """
        i = 0
        while i < nTasks:
            # busy-wait for a free slot, as the original loop did
            if self.processPool.getFreeSlots() <= 0:
                continue
            timeWait = random.randint(0, 5)
            # a zero wait doubles as the "raise an exception" case
            raiseException = not timeWait
            result = self.processPool.createAndQueueTask(
                taskCallable,
                taskID=i,
                args=(i, timeWait, raiseException),
                callback=ResultCallback,
                exceptionCallback=ExceptionCallback,
                blocking=True)
            if result["OK"]:
                self.log.always("%s enqueued to task %s" % (label, i))
                i += 1
        self.processPool.finalize(2)

    def testCallableClass(self):
        """ CallableClass and task callbacks test """
        self._enqueueTasks(CallableClass, "CallableClass")

    def testCallableFunc(self):
        """ CallableFunc and task callbacks test """
        # BUG FIX: the original logged "CallableClass enqueued ..." here too
        self._enqueueTasks(CallableFunc, "CallableFunc")
def processPool(self):
    """ 'Live long and prosper, my dear ProcessPool' - Mr. Spock

    :param self: self reference
    :return: brand new shiny ProcessPool instance on first call, the same
        instance on subsequent calls
    """
    # already built? hand back the cached instance
    if self.__processPool:
        return self.__processPool
    nMin = max(1, self.__minProcess)
    nMax = max(self.__minProcess, self.__maxProcess)
    depth = abs(self.__queueSize)
    self.log.info(
        "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % (nMin, nMax, depth))
    self.log.info(
        "ProcessPool: tasks will use callbacks attached to ProcessPool"
    )
    self.__processPool = ProcessPool(nMin, nMax, depth,
                                     poolCallback=self.resultCallback,
                                     poolExceptionCallback=self.exceptionCallback)
    self.__processPool.daemonize()
    self.log.info("ProcessPool: daemonized and ready")
    return self.__processPool
def getCEStatus(self):
    """ Method to return information on running and waiting jobs,
        as well as the number of processors (used, and available).

    :return: dictionary of numbers of jobs per status and processors (used, and available)
    """
    # lazily (re)create the pool if it was not built yet
    if self.pPool is None:
        self.pPool = ProcessPool(minSize=self.processors,
                                 maxSize=self.processors,
                                 poolCallback=self.finalizeJob)

    self.pPool.processResults()

    # a task entry with a positive processor count is an active job
    nJobs = sum(1 for busy in self.processorsPerTask.itervalues() if busy > 0)

    result = S_OK()
    result['SubmittedJobs'] = nJobs
    result['RunningJobs'] = nJobs
    result['WaitingJobs'] = 0

    # dealing with processors
    processorsInUse = self.getProcessorsInUse()
    result['UsedProcessors'] = processorsInUse
    result['AvailableProcessors'] = self.processors - processorsInUse
    return result
class TaskCallbacksTests(unittest.TestCase): """ .. class:: TaskCallbacksTests test case for ProcessPool """ def setUp( self ): gLogger.showHeaders( True ) self.log = gLogger.getSubLogger( self.__class__.__name__ ) self.processPool = ProcessPool( 4, 8, 8 ) self.processPool.daemonize() def testCallableClass( self ): """ CallableClass and task callbacks test """ i = 0 while True: if self.processPool.getFreeSlots() > 0: timeWait = random.randint(0, 5) raiseException = False if not timeWait: raiseException = True result = self.processPool.createAndQueueTask( CallableClass, taskID = i, args = ( i, timeWait, raiseException ), callback = ResultCallback, exceptionCallback = ExceptionCallback, blocking = True ) if result["OK"]: self.log.always("CallableClass enqueued to task %s" % i ) i += 1 else: continue if i == 10: break self.processPool.finalize( 2 ) def testCallableFunc( self ): """ CallableFunc and task callbacks test """ i = 0 while True: if self.processPool.getFreeSlots() > 0: timeWait = random.randint(0, 5) raiseException = False if not timeWait: raiseException = True result = self.processPool.createAndQueueTask( CallableFunc, taskID = i, args = ( i, timeWait, raiseException ), callback = ResultCallback, exceptionCallback = ExceptionCallback, blocking = True ) if result["OK"]: self.log.always("CallableClass enqueued to task %s" % i ) i += 1 else: continue if i == 10: break self.processPool.finalize( 2 )
def submitJob(self, executableFile, proxy, **kwargs):
    """ Method to submit job.

    :param str executableFile: location of the executable file
    :param str proxy: payload proxy
    :return: S_OK/S_ERROR of the result of the job submission
    """
    # lazily (re)create the pool if it was not built yet
    if self.pPool is None:
        self.pPool = ProcessPool(minSize=self.processors,
                                 maxSize=self.processors,
                                 poolCallback=self.finalizeJob)

    # flush any finished tasks so slot accounting below is up to date
    self.pPool.processResults()

    processorsForJob = self._getProcessorsForJobs(kwargs)
    if not processorsForJob:
        return S_ERROR('Not enough processors for the job')

    # Now persisting the job limits for later use in pilot.cfg file (pilot 3 default)
    cd = ConfigurationData(loadDefaultCFG=False)
    res = cd.loadFile('pilot.cfg')
    if not res['OK']:
        # best-effort: failure to load is logged but does not abort submission
        self.log.error("Could not load pilot.cfg", res['Message'])
    # only NumberOfProcessors for now, but RAM (or other stuff) can also be added
    jobID = int(kwargs.get('jobDesc', {}).get('jobID', 0))
    cd.setOptionInCFG('/Resources/Computing/JobLimits/%d/NumberOfProcessors' % jobID,
                      processorsForJob)
    res = cd.dumpLocalCFGToFile('pilot.cfg')
    if not res['OK']:
        # best-effort again: the dump failure is only logged
        self.log.error("Could not dump cfg to pilot.cfg", res['Message'])

    ret = getProxyInfo()
    if not ret['OK']:
        pilotProxy = None
    else:
        pilotProxy = ret['Value']['path']
    self.log.notice('Pilot Proxy:', pilotProxy)

    # NOTE(review): the incoming **kwargs is deliberately replaced here; from
    # this point on kwargs only carries the sudo-related task options
    kwargs = {'UseSudo': False}
    if self.useSudo:
        # pick the first sudo payload-user slot not already taken by a task
        for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
            if nUser not in self.userNumberPerTask.values():
                break
        kwargs['NUser'] = nUser
        kwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
        kwargs['UseSudo'] = True

    result = self.pPool.createAndQueueTask(executeJob,
                                           args=(executableFile, proxy, self.taskID),
                                           kwargs=kwargs,
                                           taskID=self.taskID,
                                           usePoolCallbacks=True)
    # record the processor reservation for this task before moving on
    self.processorsPerTask[self.taskID] = processorsForJob
    self.taskID += 1

    self.pPool.processResults()
    return result
def setUp(self):
    """Fixture set-up: parse the command line, enable log headers and spawn a daemonized 4-8 ProcessPool."""
    from DIRAC.Core.Base import Script
    Script.parseCommandLine()
    from DIRAC.FrameworkSystem.Client.Logger import gLogger
    gLogger.showHeaders(True)
    self.log = gLogger.getSubLogger(self.__class__.__name__)
    pool = ProcessPool(4, 8, 8)
    pool.daemonize()
    self.processPool = pool
def setUp( self ):
  """Fixture set-up.

  :param self: self reference
  """
  gLogger.showHeaders( True )
  self.log = gLogger.getSubLogger( self.__class__.__name__ )
  # pool wired to the pool-level callbacks defined on this test case
  pool = ProcessPool( 4, 8, 8,
                      poolCallback = self.poolCallback,
                      poolExceptionCallback = self.poolExceptionCallback )
  pool.daemonize()
  self.processPool = pool
def processPoolWithCallbacks2():
    """Build, daemonize and yield a 2-4 ProcessPool whose pool-level callbacks log results and exceptions."""
    gLogger.showHeaders(True)
    logger = gLogger.getSubLogger("TaskTimeOutTests")

    def _onResult(taskID, taskResult):
        # same message the original lambda callback produced
        logger.always("callback result for %s is %s" % (taskID, taskResult))

    def _onException(taskID, taskException):
        logger.always("callback exception for %s is %s" % (taskID, taskException))

    pool = ProcessPool(
        2, 4, 8,
        poolCallback=_onResult,
        poolExceptionCallback=_onException,
    )
    pool.daemonize()
    yield pool
def __init__(self, ceUniqueID, cores=0):
    """ Standard constructor.

    :param str ceUniqueID: unique CE identifier
    :param int cores: number of cores to use; 0 (the default) autodetects via getNumberOfCores()
    """
    ComputingElement.__init__(self, ceUniqueID)

    self.ceType = "Pool"
    self.submittedJobs = 0
    self.cores = cores if cores > 0 else getNumberOfCores()
    self.pPool = ProcessPool(self.cores, self.cores, poolCallback=self.finalizeJob)
    self.taskID = 0
    self.coresPerTask = {}
def processPool( self ):
  """ facade for ProcessPool """
  # hand back the cached pool when it already exists
  if self.__processPool:
    return self.__processPool
  nMin = max( 1, self.__minProcess )
  nMax = max( self.__minProcess, self.__maxProcess )
  qSize = abs( self.__queueSize )
  self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( nMin, nMax, qSize ) )
  self.__processPool = ProcessPool( nMin, nMax, qSize,
                                    poolCallback = self.resultCallback,
                                    poolExceptionCallback = self.exceptionCallback )
  self.__processPool.daemonize()
  return self.__processPool
def setUp(self):
    """Fixture set-up.

    :param self: self reference
    """
    from DIRAC.Core.Base import Script
    Script.parseCommandLine()
    from DIRAC.FrameworkSystem.Client.Logger import gLogger
    gLogger.showHeaders(True)
    self.log = gLogger.getSubLogger(self.__class__.__name__)
    # 2-4 pool (queue size 8) wired to the pool-level callbacks on this test case
    pool = ProcessPool(2, 4, 8,
                       poolCallback=self.poolCallback,
                       poolExceptionCallback=self.poolExceptionCallback)
    pool.daemonize()
    self.processPool = pool
def setUp( self ):
  """Fixture set-up: parse the command line, enable log headers and spawn a daemonized 4-8 ProcessPool."""
  from DIRAC.Core.Base import Script
  Script.parseCommandLine()
  from DIRAC.FrameworkSystem.Client.Logger import gLogger
  gLogger.showHeaders( True )
  self.log = gLogger.getSubLogger( self.__class__.__name__ )
  pool = ProcessPool( 4, 8, 8 )
  pool.daemonize()
  self.processPool = pool
def __init__(self, ceUniqueID, processors=0):
    """ Standard constructor.

    :param str ceUniqueID: unique CE identifier
    :param int processors: processors to use; 0 (the default) uses every CPU reported by multiprocessing
    """
    ComputingElement.__init__(self, ceUniqueID)

    self.ceType = "Pool"
    self.log = gLogger.getSubLogger('Pool')
    self.submittedJobs = 0
    self.processors = processors if processors > 0 else multiprocessing.cpu_count()
    self.pPool = ProcessPool(minSize=self.processors,
                             maxSize=self.processors,
                             poolCallback=self.finalizeJob)
    self.taskID = 0
    # per-task bookkeeping
    self.processorsPerTask = {}
    self.userNumberPerTask = {}
    self.useSudo = False
def submitJob(self, executableFile, proxy, **kwargs):
    """ Method to submit job.

    :param str executableFile: location of the executable file
    :param str proxy: payload proxy
    :return: S_OK/S_ERROR of the result of the job submission
    """
    # lazily (re)create the pool if it was not built yet
    if self.pPool is None:
        self.pPool = ProcessPool(minSize=self.processors,
                                 maxSize=self.processors,
                                 poolCallback=self.finalizeJob)

    # flush finished tasks so the slot accounting below is current
    self.pPool.processResults()

    processorsInUse = self.getProcessorsInUse()
    if kwargs.get('wholeNode'):
        # a whole-node job can only start on an idle node
        if processorsInUse > 0:
            return S_ERROR('Can not take WholeNode job')  # , %d/%d slots used' % (self.slotsInUse,self.slots) )
        else:
            requestedProcessors = self.processors
    elif "numberOfProcessors" in kwargs:
        requestedProcessors = int(kwargs['numberOfProcessors'])
        # NOTE(review): a non-positive numberOfProcessors skips this capacity
        # check and falls through to the one below — confirm this is intended
        if requestedProcessors > 0:
            if (processorsInUse + requestedProcessors) > self.processors:
                return S_ERROR('Not enough slots: requested %d, available %d' % (requestedProcessors,
                                                                                 self.processors - processorsInUse))
    else:
        # default request: a single processor
        requestedProcessors = 1
    if self.processors - processorsInUse < requestedProcessors:
        return S_ERROR('Not enough slots: requested %d, available %d' % (requestedProcessors,
                                                                         self.processors - processorsInUse))

    ret = getProxyInfo()
    if not ret['OK']:
        pilotProxy = None
    else:
        pilotProxy = ret['Value']['path']
    self.log.notice('Pilot Proxy:', pilotProxy)

    # NOTE(review): the incoming **kwargs is deliberately replaced here; from
    # this point on kwargs only carries the sudo-related task options
    kwargs = {'UseSudo': False}
    if self.useSudo:
        # pick the first sudo payload-user slot not already taken by a task
        for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
            if nUser not in self.userNumberPerTask.values():
                break
        kwargs['NUser'] = nUser
        kwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
        kwargs['UseSudo'] = True

    result = self.pPool.createAndQueueTask(executeJob,
                                           args=(executableFile, proxy, self.taskID),
                                           kwargs=kwargs,
                                           taskID=self.taskID,
                                           usePoolCallbacks=True)
    # record the processor reservation for this task
    self.processorsPerTask[self.taskID] = requestedProcessors
    self.taskID += 1

    self.pPool.processResults()
    return result
def __init__( self, ceUniqueID, cores = 0 ):
  """ Standard constructor.

  :param str ceUniqueID: unique CE identifier
  :param int cores: number of cores to use; 0 (the default) autodetects via getNumberOfCores()
  """
  ComputingElement.__init__( self, ceUniqueID )

  self.ceType = "Pool"
  self.submittedJobs = 0
  self.cores = cores if cores > 0 else getNumberOfCores()
  self.pPool = ProcessPool( self.cores, self.cores, poolCallback = self.finalizeJob )
  self.taskID = 0
  self.coresPerTask = {}
def setUp( self ):
  """Fixture set-up.

  :param self: self reference
  """
  from DIRAC.Core.Base import Script
  Script.parseCommandLine()
  from DIRAC.FrameworkSystem.Client.Logger import gLogger
  gLogger.showHeaders( True )
  self.log = gLogger.getSubLogger( self.__class__.__name__ )
  # 4-8 pool (queue size 8) wired to the pool-level callbacks on this test case
  pool = ProcessPool( 4, 8, 8,
                      poolCallback = self.poolCallback,
                      poolExceptionCallback = self.poolExceptionCallback )
  pool.daemonize()
  self.processPool = pool
def getCEStatus(self, jobIDList=None):
    """ Method to return information on running and pending jobs.

    :param jobIDList: ignored, present for interface compatibility
    :return: dictionary of numbers of jobs per status
    """
    # lazily (re)create the pool if it was not built yet
    if self.pPool is None:
        self.pPool = ProcessPool(minSize=self.processors,
                                 maxSize=self.processors,
                                 poolCallback=self.finalizeJob)

    self.pPool.processResults()

    # a task entry with a positive processor count is a running job
    running = sum(1 for busy in self.processorsPerTask.itervalues() if busy > 0)

    result = S_OK()
    result['SubmittedJobs'] = 0
    result['RunningJobs'] = running
    result['WaitingJobs'] = 0

    processorsInUse = self.getProcessorsInUse()
    result['UsedProcessors'] = processorsInUse
    result['AvailableProcessors'] = self.processors - processorsInUse
    return result
def runTest(): global nClients, nQueries, testType, resultTest, testDir, lfnListFile resultTest = [] pp = ProcessPool(nClients) testFunction = eval(testType) for c in xrange(nClients): pp.createAndQueueTask(testFunction, [nQueries], callback=finalize, exceptionCallback=doException) pp.processAllResults(3600) pp.finalize(0) timeResult = [] for testTime, success, failure in resultTest: #print testTime,success,failure timeResult += testTime averageTime, errorTime = doStats(timeResult) rateResult = [nClients / t for t in timeResult] averageRate, errorRate = doStats(rateResult) if testDir: print "\nTest results for clients %d, %s" % (nClients, testDir) else: print "\nTest results for clients %d, %s" % (nClients, lfnListFile) print "Query time: %.2f +/- %.2f" % (averageTime, errorTime) print "Query rate: %.2f +/- %.2f" % (averageRate, errorRate) return ((averageTime, errorTime), (averageRate, errorRate))
def processPool( self ):
  """ 'Live long and prosper, my dear ProcessPool' - Mr. Spock

  :param self: self reference
  :return: brand new shiny ProcessPool instance on first call, the same instance
           on subsequent calls
  """
  # hand back the cached pool when it already exists
  if self.__processPool:
    return self.__processPool
  lowWater = max( 1, self.__minProcess )
  highWater = max( self.__minProcess, self.__maxProcess )
  qSize = abs( self.__queueSize )
  self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( lowWater, highWater, qSize ) )
  self.log.info( "ProcessPool: tasks will use callbacks attached to ProcessPool" )
  self.__processPool = ProcessPool( lowWater, highWater, qSize,
                                    poolCallback = self.resultCallback,
                                    poolExceptionCallback = self.exceptionCallback )
  self.__processPool.daemonize()
  self.log.info( "ProcessPool: daemonized and ready")
  return self.__processPool
def runTest(): global nClients, nQueries, testType, resultTest, testDir, lfnListFile resultTest = [] pp = ProcessPool(nClients) testFunction = eval(testType) for c in xrange(nClients): pp.createAndQueueTask(testFunction, [nQueries], callback=finalize, exceptionCallback=doException) pp.processAllResults(3600) pp.finalize(0) timeResult = [] for testTime, success, failure in resultTest: # print testTime,success,failure timeResult += testTime averageTime, errorTime = doStats(timeResult) rateResult = [nClients / t for t in timeResult] averageRate, errorRate = doStats(rateResult) if testDir: print "\nTest results for clients %d, %s" % (nClients, testDir) else: print "\nTest results for clients %d, %s" % (nClients, lfnListFile) print "Query time: %.2f +/- %.2f" % (averageTime, errorTime) print "Query rate: %.2f +/- %.2f" % (averageRate, errorRate) return((averageTime, errorTime), (averageRate, errorRate))
class RequestExecutingAgent( AgentModule ):
  """ .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation handlers and RequestTask
  """
  # # process pool (built lazily by processPool())
  __processPool = None
  # # request cache: RequestID -> Request currently being executed
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None
  # # Size of the bulk if use of getRequests. If 0, use getRequest
  __bulkRequest = 0

  def __init__( self, *args, **kwargs ):
    """ c'tor

    Reads all ProcessPool/timeout tuning from the agent options, discovers the
    configured Operation handlers (and their per-operation/per-file timeouts)
    from the CS, and registers the monitoring activities.

    :raises AgentConfigError: when the OperationHandlers CS section is missing
    """
    # # call base class ctor
    AgentModule.__init__( self, *args, **kwargs )
    # # ProcessPool related stuff
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", self.__requestsPerCycle )
    self.log.info( "Requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", self.__minProcess )
    self.log.info( "ProcessPool min process = %d" % self.__minProcess )
    self.__maxProcess = self.am_getOption( "MaxProcess", 4 )
    self.log.info( "ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", self.__queueSize )
    self.log.info( "ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", self.__poolTimeout ) )
    self.log.info( "ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__poolSleep = int( self.am_getOption( "ProcessPoolSleep", self.__poolSleep ) )
    self.log.info( "ProcessPool sleep time = %d seconds" % self.__poolSleep )
    self.__bulkRequest = self.am_getOption( "BulkRequest", 0 )
    self.log.info( "Bulk request size = %d" % self.__bulkRequest )
    # # keep config path and agent name
    self.agentName = self.am_getModuleParam( "fullName" )
    self.__configPath = PathFinder.getAgentSection( self.agentName )

    # # operation handlers over here
    opHandlersPath = "%s/%s" % ( self.__configPath, "OperationHandlers" )
    opHandlers = gConfig.getSections( opHandlersPath )
    if not opHandlers["OK"]:
      self.log.error( opHandlers["Message" ] )
      raise AgentConfigError( "OperationHandlers section not found in CS under %s" % self.__configPath )
    opHandlers = opHandlers["Value"]
    self.timeOuts = dict()
    # # handlers dict: operation type -> handler module location
    self.handlersDict = dict()
    for opHandler in opHandlers:
      opHandlerPath = "%s/%s/Location" % ( opHandlersPath, opHandler )
      opLocation = gConfig.getValue( opHandlerPath, "" )
      if not opLocation:
        # handler without a Location is skipped, not fatal
        self.log.error( "%s not set for %s operation handler" % ( opHandlerPath, opHandler ) )
        continue
      # default timeouts, possibly overridden per handler below
      self.timeOuts[opHandler] = { "PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout }
      opTimeout = gConfig.getValue( "%s/%s/TimeOut" % ( opHandlersPath, opHandler ), 0 )
      if opTimeout:
        self.timeOuts[opHandler]["PerOperation"] = opTimeout
      fileTimeout = gConfig.getValue( "%s/%s/TimeOutPerFile" % ( opHandlersPath, opHandler ), 0 )
      if fileTimeout:
        self.timeOuts[opHandler]["PerFile"] = fileTimeout
      self.handlersDict[opHandler] = opLocation
    self.log.info( "Operation handlers:" )
    for item in enumerate( self.handlersDict.items() ):
      opHandler = item[1][0]
      self.log.info("[%s] %s: %s (timeout: %d s + %d s per file)" %
                    (item[0], item[1][0], item[1][1], self.timeOuts[opHandler]['PerOperation'],
                     self.timeOuts[opHandler]['PerFile']))
    # # common monitor activity
    gMonitor.registerActivity( "Iteration", "Agent Loops", "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Processed", "Request Processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    # # create request dict
    self.__requestCache = dict()

    # ?? Probably should be removed
    self.FTSMode = self.am_getOption( "FTSMode", False )

  def processPool( self ):
    """ facade for ProcessPool

    :return: the shared ProcessPool, created and daemonized on first call
    """
    if not self.__processPool:
      minProcess = max( 1, self.__minProcess )
      maxProcess = max( self.__minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess, maxProcess, queueSize ) )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient( self ):
    """ RequestClient getter (lazily constructed singleton) """
    if not self.__requestClient:
      self.__requestClient = ReqClient()
    return self.__requestClient

  def cacheRequest( self, request ):
    """ put request into requestCache

    :param ~Request.Request request: Request instance
    :return: S_OK, or S_ERROR(EALREADY) when the request is already cached
    """
    maxProcess = max( self.__minProcess, self.__maxProcess )
    if len( self.__requestCache ) > maxProcess + 50:
      # For the time being we just print a warning... If the ProcessPool is working well, this is not needed
      # We don't know how much is acceptable as it depends on many factors
      self.log.warn( "Too many requests in cache", ': %d' % len( self.__requestCache ) )
#      return S_ERROR( "Too many requests in cache" )
    if request.RequestID in self.__requestCache:
      # We don't call putRequest as we have got back the request that is still being executed. Better keep it
      # The main reason for this is that it lasted longer than the kick time of CleanReqAgent
      self.log.warn( "Duplicate request, keep it but don't execute", ': %d/%s' % ( request.RequestID, request.RequestName ) )
      return S_ERROR( errno.EALREADY, 'Request already in cache' )
    self.__requestCache[ request.RequestID ] = request
    return S_OK()

  def putRequest( self, requestID, taskResult = None ):
    """ put back :requestID: to RequestClient

    :param str requestID: request's id
    :param taskResult: optional S_OK(Request)/S_ERROR from the finished task,
        used to update the cached request before putting it back
    """
    if requestID in self.__requestCache:
      request = self.__requestCache.pop( requestID )
      if taskResult:
        if taskResult['OK']:
          request = taskResult['Value']
          # The RequestTask is putting back the Done tasks, no need to redo it
          if request.Status == 'Done':
            return S_OK()
        # In case of timeout, we need to increment ourselves all the attempts
        elif cmpError( taskResult, errno.ETIME ):
          waitingOp = request.getWaiting()
          for rmsFile in waitingOp.get( 'Value', [] ):
            rmsFile.Attempt += 1

      reset = self.requestClient().putRequest( request, useFailoverProxy = False, retryMainService = 2 )
      if not reset["OK"]:
        return S_ERROR( "putRequest: unable to reset request %s: %s" % ( requestID, reset["Message"] ) )
    else:
      return S_ERROR( 'Not in cache' )
    return S_OK()

  def putAllRequests( self ):
    """ put back all requests without callback called into requestClient

    :param self: self reference
    """
    self.log.info( "putAllRequests: will put %s back requests" % len( self.__requestCache ) )
    # iterate over a snapshot of the keys: putRequest pops from the cache
    for requestID in self.__requestCache.keys():
      reset = self.putRequest( requestID )
      if not reset["OK"]:
        self.log.error( 'Failed to put request', reset["Message"] )
      else:
        self.log.debug( "putAllRequests: request %s has been put back with its initial state" % requestID )
    return S_OK()

  def initialize( self ):
    """ initialize agent """
    return S_OK()

  def execute( self ):
    """ read requests from RequestClient and enqueue them into ProcessPool """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )
      requestsToExecute = []
      if not self.__bulkRequest:
        self.log.info( "execute: ask for a single request" )
        getRequest = self.requestClient().getRequest()
        if not getRequest["OK"]:
          self.log.error( "execute: %s" % getRequest["Message"] )
          break
        if not getRequest["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        requestsToExecute = [getRequest["Value"] ]
      else:
        # bulk mode: never fetch more than what is left in this cycle's budget
        numberOfRequest = min( self.__bulkRequest, self.__requestsPerCycle - taskCounter )
        self.log.info( "execute: ask for %s requests" % numberOfRequest )
        getRequests = self.requestClient().getBulkRequests( numberOfRequest )
        if not getRequests["OK"]:
          self.log.error( "execute: %s" % getRequests["Message"] )
          break
        if not getRequests["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        for rId in getRequests["Value"]["Failed"]:
          self.log.error( "execute: %s" % getRequests["Value"]["Failed"][rId] )
        requestsToExecute = getRequests["Value"]["Successful"].values()

      self.log.info( "execute: will execute %s requests " % len( requestsToExecute ) )

      for request in requestsToExecute:
        # # set task id
        taskID = request.RequestID

        self.log.info( "processPool tasks idle = %s working = %s" %
                       ( self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses() ) )

        looping = 0
        while True:
          if not self.processPool().getFreeSlots():
            if not looping:
              self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
            time.sleep( self.__poolSleep )
            looping += 1
          else:
            if looping:
              # NOTE(review): "%" binds tighter than "*", so this repeats the
              # formatted string poolSleep times instead of multiplying the
              # seconds; likely meant ("... %d seconds" % (looping * self.__poolSleep))
              self.log.info( "Free slot found after %d seconds" % looping * self.__poolSleep )
            looping = 0
            # # save current request in cache
            res = self.cacheRequest( request )
            if not res['OK']:
              if cmpError( res, errno.EALREADY ):
                # The request is already in the cache, skip it. break out of the while loop to get next request
                break
              # There are too many requests in the cache, commit suicide
              self.log.error( res['Message'], '(%d requests): put back all requests and exit cycle' % len( self.__requestCache ) )
              self.putAllRequests()
              return res
            # # serialize to JSON
            result = request.toJSON()
            if not result['OK']:
              # NOTE(review): continue retries the same request forever if
              # toJSON() keeps failing — confirm this is the intended behavior
              continue
            requestJSON = result['Value']
            self.log.info( "spawning task for request '%s/%s'" % ( request.RequestID, request.RequestName ) )
            timeOut = self.getTimeout( request )
            enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                             kwargs = { "requestJSON" : requestJSON,
                                                                        "handlersDict" : self.handlersDict,
                                                                        "csPath" : self.__configPath,
                                                                        "agentName": self.agentName },
                                                             taskID = taskID,
                                                             blocking = True,
                                                             usePoolCallbacks = True,
                                                             timeOut = timeOut )
            if not enqueue["OK"]:
              self.log.error( enqueue["Message"] )
            else:
              self.log.debug( "successfully enqueued task '%s'" % taskID )
              # # update monitor
              gMonitor.addMark( "Processed", 1 )
              # # update request counter
              taskCounter += 1
              # # task created, a little time kick to proceed
              time.sleep( 0.1 )
              break

    self.log.info( 'Flushing callbacks (%d requests still in cache)' % len( self.__requestCache ) )
    processed = self.processPool().processResults()
    # This happens when the result queue is screwed up.
    # Returning S_ERROR proved not to be sufficient,
    # and when in this situation, there is nothing we can do.
    # So we just exit. runit will restart from scratch.
    if processed < 0:
      self.log.fatal("Results queue is screwed up")
      sys.exit(1)
    # # clean return
    return S_OK()

  def getTimeout( self, request ):
    """ get timeout for request

    Sums, over every not-yet-finished operation, its per-operation timeout
    plus the per-file timeout multiplied by its number of files.
    """
    timeout = 0
    for op in request:
      if op.Status not in ( "Waiting", "Scheduled", 'Queued' ):
        continue
      if op.Type not in self.timeOuts:
        timeout += self.__operationTimeout
      else:
        perOp = self.timeOuts[op.Type].get( "PerOperation", self.__operationTimeout )
        perFiles = self.timeOuts[op.Type].get( "PerFile", self.__fileTimeout ) * len( op )
        timeout += perOp + perFiles
    self.log.info( "estimated timeOut for request (%s/%s) is %s" % ( request.RequestID, request.RequestName, timeout ) )
    return timeout

  def finalize( self ):
    """ agent finalization: drain the pool and put back any cached requests """
    if self.__processPool:
      self.processPool().finalize( timeout = self.__poolTimeout )
    self.putAllRequests()
    return S_OK()

  def resultCallback( self, taskID, taskResult ):
    """ definition of request callback function

    :param str taskID: Request.RequestID
    :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
    """
    # # clean cache
    res = self.putRequest( taskID, taskResult )
    self.log.info("callback: %s result is %s(%s), put %s(%s)" %
                  (taskID, "S_OK" if taskResult["OK"] else "S_ERROR",
                   taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"],
                   "S_OK" if res['OK'] else 'S_ERROR',
                   '' if res['OK'] else res['Message']))

  def exceptionCallback( self, taskID, taskException ):
    """ definition of exception callback function

    :param str taskID: Request.RequestID
    :param Exception taskException: Exception instance
    """
    self.log.error( "exceptionCallback: %s was hit by exception %s" % ( taskID, taskException ) )
    self.putRequest( taskID )
def processPool():
    """Build, daemonize and yield a 4-8 ProcessPool (queue size 8) for the TaskCallbacksTests suite."""
    gLogger.showHeaders(True)
    _log = gLogger.getSubLogger("TaskCallbacksTests")  # kept for parity with the other fixtures
    pool = ProcessPool(4, 8, 8)
    pool.daemonize()
    yield pool
class RequestExecutingAgent(AgentModule):
    """
    .. class:: RequestExecutingAgent

    request processing agent using ProcessPool, Operation handlers and RequestTask
    """
    # # process pool
    __processPool = None
    # # request cache
    __requestCache = {}
    # # requests/cycle
    __requestsPerCycle = 100
    # # minimal nb of subprocess running
    __minProcess = 2
    # # maximal nb of subprocess executed same time
    __maxProcess = 4
    # # ProcessPool queue size
    __queueSize = 20
    # # file timeout
    __fileTimeout = 300
    # # operation timeout
    __operationTimeout = 300
    # # ProcessTask default timeout in seconds
    __taskTimeout = 900
    # # ProcessPool finalization timeout
    __poolTimeout = 900
    # # ProcessPool sleep time
    __poolSleep = 5
    # # placeholder for RequestClient instance
    __requestClient = None
    # # Size of the bulk if use of getRequests. If 0, use getRequest
    __bulkRequest = 0

    def __init__(self, *args, **kwargs):
        """ c'tor

        Reads the ProcessPool tuning options from the agent configuration and
        builds the Operation handler / timeout bookkeeping tables.

        :raises AgentConfigError: when the OperationHandlers CS section is missing
        """
        # # call base class ctor
        AgentModule.__init__(self, *args, **kwargs)
        # # ProcessPool related stuff
        self.__requestsPerCycle = self.am_getOption("RequestsPerCycle", self.__requestsPerCycle)
        self.log.info("Requests/cycle = %d" % self.__requestsPerCycle)
        self.__minProcess = self.am_getOption("MinProcess", self.__minProcess)
        self.log.info("ProcessPool min process = %d" % self.__minProcess)
        # consistency fix: use the class default rather than a hard-coded 4
        self.__maxProcess = self.am_getOption("MaxProcess", self.__maxProcess)
        self.log.info("ProcessPool max process = %d" % self.__maxProcess)
        self.__queueSize = self.am_getOption("ProcessPoolQueueSize", self.__queueSize)
        self.log.info("ProcessPool queue size = %d" % self.__queueSize)
        self.__poolTimeout = int(self.am_getOption("ProcessPoolTimeout", self.__poolTimeout))
        self.log.info("ProcessPool timeout = %d seconds" % self.__poolTimeout)
        self.__poolSleep = int(self.am_getOption("ProcessPoolSleep", self.__poolSleep))
        self.log.info("ProcessPool sleep time = %d seconds" % self.__poolSleep)
        self.__bulkRequest = self.am_getOption("BulkRequest", 0)
        self.log.info("Bulk request size = %d" % self.__bulkRequest)
        # # keep config path and agent name
        self.agentName = self.am_getModuleParam("fullName")
        self.__configPath = PathFinder.getAgentSection(self.agentName)
        # # operation handlers over here
        opHandlersPath = "%s/%s" % (self.__configPath, "OperationHandlers")
        opHandlers = gConfig.getSections(opHandlersPath)
        if not opHandlers["OK"]:
            self.log.error(opHandlers["Message"])
            raise AgentConfigError("OperationHandlers section not found in CS under %s" % self.__configPath)
        opHandlers = opHandlers["Value"]
        self.timeOuts = dict()
        # # handlers dict
        self.handlersDict = dict()
        for opHandler in opHandlers:
            opHandlerPath = "%s/%s/Location" % (opHandlersPath, opHandler)
            opLocation = gConfig.getValue(opHandlerPath, "")
            if not opLocation:
                self.log.error("%s not set for %s operation handler" % (opHandlerPath, opHandler))
                continue
            self.timeOuts[opHandler] = {"PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout}
            opTimeout = gConfig.getValue("%s/%s/TimeOut" % (opHandlersPath, opHandler), 0)
            if opTimeout:
                self.timeOuts[opHandler]["PerOperation"] = opTimeout
            fileTimeout = gConfig.getValue("%s/%s/TimeOutPerFile" % (opHandlersPath, opHandler), 0)
            if fileTimeout:
                self.timeOuts[opHandler]["PerFile"] = fileTimeout
            self.handlersDict[opHandler] = opLocation
        self.log.info("Operation handlers:")
        # idiomatic unpacking instead of indexing the enumerate() tuple
        for index, (opHandler, opLocation) in enumerate(self.handlersDict.items()):
            self.log.info("[%s] %s: %s (timeout: %d s + %d s per file)" %
                          (index, opHandler, opLocation,
                           self.timeOuts[opHandler]['PerOperation'],
                           self.timeOuts[opHandler]['PerFile']))
        # # common monitor activity
        gMonitor.registerActivity("Iteration", "Agent Loops",
                                  "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("Processed", "Request Processed",
                                  "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("Done", "Request Completed",
                                  "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM)
        # # create request dict
        self.__requestCache = dict()
        # ?? Probably should be removed
        self.FTSMode = self.am_getOption("FTSMode", False)

    def processPool(self):
        """ facade for ProcessPool

        :return: the lazily-created, daemonized ProcessPool instance (same object on every call)
        """
        if not self.__processPool:
            minProcess = max(1, self.__minProcess)
            maxProcess = max(self.__minProcess, self.__maxProcess)
            queueSize = abs(self.__queueSize)
            self.log.info("REA ProcessPool configuration",
                          "minProcess = %d maxProcess = %d queueSize = %d" % (minProcess, maxProcess, queueSize))
            self.__processPool = ProcessPool(minProcess,
                                             maxProcess,
                                             queueSize,
                                             poolCallback=self.resultCallback,
                                             poolExceptionCallback=self.exceptionCallback)
            self.__processPool.daemonize()
        return self.__processPool

    def requestClient(self):
        """ RequestClient getter (lazy singleton) """
        if not self.__requestClient:
            self.__requestClient = ReqClient()
        return self.__requestClient

    def cacheRequest(self, request):
        """ put request into requestCache

        :param ~Request.Request request: Request instance
        """
        maxProcess = max(self.__minProcess, self.__maxProcess)
        if len(self.__requestCache) > maxProcess + 50:
            # For the time being we just print a warning... If the ProcessPool is working well, this is not needed
            # We don't know how much is acceptable as it depends on many factors
            self.log.warn("Too many requests in cache", ': %d' % len(self.__requestCache))
        if request.RequestID in self.__requestCache:
            # We don't call putRequest as we have got back the request that is still being executed. Better keep it
            # The main reason for this is that it lasted longer than the kick time of CleanReqAgent
            self.log.warn("Duplicate request, keep it but don't execute",
                          ': %d/%s' % (request.RequestID, request.RequestName))
            return S_ERROR(errno.EALREADY, 'Request already in cache')
        self.__requestCache[request.RequestID] = request
        return S_OK()

    def putRequest(self, requestID, taskResult=None):
        """ put back :requestID: to RequestClient

        :param str requestID: request's id
        :param dict taskResult: optional task result S_OK(Request)/S_ERROR(Message)
        """
        if requestID in self.__requestCache:
            request = self.__requestCache.pop(requestID)
            if taskResult:
                if taskResult['OK']:
                    request = taskResult['Value']
                    # The RequestTask is putting back the Done tasks, no need to redo it
                    if request.Status == 'Done':
                        return S_OK()
                # In case of timeout, we need to increment ourselves all the attempts
                elif cmpError(taskResult, errno.ETIME):
                    waitingOp = request.getWaiting()
                    for rmsFile in waitingOp.get('Value', []):
                        rmsFile.Attempt += 1
            reset = self.requestClient().putRequest(request, useFailoverProxy=False, retryMainService=2)
            if not reset["OK"]:
                return S_ERROR("putRequest: unable to reset request %s: %s" % (requestID, reset["Message"]))
        else:
            return S_ERROR('Not in cache')
        return S_OK()

    def putAllRequests(self):
        """ put back all requests without callback called into requestClient

        :param self: self reference
        """
        self.log.info("putAllRequests: will put back requests", "%s" % len(self.__requestCache))
        # BUGFIX: iterate over a snapshot -- putRequest() pops entries from the
        # cache, and mutating a dict while iterating its keys raises on Python 3
        for requestID in list(self.__requestCache):
            reset = self.putRequest(requestID)
            if not reset["OK"]:
                self.log.error('Failed to put request', reset["Message"])
            else:
                self.log.debug("putAllRequests: request %s has been put back with its initial state" % requestID)
        return S_OK()

    def initialize(self):
        """ initialize agent """
        return S_OK()

    def execute(self):
        """ read requests from RequestClient and enqueue them into ProcessPool """
        gMonitor.addMark("Iteration", 1)
        # # requests (and so tasks) counter
        taskCounter = 0
        while taskCounter < self.__requestsPerCycle:
            self.log.debug("execute: executing %d request in this cycle" % taskCounter)
            requestsToExecute = []
            if not self.__bulkRequest:
                self.log.info("execute: ask for a single request")
                getRequest = self.requestClient().getRequest()
                if not getRequest["OK"]:
                    self.log.error("execute:", "%s" % getRequest["Message"])
                    break
                if not getRequest["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                requestsToExecute = [getRequest["Value"]]
            else:
                numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
                self.log.info("execute: ask for requests", "%s" % numberOfRequest)
                getRequests = self.requestClient().getBulkRequests(numberOfRequest)
                if not getRequests["OK"]:
                    self.log.error("execute:", "%s" % getRequests["Message"])
                    break
                if not getRequests["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                for rId in getRequests["Value"]["Failed"]:
                    self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])
                requestsToExecute = getRequests["Value"]["Successful"].values()
            self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute))
            for request in requestsToExecute:
                # # set task id
                taskID = request.RequestID
                self.log.info("processPool status",
                              "tasks idle = %s working = %s" % (self.processPool().getNumIdleProcesses(),
                                                                self.processPool().getNumWorkingProcesses()))
                looping = 0
                while True:
                    if not self.processPool().getFreeSlots():
                        if not looping:
                            self.log.info("No free slots available in processPool",
                                          "will wait %d seconds to proceed" % self.__poolSleep)
                        time.sleep(self.__poolSleep)
                        looping += 1
                    else:
                        if looping:
                            # BUGFIX: parenthesise the product -- '%' binds tighter than '*',
                            # so the original repeated the formatted string __poolSleep times
                            self.log.info("Free slot found", "after %d seconds" % (looping * self.__poolSleep))
                        looping = 0
                        # # save current request in cache
                        res = self.cacheRequest(request)
                        if not res['OK']:
                            if cmpError(res, errno.EALREADY):
                                # The request is already in the cache, skip it.
                                # break out of the while loop to get next request
                                break
                            # There are too many requests in the cache, commit suicide
                            self.log.error("Too many requests in cache",
                                           '(%d requests): put back all requests and exit cycle. Error %s' % (
                                               len(self.__requestCache), res['Message']))
                            self.putAllRequests()
                            return res
                        # # serialize to JSON
                        result = request.toJSON()
                        if not result['OK']:
                            continue
                        requestJSON = result['Value']
                        self.log.info("spawning task for request",
                                      "'%s/%s'" % (request.RequestID, request.RequestName))
                        timeOut = self.getTimeout(request)
                        enqueue = self.processPool().createAndQueueTask(RequestTask,
                                                                        kwargs={"requestJSON": requestJSON,
                                                                                "handlersDict": self.handlersDict,
                                                                                "csPath": self.__configPath,
                                                                                "agentName": self.agentName},
                                                                        taskID=taskID,
                                                                        blocking=True,
                                                                        usePoolCallbacks=True,
                                                                        timeOut=timeOut)
                        if not enqueue["OK"]:
                            self.log.error("Could not enqueue task", enqueue["Message"])
                        else:
                            self.log.debug("successfully enqueued task", "'%s'" % taskID)
                            # # update monitor
                            gMonitor.addMark("Processed", 1)
                            # # update request counter
                            taskCounter += 1
                            # # task created, a little time kick to proceed
                            time.sleep(0.1)
                        break
        self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
        processed = self.processPool().processResults()
        # This happens when the result queue is screwed up.
        # Returning S_ERROR proved not to be sufficient,
        # and when in this situation, there is nothing we can do.
        # So we just exit. runit will restart from scratch.
        if processed < 0:
            self.log.fatal("Results queue is screwed up")
            sys.exit(1)
        # # clean return
        return S_OK()

    def getTimeout(self, request):
        """ get timeout for request

        :return: estimated per-task timeout summed over waiting operations
        """
        timeout = 0
        for op in request:
            if op.Status not in ("Waiting", "Scheduled", 'Queued'):
                continue
            if op.Type not in self.timeOuts:
                timeout += self.__operationTimeout
            else:
                perOp = self.timeOuts[op.Type].get("PerOperation", self.__operationTimeout)
                perFiles = self.timeOuts[op.Type].get("PerFile", self.__fileTimeout) * len(op)
                timeout += perOp + perFiles
        self.log.info("estimated timeOut for request",
                      "(%s/%s) is %s" % (request.RequestID, request.RequestName, timeout))
        return timeout

    def finalize(self):
        """ agent finalization """
        if self.__processPool:
            self.processPool().finalize(timeout=self.__poolTimeout)
        self.putAllRequests()
        return S_OK()

    def resultCallback(self, taskID, taskResult):
        """ definition of request callback function

        :param str taskID: Request.RequestID
        :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
        """
        # # clean cache
        res = self.putRequest(taskID, taskResult)
        self.log.info(
            "callback:",
            "%s result is %s(%s), put %s(%s)" %
            (taskID,
             "S_OK" if taskResult["OK"] else "S_ERROR",
             taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"],
             "S_OK" if res['OK'] else 'S_ERROR',
             '' if res['OK'] else res['Message']))

    def exceptionCallback(self, taskID, taskException):
        """ definition of exception callback function

        :param str taskID: Request.RequestID
        :param Exception taskException: Exception instance
        """
        self.log.error("exceptionCallback:", "%s was hit by exception %s" % (taskID, taskException))
        self.putRequest(taskID)
class PoolComputingElement(ComputingElement):

    mandatoryParameters = MandatoryParameters

    #############################################################################
    def __init__(self, ceUniqueID, cores=0):
        """ Standard constructor.

        :param str ceUniqueID: CE identifier
        :param int cores: number of cores; 0 means autodetect via getNumberOfCores()
        """
        ComputingElement.__init__(self, ceUniqueID)
        self.ceType = "Pool"
        self.submittedJobs = 0
        if cores > 0:
            self.cores = cores
        else:
            self.cores = getNumberOfCores()
        self.pPool = ProcessPool(self.cores, self.cores, poolCallback=self.finalizeJob)
        self.taskID = 0
        # taskID -> number of cores booked by that task
        self.coresPerTask = {}

    #############################################################################
    def _addCEConfigDefaults(self):
        """Method to make sure all necessary Configuration Parameters are defined """
        # First assure that any global parameters are loaded
        ComputingElement._addCEConfigDefaults(self)

    def getCoresInUse(self):
        """ Total number of cores booked by the currently tracked tasks. """
        return sum(self.coresPerTask.values())

    #############################################################################
    def submitJob(self, executableFile, proxy, **kwargs):
        """ Method to submit job.

        :param str executableFile: location of the executable file
        :param str proxy: payload proxy
        :return: S_OK/S_ERROR of the result of the job submission
        """
        self.pPool.processResults()
        coresInUse = self.getCoresInUse()
        if "WholeNode" in kwargs and kwargs['WholeNode']:
            if coresInUse > 0:
                return S_ERROR('Can not take WholeNode job')  # , %d/%d slots used' % (self.slotsInUse,self.slots) )
            else:
                requestedCores = self.cores
        elif "NumberOfCores" in kwargs:
            requestedCores = int(kwargs['NumberOfCores'])
            if requestedCores > 0:
                if (coresInUse + requestedCores) > self.cores:
                    return S_ERROR('Not enough slots: requested %d, available %d'
                                   % (requestedCores, self.cores - coresInUse))
        else:
            requestedCores = 1
        if self.cores - coresInUse < requestedCores:
            return S_ERROR('Not enough slots: requested %d, available %d' %
                           (requestedCores, self.cores - coresInUse))

        ret = getProxyInfo()
        if not ret['OK']:
            pilotProxy = None
        else:
            pilotProxy = ret['Value']['path']
        self.log.notice('Pilot Proxy:', pilotProxy)

        result = self.pPool.createAndQueueTask(executeJob,
                                               [executableFile, proxy, self.taskID],
                                               None,
                                               self.taskID,
                                               usePoolCallbacks=True)
        # BUGFIX: book the cores for the task that was just queued *before*
        # advancing taskID -- the original incremented first, so the booking
        # landed on the wrong (future) task id and finalizeJob's
        # `del self.coresPerTask[taskID]` raised KeyError for the finished task
        self.coresPerTask[self.taskID] = requestedCores
        self.taskID += 1

        self.pPool.processResults()

        return result

    def finalizeJob(self, taskID, result):
        """ Finalize the job: release the cores booked by *taskID*. """
        del self.coresPerTask[taskID]

    #############################################################################
    def getCEStatus(self):
        """ Method to return information on running and pending jobs. """
        self.pPool.processResults()
        result = S_OK()
        result['SubmittedJobs'] = 0
        # a task with a positive booking is considered a running job
        nJobs = sum(1 for cores in self.coresPerTask.values() if cores > 0)
        result['RunningJobs'] = nJobs
        result['WaitingJobs'] = 0
        coresInUse = self.getCoresInUse()
        result['UsedCores'] = coresInUse
        result['AvailableCores'] = self.cores - coresInUse
        return result

    #############################################################################
    def monitorProxy(self, pilotProxy, payloadProxy):
        """ Monitor the payload proxy and renew as necessary. """
        return self._monitorProxy(pilotProxy, payloadProxy)
class RequestExecutingAgent( AgentModule ):
  """
  .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation handlers and RequestTask
  """
  # # process pool
  __processPool = None
  # # request cache
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None
  # # Size of the bulk if use of getRequests. If 0, use getRequest
  __bulkRequest = 0

  def __init__( self, *args, **kwargs ):
    """ c'tor """
    # # call base class ctor
    AgentModule.__init__( self, *args, **kwargs )
    # # ProcessPool related stuff
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", self.__requestsPerCycle )
    self.log.info( "Requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", self.__minProcess )
    self.log.info( "ProcessPool min process = %d" % self.__minProcess )
    self.__maxProcess = self.am_getOption( "MaxProcess", 4 )
    self.log.info( "ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", self.__queueSize )
    self.log.info( "ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", self.__poolTimeout ) )
    self.log.info( "ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__poolSleep = int( self.am_getOption( "ProcessPoolSleep", self.__poolSleep ) )
    self.log.info( "ProcessPool sleep time = %d seconds" % self.__poolSleep )
    self.__taskTimeout = int( self.am_getOption( "ProcessTaskTimeout", self.__taskTimeout ) )
    self.log.info( "ProcessTask timeout = %d seconds" % self.__taskTimeout )
    self.__bulkRequest = self.am_getOption( "BulkRequest", 0 )
    self.log.info( "Bulk request size = %d" % self.__bulkRequest )
    # # keep config path and agent name
    self.agentName = self.am_getModuleParam( "fullName" )
    self.__configPath = PathFinder.getAgentSection( self.agentName )
    # # operation handlers over here
    opHandlersPath = "%s/%s" % ( self.__configPath, "OperationHandlers" )
    opHandlers = gConfig.getSections( opHandlersPath )
    if not opHandlers["OK"]:
      self.log.error( opHandlers["Message"] )
      raise AgentConfigError( "OperationHandlers section not found in CS under %s" % self.__configPath )
    opHandlers = opHandlers["Value"]
    self.timeOuts = dict()
    # # handlers dict
    self.handlersDict = dict()
    for opHandler in opHandlers:
      opHandlerPath = "%s/%s/Location" % ( opHandlersPath, opHandler )
      opLocation = gConfig.getValue( opHandlerPath, "" )
      if not opLocation:
        self.log.error( "%s not set for %s operation handler" % ( opHandlerPath, opHandler ) )
        continue
      self.timeOuts[opHandler] = { "PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout }
      opTimeout = gConfig.getValue( "%s/%s/TimeOut" % ( opHandlersPath, opHandler ), 0 )
      if opTimeout:
        self.timeOuts[opHandler]["PerOperation"] = opTimeout
      fileTimeout = gConfig.getValue( "%s/%s/TimeOutPerFile" % ( opHandlersPath, opHandler ), 0 )
      if fileTimeout:
        self.timeOuts[opHandler]["PerFile"] = fileTimeout
      self.handlersDict[opHandler] = opLocation
    self.log.info( "Operation handlers:" )
    for item in enumerate( self.handlersDict.items() ):
      opHandler = item[1][0]
      self.log.info( "[%s] %s: %s (timeout: %d s + %d s per file)" % ( item[0], item[1][0], item[1][1],
                                                                       self.timeOuts[opHandler]['PerOperation'],
                                                                       self.timeOuts[opHandler]['PerFile'] ) )
    # # common monitor activity
    gMonitor.registerActivity( "Iteration", "Agent Loops",
                               "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Processed", "Request Processed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    # # create request dict
    self.__requestCache = dict()

    self.FTSMode = self.am_getOption( "FTSMode", False )

  def processPool( self ):
    """ facade for ProcessPool

    :return: the lazily-created, daemonized ProcessPool instance
    """
    if not self.__processPool:
      minProcess = max( 1, self.__minProcess )
      maxProcess = max( self.__minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                       maxProcess,
                                                                                       queueSize ) )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient( self ):
    """ RequestClient getter """
    if not self.__requestClient:
      self.__requestClient = ReqClient()
    return self.__requestClient

  def cacheRequest( self, request ):
    """ put request into requestCache

    :param Request request: Request instance
    """
    count = 5
    # Wait a bit as there may be a race condition between RequestTask putting back the request
    # and the callback clearing the cache
    while request.RequestID in self.__requestCache:
      count -= 1
      if not count:
        self.requestClient().putRequest( request, useFailoverProxy = False, retryMainService = 2 )
        return S_ERROR( "Duplicate request, ignore: %s" % request.RequestID )
      time.sleep( 1 )
    self.__requestCache[ request.RequestID ] = request
    return S_OK()

  def putRequest( self, requestID, taskResult = None ):
    """ put back :requestID: to RequestClient

    :param str requestID: request's id
    """
    if requestID in self.__requestCache:
      request = self.__requestCache.pop( requestID )
      if taskResult and taskResult['OK']:
        request = taskResult['Value']
      reset = self.requestClient().putRequest( request, useFailoverProxy = False, retryMainService = 2 )
      if not reset["OK"]:
        return S_ERROR( "putRequest: unable to reset request %s: %s" % ( requestID, reset["Message"] ) )
    else:
      return S_ERROR( 'Not in cache' )
    return S_OK()

  def putAllRequests( self ):
    """ put back all requests without callback called into requestClient

    :param self: self reference
    """
    self.log.info( "putAllRequests: will put %s back requests" % len( self.__requestCache ) )
    # BUGFIX: iterate over a snapshot -- putRequest() pops entries from the cache,
    # and mutating a dict while iterating its keys raises on Python 3
    for requestID in list( self.__requestCache ):
      reset = self.putRequest( requestID )
      if not reset["OK"]:
        self.log.error( 'Failed to put request', reset["Message"] )
      else:
        self.log.debug( "putAllRequests: request %s has been put back with its initial state" % requestID )
    return S_OK()

  def initialize( self ):
    """ initialize agent """
    return S_OK()

  def execute( self ):
    """ read requests from RequestClient and enqueue them into ProcessPool """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )
      requestsToExecute = []
      if not self.__bulkRequest:
        self.log.info( "execute: ask for a single request" )
        getRequest = self.requestClient().getRequest()
        if not getRequest["OK"]:
          self.log.error( "execute: %s" % getRequest["Message"] )
          break
        if not getRequest["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        requestsToExecute = [ getRequest["Value"] ]
      else:
        numberOfRequest = min( self.__bulkRequest, self.__requestsPerCycle - taskCounter )
        self.log.info( "execute: ask for %s requests" % numberOfRequest )
        getRequests = self.requestClient().getBulkRequests( numberOfRequest )
        if not getRequests["OK"]:
          self.log.error( "execute: %s" % getRequests["Message"] )
          break
        if not getRequests["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        for rId in getRequests["Value"]["Failed"]:
          self.log.error( "execute: %s" % getRequests["Value"]["Failed"][rId] )
        requestsToExecute = getRequests["Value"]["Successful"].values()
      self.log.info( "execute: will execute %s requests " % len( requestsToExecute ) )
      for request in requestsToExecute:
        # # set task id
        taskID = request.RequestID
        # # save current request in cache
        self.cacheRequest( request )
        # # serialize to JSON
        result = request.toJSON()
        if not result['OK']:
          continue
        requestJSON = result['Value']
        self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(),
                                                                      self.processPool().getNumWorkingProcesses() ) )
        looping = 0
        while True:
          if not self.processPool().getFreeSlots():
            if not looping:
              self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
            time.sleep( self.__poolSleep )
            looping += 1
          else:
            if looping:
              # BUGFIX: parenthesise the product -- '%' binds tighter than '*',
              # so the original repeated the formatted string __poolSleep times
              self.log.info( "Free slot found after %d seconds" % ( looping * self.__poolSleep ) )
            looping = 0
            self.log.info( "spawning task for request '%s/%s'" % ( request.RequestID, request.RequestName ) )
            timeOut = self.getTimeout( request )
            enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                             kwargs = { "requestJSON" : requestJSON,
                                                                        "handlersDict" : self.handlersDict,
                                                                        "csPath" : self.__configPath,
                                                                        "agentName": self.agentName },
                                                             taskID = taskID,
                                                             blocking = True,
                                                             usePoolCallbacks = True,
                                                             timeOut = timeOut )
            if not enqueue["OK"]:
              self.log.error( enqueue["Message"] )
            else:
              self.log.debug( "successfully enqueued task '%s'" % taskID )
              # # update monitor
              gMonitor.addMark( "Processed", 1 )
              # # update request counter
              taskCounter += 1
              # # task created, a little time kick to proceed
              time.sleep( 0.1 )
            break
    # # clean return
    return S_OK()

  def getTimeout( self, request ):
    """ get timeout for request """
    timeout = 0
    for op in request:
      if op.Status not in ( "Waiting", "Scheduled", 'Queued' ):
        continue
      if op.Type not in self.timeOuts:
        timeout += self.__operationTimeout
      else:
        perOp = self.timeOuts[op.Type].get( "PerOperation", self.__operationTimeout )
        perFiles = self.timeOuts[op.Type].get( "PerFile", self.__fileTimeout ) * len( op )
        timeout += perOp + perFiles
    self.log.info( "estimated timeOut for request (%s/%s) is %s" % ( request.RequestID,
                                                                     request.RequestName,
                                                                     timeout ) )
    return timeout

  def finalize( self ):
    """ agent finalization """
    if self.__processPool:
      self.processPool().finalize( timeout = self.__poolTimeout )
    self.putAllRequests()
    return S_OK()

  def resultCallback( self, taskID, taskResult ):
    """ definition of request callback function

    :param str taskID: Request.RequestID
    :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
    """
    # # clean cache
    res = self.putRequest( taskID, taskResult )
    self.log.info( "callback: %s result is %s(%s), put %s(%s)" % ( taskID,
                                                                   "S_OK" if taskResult["OK"] else "S_ERROR",
                                                                   taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"],
                                                                   "S_OK" if res['OK'] else 'S_ERROR',
                                                                   '' if res['OK'] else res['Message'] ) )

  def exceptionCallback( self, taskID, taskException ):
    """ definition of exception callback function

    :param str taskID: Request.RequestID
    :param Exception taskException: Exception instance
    """
    self.log.error( "exceptionCallback: %s was hit by exception %s" % ( taskID, taskException ) )
    self.putRequest( taskID )
for path in result['Value']['Directories']: random.shuffle(lfcHosts) #print pPool.getNumWorkingProcesses(), pPool.hasPendingTasks() print "Queueing task for directory %s, lfc %s" % ( path, lfcHosts[0] ) result = pPool.createAndQueueTask( processDir, [path , writerQueue, False, lfcHosts[0]], callback = finalizeDirectory ) if not result['OK']: print "Failed queueing %s" % path else: print "Task failed: %s" % result['Message'] if 'Path' in result: random.shuffle(lfcHosts) print "Requeueing task for directory %s, lfc %s" % ( result['Path'], lfcHosts[0] ) ######################################################################### pPool = ProcessPool(30,40,0) manager = Manager() writerQueue = manager.Queue() stopFlag = Value( 'i', 0 ) #pPool.daemonize() # lfcHosts = ['lfc-lhcb-ro.cern.ch', # 'lfc-lhcb-ro.cr.cnaf.infn.it', # 'lhcb-lfc-fzk.gridka.de', # 'lfc-lhcb-ro.in2p3.fr', # 'lfc-lhcb.grid.sara.nl', # 'lfclhcb.pic.es', # 'lhcb-lfc.gridpp.rl.ac.uk'] lfcHosts = ['prod-lfc-lhcb-ro.cern.ch']
def setUp( self ):
  """ Per-test fixture: a named sub-logger plus a daemonized ProcessPool( 4, 8, 8 ). """
  gLogger.showHeaders( True )
  self.log = gLogger.getSubLogger( self.__class__.__name__ )
  pool = ProcessPool( 4, 8, 8 )
  pool.daemonize()
  self.processPool = pool
class TaskTimeOutTests( unittest.TestCase ):
  """
  .. class:: TaskTimeOutTests
  test case for ProcessPool
  """

  def setUp( self ):
    """c'tor

    :param self: self reference
    """
    # local imports so DIRAC command-line parsing happens inside the test fixture
    from DIRAC.Core.Base import Script
    Script.parseCommandLine()
    from DIRAC.FrameworkSystem.Client.Logger import gLogger
    gLogger.showHeaders( True )
    self.log = gLogger.getSubLogger( self.__class__.__name__ )
    # small pool (2..4 workers, queue of 8) wired to the callbacks below
    self.processPool = ProcessPool( 2, 4, 8,
                                    poolCallback = self.poolCallback,
                                    poolExceptionCallback = self.poolExceptionCallback )
    self.processPool.daemonize()

  def poolCallback( self, taskID, taskResult ):
    # result callback: just log what came back
    self.log.always( "callback result for %s is %s" % ( taskID, taskResult ) )

  def poolExceptionCallback( self, taskID, taskException ):
    # exception callback: just log the exception
    self.log.always( "callback exception for %s is %s" % ( taskID, taskException ) )

  def testCallableClass( self ):
    """ CallableClass and task time out test """
    i = 0
    while True:
      if self.processPool.getFreeSlots() > 0:
        # timeWait in {0,10,...,50}; 0 triggers the exception path instead of sleeping
        timeWait = random.randint( 0, 5 ) * 10
        raiseException = False
        if not timeWait:
          raiseException = True
        # timeOut=15 < most timeWait values, so many tasks are expected to time out
        result = self.processPool.createAndQueueTask( CallableClass,
                                                      taskID = i,
                                                      args = ( i, timeWait, raiseException ),
                                                      timeOut = 15,
                                                      usePoolCallbacks = True,
                                                      blocking = True )
        if result["OK"]:
          self.log.always("CallableClass enqueued to task %s timeWait=%s exception=%s" % ( i, timeWait, raiseException ) )
          i += 1
        else:
          continue
      if i == 16:
        break
    self.processPool.finalize( 2 )

  def testCallableFunc( self ):
    """ CallableFunc and task timeout test """
    i = 0
    while True:
      if self.processPool.getFreeSlots() > 0:
        # timeWait in {0,5,...,25}; 0 triggers the exception path
        timeWait = random.randint(0, 5) * 5
        raiseException = False
        if not timeWait:
          raiseException = True
        result = self.processPool.createAndQueueTask( CallableFunc,
                                                      taskID = i,
                                                      args = ( i, timeWait, raiseException ),
                                                      timeOut = 15,
                                                      usePoolCallbacks = True,
                                                      blocking = True )
        if result["OK"]:
          self.log.always("CallableFunc enqueued to task %s timeWait=%s exception=%s" % ( i, timeWait, raiseException ) )
          i += 1
        else:
          continue
      if i == 16:
        break
    self.processPool.finalize( 2 )

  def testLockedClass( self ):
    """ LockedCallableClass and task time out test """
    for loop in range(2):
      self.log.always( "loop %s" % loop )
      i = 0
      while i < 16:
        if self.processPool.getFreeSlots() > 0:
          # timeWait in {0,5,...,25}; exactly 5 triggers the exception path,
          # >= 20 switches to the gLock-holding LockedCallableClass
          timeWait = random.randint(0, 5) * 5
          raiseException = False
          if timeWait == 5:
            raiseException = True
          klass = CallableClass
          if timeWait >= 20:
            klass = LockedCallableClass
          result = self.processPool.createAndQueueTask( klass,
                                                        taskID = i,
                                                        args = ( i, timeWait, raiseException ),
                                                        timeOut = 15,
                                                        usePoolCallbacks = True,
                                                        blocking = True )
          if result["OK"]:
            self.log.always("%s enqueued to task %s timeWait=%s exception=%s" % ( klass.__name__ , i, timeWait, raiseException ) )
            i += 1
          else:
            continue
      self.log.always("being idle for a while")
      # deliberate CPU busy-wait to let queued tasks drain between loops
      # NOTE(review): structure reconstructed from collapsed source -- confirm the
      # idle loop belongs inside the outer `for loop` as placed here
      for i in range(100000):
        for j in range(1000):
          pass
    self.log.always("finalizing...")
    self.processPool.finalize( 10 )
    ## unlock
    gLock.release()
def submitJob(self, executableFile, proxy, **kwargs):
  """ Method to submit job.

  The job is executed asynchronously through the internal ProcessPool; the
  pool callback (finalizeJob) frees the occupied processor slots when the
  task finishes.

  :param str executableFile: location of the executable file
  :param str proxy: payload proxy
  :return: S_OK/S_ERROR of the result of the job submission
  """
  if self.pPool is None:
    # lazily create the pool, sized to the number of processors of this CE
    self.pPool = ProcessPool(minSize=self.processors,
                             maxSize=self.processors,
                             poolCallback=self.finalizeJob)

  self.pPool.processResults()

  # work out how many processor slots this job needs
  processorsInUse = self.getProcessorsInUse()
  if kwargs.get('wholeNode'):
    if processorsInUse > 0:
      return S_ERROR('Can not take WholeNode job')  # , %d/%d slots used' % (self.slotsInUse,self.slots) )
    else:
      requestedProcessors = self.processors
  elif "numberOfProcessors" in kwargs:
    requestedProcessors = int(kwargs['numberOfProcessors'])
    if requestedProcessors > 0:
      if (processorsInUse + requestedProcessors) > self.processors:
        return S_ERROR('Not enough slots: requested %d, available %d'
                       % (requestedProcessors, self.processors - processorsInUse))
  else:
    requestedProcessors = 1
  if self.processors - processorsInUse < requestedProcessors:
    return S_ERROR('Not enough slots: requested %d, available %d'
                   % (requestedProcessors, self.processors - processorsInUse))

  ret = getProxyInfo()
  if not ret['OK']:
    pilotProxy = None
  else:
    pilotProxy = ret['Value']['path']
  self.log.notice('Pilot Proxy:', pilotProxy)

  # Fix: do not shadow the caller-supplied **kwargs -- build a dedicated
  # argument dict for the task instead of reassigning `kwargs`
  taskKwargs = {'UseSudo': False}
  if self.useSudo:
    # pick the first sudo user number not already assigned to a task
    for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
      if nUser not in self.userNumberPerTask.values():
        break
    taskKwargs['NUser'] = nUser
    taskKwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
    taskKwargs['UseSudo'] = True

  result = self.pPool.createAndQueueTask(executeJob,
                                         args=(executableFile, proxy, self.taskID),
                                         kwargs=taskKwargs,
                                         taskID=self.taskID,
                                         usePoolCallbacks=True)
  self.processorsPerTask[self.taskID] = requestedProcessors
  self.taskID += 1

  self.pPool.processResults()

  return result
# NOTE(review): this chunk looks like out-of-order fragments of a larger
# LFC-scanning script -- `pPool`, `writerQueue` and `lfcHosts` are referenced
# above the place where they are created below; `processDir`, `path` and
# `finalizeDirectory` are defined elsewhere.  Verify against the full script.
result = pPool.createAndQueueTask( processDir,
                                   [path, writerQueue, False, lfcHosts[0]],
                                   callback=finalizeDirectory)
if not result['OK']:
  print("Failed queueing %s" % path)
else:
  # NOTE(review): reporting "Task failed" on the successful-enqueue branch
  # looks suspicious -- presumably `result` here is a task *result*, not the
  # enqueue status; confirm against the original script
  print("Task failed: %s" % result['Message'])
  if 'Path' in result:
    # retry the directory against a randomly chosen LFC mirror
    random.shuffle(lfcHosts)
    print("Requeueing task for directory %s, lfc %s" % (result['Path'], lfcHosts[0]))

#########################################################################
# script setup: process pool plus shared queue/flag for the writer process
pPool = ProcessPool(30, 40, 0)
manager = Manager()
writerQueue = manager.Queue()
stopFlag = Value('i', 0)

# pPool.daemonize()

# full list of read-only LFC mirrors, kept for reference:
# lfcHosts = ['lfc-lhcb-ro.cern.ch',
#             'lfc-lhcb-ro.cr.cnaf.infn.it',
#             'lhcb-lfc-fzk.gridka.de',
#             'lfc-lhcb-ro.in2p3.fr',
#             'lfc-lhcb.grid.sara.nl',
#             'lfclhcb.pic.es',
#             'lhcb-lfc.gridpp.rl.ac.uk']
lfcHosts = ['prod-lfc-lhcb-ro.cern.ch']
class PoolComputingElement(ComputingElement):
  """ In-process "Pool" computing element: jobs run locally through a
  ProcessPool, with bookkeeping of the processor slots each task occupies.
  """

  mandatoryParameters = MandatoryParameters

  #############################################################################
  def __init__(self, ceUniqueID):
    """ Standard constructor.

    :param str ceUniqueID: unique name of this CE instance
    """
    ComputingElement.__init__(self, ceUniqueID)
    self.ceType = "Pool"
    self.log = gLogger.getSubLogger('Pool')
    self.submittedJobs = 0
    self.processors = 1          # total processor slots, refreshed in _reset()
    self.pPool = None            # lazily created ProcessPool
    self.taskID = 0              # monotonically increasing local task counter
    self.processorsPerTask = {}  # taskID -> number of slots occupied
    self.userNumberPerTask = {}  # taskID -> sudo user number (when useSudo)
    self.useSudo = False

  #############################################################################
  def _addCEConfigDefaults(self):
    """Method to make sure all necessary Configuration Parameters are defined
    """
    # First assure that any global parameters are loaded
    ComputingElement._addCEConfigDefaults(self)

  def _reset(self):
    """ Update internal variables after some extra parameters are added

    :return: None
    """
    self.processors = int(self.ceParameters.get('NumberOfProcessors', self.processors))
    self.ceParameters['MaxTotalJobs'] = self.processors
    # NOTE(review): if SudoExecution arrives from the CS as the string 'False'
    # this evaluates truthy -- confirm the expected parameter type
    self.useSudo = self.ceParameters.get('SudoExecution', False)

  def getProcessorsInUse(self):
    """ Get the number of currently allocated processor cores

    :return: number of processor cores
    """
    # idiomatic sum over the per-task slot counts
    return sum(self.processorsPerTask.values())

  #############################################################################
  def submitJob(self, executableFile, proxy, **kwargs):
    """ Method to submit job.

    The job is executed asynchronously through the internal ProcessPool;
    finalizeJob (the pool callback) frees the slots when the task ends.

    :param str executableFile: location of the executable file
    :param str proxy: payload proxy
    :return: S_OK/S_ERROR of the result of the job submission
    """
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    self.pPool.processResults()

    processorsInUse = self.getProcessorsInUse()
    if kwargs.get('wholeNode'):
      if processorsInUse > 0:
        return S_ERROR('Can not take WholeNode job')  # , %d/%d slots used' % (self.slotsInUse,self.slots) )
      else:
        requestedProcessors = self.processors
    elif "numberOfProcessors" in kwargs:
      requestedProcessors = int(kwargs['numberOfProcessors'])
      if requestedProcessors > 0:
        if (processorsInUse + requestedProcessors) > self.processors:
          return S_ERROR('Not enough slots: requested %d, available %d'
                         % (requestedProcessors, self.processors - processorsInUse))
    else:
      requestedProcessors = 1
    if self.processors - processorsInUse < requestedProcessors:
      return S_ERROR('Not enough slots: requested %d, available %d'
                     % (requestedProcessors, self.processors - processorsInUse))

    ret = getProxyInfo()
    if not ret['OK']:
      pilotProxy = None
    else:
      pilotProxy = ret['Value']['path']
    self.log.notice('Pilot Proxy:', pilotProxy)

    # Fix: do not shadow the caller-supplied **kwargs -- build a dedicated
    # dict of arguments for the task instead of reassigning `kwargs`
    taskKwargs = {'UseSudo': False}
    if self.useSudo:
      # pick the first sudo user number not already assigned to a task
      for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
        if nUser not in self.userNumberPerTask.values():
          break
      taskKwargs['NUser'] = nUser
      taskKwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
      taskKwargs['UseSudo'] = True

    result = self.pPool.createAndQueueTask(executeJob,
                                           args=(executableFile, proxy, self.taskID),
                                           kwargs=taskKwargs,
                                           taskID=self.taskID,
                                           usePoolCallbacks=True)
    self.processorsPerTask[self.taskID] = requestedProcessors
    self.taskID += 1

    self.pPool.processResults()

    return result

  def finalizeJob(self, taskID, result):
    """ Finalize the job by updating the process utilisation counters

    :param int taskID: local PoolCE task ID
    :param dict result: result of the job execution
    """
    nProc = self.processorsPerTask.pop(taskID)
    if result['OK']:
      self.log.info('Task %d finished successfully, %d processor(s) freed' % (taskID, nProc))
    else:
      self.log.error("Task failed submission", "%d, message: %s" % (taskID, result['Message']))

  #############################################################################
  def getCEStatus(self, jobIDList=None):
    """ Method to return information on running and pending jobs.

    :return: dictionary of numbers of jobs per status
    """
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    self.pPool.processResults()
    result = S_OK()
    result['SubmittedJobs'] = 0
    # fix: .items()/.values() instead of py2-only .iteritems()
    nJobs = 0
    for value in self.processorsPerTask.values():
      if value > 0:
        nJobs += 1
    result['RunningJobs'] = nJobs
    result['WaitingJobs'] = 0
    processorsInUse = self.getProcessorsInUse()
    result['UsedProcessors'] = processorsInUse
    result['AvailableProcessors'] = self.processors - processorsInUse
    return result

  #############################################################################
  def monitorProxy(self, pilotProxy, payloadProxy):
    """ Monitor the payload proxy and renew as necessary.

    :param str pilotProxy: location of the pilotProxy
    :param str payloadProxy: location of the payloadProxy
    """
    return self._monitorProxy(pilotProxy, payloadProxy)
class RequestExecutingAgent( AgentModule ):
  """
  .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation
  handlers and RequestTask
  """
  # # process pool
  __processPool = None
  # # request cache
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None

  def __init__( self, *args, **kwargs ):
    """ c'tor

    Reads all ProcessPool tuning options, discovers the configured Operation
    handlers (with their per-operation/per-file timeouts) and registers the
    monitoring activities.

    :raises AgentConfigError: if the OperationHandlers CS section is missing
    """
    # # call base class ctor
    AgentModule.__init__( self, *args, **kwargs )
    # # ProcessPool related stuff
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", self.__requestsPerCycle )
    self.log.info( "Requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", self.__minProcess )
    self.log.info( "ProcessPool min process = %d" % self.__minProcess )
    # fix: use the class-level default for consistency with the other options
    # (same value, but no longer duplicated as a magic constant)
    self.__maxProcess = self.am_getOption( "MaxProcess", self.__maxProcess )
    self.log.info( "ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", self.__queueSize )
    self.log.info( "ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", self.__poolTimeout ) )
    self.log.info( "ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__poolSleep = int( self.am_getOption( "ProcessPoolSleep", self.__poolSleep ) )
    self.log.info( "ProcessPool sleep time = %d seconds" % self.__poolSleep )
    self.__taskTimeout = int( self.am_getOption( "ProcessTaskTimeout", self.__taskTimeout ) )
    self.log.info( "ProcessTask timeout = %d seconds" % self.__taskTimeout )

    # # keep config path and agent name
    self.agentName = self.am_getModuleParam( "fullName" )
    self.__configPath = PathFinder.getAgentSection( self.agentName )

    # # operation handlers over here
    opHandlersPath = "%s/%s" % ( self.__configPath, "OperationHandlers" )
    opHandlers = gConfig.getSections( opHandlersPath )
    if not opHandlers["OK"]:
      self.log.error( opHandlers["Message"] )
      raise AgentConfigError( "OperationHandlers section not found in CS under %s" % self.__configPath )
    opHandlers = opHandlers["Value"]

    self.timeOuts = dict()
    self.operationHandlers = []
    for opHandler in opHandlers:
      opHandlerPath = "%s/%s/Location" % ( opHandlersPath, opHandler )
      opLocation = gConfig.getValue( opHandlerPath, "" )
      if not opLocation:
        self.log.error( "%s not set for %s operation handler" % ( opHandlerPath, opHandler ) )
        continue
      # per-handler timeouts, falling back to the agent-wide defaults
      self.timeOuts[opHandler] = { "PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout }
      opTimeout = gConfig.getValue( "%s/%s/TimeOut" % ( opHandlersPath, opHandler ), 0 )
      if opTimeout:
        self.timeOuts[opHandler]["PerOperation"] = opTimeout
      fileTimeout = gConfig.getValue( "%s/%s/TimeOutPerFile" % ( opHandlersPath, opHandler ), 0 )
      if fileTimeout:
        self.timeOuts[opHandler]["PerFile"] = fileTimeout
      self.operationHandlers.append( opLocation )
    self.log.info( "Operation handlers:" )
    for itemTuple in enumerate ( self.operationHandlers ):
      self.log.info( "[%s] %s" % itemTuple )

    # # handlers dict
    self.handlersDict = dict()
    # # common monitor activity
    gMonitor.registerActivity( "Iteration", "Agent Loops",
                               "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Processed", "Request Processed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    # # create request dict
    self.__requestCache = dict()

    self.FTSMode = self.am_getOption( "FTSMode", False )

  def processPool( self ):
    """ facade for ProcessPool: create on first call, reuse afterwards """
    if not self.__processPool:
      minProcess = max( 1, self.__minProcess )
      maxProcess = max( self.__minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                       maxProcess,
                                                                                       queueSize ) )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient( self ):
    """ RequestClient getter (lazily created) """
    if not self.__requestClient:
      self.__requestClient = ReqClient()
    return self.__requestClient

  def cleanCache( self, requestName = None ):
    """ delete request from requestCache

    :param str requestName: Request.RequestName
    """
    if requestName in self.__requestCache:
      del self.__requestCache[requestName]
    return S_OK()

  def cacheRequest( self, request ):
    """ put request into requestCache

    :param Request request: Request instance
    """
    self.__requestCache.setdefault( request.RequestName, request )
    return S_OK()

  def resetRequest( self, requestName ):
    """ put back :requestName: to RequestClient

    :param str requestName: request's name
    """
    if requestName in self.__requestCache:
      reset = self.requestClient().updateRequest( self.__requestCache[requestName] )
      if not reset["OK"]:
        return S_ERROR( "resetRequest: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
    return S_OK()

  def resetAllRequests( self ):
    """ put back all requests without callback called into requestClient

    :param self: self reference
    """
    self.log.info( "resetAllRequests: will put %s back requests" % len( self.__requestCache ) )
    # fix: .items() instead of py2-only .iteritems()
    for requestName, request in self.__requestCache.items():
      reset = self.requestClient().updateRequest( request )
      if not reset["OK"]:
        self.log.error( "resetAllRequests: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
        continue
      self.log.debug( "resetAllRequests: request %s has been put back with its initial state" % requestName )
    return S_OK()

  def initialize( self ):
    """ initialize agent

    at the moment creates handlers dictionary
    """
    for opHandler in self.operationHandlers:
      handlerName = opHandler.split( "/" )[-1]
      self.handlersDict[ handlerName ] = opHandler
      self.log.debug( "handler '%s' for operation '%s' registered" % ( opHandler, handlerName ) )
    if not self.handlersDict:
      self.log.error( "operation handlers not set, check configuration option 'Operations'!" )
      return S_ERROR( "Operation handlers not set!" )
    return S_OK()

  def execute( self ):
    """ read requests from RequestClient and enqueue them into ProcessPool """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )
      getRequest = self.requestClient().getRequest()
      if not getRequest["OK"]:
        self.log.error( "execute: %s" % getRequest["Message"] )
        break
      if not getRequest["Value"]:
        self.log.info( "execute: no more 'Waiting' requests to process" )
        break
      # # OK, we've got you
      request = getRequest["Value"]
      # # set task id
      taskID = request.RequestName
      # # save current request in cache
      self.cacheRequest( request )
      # # serialize to JSON
      requestJSON = request.toJSON()
      if not requestJSON["OK"]:
        self.log.error( "JSON serialization error: %s" % requestJSON["Message"] )
        break
      requestJSON = requestJSON["Value"]
      self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(),
                                                                    self.processPool().getNumWorkingProcesses() ) )
      while True:
        if not self.processPool().getFreeSlots():
          self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
          time.sleep( self.__poolSleep )
        else:
          self.log.info( "spawning task for request '%s'" % ( request.RequestName ) )
          timeOut = self.getTimeout( request )
          enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                           kwargs = { "requestJSON" : requestJSON,
                                                                      "handlersDict" : self.handlersDict,
                                                                      "csPath" : self.__configPath,
                                                                      "agentName": self.agentName },
                                                           taskID = taskID,
                                                           blocking = True,
                                                           usePoolCallbacks = True,
                                                           timeOut = timeOut )
          if not enqueue["OK"]:
            self.log.error( enqueue["Message"] )
          else:
            self.log.debug( "successfully enqueued task '%s'" % taskID )
            # # update monitor
            gMonitor.addMark( "Processed", 1 )
            # # update request counter
            taskCounter += 1
            # # task created, a little time kick to proceed
            time.sleep( 0.1 )
          break
    # # clean return
    return S_OK()

  def getTimeout( self, request ):
    """ get timeout for request

    Sums the per-operation and per-file timeouts of all 'Waiting'/'Scheduled'
    operations of :request:.
    """
    timeout = 0
    for op in request:
      if op.Status not in ( "Waiting", "Scheduled" ):
        continue
      if op.Type not in self.timeOuts:
        timeout += self.__operationTimeout
      else:
        perOp = self.timeOuts[op.Type].get( "PerOperation", self.__operationTimeout )
        perFiles = self.timeOuts[op.Type].get( "PerFile", self.__fileTimeout ) * len( op )
        timeout += perOp + perFiles
    self.log.info( "estimated timeOut for request %s is %s" % ( request.RequestName, timeout ) )
    return timeout

  def finalize( self ):
    """ agent finalization: drain the pool and put cached requests back """
    if self.__processPool:
      self.processPool().finalize( timeout = self.__poolTimeout )
    self.resetAllRequests()
    return S_OK()

  def resultCallback( self, taskID, taskResult ):
    """ definition of request callback function

    :param str taskID: Request.RequestName
    :param dict taskResult: task result S_OK/S_ERROR
    """
    self.log.info( "callback: %s result is %s(%s)" % ( taskID,
                                                       "S_OK" if taskResult["OK"] else "S_ERROR",
                                                       taskResult["Value"] if taskResult["OK"] else taskResult["Message"] ) )
    if not taskResult["OK"]:
      # timed-out tasks are put back to the RequestClient for another try
      if taskResult["Message"] == "Timed out":
        self.resetRequest( taskID )
    # # clean cache
    self.cleanCache( taskID )

  def exceptionCallback( self, taskID, taskException ):
    """ definition of exception callback function

    :param str taskID: Request.RequestName
    :param Exception taskException: Exception instance
    """
    self.log.error( "exceptionCallback: %s was hit by exception %s" % ( taskID, taskException ) )
    self.resetRequest( taskID )
class PoolComputingElement(ComputingElement):
  """ "Pool" computing element: runs jobs locally through a ProcessPool and
  tracks the processor slots allocated to each running task.
  """

  mandatoryParameters = MandatoryParameters

  #############################################################################
  def __init__(self, ceUniqueID):
    """ Standard constructor.

    :param str ceUniqueID: unique name of this CE instance
    """
    ComputingElement.__init__(self, ceUniqueID)
    self.ceType = "Pool"
    self.log = gLogger.getSubLogger('Pool')
    self.submittedJobs = 0
    self.processors = 1          # total processor slots, refreshed in _reset()
    self.pPool = None            # lazily created ProcessPool
    self.taskID = 0              # monotonically increasing local task counter
    self.processorsPerTask = {}  # taskID -> number of slots occupied
    self.userNumberPerTask = {}  # taskID -> sudo user number (when useSudo)
    self.useSudo = False

  #############################################################################
  def _addCEConfigDefaults(self):
    """Method to make sure all necessary Configuration Parameters are defined
    """
    # First assure that any global parameters are loaded
    ComputingElement._addCEConfigDefaults(self)

  def _reset(self):
    """ Update internal variables after some extra parameters are added

    :return: None
    """
    self.processors = int(self.ceParameters.get('NumberOfProcessors', self.processors))
    self.ceParameters['MaxTotalJobs'] = self.processors
    # NOTE(review): a CS string value 'False' would evaluate truthy here --
    # confirm the expected parameter type
    self.useSudo = self.ceParameters.get('SudoExecution', False)

  def getProcessorsInUse(self):
    """ Get the number of currently allocated processor cores

    :return: number of processor cores in use
    """
    return sum(self.processorsPerTask.values())

  #############################################################################
  def submitJob(self, executableFile, proxy, **kwargs):
    """ Method to submit job.

    :param str executableFile: location of the executable file
    :param str proxy: payload proxy
    :return: S_OK/S_ERROR of the result of the job submission
    """
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    self.pPool.processResults()

    processorsInUse = self.getProcessorsInUse()
    if kwargs.get('wholeNode'):
      if processorsInUse > 0:
        return S_ERROR('Can not take WholeNode job')  # , %d/%d slots used' % (self.slotsInUse,self.slots) )
      else:
        requestedProcessors = self.processors
    elif "numberOfProcessors" in kwargs:
      requestedProcessors = int(kwargs['numberOfProcessors'])
      if requestedProcessors > 0:
        if (processorsInUse + requestedProcessors) > self.processors:
          return S_ERROR('Not enough slots: requested %d, available %d'
                         % (requestedProcessors, self.processors - processorsInUse))
    else:
      requestedProcessors = 1
    if self.processors - processorsInUse < requestedProcessors:
      return S_ERROR('Not enough slots: requested %d, available %d'
                     % (requestedProcessors, self.processors - processorsInUse))

    ret = getProxyInfo()
    if not ret['OK']:
      pilotProxy = None
    else:
      pilotProxy = ret['Value']['path']
    self.log.notice('Pilot Proxy:', pilotProxy)

    # Fix: keep the caller-supplied **kwargs intact -- use a dedicated dict
    # for the task arguments instead of reassigning `kwargs`
    taskKwargs = {'UseSudo': False}
    if self.useSudo:
      # pick the first sudo user number not already assigned to a task
      for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
        if nUser not in self.userNumberPerTask.values():
          break
      taskKwargs['NUser'] = nUser
      taskKwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
      taskKwargs['UseSudo'] = True

    result = self.pPool.createAndQueueTask(executeJob,
                                           args=(executableFile, proxy, self.taskID),
                                           kwargs=taskKwargs,
                                           taskID=self.taskID,
                                           usePoolCallbacks=True)
    self.processorsPerTask[self.taskID] = requestedProcessors
    self.taskID += 1

    self.pPool.processResults()

    return result

  def finalizeJob(self, taskID, result):
    """ Finalize the job by releasing the processor slots it occupied

    :param int taskID: local PoolCE task ID
    :param dict result: result of the job execution
    """
    nProc = self.processorsPerTask.pop(taskID)
    if result['OK']:
      self.log.info('Task %d finished successfully, %d processor(s) freed' % (taskID, nProc))
    else:
      self.log.error("Task failed submission", "%d, message: %s" % (taskID, result['Message']))

  #############################################################################
  def getCEStatus(self, jobIDList=None):
    """ Method to return information on running and pending jobs.

    :return: S_OK dict with job/processor counters
    """
    # Fix: guard against self.pPool being None (this method can be called
    # before any job was submitted) -- consistent with submitJob()
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    self.pPool.processResults()
    result = S_OK()
    result['SubmittedJobs'] = 0
    # fix: .values() instead of py2-only .iteritems() (index was unused)
    nJobs = 0
    for value in self.processorsPerTask.values():
      if value > 0:
        nJobs += 1
    result['RunningJobs'] = nJobs
    result['WaitingJobs'] = 0
    processorsInUse = self.getProcessorsInUse()
    result['UsedProcessors'] = processorsInUse
    result['AvailableProcessors'] = self.processors - processorsInUse
    return result

  #############################################################################
  def monitorProxy(self, pilotProxy, payloadProxy):
    """ Monitor the payload proxy and renew as necessary.

    :param str pilotProxy: location of the pilotProxy
    :param str payloadProxy: location of the payloadProxy
    """
    return self._monitorProxy(pilotProxy, payloadProxy)
class ProcessPoolCallbacksTests( unittest.TestCase ):
  """
  .. class:: ProcessPoolCallbacksTests

  Test case for ProcessPool pool-level callbacks: tasks are queued with
  usePoolCallbacks=True so results and exceptions are delivered through the
  callbacks registered on the pool itself.
  """

  def setUp( self ):
    """Create a daemonized ProcessPool (4-8 workers, queue depth 8) with
    pool-level result and exception callbacks.

    :param self: self reference
    """
    from DIRAC.Core.Base import Script
    Script.parseCommandLine()
    from DIRAC.FrameworkSystem.Client.Logger import gLogger
    gLogger.showHeaders( True )
    self.log = gLogger.getSubLogger( self.__class__.__name__ )
    self.processPool = ProcessPool( 4, 8, 8,
                                    poolCallback = self.poolCallback,
                                    poolExceptionCallback = self.poolExceptionCallback )
    self.processPool.daemonize()

  def poolCallback( self, taskID, taskResult ):
    # pool-level result callback: only logs the result
    self.log.always( "callback for %s result is %s" % ( taskID, taskResult ) )

  def poolExceptionCallback( self, taskID, taskException ):
    # pool-level exception callback: only logs the exception
    self.log.always( "callback for %s exception is %s" % ( taskID, taskException ) )

  def testCallableClass( self ):
    """ CallableClass and pool callbacks test """
    i = 0
    while True:
      if self.processPool.getFreeSlots() > 0:
        timeWait = random.randint(0, 5)
        raiseException = False
        if not timeWait:
          # timeWait == 0 -> ask the task to raise instead of sleeping
          raiseException = True
        result = self.processPool.createAndQueueTask( CallableClass,
                                                      taskID = i,
                                                      args = ( i, timeWait, raiseException ),
                                                      usePoolCallbacks = True,
                                                      blocking = True )
        if result["OK"]:
          self.log.always("CallableClass enqueued to task %s" % i )
          i += 1
        else:
          continue
      if i == 10:
        break
    self.processPool.finalize( 2 )

  def testCallableFunc( self ):
    """ CallableFunc and pool callbacks test """
    i = 0
    while True:
      if self.processPool.getFreeSlots() > 0:
        timeWait = random.randint(0, 5)
        raiseException = False
        if not timeWait:
          raiseException = True
        result = self.processPool.createAndQueueTask( CallableFunc,
                                                      taskID = i,
                                                      args = ( i, timeWait, raiseException ),
                                                      usePoolCallbacks = True,
                                                      blocking = True )
        if result["OK"]:
          self.log.always("CallableFunc enqueued to task %s" % i )
          i += 1
        else:
          continue
      if i == 10:
        break
    self.processPool.finalize( 2 )
class RequestAgentBase( AgentModule ):
  """
  .. class:: RequestAgentBase

  Helper class for DIRAC agents dealing with RequestContainers and Requests.
  """
  ## placeholder for thread pool
  __processPool = None
  ## requests/cycle
  __requestsPerCycle = 50
  ## minimal nb of subprocess running
  __minProcess = 2
  ## maximal nb of subprocess executed same time
  __maxProcess = 4
  ## ProcessPool queue size
  __queueSize = 10
  ## ProcessTask default timeout in seconds
  __taskTimeout = 300
  ## ProcessPool finalisation timeout
  __poolTimeout = 300
  ## placeholder for RequestClient instance
  __requestClient = None
  ## request type
  __requestType = ""
  ## placeholder for request task class definition
  __requestTask = None
  ## placeholder for request callback function
  __requestCallback = None
  ## placeholder for exception callback function
  __exceptionCallback = None
  ## config path in CS
  __configPath = None
  ## read request holder
  __requestHolder = dict()

  def __init__( self, *args, **kwargs ):
    """ c'tor

    :param self: self reference
    :param str agentName: name of agent
    :param str loadName: name of module
    :param bool baseAgentName: whatever
    :param dict properties: whatever else
    """
    AgentModule.__init__( self, *args, **kwargs )
    agentName = args[0]

    ## save config path
    self.__configPath = PathFinder.getAgentSection( agentName )
    self.log.info( "Will use %s config path" % self.__configPath )
    ## ProcessPool related stuff
    ## NOTE(review): the fallback defaults below (10, 1, 4, 10, 300) differ
    ## from the class-level defaults (50, 2, ...) -- confirm which is intended
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", 10 )
    self.log.info("requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", 1 )
    self.log.info("ProcessPool min process = %d" % self.__minProcess )
    self.__maxProcess = self.am_getOption( "MaxProcess", 4 )
    self.log.info("ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", 10 )
    self.log.info("ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", 300 ) )
    self.log.info("ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__taskTimeout = int( self.am_getOption( "ProcessTaskTimeout", 300 ) )
    self.log.info("ProcessTask timeout = %d seconds" % self.__taskTimeout )
    ## request type
    self.__requestType = self.am_getOption( "RequestType", self.__requestType )
    self.log.info( "Will process '%s' request type." % str( self.__requestType ) )
    ## shifter proxy
    self.am_setOption( "shifterProxy", "DataManager" )
    self.log.info( "Will use DataManager proxy by default." )
    ## common monitor activity
    self.monitor.registerActivity( "Iteration", "Agent Loops",
                                   self.__class__.__name__, "Loops/min", gMonitor.OP_SUM )
    self.monitor.registerActivity( "Execute", "Request Processed",
                                   self.__class__.__name__, "Requests/min", gMonitor.OP_SUM )
    self.monitor.registerActivity( "Done", "Request Completed",
                                   self.__class__.__name__, "Requests/min", gMonitor.OP_SUM )
    ## create request dict
    self.__requestHolder = dict()

  def poolTimeout( self ):
    """ poolTimeout getter

    :param self: self reference
    """
    return self.__poolTimeout

  def setPoolTimeout( self, timeout=300 ):
    """ poolTimeout setter

    :param self: self reference
    :param int timeout: PP finalisation timeout in seconds
    """
    self.__poolTimeout = int(timeout)

  def taskTimeout( self ):
    """ taskTimeout getter

    :param self: self reference
    """
    return self.__taskTimeout

  def setTaskTimeout( self, timeout=300 ):
    """ taskTimeout setter

    :param self: self reference
    :param int timeout: task timeout in seconds
    """
    self.__taskTimeout = int(timeout)

  def requestHolder( self ):
    """ get request holder dict

    :param self: self reference
    """
    return self.__requestHolder

  def deleteRequest( self, requestName ):
    """ delete request from requestHolder

    :param self: self reference
    """
    if requestName in self.__requestHolder:
      del self.__requestHolder[requestName]
      return S_OK()
    return S_ERROR("%s not found in requestHolder" % requestName )

  def saveRequest( self, requestName, requestString ):
    """ put request into requestHolder

    :param cls: class reference
    :param str requestName: request name
    :param str requestString: XML-serialised request
    :param str requestServer: server URL
    """
    if requestName not in self.__requestHolder:
      self.__requestHolder.setdefault( requestName, requestString )
      return S_OK()
    # fix: the original left the %s placeholder unfilled in the error message
    return S_ERROR("saveRequest: request %s cannot be saved, it's already in requestHolder" % requestName )

  def resetRequest( self, requestName ):
    """ put back :requestName: to RequestClient

    :param self: self reference
    :param str requestName: request's name
    """
    if requestName in self.__requestHolder:
      requestString = self.__requestHolder[requestName]
      reset = self.requestClient().updateRequest( requestName, requestString )
      if not reset["OK"]:
        self.log.error("resetRequest: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
      self.log.debug("resetRequest: request %s has been put back with its initial state" % requestName )
    else:
      self.log.error("resetRequest: unable to reset request %s: request not found in requestHolder" % requestName )

  def resetAllRequests( self ):
    """ put back all requests without callback called into requestClient

    :param self: self reference
    """
    self.log.info("resetAllRequests: will put %s back requests" % len(self.__requestHolder) )
    for requestName, requestString in self.__requestHolder.items():
      reset = self.requestClient().updateRequest( requestName, requestString )
      if not reset["OK"]:
        self.log.error("resetAllRequests: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
        continue
      self.log.debug("resetAllRequests: request %s has been put back with its initial state" % requestName )

  def configPath( self ):
    """ config path getter

    :param self: self reference
    """
    return self.__configPath

  def requestsPerCycle( self ):
    """ get number of request to be processed in one cycle

    :param self: self reference
    """
    return self.__requestsPerCycle

  def requestClient( self ):
    """ RequestClient getter (lazily created)

    :param self: self reference
    """
    if not self.__requestClient:
      self.__requestClient = RequestClient()
    return self.__requestClient

  def processPool( self ):
    """ 'Live long and prosper, my dear ProcessPool' - Mr. Spock

    :param self: self reference
    :return: brand new shiny ProcessPool instance on first call, the same
             instance on subsequent calls
    """
    if not self.__processPool:
      minProcess = max( 1, self.__minProcess )
      maxProcess = max( self.__minProcess, self.__maxProcess )
      queueSize = abs(self.__queueSize)
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess,
                                                                                       maxProcess,
                                                                                       queueSize ) )
      self.log.info( "ProcessPool: tasks will use callbacks attached to ProcessPool" )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
      self.log.info( "ProcessPool: daemonized and ready")
    return self.__processPool

  def hasProcessPool( self ):
    """ check if ProcessPool exist to speed up finalization

    :param self: self reference
    """
    return bool( self.__processPool )

  def resultCallback( self, taskID, taskResult ):
    """ definition of request callback function

    :param self: self reference
    """
    self.log.info("%s result callback" % taskID )
    if not taskResult["OK"]:
      self.log.error( "%s result callback: %s" % ( taskID, taskResult["Message"] ) )
      # timed-out requests are put back for another try before being dropped
      if taskResult["Message"] == "Timed out":
        self.resetRequest( taskID )
      self.deleteRequest( taskID )
      return
    self.deleteRequest( taskID )
    taskResult = taskResult["Value"]
    ## add monitoring info
    monitor = taskResult["monitor"] if "monitor" in taskResult else {}
    for mark, value in monitor.items():
      try:
        gMonitor.addMark( mark, value )
      # fix: 'except ... as' syntax, valid on Python 2.6+ and Python 3
      except Exception as error:
        self.log.exception( str(error) )
class RequestExecutingAgent( AgentModule ):
  """
  .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation handlers and RequestTask
  """
  # # process pool
  __processPool = None
  # # request cache
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None
  # # FTS scheduling flag
  __FTSMode = False

  def __init__( self, *args, **kwargs ):
    """ c'tor

    Reads all ProcessPool-related options from the CS and builds the
    operation-handler/timeout tables.

    :raises AgentConfigError: when the OperationHandlers CS section is missing
    """
    # # call base class ctor
    AgentModule.__init__( self, *args, **kwargs )
    # # ProcessPool related stuff
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", self.__requestsPerCycle )
    self.log.info( "Requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", self.__minProcess )
    self.log.info( "ProcessPool min process = %d" % self.__minProcess )
    # FIX: use the class default as fallback instead of the hard-coded 4,
    # consistent with every other option read in this c'tor
    self.__maxProcess = self.am_getOption( "MaxProcess", self.__maxProcess )
    self.log.info( "ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", self.__queueSize )
    self.log.info( "ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", self.__poolTimeout ) )
    self.log.info( "ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__poolSleep = int( self.am_getOption( "ProcessPoolSleep", self.__poolSleep ) )
    self.log.info( "ProcessPool sleep time = %d seconds" % self.__poolSleep )
    self.__taskTimeout = int( self.am_getOption( "ProcessTaskTimeout", self.__taskTimeout ) )
    self.log.info( "ProcessTask timeout = %d seconds" % self.__taskTimeout )
    # # keep config path and agent name
    self.agentName = self.am_getModuleParam( "fullName" )
    self.__configPath = PathFinder.getAgentSection( self.agentName )
    # # operation handlers over here
    opHandlersPath = "%s/%s" % ( self.__configPath, "OperationHandlers" )
    opHandlers = gConfig.getSections( opHandlersPath )
    if not opHandlers["OK"]:
      self.log.error( opHandlers["Message" ] )
      raise AgentConfigError( "OperationHandlers section not found in CS under %s" % self.__configPath )
    opHandlers = opHandlers["Value"]
    self.timeOuts = dict()
    self.operationHandlers = []
    for opHandler in opHandlers:
      opHandlerPath = "%s/%s/Location" % ( opHandlersPath, opHandler )
      opLocation = gConfig.getValue( opHandlerPath, "" )
      if not opLocation:
        self.log.error( "%s not set for %s operation handler" % ( opHandlerPath, opHandler ) )
        continue
      # per-handler timeouts, falling back to the class-wide defaults
      self.timeOuts[opHandler] = { "PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout }
      opTimeout = gConfig.getValue( "%s/%s/TimeOut" % ( opHandlersPath, opHandler ), 0 )
      if opTimeout:
        self.timeOuts[opHandler]["PerOperation"] = opTimeout
      fileTimeout = gConfig.getValue( "%s/%s/TimeOutPerFile" % ( opHandlersPath, opHandler ), 0 )
      if fileTimeout:
        self.timeOuts[opHandler]["PerFile"] = fileTimeout
      self.operationHandlers.append( opLocation )
    self.log.info( "Operation handlers:" )
    for itemTuple in enumerate ( self.operationHandlers ):
      self.log.info( "[%s] %s" % itemTuple )
    # # handlers dict
    self.handlersDict = dict()
    # # common monitor activity
    gMonitor.registerActivity( "Iteration", "Agent Loops",
                               "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Processed", "Request Processed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed",
                               "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM )
    # # create request dict
    self.__requestCache = dict()

  def processPool( self ):
    """ facade for ProcessPool: lazily create and daemonize the pool """
    if not self.__processPool:
      # sanitise configured bounds: at least one worker, max >= min
      minProcess = max( 1, self.__minProcess )
      maxProcess = max( self.__minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess, maxProcess, queueSize ) )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient( self ):
    """ RequestClient getter (lazily constructed) """
    if not self.__requestClient:
      self.__requestClient = ReqClient()
    return self.__requestClient

  def cleanCache( self, requestName = None ):
    """ delete request from requestCache

    :param str requestName: Request.RequestName
    """
    if requestName in self.__requestCache:
      del self.__requestCache[requestName]
    return S_OK()

  def cacheRequest( self, request ):
    """ put request into requestCache

    :param Request request: Request instance
    """
    self.__requestCache.setdefault( request.RequestName, request )
    return S_OK()

  def resetRequest( self, requestName ):
    """ put back :requestName: to RequestClient

    :param str requestName: request's name
    """
    if requestName in self.__requestCache:
      reset = self.requestClient().updateRequest( self.__requestCache[requestName] )
      if not reset["OK"]:
        return S_ERROR( "resetRequest: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
    return S_OK()

  def resetAllRequests( self ):
    """ put back all requests without callback called into requestClient

    :param self: self reference
    """
    self.log.info( "resetAllRequests: will put %s back requests" % len( self.__requestCache ) )
    for requestName, request in self.__requestCache.iteritems():
      reset = self.requestClient().updateRequest( request )
      if not reset["OK"]:
        # log and keep going: one failed reset must not block the others
        self.log.error( "resetAllRequests: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
        continue
      self.log.debug( "resetAllRequests: request %s has been put back with its initial state" % requestName )
    return S_OK()

  def initialize( self ):
    """ initialize agent

    at the moment creates handlers dictionary
    """
    for opHandler in self.operationHandlers:
      handlerName = opHandler.split( "/" )[-1]
      self.handlersDict[ handlerName ] = opHandler
      self.log.debug( "handler '%s' for operation '%s' registered" % ( opHandler, handlerName ) )
    if not self.handlersDict:
      self.log.error( "operation handlers not set, check configuration option 'Operations'!" )
      return S_ERROR( "Operation handlers not set!" )
    return S_OK()

  def execute( self ):
    """ read requests from RequestClient and enqueue them into ProcessPool """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )
      getRequest = self.requestClient().getRequest()
      if not getRequest["OK"]:
        self.log.error( "execute: %s" % getRequest["Message"] )
        break
      if not getRequest["Value"]:
        self.log.info( "execute: not more 'Waiting' requests to process" )
        break
      # # OK, we've got you
      request = getRequest["Value"]
      # # set task id
      taskID = request.RequestName
      # # save current request in cache
      self.cacheRequest( request )
      # # serialize to JSON
      requestJSON = request.toJSON()
      if not requestJSON["OK"]:
        self.log.error( "JSON serialization error: %s" % requestJSON["Message"] )
        break
      requestJSON = requestJSON["Value"]
      self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(),
                                                                    self.processPool().getNumWorkingProcesses() ) )
      while True:
        if not self.processPool().getFreeSlots():
          self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
          time.sleep( self.__poolSleep )
        else:
          self.log.info( "spawning task for request '%s'" % ( request.RequestName ) )
          timeOut = self.getTimeout( request )
          enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                           kwargs = { "requestJSON" : requestJSON,
                                                                      "handlersDict" : self.handlersDict,
                                                                      "csPath" : self.__configPath,
                                                                      "agentName": self.agentName },
                                                           taskID = taskID,
                                                           blocking = True,
                                                           usePoolCallbacks = True,
                                                           timeOut = timeOut )
          if not enqueue["OK"]:
            self.log.error( enqueue["Message"] )
          else:
            self.log.debug( "successfully enqueued task '%s'" % taskID )
            # # update monitor
            gMonitor.addMark( "Processed", 1 )
            # # update request counter
            taskCounter += 1
            # # task created, a little time kick to proceed
            time.sleep( 0.1 )
            break
    # # clean return
    return S_OK()

  def getTimeout( self, request ):
    """ estimate a ProcessTask timeout for :request: from the per-operation
    and per-file timeouts of its pending operations """
    timeout = 0
    for op in request:
      if op.Status not in ( "Waiting", "Scheduled" ):
        continue
      if op.Type not in self.timeOuts:
        timeout += self.__operationTimeout
      else:
        perOp = self.timeOuts[op.Type].get( "PerOperation", self.__operationTimeout )
        perFiles = self.timeOuts[op.Type].get( "PerFile", self.__fileTimeout ) * len( op )
        timeout += perOp + perFiles
    self.log.info( "estimated timeOut for request %s is %s" % ( request.RequestName, timeout ) )
    return timeout

  def finalize( self ):
    """ agent finalization: drain the pool and push cached requests back """
    if self.__processPool:
      self.processPool().finalize( timeout = self.__poolTimeout )
    self.resetAllRequests()
    return S_OK()

  def resultCallback( self, taskID, taskResult ):
    """ definition of request callback function

    :param str taskID: Reqiest.RequestName
    :param dict taskResult: task result S_OK/S_ERROR
    """
    self.log.info( "callback: %s result is %s(%s)" % ( taskID,
                                                       "S_OK" if taskResult["OK"] else "S_ERROR",
                                                       taskResult["Value"] if taskResult["OK"] else taskResult["Message"] ) )
    if not taskResult["OK"]:
      if taskResult["Message"] == "Timed out":
        # timed-out request goes back to the server for a retry
        self.resetRequest( taskID )
    # # clean cache
    self.cleanCache( taskID )

  def exceptionCallback( self, taskID, taskException ):
    """ definition of exception callback function

    :param str taskID: Request.RequestName
    :param Exception taskException: Exception instance
    """
    self.log.error( "exceptionCallback: %s was hit by exception %s" % ( taskID, taskException ) )
    self.resetRequest( taskID )
class RequestExecutingAgent( AgentModule ):
  """
  .. class:: RequestExecutingAgent

  request processing agent using ProcessPool, Operation handlers and RequestTask
  """
  # # process pool
  __processPool = None
  # # request cache
  __requestCache = {}
  # # requests/cycle
  __requestsPerCycle = 100
  # # minimal nb of subprocess running
  __minProcess = 2
  # # maximal nb of subprocess executed same time
  __maxProcess = 4
  # # ProcessPool queue size
  __queueSize = 20
  # # file timeout
  __fileTimeout = 300
  # # operation timeout
  __operationTimeout = 300
  # # ProcessTask default timeout in seconds
  __taskTimeout = 900
  # # ProcessPool finalization timeout
  __poolTimeout = 900
  # # ProcessPool sleep time
  __poolSleep = 5
  # # placeholder for RequestClient instance
  __requestClient = None
  # # Size of the bulk if use of getRequests. If 0, use getRequest
  __bulkRequest = 0

  def __init__( self, *args, **kwargs ):
    """ c'tor

    Reads ProcessPool options from the CS and builds the
    operation-handler/timeout tables.

    :raises AgentConfigError: when the OperationHandlers CS section is missing
    """
    # # call base class ctor
    AgentModule.__init__( self, *args, **kwargs )
    # # ProcessPool related stuff
    self.__requestsPerCycle = self.am_getOption( "RequestsPerCycle", self.__requestsPerCycle )
    self.log.info( "Requests/cycle = %d" % self.__requestsPerCycle )
    self.__minProcess = self.am_getOption( "MinProcess", self.__minProcess )
    self.log.info( "ProcessPool min process = %d" % self.__minProcess )
    # FIX: use the class default as fallback instead of the hard-coded 4,
    # consistent with every other option read in this c'tor
    self.__maxProcess = self.am_getOption( "MaxProcess", self.__maxProcess )
    self.log.info( "ProcessPool max process = %d" % self.__maxProcess )
    self.__queueSize = self.am_getOption( "ProcessPoolQueueSize", self.__queueSize )
    self.log.info( "ProcessPool queue size = %d" % self.__queueSize )
    self.__poolTimeout = int( self.am_getOption( "ProcessPoolTimeout", self.__poolTimeout ) )
    self.log.info( "ProcessPool timeout = %d seconds" % self.__poolTimeout )
    self.__poolSleep = int( self.am_getOption( "ProcessPoolSleep", self.__poolSleep ) )
    self.log.info( "ProcessPool sleep time = %d seconds" % self.__poolSleep )
    self.__taskTimeout = int( self.am_getOption( "ProcessTaskTimeout", self.__taskTimeout ) )
    self.log.info( "ProcessTask timeout = %d seconds" % self.__taskTimeout )
    self.__bulkRequest = self.am_getOption( "BulkRequest", 0 )
    self.log.info( "Bulk request size = %d" % self.__bulkRequest )
    # # keep config path and agent name
    self.agentName = self.am_getModuleParam( "fullName" )
    self.__configPath = PathFinder.getAgentSection( self.agentName )
    # # operation handlers over here
    opHandlersPath = "%s/%s" % ( self.__configPath, "OperationHandlers" )
    opHandlers = gConfig.getSections( opHandlersPath )
    if not opHandlers["OK"]:
      self.log.error( opHandlers["Message" ] )
      raise AgentConfigError( "OperationHandlers section not found in CS under %s" % self.__configPath )
    opHandlers = opHandlers["Value"]
    self.timeOuts = dict()
    # # handlers dict
    self.handlersDict = dict()
    for opHandler in opHandlers:
      opHandlerPath = "%s/%s/Location" % ( opHandlersPath, opHandler )
      opLocation = gConfig.getValue( opHandlerPath, "" )
      if not opLocation:
        self.log.error( "%s not set for %s operation handler" % ( opHandlerPath, opHandler ) )
        continue
      # per-handler timeouts, falling back to the class-wide defaults
      self.timeOuts[opHandler] = { "PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout }
      opTimeout = gConfig.getValue( "%s/%s/TimeOut" % ( opHandlersPath, opHandler ), 0 )
      if opTimeout:
        self.timeOuts[opHandler]["PerOperation"] = opTimeout
      fileTimeout = gConfig.getValue( "%s/%s/TimeOutPerFile" % ( opHandlersPath, opHandler ), 0 )
      if fileTimeout:
        self.timeOuts[opHandler]["PerFile"] = fileTimeout
      self.handlersDict[opHandler] = opLocation
    self.log.info( "Operation handlers:" )
    for item in enumerate ( self.handlersDict.items() ):
      opHandler = item[1][0]
      self.log.info( "[%s] %s: %s (timeout: %d s + %d s per file)" % ( item[0], item[1][0], item[1][1],
                                                                       self.timeOuts[opHandler]['PerOperation'],
                                                                       self.timeOuts[opHandler]['PerFile'] ) )
    # # common monitor activity
    gMonitor.registerActivity( "Iteration", "Agent Loops",
                               "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Processed", "Request Processed", "RequestExecutingAgent",
                               "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed", "RequestExecutingAgent",
                               "Requests/min", gMonitor.OP_SUM )
    # # create request dict
    self.__requestCache = dict()
    self.FTSMode = self.am_getOption( "FTSMode", False )

  def processPool( self ):
    """ facade for ProcessPool: lazily create and daemonize the pool """
    if not self.__processPool:
      # sanitise configured bounds: at least one worker, max >= min
      minProcess = max( 1, self.__minProcess )
      maxProcess = max( self.__minProcess, self.__maxProcess )
      queueSize = abs( self.__queueSize )
      self.log.info( "ProcessPool: minProcess = %d maxProcess = %d queueSize = %d" % ( minProcess, maxProcess, queueSize ) )
      self.__processPool = ProcessPool( minProcess,
                                        maxProcess,
                                        queueSize,
                                        poolCallback = self.resultCallback,
                                        poolExceptionCallback = self.exceptionCallback )
      self.__processPool.daemonize()
    return self.__processPool

  def requestClient( self ):
    """ RequestClient getter (lazily constructed) """
    if not self.__requestClient:
      self.__requestClient = ReqClient()
    return self.__requestClient

  def cacheRequest( self, request ):
    """ put request into requestCache

    :param Request request: Request instance
    """
    count = 5
    # Wait a bit as there may be a race condition between RequestTask putting back the request and the callback clearing the cache
    while request.RequestName in self.__requestCache:
      count -= 1
      if not count:
        # give up and return the request to the server untouched
        self.requestClient().putRequest( request )
        return S_ERROR( "Duplicate request, ignore: %s" % request.RequestName )
      time.sleep( 1 )
    self.__requestCache[ request.RequestName ] = request
    return S_OK()

  def putRequest( self, requestName, taskResult = None ):
    """ put back :requestName: to RequestClient

    :param str requestName: request's name
    :param dict taskResult: optional task result; when S_OK its Value
                            (the updated Request) replaces the cached one
    """
    if requestName in self.__requestCache:
      request = self.__requestCache.pop( requestName )
      if taskResult and taskResult['OK']:
        request = taskResult['Value']
      reset = self.requestClient().putRequest( request )
      if not reset["OK"]:
        return S_ERROR( "putRequest: unable to reset request %s: %s" % ( requestName, reset["Message"] ) )
    else:
      return S_ERROR( 'Not in cache' )
    return S_OK()

  def putAllRequests( self ):
    """ put back all requests without callback called into requestClient

    :param self: self reference
    """
    self.log.info( "putAllRequests: will put %s back requests" % len( self.__requestCache ) )
    # iterate over a snapshot of the keys: putRequest mutates the cache
    for requestName in self.__requestCache.keys():
      reset = self.putRequest( requestName )
      if not reset["OK"]:
        self.log.error( reset["Message"] )
      else:
        self.log.debug( "putAllRequests: request %s has been put back with its initial state" % requestName )
    return S_OK()

  def initialize( self ):
    """ initialize agent """
    return S_OK()

  def execute( self ):
    """ read requests from RequestClient and enqueue them into ProcessPool """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )
      requestsToExecute = []
      if not self.__bulkRequest:
        self.log.info( "execute: ask for a single request" )
        getRequest = self.requestClient().getRequest()
        if not getRequest["OK"]:
          self.log.error( "execute: %s" % getRequest["Message"] )
          break
        if not getRequest["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        requestsToExecute = [getRequest["Value"] ]
      else:
        # never ask for more requests than fit in the remaining cycle budget
        numberOfRequest = min( self.__bulkRequest, self.__requestsPerCycle - taskCounter )
        self.log.info( "execute: ask for %s requests" % numberOfRequest )
        getRequests = self.requestClient().getBulkRequests( numberOfRequest )
        if not getRequests["OK"]:
          self.log.error( "execute: %s" % getRequests["Message"] )
          break
        if not getRequests["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        for rId in getRequests["Value"]["Failed"]:
          self.log.error( "execute: %s" % getRequests["Value"]["Failed"][rId] )
        requestsToExecute = getRequests["Value"]["Successful"].values()
      self.log.info( "execute: will execute %s requests " % len( requestsToExecute ) )
      for request in requestsToExecute:
        # # set task id
        taskID = request.RequestName
        # # save current request in cache
        self.cacheRequest( request )
        # # serialize to JSON
        requestJSON = request.toJSON()
        if not requestJSON["OK"]:
          self.log.error( "JSON serialization error: %s" % requestJSON["Message"] )
          break
        requestJSON = requestJSON["Value"]
        self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(),
                                                                      self.processPool().getNumWorkingProcesses() ) )
        looping = 0
        while True:
          if not self.processPool().getFreeSlots():
            if not looping:
              self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
            time.sleep( self.__poolSleep )
            looping += 1
          else:
            if looping:
              # FIX: '%' and '*' share precedence (left-assoc), so the original
              # ( "..." % looping ) * poolSleep repeated the STRING poolSleep
              # times; parenthesise the multiplication to log the seconds
              self.log.info( "Free slot found after %d seconds" % ( looping * self.__poolSleep ) )
            looping = 0
            self.log.info( "spawning task for request '%s'" % ( request.RequestName ) )
            timeOut = self.getTimeout( request )
            enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                             kwargs = { "requestJSON" : requestJSON,
                                                                        "handlersDict" : self.handlersDict,
                                                                        "csPath" : self.__configPath,
                                                                        "agentName": self.agentName },
                                                             taskID = taskID,
                                                             blocking = True,
                                                             usePoolCallbacks = True,
                                                             timeOut = timeOut )
            if not enqueue["OK"]:
              self.log.error( enqueue["Message"] )
            else:
              self.log.debug( "successfully enqueued task '%s'" % taskID )
              # # update monitor
              gMonitor.addMark( "Processed", 1 )
              # # update request counter
              taskCounter += 1
              # # task created, a little time kick to proceed
              time.sleep( 0.1 )
              break
    # # clean return
    return S_OK()

  def getTimeout( self, request ):
    """ estimate a ProcessTask timeout for :request: from the per-operation
    and per-file timeouts of its pending operations """
    timeout = 0
    for op in request:
      if op.Status not in ( "Waiting", "Scheduled", 'Queued' ):
        continue
      if op.Type not in self.timeOuts:
        timeout += self.__operationTimeout
      else:
        perOp = self.timeOuts[op.Type].get( "PerOperation", self.__operationTimeout )
        perFiles = self.timeOuts[op.Type].get( "PerFile", self.__fileTimeout ) * len( op )
        timeout += perOp + perFiles
    self.log.info( "estimated timeOut for request %s is %s" % ( request.RequestName, timeout ) )
    return timeout

  def finalize( self ):
    """ agent finalization: drain the pool and push cached requests back """
    if self.__processPool:
      self.processPool().finalize( timeout = self.__poolTimeout )
    self.putAllRequests()
    return S_OK()

  def resultCallback( self, taskID, taskResult ):
    """ definition of request callback function

    :param str taskID: Request.RequestName
    :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
    """
    # # clean cache
    res = self.putRequest( taskID, taskResult )
    self.log.info( "callback: %s result is %s(%s), put %s(%s)" % ( taskID,
                                                                   "S_OK" if taskResult["OK"] else "S_ERROR",
                                                                   taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"],
                                                                   "S_OK" if res['OK'] else 'S_ERROR',
                                                                   '' if res['OK'] else res['Message'] ) )

  def exceptionCallback( self, taskID, taskException ):
    """ definition of exception callback function

    :param str taskID: Request.RequestName
    :param Exception taskException: Exception instance
    """
    self.log.error( "exceptionCallback: %s was hit by exception %s" % ( taskID, taskException ) )
    self.putRequest( taskID )
class ProcessPoolCallbacksTests( unittest.TestCase ): """ .. class:: ProcessPoolCallbacksTests test case for ProcessPool """ def setUp( self ): """c'tor :param self: self reference """ gLogger.showHeaders( True ) self.log = gLogger.getSubLogger( self.__class__.__name__ ) self.processPool = ProcessPool( 4, 8, 8, poolCallback = self.poolCallback, poolExceptionCallback = self.poolExceptionCallback ) self.processPool.daemonize() def poolCallback( self, taskID, taskResult ): self.log.always( "callback for %s result is %s" % ( taskID, taskResult ) ) def poolExceptionCallback( self, taskID, taskException ): self.log.always( "callback for %s exception is %s" % ( taskID, taskException ) ) def testCallableClass( self ): """ CallableClass and pool callbacks test """ i = 0 while True: if self.processPool.getFreeSlots() > 0: timeWait = random.randint(0, 5) raiseException = False if not timeWait: raiseException = True result = self.processPool.createAndQueueTask( CallableClass, taskID = i, args = ( i, timeWait, raiseException ), usePoolCallbacks = True, blocking = True ) if result["OK"]: self.log.always("CallableClass enqueued to task %s" % i ) i += 1 else: continue if i == 10: break self.processPool.finalize( 2 ) def testCallableFunc( self ): """ CallableFunc and pool callbacks test """ i = 0 while True: if self.processPool.getFreeSlots() > 0: timeWait = random.randint(0, 5) raiseException = False if not timeWait: raiseException = True result = self.processPool.createAndQueueTask( CallableFunc, taskID = i, args = ( i, timeWait, raiseException ), usePoolCallbacks = True, blocking = True ) if result["OK"]: self.log.always("CallableFunc enqueued to task %s" % i ) i += 1 else: continue if i == 10: break self.processPool.finalize( 2 )
######################################################################### argvs = sys.argv if len(argvs) != 2 : print 'Usage: LFC_to_DFC.py [dirlist_file]' print '[dirlist_file] should contain the directory list.' quit() dirlist_file=argvs[1] if ( not os.path.exists(dirlist_file) ) : print dirlist_file+" does not exist" quit() execfile(dirlist_file) pPool = ProcessPool(10,50,50) pPool.daemonize() # dirlist = ['prod/ilc/mc-dbd/generated','prod/ilc/mc-dbd/ild'] # dirlist= ['prod/ilc/mc-dbd/generated/500-TDR_ws/higgs'] # dirlist= ['prod/ilc/mc-dbd/generated/250-TDR_ws/higgs','prod/ilc/mc-dbd/generated/350-TDR_ws/higgs'] #dirlist= ['prod/ilc/mc-dbd/generated/250-TDR_ws'] #dirlist= ['prod/ilc/mc-dbd/generated/250-TDR_ws/1f', # 'prod/ilc/mc-dbd/generated/250-TDR_ws/3f', # 'prod/ilc/mc-dbd/generated/250-TDR_ws/aa_lowpt', # 'prod/ilc/mc-dbd/generated/250-TDR_ws/aa_minijet'] #dirlist= ['prod/ilc/mc-dbd/generated/250-TDR_ws/aa_2f', # 'prod/ilc/mc-dbd/generated/350-TDR_ws/3f', # 'prod/ilc/mc-dbd/generated/350-TDR_ws/1f', # 'prod/ilc/mc-dbd/generated/350-TDR_ws/aa_minijet']
class PoolComputingElement( ComputingElement ):

  mandatoryParameters = MandatoryParameters

  #############################################################################
  def __init__( self, ceUniqueID, cores = 0 ):
    """ Standard constructor.

    :param str ceUniqueID: unique CE identifier
    :param int cores: number of cores to manage; 0 means autodetect
    """
    ComputingElement.__init__( self, ceUniqueID )
    self.ceType = "Pool"
    self.submittedJobs = 0
    if cores > 0:
      self.cores = cores
    else:
      self.cores = getNumberOfCores()
    self.pPool = ProcessPool( self.cores, self.cores, poolCallback = self.finalizeJob )
    self.taskID = 0
    # taskID -> number of cores allocated to that task
    self.coresPerTask = {}

  #############################################################################
  def _addCEConfigDefaults( self ):
    """Method to make sure all necessary Configuration Parameters are defined
    """
    # First assure that any global parameters are loaded
    ComputingElement._addCEConfigDefaults( self )

  def getCoresInUse( self ):
    """ Total number of cores currently allocated to running tasks """
    coresInUse = 0
    for _task, cores in self.coresPerTask.items():
      coresInUse += cores
    return coresInUse

  #############################################################################
  def submitJob( self, executableFile, proxy, **kwargs ):
    """ Method to submit job, should be overridden in sub-class.

    :param str executableFile: executable to run
    :param str proxy: payload proxy
    :return: S_OK(pool task result)/S_ERROR on insufficient slots
    """
    self.pPool.processResults()
    coresInUse = self.getCoresInUse()
    if "WholeNode" in kwargs and kwargs['WholeNode']:
      if coresInUse > 0:
        # FIX: the message previously interpolated self.slotsInUse/self.slots,
        # attributes that do not exist on this class (AttributeError);
        # report the actual core accounting instead
        return S_ERROR('Can not take WholeNode job, %d/%d slots used' % ( coresInUse, self.cores ) )
      else:
        requestedCores = self.cores
    elif "NumberOfCores" in kwargs:
      requestedCores = int( kwargs['NumberOfCores'] )
      if requestedCores > 0:
        if (coresInUse + requestedCores) > self.cores:
          return S_ERROR( 'Not enough slots: requested %d, available %d' % ( requestedCores, self.cores-coresInUse) )
    else:
      requestedCores = 1
    if self.cores - coresInUse < requestedCores:
      return S_ERROR( 'Not enough slots: requested %d, available %d' % ( requestedCores, self.cores-coresInUse) )

    ret = getProxyInfo()
    if not ret['OK']:
      pilotProxy = None
    else:
      pilotProxy = ret['Value']['path']
    self.log.notice( 'Pilot Proxy:', pilotProxy )

    # FIX: record the allocation under the CURRENT taskID *before* queuing and
    # before the increment. Previously the dict entry was created with
    # taskID+1, so finalizeJob( taskID ) raised KeyError and the cores were
    # never released.
    self.coresPerTask[self.taskID] = requestedCores
    result = self.pPool.createAndQueueTask( executeJob,
                                            [executableFile, proxy, self.taskID],
                                            None,
                                            self.taskID,
                                            usePoolCallbacks = True )
    self.taskID += 1
    self.pPool.processResults()
    return result

  def finalizeJob( self, taskID, result ):
    """ Finalize the job: release the cores booked for :taskID: """
    del self.coresPerTask[taskID]

  #############################################################################
  def getCEStatus( self ):
    """ Method to return information on running and pending jobs.
    """
    self.pPool.processResults()
    result = S_OK()
    result['SubmittedJobs'] = 0
    nJobs = 0
    for _j, value in self.coresPerTask.items():
      if value > 0:
        nJobs += 1
    result['RunningJobs'] = nJobs
    result['WaitingJobs'] = 0
    coresInUse = self.getCoresInUse()
    result['UsedCores'] = coresInUse
    result['AvailableCores'] = self.cores - coresInUse
    return result

  #############################################################################
  def monitorProxy( self, pilotProxy, payloadProxy ):
    """ Monitor the payload proxy and renew as necessary.
    """
    return self._monitorProxy( pilotProxy, payloadProxy )
class PoolComputingElement(ComputingElement):
  # CE running payloads in local subprocesses via a ProcessPool,
  # accounting processors per task.

  mandatoryParameters = MandatoryParameters

  #############################################################################
  def __init__(self, ceUniqueID):
    """ Standard constructor.
    """
    super(PoolComputingElement, self).__init__(ceUniqueID)

    self.ceType = "Pool"
    self.log = gLogger.getSubLogger('Pool')
    self.submittedJobs = 0
    # total processors managed by this CE; refined in _reset()
    self.processors = 1
    # ProcessPool is created lazily on first submitJob()/getCEStatus()
    self.pPool = None
    self.taskID = 0
    # taskID -> number of processors allocated to that task
    self.processorsPerTask = {}
    # taskID -> sudo unix user number (only used when SudoExecution is on)
    self.userNumberPerTask = {}
    self.useSudo = False

  #############################################################################
  def _addCEConfigDefaults(self):
    """Method to make sure all necessary Configuration Parameters are defined
    """
    # First assure that any global parameters are loaded
    ComputingElement._addCEConfigDefaults(self)

  def _reset(self):
    """ Update internal variables after some extra parameters are added

    :return: None
    """
    self.processors = int(self.ceParameters.get('NumberOfProcessors', self.processors))
    # one job per processor at most
    self.ceParameters['MaxTotalJobs'] = self.processors
    self.useSudo = self.ceParameters.get('SudoExecution', False)

  def getProcessorsInUse(self):
    """ Get the number of currently allocated processor cores

    :return: number of processor cores
    """
    processorsInUse = 0
    for task in self.processorsPerTask:
      processorsInUse += self.processorsPerTask[task]
    return processorsInUse

  #############################################################################
  def submitJob(self, executableFile, proxy, **kwargs):
    """ Method to submit job.

    :param str executableFile: location of the executable file
    :param str proxy: payload proxy
    :return: S_OK/S_ERROR of the result of the job submission
    """
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    self.pPool.processResults()

    processorsForJob = self._getProcessorsForJobs(kwargs)
    if not processorsForJob:
      return S_ERROR('Not enough processors for the job')

    # Now persisiting the job limits for later use in pilot.cfg file (pilot 3 default)
    cd = ConfigurationData(loadDefaultCFG=False)
    res = cd.loadFile('pilot.cfg')
    if not res['OK']:
      # best effort: failure to load/dump pilot.cfg does not abort submission
      self.log.error("Could not load pilot.cfg", res['Message'])
    # only NumberOfProcessors for now, but RAM (or other stuff) can also be added
    jobID = int(kwargs.get('jobDesc', {}).get('jobID', 0))
    cd.setOptionInCFG('/Resources/Computing/JobLimits/%d/NumberOfProcessors' % jobID, processorsForJob)
    res = cd.dumpLocalCFGToFile('pilot.cfg')
    if not res['OK']:
      self.log.error("Could not dump cfg to pilot.cfg", res['Message'])

    ret = getProxyInfo()
    if not ret['OK']:
      pilotProxy = None
    else:
      pilotProxy = ret['Value']['path']
    self.log.notice('Pilot Proxy:', pilotProxy)

    # NOTE(review): rebinding `kwargs` here deliberately discards the caller's
    # submission kwargs (already consumed above) and builds the kwargs passed
    # on to executeJob — confirm this shadowing is intended
    kwargs = {'UseSudo': False}
    if self.useSudo:
      # pick the first free sudo user slot
      for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
        if nUser not in self.userNumberPerTask.values():
          break
      kwargs['NUser'] = nUser
      kwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
      kwargs['UseSudo'] = True

    result = self.pPool.createAndQueueTask(executeJob,
                                           args=(executableFile, proxy, self.taskID),
                                           kwargs=kwargs,
                                           taskID=self.taskID,
                                           usePoolCallbacks=True)
    # book the processors under the same taskID used for the task
    self.processorsPerTask[self.taskID] = processorsForJob
    self.taskID += 1

    self.pPool.processResults()

    return result

  def _getProcessorsForJobs(self, kwargs):
    """ helper function

    Decide how many processors to allocate for a submission described by
    ``kwargs``; returns 0 when the request cannot be satisfied.
    """
    processorsInUse = self.getProcessorsInUse()
    availableProcessors = self.processors - processorsInUse

    # Does this ask for MP?
    if not kwargs.get('mpTag', False):
      if availableProcessors:
        return 1
      else:
        return 0

    # From here we assume the job is asking for MP
    if kwargs.get('wholeNode', False):
      if processorsInUse > 0:
        return 0
      else:
        requestedProcessors = self.processors

    if "numberOfProcessors" in kwargs:
      requestedProcessors = int(kwargs['numberOfProcessors'])
    else:
      requestedProcessors = 1

    if availableProcessors < requestedProcessors:
      return 0

    # If there's a maximum number of processors allowed for the job, use that as maximum,
    # otherwise it will use all the remaining processors
    if 'maxNumberOfProcessors' in kwargs and kwargs['maxNumberOfProcessors']:
      maxNumberOfProcessors = min(int(kwargs['maxNumberOfProcessors']), availableProcessors)
    else:
      maxNumberOfProcessors = availableProcessors

    return maxNumberOfProcessors

  def finalizeJob(self, taskID, result):
    """ Finalize the job by updating the process utilisation counters

    :param int taskID: local PoolCE task ID
    :param dict result: result of the job execution
    """
    nProc = self.processorsPerTask.pop(taskID)
    if result['OK']:
      self.log.info('Task %d finished successfully, %d processor(s) freed' % (taskID, nProc))
    else:
      self.log.error("Task failed submission", "%d, message: %s" % (taskID, result['Message']))

  #############################################################################
  def getCEStatus(self, jobIDList=None):
    """ Method to return information on running and pending jobs.

    :return: dictionary of numbers of jobs per status
    """
    if self.pPool is None:
      self.pPool = ProcessPool(minSize=self.processors,
                               maxSize=self.processors,
                               poolCallback=self.finalizeJob)

    self.pPool.processResults()
    result = S_OK()
    result['SubmittedJobs'] = 0
    nJobs = 0
    for _j, value in self.processorsPerTask.iteritems():
      if value > 0:
        nJobs += 1
    result['RunningJobs'] = nJobs
    result['WaitingJobs'] = 0
    processorsInUse = self.getProcessorsInUse()
    result['UsedProcessors'] = processorsInUse
    result['AvailableProcessors'] = self.processors - processorsInUse
    return result

  def getDescription(self):
    """ Get a list of CEs descriptions (each is a dict)

        This is called by the JobAgent.
    """
    result = super(PoolComputingElement, self).getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']

    ceDictList = []
    if self.ceParameters.get('MultiProcessorStrategy'):
      strategyRequiredTags = []
      if not ceDict.get("ProcessorsInUse", 0):
        # We are starting from a clean page, try to get the most demanding
        # jobs first
        strategyRequiredTags.append(['WholeNode'])
      processors = ceDict.get('NumberOfProcessors', 0)
      if processors > 1:
        # We have several processors at hand, try to use most of them
        strategyRequiredTags.append(['%dProcessors' % processors])
      # Well, at least jobs with some processors requirement
      strategyRequiredTags.append(['MultiProcessor'])

      for strat in strategyRequiredTags:
        newCEDict = dict(ceDict)
        newCEDict.setdefault("RequiredTag", []).extend(strat)
        ceDictList.append(newCEDict)

    # Do not require anything special if nothing else was lucky
    ceDictList.append(dict(ceDict))

    return S_OK(ceDictList)

  #############################################################################
  def monitorProxy(self, pilotProxy, payloadProxy):
    """ Monitor the payload proxy and renew as necessary.

    :param str pilotProxy: location of the pilotProxy
    :param str payloadProxy: location of the payloadProxy
    """
    return self._monitorProxy(pilotProxy, payloadProxy)
class TaskTimeOutTests(unittest.TestCase):
    """
    .. class:: TaskTimeOutTests

    Test case for ProcessPool task time-outs.
    """

    def setUp(self):
        """Create a daemonized ProcessPool wired to pool-level callbacks.

        :param self: self reference
        """
        gLogger.showHeaders(True)
        self.log = gLogger.getSubLogger(self.__class__.__name__)
        self.processPool = ProcessPool(2, 4, 8,
                                       poolCallback=self.poolCallback,
                                       poolExceptionCallback=self.poolExceptionCallback)
        self.processPool.daemonize()

    def poolCallback(self, taskID, taskResult):
        """Pool-level result callback: log what came back."""
        self.log.always("callback result for %s is %s" % (taskID, taskResult))

    def poolExceptionCallback(self, taskID, taskException):
        """Pool-level exception callback: log the exception."""
        self.log.always("callback exception for %s is %s" % (taskID, taskException))

    def testCallableClass(self):
        """CallableClass and task time out test"""
        i = 0
        while True:
            if self.processPool.getFreeSlots() > 0:
                timeWait = random.randint(0, 5) * 10
                # a zero wait doubles as the "raise an exception" case
                raiseException = not timeWait
                result = self.processPool.createAndQueueTask(CallableClass,
                                                             taskID=i,
                                                             args=(i, timeWait, raiseException),
                                                             timeOut=15,
                                                             usePoolCallbacks=True,
                                                             blocking=True)
                if result["OK"]:
                    self.log.always("CallableClass enqueued to task %s timeWait=%s exception=%s" %
                                    (i, timeWait, raiseException))
                    i += 1
                else:
                    continue
            if i == 16:
                break
        self.processPool.finalize(2)

    def testCallableFunc(self):
        """CallableFunc and task timeout test"""
        i = 0
        while True:
            if self.processPool.getFreeSlots() > 0:
                timeWait = random.randint(0, 5) * 5
                raiseException = not timeWait
                result = self.processPool.createAndQueueTask(CallableFunc,
                                                             taskID=i,
                                                             args=(i, timeWait, raiseException),
                                                             timeOut=15,
                                                             usePoolCallbacks=True,
                                                             blocking=True)
                if result["OK"]:
                    self.log.always("CallableFunc enqueued to task %s timeWait=%s exception=%s" %
                                    (i, timeWait, raiseException))
                    i += 1
                else:
                    continue
            if i == 16:
                break
        self.processPool.finalize(2)

    def testLockedClass(self):
        """LockedCallableClass and task time out test"""
        import time  # local import: the file's import block is outside this chunk

        for loop in range(2):
            self.log.always("loop %s" % loop)
            i = 0
            while i < 16:
                if self.processPool.getFreeSlots() > 0:
                    # timeWait in {0, 5, 10, 15, 20, 25}; 5 triggers the exception
                    # path, >= 20 exercises the lock-holding class
                    timeWait = random.randint(0, 5) * 5
                    raiseException = (timeWait == 5)
                    klass = CallableClass
                    if timeWait >= 20:
                        klass = LockedCallableClass
                    result = self.processPool.createAndQueueTask(klass,
                                                                 taskID=i,
                                                                 args=(i, timeWait, raiseException),
                                                                 timeOut=15,
                                                                 usePoolCallbacks=True,
                                                                 blocking=True)
                    if result["OK"]:
                        self.log.always("%s enqueued to task %s timeWait=%s exception=%s" %
                                        (klass.__name__, i, timeWait, raiseException))
                        i += 1
                    else:
                        continue
            self.log.always("being idle for a while")
            # BUGFIX: the original "idle" phase was a CPU-burning nested busy
            # loop (range(100000) x range(1000)); sleep instead of spinning
            time.sleep(5)

        self.log.always("finalizing...")
        self.processPool.finalize(10)
        # unlock
        gLock.release()
class PoolComputingElement(ComputingElement):
    """Computing element that runs payloads locally through a ProcessPool,
    keeping track of how many processor slots each task occupies.
    """

    mandatoryParameters = MandatoryParameters

    #############################################################################
    def __init__(self, ceUniqueID):
        """Standard constructor.

        :param str ceUniqueID: unique name of this CE instance
        """
        ComputingElement.__init__(self, ceUniqueID)
        self.ceType = "Pool"
        self.log = gLogger.getSubLogger('Pool')
        self.submittedJobs = 0
        self.processors = 1          # total processor slots managed by this CE
        self.pPool = None            # lazily created ProcessPool
        self.taskID = 0              # monotonically increasing local task counter
        self.processorsPerTask = {}  # taskID -> number of processors allocated
        self.userNumberPerTask = {}  # taskID -> sudo user number (SudoExecution mode)
        self.useSudo = False

    #############################################################################
    def _addCEConfigDefaults(self):
        """Method to make sure all necessary Configuration Parameters are defined."""
        # First assure that any global parameters are loaded
        ComputingElement._addCEConfigDefaults(self)

    def _reset(self):
        """Update internal variables after some extra parameters are added.

        :return: None
        """
        self.processors = int(self.ceParameters.get('NumberOfProcessors', self.processors))
        self.ceParameters['MaxTotalJobs'] = self.processors
        # NOTE(review): a CS string value like "False" would be truthy here - confirm
        self.useSudo = self.ceParameters.get('SudoExecution', False)

    def getProcessorsInUse(self):
        """Get the number of currently allocated processor cores.

        :return: number of processor cores in use
        """
        return sum(self.processorsPerTask.values())

    #############################################################################
    def submitJob(self, executableFile, proxy, **kwargs):
        """Method to submit job.

        :param str executableFile: location of the executable file
        :param str proxy: payload proxy
        :return: S_OK/S_ERROR of the result of the job submission
        """
        if self.pPool is None:
            self.pPool = ProcessPool(minSize=self.processors,
                                     maxSize=self.processors,
                                     poolCallback=self.finalizeJob)

        self.pPool.processResults()

        processorsInUse = self.getProcessorsInUse()
        if kwargs.get('wholeNode'):
            if processorsInUse > 0:
                return S_ERROR('Can not take WholeNode job')  # , %d/%d slots used' % (self.slotsInUse,self.slots) )
            else:
                requestedProcessors = self.processors
        elif "numberOfProcessors" in kwargs:
            requestedProcessors = int(kwargs['numberOfProcessors'])
            if requestedProcessors > 0:
                if (processorsInUse + requestedProcessors) > self.processors:
                    return S_ERROR('Not enough slots: requested %d, available %d'
                                   % (requestedProcessors, self.processors - processorsInUse))
        else:
            requestedProcessors = 1
        if self.processors - processorsInUse < requestedProcessors:
            return S_ERROR('Not enough slots: requested %d, available %d'
                           % (requestedProcessors, self.processors - processorsInUse))

        ret = getProxyInfo()
        if not ret['OK']:
            pilotProxy = None
        else:
            pilotProxy = ret['Value']['path']
        self.log.notice('Pilot Proxy:', pilotProxy)

        # Rebind kwargs to the keyword arguments actually passed to the task;
        # the caller's kwargs have been fully consumed above
        kwargs = {'UseSudo': False}
        if self.useSudo:
            # pick the first sudo user number not already assigned to a task
            for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
                if nUser not in self.userNumberPerTask.values():
                    break
            kwargs['NUser'] = nUser
            kwargs['PayloadUser'] = os.environ['USER'] + 'p%s' % str(nUser).zfill(2)
            kwargs['UseSudo'] = True

        result = self.pPool.createAndQueueTask(executeJob,
                                               args=(executableFile, proxy, self.taskID),
                                               kwargs=kwargs,
                                               taskID=self.taskID,
                                               usePoolCallbacks=True)
        self.processorsPerTask[self.taskID] = requestedProcessors
        self.taskID += 1

        self.pPool.processResults()

        return result

    def finalizeJob(self, taskID, result):
        """Finalize the job by updating the process utilisation counters.

        :param int taskID: local PoolCE task ID
        :param dict result: result of the job execution
        """
        # pop() with a default: an unknown taskID must not raise inside a
        # ProcessPool callback (previously this was a bare pop -> KeyError)
        nProc = self.processorsPerTask.pop(taskID, 0)
        if result['OK']:
            self.log.info('Task %d finished successfully, %d processor(s) freed' % (taskID, nProc))
        else:
            self.log.error("Task failed submission", "%d, message: %s" % (taskID, result['Message']))

    #############################################################################
    def getCEStatus(self, jobIDList=None):
        """Method to return information on running and pending jobs.

        :return: S_OK structure with numbers of jobs per status
        """
        if self.pPool is None:
            self.pPool = ProcessPool(minSize=self.processors,
                                     maxSize=self.processors,
                                     poolCallback=self.finalizeJob)

        self.pPool.processResults()
        result = S_OK()
        result['SubmittedJobs'] = 0
        # A task holding at least one processor counts as one running job.
        # BUGFIX: .iteritems() is Python-2-only; .values() works on 2 and 3
        nJobs = sum(1 for value in self.processorsPerTask.values() if value > 0)
        result['RunningJobs'] = nJobs
        result['WaitingJobs'] = 0
        processorsInUse = self.getProcessorsInUse()
        result['UsedProcessors'] = processorsInUse
        result['AvailableProcessors'] = self.processors - processorsInUse
        return result

    def getDescription(self):
        """Get a list of CE descriptions (each is a dict), one per strategy tag set."""
        result = super(PoolComputingElement, self).getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        ceDictList = []
        if self.ceParameters.get('MultiProcessorStrategy'):
            strategyRequiredTags = []
            if not ceDict.get("ProcessorsInUse", 0):
                # We are starting from a clean page, try to get the most demanding
                # jobs first
                strategyRequiredTags.append(['WholeNode'])
            processors = ceDict.get('NumberOfProcessors', 0)
            if processors > 1:
                # We have several processors at hand, try to use most of them
                strategyRequiredTags.append(['%dProcessors' % processors])
            # Well, at least jobs with some processors requirement
            strategyRequiredTags.append(['MultiProcessor'])

            for strat in strategyRequiredTags:
                newCEDict = dict(ceDict)
                newCEDict.setdefault("RequiredTag", []).extend(strat)
                ceDictList.append(newCEDict)
        # Do not require anything special if nothing else was lucky
        ceDictList.append(dict(ceDict))

        return S_OK(ceDictList)

    #############################################################################
    def monitorProxy(self, pilotProxy, payloadProxy):
        """Monitor the payload proxy and renew as necessary.

        :param str pilotProxy: location of the pilotProxy
        :param str payloadProxy: location of the payloadProxy
        """
        return self._monitorProxy(pilotProxy, payloadProxy)
class RequestExecutingAgent(AgentModule):
    """
    .. class:: RequestExecutingAgent

    request processing agent using ProcessPool, Operation handlers and RequestTask
    """

    def __init__(self, *args, **kwargs):
        """c'tor"""
        # # call base class ctor
        super().__init__(*args, **kwargs)

        self.__processPool = None
        self.__requestCache = {}  # RequestID -> Request currently being executed
        self.__requestsPerCycle = REQUESTSPERCYCLE
        self.__minProcess = MINPROCESS
        self.__maxProcess = MAXPROCESS
        self.__queueSize = QUEUESIZE
        self.__fileTimeout = FILETIMEOUT
        self.__operationTimeout = OPERATIONTIMEOUT
        self.__poolTimeout = POOLTIMEOUT
        self.__poolSleep = POOLSLEEP
        self.__requestClient = None
        # Size of the bulk if use of getRequests. If 0, use getRequest
        self.__bulkRequest = 0
        self.__rmsMonitoring = False

    def processPool(self):
        """facade for ProcessPool: create on first call, reuse afterwards"""
        if not self.__processPool:
            minProcess = max(1, self.__minProcess)
            maxProcess = max(self.__minProcess, self.__maxProcess)
            queueSize = abs(self.__queueSize)
            self.log.info(
                "REA ProcessPool configuration",
                "minProcess = %d maxProcess = %d queueSize = %d" % (minProcess, maxProcess, queueSize),
            )
            self.__processPool = ProcessPool(
                minProcess,
                maxProcess,
                queueSize,
                poolCallback=self.resultCallback,
                poolExceptionCallback=self.exceptionCallback,
            )
            self.__processPool.daemonize()
        return self.__processPool

    def requestClient(self):
        """RequestClient getter (lazily created)"""
        if not self.__requestClient:
            self.__requestClient = ReqClient()
        return self.__requestClient

    def cacheRequest(self, request):
        """put request into requestCache

        :param ~Request.Request request: Request instance
        """
        maxProcess = max(self.__minProcess, self.__maxProcess)
        if len(self.__requestCache) > maxProcess + 50:
            # For the time being we just print a warning... If the ProcessPool is working well, this is not needed
            # We don't know how much is acceptable as it depends on many factors
            self.log.warn("Too many requests in cache", ": %d" % len(self.__requestCache))
        #       return S_ERROR( "Too many requests in cache" )

        if request.RequestID in self.__requestCache:
            # We don't call putRequest as we have got back the request that is still being executed. Better keep it
            # The main reason for this is that it lasted longer than the kick time of CleanReqAgent
            self.log.warn(
                "Duplicate request, keep it but don't execute", ": %d/%s" % (request.RequestID, request.RequestName)
            )
            return S_ERROR(errno.EALREADY, "Request already in cache")

        self.__requestCache[request.RequestID] = request

        return S_OK()

    def putRequest(self, requestID, taskResult=None):
        """put back :requestID: to RequestClient

        :param str requestID: request's id
        :param dict taskResult: optional task result (S_OK(Request)/S_ERROR)
        """
        if requestID in self.__requestCache:
            request = self.__requestCache.pop(requestID)
            if taskResult:
                if taskResult["OK"]:
                    request = taskResult["Value"]
                    # The RequestTask is putting back the Done tasks, no need to redo it
                    if request.Status == "Done":
                        return S_OK()
                # In case of timeout, we need to increment ourselves all the attempts
                elif cmpError(taskResult, errno.ETIME):
                    waitingOp = request.getWaiting()
                    for rmsFile in waitingOp.get("Value", []):
                        rmsFile.Attempt += 1

            reset = self.requestClient().putRequest(request, useFailoverProxy=False, retryMainService=2)
            if not reset["OK"]:
                return S_ERROR("putRequest: unable to reset request %s: %s" % (requestID, reset["Message"]))
        else:
            return S_ERROR("Not in cache")
        return S_OK()

    def putAllRequests(self):
        """put back all requests without callback called into requestClient

        :param self: self reference
        """
        self.log.info("putAllRequests: will put back requests", "%s" % len(self.__requestCache))
        # BUGFIX: putRequest() pops entries from the cache, so iterate over a
        # snapshot; iterating the live keys view raises RuntimeError in Python 3
        for requestID in list(self.__requestCache):
            reset = self.putRequest(requestID)
            if not reset["OK"]:
                self.log.error("Failed to put request", reset["Message"])
            else:
                self.log.debug("putAllRequests: request %s has been put back with its initial state" % requestID)
        return S_OK()

    def initialize(self):
        """initialize agent"""
        # # ProcessPool related stuff
        self.__requestsPerCycle = self.am_getOption("RequestsPerCycle", self.__requestsPerCycle)
        self.log.info("Requests/cycle = %d" % self.__requestsPerCycle)
        self.__minProcess = self.am_getOption("MinProcess", self.__minProcess)
        self.log.info("ProcessPool min process = %d" % self.__minProcess)
        self.__maxProcess = self.am_getOption("MaxProcess", self.__maxProcess)
        self.log.info("ProcessPool max process = %d" % self.__maxProcess)
        self.__queueSize = self.am_getOption("ProcessPoolQueueSize", self.__queueSize)
        self.log.info("ProcessPool queue size = %d" % self.__queueSize)
        self.__poolTimeout = int(self.am_getOption("ProcessPoolTimeout", self.__poolTimeout))
        self.log.info("ProcessPool timeout = %d seconds" % self.__poolTimeout)
        self.__poolSleep = int(self.am_getOption("ProcessPoolSleep", self.__poolSleep))
        self.log.info("ProcessPool sleep time = %d seconds" % self.__poolSleep)
        self.__bulkRequest = self.am_getOption("BulkRequest", self.__bulkRequest)
        self.log.info("Bulk request size = %d" % self.__bulkRequest)
        # Check if monitoring is enabled
        if "Monitoring" in Operations().getMonitoringBackends(monitoringType="RMSMonitoring"):
            # Enable RMS monitoring
            self.__rmsMonitoring = True
        self.log.info("Enable ES RMS Monitoring = %s" % self.__rmsMonitoring)

        # # keep config path and agent name
        self.agentName = self.am_getModuleParam("fullName")
        self.__configPath = PathFinder.getAgentSection(self.agentName)

        # # operation handlers over here
        opHandlersPath = "%s/%s" % (self.__configPath, "OperationHandlers")
        opHandlers = gConfig.getSections(opHandlersPath)
        if not opHandlers["OK"]:
            self.log.error(opHandlers["Message"])
            raise AgentConfigError("OperationHandlers section not found in CS under %s" % self.__configPath)
        opHandlers = opHandlers["Value"]

        self.timeOuts = dict()

        # # handlers dict
        self.handlersDict = dict()
        for opHandler in opHandlers:
            opHandlerPath = "%s/%s/Location" % (opHandlersPath, opHandler)
            opLocation = gConfig.getValue(opHandlerPath, "")
            if not opLocation:
                self.log.error("%s not set for %s operation handler" % (opHandlerPath, opHandler))
                continue
            self.timeOuts[opHandler] = {"PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout}

            opTimeout = gConfig.getValue("%s/%s/TimeOut" % (opHandlersPath, opHandler), 0)
            if opTimeout:
                self.timeOuts[opHandler]["PerOperation"] = opTimeout
            fileTimeout = gConfig.getValue("%s/%s/TimeOutPerFile" % (opHandlersPath, opHandler), 0)
            if fileTimeout:
                self.timeOuts[opHandler]["PerFile"] = fileTimeout

            self.handlersDict[opHandler] = opLocation

        self.log.info("Operation handlers:")
        # tuple unpacking instead of the former item[1][0]/item[1][1] indexing
        for i, (opHandler, opLocation) in enumerate(self.handlersDict.items()):
            self.log.info(
                "[%s] %s: %s (timeout: %d s + %d s per file)"
                % (
                    i,
                    opHandler,
                    opLocation,
                    self.timeOuts[opHandler]["PerOperation"],
                    self.timeOuts[opHandler]["PerFile"],
                )
            )

        if self.__rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
            gThreadScheduler.addPeriodicTask(100, self.__rmsMonitoringReporting)

        # # create request dict
        self.__requestCache = dict()

        return S_OK()

    def execute(self):
        """read requests from RequestClient and enqueue them into ProcessPool"""
        # # requests (and so tasks) counter
        taskCounter = 0
        while taskCounter < self.__requestsPerCycle:
            self.log.debug("execute: executing %d request in this cycle" % taskCounter)

            requestsToExecute = []

            if not self.__bulkRequest:
                self.log.info("execute: ask for a single request")
                getRequest = self.requestClient().getRequest()
                if not getRequest["OK"]:
                    self.log.error("execute:", "%s" % getRequest["Message"])
                    break
                if not getRequest["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                requestsToExecute = [getRequest["Value"]]
            else:
                numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
                self.log.info("execute: ask for requests", "%s" % numberOfRequest)
                getRequests = self.requestClient().getBulkRequests(numberOfRequest)
                if not getRequests["OK"]:
                    self.log.error("execute:", "%s" % getRequests["Message"])
                    break
                if not getRequests["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                for rId in getRequests["Value"]["Failed"]:
                    self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])
                requestsToExecute = list(getRequests["Value"]["Successful"].values())

            self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute))

            for request in requestsToExecute:
                # # set task id
                taskID = request.RequestID

                self.log.info(
                    "processPool status",
                    "tasks idle = %s working = %s"
                    % (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()),
                )

                looping = 0
                while True:
                    if not self.processPool().getFreeSlots():
                        if not looping:
                            self.log.info(
                                "No free slots available in processPool",
                                "will wait %d seconds to proceed" % self.__poolSleep,
                            )
                        time.sleep(self.__poolSleep)
                        looping += 1
                    else:
                        if looping:
                            # BUGFIX: parenthesize the product - previously this read
                            # ("after %d seconds" % looping) * poolSleep, which
                            # *repeated* the message string poolSleep times
                            self.log.info("Free slot found", "after %d seconds" % (looping * self.__poolSleep))
                        looping = 0
                        # # save current request in cache
                        res = self.cacheRequest(request)
                        if not res["OK"]:
                            if cmpError(res, errno.EALREADY):
                                # The request is already in the cache, skip it.
                                # break out of the while loop to get next request
                                break
                            # There are too many requests in the cache, commit suicide
                            self.log.error(
                                "Too many requests in cache",
                                "(%d requests): put back all requests and exit cycle. Error %s"
                                % (len(self.__requestCache), res["Message"]),
                            )
                            self.putAllRequests()
                            return res
                        # # serialize to JSON
                        result = request.toJSON()
                        if not result["OK"]:
                            continue
                        requestJSON = result["Value"]
                        self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName))
                        timeOut = self.getTimeout(request)
                        enqueue = self.processPool().createAndQueueTask(
                            RequestTask,
                            kwargs={
                                "requestJSON": requestJSON,
                                "handlersDict": self.handlersDict,
                                "csPath": self.__configPath,
                                "agentName": self.agentName,
                                "rmsMonitoring": self.__rmsMonitoring,
                            },
                            taskID=taskID,
                            blocking=True,
                            usePoolCallbacks=True,
                            timeOut=timeOut,
                        )
                        if not enqueue["OK"]:
                            self.log.error("Could not enqueue task", enqueue["Message"])
                        else:
                            self.log.debug("successfully enqueued task", "'%s'" % taskID)
                            # # update monitor
                            if self.__rmsMonitoring:
                                self.rmsMonitoringReporter.addRecord(
                                    {
                                        "timestamp": int(TimeUtilities.toEpoch()),
                                        "host": Network.getFQDN(),
                                        "objectType": "Request",
                                        "status": "Attempted",
                                        "objectID": request.RequestID,
                                        "nbObject": 1,
                                    }
                                )
                            # # update request counter
                            taskCounter += 1
                            # # task created, a little time kick to proceed
                            time.sleep(0.1)
                            break

        self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
        processed = self.processPool().processResults()
        # This happens when the result queue is screwed up.
        # Returning S_ERROR proved not to be sufficient,
        # and when in this situation, there is nothing we can do.
        # So we just exit. runit will restart from scratch.
        if processed < 0:
            self.log.fatal("Results queue is screwed up")
            sys.exit(1)
        # # clean return
        return S_OK()

    def getTimeout(self, request):
        """get timeout for request: per-operation plus per-file shares for every
        still-pending operation"""
        timeout = 0
        for op in request:
            if op.Status not in ("Waiting", "Scheduled", "Queued"):
                continue
            if op.Type not in self.timeOuts:
                timeout += self.__operationTimeout
            else:
                perOp = self.timeOuts[op.Type].get("PerOperation", self.__operationTimeout)
                perFiles = self.timeOuts[op.Type].get("PerFile", self.__fileTimeout) * len(op)
                timeout += perOp + perFiles
        self.log.info(
            "estimated timeOut for request", "(%s/%s) is %s" % (request.RequestID, request.RequestName, timeout)
        )
        return timeout

    def finalize(self):
        """agent finalization: drain the pool and put back cached requests"""
        if self.__processPool:
            self.processPool().finalize(timeout=self.__poolTimeout)
        self.putAllRequests()
        return S_OK()

    def resultCallback(self, taskID, taskResult):
        """definition of request callback function

        :param str taskID: Request.RequestID
        :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
        """
        # # clean cache
        res = self.putRequest(taskID, taskResult)
        self.log.info(
            "callback:",
            "%s result is %s(%s), put %s(%s)"
            % (
                taskID,
                "S_OK" if taskResult["OK"] else "S_ERROR",
                taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"],
                "S_OK" if res["OK"] else "S_ERROR",
                "" if res["OK"] else res["Message"],
            ),
        )

    def exceptionCallback(self, taskID, taskException):
        """definition of exception callback function

        :param str taskID: Request.RequestID
        :param Exception taskException: Exception instance
        """
        self.log.error("exceptionCallback:", "%s was hit by exception %s" % (taskID, taskException))
        self.putRequest(taskID)

    def __rmsMonitoringReporting(self):
        """This method is called by the ThreadScheduler as a periodic task in order to commit the collected data
        which is done by the MonitoringReporter and is sent to the 'RMSMonitoring' type.

        :return: True / False
        """
        result = self.rmsMonitoringReporter.commit()
        return result["OK"]