class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed."""

        def createLogdir(dirname):
            """Create the directory dirname, ignoring errors in case it exists.
            Exit if the directory cannot be created."""
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists" error
                    print(str(ose))
                    print("The task worker needs to access the '%s' directory" % dirname)
                    sys.exit(1)

        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single logging instruction
               is propagated to it (not really nice to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if a verbose logger is needed
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing, logging to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s." % os.getpid())
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")

        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry=2)
        self.logger.debug("Hostcert: %s, hostkey: %s" % (str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey)))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]

    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die
           if the server endpoint is not available.
           Prints a log entry if the answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
        except HTTPException as hte:
            #Using a msg variable and only one self.logger.error so that messages do not get shuffled
            msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                msg += "Task Worker could not update work to the server: \n" +\
                       "\tstatus: %s\n" % (hte.headers.get('X-Error-Http', 'unknown')) +\
                       "\treason: %s\n" % (hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "Probably no task to be updated\n"
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                msg += "TW could not update work to the server:\n"
                msg += "%s \n" % (str(traceback.format_exc()))
                msg += "\turl: %s\n" % (getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" % (getattr(hte, 'result', 'unknown'))
            self.logger.error(msg)
        except Exception as exc:
            msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
            self.logger.error(msg + traceback.format_exc())
        return True

    def _getWork(self, limit, getstatus):
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data=configreq)[0]['result']
        except HTTPException as hte:
            self.logger.error("HTTP Error during _getWork: %s" % str(hte))
            self.logger.error("Could not get any work from the server: \n" +
                              "\tstatus: %s\n" % (hte.headers.get('X-Error-Http', 'unknown')) +
                              "\treason: %s" % (hte.headers.get('X-Error-Detail', 'unknown')))
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                self.logger.error("Server could not acquire any work from the server:")
                self.logger.error("%s " % (str(traceback.format_exc())))
                self.logger.error("\turl: %s\n" % (getattr(hte, 'url', 'unknown')))
                self.logger.error("\tresult: %s\n" % (getattr(hte, 'result', 'unknown')))
        except Exception as exc:
            self.logger.error("Server could not process the request: %s" % (str(exc)))
            self.logger.error(traceback.format_exc())
        return pendingwork

    def quit(self, code, traceback_):
        self.logger.info("Received kill request. Waiting for the workers...")
        self.STOP = True

    def updateWork(self, taskname, status):
        configreq = {'workflow': taskname, 'status': status, 'subresource': 'state'}
        retry = True
        while retry:
            try:
                self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
                retry = False
            except HTTPException as hte:
                #Using a msg variable and only one self.logger.error so that messages do not get shuffled
                msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
                msg += "\tstatus: %s\n" % (hte.headers.get('X-Error-Http', 'unknown'))
                msg += "\treason: %s\n" % (hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "\turl: %s\n" % (getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" % (getattr(hte, 'result', 'unknown'))
                msg += "%s \n" % (str(traceback.format_exc()))
                self.logger.error(msg)
                retry = False
                if int(hte.headers.get('X-Error-Http', '0')) == 503:
                    #503 - Database/Service unavailable. Maybe an intervention on CMSWEB is ongoing?
                    retry = True
                    time_sleep = 30 + random.randint(10, 30)
                    self.logger.info("Sleeping %s seconds and will try to update again." % str(time_sleep))
                    time.sleep(time_sleep)
            except Exception as exc:
                msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
                self.logger.error(msg + traceback.format_exc())
                retry = False

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""
        self.logger.debug("Starting")
        while not self.STOP:
            for status, worktype, failstatus in states():
                limit = self.slaves.queueableTasks()
                if not self._lockWork(limit=limit, getstatus=status, setstatus='HOLDING'):
                    continue
                ## Warning: If we fail to retrieve tasks on HOLDING (e.g. because cmsweb is down)
                ## we may end up executing the wrong worktype later on. A solution would be to
                ## save the previous task state in a new column of the TaskDB.
                pendingwork = self._getWork(limit=limit, getstatus='HOLDING')
                self.logger.info("Retrieved a total of %d %s works" % (len(pendingwork), worktype))
                self.logger.debug("Retrieved the following works: \n%s" % (str(pendingwork)))
                self.slaves.injectWorks([(worktype, task, failstatus, None) for task in pendingwork])
                for task in pendingwork:
                    self.updateWork(task['tm_taskname'], 'QUEUED')
            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s" % (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_taskname': action.__module__}, 'FAILED', action.__module__)])
            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d' % self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d' % self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d' % self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)
            finished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle")
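# Illustration (not part of the original source): a minimal, self-contained sketch of the
# retry-with-jitter pattern used by updateWork() above. The names here (post_with_503_retry,
# the generic `post` callable, the `max_attempts` cap) are assumptions for the sketch; the
# real code retries only on an HTTPException carrying X-Error-Http == 503 and keeps retrying
# for as long as the server answers 503.
import logging
import random
import time


def post_with_503_retry(post, payload, max_attempts=5, logger=None):
    """Call post(payload); on failure sleep 30-60 seconds (same jitter as updateWork) and retry."""
    logger = logger or logging.getLogger(__name__)
    for attempt in range(1, max_attempts + 1):
        try:
            return post(payload)
        except Exception as exc:  # the real code narrows this to a 503 HTTPException
            if attempt == max_attempts:
                raise
            time_sleep = 30 + random.randint(10, 30)
            logger.info("Attempt %d failed (%s); sleeping %s seconds before retrying.", attempt, exc, time_sleep)
            time.sleep(time_sleep)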
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed."""

        def createLogdir(dirname):
            """Create the directory dirname, ignoring errors in case it exists.
            Exit if the directory cannot be created."""
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists" error
                    print(str(ose))
                    print("The task worker needs to access the '%s' directory" % dirname)
                    sys.exit(1)

        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single logging instruction
               is propagated to it (not really nice to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if a verbose logger is needed
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing, logging to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")

        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry=20, logger=self.logger)
        self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]

    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die
           if the server endpoint is not available.
           Prints a log entry if the answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _lockWork request (parameters are %s)", configreq)
            return False
        return True

    def _getWork(self, limit, getstatus):
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data=configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during _getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _getWork request (parameters are %s)", configreq)
        return pendingwork

    def quit_(self, dummyCode, dummyTraceback):
        self.logger.info("Received kill request. Setting STOP flag in the master process...")
        self.STOP = True

    def updateWork(self, taskname, command, status):
        configreq = {'workflow': taskname, 'command': command, 'status': status, 'subresource': 'state'}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the updateWork request (parameters are %s)", configreq)

    def failQueuedTasks(self):
        """ This method is used at TW startup and it fails QUEUED tasks that supposedly
            could not communicate with the REST and update their status. The method puts
            those tasks to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value
            of the command field.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self._getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Failing QUEUED task %s", task['tm_taskname'])
                dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            if not len(pendingwork):
                self.logger.info("Finished failing QUEUED tasks (total %s)", total)
                break #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info("Failed %s tasks (limit %s), getting next chunk of tasks", len(pendingwork), limit)

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""
        self.logger.debug("Failing QUEUED tasks before startup.")
        self.failQueuedTasks()
        self.logger.debug("Starting main loop.")
        while not self.STOP:
            limit = self.slaves.queueableTasks()
            if not self._lockWork(limit=limit, getstatus='NEW', setstatus='HOLDING'):
                continue
            pendingwork = self._getWork(limit=limit, getstatus='HOLDING')
            if len(pendingwork) > 0:
                self.logger.info("Retrieved a total of %d works", len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s", str(pendingwork))
            toInject = []
            for task in pendingwork:
                worktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                toInject.append((worktype, task, failstatus, None))
            for task in pendingwork:
                self.updateWork(task['tm_taskname'], task['tm_task_command'], 'QUEUED')
            self.slaves.injectWorks(toInject)
            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s", (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_taskname': action.__module__}, 'FAILED', action.__module__)])
            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d', self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d', self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)
            dummyFinished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle")
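# Illustration (not part of the original source): how a quit_-style handler is typically wired
# up with the stdlib signal module so that the polling loop in algorithm() can exit cleanly.
# The registration of quit_ is not shown in this listing, so the MiniMaster class, the
# signal.signal() calls and the 2-second sleep are assumptions made only to keep the sketch
# self-contained and runnable.
import signal
import time


class MiniMaster(object):
    def __init__(self):
        self.STOP = False

    def quit_(self, dummyCode, dummyTraceback):
        # signal handlers receive (signum, frame); we only flip the flag here
        print("Received kill request. Setting STOP flag in the master process...")
        self.STOP = True

    def algorithm(self):
        while not self.STOP:
            # ... acquire and inject work here ...
            time.sleep(2)


if __name__ == '__main__':
    master = MiniMaster()
    # SIGINT/SIGTERM flip the STOP flag; the loop notices it on the next iteration.
    signal.signal(signal.SIGINT, master.quit_)
    signal.signal(signal.SIGTERM, master.quit_)
    master.algorithm()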
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed."""

        def getLogging(quiet, debug):
            """Retrieves a logger and sets the proper level

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if a verbose logger is needed
            :return logger: a logger with the appropriate logger level."""

            if self.TEST:
                #if we are testing, logging to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('twlog.log', when="midnight")
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = logging.getLogger()
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = getLogging(quiet, debug)
        self.config = config
        restinstance = None
        self.resturl = '/crabserver/prod/workflowdb'
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            restinstance = MODEURL[self.config.TaskWorker.mode]['host']
            self.resturl = self.resturl.replace('prod', MODEURL[self.config.TaskWorker.mode]['instance'])
        else:
            restinstance = self.config.TaskWorker.resturl
            self.resturl = self.resturl.replace('prod', MODEURL[self.config.TaskWorker.mode]['instance'])
        if self.resturl is None or restinstance is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")

        self.server = HTTPRequests(restinstance, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey)
        self.logger.debug("Hostcert: %s, hostkey: %s" % (str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey)))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        if self.TEST:
            self.slaves = TestWorker(self.config, restinstance, self.resturl)
        else:
            self.slaves = Worker(self.config, restinstance, self.resturl)
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]

    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die
           if the server endpoint is not available.
           Prints a log entry if the answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.resturl, data=urllib.urlencode(configreq))
        except HTTPException as hte:
            #Using a msg variable and only one self.logger.error so that messages do not get shuffled
            msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                msg += "Task Worker could not update work to the server: \n" +\
                       "\tstatus: %s\n" % (hte.headers.get('X-Error-Http', 'unknown')) +\
                       "\treason: %s\n" % (hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "Probably no task to be updated\n"
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                msg += "TW could not update work to the server:\n"
                msg += "%s \n" % (str(traceback.format_exc()))
                msg += "\turl: %s\n" % (getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" % (getattr(hte, 'result', 'unknown'))
            self.logger.error(msg)
        except Exception as exc:
            msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
            self.logger.error(msg + traceback.format_exc())
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed."""

        def getLogging(quiet, debug):
            """Retrieves a logger and sets the proper level

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if a verbose logger is needed
            :return logger: a logger with the appropriate logger level."""

            if self.TEST:
                #if we are testing, logging to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('twlog.log', when="midnight")
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = logging.getLogger()
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.TEST = test
        self.logger = getLogging(quiet, debug)
        self.config = config
        restinstance = None
        self.resturl = '/crabserver/prod/workflowdb'
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            restinstance = MODEURL[self.config.TaskWorker.mode]['host']
            self.resturl = self.resturl.replace('prod', MODEURL[self.config.TaskWorker.mode]['instance'])
        else:
            restinstance = self.config.TaskWorker.resturl
            self.resturl = self.resturl.replace('prod', MODEURL[self.config.TaskWorker.mode]['instance'])
        if self.resturl is None or restinstance is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")

        self.server = HTTPRequests(restinstance, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, version=__version__)
        self.logger.debug("Hostcert: %s, hostkey: %s" % (str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey)))
        if self.TEST:
            self.slaves = TestWorker(self.config, restinstance, self.resturl)
        else:
            self.slaves = Worker(self.config, restinstance, self.resturl)
        self.slaves.begin()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die
           if the server endpoint is not available.
           Prints a log entry if the answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.resturl, data=urllib.urlencode(configreq))
        except HTTPException as hte:
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                self.logger.error("Server could not acquire any work from the server: \n" +
                                  "\tstatus: %s\n" % (hte.headers.get('X-Error-Http', 'unknown')) +
                                  "\treason: %s" % (hte.headers.get('X-Error-Detail', 'unknown')))
                self.logger.error("Probably no task to be processed")
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                self.logger.error("Server could not acquire any work from the server:")
                self.logger.error("%s " % (str(traceback.format_exc())))
                self.logger.error("\turl: %s\n" % (getattr(hte, 'url', 'unknown')))
                self.logger.error("\tresult: %s\n" % (getattr(hte, 'result', 'unknown')))
        except Exception as exc:
            self.logger.error("Server could not process the request: %s" % (str(exc)))
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, logWarning, logDebug, sequential=False, console=False, name='master'):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool logWarning: it tells if a quiet logger is needed
        :arg bool logDebug: it tells if a verbose logger is needed
        :arg bool sequential: it tells if to run in sequential (no subprocesses) mode
        :arg bool console: it tells if to log to console
        :arg string name: defines a name for the log of this master process"""

        def createLogdir(dirname):
            """Create the directory dirname, ignoring errors in case it exists.
            Exit if the directory cannot be created."""
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists" error
                    print(str(ose))
                    print("The task worker needs to access the '%s' directory" % dirname)
                    sys.exit(1)

        def createAndCleanLogDirectories(logsDir):
            # it can be named with the time stamp a TW started
            createLogdir(logsDir)
            createLogdir(logsDir + '/tasks')
            currentProcessesDir = logsDir + '/processes/'
            createLogdir(currentProcessesDir)
            # when running inside a container, process logs will start with the same
            # process numbers, i.e. same name, at any container restart.
            # to avoid clashes and confusion, we will put away all previous processes
            # logs when a TW instance starts. To this goal each TW which runs
            # creates a directory where new containers will move its logs, so
            # identify LastLogs_timestamp directory
            latestLogDir = None  # the logs directory could be empty
            files = os.listdir(currentProcessesDir)
            files.sort(reverse=True)  # if there are multiple Latest*, will hit the latest first
            for f in files:
                if f.startswith('Latest'):
                    latestLogDir = currentProcessesDir + f
                    break
            if files and latestLogDir:
                # rename from Latest to Old
                oldLogsDir = latestLogDir.replace('Latest', 'Old')
                shutil.move(latestLogDir, oldLogsDir)
            else:
                print("LatestLogDir not found in logs/processes, create a dummy dir to store old files")
                oldLogsDir = currentProcessesDir + 'OldLog-Unknown'
                createLogdir(oldLogsDir)
            # move process logs for latest TW run to old directory
            for f in files:
                if f.startswith('proc.c3id'):
                    shutil.move(currentProcessesDir + f, oldLogsDir)
            # create a new LatestLogs directory where to store logs from this TaskWorker
            YYMMDD_HHMMSS = time.strftime('%y%m%d_%H%M%S', time.localtime())
            myDir = currentProcessesDir + 'LatestLogs-' + YYMMDD_HHMMSS
            createLogdir(myDir)

        def setRootLogger(logWarning, logDebug, console, name):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logsDir/twlog.txt and every single logging instruction
               is propagated to it (not really nice to read)

            :arg bool logWarning: it tells if a quiet logger is needed
            :arg bool logDebug: it tells if a verbose logger is needed
            :arg bool console: it tells if to log to console
            :arg string name: defines a name for the log file of this master process
            :return logger: a logger with the appropriate logger level."""

            # this must only be done for the real Master, not when it is used by TapeRecallStatus
            logsDir = config.TaskWorker.logsDir
            if name == 'master':
                createAndCleanLogDirectories(logsDir)

            if console:
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog(logsDir + '/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if logWarning:
                loglevel = logging.WARNING
            if logDebug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger(name, logsDir)
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.STOP = False
        self.TEST = sequential
        self.logger = setRootLogger(logWarning, logDebug, console, name)
        self.config = config
        self.restHost = None
        dbInstance = None

        try:
            instance = self.config.TaskWorker.instance
        except:
            msg = "No instance provided: need to specify config.TaskWorker.instance in the configuration"
            raise ConfigException(msg)

        if instance in SERVICE_INSTANCES:
            self.logger.info('Will connect to CRAB service: %s', instance)
            self.restHost = SERVICE_INSTANCES[instance]['restHost']
            dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        else:
            msg = "Invalid instance value '%s'" % instance
            raise ConfigException(msg)
        if instance == 'other':
            self.logger.info('Will use restHost and dbInstance from config file')
            try:
                self.restHost = self.config.TaskWorker.restHost
                dbInstance = self.config.TaskWorker.dbInstance
            except:
                msg = "Need to specify config.TaskWorker.restHost and dbInstance in the configuration"
                raise ConfigException(msg)

        self.restURInoAPI = '/crabserver/' + dbInstance
        self.logger.info('Will connect via URL: https://%s/%s', self.restHost, self.restURInoAPI)

        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.server = HTTPRequests(self.restHost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry=20, logger=self.logger)
        self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        # use the config to pass some useful global stuff to all workers
        # will use TaskWorker.cmscert/key to talk with CMSWEB
        self.config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=self.config.TaskWorker.cmscert,
                                                         X509_USER_KEY=self.config.TaskWorker.cmskey)

        if self.TEST:
            self.slaves = TestWorker(self.config, self.restHost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, self.restHost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]

    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)(self.config.TaskWorker.logsDir)

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die
           if the server endpoint is not available.
           Prints a log entry if the answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _lockWork request (parameters are %s)", configreq)
            return False
        return True

    def getWork(self, limit, getstatus, ignoreTWName=False):
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        if ignoreTWName:
            configreq['workername'] = '%'
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data=configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the getWork request (parameters are %s)", configreq)
        return pendingwork

    def quit_(self, dummyCode, dummyTraceback):
        self.logger.info("Received kill request. Setting STOP flag in the master process...")
        self.STOP = True

    def updateWork(self, taskname, command, status):
        """ Update taskname setting the status and the command for it
            Return True if the change succeeded, False otherwise
        """
        configreq = {'workflow': taskname, 'command': command, 'status': status, 'subresource': 'state'}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the updateWork request (parameters are %s)", configreq)
        else:
            return True  #success
        return False  #failure

    def failQueuedTasks(self):
        """ This method is used at TW startup and it fails QUEUED tasks that supposedly
            could not communicate with the REST and update their status. The method puts
            those tasks to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value
            of the command field.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self.getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Failing QUEUED task %s", task['tm_taskname'])
                if task['tm_task_command']:
                    dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                else:
                    failstatus = 'FAILED'
                self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            if not len(pendingwork):
                self.logger.info("Finished failing QUEUED tasks (total %s)", total)
                break #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info("Failed %s tasks (limit %s), getting next chunk of tasks", len(pendingwork), limit)

    def failBannedTask(self, task):
        """ This method is used at TW startup and it fails NEW tasks which I do not like.
            The method puts those tasks to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending
            on the value of the command field.
            Initial implementation bans based on a list of usernames; other task attributes
            can be checked if needed by adding a bit of code.
            Returns:
                True : if the task was declared bad and was failed
                False: for normal (good) tasks
        """
        bannedUsernames = getattr(self.config.TaskWorker, 'bannedUsernames', [])
        if task['tm_username'] in bannedUsernames:
            self.logger.debug("Forcefully failing task %s", task['tm_taskname'])
            if task['tm_task_command']:
                dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
            else:
                failstatus = 'FAILED'
            self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            # TODO look into logging a message for the user
            return True
        return False

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""
        self.logger.debug("Failing QUEUED tasks before startup.")
        self.failQueuedTasks()
        self.logger.debug("Master Worker Starting Main Cycle.")
        while not self.STOP:
            limit = self.slaves.queueableTasks()
            if not self._lockWork(limit=limit, getstatus='NEW', setstatus='HOLDING'):
                time.sleep(self.config.TaskWorker.polling)
                continue
            pendingwork = self.getWork(limit=limit, getstatus='HOLDING')
            if len(pendingwork) > 0:
                self.logger.info("Retrieved a total of %d works", len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s", str(pendingwork))
            toInject = []
            for task in pendingwork:
                if self.failBannedTask(task):
                    continue
                if self.updateWork(task['tm_taskname'], task['tm_task_command'], 'QUEUED'):
                    worktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                    toInject.append((worktype, task, failstatus, None))
                else:
                    #The task stays in HOLDING and will be acquired again later
                    self.logger.info("Skipping %s since it could not be updated to QUEUED. Will be retried in the next iteration",
                                     task['tm_taskname'])
            self.slaves.injectWorks(toInject)
            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s", (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_username': '******', 'tm_taskname': action.__module__}, 'FAILED', action.__module__)])
            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d', self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d', self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)
            dummyFinished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle.")
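# Illustration (not part of the original source): a minimal, self-contained sketch of the
# log-directory rotation performed by createAndCleanLogDirectories() above. The helper name
# rotateProcessLogs and the default logsDir='logs' are assumptions for the sketch; the
# 'Latest'/'Old' prefixes, the 'proc.c3id' file prefix and the timestamp format mirror the
# code above. Requires Python 3 (os.makedirs with exist_ok).
import os
import shutil
import time


def rotateProcessLogs(logsDir='logs'):
    """Park the previous run's process logs in an Old* directory and create a fresh LatestLogs-<timestamp> one."""
    processesDir = os.path.join(logsDir, 'processes')
    os.makedirs(processesDir, exist_ok=True)
    entries = sorted(os.listdir(processesDir), reverse=True)
    # find the most recent LatestLogs-* directory, if any, and rename it to OldLogs-*
    latest = next((f for f in entries if f.startswith('Latest')), None)
    if latest:
        oldDir = os.path.join(processesDir, latest.replace('Latest', 'Old'))
        shutil.move(os.path.join(processesDir, latest), oldDir)
    else:
        oldDir = os.path.join(processesDir, 'OldLog-Unknown')
        os.makedirs(oldDir, exist_ok=True)
    # move the per-process logs of the previous run into the Old directory
    for f in entries:
        if f.startswith('proc.c3id'):
            shutil.move(os.path.join(processesDir, f), oldDir)
    # create a new LatestLogs-<timestamp> directory for this run
    stamp = time.strftime('%y%m%d_%H%M%S', time.localtime())
    newDir = os.path.join(processesDir, 'LatestLogs-' + stamp)
    os.makedirs(newDir, exist_ok=True)
    return newDir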