def __init__(self):
    """Create the 'LGI_Pilot' thread and report missing pilot job files.

    The paths stored under the ``PilotScript`` and ``PilotDist`` keys of the
    module-level ``config`` are checked on disk. A missing path is only
    logged as an error; construction still completes.
    """
    GangaThread.__init__(self, 'LGI_Pilot')
    self.log = getLogger('LGI.Pilot.Thread')

    # (config key, error-message prefix) for each file that must exist.
    required_files = (
        ('PilotScript', 'pilotjob script not found: '),
        ('PilotDist', 'pilotjob tarball not found: '),
    )
    for option, message in required_files:
        location = config[option]
        if not os.path.exists(location):
            self.log.error(message + location)
def startup(self):
    """Start the registry and launch the "GangaTasks" background thread.

    The base-class ``startup`` runs first. The new thread executes
    ``self._thread_main`` and is stored on ``self._main_thread`` before it
    is started, because the thread body reads that attribute.
    """
    super(TaskRegistry, self).startup()

    from Ganga.Core.GangaThread import GangaThread

    background = GangaThread(name="GangaTasks", target=self._thread_main)
    self._main_thread = background
    background.start()
def __init__(self, name):
    """Create the thread and reset its command-tracking state.

    The thread is created as critical unless the ``enable_multiThreadMon``
    option of the module-level ``config`` is set.

    :param name: thread name, also kept in ``self._thread_name``.
    """
    GangaThread.__init__(self, name, critical=not config['enable_multiThreadMon'])

    self._thread_name = name
    # Nothing is being executed yet.
    self._running_cmd = None
    self._running_args = None
    self._currently_running_command = False
def __init__(self, session_name, sdir, fn, repo, afs):
    """Create the critical 'SessionLockRefresher' thread.

    :param session_name: stored as ``self.session_name``.
    :param sdir: stored as ``self.sdir``.
    :param fn: first entry of the ``self.fns`` list.
    :param repo: first entry of the ``self.repos`` list.
    :param afs: stored as ``self.afs``.
    """
    GangaThread.__init__(self, name='SessionLockRefresher', critical=True)

    self.FileCheckTimes = {}
    self.afs = afs
    self.sdir = sdir
    self.session_name = session_name
    # Both lists start with the single entry given at construction time.
    self.repos = [repo]
    self.fns = [fn]
def __init__(self, session_name, sdir, fn, repo, afs):
    """Create the non-critical "SessionLockRefresher" thread.

    :param session_name: stored as ``self.session_name``.
    :param sdir: stored as ``self.sdir``.
    :param fn: first entry of the ``self.fns`` list.
    :param repo: first entry of the ``self.repos`` list.
    :param afs: stored as ``self.afs``.
    """
    GangaThread.__init__(self, name="SessionLockRefresher", critical=False)

    self.FileCheckTimes = {}
    self.afs = afs
    self.sdir = sdir
    self.session_name = session_name
    # Both lists start with the single entry given at construction time.
    self.repos = [repo]
    self.fns = [fn]
def resolve_file_locations(dataset, sites=None, cloud=None, token='ATLASDATADISK', debug=False):
    '''
    Summarize the locations of files (in terms of sitename) of a dataset.

    If the sites argument is given, the cloud and token arguments are
    ignored; otherwise cloud and token are used to retrieve the sites via
    dm_util.get_srmv2_sites.

    Up to 10 worker threads query the sites in parallel.

    @param dataset: the dataset whose file replicas are looked up
    @param sites: optional list of site names to query
    @param cloud: cloud passed to dm_util.get_srmv2_sites when sites is empty
    @param token: token passed to dm_util.get_srmv2_sites when sites is empty
    @param debug: debug flag passed to dm_util.get_srmv2_sites
    @return: dictionary mapping each file guid to the list of sites that
             reported a replica of it
    '''

    if not sites:
        logger.debug('resolving sites with token: %s' % token)
        sites = dm_util.get_srmv2_sites(cloud, token=token, debug=debug)

    logger.debug('checking replicas at sites: %s' % str(sites))

    replicas = {}

    # preparing the queue for querying lfn
    wq = Queue(len(sites))
    for site in sites:
        wq.put(site)

    mylock = Lock()

    def worker(worker_id):
        dq2 = DQ2()
        while not wq.empty():
            try:
                site = wq.get(block=True, timeout=1)
            except Empty:
                # another worker took the last item; re-check the queue
                continue

            try:
                replicaInfo = dq2.listFileReplicas(site, dataset)
            except DQException as err:
                logger.warning(str(err))
                logger.warning('site %s excluded' % site)
                continue

            # Check for an empty reply *before* indexing into it, so that
            # a site without replica information cannot kill the worker.
            if not replicaInfo:
                logger.debug('no replica information returned by site %s' % site)
                continue

            guids = replicaInfo[0]['content']
            logger.debug('resolving dataset files at %s, no files: %d' % (site, len(guids)))

            # 'with' guarantees the lock is released even on error
            with mylock:
                for guid in guids:
                    replicas.setdefault(guid, []).append(site)

    # never run more than 10 query threads
    nthread = min(len(sites), 10)

    threads = [GangaThread(name='stager_ds_w_%d' % i, target=worker, kwargs={'worker_id': i})
               for i in range(nthread)]

    for t in threads:
        t.start()

    for t in threads:
        t.join()

    return replicas
def __init__(self):
    """Create the 'LGI_Resource' thread and its LGI resource object.

    The ``PilotDist`` option of the ``LGI`` configuration section is read.
    If that path does not exist an error is logged, and ``self.res`` is
    still built from it.
    """
    GangaThread.__init__(self, 'LGI_Resource')
    self.log = getLogger('LGI.Resource.Thread')

    pilot_dist = Config.getConfig('LGI')['PilotDist']
    if not os.path.exists(pilot_dist):
        self.log.error('cannot connect to LGI server: pilotjob tarball not found: ' + pilot_dist)
    self.res = LGI.Resource(pilot_dist)

    # number of queued LGI jobs
    self.queued = None
def __init__(self, registry):
    """Build the "JobRegistry_Monitor" daemon thread and its callback hooks.

    The constructor initialises the loop state, installs the default
    job-status update function, registers one credential-check hook per
    entry of ``Credentials._allCredentials`` and one disk-space hook, and
    finally creates the synchronisation primitives used by the loop.

    :param registry: stored as ``self.registry``.
    """
    GangaThread.__init__(self, name="JobRegistry_Monitor")
    log.debug("Constructing JobRegistry_Monitor")
    self.setDaemon(True)

    self.registry = registry

    # --- loop state ---
    self.alive = True
    self.enabled = False
    # steps == -1 runs the monitoring loop continuously; a value > 0 runs
    # only that number of steps.
    self.steps = -1
    self.__sleepCounter = 0.0
    self.__updateTimeStamp = time.time()

    # --- callbacks, backends and error bookkeeping ---
    self.progressCallback = lambda x: None
    self.callbackHookDict = {}
    self.clientCallbackDict = {}
    self.activeBackends = {}
    self.updateJobStatus = None
    self.errors = {}

    # Create the default backend update method and add it to the callback hooks.
    self.makeUpdateJobStatusFunction()

    # Add credential checking to the monitoring loop.
    for credential in Credentials._allCredentials.itervalues():
        log.debug("Setting callback hook for %s" % getName(credential))
        self.setCallbackHook(
            self.makeCredCheckJobInsertor(credential),
            {},
            True,
            timeout=config['creds_poll_rate'])

    # Add low disk-space checking to the monitoring loop.
    log.debug("Setting callback hook for disk space checking")
    self.setCallbackHook(
        self.diskSpaceCheckJobInsertor,
        {},
        True,
        timeout=config['diskspace_poll_rate'])

    # --- synchronisation objects ---
    # main loop mutex
    self.__mainLoopCond = threading.Condition()
    # cleanup synchronisation
    self.__cleanUpEvent = threading.Event()
    # signals that an asynchronous run of monitoring steps has terminated
    self.__monStepsTerminatedEvent = threading.Event()
    # event used to break out of the job-list iterators; it starts out set
    iterator_break = threading.Event()
    iterator_break.set()
    self.stopIter = iterator_break

    self._runningNow = False
def __init__(self, registry):
    """Build the "JobRegistry_Monitor" daemon thread and its callback hooks.

    The constructor initialises the loop state and the synchronised update
    dictionary, installs the default job-status update function, registers
    one credential-check hook per entry of ``Credentials._allCredentials``
    and one disk-space hook, and finally creates the synchronisation
    primitives used by the loop.

    :param registry: stored as ``self.registry``.
    """
    GangaThread.__init__(self, name="JobRegistry_Monitor")
    log.debug("Constructing JobRegistry_Monitor")
    self.setDaemon(True)

    self.registry = registry

    # --- loop state ---
    self.alive = True
    self.enabled = False
    # steps == -1 runs the monitoring loop continuously; a value > 0 runs
    # only that number of steps.
    self.steps = -1
    self.__sleepCounter = 0.0
    self.__updateTimeStamp = time.time()

    # --- callbacks, backends and error bookkeeping ---
    self.progressCallback = lambda x: None
    self.callbackHookDict = {}
    self.clientCallbackDict = {}
    self.activeBackends = {}
    self.updateJobStatus = None
    self.errors = {}
    self.updateDict_ts = SynchronisedObject(UpdateDict())

    # Create the default backend update method and add it to the callback hooks.
    self.makeUpdateJobStatusFunction()

    # Add credential checking to the monitoring loop.
    for credential in Credentials._allCredentials.itervalues():
        log.debug("Setting callback hook for %s" % getName(credential))
        self.setCallbackHook(
            self.makeCredCheckJobInsertor(credential),
            {},
            True,
            timeout=config['creds_poll_rate'])

    # Add low disk-space checking to the monitoring loop.
    log.debug("Setting callback hook for disk space checking")
    self.setCallbackHook(
        self.diskSpaceCheckJobInsertor,
        {},
        True,
        timeout=config['diskspace_poll_rate'])

    # --- synchronisation objects ---
    # main loop mutex
    self.__mainLoopCond = threading.Condition()
    # cleanup synchronisation
    self.__cleanUpEvent = threading.Event()
    # signals that an asynchronous run of monitoring steps has terminated
    self.__monStepsTerminatedEvent = threading.Event()
    # event used to break out of the job-list iterators; it starts out set
    iterator_break = threading.Event()
    iterator_break.set()
    self.stopIter = iterator_break

    self._runningNow = False
def __resolve_containers(self, containers, nthreads=10):
    '''Resolve dataset containers into the datasets they hold.

    Each container is looked up with DQ2.listDatasetsInContainer by a pool
    of worker threads that share one work queue.

    @param containers: the dataset container names to resolve
    @param nthreads: number of worker threads to start
    @return: dictionary mapping each container name to the value returned
             by listDatasetsInContainer; the entry stays an empty list if
             that lookup raised a DQException
    '''

    datasets = {}

    # fill the shared work queue
    wq = Queue(len(containers))
    for container in containers:
        wq.put(container)

    mylock = Lock()

    def worker(id):
        dq2 = DQ2()
        while not wq.empty():
            try:
                ds = wq.get(block=True, timeout=1)
                logger.debug('worker id: %d on dataset container: %s' % (id, ds))

                # placeholder that survives a failed lookup
                datasets[ds] = []

                ds_tmp = dq2.listDatasetsInContainer(ds)

                with mylock:
                    datasets[ds] = ds_tmp
            except DQException as err:
                logger.warning(str(err))
            except Empty:
                pass

    profiler = ElapsedTimeProfiler(logger=logger)
    profiler.start()

    # create every worker first, then start them all, then wait for them all
    threads = [GangaThread(name='stager_ds_w_%d' % i, target=worker, kwargs={'id': i})
               for i in range(nthreads)]

    for t in threads:
        t.start()

    for t in threads:
        t.join()

    profiler.check('resolving %d containers' % len(containers))

    return datasets
def startup(self):
    """Start the registry, the "GangaTasks" thread and a registry flusher.

    The base-class ``startup`` runs first. The background thread executes
    ``self._thread_main`` and is stored on ``self._main_thread`` before it
    is started, because the thread body reads that attribute. A
    ``RegistryFlusher`` for this registry is then created and started.
    """
    super(TaskRegistry, self).startup()

    from Ganga.Core.GangaThread import GangaThread

    background = GangaThread(name="GangaTasks", target=self._thread_main)
    self._main_thread = background
    background.start()

    # create a registry flusher
    flusher = RegistryFlusher(self)
    self.flush_thread = flusher
    flusher.start()
def start(self):
    """Start the LGI stats thread unless the configuration disables it.

    The thread is not started when, in the ``LGI`` configuration section,
    ``StatsInterval`` equals zero, ``StatsFile`` is empty, or ``Enable``
    is exactly ``False``. The reason is logged at debug level.

    :return: ``False`` whenever the thread is not started (every skip path
        returns the same value, so callers can tell "skipped" apart from a
        start); otherwise the result of ``GangaThread.start``.
    """
    config = Config.getConfig("LGI")

    # The options are examined in this order; the first match wins.
    if config["StatsInterval"] == 0:
        reason = "[LGI]StatsInterval is zero"
    elif not config["StatsFile"]:
        reason = "[LGI]StatsFile is empty"
    elif config["Enable"] is False:
        reason = "[LGI]Enable is False"
    else:
        reason = None

    if reason is not None:
        self.log.debug("Not starting LGI stats thread because " + reason)
        return False

    return GangaThread.start(self)
def __init_worker_threads(self, num_worker_threads, worker_thread_prefix):
    """Create and start the pool's worker threads, once.

    If worker threads already exist a warning is logged, each existing
    thread is listed, and nothing is created.

    :param num_worker_threads: number of threads to create.
    :param worker_thread_prefix: thread names are this prefix followed by
        the thread's index.
    """
    existing = self.__worker_threads
    if len(existing) > 0:
        logger.warning("Threads already started!")
        for running in existing:
            logger.info("Worker Thread: %s is already running!" % running.gangaName)
        return

    for index in range(num_worker_threads):
        thread_name = worker_thread_prefix + str(index)
        t = GangaThread(name=thread_name,
                        auto_register=False,
                        target=self.__worker_thread)
        # Hand the thread object itself to the target as its only positional
        # argument (set through the name-mangled private attribute of
        # threading.Thread -- presumably Python 2 only; verify before porting).
        t._Thread__args = (t, )
        t._name = thread_name
        # initial status values of the new worker
        t._command = 'idle'
        t._timeout = 'N/A'
        t.start()
        existing.append(t)
def __init_worker_threads(self, num_worker_threads, worker_thread_prefix):
    """Create and start the pool's worker threads, once.

    If worker threads already exist a warning is logged, each existing
    thread is listed, and nothing is created.

    :param num_worker_threads: number of threads to create.
    :param worker_thread_prefix: thread names are this prefix followed by
        the thread's index.
    """
    existing = self.__worker_threads
    if len(existing) > 0:
        logger.warning("Threads already started!")
        for running in existing:
            logger.info("Worker Thread: %s is already running!" % running.gangaName)
        return

    for index in range(num_worker_threads):
        thread_name = worker_thread_prefix + str(index)
        t = GangaThread(name=thread_name,
                        auto_register=False,
                        target=self.__worker_thread)
        # Hand the thread object itself to the target as its only positional
        # argument (set through the name-mangled private attribute of
        # threading.Thread -- presumably Python 2 only; verify before porting).
        t._Thread__args = (t,)
        t._name = thread_name
        # initial status values of the new worker
        t._command = "idle"
        t._timeout = "N/A"
        t.start()
        existing.append(t)
class TaskRegistry(Registry):
    """Registry of tasks that is monitored by a background thread.

    ``startup`` launches the "GangaTasks" thread (running ``_thread_main``)
    and a ``RegistryFlusher``; ``stop`` stops and joins the former and
    ``shutdown`` joins the latter.
    """

    def __init__(self, name, doc):
        """Initialise the registry and build its cached slice and proxy."""
        super(TaskRegistry, self).__init__(name, doc)

        self._main_thread = None

        self.stored_slice = TaskRegistrySlice(self.name)
        self.stored_slice.objects = self
        self.stored_proxy = TaskRegistrySliceProxy(self.stored_slice)

    def getSlice(self):
        """Return the slice created in the constructor."""
        return self.stored_slice

    def getProxy(self):
        """Return the slice proxy created in the constructor."""
        return self.stored_proxy

    def getIndexCache(self, obj):
        """Build the index-cache dictionary for ``obj``.

        The result holds the 'status', 'id' and 'name' attributes that
        ``obj`` has, plus one ``"display:<column>"`` entry per display
        column of a temporary ``TaskRegistrySlice``.
        """
        cached_values = ['status', 'id', 'name']
        c = {}
        for cv in cached_values:
            if hasattr(obj, cv):
                c[cv] = getattr(obj, cv)
        this_slice = TaskRegistrySlice("tmp")
        for dpv in this_slice._display_columns:
            c["display:" + dpv] = this_slice._get_display_value(obj, dpv)
        return c

    def _thread_main(self):
        """ This is an internal function; the main loop of the background thread """
        from Ganga.Core.GangaRepository import getRegistry

        # Wait until the "jobs" registry has started, unless asked to stop.
        while getRegistry("jobs").hasStarted() is not True:
            time.sleep(0.1)
            if self._main_thread is None or self._main_thread.should_stop():
                return

        # Wait until monitoring is enabled or task monitoring is forced.
        while True:
            from Ganga.Core import monitoring_component
            if (monitoring_component is not None and monitoring_component.enabled) or config['ForceTaskMonitoring']:
                break
            time.sleep(0.1)
            if self._main_thread is None or self._main_thread.should_stop():
                return

        # setup the tasks - THIS IS INCOMPATIBLE WITH CONCURRENCY
        # and must go away soon
        for tid in self.ids():
            try:
                self[tid].startup()
            except Exception as err:
                logger.error("Unknown/Unexpected Error in starting up tasks main loop")
                logger.error("Exiting: err=%s" % str(err))
                return

        logger.debug("Entering main loop")

        # Main loop
        while self._main_thread is not None and not self._main_thread.should_stop():

            # If monitoring is enabled (or forced for Tasks) loop over each one and update
            if (config['ForceTaskMonitoring'] or monitoring_component.enabled) and not config['disableTaskMon']:
                for tid in self.ids():

                    logger.debug("Running over tid: %s" % str(tid))

                    # Reset per task: if the lookup itself fails, the handler
                    # must not touch the task left over from the previous
                    # iteration (or an unbound name on the first one).
                    p = None
                    try:
                        p = self[tid]
                        p.update()

                    except Exception as x:
                        logger.error(
                            "Exception occurred in task monitoring loop: %s %s\nThe offending task was paused." % (x.__class__, x))
                        type_, value_, traceback_ = sys.exc_info()
                        logger.error("Full traceback:\n %s" % ' '.join(
                            traceback.format_exception(type_, value_, traceback_)))
                        if p is not None:
                            p.pause()

                    if self._main_thread.should_stop():
                        break

                if self._main_thread.should_stop():
                    break

            logger.debug("TaskRegistry Sleeping for: %s seconds" % str(config['TaskLoopFrequency']))

            # Sleep for TaskLoopFrequency seconds in 10 ms slices so that a
            # stop request interrupts the wait promptly.
            for i in range(0, int(config['TaskLoopFrequency'] * 100)):
                if self._main_thread.should_stop():
                    break
                time.sleep(0.01)

    def startup(self):
        """ Start a background thread that periodically run()s"""
        super(TaskRegistry, self).startup()
        from Ganga.Core.GangaThread import GangaThread
        self._main_thread = GangaThread(name="GangaTasks", target=self._thread_main)
        self._main_thread.start()

        # create a registry flusher
        self.flush_thread = RegistryFlusher(self, 'TaskRegistryFlusher')
        self.flush_thread.start()

    def shutdown(self):
        """Join the flusher thread, then shut the base registry down."""
        self.flush_thread.join()
        super(TaskRegistry, self).shutdown()

    def stop(self):
        """Stop the background thread, if any, and wait for it to finish."""
        if self._main_thread is not None:
            self._main_thread.stop()
            self._main_thread.join()
def get_complete_files_replicas(self, nthread=10, diskOnly=True):
    '''Gets a comprehensive dataset information about the contents and the
    location of COMPLETE replicas.

    The result is computed once by a pool of worker threads and cached in
    self.complete_files_replicas; later calls return the cached value as
    long as it is non-empty.

    @param nthread: number of worker threads to start
    @param diskOnly: when true, sites whose name ends in "TAPE" are left out
    @return: dictionary mapping each dataset name to a two-element list
             [contents[0], sites], where contents is the value returned by
             DQ2.listFilesInDataset and sites are the complete-replica
             locations; datasets without any complete replica are omitted
    '''

    if self.complete_files_replicas:
        logger.debug('using cached complete_files_replicas')
        return self.complete_files_replicas

    re_tapeSite = re.compile('.*TAPE$')

    ds_info = {}
    self.__expand_datasets()

    wq = Queue(len(self.dataset))
    for ds in self.dataset:
        wq.put(ds)

    mylock = Lock()

    def worker(worker_id):

        dq2 = DQ2()
        while not wq.empty():
            try:
                ds = wq.get(block=True, timeout=1)
                logger.debug('worker id: %d on dataset: %s' % (worker_id, ds))

                # get contents (guids) of the complete dataset
                contents = dq2.listFilesInDataset(ds)

                # get locations of the complete dataset replicas
                locations = dq2.listDatasetReplicas(ds, complete=1)

                # first key of the reply, or None when the reply is empty
                vuid = next(iter(locations), None)

                if not vuid:
                    logger.warning('dataset not available: %s' % ds)
                    continue

                if diskOnly:
                    ds_sites = [site for site in locations[vuid][1]
                                if not re_tapeSite.match(site)]
                else:
                    ds_sites = locations[vuid][1]

                # Everything that can raise is evaluated before the lock is
                # taken, and 'with' releases it unconditionally, so a bad
                # reply can no longer leave the lock held and block the
                # other workers.
                entry = [contents[0], ds_sites]
                with mylock:
                    ds_info[ds] = entry

            except DQException as err:
                logger.warning(str(err))

            except Empty:
                pass

    # prepare and run the query threads
    profiler = ElapsedTimeProfiler(logger=logger)
    profiler.start()

    threads = [GangaThread(name='stager_ds_w_%d' % i, target=worker, kwargs={'worker_id': i})
               for i in range(nthread)]

    for t in threads:
        t.start()

    for t in threads:
        t.join()

    self.complete_files_replicas = ds_info

    profiler.check('information collected: %d datasets' % (len(self.complete_files_replicas.keys())))

    return self.complete_files_replicas
def __init__(self, name):
    """Create the thread; ``name`` is handed straight to ``GangaThread``."""
    GangaThread.__init__(self, name=name)
for job in jobs: for subjob in job.subjobs: try: process_subjob(job,subjob,thread_dirac_server) except: logger.warning('Exception in process_subjob:') logger.warning(sys.exc_info()[0]) logger.warning(sys.exc_info()[1]) if test_paused() or ct.should_stop(): break test_sleep(10) logger.info('HC Monitor Thread: Disconnected from DB') ct = GangaThread(name="HCMonitorThread", target=hc_monitor_thread) logger.info('Connected to DB') if len(jobs): # Wait one minute, to let the minute counter update, and avoid # problems with the at command submitting on second = 0 test_sleep(60) ct.start() test_sleep(60) while (test_active() and not test_paused()): test = Test.objects.get(pk=testid) # logger.info('HC Copy Thread: TOP OF MAIN LOOP') for job in jobs: if not test_active() or test_paused() or ct.should_stop():
def get_complete_files_replicas(self, nthread=10, diskOnly=True):
    '''Gets a comprehensive dataset information about the contents and the
    location of COMPLETE replicas.

    The result is computed once by a pool of worker threads and cached in
    self.complete_files_replicas; later calls return the cached value as
    long as it is non-empty.

    @param nthread: number of worker threads to start
    @param diskOnly: when true, sites whose name ends in "TAPE" are left out
    @return: dictionary mapping each dataset name to a two-element list
             [contents[0], sites], where contents is the value returned by
             DQ2.listFilesInDataset and sites are the complete-replica
             locations; datasets without any complete replica are omitted
    '''

    if self.complete_files_replicas:
        logger.debug('using cached complete_files_replicas')
        return self.complete_files_replicas

    re_tapeSite = re.compile('.*TAPE$')

    ds_info = {}
    self.__expand_datasets()

    wq = Queue(len(self.dataset))
    for ds in self.dataset:
        wq.put(ds)

    mylock = Lock()

    def worker(worker_id):

        dq2 = DQ2()
        while not wq.empty():
            try:
                ds = wq.get(block=True, timeout=1)
                logger.debug('worker id: %d on dataset: %s' % (worker_id, ds))

                # get contents (guids) of the complete dataset
                contents = dq2.listFilesInDataset(ds)

                # get locations of the complete dataset replicas
                locations = dq2.listDatasetReplicas(ds, complete=1)

                # first key of the reply, or None when the reply is empty
                vuid = next(iter(locations), None)

                if not vuid:
                    logger.warning('dataset not available: %s' % ds)
                    continue

                if diskOnly:
                    ds_sites = [site for site in locations[vuid][1]
                                if not re_tapeSite.match(site)]
                else:
                    ds_sites = locations[vuid][1]

                # Everything that can raise is evaluated before the lock is
                # taken, and 'with' releases it unconditionally, so a bad
                # reply can no longer leave the lock held and block the
                # other workers.
                entry = [contents[0], ds_sites]
                with mylock:
                    ds_info[ds] = entry

            except DQException as err:
                logger.warning(str(err))

            except Empty:
                pass

    # prepare and run the query threads
    profiler = ElapsedTimeProfiler(logger=logger)
    profiler.start()

    threads = [GangaThread(name='stager_ds_w_%d' % i, target=worker, kwargs={'worker_id': i})
               for i in range(nthread)]

    for t in threads:
        t.start()

    for t in threads:
        t.join()

    self.complete_files_replicas = ds_info

    profiler.check('information collected: %d datasets' % (len(self.complete_files_replicas.keys())))

    return self.complete_files_replicas
def resolve_file_locations(dataset, sites=None, cloud=None, token='ATLASDATADISK', debug=False):
    '''
    Summarize the locations of files (in terms of sitename) of a dataset.

    If the sites argument is given, the cloud and token arguments are
    ignored; otherwise cloud and token are used to retrieve the sites via
    dm_util.get_srmv2_sites.

    Up to 10 worker threads query the sites in parallel.

    @param dataset: the dataset whose file replicas are looked up
    @param sites: optional list of site names to query
    @param cloud: cloud passed to dm_util.get_srmv2_sites when sites is empty
    @param token: token passed to dm_util.get_srmv2_sites when sites is empty
    @param debug: debug flag passed to dm_util.get_srmv2_sites
    @return: dictionary mapping each file guid to the list of sites that
             reported a replica of it
    '''

    if not sites:
        logger.debug('resolving sites with token: %s' % token)
        sites = dm_util.get_srmv2_sites(cloud, token=token, debug=debug)

    logger.debug('checking replicas at sites: %s' % str(sites))

    replicas = {}

    # preparing the queue for querying lfn
    wq = Queue(len(sites))
    for site in sites:
        wq.put(site)

    mylock = Lock()

    def worker(worker_id):
        dq2 = DQ2()
        while not wq.empty():
            try:
                site = wq.get(block=True, timeout=1)
            except Empty:
                # another worker took the last item; re-check the queue
                continue

            try:
                replicaInfo = dq2.listFileReplicas(site, dataset)
            except DQException as err:
                logger.warning(str(err))
                logger.warning('site %s excluded' % site)
                continue

            # Check for an empty reply *before* indexing into it, so that
            # a site without replica information cannot kill the worker.
            if not replicaInfo:
                logger.debug('no replica information returned by site %s' % site)
                continue

            guids = replicaInfo[0]['content']
            logger.debug('resolving dataset files at %s, no files: %d' % (site, len(guids)))

            # 'with' guarantees the lock is released even on error
            with mylock:
                for guid in guids:
                    replicas.setdefault(guid, []).append(site)

    # never run more than 10 query threads
    nthread = min(len(sites), 10)

    threads = [GangaThread(name='stager_ds_w_%d' % i, target=worker, kwargs={'worker_id': i})
               for i in range(nthread)]

    for t in threads:
        t.start()

    for t in threads:
        t.join()

    return replicas
logger.info('HC Copy Thread: Connected to DB') while (test_active(cursor2) and not test_paused(cursor2) and not ct.should_stop()): logger.info('HC Copy Thread: TOP OF MAIN LOOP') for job in jobs: if test_paused(cursor2) or ct.should_stop(): break copyJob(cursor2,job) conn2.commit() conn2.commit() test_sleep(20) cursor2.close () conn2.commit() conn2.close() logger.info('HC Copy Thread: Disconnected from DB') ct = GangaThread(name="HCCopyThread", target=hc_copy_thread) conn = hcutil.connect() cursor = conn.cursor () logger.info('Connected to DB') if len(jobs): ct.start() while (test_active(cursor) and not test_paused(cursor)): try: print_summary(cursor) except: logger.warning('Bug during print_summary') logger.warning(sys.exc_info()[0]) logger.warning(sys.exc_info()[1]) for job in jobs:
class TaskRegistry(Registry):
    """Registry of tasks that is monitored by a background thread.

    ``startup`` launches the "GangaTasks" thread, which runs
    ``_thread_main`` until ``stop`` asks it to finish.
    """

    def __init__(self, name, doc, dirty_flush_counter=10, update_index_time=30):
        """Initialise the registry.

        ``dirty_flush_counter`` and ``update_index_time`` are forwarded to
        the base class, so values other than the defaults take effect.
        """
        super(TaskRegistry, self).__init__(
            name, doc,
            dirty_flush_counter=dirty_flush_counter,
            update_index_time=update_index_time)

        self._main_thread = None

    def getProxy(self):
        """Return a new slice proxy whose slice covers this registry."""
        this_slice = TaskRegistrySlice(self.name)
        this_slice.objects = self
        return TaskRegistrySliceProxy(this_slice)

    def getIndexCache(self, obj):
        """Build the index-cache dictionary for ``obj``.

        The result holds the 'status', 'id' and 'name' values present in
        the object's node data, plus one ``"display:<column>"`` entry per
        display column of a temporary ``TaskRegistrySlice``.

        Raises ``Exception`` when ``obj.getNodeData()`` is ``None``.
        """
        if obj.getNodeData() is None:
            raise Exception("Currently don't support Index Caching")
        cached_values = ['status', 'id', 'name']
        c = {}
        for cv in cached_values:
            if cv in obj.getNodeData():
                c[cv] = obj.getNodeAttribute(cv)
        this_slice = TaskRegistrySlice("tmp")
        for dpv in this_slice._display_columns:
            c["display:" + dpv] = this_slice._get_display_value(obj, dpv)
        return c

    def _thread_main(self):
        """ This is an internal function; the main loop of the background thread """
        # Add runtime handlers for all the taskified applications, since now
        # all the backends are loaded
        from Ganga.GPIDev.Adapters.ApplicationRuntimeHandlers import allHandlers
        from .TaskApplication import handler_map
        for basename, name in handler_map:
            for backend in allHandlers.getAllBackends(basename):
                allHandlers.add(name, backend, allHandlers.get(basename, backend))

        from Ganga.Core.GangaRepository import getRegistry

        # Wait until the "jobs" registry has started, unless asked to stop.
        while getRegistry("jobs").hasStarted() is not True:
            time.sleep(0.1)
            if self._main_thread is None or self._main_thread.should_stop():
                return

        # Wait until monitoring is enabled or task monitoring is forced.
        while True:
            from Ganga.Core import monitoring_component
            if (monitoring_component is not None and monitoring_component.enabled) or config['ForceTaskMonitoring']:
                break
            time.sleep(0.1)
            if self._main_thread is None or self._main_thread.should_stop():
                return

        # setup the tasks - THIS IS INCOMPATIBLE WITH CONCURRENCY
        # and must go away soon
        for tid in self.ids():
            try:
                self[tid]._getWriteAccess()
                self[tid].startup()
            except RegistryError:
                continue
            except Exception as err:
                logger.error("Unknown/Unexpected Error in starting up tasks main loop")
                logger.error("Exiting: err=%s" % str(err))
                return

        logger.debug("Entering main loop")

        # Main loop
        while self._main_thread is not None and not self._main_thread.should_stop():

            # For each task try to run it
            if config['ForceTaskMonitoring'] or monitoring_component.enabled:
                for tid in self.ids():

                    logger.debug("Running over tid: %s" % str(tid))

                    try:
                        from Ganga.GPIDev.Lib.Tasks import ITask
                        if isType(self[tid], ITask):
                            # for new ITasks, always need write access
                            self[tid]._getWriteAccess()
                            p = self[tid]
                        else:
                            if self[tid].status in ["running", "running/pause"]:
                                self[tid]._getWriteAccess()
                                p = self[tid]
                            # Compare by value: an identity test against a
                            # string literal is not reliable.
                            elif self[tid].status == 'completed' and (
                                    self[tid].n_status('ready') or self[tid].n_status('running')):
                                self[tid].updateStatus()
                                continue
                            else:
                                continue
                    except RegistryError:
                        # could not acquire lock
                        continue

                    if self._main_thread.should_stop():
                        break

                    try:
                        from Ganga.GPIDev.Lib.Tasks import ITask
                        if isType(self[tid], ITask):
                            # for new ITasks, always call update()
                            p.update()
                        else:
                            # TODO: Make this user-configurable and add better
                            # error message
                            if (p.n_status("failed") * 100.0 / (20 + p.n_status("completed")) > 20):
                                p.pause()
                                logger.error(
                                    "Task %s paused - %i jobs have failed while only %i jobs have completed successfully." % (
                                        p.name, p.n_status("failed"), p.n_status("completed")))
                                logger.error(
                                    "Please investigate the cause of the failing jobs and then remove the previously failed jobs using job.remove()")
                                logger.error(
                                    "You can then continue to run this task with tasks(%i).run()" % p.id)
                                continue
                            numjobs = p.submitJobs()
                            if numjobs > 0:
                                self._flush([p])
                            # finalise any required transforms
                            p.finaliseTransforms()
                            p.updateStatus()

                    except Exception as x:
                        logger.error(
                            "Exception occurred in task monitoring loop: %s %s\nThe offending task was paused." % (x.__class__, x))
                        type_, value_, traceback_ = sys.exc_info()
                        logger.error("Full traceback:\n %s" % ' '.join(
                            traceback.format_exception(type_, value_, traceback_)))
                        p.pause()

                    if self._main_thread.should_stop():
                        break

                if self._main_thread.should_stop():
                    break

            logger.debug("TaskRegistry Sleeping for: %s seconds" % str(config['TaskLoopFrequency']))

            # Sleep for TaskLoopFrequency seconds in 10 ms slices so that a
            # stop request interrupts the wait promptly.
            for i in range(0, int(config['TaskLoopFrequency'] * 100)):
                if self._main_thread.should_stop():
                    break
                time.sleep(0.01)

    def startup(self):
        """ Start a background thread that periodically run()s"""
        super(TaskRegistry, self).startup()
        from Ganga.Core.GangaThread import GangaThread
        self._main_thread = GangaThread(name="GangaTasks", target=self._thread_main)
        self._main_thread.start()

    def shutdown(self):
        """Shut the base registry down."""
        super(TaskRegistry, self).shutdown()

    def stop(self):
        """Ask the background thread, if any, to stop (does not wait for it)."""
        if self._main_thread is not None:
            self._main_thread.stop()
def __init__(self, name):
    """Create the thread; ``name`` is handed straight to ``GangaThread``."""
    GangaThread.__init__(self, name)
def start(self):
    """Start the LGI pilot thread unless ``[LGI]Enable`` is exactly False.

    :return: ``False`` when the thread is not started; otherwise the
        result of ``GangaThread.start``.
    """
    enabled = Config.getConfig('LGI')['Enable']
    if enabled is not False:
        return GangaThread.start(self)

    self.log.debug('Not starting LGI pilot thread because [LGI]Enable is False')
    return False
class TaskRegistry(Registry):
    """Registry of tasks that is monitored by a background thread.

    ``startup`` launches the "GangaTasks" thread, which runs
    ``_thread_main`` until ``stop`` asks it to finish.
    """

    def __init__(self, name, doc, dirty_flush_counter=10, update_index_time=30):
        """Initialise the registry.

        ``dirty_flush_counter`` and ``update_index_time`` are forwarded to
        the base class, so values other than the defaults take effect.
        """
        super(TaskRegistry, self).__init__(
            name, doc,
            dirty_flush_counter=dirty_flush_counter,
            update_index_time=update_index_time
        )

        self._main_thread = None

    def getProxy(self):
        """Return a new slice proxy whose slice covers this registry."""
        this_slice = TaskRegistrySlice(self.name)
        this_slice.objects = self
        return TaskRegistrySliceProxy(this_slice)

    def getIndexCache(self, obj):
        """Build the index-cache dictionary for ``obj``.

        The result holds the 'status', 'id' and 'name' values present in
        the object's node data, plus one ``"display:<column>"`` entry per
        display column of a temporary ``TaskRegistrySlice``.

        Raises ``Exception`` when ``obj.getNodeData()`` is ``None``.
        """
        if obj.getNodeData() is None:
            raise Exception("Currently don't support Index Caching")
        cached_values = ['status', 'id', 'name']
        c = {}
        for cv in cached_values:
            if cv in obj.getNodeData():
                c[cv] = obj.getNodeAttribute(cv)
        this_slice = TaskRegistrySlice("tmp")
        for dpv in this_slice._display_columns:
            c["display:" + dpv] = this_slice._get_display_value(obj, dpv)
        return c

    def _thread_main(self):
        """ This is an internal function; the main loop of the background thread """
        # Add runtime handlers for all the taskified applications, since now
        # all the backends are loaded
        from Ganga.GPIDev.Adapters.ApplicationRuntimeHandlers import allHandlers
        from .TaskApplication import handler_map
        for basename, name in handler_map:
            for backend in allHandlers.getAllBackends(basename):
                allHandlers.add(
                    name, backend, allHandlers.get(basename, backend))

        from Ganga.Core.GangaRepository import getRegistry

        # Wait until the "jobs" registry has started, unless asked to stop.
        while getRegistry("jobs").hasStarted() is not True:
            time.sleep(0.1)
            if self._main_thread is None or self._main_thread.should_stop():
                return

        # Wait until monitoring is enabled or task monitoring is forced.
        while True:
            from Ganga.Core import monitoring_component
            if (monitoring_component is not None and monitoring_component.enabled) or config['ForceTaskMonitoring']:
                break
            time.sleep(0.1)
            if self._main_thread is None or self._main_thread.should_stop():
                return

        # setup the tasks - THIS IS INCOMPATIBLE WITH CONCURRENCY
        # and must go away soon
        for tid in self.ids():
            try:
                self[tid]._getWriteAccess()
                self[tid].startup()
            except RegistryError:
                continue
            except Exception as err:
                logger.error("Unknown/Unexpected Error in starting up tasks main loop")
                logger.error("Exiting: err=%s" % str(err))
                return

        logger.debug("Entering main loop")

        # Main loop
        while self._main_thread is not None and not self._main_thread.should_stop():

            # For each task try to run it
            if config['ForceTaskMonitoring'] or monitoring_component.enabled:
                for tid in self.ids():

                    logger.debug("Running over tid: %s" % str(tid))

                    try:
                        from Ganga.GPIDev.Lib.Tasks import ITask
                        if isType(self[tid], ITask):
                            # for new ITasks, always need write access
                            self[tid]._getWriteAccess()
                            p = self[tid]
                        else:
                            if self[tid].status in ["running", "running/pause"]:
                                self[tid]._getWriteAccess()
                                p = self[tid]
                            # Compare by value: an identity test against a
                            # string literal is not reliable.
                            elif self[tid].status == 'completed' and (
                                    self[tid].n_status('ready') or self[tid].n_status('running')):
                                self[tid].updateStatus()
                                continue
                            else:
                                continue
                    except RegistryError:
                        # could not acquire lock
                        continue

                    if self._main_thread.should_stop():
                        break

                    try:
                        from Ganga.GPIDev.Lib.Tasks import ITask
                        if isType(self[tid], ITask):
                            # for new ITasks, always call update()
                            p.update()
                        else:
                            # TODO: Make this user-configurable and add better
                            # error message
                            if (p.n_status("failed") * 100.0 / (20 + p.n_status("completed")) > 20):
                                p.pause()
                                logger.error(
                                    "Task %s paused - %i jobs have failed while only %i jobs have completed successfully." % (
                                        p.name, p.n_status("failed"), p.n_status("completed")))
                                logger.error(
                                    "Please investigate the cause of the failing jobs and then remove the previously failed jobs using job.remove()")
                                logger.error(
                                    "You can then continue to run this task with tasks(%i).run()" % p.id)
                                continue
                            numjobs = p.submitJobs()
                            if numjobs > 0:
                                self._flush([p])
                            # finalise any required transforms
                            p.finaliseTransforms()
                            p.updateStatus()

                    except Exception as x:
                        logger.error(
                            "Exception occurred in task monitoring loop: %s %s\nThe offending task was paused." % (x.__class__, x))
                        type_, value_, traceback_ = sys.exc_info()
                        logger.error("Full traceback:\n %s" % ' '.join(
                            traceback.format_exception(type_, value_, traceback_)))
                        p.pause()

                    if self._main_thread.should_stop():
                        break

                if self._main_thread.should_stop():
                    break

            logger.debug("TaskRegistry Sleeping for: %s seconds" % str(config['TaskLoopFrequency']))

            # Sleep for TaskLoopFrequency seconds in 10 ms slices so that a
            # stop request interrupts the wait promptly.
            for i in range(0, int(config['TaskLoopFrequency'] * 100)):
                if self._main_thread.should_stop():
                    break
                time.sleep(0.01)

    def startup(self):
        """ Start a background thread that periodically run()s"""
        super(TaskRegistry, self).startup()
        from Ganga.Core.GangaThread import GangaThread
        self._main_thread = GangaThread(name="GangaTasks", target=self._thread_main)
        self._main_thread.start()

    def shutdown(self):
        """Shut the base registry down."""
        super(TaskRegistry, self).shutdown()

    def stop(self):
        """Ask the background thread, if any, to stop (does not wait for it)."""
        if self._main_thread is not None:
            self._main_thread.stop()
def __init__(self, name):
    """Create the thread and reset its command-tracking state.

    :param name: thread name, also kept in ``self._thread_name``.
    """
    GangaThread.__init__(self, name)

    self._thread_name = name
    # Nothing is being executed yet.
    self._running_cmd = None
    self._currently_running_command = False
def __init__(self):
    """Create the "LGI_Stats" thread and attach its logger as ``self.log``."""
    GangaThread.__init__(self, "LGI_Stats")

    stats_logger = getLogger("LGI.Stats.Thread")
    self.log = stats_logger
class TaskRegistry(Registry):
    """Registry of tasks that is monitored by a background thread.

    ``startup`` launches the "GangaTasks" thread (running ``_thread_main``)
    and a ``RegistryFlusher``; ``stop`` stops and joins the former and
    ``shutdown`` joins the latter.
    """

    def __init__(self, name, doc, dirty_flush_counter=10, update_index_time=30):
        """Initialise the registry and build its cached slice and proxy.

        ``dirty_flush_counter`` and ``update_index_time`` are forwarded to
        the base class unchanged.
        """
        super(TaskRegistry, self).__init__(
            name, doc,
            dirty_flush_counter=dirty_flush_counter,
            update_index_time=update_index_time
        )

        self._main_thread = None

        self.stored_slice = TaskRegistrySlice(self.name)
        self.stored_slice.objects = self
        self.stored_proxy = TaskRegistrySliceProxy(self.stored_slice)

    def getSlice(self):
        """Return the slice created in the constructor."""
        return self.stored_slice

    def getProxy(self):
        """Return the slice proxy created in the constructor."""
        return self.stored_proxy

    def getIndexCache(self, obj):
        """Build the index-cache dictionary for ``obj``.

        The result holds the 'status', 'id' and 'name' attributes whose
        names appear in ``obj._data``, plus one ``"display:<column>"`` entry
        per display column of a temporary ``TaskRegistrySlice``.

        Raises ``Exception`` when ``obj._data`` is ``None``.
        """
        if obj._data is None:
            raise Exception("Currently don't support Index Caching")
        cached_values = ['status', 'id', 'name']
        c = {}
        for cv in cached_values:
            if cv in obj._data:
                c[cv] = getattr(obj, cv)
        this_slice = TaskRegistrySlice("tmp")
        for dpv in this_slice._display_columns:
            c["display:" + dpv] = this_slice._get_display_value(obj, dpv)
        return c

    def _thread_main(self):
        """ This is an internal function; the main loop of the background thread """
        from Ganga.Core.GangaRepository import getRegistry

        # Wait until the "jobs" registry has started, unless asked to stop.
        while getRegistry("jobs").hasStarted() is not True:
            time.sleep(0.1)
            if self._main_thread is None or self._main_thread.should_stop():
                return

        # Wait until monitoring is enabled or task monitoring is forced.
        while True:
            from Ganga.Core import monitoring_component
            if (monitoring_component is not None and monitoring_component.enabled) or config['ForceTaskMonitoring']:
                break
            time.sleep(0.1)
            if self._main_thread is None or self._main_thread.should_stop():
                return

        # setup the tasks - THIS IS INCOMPATIBLE WITH CONCURRENCY
        # and must go away soon
        for tid in self.ids():
            try:
                self[tid].startup()
            except Exception as err:
                logger.error("Unknown/Unexpected Error in starting up tasks main loop")
                logger.error("Exiting: err=%s" % str(err))
                return

        logger.debug("Entering main loop")

        # Main loop
        while self._main_thread is not None and not self._main_thread.should_stop():

            # If monitoring is enabled (or forced for Tasks) loop over each one and update
            if (config['ForceTaskMonitoring'] or monitoring_component.enabled) and not config['disableTaskMon']:
                for tid in self.ids():

                    logger.debug("Running over tid: %s" % str(tid))

                    # Reset per task: if the lookup itself fails, the handler
                    # must not touch the task left over from the previous
                    # iteration (or an unbound name on the first one).
                    p = None
                    try:
                        p = self[tid]
                        p.update()

                    except Exception as x:
                        logger.error(
                            "Exception occurred in task monitoring loop: %s %s\nThe offending task was paused." % (x.__class__, x))
                        type_, value_, traceback_ = sys.exc_info()
                        logger.error("Full traceback:\n %s" % ' '.join(
                            traceback.format_exception(type_, value_, traceback_)))
                        if p is not None:
                            p.pause()

                    if self._main_thread.should_stop():
                        break

                if self._main_thread.should_stop():
                    break

            logger.debug("TaskRegistry Sleeping for: %s seconds" % str(config['TaskLoopFrequency']))

            # Sleep for TaskLoopFrequency seconds in 10 ms slices so that a
            # stop request interrupts the wait promptly.
            for i in range(0, int(config['TaskLoopFrequency'] * 100)):
                if self._main_thread.should_stop():
                    break
                time.sleep(0.01)

    def startup(self):
        """ Start a background thread that periodically run()s"""
        super(TaskRegistry, self).startup()
        from Ganga.Core.GangaThread import GangaThread
        self._main_thread = GangaThread(name="GangaTasks", target=self._thread_main)
        self._main_thread.start()

        # create a registry flusher
        self.flush_thread = RegistryFlusher(self)
        self.flush_thread.start()

    def shutdown(self):
        """Join the flusher thread, then shut the base registry down."""
        self.flush_thread.join()
        super(TaskRegistry, self).shutdown()

    def stop(self):
        """Stop the background thread, if any, and wait for it to finish."""
        if self._main_thread is not None:
            self._main_thread.stop()
            self._main_thread.join()
logger.info('HC Plot Summarize Thread: Disconnected.') def hc_copy_thread(): test_sleep(60) logger.info('HC Copy Thread: Connected to DB') while (test_active() and not test_paused() and not ct.should_stop()): logger.debug('HC Copy Thread: TOP OF MAIN LOOP') for job in jobs: if test_paused() or ct.should_stop(): break copyJob(job) test_sleep(30) logger.info('HC Copy Thread: Disconnected from DB') ct = GangaThread(name="HCCopyThread", target=hc_copy_thread) pt = GangaThread(name="HCPlotSummary", target=hc_plot_summarize) logger.info('Connected to DB') if len(jobs): ct.start() pt.start() while (test_active() and not test_paused()): #We need to refresh the test object test = Test.objects.get(pk=testid) try: print_summary()