def run(self):

    try:
      # print '%s start' % self.uid
        while not self.term.is_set():
          # print '%s run' % self.uid
            time.sleep(SLEEP)
            if self.num == 4 and self.pnum == 1:
                print "4"
                ru.raise_on(self.uid, RAISE_ON)
      # print '%s stop' % self.uid

    except Exception as e:
        print '%s error %s [%s]' % (self.uid, e, type(e))

    except SystemExit:
        print '%s exit' % (self.uid)

    except KeyboardInterrupt:
        print '%s intr' % (self.uid)

    finally:
        print '%s final' % (self.uid)

def run(self):

    self.uid = self.uid + '.thread'
    self.log = ru.get_logger('radical.' + self.uid, level=self.verbose)

    # a simple worker routine which sleeps repeatedly for a random number of
    # seconds, until a term signal is set.  The given 'worker' can be a thread
    # or process, or in fact anything which has a self.uid and self.term.
    try:
        self.log.info('%-10s : work start' % self.uid)

        while not self.term.is_set():

            item = WORK_MIN + (random.random() * (WORK_MAX - WORK_MIN))
            self.log.info('%-10s : %ds sleep start' % (self.uid, item))
            time.sleep(item)
            self.log.info('%-10s : %ds sleep stop' % (self.uid, item))
            ru.raise_on('work')

        self.log.info('%-10s : work term requested' % self.uid)

    except Exception as e:
        self.log.info('%-10s : work fail [%s]' % (self.uid, e))
        raise

    self.log.info('%-10s : thread exit requested' % self.uid)

def work(self):

    item = WORK_MIN + (random.random() * (WORK_MAX - WORK_MIN))
    self.log.info('%-10s : %ds sleep start' % (self.uid, item))
    time.sleep(item)
    self.log.info('%-10s : %ds sleep stop' % (self.uid, item))
    ru.raise_on('work')

def run(self):

    try:
        self.log.info('%-10s : start' % self.uid)

        while not self.term.is_set():

            time.sleep(0.1)  # start things
            ru.raise_on('watch')

            for t in self.things:
                if not t.is_alive():
                    self.log.info('%-10s : %s died' % (self.uid, t.uid))
                    # a child died.  We kill the other children and terminate.
                  # self.stop()
                    return
                self.log.info('%-10s : %s ok' % (self.uid, t.uid))

    except ru.ThreadExit:
        raise RuntimeError('%-10s : watcher exit requested [%s]'
                          % (self.uid, self.ident))

    except Exception as e:
        raise RuntimeError('%-10s : watcher error' % self.uid)

    finally:
        self.log.info('%-10s : stop' % self.uid)

def run(self):

    try:
        self.log.info('%-10s : start' % self.uid)

        while not self.term.is_set():

            time.sleep(0.1)  # start things
            ru.raise_on('watch')

            for t in self.things:
                if not t.is_alive():
                    self.log.info('%-10s : %s died' % (self.uid, t.uid))
                    # a child died.  We kill the other children and terminate.
                  # self.stop()
                    return
                self.log.info('%-10s : %s ok' % (self.uid, t.uid))

    except ThreadExit:
        raise RuntimeError('%-10s : watcher exit requested [%s]'
                          % (self.uid, self.ident))

    except Exception as e:
        raise RuntimeError('%-10s : watcher error' % self.uid)

    finally:
        self.log.info('%-10s : stop' % self.uid)

def __init__(self, name, cfg, verbose):

    mt.Thread.__init__(self)

    self.uid     = name
    self.verbose = verbose
    self.log     = ru.get_logger('radical.' + self.uid, level=self.verbose)
    self.cfg     = cfg
    self.term    = mt.Event()

    ru.raise_on('init')

def work(self, units):

    if not isinstance(units, list):
        units = [units]

    self.advance(units, rps.AGENT_EXECUTING, publish=True, push=False)
    ru.raise_on('work bulk')

    for unit in units:
        self._handle_unit(unit)

def run(self):

    # We can't catch SIGINT, for the reasons discussed in the introduction.
    # With the default SIGINT handler, SIGINT can hit in unexpected places,
    # mostly when thread termination and process termination race.  Thus we
    # can't use SIGINT at all.
    #
    # We can, however, use a different signal to communicate termination
    # requests from sub-threads to the main thread.  Here we use `SIGUSR2`
    # (`SIGUSR1` is reserved for debugging purposes in the radical stack).
    #
    # We also install a `SIGTERM` handler, to initiate orderly shutdown on
    # system termination signals.
    #
    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGUSR2, sigusr2_handler)

    self.ospid = os.getpid()
    self.tid   = mt.currentThread().ident
    self.uid   = "p.%d.0 %8d.%s" % (self.pnum, self.ospid, self.tid)

    try:
        print '%s start' % self.uid

        # create worker thread
        self.worker = WorkerThread(self.num, self.pnum, 0)
        self.worker.start()

        self.watcher = WatcherThread([self.worker], self.num, self.pnum, 1)
        self.watcher.start()

        while True:
            print '%s run' % self.uid
            time.sleep(SLEEP)
            if self.num == 3 and self.pnum == 1:
                print "3"
                ru.raise_on(self.uid, RAISE_ON)

        print '%s stop' % self.uid

    except Exception as e:
        print '%s error %s [%s]' % (self.uid, e, type(e))

    except SystemExit:
        print '%s exit' % (self.uid)

    except KeyboardInterrupt:
        print '%s intr' % (self.uid)

    finally:
        self.finalize()

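# NOTE: the handlers installed above (`sigterm_handler`, `sigusr2_handler`)
#       are defined elsewhere in this script.  The sketch below shows one
#       plausible shape (an assumption for illustration, not the original
#       definitions): both merely translate the signal into an exception, so
#       that the try/except blocks in run() and main() can unwind and
#       finalize cleanly.

def sigterm_handler(signum, frame):
    # assumed shape: turn SIGTERM into a catchable exception
    raise RuntimeError('sigterm [%s]' % signum)

def sigusr2_handler(signum, frame):
    # assumed shape: turn SIGUSR2 (internal termination request) into an exception
    raise RuntimeError('sigusr2 [%s]' % signum)
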
def stop(self):

    # NOTE: this can be called from the watcher subthread

    # make sure the watcher loop is gone
    ru.raise_on('stop')
    self.term.set()  # end watcher loop
    ru.raise_on('stop')

    # tell the children what's up
    self._proc_term.set()    # end child processes
    self._thread_term.set()  # end child threads
    ru.raise_on('stop')

    for t in self.things:

        self.log.info('%-10s : join %s' % (self.uid, t.uid))
        t.stop()
        t.join(timeout=JOIN_TIMEOUT)

        if t.is_alive():
            self.log.info('%-10s : kill %s' % (self.uid, t.uid))
          ## # FIXME: differentiate between procs and threads
          ## ru.raise_in_thread(tident=t.ident)
          ## t.join(timeout=JOIN_TIMEOUT)

        if t.is_alive():
            self.log.info('%-10s : zombied %s' % (self.uid, t.uid))
        else:
            self.log.info('%-10s : joined %s' % (self.uid, t.uid))

    ru.raise_on('stop')
    self.log.info('%-10s : stopped' % self.uid)

def __init__(self, name, cfg, verbose):

    ru.raise_on('init')

    self.uid     = name
    self.verbose = verbose
    self.log     = ru.get_logger('radical.' + self.uid, level=self.verbose)
    self.cfg     = cfg
    self.things  = list()
    self.term    = None  # child only

    ru.raise_on('init')

    ru.Process.__init__(self, name=self.uid, log=self.log)

def main(num):

    # *always* install SIGTERM and SIGUSR2 handlers, which will translate
    # those signals into exceptions we can catch.
    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGUSR2, sigusr2_handler)

    watcher = None
    p1      = None
    p2      = None

    try:
        pid = os.getpid()
        tid = mt.currentThread().ident
        uid = "m.0.0 %8d.%s" % (pid, tid)
        print '%s start' % uid

        p1 = ProcessWorker(num, 1)
        p2 = ProcessWorker(num, 2)

        p1.start()
        p2.start()

        watcher = WatcherThread([p1, p2], num, 0, 1)
        watcher.start()

        while True:
            print '%s run' % uid
            time.sleep(SLEEP)
            if num == 1:
                print "1"
                ru.raise_on(uid, RAISE_ON)

        print '%s stop' % uid

    except RuntimeError as e:
        print '%s error %s [%s]' % (uid, e, type(e))

    except SystemExit:
        print '%s exit' % (uid)

    except KeyboardInterrupt:
        print '%s intr' % (uid)

    finally:
        finalize(p1, p2, watcher)

def run(self):

    try:
        print '%s start' % self.uid

        while not self.term.is_set():
          # print '%s run' % self.uid
            time.sleep(SLEEP)

            if self.num == 2 and self.pnum == 0:
                print "2"
                ru.raise_on(self.uid, RAISE_ON)

            if self.num == 5 and self.pnum == 1:
                print "5"
                ru.raise_on(self.uid, RAISE_ON)

            # check watchables
            for thing in self.to_watch:
                if thing.is_alive():
                    print '%s event: thing %s is alive' % (self.uid, thing.uid)
                else:
                    print '%s event: thing %s has died' % (self.uid, thing.uid)
                    ru.cancel_main_thread()
                    assert(False)  # we should never get here

            # check MainThread
            if not self.main.is_alive():
                print '%s: main thread gone - terminate' % self.uid
                self.stop()

        print '%s stop' % self.uid

    except Exception as e:
        print '%s error %s [%s]' % (self.uid, e, type(e))
        ru.cancel_main_thread()

    except SystemExit:
        print '%s exit' % (self.uid)
        # do *not* cancel main thread here!  We get here after the cancel
        # signal has been sent in the main loop above.

    finally:
        print '%s final' % (self.uid)

def inner_5(arg_1, arg_2):                            # pylint: disable=W0613

    os.environ['RU_RAISE_ON_TEST'] = '3'
    for i in range(10):
        print(i)
        ru.raise_on('test')
    print()

    os.environ['RU_RAISE_ON_RAND'] = 'RANDOM_10'
    for i in range(100):
        try:
            ru.raise_on('rand')
        except Exception:
            print('raised on %d' % i)

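# The two loops above exercise ru.raise_on() in its two (assumed) modes of
# fault injection: with RU_RAISE_ON_TEST='3' the call raises once its per-tag
# counter reaches 3, and with RU_RAISE_ON_RAND='RANDOM_10' each call raises
# with roughly 10% probability.  This is how these tests trigger errors at
# deterministic or random points in the code paths under test.
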
def run(self):

    try:
        print '%s start' % self.uid

        while not self.term.is_set():
          # print '%s run' % self.uid
            time.sleep(SLEEP)

            if self.num == 2 and self.pnum == 0:
                print "2"
                ru.raise_on(self.uid, RAISE_ON)

            if self.num == 5 and self.pnum == 1:
                print "5"
                ru.raise_on(self.uid, RAISE_ON)

            for thing in self.to_watch:
                if thing.is_alive():
                    print '%s event: thing %s is alive' % (self.uid, thing.uid)
                else:
                    print '%s event: thing %s has died' % (self.uid, thing.uid)
                    ru.cancel_main_thread('usr2')
                    raise RuntimeError('thing %s has died - assert' % thing.uid)

        print '%s stop' % self.uid

    except Exception as e:
        print '%s error %s [%s]' % (self.uid, e, type(e))
        ru.cancel_main_thread('usr2')

    except SystemExit:
        print '%s exit' % (self.uid)
        # do *not* cancel the main thread here!  We get here after the cancel
        # signal has been sent in the main loop above.
      # ru.cancel_main_thread('usr2')

    except KeyboardInterrupt:
        print '%s intr' % (self.uid)
        ru.cancel_main_thread('usr2')

    finally:
        print '%s final' % (self.uid)

def __init__(self, name, cfg, term, verbose):

    mt.Thread.__init__(self)

    self.uid     = name
    self.verbose = verbose
    self.log     = ru.get_logger('radical.' + self.uid, level=verbose)
    self.cfg     = cfg
    self.term    = term

    ru.raise_on('init')

    # we don't allow sub-sub-threads
    # FIXME: this could be lifted, but we leave it in place and re-evaluate
    #        as needed.
    if not ru.is_main_thread():
        raise RuntimeError('threads must be spawned by MainThread [%s]'
                          % ru.get_thread_name())

def run(self):

    try:
        print '%s start' % self.uid

        while not self.term.is_set():
          # print '%s run' % self.uid
            time.sleep(SLEEP)

            if self.num == 2 and self.pnum == 0:
                print "2"
                ru.raise_on(self.uid, RAISE_ON)

            if self.num == 5 and self.pnum == 1:
                print "5"
                ru.raise_on(self.uid, RAISE_ON)

            # check watchables
            for thing in self.to_watch:
                if thing.is_alive():
                    print '%s event: thing %s is alive' % (self.uid, thing.uid)
                else:
                    print '%s event: thing %s has died' % (self.uid, thing.uid)
                    ru.cancel_main_thread()
                    assert(False)  # we should never get here

            # check MainThread
            if not self.main.is_alive():
                print '%s: main thread gone - terminate' % self.uid
                self.stop()

        print '%s stop' % self.uid

    except Exception as e:
        print '%s error %s [%s]' % (self.uid, e, type(e))
        ru.cancel_main_thread()

    except SystemExit:
        print '%s exit' % (self.uid)
        # do *not* cancel main thread here!  We get here after the cancel
        # signal has been sent in the main loop above.

    finally:
        print '%s final' % (self.uid)

def _handle_unit(self, cu):

    ru.raise_on('work unit')

  # import pprint
  # self._log.info('handle cu: %s', pprint.pformat(cu))

    try:
        # prep stdout/err so that we can append w/o checking for None
        cu['stdout'] = ''
        cu['stderr'] = ''

        cpt = cu['description']['cpu_process_type']
        gpt = cu['description']['gpu_process_type']  # FIXME: use

        # FIXME: this switch is insufficient for mixed units (MPI/OpenMP)
        if cpt == 'MPI':
            launcher = self._mpi_launcher
        else:
            launcher = self._task_launcher

        if not launcher:
            raise RuntimeError("no launcher (process type = %s)" % cpt)

        self._log.debug("Launching unit with %s (%s).",
                        launcher.name, launcher.launch_command)

        assert(cu['slots'])

        # Start a new subprocess to launch the unit
        self.spawn(launcher=launcher, cu=cu)

    except Exception as e:
        # append the startup error to the unit's stderr.  This is not
        # completely correct (as this text is not produced by the unit), but
        # it seems the most intuitive way to communicate that error to the
        # application/user.
        self._log.exception("error running CU")
        if cu.get('stderr') is None:
            cu['stderr'] = ''
        cu['stderr'] += "\nPilot cannot start compute unit:\n%s\n%s" \
                      % (str(e), traceback.format_exc())

        # Free the Slots, Flee the Flots, Ree the Frots!
        if cu.get('slots'):
            self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB, cu)

        self.advance(cu, rps.FAILED, publish=True, push=False)

def _handle_unit(self, cu):

    ru.raise_on('work unit')

  # import pprint
  # self._log.info('handle cu: %s', pprint.pformat(cu))

    try:
        # prep stdout/err so that we can append w/o checking for None
        cu['stdout'] = ''
        cu['stderr'] = ''

        cpt = cu['description']['cpu_process_type']
        gpt = cu['description']['gpu_process_type']  # FIXME: use

        # FIXME: this switch is insufficient for mixed units (MPI/OpenMP)
        if cpt == 'MPI':
            launcher = self._mpi_launcher
        else:
            launcher = self._task_launcher

        if not launcher:
            raise RuntimeError("no launcher (process type = %s)" % cpt)

        self._log.debug("Launching unit with %s (%s).",
                        launcher.name, launcher.launch_command)

        assert(cu['slots'])

        # Start a new subprocess to launch the unit
        self.spawn(launcher=launcher, cu=cu)

    except Exception as e:
        # append the startup error to the unit's stderr.  This is not
        # completely correct (as this text is not produced by the unit), but
        # it seems the most intuitive way to communicate that error to the
        # application/user.
        self._log.exception("error running CU")
        if cu.get('stderr') is None:
            cu['stderr'] = ''
        cu['stderr'] += "\nPilot cannot start compute unit:\n%s\n%s" \
                      % (str(e), traceback.format_exc())

        # Free the Slots, Flee the Flots, Ree the Frots!
        if cu.get('slots'):
            self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB, cu)

        self.advance(cu, rps.FAILED, publish=True, push=False)

def inner_5(arg_1, arg_2):

    import os

    os.environ['RU_RAISE_ON_TEST'] = '3'
    for i in range(10):
        print i
        ru.raise_on('test')
    print

    os.environ['RU_RAISE_ON_RAND'] = 'RANDOM_10'
    for i in range(100):
        try:
            ru.raise_on('rand')
        except Exception as e:
            print 'raised on %d' % i

def initialize_child(self):

    setproctitle.setproctitle(self.uid)
    self.log = ru.get_logger('radical.' + self.uid + '.child',
                             level=self.verbose)
    ru.raise_on('init')

    # first create threads and procs to be watched
    for name, cfg in self.cfg.iteritems():

        self.log.info('child %s: ', name)

        if 'child' in name:
            child = Child(name=name, cfg=cfg, verbose=self.verbose)
            child.start()
            self.things.append(child)

        elif 'worker' in name:
            worker = Worker(name=name, cfg=cfg, verbose=self.verbose)
            worker.start()
            self.things.append(worker)

    ru.raise_on('init')

def stop(self):

    ru.raise_on('stop')

    assert(self.pid)               # child was spawned
  ## assert(self.is_parent)        # is parent process
  ## assert(ru.is_main_thread())   # is main thread

    self.term.set()

    self.log.info('%-10s : stop child' % self.uid)
    self.watcher.stop()
    ru.raise_on('stop')

  ## # we check if the watcher finishes.
  ## if None == ru.watch_condition(cond=self.watcher.is_alive,
  ##                               target=False,
  ##                               timeout=JOIN_TIMEOUT):
  ##     self.log.info('%-10s : could not stop child - kill' % self.uid)
  ##     self.watcher.kill()

    ## FIXME: we could attempt a kill and *not* join afterwards, just let the
    ##        Python GC do the rest
    ## FIXME: the above is equivalent to `t.join(timeout); t.is_alive()`
    self.watcher.join(JOIN_TIMEOUT)
    self.log.info('%-10s : child stopped (alive: %s)'
                 % (self.uid, bool(self.is_alive())))

    ru.raise_on('stop')

def stop(self):

    ru.raise_on('stop')

    assert(self.pid)               # child was spawned
  ## assert(self.is_parent)        # is parent process
  ## assert(ru.is_main_thread())   # is main thread

    self.term.set()

    self.log.info('%-10s : stop child' % self.uid)
    self.watcher.stop()
    ru.raise_on('stop')

  ## # we check if the watcher finishes.
  ## if None == ru.watch_condition(cond=self.watcher.is_alive,
  ##                               target=False,
  ##                               timeout=JOIN_TIMEOUT):
  ##     self.log.info('%-10s : could not stop child - kill' % self.uid)
  ##     self.watcher.kill()

    ## FIXME: we could attempt a kill and *not* join afterwards, just let the
    ##        Python GC do the rest
    ## FIXME: the above is equivalent to `t.join(timeout); t.is_alive()`
    self.watcher.join(JOIN_TIMEOUT)
    self.log.info('%-10s : child stopped (alive: %s)'
                 % (self.uid, bool(self.is_alive())))

    ru.raise_on('stop')

def work(worker):

    # a simple worker routine which sleeps repeatedly for a random number of
    # seconds, until a term signal is set.  The given 'worker' can be a thread
    # or process, or in fact anything which has a self.uid and self.term.

    try:
        worker.log.info('%-10s : work start' % worker.uid)

        while not worker.term.is_set():

            item = WORK_MIN + (random.random() * (WORK_MAX - WORK_MIN))
            worker.log.info('%-10s : %ds sleep start' % (worker.uid, item))
            time.sleep(item)
            worker.log.info('%-10s : %ds sleep stop' % (worker.uid, item))
            ru.raise_on('work')

        worker.log.info('%-10s : work term requested' % worker.uid)

    except Exception as e:
        worker.log.info('%-10s : work fail [%s]' % (worker.uid, e))
        raise

def _handle_unit(self, cu):

    ru.raise_on('work unit')

    try:
        if cu['description']['mpi']:
            launcher = self._mpi_launcher
        else:
            launcher = self._task_launcher

        if not launcher:
            raise RuntimeError("no launcher (mpi=%s)"
                              % cu['description']['mpi'])

        self._log.debug("Launching unit with %s (%s).",
                        launcher.name, launcher.launch_command)

        assert(cu['opaque_slots'])  # FIXME: no assert, but check
        self._prof.prof('exec', msg='unit launch', uid=cu['uid'])

        # Start a new subprocess to launch the unit
        self.spawn(launcher=launcher, cu=cu)

    except Exception as e:
        # append the startup error to the unit's stderr.  This is not
        # completely correct (as this text is not produced by the unit), but
        # it seems the most intuitive way to communicate that error to the
        # application/user.
        self._log.exception("error running CU")
        if cu.get('stderr') is None:
            cu['stderr'] = ''
        cu['stderr'] += "\nPilot cannot start compute unit:\n%s\n%s" \
                      % (str(e), traceback.format_exc())

        # Free the Slots, Flee the Flots, Ree the Frots!
        if cu['opaque_slots']:
            self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB, cu)

        self.advance(cu, rps.FAILED, publish=True, push=False)

def work(self, units):

    if not isinstance(units, list):
        units = [units]

    self.advance(units, rps.AGENT_STAGING_INPUT, publish=True, push=False)
    ru.raise_on('work bulk')

    # we first filter out any units which don't need any input staging, and
    # advance them again as a bulk.  We work over the others one by one, and
    # advance them individually, to avoid stalling from slow staging ops.

    no_staging_units = list()
    staging_units    = list()

    for unit in units:

        # check if we have any staging directives to be enacted in this
        # component
        actionables = list()
        for sd in unit['description'].get('input_staging', []):
            if sd['action'] in [rpc.LINK, rpc.COPY, rpc.MOVE, rpc.TARBALL]:
                actionables.append(sd)

        if actionables:
            staging_units.append([unit, actionables])
        else:
            no_staging_units.append(unit)

    if no_staging_units:
        self.advance(no_staging_units, rps.AGENT_SCHEDULING_PENDING,
                     publish=True, push=True)

    for unit, actionables in staging_units:
        self._handle_unit(unit, actionables)

def initialize_child(self):

    setproctitle.setproctitle(self.uid)
    self.log = ru.get_logger('radical.' + self.uid + '.child',
                             level=self.verbose)
    ru.raise_on('init')

    # first create threads and procs to be watched
    for name, cfg in self.cfg.iteritems():

        self.log.info('child %s: ', name)

        if 'child' in name:
            child = Child(name=name, cfg=cfg, verbose=self.verbose)
            child.start()
            self.things.append(child)

        elif 'worker' in name:
            worker = Worker(name=name, cfg=cfg, verbose=self.verbose)
            worker.start()
            self.things.append(worker)

    ru.raise_on('init')

def finalize_child(self):

    for thing in self.things:
        thing.stop()
    ru.raise_on('stop')

    self.log.info('%-10s : stop child' % self.uid)
    ru.raise_on('stop')

    self.log.info('%-10s : child stopped' % self.uid)
    ru.raise_on('stop')

def __init__(self, name, cfg, term, verbose):

    ru.raise_on('init')

    mp.Process.__init__(self)

    self.uid       = name
    self.verbose   = verbose
    self.log       = ru.get_logger('radical.' + self.uid, level=verbose)
    self.is_parent = True
    self.cfg       = cfg
    self.wterm     = term        # term sig shared with parent watcher
    self.term      = mp.Event()  # private term signal
    self.killed    = False

    # start watcher for own children and threads
    ru.raise_on('init')
    self.watcher = Watcher(cfg, verbose='error')
    self.watcher.start()
    ru.raise_on('init')

def __init__(self, cfg, verbose):

    ru.raise_on('init')

    mt.Thread.__init__(self)

    self.cfg          = cfg
    self.term         = mt.Event()
    self._thread_term = mt.Event()
    self._proc_term   = mp.Event()
    self.things       = list()
    self.uid          = None

    for name, _ in cfg.iteritems():
        if 'watcher' in name:
            if self.uid:
                raise ValueError('only one watcher supported')
            self.uid = name

    self.log = ru.get_logger('radical.' + self.uid + '.child', level=verbose)
    ru.raise_on('init')

    # first create threads and procs to be watched
    for name, _cfg in cfg.iteritems():

        self.log.info('child %s: ', name)

        if 'child' in name:
            child = Child(name=name, cfg=_cfg, term=self._proc_term,
                          verbose=verbose)
            child.start()
            self.things.append(child)

        elif 'worker' in name:
            worker = Worker(name=name, cfg=_cfg, term=self._thread_term,
                            verbose=verbose)
            worker.start()
            self.things.append(worker)

    ru.raise_on('init')

def __init__(self, cfg, verbose):

    ru.raise_on('init')

    mt.Thread.__init__(self)

    self.cfg          = cfg
    self.term         = mt.Event()
    self._thread_term = mt.Event()
    self._proc_term   = mp.Event()
    self.things       = list()
    self.uid          = None

    for name, _ in cfg.iteritems():
        if 'watcher' in name:
            if self.uid:
                raise ValueError('only one watcher supported')
            self.uid = name

    self.log = ru.get_logger('radical.' + self.uid + '.child', level=verbose)
    ru.raise_on('init')

    # first create threads and procs to be watched
    for name, _cfg in cfg.iteritems():

        self.log.info('child %s: ', name)

        if 'child' in name:
            child = Child(name=name, cfg=_cfg, term=self._proc_term,
                          verbose=verbose)
            child.start()
            self.things.append(child)

        elif 'worker' in name:
            worker = Worker(name=name, cfg=_cfg, term=self._thread_term,
                            verbose=verbose)
            worker.start()
            self.things.append(worker)

    ru.raise_on('init')

def _handle_unit(self, unit, actionables):

    ru.raise_on('work unit')

    uid = unit['uid']

    # NOTE: see documentation of cu['sandbox'] semantics in the ComputeUnit
    #       class definition.
    sandbox = unit['unit_sandbox']

    # By definition, this component lives on the pilot's target resource.
    # As such, we *know* that all staging ops which would refer to the
    # resource now refer to file://localhost, and thus translate the unit,
    # pilot and resource sandboxes into that scope.  Some assumptions are
    # made though:
    #
    #   * paths are directly translatable across schemas
    #   * resource level storage is in fact accessible via file://
    #
    # FIXME: this is costly and should be cached.

    unit_sandbox     = ru.Url(unit['unit_sandbox'])
    pilot_sandbox    = ru.Url(unit['pilot_sandbox'])
    resource_sandbox = ru.Url(unit['resource_sandbox'])

    unit_sandbox.schema     = 'file'
    pilot_sandbox.schema    = 'file'
    resource_sandbox.schema = 'file'

    unit_sandbox.host     = 'localhost'
    pilot_sandbox.host    = 'localhost'
    resource_sandbox.host = 'localhost'

    src_context = {'pwd'     : str(unit_sandbox),       # !!!
                   'unit'    : str(unit_sandbox),
                   'pilot'   : str(pilot_sandbox),
                   'resource': str(resource_sandbox)}
    tgt_context = {'pwd'     : str(unit_sandbox),       # !!!
                   'unit'    : str(unit_sandbox),
                   'pilot'   : str(pilot_sandbox),
                   'resource': str(resource_sandbox)}

    # we can now handle the actionable staging directives
    for sd in actionables:

        action = sd['action']
        flags  = sd['flags']
        did    = sd['uid']
        src    = sd['source']
        tgt    = sd['target']

        self._prof.prof('staging_in_start', uid=uid, msg=did)

        assert(action in [rpc.COPY, rpc.LINK, rpc.MOVE,
                          rpc.TRANSFER, rpc.TARBALL])

        # we only handle staging which does *not* include 'client://' src or
        # tgt URLs - those are handled by the umgr staging components
        if src.startswith('client://') and action != rpc.TARBALL:
            self._log.debug('skip staging for src %s', src)
            self._prof.prof('staging_in_skip', uid=uid, msg=did)
            continue

        if tgt.startswith('client://'):
            self._log.debug('skip staging for tgt %s', tgt)
            self._prof.prof('staging_in_skip', uid=uid, msg=did)
            continue

        # Fix for when the target path is empty: we assume the current
        # directory is the unit sandbox ('unit://'), and we assume the file
        # to be copied is the base filename of the source.
        if tgt is None:
            tgt = ''
        if tgt.strip() == '':
            tgt = 'unit:///{}'.format(os.path.basename(src))

        # Fix for when the target path exists *and* is a folder: we assume
        # the 'current directory' is the target folder, and we assume the
        # file to be copied is the base filename of the source.
        elif os.path.exists(tgt.strip()) and os.path.isdir(tgt.strip()):
            tgt = os.path.join(tgt, os.path.basename(src))

        src = complete_url(src, src_context, self._log)
        tgt = complete_url(tgt, tgt_context, self._log)

        # Currently, we use the same schema for files and folders.
        assert(tgt.schema == 'file'), 'staging tgt must be file://'

        if action in [rpc.COPY, rpc.LINK, rpc.MOVE]:
            assert(src.schema == 'file'), 'staging src expected as file://'

        # SAGA will take care of dir creation - but we do it manually
        # for local ops (copy, link, move)
        if flags & rpc.CREATE_PARENTS and action != rpc.TRANSFER:
            tgtdir = os.path.dirname(tgt.path)
            if tgtdir != sandbox:
                self._log.debug("mkdir %s", tgtdir)
                rpu.rec_makedir(tgtdir)

        if action == rpc.COPY:
            try:
                shutil.copytree(src.path, tgt.path)
            except OSError as exc:
                if exc.errno == errno.ENOTDIR:
                    shutil.copy(src.path, tgt.path)
                else:
                    raise

        elif action == rpc.LINK:
            # Fix issue/1513: if the link source is a file and the target is
            # a folder, follow the POSIX convention and create the link with
            # the same name as the source.
            if os.path.isfile(src.path) and os.path.isdir(tgt.path):
                os.symlink(src.path,
                           '%s/%s' % (tgt.path, os.path.basename(src.path)))
            else:
                # default behavior
                os.symlink(src.path, tgt.path)

        elif action == rpc.MOVE:
            shutil.move(src.path, tgt.path)

        elif action == rpc.TRANSFER:
            # NOTE: TRANSFER directives don't arrive here right now.
            # FIXME: we only handle srm staging right now, and only for
            #        a specific target proxy.  Other TRANSFER directives are
            #        left to umgr input staging.  We should use SAGA to
            #        attempt all staging ops which do not involve the client
            #        machine.
            if src.schema == 'srm':
                # FIXME: cache saga handles
                srm_dir = rs.filesystem.Directory('srm://proxy/?SFN=bogus')
                srm_dir.copy(src, tgt)
                srm_dir.close()
            else:
                self._log.error('no transfer for %s -> %s', src, tgt)
                self._prof.prof('staging_in_fail', uid=uid, msg=did)
                raise NotImplementedError('unsupported transfer %s' % src)

        elif action == rpc.TARBALL:
            # If something was staged via the tarball method, the tarball is
            # extracted and then removed from the unit folder.
            self._log.debug('extract tarball for %s', uid)
            tar = tarfile.open('%s/%s.tar' % (os.path.dirname(tgt.path), uid))
            tar.extractall(path=os.path.dirname(tgt.path))
            tar.close()

            # FIXME: make tarball removal dependent on debug settings
          # os.remove(os.path.dirname(tgt.path) + '/' + uid + '.tar')

        self._prof.prof('staging_in_stop', uid=uid, msg=did)

    # all staging is done -- pass on to the scheduler
    self.advance(unit, rps.AGENT_SCHEDULING_PENDING, publish=True, push=True)

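# `complete_url()` above is a helper defined elsewhere in this module.  The
# sketch below is only an assumed approximation of what it does, named
# differently to make that explicit: it expands relative paths and the
# pseudo-schemas 'unit://', 'pilot://' and 'resource://' against the context
# dict built above, and returns a ru.Url pointing into the right sandbox.

def complete_url_sketch(path, context, log=None):

    url = ru.Url(path)

    if url.schema in ['unit', 'pilot', 'resource']:
        # replace the pseudo-schema with the respective sandbox URL
        base = ru.Url(context[url.schema])
        base.path = base.path + '/' + url.path.lstrip('/')
        if log:
            log.debug('expand %s -> %s', path, base)
        return base

    if not url.schema:
        # no schema given: interpret the path as relative to the current
        # working sandbox
        base = ru.Url(context['pwd'])
        base.path = base.path + '/' + path.lstrip('/')
        return base

    # anything else (file://, srm://, ...) is passed through unchanged
    return url
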
def run(self):

    # We can't catch signals from child processes and threads, so we only
    # look out for SIGTERM signals from the parent process.  Upon receiving
    # such a signal, we'll stop.
    #
    # We also start a watcher (WatcherThread) which babysits all spawned
    # threads and processes, and which will also call stop() on any problems.
    # This should then trickle up to the parent, who will also have a watcher
    # checking on us.

    self.ospid = os.getpid()
    self.tid   = mt.currentThread().ident
    self.uid   = "p.%d.0 %8d.%s" % (self.pnum, self.ospid, self.tid)

    try:
        # --------------------------------------------------------------------
        def sigterm_handler(signum, frame):
            # on sigterm, we invoke stop(), which will exit.
            # Python should (tm) give that signal to the main thread.
            # If not, we lost.
            assert(mt.currentThread().name == 'MainThread')
            self.stop()
        # --------------------------------------------------------------------
        signal.signal(signal.SIGTERM, sigterm_handler)

        print '%s start' % self.uid

        # create worker threads
        self.worker1 = WorkerThread(self.num, self.pnum, 0)
        self.worker1.start()

        self.worker2 = WorkerThread(self.num, self.pnum, 0)
        self.worker2.start()

        self.watcher = WatcherThread([self.worker1, self.worker2],
                                     self.num, self.pnum, 1)
        self.watcher.start()

        while True:
            print '%s run' % self.uid
            time.sleep(SLEEP)
            if self.num == 3 and self.pnum == 1:
                print "3"
                ru.raise_on(self.uid, RAISE_ON)

        print '%s stop' % self.uid

    except Exception as e:
        print '%s error %s [%s]' % (self.uid, e, type(e))

    except SystemExit:
        print '%s exit' % (self.uid)

    except KeyboardInterrupt:
        print '%s intr' % (self.uid)

    finally:
        # we came here either due to errors in run(), a KeyboardInterrupt from
        # the WatcherThread, or a clean exit.  Either way, we stop all
        # children.
        self.stop()

            # (tail of the worker run() loop, as in the run() method above)
            item = WORK_MIN + (random.random() * (WORK_MAX - WORK_MIN))
            self.log.info('%-10s : %ds sleep start' % (self.uid, item))
            time.sleep(item)
            self.log.info('%-10s : %ds sleep stop' % (self.uid, item))
            ru.raise_on('work')

        self.log.info('%-10s : work term requested' % self.uid)

    except Exception as e:
        self.log.info('%-10s : work fail [%s]' % (self.uid, e))
        raise

    self.log.info('%-10s : thread exit requested' % self.uid)


# ------------------------------------------------------------------------------
#
if __name__ == '__main__':

    setproctitle.setproctitle('rp.main')

    child = Child(name='root', cfg=config, verbose='debug')
    child.start()
    ru.raise_on('init')

    time.sleep(TIME_ALIVE)

    ru.raise_on('stop')
    child.stop()

# ------------------------------------------------------------------------------

def _pilot_watcher_cb(self):

    # FIXME: we should actually use SAGA job state notifications!
    # FIXME: check how race conditions are handled: we may detect
    #        a finalized SAGA job and change the pilot state -- but that
    #        pilot may have transitioned into final state via the normal
    #        notification mechanism already.  That probably should be sorted
    #        out by the pilot manager, which will receive notifications for
    #        both transitions.  As long as the final state is the same, there
    #        should be no problem anyway.  If it differs, the 'cleaner' final
    #        state should prevail, in this ordering:
    #          cancel
    #          timeout
    #          error
    #          disappeared
    #        This implies that we want to communicate 'final_cause'

    # we don't want to lock our members all the time.  For that reason we use
    # a copy of the pilots_tocheck list and iterate over that, and only lock
    # other members when they are manipulated.
    ru.raise_on('pilot_watcher_cb')

    tc = rs.job.Container()
    with self._pilots_lock, self._check_lock:

        for pid in self._checking:
            tc.add(self._pilots[pid]['job'])

    states = tc.get_states()

    self._log.debug('bulk states: %s', states)

    # if none of the states is final, we have nothing to do.
    # We can't rely on the ordering of tasks and states in the task
    # container, so we hope that the task container's bulk state query leads
    # to caching of state information, and we thus have cache hits when
    # querying the pilots individually

    final_pilots = list()
    with self._pilots_lock, self._check_lock:

        for pid in self._checking:

            state = self._pilots[pid]['job'].state
            self._log.debug('saga job state: %s %s', pid, state)

            if state in [rs.job.DONE, rs.job.FAILED, rs.job.CANCELED]:

                pilot = self._pilots[pid]['pilot']
                if state == rs.job.DONE    : pilot['state'] = rps.DONE
                if state == rs.job.FAILED  : pilot['state'] = rps.FAILED
                if state == rs.job.CANCELED: pilot['state'] = rps.CANCELED
                final_pilots.append(pilot)

    if final_pilots:

        for pilot in final_pilots:

            with self._check_lock:
                # stop monitoring this pilot
                self._checking.remove(pilot['uid'])

            self._log.debug('final pilot %s %s', pilot['uid'], pilot['state'])

        self.advance(final_pilots, push=False, publish=True)

    # all checks are done, final pilots are weeded out.  Now check if any
    # pilot is scheduled for cancellation and is overdue, and kill it
    # forcefully.
    to_cancel = list()
    with self._pilots_lock:

        for pid in self._pilots:

            pilot   = self._pilots[pid]['pilot']
            time_cr = pilot.get('cancel_requested')

            # check if the pilot became final in the meantime
            if pilot['state'] in rps.FINAL:
                continue

            if time_cr and time_cr + JOB_CANCEL_DELAY < time.time():
                self._log.debug('pilot needs killing: %s : %s + %s < %s',
                                pid, time_cr, JOB_CANCEL_DELAY, time.time())
                del(pilot['cancel_requested'])
                self._log.debug(' cancel pilot %s', pid)
                to_cancel.append(pid)

    if to_cancel:
        self._kill_pilots(to_cancel)

    return True

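# The 'cleaner final state' ordering described in the FIXME comment above
# could be captured in a small helper like the one below.  This is purely an
# illustration of that reasoning, not part of the original module; the names
# are hypothetical.

_FINAL_CAUSE_ORDER = ['cancel', 'timeout', 'error', 'disappeared']

def _cleaner_final_cause(cause_a, cause_b):
    # return whichever cause ranks higher in the ordering above; an unknown
    # cause loses against a known one
    for cause in _FINAL_CAUSE_ORDER:
        if cause in (cause_a, cause_b):
            return cause
    return cause_a or cause_b
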
def work(self, units):

    if not isinstance(units, list):
        units = [units]

    self.advance(units, rps.AGENT_STAGING_OUTPUT, publish=True, push=False)
    ru.raise_on('work bulk')

    # we first filter out any units which don't need any output staging, and
    # advance them again as a bulk.  We work over the others one by one, and
    # advance them individually, to avoid stalling from slow staging ops.

    no_staging_units = list()
    staging_units    = list()

    for unit in units:

        uid = unit['uid']

        # From here on, any state update will hand control over to the umgr
        # again.  The next unit update should thus push *all* unit details,
        # not only the state.
        unit['$all']    = True
        unit['control'] = 'umgr_pending'

        # we always dig for stdout/stderr
        self._handle_unit_stdio(unit)

        # NOTE: all units get here after execution, even those which did not
        #       finish successfully.  We do that so that we can make
        #       stdout/stderr available for failed units (see
        #       _handle_unit_stdio above).  But we don't need to perform any
        #       other staging for those units, and in fact can make them
        #       final.
        if unit['target_state'] != rps.DONE:
            unit['state'] = unit['target_state']
            self._log.debug('unit %s skips staging (%s)', uid, unit['state'])
            no_staging_units.append(unit)
            continue

        # check if we have any staging directives to be enacted in this
        # component
        actionables = list()
        for sd in unit['description'].get('output_staging', []):
            if sd['action'] in [rpc.LINK, rpc.COPY, rpc.MOVE]:
                actionables.append(sd)

        if actionables:
            # this unit needs some staging
            staging_units.append([unit, actionables])
        else:
            # this unit does not need any staging at this point, and can be
            # advanced
            unit['state'] = rps.UMGR_STAGING_OUTPUT_PENDING
            no_staging_units.append(unit)

    if no_staging_units:
        self.advance(no_staging_units, publish=True, push=True)

    for unit, actionables in staging_units:
        self._handle_unit_staging(unit, actionables)

def run(self):

    # We can't catch signals from child processes and threads, so we only
    # look out for SIGTERM signals from the parent process.  Upon receiving
    # such a signal, we'll stop.
    #
    # We also start a watcher (WatcherThread) which babysits all spawned
    # threads and processes, and which will also call stop() on any problems.
    # This should then trickle up to the parent, who will also have a watcher
    # checking on us.

    self.ospid = os.getpid()
    self.tid   = mt.currentThread().ident
    self.uid   = "p.%d.0 %8d.%s" % (self.pnum, self.ospid, self.tid)

    try:
        # --------------------------------------------------------------------
        def sigterm_handler(signum, frame):
            # on sigterm, we invoke stop(), which will exit.
            # Python should (tm) give that signal to the main thread.
            # If not, we lost.
            assert(mt.currentThread().name == 'MainThread')
            self.stop()
        # --------------------------------------------------------------------
        signal.signal(signal.SIGTERM, sigterm_handler)

        print '%s start' % self.uid

        # create worker threads
        self.worker1 = WorkerThread(self.num, self.pnum, 0)
        self.worker1.start()

        self.worker2 = WorkerThread(self.num, self.pnum, 0)
        self.worker2.start()

        self.watcher = WatcherThread([self.worker1, self.worker2],
                                     self.num, self.pnum, 1)
        self.watcher.start()

        while True:
            print '%s run' % self.uid
            time.sleep(SLEEP)
            if self.num == 3 and self.pnum == 1:
                print "3"
                ru.raise_on(self.uid, RAISE_ON)

        print '%s stop' % self.uid

    except Exception as e:
        print '%s error %s [%s]' % (self.uid, e, type(e))

    except SystemExit:
        print '%s exit' % (self.uid)

    except KeyboardInterrupt:
        print '%s intr' % (self.uid)

    finally:
        # we came here either due to errors in run(), a KeyboardInterrupt from
        # the WatcherThread, or a clean exit.  Either way, we stop all
        # children.
        self.stop()

                    # (tail of the watcher run() method, as shown above)
                  # self.stop()
                    return
                self.log.info('%-10s : %s ok' % (self.uid, t.uid))

    except ThreadExit:
        raise RuntimeError('%-10s : watcher exit requested [%s]'
                          % (self.uid, self.ident))

    except Exception as e:
        raise RuntimeError('%-10s : watcher error' % self.uid)

    finally:
        self.log.info('%-10s : stop' % self.uid)


# ------------------------------------------------------------------------------
#
if __name__ == '__main__':

    setproctitle.setproctitle('rp.main')

    watcher = Watcher(config, verbose='debug')
    watcher.start()
    ru.raise_on('init')

    time.sleep(TIME_ALIVE)

    ru.raise_on('stop')
    watcher.stop()
    watcher.join()

# ------------------------------------------------------------------------------

def _pilot_watcher_cb(self):

    # FIXME: we should actually use SAGA job state notifications!
    # FIXME: check how race conditions are handled: we may detect
    #        a finalized SAGA job and change the pilot state -- but that
    #        pilot may have transitioned into final state via the normal
    #        notification mechanism already.  That probably should be sorted
    #        out by the pilot manager, which will receive notifications for
    #        both transitions.  As long as the final state is the same, there
    #        should be no problem anyway.  If it differs, the 'cleaner' final
    #        state should prevail, in this ordering:
    #          cancel
    #          timeout
    #          error
    #          disappeared
    #        This implies that we want to communicate 'final_cause'

    # we don't want to lock our members all the time.  For that reason we use
    # a copy of the pilots_tocheck list and iterate over that, and only lock
    # other members when they are manipulated.
    ru.raise_on('pilot_watcher_cb')

    tc = rs.job.Container()
    with self._pilots_lock, self._check_lock:

        for pid in self._checking:
            tc.add(self._pilots[pid]['job'])

    states = tc.get_states()

    self._log.debug('bulk states: %s', states)

    # if none of the states is final, we have nothing to do.
    # We can't rely on the ordering of tasks and states in the task
    # container, so we hope that the task container's bulk state query leads
    # to caching of state information, and we thus have cache hits when
    # querying the pilots individually

    final_pilots = list()
    with self._pilots_lock, self._check_lock:

        for pid in self._checking:

            state = self._pilots[pid]['job'].state
            self._log.debug('saga job state: %s %s', pid, state)

            if state in [rs.job.DONE, rs.job.FAILED, rs.job.CANCELED]:

                pilot = self._pilots[pid]['pilot']
                if state == rs.job.DONE    : pilot['state'] = rps.DONE
                if state == rs.job.FAILED  : pilot['state'] = rps.FAILED
                if state == rs.job.CANCELED: pilot['state'] = rps.CANCELED
                final_pilots.append(pilot)

    if final_pilots:

        for pilot in final_pilots:

            with self._check_lock:
                # stop monitoring this pilot
                self._checking.remove(pilot['uid'])

            self._log.debug('final pilot %s %s', pilot['uid'], pilot['state'])

        self.advance(final_pilots, push=False, publish=True)

    # all checks are done, final pilots are weeded out.  Now check if any
    # pilot is scheduled for cancellation and is overdue, and kill it
    # forcefully.
    to_cancel  = list()
    to_advance = list()
    with self._pilots_lock:

        for pid in self._pilots:

            pilot   = self._pilots[pid]['pilot']
            time_cr = pilot.get('cancel_requested')

            # check if the pilot became final in the meantime
            if pilot['state'] in rps.FINAL:
                continue

            if time_cr and time_cr + JOB_CANCEL_DELAY < time.time():
                self._log.debug('pilot needs killing: %s : %s + %s < %s',
                                pid, time_cr, JOB_CANCEL_DELAY, time.time())
                del(pilot['cancel_requested'])
                self._log.debug(' cancel pilot %s', pid)
                to_cancel.append(pid)

    if to_cancel:
        self._kill_pilots(to_cancel)

    return True

def stop(self):

    ru.raise_on('stop')
    self.term.set()
    ru.raise_on('stop')

def stop(self):

    self.term.set()
    ru.raise_on('stop')
