def run(self):
        """Sleep in a loop until `self.term` is set, reporting how the loop ends.

        While the termination event is unset, the loop sleeps `SLEEP` seconds
        per iteration.  The instance with `num == 4` and `pnum == 1` also
        calls `ru.raise_on()` as a fault-injection point (presumably raising
        according to `RAISE_ON` -- confirm against radical.utils).

        Exceptions, `SystemExit` and `KeyboardInterrupt` are reported on
        stdout and swallowed; a final line is always printed.

        All output uses the single-argument, parenthesized form of `print`,
        which behaves identically as a Python 2 statement and as a Python 3
        function call.
        """

        try:
            while not self.term.is_set():

                time.sleep(SLEEP)

                if self.num == 4 and self.pnum == 1:
                    print("4")
                    ru.raise_on(self.uid, RAISE_ON)

        except Exception as e:
            print('%s error %s [%s]' % (self.uid, e, type(e)))

        except SystemExit:
            print('%s exit' % (self.uid))

        except KeyboardInterrupt:
            print('%s intr' % (self.uid))

        finally:
            print('%s final' % (self.uid))
    def run(self):
        """Thread body: sleep for random intervals until `self.term` is set.

        Appends '.thread' to `self.uid`, creates a logger for the new uid,
        and then repeatedly sleeps between `WORK_MIN` and `WORK_MAX` seconds.
        `ru.raise_on('work')` is called after each sleep (presumably a
        fault-injection hook -- confirm against radical.utils).  Any
        `Exception` is logged and re-raised.
        """

        self.uid += '.thread'
        self.log = ru.get_logger('radical.' + self.uid, level=self.verbose)

        try:
            self.log.info('%-10s : work start' % self.uid)

            while True:

                if self.term.is_set():
                    break

                span = WORK_MAX - WORK_MIN
                nap  = WORK_MIN + (random.random() * span)

                self.log.info('%-10s : %ds sleep start' % (self.uid, nap))
                time.sleep(nap)
                self.log.info('%-10s : %ds sleep stop'  % (self.uid, nap))
                ru.raise_on('work')

            self.log.info('%-10s : work term requested' % self.uid)

        except Exception as e:
            # log the failure, then let it propagate out of the thread body
            self.log.info('%-10s : work fail [%s]' % (self.uid, e))
            raise

        self.log.info('%-10s : thread exit requested' % self.uid)
예제 #3
0
    def run(self):
        """Thread body: sleep for random intervals until `self.term` is set.

        Appends '.thread' to `self.uid`, creates a logger for the new uid,
        and then repeatedly sleeps between `WORK_MIN` and `WORK_MAX` seconds.
        `ru.raise_on('work')` is called after each sleep (presumably a
        fault-injection hook -- confirm against radical.utils).  Any
        `Exception` is logged and re-raised.
        """

        self.uid += '.thread'
        self.log = ru.get_logger('radical.' + self.uid, level=self.verbose)

        try:
            self.log.info('%-10s : work start' % self.uid)

            while True:

                if self.term.is_set():
                    break

                span = WORK_MAX - WORK_MIN
                nap = WORK_MIN + (random.random() * span)

                self.log.info('%-10s : %ds sleep start' % (self.uid, nap))
                time.sleep(nap)
                self.log.info('%-10s : %ds sleep stop' % (self.uid, nap))
                ru.raise_on('work')

            self.log.info('%-10s : work term requested' % self.uid)

        except Exception as e:
            # log the failure, then let it propagate out of the thread body
            self.log.info('%-10s : work fail [%s]' % (self.uid, e))
            raise

        self.log.info('%-10s : thread exit requested' % self.uid)
 def work(self):
     """Perform one unit of 'work': a logged sleep of random length.

     The sleep time is drawn uniformly from [WORK_MIN, WORK_MAX).  After the
     sleep, `ru.raise_on('work')` is called (presumably a fault-injection
     hook -- confirm against radical.utils).
     """

     span = WORK_MAX - WORK_MIN
     nap  = WORK_MIN + (random.random() * span)

     self.log.info('%-10s : %ds sleep start' % (self.uid, nap))
     time.sleep(nap)
     self.log.info('%-10s : %ds sleep stop'  % (self.uid, nap))

     ru.raise_on('work')
    def run(self):
        """Watch `self.things` until `self.term` is set or one of them dies.

        Every 0.1 seconds each watched thing is checked with `is_alive()`.
        The first dead thing ends the watcher (plain return).
        `ru.ThreadExit` and any other `Exception` are converted into a
        `RuntimeError`.  A 'stop' line is logged on every exit path.
        """

        try:
            self.log.info('%-10s : start' % self.uid)

            while not self.term.is_set():

                time.sleep(0.1)  # start things
                ru.raise_on('watch')

                for thing in self.things:

                    if thing.is_alive():
                        self.log.info('%-10s : %s ok' % (self.uid, thing.uid))
                        continue

                    # a child died.  Stopping the other children via
                    # `self.stop()` is disabled here; we only terminate.
                    self.log.info('%-10s : %s died' % (self.uid, thing.uid))
                    return

        except ru.ThreadExit:
            details = (self.uid, self.ident)
            raise RuntimeError('%-10s : watcher exit requested [%s]' % details)

        except Exception:
            raise RuntimeError('%-10s : watcher error' % self.uid)

        finally:
            self.log.info('%-10s : stop' % self.uid)
    def run(self):
        """Sleep in a loop until `self.term` is set, reporting how the loop ends.

        While the termination event is unset, the loop sleeps `SLEEP` seconds
        per iteration.  The instance with `num == 4` and `pnum == 1` also
        calls `ru.raise_on()` as a fault-injection point (presumably raising
        according to `RAISE_ON` -- confirm against radical.utils).

        Exceptions, `SystemExit` and `KeyboardInterrupt` are reported on
        stdout and swallowed; a final line is always printed.

        All output uses the single-argument, parenthesized form of `print`,
        which behaves identically as a Python 2 statement and as a Python 3
        function call.
        """

        try:
            while not self.term.is_set():

                time.sleep(SLEEP)

                if self.num == 4 and self.pnum == 1:
                    print("4")
                    ru.raise_on(self.uid, RAISE_ON)

        except Exception as e:
            print('%s error %s [%s]' % (self.uid, e, type(e)))

        except SystemExit:
            print('%s exit' % (self.uid))

        except KeyboardInterrupt:
            print('%s intr' % (self.uid))

        finally:
            print('%s final' % (self.uid))
예제 #7
0
    def work(self):
        """Perform one unit of 'work': a logged sleep of random length.

        The sleep time is drawn uniformly from [WORK_MIN, WORK_MAX).  After
        the sleep, `ru.raise_on('work')` is called (presumably a
        fault-injection hook -- confirm against radical.utils).
        """

        span = WORK_MAX - WORK_MIN
        nap = WORK_MIN + (random.random() * span)

        self.log.info('%-10s : %ds sleep start' % (self.uid, nap))
        time.sleep(nap)
        self.log.info('%-10s : %ds sleep stop' % (self.uid, nap))

        ru.raise_on('work')
예제 #8
0
    def run(self):
        """Watch `self.things` until `self.term` is set or one of them dies.

        Every 0.1 seconds each watched thing is checked with `is_alive()`.
        The first dead thing ends the watcher (plain return).  `ThreadExit`
        and any other `Exception` are converted into a `RuntimeError`.  A
        'stop' line is logged on every exit path.
        """

        try:
            self.log.info('%-10s : start' % self.uid)

            while not self.term.is_set():

                time.sleep(0.1)  # start things
                ru.raise_on('watch')

                for thing in self.things:

                    if thing.is_alive():
                        self.log.info('%-10s : %s ok' % (self.uid, thing.uid))
                        continue

                    # a child died.  Stopping the other children via
                    # `self.stop()` is disabled here; we only terminate.
                    self.log.info('%-10s : %s died' % (self.uid, thing.uid))
                    return

        except ThreadExit:
            details = (self.uid, self.ident)
            raise RuntimeError('%-10s : watcher exit requested [%s]' % details)

        except Exception:
            raise RuntimeError('%-10s : watcher error' % self.uid)

        finally:
            self.log.info('%-10s : stop' % self.uid)
예제 #9
0
    def __init__(self, name, cfg, verbose):
        """Set up the thread: identity, config, logger and termination event.

        name:    used as `self.uid` and as suffix of the logger name
        cfg:     stored as `self.cfg`
        verbose: log level handed to `ru.get_logger`
        """

        mt.Thread.__init__(self)

        self.uid, self.verbose, self.cfg = name, verbose, cfg

        logger_name = 'radical.' + self.uid
        self.log = ru.get_logger(logger_name, level=self.verbose)

        # event used by the run loop to detect termination requests
        self.term = mt.Event()

        ru.raise_on('init')
    def __init__(self, name, cfg, verbose):
        """Set up the thread: identity, config, logger and termination event.

        name:    used as `self.uid` and as suffix of the logger name
        cfg:     stored as `self.cfg`
        verbose: log level handed to `ru.get_logger`
        """

        mt.Thread.__init__(self)

        self.uid, self.verbose, self.cfg = name, verbose, cfg

        logger_name = 'radical.' + self.uid
        self.log = ru.get_logger(logger_name, level=self.verbose)

        # event used by the run loop to detect termination requests
        self.term = mt.Event()

        ru.raise_on('init')
예제 #11
0
    def work(self, units):
        """Advance a unit or a list of units to AGENT_EXECUTING and handle each.

        A single (non-list) unit is wrapped into a one-element list.  The
        whole bulk is advanced first, then every unit is passed to
        `self._handle_unit()`.
        """

        bulk = units if isinstance(units, list) else [units]

        self.advance(bulk, rps.AGENT_EXECUTING, publish=True, push=False)

        ru.raise_on('work bulk')

        for cu in bulk:
            self._handle_unit(cu)
예제 #12
0
    def work(self, units):
        """Advance a unit or a list of units to AGENT_EXECUTING and handle each.

        A single (non-list) unit is wrapped into a one-element list.  The
        whole bulk is advanced first, then every unit is passed to
        `self._handle_unit()`.
        """

        bulk = units if isinstance(units, list) else [units]

        self.advance(bulk, rps.AGENT_EXECUTING, publish=True, push=False)

        ru.raise_on('work bulk')

        for cu in bulk:
            self._handle_unit(cu)
    def run(self):
        """Process body: install signal handlers, start worker and watcher, loop.

        Sets `self.ospid`, `self.tid` and `self.uid`, starts one
        `WorkerThread` and one `WatcherThread` watching it, and then loops
        forever, sleeping `SLEEP` seconds per iteration.  The instance with
        `num == 3` and `pnum == 1` calls `ru.raise_on()` as a fault-injection
        point.  The loop is only left via an exception; `self.finalize()` is
        called on every exit path.

        Output uses the single-argument, parenthesized form of `print`, which
        behaves identically on Python 2 and Python 3.
        """

        # We can't catch SIGINT, for the reasons discussed in the introduction.
        # With the default SIGINT handler, SIGINT can hit in unexpected places,
        # mostly when thread termination and process termination race.  Thus we
        # can't use SIGINT at all.
        #
        # We can, however, use a different signal to communicate termination
        # requests from sub-threads to the main thread.  Here we use `SIGUSR2`
        # (`SIGUSR1` is reserved for debugging purposes in the radical stack).
        #
        # We also install a `SIGTERM` handler, to initiate orderly shutdown on
        # system termination signals.
        #
        signal.signal(signal.SIGTERM, sigterm_handler)
        signal.signal(signal.SIGUSR2, sigusr2_handler)

        self.ospid = os.getpid()
        self.tid   = mt.currentThread().ident
        self.uid   = "p.%d.0 %8d.%s" % (self.pnum, self.ospid, self.tid)

        try:
            print('%s start' % self.uid)

            # create worker thread
            self.worker = WorkerThread(self.num, self.pnum, 0)
            self.worker.start()

            self.watcher = WatcherThread([self.worker], self.num, self.pnum, 1)
            self.watcher.start()

            # NOTE: this loop has no exit condition -- it is only left via an
            #       exception.  The former 'stop' message after the loop was
            #       unreachable and has been dropped.
            while True:
                print('%s run' % self.uid)
                time.sleep(SLEEP)
                if self.num == 3 and self.pnum == 1:
                    print("3")
                    ru.raise_on(self.uid, RAISE_ON)

        except Exception as e:
            print('%s error %s [%s]' % (self.uid, e, type(e)))

        except SystemExit:
            print('%s exit' % (self.uid))

        except KeyboardInterrupt:
            print('%s intr' % (self.uid))

        finally:
            self.finalize()
예제 #14
0
    def run(self):
        """Process body: install signal handlers, start worker and watcher, loop.

        Sets `self.ospid`, `self.tid` and `self.uid`, starts one
        `WorkerThread` and one `WatcherThread` watching it, and then loops
        forever, sleeping `SLEEP` seconds per iteration.  The instance with
        `num == 3` and `pnum == 1` calls `ru.raise_on()` as a fault-injection
        point.  The loop is only left via an exception; `self.finalize()` is
        called on every exit path.

        Output uses the single-argument, parenthesized form of `print`, which
        behaves identically on Python 2 and Python 3.
        """

        # We can't catch SIGINT, for the reasons discussed in the introduction.
        # With the default SIGINT handler, SIGINT can hit in unexpected places,
        # mostly when thread termination and process termination race.  Thus we
        # can't use SIGINT at all.
        #
        # We can, however, use a different signal to communicate termination
        # requests from sub-threads to the main thread.  Here we use `SIGUSR2`
        # (`SIGUSR1` is reserved for debugging purposes in the radical stack).
        #
        # We also install a `SIGTERM` handler, to initiate orderly shutdown on
        # system termination signals.
        #
        signal.signal(signal.SIGTERM, sigterm_handler)
        signal.signal(signal.SIGUSR2, sigusr2_handler)

        self.ospid = os.getpid()
        self.tid = mt.currentThread().ident
        self.uid = "p.%d.0 %8d.%s" % (self.pnum, self.ospid, self.tid)

        try:
            print('%s start' % self.uid)

            # create worker thread
            self.worker = WorkerThread(self.num, self.pnum, 0)
            self.worker.start()

            self.watcher = WatcherThread([self.worker], self.num, self.pnum, 1)
            self.watcher.start()

            # NOTE: this loop has no exit condition -- it is only left via an
            #       exception.  The former 'stop' message after the loop was
            #       unreachable and has been dropped.
            while True:
                print('%s run' % self.uid)
                time.sleep(SLEEP)
                if self.num == 3 and self.pnum == 1:
                    print("3")
                    ru.raise_on(self.uid, RAISE_ON)

        except Exception as e:
            print('%s error %s [%s]' % (self.uid, e, type(e)))

        except SystemExit:
            print('%s exit' % (self.uid))

        except KeyboardInterrupt:
            print('%s intr' % (self.uid))

        finally:
            self.finalize()
    def stop(self):
        """Signal termination to the watcher and all children, then join them.

        Sets `self.term`, `self._proc_term` and `self._thread_term`, then
        stops and joins every entry of `self.things` with a timeout of
        `JOIN_TIMEOUT`, logging whether each one was joined or is still
        alive.  `ru.raise_on('stop')` is called between the steps
        (presumably fault-injection points -- confirm against radical.utils).
        """

        # NOTE: this can be called from the watcher subthread

        # end the watcher loop first
        ru.raise_on('stop')
        self.term.set()
        ru.raise_on('stop')

        # then tell process children and thread children to end
        self._proc_term.set()
        self._thread_term.set()
        ru.raise_on('stop')

        for child in self.things:

            self.log.info('%-10s : join    %s' % (self.uid, child.uid))
            child.stop()
            child.join(timeout=JOIN_TIMEOUT)

            if child.is_alive():
                # FIXME: no kill is attempted yet.  It would need to
                #        differentiate between procs and threads (e.g.
                #        `ru.raise_in_thread(tident=child.ident)` followed
                #        by another join).
                self.log.info('%-10s : kill    %s' % (self.uid, child.uid))

            if not child.is_alive():
                self.log.info('%-10s : joined  %s' % (self.uid, child.uid))
            else:
                self.log.info('%-10s : zombied %s' % (self.uid, child.uid))

            ru.raise_on('stop')

        self.log.info('%-10s : stopped' % self.uid)
예제 #16
0
    def stop(self):
        """Signal termination to the watcher and all children, then join them.

        Sets `self.term`, `self._proc_term` and `self._thread_term`, then
        stops and joins every entry of `self.things` with a timeout of
        `JOIN_TIMEOUT`, logging whether each one was joined or is still
        alive.  `ru.raise_on('stop')` is called between the steps
        (presumably fault-injection points -- confirm against radical.utils).
        """

        # NOTE: this can be called from the watcher subthread

        # end the watcher loop first
        ru.raise_on('stop')
        self.term.set()
        ru.raise_on('stop')

        # then tell process children and thread children to end
        self._proc_term.set()
        self._thread_term.set()
        ru.raise_on('stop')

        for child in self.things:

            self.log.info('%-10s : join    %s' % (self.uid, child.uid))
            child.stop()
            child.join(timeout=JOIN_TIMEOUT)

            if child.is_alive():
                # FIXME: no kill is attempted yet.  It would need to
                #        differentiate between procs and threads (e.g.
                #        `ru.raise_in_thread(tident=child.ident)` followed
                #        by another join).
                self.log.info('%-10s : kill    %s' % (self.uid, child.uid))

            if not child.is_alive():
                self.log.info('%-10s : joined  %s' % (self.uid, child.uid))
            else:
                self.log.info('%-10s : zombied %s' % (self.uid, child.uid))

            ru.raise_on('stop')

        self.log.info('%-10s : stopped' % self.uid)
    def __init__(self, name, cfg, verbose):
        """Prepare instance state, then initialize the `ru.Process` base.

        name:    used as `self.uid`, logger suffix and process name
        cfg:     stored as `self.cfg`
        verbose: log level handed to `ru.get_logger`
        """

        ru.raise_on('init')

        self.uid     = name
        self.verbose = verbose
        self.cfg     = cfg
        self.things  = []
        self.term    = None  # child only
        self.log     = ru.get_logger('radical.' + self.uid,
                                     level=self.verbose)

        ru.raise_on('init')

        # the base class is initialized last: it needs the uid and the logger
        ru.Process.__init__(self, name=self.uid, log=self.log)
예제 #18
0
    def __init__(self, name, cfg, verbose):
        """Prepare instance state, then initialize the `ru.Process` base.

        name:    used as `self.uid`, logger suffix and process name
        cfg:     stored as `self.cfg`
        verbose: log level handed to `ru.get_logger`
        """

        ru.raise_on('init')

        self.uid = name
        self.verbose = verbose
        self.cfg = cfg
        self.things = []
        self.term = None  # child only
        self.log = ru.get_logger('radical.' + self.uid, level=self.verbose)

        ru.raise_on('init')

        # the base class is initialized last: it needs the uid and the logger
        ru.Process.__init__(self, name=self.uid, log=self.log)
def main(num):
    """Start two `ProcessWorker`s and a `WatcherThread`, then loop until interrupted.

    num: test scenario selector.  It is passed on to the workers and the
         watcher; for `num == 1` the main loop itself calls `ru.raise_on()`
         as a fault-injection point.

    `RuntimeError`, `SystemExit` and `KeyboardInterrupt` are reported on
    stdout and swallowed.  `finalize(p1, p2, watcher)` runs on every exit
    path.
    """

    # *always* install SIGTERM and SIGUSR2 handlers.  The handlers are defined
    # elsewhere; they are expected to translate those signals into catchable
    # exceptions.

    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGUSR2, sigusr2_handler)

    watcher = None
    p1      = None
    p2      = None

    # Bind `uid` *before* entering the try block: all handlers below use it,
    # and an exception raised before the assignment (e.g. from a signal
    # handler) would otherwise surface as a confusing UnboundLocalError.
    pid = os.getpid()
    tid = mt.currentThread().ident
    uid = "m.0.0 %8d.%s" % (pid, tid)

    try:
        print('%s start' % uid)
        p1 = ProcessWorker(num, 1)
        p2 = ProcessWorker(num, 2)

        p1.start()
        p2.start()

        watcher = WatcherThread([p1, p2], num, 0, 1)
        watcher.start()

        # NOTE: this loop has no exit condition -- it is only left via an
        #       exception.  The former 'stop' message after the loop was
        #       unreachable and has been dropped.
        while True:
            print('%s run' % uid)
            time.sleep(SLEEP)
            if num == 1:
                print("1")
                ru.raise_on(uid, RAISE_ON)

    except RuntimeError as e:
        print('%s error %s [%s]' % (uid, e, type(e)))

    except SystemExit:
        print('%s exit' % (uid))

    except KeyboardInterrupt:
        print('%s intr' % (uid))

    finally:
        finalize(p1, p2, watcher)
    def run(self):
        """Watch `self.to_watch` and the main thread until `self.term` is set.

        Each iteration sleeps `SLEEP` seconds and reports the liveness of
        every watched thing.  When a thing has died, the main thread is
        cancelled via `ru.cancel_main_thread()`.  When `self.main` is no
        longer alive, `self.stop()` is called.  The instances with
        (`num`, `pnum`) of (2, 0) and (5, 1) call `ru.raise_on()` as
        fault-injection points.

        Any `Exception` is reported and cancels the main thread;
        `SystemExit` is only reported.  A final line is always printed.

        Output uses the single-argument, parenthesized form of `print`, which
        behaves identically on Python 2 and Python 3.
        """

        try:
            print('%s start' % self.uid)

            while not self.term.is_set():

                time.sleep(SLEEP)

                if self.num == 2 and self.pnum == 0:
                    print("2")
                    ru.raise_on(self.uid, RAISE_ON)

                if self.num == 5 and self.pnum == 1:
                    print("5")
                    ru.raise_on(self.uid, RAISE_ON)

                # check watchables
                for thing in self.to_watch:
                    if thing.is_alive():
                        print('%s event: thing %s is alive' % (self.uid,
                                                               thing.uid))
                    else:
                        print('%s event: thing %s has died' % (self.uid,
                                                               thing.uid))
                        ru.cancel_main_thread()
                        # we should never get here.  Raise explicitly: an
                        # `assert` would be stripped under `python -O`, and
                        # the loop would then silently continue.
                        raise AssertionError()

                # check MainThread
                if not self.main.is_alive():
                    print('%s: main thread gone - terminate' % self.uid)
                    self.stop()

            print('%s stop' % self.uid)

        except Exception as e:
            print('%s error %s [%s]' % (self.uid, e, type(e)))
            ru.cancel_main_thread()

        except SystemExit:
            print('%s exit' % (self.uid))
            # do *not* cancel main thread here!  We get here after the cancel
            # signal has been sent in the main loop above

        finally:
            print('%s final' % (self.uid))
예제 #21
0
def main(num):
    """Start two `ProcessWorker`s and a `WatcherThread`, then loop until interrupted.

    num: test scenario selector.  It is passed on to the workers and the
         watcher; for `num == 1` the main loop itself calls `ru.raise_on()`
         as a fault-injection point.

    `RuntimeError`, `SystemExit` and `KeyboardInterrupt` are reported on
    stdout and swallowed.  `finalize(p1, p2, watcher)` runs on every exit
    path.
    """

    # *always* install SIGTERM and SIGUSR2 handlers.  The handlers are defined
    # elsewhere; they are expected to translate those signals into catchable
    # exceptions.

    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGUSR2, sigusr2_handler)

    watcher = None
    p1 = None
    p2 = None

    # Bind `uid` *before* entering the try block: all handlers below use it,
    # and an exception raised before the assignment (e.g. from a signal
    # handler) would otherwise surface as a confusing UnboundLocalError.
    pid = os.getpid()
    tid = mt.currentThread().ident
    uid = "m.0.0 %8d.%s" % (pid, tid)

    try:
        print('%s start' % uid)
        p1 = ProcessWorker(num, 1)
        p2 = ProcessWorker(num, 2)

        p1.start()
        p2.start()

        watcher = WatcherThread([p1, p2], num, 0, 1)
        watcher.start()

        # NOTE: this loop has no exit condition -- it is only left via an
        #       exception.  The former 'stop' message after the loop was
        #       unreachable and has been dropped.
        while True:
            print('%s run' % uid)
            time.sleep(SLEEP)
            if num == 1:
                print("1")
                ru.raise_on(uid, RAISE_ON)

    except RuntimeError as e:
        print('%s error %s [%s]' % (uid, e, type(e)))

    except SystemExit:
        print('%s exit' % (uid))

    except KeyboardInterrupt:
        print('%s intr' % (uid))

    finally:
        finalize(p1, p2, watcher)
예제 #22
0
def inner_5(arg_1, arg_2):  # pylint: disable=W0613
    """Exercise `ru.raise_on()` with a fixed and a randomized setting.

    First sets `RU_RAISE_ON_TEST` to '3' and calls `ru.raise_on('test')` up
    to ten times, printing the counter before each call; an exception raised
    there is not caught.  Then sets `RU_RAISE_ON_RAND` to 'RANDOM_10' and
    calls `ru.raise_on('rand')` 100 times, printing the index of every call
    which raised.  Both arguments are unused.
    """

    os.environ['RU_RAISE_ON_TEST'] = '3'

    step = 0
    while step < 10:
        print(step)
        ru.raise_on('test')
        step += 1

    print()

    os.environ['RU_RAISE_ON_RAND'] = 'RANDOM_10'

    for attempt in range(100):
        try:
            ru.raise_on('rand')
        except Exception:
            print('raised on %d' % attempt)
예제 #23
0
    def run(self):
        """Watch `self.to_watch` until `self.term` is set or a thing dies.

        Each iteration sleeps `SLEEP` seconds and reports the liveness of
        every watched thing.  When a thing has died, the main thread is
        cancelled via `ru.cancel_main_thread('usr2')` and a `RuntimeError`
        is raised.  That error is caught by the generic handler below, which
        reports it and cancels the main thread once more.  The instances
        with (`num`, `pnum`) of (2, 0) and (5, 1) call `ru.raise_on()` as
        fault-injection points.

        Output uses the single-argument, parenthesized form of `print`, which
        behaves identically on Python 2 and Python 3.
        """

        try:
            print('%s start' % self.uid)

            while not self.term.is_set():

                time.sleep(SLEEP)

                if self.num == 2 and self.pnum == 0:
                    print("2")
                    ru.raise_on(self.uid, RAISE_ON)

                if self.num == 5 and self.pnum == 1:
                    print("5")
                    ru.raise_on(self.uid, RAISE_ON)

                for thing in self.to_watch:
                    if thing.is_alive():
                        print('%s event: thing %s is alive' % (self.uid,
                                                               thing.uid))
                    else:
                        print('%s event: thing %s has died' % (self.uid,
                                                               thing.uid))
                        ru.cancel_main_thread('usr2')
                        raise RuntimeError('thing %s has died - assert' %
                                           thing.uid)

            print('%s stop' % self.uid)

        except Exception as e:
            print('%s error %s [%s]' % (self.uid, e, type(e)))
            ru.cancel_main_thread('usr2')

        except SystemExit:
            print('%s exit' % (self.uid))
            # do *not* cancel main thread here!

        except KeyboardInterrupt:
            print('%s intr' % (self.uid))
            ru.cancel_main_thread('usr2')

        finally:
            print('%s final' % (self.uid))
예제 #24
0
    def __init__(self, name, cfg, term, verbose):
        """Set up the thread state; refuse creation outside the main thread.

        name:    used as `self.uid` and as suffix of the logger name
        cfg:     stored as `self.cfg`
        term:    termination signal object, stored as `self.term`
        verbose: log level handed to `ru.get_logger`

        Raises RuntimeError when not called from the main thread.
        """

        mt.Thread.__init__(self)

        self.uid, self.verbose = name, verbose
        self.log = ru.get_logger('radical.' + self.uid, level=verbose)
        self.cfg, self.term = cfg, term

        ru.raise_on('init')

        # we don't allow subsubthreads
        # FIXME: this could be lifted, but we leave in place and
        #        re-evaluate as needed.
        if ru.is_main_thread():
            return

        raise RuntimeError('threads must be spawned by MainThread [%s]'
                           % ru.get_thread_name())
    def run(self):
        """Watch `self.to_watch` and the main thread until `self.term` is set.

        Each iteration sleeps `SLEEP` seconds and reports the liveness of
        every watched thing.  When a thing has died, the main thread is
        cancelled via `ru.cancel_main_thread()`.  When `self.main` is no
        longer alive, `self.stop()` is called.  The instances with
        (`num`, `pnum`) of (2, 0) and (5, 1) call `ru.raise_on()` as
        fault-injection points.

        Any `Exception` is reported and cancels the main thread;
        `SystemExit` is only reported.  A final line is always printed.

        Output uses the single-argument, parenthesized form of `print`, which
        behaves identically on Python 2 and Python 3.
        """

        try:
            print('%s start' % self.uid)

            while not self.term.is_set():

                time.sleep(SLEEP)

                if self.num == 2 and self.pnum == 0:
                    print("2")
                    ru.raise_on(self.uid, RAISE_ON)

                if self.num == 5 and self.pnum == 1:
                    print("5")
                    ru.raise_on(self.uid, RAISE_ON)

                # check watchables
                for thing in self.to_watch:
                    if thing.is_alive():
                        print('%s event: thing %s is alive' % (self.uid, thing.uid))
                    else:
                        print('%s event: thing %s has died' % (self.uid, thing.uid))
                        ru.cancel_main_thread()
                        # we should never get here.  Raise explicitly: an
                        # `assert` would be stripped under `python -O`, and
                        # the loop would then silently continue.
                        raise AssertionError()

                # check MainThread
                if not self.main.is_alive():
                    print('%s: main thread gone - terminate' % self.uid)
                    self.stop()

            print('%s stop' % self.uid)

        except Exception as e:
            print('%s error %s [%s]' % (self.uid, e, type(e)))
            ru.cancel_main_thread()

        except SystemExit:
            print('%s exit' % (self.uid))
            # do *not* cancel main thread here!  We get here after the cancel
            # signal has been sent in the main loop above

        finally:
            print('%s final' % (self.uid))
    def __init__(self, name, cfg, term, verbose):
        """Set up the thread state; refuse creation outside the main thread.

        name:    used as `self.uid` and as suffix of the logger name
        cfg:     stored as `self.cfg`
        term:    termination signal object, stored as `self.term`
        verbose: log level handed to `ru.get_logger`

        Raises RuntimeError when not called from the main thread.
        """

        mt.Thread.__init__(self)

        self.uid, self.verbose = name, verbose
        self.log = ru.get_logger('radical.' + self.uid, level=verbose)
        self.cfg, self.term = cfg, term

        ru.raise_on('init')

        # we don't allow subsubthreads
        # FIXME: this could be lifted, but we leave in place and
        #        re-evaluate as needed.
        if ru.is_main_thread():
            return

        raise RuntimeError('threads must be spawned by MainThread [%s]'
                           % ru.get_thread_name())
예제 #27
0
    def _handle_unit(self, cu):
        """Select a launcher for the unit `cu` and spawn it; fail it on error.

        cu: unit dict.  The code reads `cu['description']` (keys
            `cpu_process_type` and `gpu_process_type`) and `cu['slots']`,
            and resets `cu['stdout']` and `cu['stderr']`.

        Any exception during preparation or spawning is logged and appended
        to the unit's stderr.  If the unit holds slots, it is published for
        unscheduling.  Finally the unit is advanced to FAILED.
        """

        ru.raise_on('work unit')

        try:
            # prep stdout/err so that we can append w/o checking for None
            cu['stdout'] = ''
            cu['stderr'] = ''

            cpt = cu['description']['cpu_process_type']
            gpt = cu['description']['gpu_process_type']  # FIXME: use

            # FIXME: this switch is insufficient for mixed units (MPI/OpenMP)
            if cpt == 'MPI':
                launcher = self._mpi_launcher
            else:
                launcher = self._task_launcher

            if not launcher:
                raise RuntimeError("no launcher (process type = %s)" % cpt)

            self._log.debug("Launching unit with %s (%s).",
                            launcher.name, launcher.launch_command)

            # explicit check instead of `assert`: asserts are stripped under
            # `python -O`, and a unit without slots would then be spawned.
            if not cu['slots']:
                raise RuntimeError("no slots assigned to unit")

            # Start a new subprocess to launch the unit
            self.spawn(launcher=launcher, cu=cu)

        except Exception as e:
            # append the startup error to the units stderr.  This is
            # not completely correct (as this text is not produced
            # by the unit), but it seems the most intuitive way to
            # communicate that error to the application/user.
            self._log.exception("error running CU")
            if cu.get('stderr') is None:
                cu['stderr'] = ''
            cu['stderr'] += "\nPilot cannot start compute unit:\n%s\n%s" \
                            % (str(e), traceback.format_exc())

            # free the slots held by the unit, if any
            if cu.get('slots'):
                self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB, cu)

            self.advance(cu, rps.FAILED, publish=True, push=False)
예제 #28
0
    def _handle_unit(self, cu):
        """Select a launcher for the unit `cu` and spawn it; fail it on error.

        cu: unit dict.  The code reads `cu['description']` (keys
            `cpu_process_type` and `gpu_process_type`) and `cu['slots']`,
            and resets `cu['stdout']` and `cu['stderr']`.

        Any exception during preparation or spawning is logged and appended
        to the unit's stderr.  If the unit holds slots, it is published for
        unscheduling.  Finally the unit is advanced to FAILED.
        """

        ru.raise_on('work unit')

        try:
            # prep stdout/err so that we can append w/o checking for None
            cu['stdout'] = ''
            cu['stderr'] = ''

            cpt = cu['description']['cpu_process_type']
            gpt = cu['description']['gpu_process_type']  # FIXME: use

            # FIXME: this switch is insufficient for mixed units (MPI/OpenMP)
            if cpt == 'MPI':
                launcher = self._mpi_launcher
            else:
                launcher = self._task_launcher

            if not launcher:
                raise RuntimeError("no launcher (process type = %s)" % cpt)

            self._log.debug("Launching unit with %s (%s).", launcher.name,
                            launcher.launch_command)

            # explicit check instead of `assert`: asserts are stripped under
            # `python -O`, and a unit without slots would then be spawned.
            if not cu['slots']:
                raise RuntimeError("no slots assigned to unit")

            # Start a new subprocess to launch the unit
            self.spawn(launcher=launcher, cu=cu)

        except Exception as e:
            # append the startup error to the units stderr.  This is
            # not completely correct (as this text is not produced
            # by the unit), but it seems the most intuitive way to
            # communicate that error to the application/user.
            self._log.exception("error running CU")
            if cu.get('stderr') is None:
                cu['stderr'] = ''
            cu['stderr'] += "\nPilot cannot start compute unit:\n%s\n%s" \
                            % (str(e), traceback.format_exc())

            # free the slots held by the unit, if any
            if cu.get('slots'):
                self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB, cu)

            self.advance(cu, rps.FAILED, publish=True, push=False)
    def run(self):
        """Watch `self.to_watch` until `self.term` is set or a thing dies.

        Each iteration sleeps `SLEEP` seconds and reports the liveness of
        every watched thing.  When a thing has died, the main thread is
        cancelled via `ru.cancel_main_thread('usr2')` and a `RuntimeError`
        is raised.  That error is caught by the generic handler below, which
        reports it and cancels the main thread once more.  The instances
        with (`num`, `pnum`) of (2, 0) and (5, 1) call `ru.raise_on()` as
        fault-injection points.

        Output uses the single-argument, parenthesized form of `print`, which
        behaves identically on Python 2 and Python 3.
        """

        try:
            print('%s start' % self.uid)

            while not self.term.is_set():

                time.sleep(SLEEP)

                if self.num == 2 and self.pnum == 0:
                    print("2")
                    ru.raise_on(self.uid, RAISE_ON)

                if self.num == 5 and self.pnum == 1:
                    print("5")
                    ru.raise_on(self.uid, RAISE_ON)

                for thing in self.to_watch:
                    if thing.is_alive():
                        print('%s event: thing %s is alive' % (self.uid, thing.uid))
                    else:
                        print('%s event: thing %s has died' % (self.uid, thing.uid))
                        ru.cancel_main_thread('usr2')
                        raise RuntimeError('thing %s has died - assert' % thing.uid)

            print('%s stop' % self.uid)

        except Exception as e:
            print('%s error %s [%s]' % (self.uid, e, type(e)))
            ru.cancel_main_thread('usr2')

        except SystemExit:
            print('%s exit' % (self.uid))
            # do *not* cancel main thread here!

        except KeyboardInterrupt:
            print('%s intr' % (self.uid))
            ru.cancel_main_thread('usr2')

        finally:
            print('%s final' % (self.uid))
예제 #30
0
def inner_5(arg_1, arg_2):
    """Exercise `ru.raise_on()` with a fixed and a randomized setting.

    First sets `RU_RAISE_ON_TEST` to '3' and calls `ru.raise_on('test')` up
    to ten times, printing the counter before each call; an exception raised
    there is not caught.  Then sets `RU_RAISE_ON_RAND` to 'RANDOM_10' and
    calls `ru.raise_on('rand')` 100 times, printing the index of every call
    which raised.  Both arguments are unused.

    Output uses the single-argument, parenthesized form of `print`, which
    behaves identically on Python 2 and Python 3.
    """

    import os

    os.environ['RU_RAISE_ON_TEST'] = '3'

    for i in range(10):
        print(i)
        ru.raise_on('test')

    # empty line.  `print('')` emits a lone newline on both Python 2 and 3,
    # whereas a bare `print()` would print '()' as a Python 2 statement.
    print('')

    os.environ['RU_RAISE_ON_RAND'] = 'RANDOM_10'

    for i in range(100):
        try:
            ru.raise_on('rand')
        except Exception:
            print('raised on %d' % i)
예제 #31
0
    def initialize_child(self):
        """Set the process title and logger, then start all configured children.

        For every `(name, cfg)` entry of `self.cfg`, a `Child` is started if
        the name contains 'child', or a `Worker` if it contains 'worker'.
        Started instances are appended to `self.things`.  Entries matching
        neither are only logged.
        """

        setproctitle.setproctitle(self.uid)

        self.log = ru.get_logger('radical.' + self.uid + '.child',
                                 level=self.verbose)
        ru.raise_on('init')

        # first create threads and procs to be watched
        # NOTE: `items()` works on both Python 2 and 3, while the former
        #       `iteritems()` only exists on Python 2 dicts.
        for name, cfg in self.cfg.items():
            self.log.info('child %s: ', name)
            if 'child' in name:
                child = Child(name=name, cfg=cfg, verbose=self.verbose)
                child.start()
                self.things.append(child)
            elif 'worker' in name:
                worker = Worker(name=name, cfg=cfg, verbose=self.verbose)
                worker.start()
                self.things.append(worker)
            ru.raise_on('init')
예제 #32
0
    def stop(self):
        """Request termination, stop the watcher and wait for it to finish.

        Sets `self.term`, calls `self.watcher.stop()` and joins the watcher
        with a timeout of `JOIN_TIMEOUT`.  The final log line reports
        `self.is_alive()`.  `ru.raise_on('stop')` is called between the
        steps (presumably fault-injection points -- confirm against
        radical.utils).
        """

        ru.raise_on('stop')

        assert (self.pid)  # child was spawned
        ## assert(self.is_parent)        # is parent process
        ## assert(ru.is_main_thread())   # is main thread

        self.term.set()

        self.log.info('%-10s : stop child' % self.uid)
        self.watcher.stop()
        ru.raise_on('stop')

        ## # we check if the watcher finishes.
        ## if None == ru.watch_condition(cond=self.watcher.is_alive,
        ##                               target=False,
        ##                               timeout=JOIN_TIMEOUT):
        ##     self.log.info('%-10s : could not stop child - kill' % self.uid)
        ##     self.watcher.kill()
        ## FIXME: we could attempt a kill and *not* join afterwards, just let py GC
        ##        do the rest
        ## FIXME: the above is equivalent to `t.join(timeout); t.is_alive()
        # the join may time out -- the watcher can still be running afterwards

        self.watcher.join(JOIN_TIMEOUT)
        self.log.info('%-10s : child stopped (alive: %s)' %
                      (self.uid, bool(self.is_alive())))
        ru.raise_on('stop')
    def stop(self):
        """Request termination, stop the watcher and wait for it to finish.

        Sets `self.term`, calls `self.watcher.stop()` and joins the watcher
        with a timeout of `JOIN_TIMEOUT`.  The final log line reports
        `self.is_alive()`.  `ru.raise_on('stop')` is called between the
        steps (presumably fault-injection points -- confirm against
        radical.utils).
        """

        ru.raise_on('stop')

        assert(self.pid)              # child was spawned
     ## assert(self.is_parent)        # is parent process
     ## assert(ru.is_main_thread())   # is main thread

        self.term.set()

        self.log.info('%-10s : stop child' % self.uid)
        self.watcher.stop()
        ru.raise_on('stop')

     ## # we check if the watcher finishes.
     ## if None == ru.watch_condition(cond=self.watcher.is_alive,
     ##                               target=False,
     ##                               timeout=JOIN_TIMEOUT):
     ##     self.log.info('%-10s : could not stop child - kill' % self.uid)
     ##     self.watcher.kill()
     ## FIXME: we could attempt a kill and *not* join afterwards, just let py GC
     ##        do the rest
     ## FIXME: the above is equivalent to `t.join(timeout); t.is_alive()
        # the join may time out -- the watcher can still be running afterwards

        self.watcher.join(JOIN_TIMEOUT)
        self.log.info('%-10s : child stopped (alive: %s)' % (self.uid, bool(self.is_alive())))
        ru.raise_on('stop')
예제 #34
0
def work(worker):
    """Sleep for random intervals until `worker.term` is set.

    worker: anything which provides `uid`, `log` and `term` -- a thread, a
            process, or any other object with those attributes.

    Each round sleeps between `WORK_MIN` and `WORK_MAX` seconds and then
    calls `ru.raise_on('work')`.  Any `Exception` is logged and re-raised.
    """

    try:
        worker.log.info('%-10s : work start' % worker.uid)

        while True:

            if worker.term.is_set():
                break

            span = WORK_MAX - WORK_MIN
            nap = WORK_MIN + (random.random() * span)

            worker.log.info('%-10s : %ds sleep start' % (worker.uid, nap))
            time.sleep(nap)
            worker.log.info('%-10s : %ds sleep stop' % (worker.uid, nap))
            ru.raise_on('work')

        worker.log.info('%-10s : work term requested' % worker.uid)

    except Exception as e:
        # log the failure, then let the caller deal with it
        worker.log.info('%-10s : work fail [%s]' % (worker.uid, e))
        raise
def work(worker):
    """
    Sleep repeatedly for a random number of seconds until `term` is set.

    The given `worker` can be a thread or a process, or in fact anything
    which has `uid`, `term` (with `is_set()`) and `log` (with `info()`).
    Every sleep takes between `WORK_MIN` and `WORK_MAX` seconds, after which
    `ru.raise_on('work')` is called.  Exceptions are logged and re-raised.
    """

    try:
        worker.log.info('%-10s : work start' % worker.uid)

        keep_going = not worker.term.is_set()
        while keep_going:

            duration = WORK_MIN + (random.random() * (WORK_MAX - WORK_MIN))

            worker.log.info('%-10s : %ds sleep start'
                            % (worker.uid, duration))
            time.sleep(duration)
            worker.log.info('%-10s : %ds sleep stop'
                            % (worker.uid, duration))

            ru.raise_on('work')

            # re-check the termination signal before the next round
            keep_going = not worker.term.is_set()

        worker.log.info('%-10s : work term requested' % worker.uid)

    except Exception as err:
        worker.log.info('%-10s : work fail [%s]' % (worker.uid, err))
        raise
예제 #36
0
    def _handle_unit(self, cu):
        """
        Select a launcher for the compute unit `cu` and spawn the unit.

        The MPI launcher is used when `cu['description']['mpi']` is set, the
        task launcher otherwise.  Any error during startup is appended to the
        unit's `stderr`, the unit's slots (if any) are handed back via the
        unschedule pubsub, and the unit is advanced to `rps.FAILED`.

        :param cu: unit dict; reads 'description', 'opaque_slots', 'uid' and
                   reads/writes 'stderr'
        """

        ru.raise_on('work unit')

        try:
            if cu['description']['mpi']:
                launcher = self._mpi_launcher
            else:
                launcher = self._task_launcher

            if not launcher:
                raise RuntimeError("no launcher (mpi=%s)" %
                                   cu['description']['mpi'])

            self._log.debug("Launching unit with %s (%s).", launcher.name,
                            launcher.launch_command)

            # explicit check instead of `assert`: asserts are stripped under
            # `python -O`, and an AssertionError would carry no message into
            # the unit's stderr below.
            if not cu.get('opaque_slots'):
                raise ValueError("unit %s has no slots assigned"
                                 % cu.get('uid'))

            self._prof.prof('exec', msg='unit launch', uid=cu['uid'])

            # Start a new subprocess to launch the unit
            self.spawn(launcher=launcher, cu=cu)

        except Exception as e:
            # append the startup error to the units stderr.  This is
            # not completely correct (as this text is not produced
            # by the unit), but it seems the most intuitive way to
            # communicate that error to the application/user.
            self._log.exception("error running CU")
            if cu.get('stderr') is None:
                cu['stderr'] = ''
            cu['stderr'] += "\nPilot cannot start compute unit:\n%s\n%s" \
                            % (str(e), traceback.format_exc())

            # Free the slots.  Use `get()`: a missing 'opaque_slots' key must
            # not raise from within this handler, or the unit would never be
            # advanced to FAILED.
            if cu.get('opaque_slots'):
                self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB, cu)

            self.advance(cu, rps.FAILED, publish=True, push=False)
예제 #37
0
    def work(self, units):
        """
        Advance units into agent input staging and enact their directives.

        Units without any LINK / COPY / MOVE / TARBALL input staging
        directive are advanced to `rps.AGENT_SCHEDULING_PENDING` as one bulk.
        All other units are handed one by one to `self._handle_unit()`, so
        that slow staging operations do not stall the remaining units.

        :param units: a unit dict, or a list of unit dicts
        """

        if not isinstance(units, list):
            units = [units]

        self.advance(units, rps.AGENT_STAGING_INPUT, publish=True, push=False)

        ru.raise_on('work bulk')

        pass_through = list()   # units with nothing to stage here
        to_stage     = list()   # [unit, actionables] pairs

        for unit in units:

            # collect the staging directives which are to be enacted in this
            # component
            directives  = unit['description'].get('input_staging', [])
            actionables = [sd for sd in directives
                           if sd['action'] in [rpc.LINK, rpc.COPY,
                                               rpc.MOVE, rpc.TARBALL]]

            if not actionables:
                pass_through.append(unit)
            else:
                to_stage.append([unit, actionables])

        if pass_through:
            self.advance(pass_through, rps.AGENT_SCHEDULING_PENDING,
                         publish=True, push=True)

        for unit, actionables in to_stage:
            self._handle_unit(unit, actionables)
    def initialize_child(self):
        """
        Set the process title and logger, then start all watched things.

        For every entry in `self.cfg`, a `Child` is created if the entry name
        contains 'child', a `Worker` if it contains 'worker'; other entries
        are only logged.  Created things are started and kept in
        `self.things`.
        """

        setproctitle.setproctitle(self.uid)

        log_name = 'radical.' + self.uid + '.child'
        self.log = ru.get_logger(log_name, level=self.verbose)
        ru.raise_on('init')

        # first create threads and procs to be watched
        for name, sub_cfg in self.cfg.iteritems():

            self.log.info('child %s: ', name)

            # 'child' takes precedence over 'worker' if both match
            if 'child' in name:
                factory = Child
            elif 'worker' in name:
                factory = Worker
            else:
                factory = None

            if factory is not None:
                thing = factory(name=name, cfg=sub_cfg, verbose=self.verbose)
                thing.start()
                self.things.append(thing)

            ru.raise_on('init')
    def finalize_child(self):
        """
        Stop everything in `self.things`, then log the two shutdown messages.

        `ru.raise_on('stop')` is called after the stop loop and after each
        log message.
        """

        for watched in self.things:
            watched.stop()

        ru.raise_on('stop')

        for template in ('%-10s : stop child', '%-10s : child stopped'):
            self.log.info(template % self.uid)
            ru.raise_on('stop')
예제 #40
0
    def finalize_child(self):
        """
        Call `stop()` on all watched things and log the shutdown progress.

        A `ru.raise_on('stop')` call follows the stop loop and each of the
        two log messages.
        """

        for watched in self.things:
            watched.stop()

        ru.raise_on('stop')

        for template in ('%-10s : stop child', '%-10s : child stopped'):
            self.log.info(template % self.uid)
            ru.raise_on('stop')
예제 #41
0
    def __init__(self, name, cfg, term, verbose):
        """
        Set up the child process object and start its watcher.

        :param name:    used as `self.uid` and as logger name suffix
        :param cfg:     configuration, also passed on to the `Watcher`
        :param term:    termination signal shared with the parent watcher
        :param verbose: log level for this child's logger
        """

        ru.raise_on('init')
        mp.Process.__init__(self)

        # identity and logging
        self.uid       = name
        self.verbose   = verbose
        self.log       = ru.get_logger('radical.' + self.uid, level=verbose)

        # state and configuration
        self.is_parent = True
        self.cfg       = cfg
        self.wterm     = term        # term sig shared with parent watcher
        self.term      = mp.Event()  # private term signal
        self.killed    = False

        # start watcher for own children and threads
        ru.raise_on('init')
        self.watcher = Watcher(cfg, verbose='error')
        self.watcher.start()
        ru.raise_on('init')
    def __init__(self, name, cfg, term, verbose):
        """
        Initialize the child process object and start a watcher for it.

        :param name:    becomes `self.uid`, and part of the logger name
        :param cfg:     configuration, handed to the `Watcher` as well
        :param term:    termination signal shared with the parent watcher
        :param verbose: log level of this child's logger
        """

        ru.raise_on('init')
        mp.Process.__init__(self)

        self.uid = name
        self.verbose = verbose
        self.log = ru.get_logger('radical.' + self.uid, level=verbose)
        self.is_parent = True
        self.cfg = cfg

        # two termination signals: one shared with the parent watcher, and
        # one private to this child
        self.wterm = term
        self.term = mp.Event()
        self.killed = False

        # start watcher for own children and threads
        ru.raise_on('init')
        self.watcher = Watcher(cfg, verbose='error')
        self.watcher.start()
        ru.raise_on('init')
    def __init__(self, cfg, verbose):
        """
        Set up the watcher thread and start everything it has to watch.

        The watcher's `uid` is the one `cfg` entry whose name contains
        'watcher'.  For every `cfg` entry whose name contains 'child' a
        `Child` is started, for every entry containing 'worker' a `Worker`;
        both are collected in `self.things`.

        :param cfg:     configuration; iterated via `iteritems()`
        :param verbose: log level, also passed to children and workers
        :raises ValueError: if more than one watcher entry is configured
        """

        ru.raise_on('init')
        mt.Thread.__init__(self)

        self.cfg          = cfg
        self.term         = mt.Event()
        self._thread_term = mt.Event()   # handed to workers
        self._proc_term   = mp.Event()   # handed to children
        self.things       = list()
        self.uid          = None

        # find our own entry in the config
        for name, _ in cfg.iteritems():
            if 'watcher' not in name:
                continue
            if self.uid:
                raise ValueError('only one watcher supported')
            self.uid = name

        log_name = 'radical.' + self.uid + '.child'
        self.log = ru.get_logger(log_name, level=verbose)

        ru.raise_on('init')

        # first create threads and procs to be watched
        for name, sub_cfg in cfg.iteritems():

            self.log.info('child %s: ', name)

            # 'child' takes precedence over 'worker' if both match
            if 'child' in name:
                factory, term = Child, self._proc_term
            elif 'worker' in name:
                factory, term = Worker, self._thread_term
            else:
                factory, term = None, None

            if factory is not None:
                thing = factory(name=name, cfg=sub_cfg, term=term,
                                verbose=verbose)
                thing.start()
                self.things.append(thing)

            ru.raise_on('init')
예제 #44
0
    def __init__(self, cfg, verbose):
        """
        Initialize the watcher thread and start all things to be watched.

        The single `cfg` entry whose name contains 'watcher' provides
        `self.uid`.  Entries containing 'child' result in a started `Child`,
        entries containing 'worker' in a started `Worker`; all of them are
        stored in `self.things`.

        :param cfg:     configuration; iterated via `iteritems()`
        :param verbose: log level, forwarded to children and workers
        :raises ValueError: if `cfg` names more than one watcher
        """

        ru.raise_on('init')
        mt.Thread.__init__(self)

        self.cfg = cfg
        self.term = mt.Event()
        self._thread_term = mt.Event()  # termination signal for workers
        self._proc_term = mp.Event()    # termination signal for children
        self.things = list()
        self.uid = None

        # our uid is the name of the (only) watcher entry
        for name, _ in cfg.iteritems():
            if 'watcher' not in name:
                continue
            if self.uid:
                raise ValueError('only one watcher supported')
            self.uid = name

        log_name = 'radical.' + self.uid + '.child'
        self.log = ru.get_logger(log_name, level=verbose)

        ru.raise_on('init')

        # first create threads and procs to be watched
        for name, sub_cfg in cfg.iteritems():

            self.log.info('child %s: ', name)

            # 'child' wins over 'worker' if a name contains both
            if 'child' in name:
                factory, term = Child, self._proc_term
            elif 'worker' in name:
                factory, term = Worker, self._thread_term
            else:
                factory, term = None, None

            if factory is not None:
                thing = factory(name=name, cfg=sub_cfg, term=term,
                                verbose=verbose)
                thing.start()
                self.things.append(thing)

            ru.raise_on('init')
예제 #45
0
    def _handle_unit(self, unit, actionables):
        """
        Enact the agent-side input staging directives of one unit.

        The unit, pilot and resource sandboxes are rewritten to
        `file://localhost` URLs and used as context to expand the source and
        target of each directive.  Directives which refer to `client://` are
        skipped (except for a TARBALL source).  COPY, LINK and MOVE are done
        locally, TRANSFER is only supported for `srm://` sources, TARBALL
        extracts `<uid>.tar` next to the target.  Once all directives are
        handled, the unit is advanced to `rps.AGENT_SCHEDULING_PENDING`.

        :param unit:        unit dict; reads 'uid', 'unit_sandbox',
                            'pilot_sandbox' and 'resource_sandbox'
        :param actionables: list of staging directive dicts with the keys
                            'action', 'flags', 'uid', 'source' and 'target'
        :raises NotImplementedError: on a TRANSFER directive whose source is
                            not an `srm://` URL
        """

        ru.raise_on('work unit')

        uid = unit['uid']

        # NOTE: see documentation of cu['sandbox'] semantics in the ComputeUnit
        #       class definition.
        sandbox = unit['unit_sandbox']

        # By definition, this component lives on the pilot's target resource.
        # As such, we *know* that all staging ops which would refer to the
        # resource now refer to file://localhost, and thus translate the unit,
        # pilot and resource sandboxes into that scope.  Some assumptions are
        # made though:
        #
        #   * paths are directly translatable across schemas
        #   * resource level storage is in fact accessible via file://
        #
        # FIXME: this is costly and should be cached.

        unit_sandbox = ru.Url(unit['unit_sandbox'])
        pilot_sandbox = ru.Url(unit['pilot_sandbox'])
        resource_sandbox = ru.Url(unit['resource_sandbox'])

        unit_sandbox.schema = 'file'
        pilot_sandbox.schema = 'file'
        resource_sandbox.schema = 'file'

        unit_sandbox.host = 'localhost'
        pilot_sandbox.host = 'localhost'
        resource_sandbox.host = 'localhost'

        src_context = {
            'pwd': str(unit_sandbox),  # !!!
            'unit': str(unit_sandbox),
            'pilot': str(pilot_sandbox),
            'resource': str(resource_sandbox)
        }
        tgt_context = {
            'pwd': str(unit_sandbox),  # !!!
            'unit': str(unit_sandbox),
            'pilot': str(pilot_sandbox),
            'resource': str(resource_sandbox)
        }

        # we can now handle the actionable staging directives
        for sd in actionables:

            action = sd['action']
            flags = sd['flags']
            did = sd['uid']
            src = sd['source']
            tgt = sd['target']

            # An unset target is treated like an empty one.  This has to
            # happen *before* any string method is called on `tgt` below.
            if tgt is None:
                tgt = ''

            self._prof.prof('staging_in_start', uid=uid, msg=did)

            assert (action in [
                rpc.COPY, rpc.LINK, rpc.MOVE, rpc.TRANSFER, rpc.TARBALL
            ])

            # we only handle staging which does *not* include 'client://' src or
            # tgt URLs - those are handled by the umgr staging components
            if src.startswith('client://') and action != rpc.TARBALL:
                self._log.debug('skip staging for src %s', src)
                self._prof.prof('staging_in_skip', uid=uid, msg=did)
                continue

            if tgt.startswith('client://'):
                self._log.debug('skip staging for tgt %s', tgt)
                self._prof.prof('staging_in_skip', uid=uid, msg=did)
                continue

            # Fix for when the target PATH is empty
            # we assume current directory is the unit staging 'unit://'
            # and we assume the file to be copied is the base filename of the source
            if tgt.strip() == '':
                tgt = 'unit:///{}'.format(os.path.basename(src))
            # Fix for when the target PATH exists *and* it is a folder
            # we assume the 'current directory' is the target folder
            # and we assume the file to be copied is the base filename of the source
            elif os.path.exists(tgt.strip()) and os.path.isdir(tgt.strip()):
                tgt = os.path.join(tgt, os.path.basename(src))

            src = complete_url(src, src_context, self._log)
            tgt = complete_url(tgt, tgt_context, self._log)

            # Currently, we use the same schema for files and folders.
            assert (tgt.schema == 'file'), 'staging tgt must be file://'

            if action in [rpc.COPY, rpc.LINK, rpc.MOVE]:
                assert (
                    src.schema == 'file'), 'staging src expected as file://'

            # SAGA will take care of dir creation - but we do it manually
            # for local ops (copy, link, move)
            if flags & rpc.CREATE_PARENTS and action != rpc.TRANSFER:
                tgtdir = os.path.dirname(tgt.path)
                if tgtdir != sandbox:
                    self._log.debug("mkdir %s", tgtdir)
                    rpu.rec_makedir(tgtdir)

            if action == rpc.COPY:
                # try a directory copy first, and fall back to a plain file
                # copy if the source turns out not to be a directory
                try:
                    shutil.copytree(src.path, tgt.path)
                except OSError as exc:
                    if exc.errno == errno.ENOTDIR:
                        shutil.copy(src.path, tgt.path)
                    else:
                        raise

            elif action == rpc.LINK:

                # Fix issue/1513 if link source is file and target is folder.
                # should support POSIX standard where link is created
                # with the same name as the source
                if os.path.isfile(src.path) and os.path.isdir(tgt.path):
                    os.symlink(
                        src.path,
                        '%s/%s' % (tgt.path, os.path.basename(src.path)))

                else:  # default behavior
                    os.symlink(src.path, tgt.path)

            elif action == rpc.MOVE:
                shutil.move(src.path, tgt.path)

            elif action == rpc.TRANSFER:

                # NOTE:  TRANSFER directives don't arrive here right now.
                # FIXME: we only handle srm staging right now, and only for
                #        a specific target proxy. Other TRANSFER directives are
                #        left to umgr input staging.  We should use SAGA to
                #        attempt all staging ops which do not involve the client
                #        machine.
                if src.schema == 'srm':
                    # FIXME: cache saga handles
                    srm_dir = rs.filesystem.Directory('srm://proxy/?SFN=bogus')
                    try:
                        srm_dir.copy(src, tgt)
                    finally:
                        # release the handle even if the copy fails
                        srm_dir.close()

                else:
                    self._log.error('no transfer for %s -> %s', src, tgt)
                    self._prof.prof('staging_in_fail', uid=uid, msg=did)
                    raise NotImplementedError('unsupported transfer %s' % src)

            elif action == rpc.TARBALL:

                # If something was staged via the tarball method, the tarball
                # is extracted into the target's directory.
                # NOTE(review): `extractall()` trusts the member paths of the
                #               tarball -- confirm the tarball origin is
                #               trusted.
                self._log.debug('extract tarball for %s', uid)
                tar = tarfile.open('%s/%s.tar' %
                                   (os.path.dirname(tgt.path), uid))
                try:
                    tar.extractall(path=os.path.dirname(tgt.path))
                finally:
                    # close the archive even if extraction fails
                    tar.close()

            # FIXME: make tarball removal dependent on debug settings
            # os.remove(os.path.dirname(tgt.path) + '/' + uid + '.tar')

            self._prof.prof('staging_in_stop', uid=uid, msg=did)

        # all staging is done -- pass on to the scheduler
        self.advance(unit,
                     rps.AGENT_SCHEDULING_PENDING,
                     publish=True,
                     push=True)
    def run(self):
        """
        Process main routine: start two workers and a watcher, then loop.

        We can't catch signals from child processes and threads, so we only
        look out for SIGTERM signals from the parent process.  Upon receiving
        such, we'll stop.

        We also start a watcher (WatcherThread) which babysits all spawned
        threads and processes, and which will also call stop() on any
        problems.  This should then trickle up to the parent, who will also
        have a watcher checking on us.
        """

        self.ospid = os.getpid()
        self.tid   = mt.currentThread().ident

        ident      = (self.pnum, self.ospid, self.tid)
        self.uid   = "p.%d.0 %8d.%s" % ident

        try:
            def _on_sigterm(signum, frame):
                # on sigterm, we invoke stop(), which will exit.
                # Python should (tm) give that signal to the main thread.
                # If not, we lost.
                assert mt.currentThread().name == 'MainThread'
                self.stop()

            signal.signal(signal.SIGTERM, _on_sigterm)

            print('%s start' % self.uid)

            # create and start the worker threads
            self.worker1 = WorkerThread(self.num, self.pnum, 0)
            self.worker1.start()

            self.worker2 = WorkerThread(self.num, self.pnum, 0)
            self.worker2.start()

            # the watcher looks after both workers
            watched      = [self.worker1, self.worker2]
            self.watcher = WatcherThread(watched, self.num, self.pnum, 1)
            self.watcher.start()

            while True:
                print('%s run' % self.uid)
                time.sleep(SLEEP)

                # only process 1 of level 3 calls the raise_on hook
                if self.num == 3 and self.pnum == 1:
                    print("3")
                    ru.raise_on(self.uid, RAISE_ON)

            print('%s stop' % self.uid)

        except Exception as err:
            print('%s error %s [%s]' % (self.uid, err, type(err)))

        except SystemExit:
            print('%s exit' % self.uid)

        except KeyboardInterrupt:
            print('%s intr' % self.uid)

        finally:
            # we came here either due to errors in run(), KeyboardInterrupt from
            # the WatcherThread, or clean exit.  Either way, we stop all
            # children.
            self.stop()
                item = WORK_MIN + (random.random() * (WORK_MAX - WORK_MIN))
                self.log.info('%-10s : %ds sleep start' % (self.uid, item))
                time.sleep(item)
                self.log.info('%-10s : %ds sleep stop'  % (self.uid, item))
                ru.raise_on('work')
    
            self.log.info('%-10s : work term requested' % self.uid)
    
        except Exception as e:
            self.log.info('%-10s : work fail [%s]' % (self.uid, e))
            raise
    
        self.log.info('%-10s : thread exit requested' % self.uid)


# ------------------------------------------------------------------------------
#
if __name__ == '__main__':

    # Script entry point: start the root child, keep it alive for
    # TIME_ALIVE seconds, then stop it again.

    setproctitle.setproctitle('rp.main')

    child = Child(name='root', cfg=config, verbose='debug')
    child.start()
    # NOTE(review): `ru.raise_on` looks like a fault-injection hook for the
    #               'init' and 'stop' phases -- confirm against radical.utils.
    ru.raise_on('init')
    time.sleep(TIME_ALIVE)
    ru.raise_on('stop')
    child.stop()

# ------------------------------------------------------------------------------

예제 #48
0
    def _pilot_watcher_cb(self):
        """
        Check the SAGA job states of all monitored pilots.

        Pilots whose job is DONE, FAILED or CANCELED get the matching pilot
        state, are removed from `self._checking` and are advanced as a bulk.
        Afterwards, non-final pilots whose cancellation request is older than
        `JOB_CANCEL_DELAY` are killed via `self._kill_pilots()`.

        :return: always `True`
        """

        # FIXME: we should actually use SAGA job state notifications!
        # FIXME: check how race conditions are handled: we may detect
        #        a finalized SAGA job and change the pilot state -- but that
        #        pilot may have transitioned into final state via the normal
        #        notification mechanism already.  That probably should be
        #        sorted out by the pilot manager, which will receive
        #        notifications for both transitions.  As long as the final
        #        state is the same, there should be no problem anyway.  If it
        #        differs, the 'cleaner' final state should prevail, in this
        #        ordering:
        #          cancel
        #          timeout
        #          error
        #          disappeared
        #        This implies that we want to communicate 'final_cause'

        ru.raise_on('pilot_watcher_cb')

        # bulk state query over all monitored jobs; `self._checking` is
        # iterated while holding both locks
        tc = rs.job.Container()
        with self._pilots_lock, self._check_lock:
            for pid in self._checking:
                tc.add(self._pilots[pid]['job'])

        states = tc.get_states()
        self._log.debug('bulk states: %s', states)

        # We can't rely on the ordering of tasks and states in the task
        # container, so we hope that the bulk state query above led to
        # a caching of state information, and that we thus have cache hits
        # when querying the pilots individually.
        final_pilots = list()
        with self._pilots_lock, self._check_lock:

            for pid in self._checking:

                job_state = self._pilots[pid]['job'].state
                self._log.debug('saga job state: %s %s', pid, job_state)

                if job_state not in [rs.job.DONE, rs.job.FAILED,
                                     rs.job.CANCELED]:
                    continue

                # translate the final job state into the pilot state
                pilot = self._pilots[pid]['pilot']
                for saga_state, pilot_state in ((rs.job.DONE,     rps.DONE),
                                                (rs.job.FAILED,   rps.FAILED),
                                                (rs.job.CANCELED, rps.CANCELED)):
                    if job_state == saga_state:
                        pilot['state'] = pilot_state

                final_pilots.append(pilot)

        for pilot in final_pilots:

            with self._check_lock:
                # stop monitoring this pilot
                self._checking.remove(pilot['uid'])

            self._log.debug('final pilot %s %s', pilot['uid'], pilot['state'])

        if final_pilots:
            self.advance(final_pilots, push=False, publish=True)

        # all checks are done, final pilots are weeded out.  Now check if any
        # pilot is scheduled for cancellation and is overdue, and kill it
        # forcefully.
        to_cancel = list()
        with self._pilots_lock:

            for pid in self._pilots:

                pilot     = self._pilots[pid]['pilot']
                requested = pilot.get('cancel_requested')

                # skip pilots which became final meanwhile
                if pilot['state'] in rps.FINAL:
                    continue

                # skip pilots without a cancellation request
                if not requested:
                    continue

                if requested + JOB_CANCEL_DELAY < time.time():
                    self._log.debug('pilot needs killing: %s :  %s + %s < %s',
                                    pid, requested, JOB_CANCEL_DELAY,
                                    time.time())
                    del pilot['cancel_requested']
                    self._log.debug(' cancel pilot %s', pid)
                    to_cancel.append(pid)

        if to_cancel:
            self._kill_pilots(to_cancel)

        return True
예제 #49
0
    def work(self, units):
        """
        Advance units into agent output staging and enact their directives.

        Stdout/stderr handling is run for every unit.  Units whose
        `target_state` is not `rps.DONE` become final with that state, units
        without LINK / COPY / MOVE output staging directives are moved to
        `rps.UMGR_STAGING_OUTPUT_PENDING`; both groups are advanced as one
        bulk.  The remaining units are handed one by one to
        `self._handle_unit_staging()`, so that slow staging operations do not
        stall the others.

        :param units: a unit dict, or a list of unit dicts
        """

        if not isinstance(units, list):
            units = [units]

        self.advance(units, rps.AGENT_STAGING_OUTPUT, publish=True, push=False)

        ru.raise_on('work bulk')

        pass_through = list()   # units with nothing to stage here
        to_stage     = list()   # [unit, actionables] pairs

        for unit in units:

            uid = unit['uid']

            # From here on, any state update will hand control over to the umgr
            # again.  The next unit update should thus push *all* unit details,
            # not only state.
            unit['$all'] = True
            unit['control'] = 'umgr_pending'

            # we always dig for stdout/stderr
            self._handle_unit_stdio(unit)

            # NOTE: all units get here after execution, even those which did
            #       not finish successfully, so that stdout/stderr becomes
            #       available for failed units, too.  Those units need no
            #       other staging though, and can be made final.
            if unit['target_state'] != rps.DONE:
                unit['state'] = unit['target_state']
                self._log.debug('unit %s skips staging (%s)', uid,
                                unit['state'])
                pass_through.append(unit)
                continue

            # collect the staging directives which are to be enacted in this
            # component
            directives  = unit['description'].get('output_staging', [])
            actionables = [sd for sd in directives
                           if sd['action'] in [rpc.LINK, rpc.COPY, rpc.MOVE]]

            if not actionables:
                # nothing to stage at this point: the unit can be advanced
                unit['state'] = rps.UMGR_STAGING_OUTPUT_PENDING
                pass_through.append(unit)
            else:
                to_stage.append([unit, actionables])

        if pass_through:
            self.advance(pass_through, publish=True, push=True)

        for unit, actionables in to_stage:
            self._handle_unit_staging(unit, actionables)
    def run(self):
        """
        Main routine of the process: spawn workers and a watcher, then idle.

        Signals from child processes and threads cannot be caught, so only
        SIGTERM from the parent process is handled: upon receiving it, we
        stop.

        A watcher (WatcherThread) babysits all spawned threads and processes,
        and will also call stop() on any problems.  This should then trickle
        up to the parent, who will also have a watcher checking on us.
        """

        self.ospid = os.getpid()
        self.tid = mt.currentThread().ident

        ident = (self.pnum, self.ospid, self.tid)
        self.uid = "p.%d.0 %8d.%s" % ident

        try:
            def _on_sigterm(signum, frame):
                # on sigterm, we invoke stop(), which will exit.
                # Python should (tm) give that signal to the main thread.
                # If not, we lost.
                assert mt.currentThread().name == 'MainThread'
                self.stop()

            signal.signal(signal.SIGTERM, _on_sigterm)

            print('%s start' % self.uid)

            # create and start both worker threads
            self.worker1 = WorkerThread(self.num, self.pnum, 0)
            self.worker1.start()

            self.worker2 = WorkerThread(self.num, self.pnum, 0)
            self.worker2.start()

            # the watcher keeps an eye on both workers
            watched = [self.worker1, self.worker2]
            self.watcher = WatcherThread(watched, self.num, self.pnum, 1)
            self.watcher.start()

            while True:
                print('%s run' % self.uid)
                time.sleep(SLEEP)

                # only process 1 of level 3 calls the raise_on hook
                if self.num == 3 and self.pnum == 1:
                    print("3")
                    ru.raise_on(self.uid, RAISE_ON)

            print('%s stop' % self.uid)

        except Exception as err:
            print('%s error %s [%s]' % (self.uid, err, type(err)))

        except SystemExit:
            print('%s exit' % self.uid)

        except KeyboardInterrupt:
            print('%s intr' % self.uid)

        finally:
            # we came here either due to errors in run(), KeyboardInterrupt from
            # the WatcherThread, or clean exit.  Either way, we stop all
            # children.
            self.stop()
예제 #51
0
                        # self.stop()
                        return
                    self.log.info('%-10s : %s ok' % (self.uid, t.uid))

        except ThreadExit:
            raise RuntimeError('%-10s : watcher exit requested [%s]' % \
                    (self.uid, self.ident))

        except Exception as e:
            raise RuntimeError('%-10s : watcher error' % self.uid)

        finally:
            self.log.info('%-10s : stop' % self.uid)


# ------------------------------------------------------------------------------
#
if __name__ == '__main__':

    # Script entry point: start the watcher thread, keep it running for
    # TIME_ALIVE seconds, then stop it and wait for it to finish.

    setproctitle.setproctitle('rp.main')

    watcher = Watcher(config, verbose='debug')
    watcher.start()
    # NOTE(review): `ru.raise_on` looks like a fault-injection hook for the
    #               'init' and 'stop' phases -- confirm against radical.utils.
    ru.raise_on('init')
    time.sleep(TIME_ALIVE)
    ru.raise_on('stop')
    watcher.stop()
    watcher.join()

# ------------------------------------------------------------------------------
예제 #52
0
    def _pilot_watcher_cb(self):
        """
        Check the SAGA job states of all monitored pilots.

        Pilots whose job is DONE, FAILED or CANCELED get the matching pilot
        state, are removed from `self._checking` and are advanced as a bulk.
        Afterwards, non-final pilots whose cancellation request is older than
        `JOB_CANCEL_DELAY` are killed via `self._kill_pilots()`.

        :return: always `True`
        """

        # FIXME: we should actually use SAGA job state notifications!
        # FIXME: check how race conditions are handled: we may detect
        #        a finalized SAGA job and change the pilot state -- but that
        #        pilot may have transitioned into final state via the normal
        #        notification mechanism already.  That probably should be sorted
        #        out by the pilot manager, which will receive notifications for
        #        both transitions.  As long as the final state is the same,
        #        there should be no problem anyway.  If it differs, the
        #        'cleaner' final state should prevail, in this ordering:
        #          cancel
        #          timeout
        #          error
        #          disappeared
        #        This implies that we want to communicate 'final_cause'

        ru.raise_on('pilot_watcher_cb')

        # bulk state query over all monitored jobs; `self._checking` is
        # iterated while holding both locks
        tc = rs.job.Container()
        with self._pilots_lock, self._check_lock:

            for pid in self._checking:
                tc.add(self._pilots[pid]['job'])

        states = tc.get_states()

        self._log.debug('bulk states: %s', states)

        # if none of the states is final, we have nothing to do.
        # We can't rely on the ordering of tasks and states in the task
        # container, so we hope that the task container's bulk state query lead
        # to a caching of state information, and we thus have cache hits when
        # querying the pilots individually

        final_pilots = list()
        with self._pilots_lock, self._check_lock:

            for pid in self._checking:

                state = self._pilots[pid]['job'].state
                self._log.debug('saga job state: %s %s', pid, state)

                if state in [rs.job.DONE, rs.job.FAILED, rs.job.CANCELED]:
                    # translate the final job state into the pilot state
                    pilot = self._pilots[pid]['pilot']
                    if state == rs.job.DONE:
                        pilot['state'] = rps.DONE
                    if state == rs.job.FAILED:
                        pilot['state'] = rps.FAILED
                    if state == rs.job.CANCELED:
                        pilot['state'] = rps.CANCELED
                    final_pilots.append(pilot)

        if final_pilots:

            for pilot in final_pilots:

                with self._check_lock:
                    # stop monitoring this pilot
                    self._checking.remove(pilot['uid'])

                self._log.debug('final pilot %s %s', pilot['uid'],
                                pilot['state'])

            self.advance(final_pilots, push=False, publish=True)

        # all checks are done, final pilots are weeded out.  Now check if any
        # pilot is scheduled for cancellation and is overdue, and kill it
        # forcefully.
        to_cancel = list()
        with self._pilots_lock:

            for pid in self._pilots:

                pilot = self._pilots[pid]['pilot']
                time_cr = pilot.get('cancel_requested')

                # check if the pilot is final meanwhile
                if pilot['state'] in rps.FINAL:
                    continue

                if time_cr and time_cr + JOB_CANCEL_DELAY < time.time():
                    self._log.debug('pilot needs killing: %s :  %s + %s < %s',
                                    pid, time_cr, JOB_CANCEL_DELAY,
                                    time.time())
                    # drop the request so the pilot is only killed once
                    del pilot['cancel_requested']
                    self._log.debug(' cancel pilot %s', pid)
                    to_cancel.append(pid)

        if to_cancel:
            self._kill_pilots(to_cancel)

        return True
예제 #53
0
    def stop(self):
        """Request termination by setting `self.term`."""

        # NOTE(review): `ru.raise_on` looks like a fault-injection hook for
        #               the 'stop' phase, called before and after the signal
        #               is set -- confirm against radical.utils.
        ru.raise_on('stop')
        self.term.set()
        ru.raise_on('stop')
    def stop(self):

        self.term.set()
        ru.raise_on('stop')