def initialize(self):

    self._umgr = self._cfg.owner

    self._early        = dict()      # early-bound units, pid-sorted
    self._pilots       = dict()      # dict of known pilots
    self._pilots_lock  = ru.RLock()  # lock on the above dict
    self._units        = dict()      # dict of scheduled unit IDs
    self._units_lock   = ru.RLock()  # lock on the above dict
    self._waiting      = dict()      # dict for units waiting on deps
    self._waiting_lock = ru.RLock()  # lock on the above dict

    # configure the scheduler instance
    self._configure()

    self.register_input(rps.UMGR_SCHEDULING_PENDING,
                        rpc.UMGR_SCHEDULING_QUEUE, self.work)

    self.register_output(rps.UMGR_STAGING_INPUT_PENDING,
                         rpc.UMGR_STAGING_INPUT_QUEUE)

    # Some schedulers care about states (of pilots and/or units), some
    # don't.  Either way, we subscribe here to state updates.
    self.register_subscriber(rpc.STATE_PUBSUB, self._base_state_cb)

    # Schedulers use the command channel to get information about pilots
    # being added or removed.
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._base_command_cb)

    # cache the local client sandbox to avoid repeated os calls
    self._client_sandbox = os.getcwd()
def __init__(self, env=None, sid=None):

    super(SimulatedEnactor, self).__init__(sid=sid)

    # List of all workflows that are executing and need to be monitored.
    # This list is shared between threads and requires a lock.
    self._to_monitor = list()

    self._prof.prof('enactor_setup', uid=self._uid)

    # locks to provide atomicity for the monitoring data structures
    self._monitoring_lock = ru.RLock('cm.monitor_lock')
    self._cb_lock         = ru.RLock('enactor.cb_lock')
    self._callbacks       = dict()

    # thread which executes the monitoring method, plus its
    # termination event
    self._monitoring_thread = None       # will hold the thread
    self._terminate_monitor = mt.Event() # thread event to terminate

    self._sim_env = env
    self._run     = False

    # simulation thread and its termination event
    self._terminate_simulation = mt.Event()
    self._simulation_thread    = mt.Thread(target=self._sim_run,
                                           name='sim-thread')
    self._simulation_thread.start()

    self._prof.prof('enactor_started', uid=self._uid)
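# Minimal standalone sketch of the event-driven worker-thread pattern used
# above: a monitoring thread polls until a `threading.Event` asks it to
# terminate.  Names here are illustrative, not the enactor's actual API.
import threading as mt
import time

terminate = mt.Event()

def monitor():
    while not terminate.is_set():
        # ... check monitored workflows here ...
        time.sleep(0.1)

t = mt.Thread(target=monitor, name='monitor-thread')
t.start()

terminate.set()   # request shutdown
t.join()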
def _configure(self):

    Continuous._configure(self)

    # This scheduler waits for state updates and considers a unit `done`
    # once it reaches a trigger state.  When a state update shows that a
    # unit reached that state, it is marked as 'done' in the respective
    # order of its namespace.
    #
    self._trigger_state = rps.UMGR_STAGING_OUTPUT_PENDING
    self.register_subscriber(rpc.STATE_PUBSUB, self._state_cb)

    # a namespace entry will look like this:
    #
    #   { 'current' : 0,        # BoT currently operated on - starts with 0
    #     0 :                   # sequential BoT numbering `n`
    #       {'size': 128,       # number of units to expect        `max`
    #        'uids': [...],     # ids of units to be scheduled
    #        'done': [...],     # ids of units in trigger state
    #       },
    #     ...
    #   }
    #
    # prepare an initial entry for each ns which ensures that BoT #0 is
    # runnable once it arrives.

    self._lock       = ru.RLock()  # lock on the ns
    self._units      = dict()      # unit registry (we use uids otherwise)
    self._unordered  = list()      # IDs of units which are not ordered
    self._ns         = dict()      # nothing has run, yet

    self._ns_init    = {'current': 0}
    self._order_init = {'size': 0,
                        'uids': list(),
                        'done': list()}
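# Standalone sketch of the namespace bookkeeping described above.  The dict
# layout mirrors the comment; `add_unit`, `unit_done` and the unit ids are
# hypothetical stand-ins, not the scheduler's actual API.
import copy

ns_init    = {'current': 0}
order_init = {'size': 0, 'uids': [], 'done': []}

ns = {}

def add_unit(ns_name, bot, uid, size):
    '''register a unit with its namespace and BoT number'''
    entry = ns.setdefault(ns_name, copy.deepcopy(ns_init))
    order = entry.setdefault(bot, copy.deepcopy(order_init))
    order['size'] = size
    order['uids'].append(uid)

def unit_done(ns_name, bot, uid):
    '''mark a unit done; advance `current` once the BoT is complete'''
    order = ns[ns_name][bot]
    order['done'].append(uid)
    if len(order['done']) == order['size']:
        ns[ns_name]['current'] += 1   # next BoT becomes runnable

add_unit('ns.0', 0, 'unit.0000', 1)
unit_done('ns.0', 0, 'unit.0000')
assert ns['ns.0']['current'] == 1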
def initialize(self):

    self._pwd = os.getcwd()

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    self._cancel_lock   = ru.RLock()
    self._cus_to_cancel = list()
    self._watch_queue   = queue.Queue()

    self._pid = self._cfg['pid']

    self.task_map      = {}
    self.task_map_lock = ru.Lock()

    # we need the LaunchMethod to construct commands
    assert(self._cfg['task_launch_method'] ==
           self._cfg['mpi_launch_method']  == "ORTE_LIB"), \
           "ORTE_LIB spawner only works with ORTE_LIB LaunchMethod."

    self._task_launcher = rp.agent.LaunchMethod.create(
        name="ORTE_LIB", cfg=self._cfg, session=self._session)

    self._orte_initialized = False
    self._cu_environment   = self._populate_cu_environment()

    self.gtod   = "%s/gtod" % self._pwd
    self.tmpdir = tempfile.gettempdir()
def test_locks():
    '''
    Test debug lock wrappers
    '''

    os.environ['RADICAL_DEBUG'] = 'True'

    l  = ru.Lock()
    rl = ru.RLock(name='bar')

    assert(not l.waits)
    assert(not rl.waits)

    with l:
        with rl:
            assert(not l.waits)
            assert(not rl.waits)

    assert(l.name  in ru.debug._debug_helper.locks)            # noqa
    assert(rl.name in ru.debug._debug_helper.rlocks)           # noqa

    ru.debug._debug_helper.unregister_lock(l.name)             # noqa
    ru.debug._debug_helper.unregister_rlock(rl.name)           # noqa

    assert(l.name  not in ru.debug._debug_helper.locks)        # noqa
    assert(rl.name not in ru.debug._debug_helper.rlocks)       # noqa
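# Sketch of the idea behind such debug lock wrappers -- not radical.utils'
# actual implementation.  A named lock registers itself in a global registry
# and counts threads currently blocked in acquisition, which is what the
# `waits` assertions above exercise.
import threading

_registry = {}

class DebugRLock:

    def __init__(self, name):
        self.name  = name
        self.waits = 0                  # threads blocked in acquire()
        self._lock = threading.RLock()
        _registry[name] = self

    def __enter__(self):
        self.waits += 1
        self._lock.acquire()
        self.waits -= 1
        return self

    def __exit__(self, *exc):
        self._lock.release()

with DebugRLock('bar') as rl:
    assert rl.waits == 0   # no other thread is waiting
assert 'bar' in _registry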
def __init__(self, adaptor_info, adaptor_options=None):

    # NOTE: a mutable default argument would be shared (and mutated) across
    #       instances, so we default to `None` and create a fresh list
    self._info    = adaptor_info
    self._opts    = adaptor_options or []
    self._name    = adaptor_info['name']
    self._schemas = adaptor_info['schemas']

    self._lock    = ru.RLock(self._name)
    self._logger  = ru.get_logger('radical.saga.api')

    has_enabled = False
    for option in self._opts:
        if option['name'] == 'enabled':
            has_enabled = True

    if not has_enabled:
        # *every* adaptor needs an 'enabled' option!
        self._opts.append({
            'category'     : self._name,
            'name'         : 'enabled',
            'type'         : bool,
            'default'      : True,
            'valid_options': [True, False],
            'documentation': "Enable / disable loading of the adaptor",
            'env_variable' : None
        })

    ruc.Configurable.__init__(self, 'saga')
    ruc.Configurable.config_options(self, self._name, self._opts)
def _configure(self):

    self._wait_pool = dict()      # dict of unscheduled units
    self._wait_lock = ru.RLock()  # lock on the above dict
    self._pids      = list()
    self._idx       = 0
def initialize(self):

    from .... import pilot as rp

    self._pwd = os.getcwd()

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher(rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    self._cancel_lock   = ru.RLock()
    self._cus_to_cancel = list()
    self._cus_to_watch  = list()
    self._watch_queue   = queue.Queue()

    self._pid = self._cfg['pid']

    # run watcher thread
    self._terminate = threading.Event()
    self._watcher   = threading.Thread(target=self._watch, name="Watcher")
    self._watcher.daemon = True
    self._watcher.start()

    # The AgentExecutingComponent needs the LaunchMethod to construct
    # commands.
    self._task_launcher = rp.agent.LaunchMethod.create(
        name=self._cfg['task_launch_method'],
        cfg=self._cfg,
        session=self._session)

    self._mpi_launcher = rp.agent.LaunchMethod.create(
        name=self._cfg['mpi_launch_method'],
        cfg=self._cfg,
        session=self._session)

    self._cu_environment = self._populate_cu_environment()

    self.gtod   = "%s/gtod" % self._pwd
    self.tmpdir = tempfile.gettempdir()

    # if we need to transplant any original env into the CU, we dig the
    # respective keys from the dump made by bootstrap_0.sh
    self._env_cu_export = dict()
    if self._cfg.get('export_to_cu'):
        with open('env.orig', 'r') as f:
            for line in f.readlines():
                if '=' in line:
                    k, v = line.split('=', 1)
                    key  = k.strip()
                    val  = v.strip()
                    if key in self._cfg['export_to_cu']:
                        self._env_cu_export[key] = val
def _configure(self):

    self._wait_pool = list()      # list of unscheduled units
    self._wait_lock = ru.RLock()  # lock on the above list
    self._pids      = list()
    self._idx       = 0

    self._log.debug('RoundRobin umgr scheduler configured')
def __init__ (self, command, cfg=None, logger=None) :
    """
    The class constructor, which runs (execvpe) command in a separately
    forked process.  The new process will inherit the environment of the
    application process.

    :type  command: string or list of strings
    :param command: The given command is what is run as a child, and
                    fed/drained via pty pipes.  If given as string, command
                    is split into an array of strings, using
                    :func:`shlex.split`.

    :type  logger:  :class:`radical.utils.logger.Logger` instance
    :param logger:  logger stream to send status messages to.
    """

    if cfg: self._cfg = cfg
    else  : self._cfg = ru.Config('radical.saga', 'utils')['pty']

    self.logger = logger
    if not self.logger :
        self.logger = ru.Logger('radical.saga.pty')
    self.logger.debug ("PTYProcess init %s" % self)

    if isinstance (command, basestring) :
        command = shlex.split (command)

    if not isinstance (command, list) :
        raise se.BadParameter ("PTYProcess expects string or list command")

    if len(command) < 1 :
        raise se.BadParameter ("PTYProcess expects non-empty command")

    self.rlock = ru.RLock ("pty process %s" % command)

    self.command = command  # list of strings to run
    self.cache   = ""       # data cache
    self.tail    = ""       # tail of the data cache, for error messages
    self.child   = None     # the process as created by subprocess.Popen
    self.ptyio   = None     # the process' io channel, from pty.fork()

    self.exit_code   = None  # child died with code   (may be revived)
    self.exit_signal = None  # child killed by signal (may be revived)

    self.recover_max      = 3  # TODO: make configure option.  This does not
    self.recover_attempts = 0  # apply for recovers triggered by gc_timeout!

    try :
        self.initialize ()
    except Exception as e :
        raise ptye.translate_exception (e, "pty or process creation failed")
def __init__(self, adaptor_info, adaptor_options=None):

    # NOTE: avoid a shared mutable default argument
    self._info     = adaptor_info
    self._opts     = adaptor_options or []
    self._name     = adaptor_info['name']

    self._lock     = ru.RLock(self._name)
    self._logger   = ru.get_logger('radical.enmd.{0}'.format(self._name))
    self._reporter = ru.LogReporter(
                         name='radical.enmd.{0}'.format(self._name))
def _initialize(self):
    '''
    initialization of component base class goes here
    '''

    # components can always publish logs, state updates and control
    # messages
    self.register_publisher(rpc.LOG_PUBSUB)
    self.register_publisher(rpc.STATE_PUBSUB)
    self.register_publisher(rpc.CONTROL_PUBSUB)

    # set controller callback to handle cancellation requests
    self._cancel_list = list()
    self._cancel_lock = ru.RLock('comp.cancel_lock.%s' % self._uid)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._cancel_monitor_cb)

    # call component level initialize
    self.initialize()

    self._prof.prof('component_init')
def initialize(self):

    # we keep a cache of SAGA dir handles
    self._fs_cache    = dict()
    self._js_cache    = dict()
    self._pilots      = dict()
    self._pilots_lock = ru.RLock()

    self.register_input(rps.UMGR_STAGING_INPUT_PENDING,
                        rpc.UMGR_STAGING_INPUT_QUEUE, self.work)

    # FIXME: this queue is inaccessible, needs routing via mongodb
    self.register_output(rps.AGENT_STAGING_INPUT_PENDING, None)

    # we subscribe to the command channel to learn about pilots being
    # added to this unit manager.
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._base_command_cb)
def _configure(self):

    Continuous._configure(self)

    # a 'bag' entry will look like this:
    #
    #   {'size': 128,    # number of units to expect
    #    'uids': [...]}  # ids of units to be scheduled
    #
    self._lock      = ru.RLock()  # lock on the bags
    self._units     = dict()      # unit registry (we use uids otherwise)
    self._unordered = list()      # IDs of units which are not colocated
    self._bags      = dict()      # nothing has run, yet

    self._bag_init  = {'size': 0,
                       'uids': list()}
def __init__(self, adaptor_info, adaptor_options=None, expand_env=True):

    # FIXME: engine is loading cfg already, here we load again...

    self._info    = adaptor_info
    self._name    = adaptor_info['name']
    self._schemas = adaptor_info['schemas']

    self._lock    = ru.RLock(self._name)
    self._logger  = ru.Logger('radical.saga.api')

    # we need to expand later, once we get the env from the remote resource
    self._cfg = ru.Config(module='radical.saga', name=self._name,
                          expand=expand_env)

    if 'enabled' not in self._cfg:
        self._cfg['enabled'] = True
def initialize(self):

    self._pwd = os.getcwd()

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher(rpc.AGENT_UNSCHEDULE_PUBSUB)

    self._terminate  = mt.Event()
    self._tasks_lock = ru.RLock()
    self._tasks      = list()
    self._delay      = 0.1

    self._watcher = mt.Thread(target=self._timed)
    self._watcher.daemon = True
    self._watcher.start()
def initialize(self):

    self._pwd = os.getcwd()

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    self._cancel_lock   = ru.RLock()
    self._cus_to_cancel = list()
    self._cus_to_watch  = list()
    self._watch_queue   = queue.Queue()

    self._pid = self._cfg['pid']

    # run watcher thread
    self._watcher = mt.Thread(target=self._watch)
  # self._watcher.daemon = True
    self._watcher.start()

    # The AgentExecutingComponent needs the LaunchMethod to construct
    # commands.
    self._task_launcher = rp.agent.LaunchMethod.create(
        name    = self._cfg.get('task_launch_method'),
        cfg     = self._cfg,
        session = self._session)

    self._mpi_launcher = rp.agent.LaunchMethod.create(
        name    = self._cfg.get('mpi_launch_method'),
        cfg     = self._cfg,
        session = self._session)

    self.gtod   = "%s/gtod" % self._pwd
    self.tmpdir = tempfile.gettempdir()
def initialize(self):

    from .... import pilot as rp

    self._pwd = os.getcwd()

    self.register_input(rps.EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher(rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    # Mimic what virtualenv's "deactivate" would do
    self._deactivate = "\n# deactivate pilot virtualenv\n"

    old_path  = os.environ.get('_OLD_VIRTUAL_PATH',       None)
    old_ppath = os.environ.get('_OLD_VIRTUAL_PYTHONPATH', None)
    old_home  = os.environ.get('_OLD_VIRTUAL_PYTHONHOME', None)
    old_ps1   = os.environ.get('_OLD_VIRTUAL_PS1',        None)

    if old_path : self._deactivate += 'export PATH="%s"\n'        % old_path
    if old_ppath: self._deactivate += 'export PYTHONPATH="%s"\n'  % old_ppath
    if old_home : self._deactivate += 'export PYTHON_HOME="%s"\n' % old_home
    if old_ps1  : self._deactivate += 'export PS1="%s"\n'         % old_ps1

    self._deactivate += 'unset VIRTUAL_ENV\n\n'

    # FIXME: we should not alter the environment of the running agent, but
    #        only make sure that the CU finds a pristine env.  That also
    #        holds for the unsetting below -- AM
    if old_path : os.environ['PATH']        = old_path
    if old_ppath: os.environ['PYTHONPATH']  = old_ppath
    if old_home : os.environ['PYTHON_HOME'] = old_home
    if old_ps1  : os.environ['PS1']         = old_ps1

    if 'VIRTUAL_ENV' in os.environ:
        del(os.environ['VIRTUAL_ENV'])

    # simplify shell startup / prompt detection
    os.environ['PS1'] = '$ '

    self._task_launcher = rp.agent.LaunchMethod.create(
        name=self._cfg['task_launch_method'],
        cfg=self._cfg,
        session=self._session)

    self._mpi_launcher = rp.agent.LaunchMethod.create(
        name=self._cfg['mpi_launch_method'],
        cfg=self._cfg,
        session=self._session)

    # TODO: test that this actually works
    # Remove the configured set of environment variables from the
    # environment that we pass to Popen: collect the prefixes once, then
    # scan the environment.
    env_removables = list()
    if self._mpi_launcher : env_removables += self._mpi_launcher.env_removables
    if self._task_launcher: env_removables += self._task_launcher.env_removables

    for e in list(os.environ.keys()):
        for r in env_removables:
            if e.startswith(r):
                os.environ.pop(e, None)

    # if we need to transplant any original env into the CU, we dig the
    # respective keys from the dump made by bootstrap_0.sh
    self._env_cu_export = dict()
    if self._cfg.get('export_to_cu'):
        with open('env.orig', 'r') as f:
            for line in f.readlines():
                if '=' in line:
                    k, v = line.split('=', 1)
                    key  = k.strip()
                    val  = v.strip()
                    if key in self._cfg['export_to_cu']:
                        self._env_cu_export[key] = val

    # the registry keeps track of units to watch, indexed by their shell
    # spawner process ID.  As the registry is shared between the spawner
    # and watcher thread, we use a lock while accessing it.
    self._registry      = dict()
    self._registry_lock = ru.RLock()

    self._cus_to_cancel = list()
    self._cancel_lock   = ru.RLock()

    self._cached_events = list()  # keep monitoring events for pid's which
                                  # are not yet known

    # get some threads going -- those will do all the work.
    import radical.saga.utils.pty_shell as sups

    self.launcher_shell = sups.PTYShell("fork://localhost/")
    self.monitor_shell  = sups.PTYShell("fork://localhost/")

    # run the spawner on the shells
  # tmp = tempfile.gettempdir()
    # Moving back to shared file system again, until it reaches maturity,
    # as this breaks launch methods with a hop, e.g. ssh.
    # FIXME: see #658

    self._pid = self._cfg['pid']
    self._spawner_tmp = "/%s/%s-%s" % (self._pwd, self._pid, self.uid)

    ret, out, _ = self.launcher_shell.run_sync(
        "/bin/sh %s/agent/executing/shell_spawner.sh %s"
        % (os.path.dirname(rp.__file__), self._spawner_tmp))

    if ret != 0:
        raise RuntimeError("launcher bootstrap failed: (%s)(%s)"
                          % (ret, out))

    ret, out, _ = self.monitor_shell.run_sync(
        "/bin/sh %s/agent/executing/shell_spawner.sh %s"
        % (os.path.dirname(rp.__file__), self._spawner_tmp))

    if ret != 0:
        raise RuntimeError("monitor bootstrap failed: (%s)(%s)"
                          % (ret, out))

    # run watcher thread
    self._terminate = threading.Event()
    self._watcher   = threading.Thread(target=self._watch, name="Watcher")
    self._watcher.daemon = True
    self._watcher.start()

    self.gtod = "%s/gtod" % self._pwd
def __init__(self):

    self.logger   = ru.Logger('radical.saga.pty')
    self.registry = {}
    self.rlock    = ru.RLock('pty shell factory')
def __init__(self, pmgr, descr):

    # 'static' members
    self._descr = descr.as_dict()

    # sanity checks on description
    for check in ['resource', 'cores', 'runtime']:
        if not self._descr.get(check):
            raise ValueError("ComputePilotDescription needs '%s'" % check)

    # initialize state
    self._pmgr    = pmgr
    self._session = self._pmgr.session
    self._prof    = self._session._prof
    self._uid     = ru.generate_id('pilot.%(item_counter)04d',
                                   ru.ID_CUSTOM, ns=self._session.uid)
    self._state   = rps.NEW
    self._log     = pmgr._log

    self._pilot_dict = dict()
    self._callbacks  = dict()
    self._cache      = dict()   # cache of SAGA dir handles
    self._cb_lock    = ru.RLock()

    # pilot failures can trigger app termination
    self._exit_on_error = self._descr.get('exit_on_error')

    for m in rpc.PMGR_METRICS:
        self._callbacks[m] = dict()

    # we always invoke the default state cb
    self._callbacks[rpc.PILOT_STATE][self._default_state_cb.__name__] = {
        'cb'     : self._default_state_cb,
        'cb_data': None}

    # `as_dict()` needs `pilot_dict` and other attributes.  Those should
    # all be available at this point (apart from the sandboxes), so we now
    # query for those sandboxes.
    self._pilot_jsurl      = ru.Url()
    self._pilot_jshop      = ru.Url()
    self._resource_sandbox = ru.Url()
    self._session_sandbox  = ru.Url()
    self._pilot_sandbox    = ru.Url()
    self._client_sandbox   = ru.Url()

    pilot = self.as_dict()

    self._pilot_jsurl, self._pilot_jshop \
                           = self._session._get_jsurl           (pilot)
    self._resource_sandbox = self._session._get_resource_sandbox(pilot)
    self._session_sandbox  = self._session._get_session_sandbox (pilot)
    self._pilot_sandbox    = self._session._get_pilot_sandbox   (pilot)
    self._client_sandbox   = self._session._get_client_sandbox()

    # we need to expand placeholders in the sandboxes
    # FIXME: this code is a duplication from the pilot launcher code
    expand = dict()
    for k, v in pilot['description'].items():
        if v is None:
            v = ''
        expand['pd.%s' % k] = v
        if isinstance(v, str):
            expand['pd.%s' % k.upper()] = v.upper()
            expand['pd.%s' % k.lower()] = v.lower()
        else:
            expand['pd.%s' % k.upper()] = v
            expand['pd.%s' % k.lower()] = v

    self._resource_sandbox.path = self._resource_sandbox.path % expand
    self._session_sandbox .path = self._session_sandbox .path % expand
    self._pilot_sandbox   .path = self._pilot_sandbox   .path % expand
def initialize(self):

    from .... import pilot as rp

    self._pwd = os.getcwd()
    self._tmp = self._pwd   # keep temporary files in $PWD for now (slow)

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    # Mimic what virtualenv's "deactivate" would do
    self._deactivate = "\n# deactivate pilot virtualenv\n"

    old_path  = os.environ.get('_OLD_VIRTUAL_PATH',       None)
    old_ppath = os.environ.get('_OLD_VIRTUAL_PYTHONPATH', None)
    old_home  = os.environ.get('_OLD_VIRTUAL_PYTHONHOME', None)
    old_ps1   = os.environ.get('_OLD_VIRTUAL_PS1',        None)

    if old_path : self._deactivate += 'export PATH="%s"\n'        % old_path
    if old_ppath: self._deactivate += 'export PYTHONPATH="%s"\n'  % old_ppath
    if old_home : self._deactivate += 'export PYTHON_HOME="%s"\n' % old_home
    if old_ps1  : self._deactivate += 'export PS1="%s"\n'         % old_ps1

    self._deactivate += 'unset VIRTUAL_ENV\n\n'

    # FIXME: we should not alter the environment of the running agent, but
    #        only make sure that the CU finds a pristine env.  That also
    #        holds for the unsetting below -- AM
    if old_path : os.environ['PATH']        = old_path
    if old_ppath: os.environ['PYTHONPATH']  = old_ppath
    if old_home : os.environ['PYTHON_HOME'] = old_home
    if old_ps1  : os.environ['PS1']         = old_ps1

    if 'VIRTUAL_ENV' in os.environ:
        del(os.environ['VIRTUAL_ENV'])

    self._task_launcher = None
    self._mpi_launcher  = None

    try:
        self._task_launcher = rp.agent.LaunchMethod.create(
            name    = self._cfg['task_launch_method'],
            cfg     = self._cfg,
            session = self._session)
    except Exception:
        self._log.warn('no task launcher found')

    try:
        self._mpi_launcher = rp.agent.LaunchMethod.create(
            name    = self._cfg['mpi_launch_method'],
            cfg     = self._cfg,
            session = self._session)
    except Exception:
        self._log.warn('no mpi launcher found')

    # TODO: test that this actually works
    # Remove the configured set of environment variables from the
    # environment that we pass to Popen: collect the prefixes once, then
    # scan the environment.
    env_removables = list()
    if self._mpi_launcher : env_removables += self._mpi_launcher.env_removables
    if self._task_launcher: env_removables += self._task_launcher.env_removables

    for e in list(os.environ.keys()):
        for r in env_removables:
            if e.startswith(r):
                os.environ.pop(e, None)

    # if we need to transplant any original env into the CU, we dig the
    # respective keys from the dump made by bootstrap_0.sh
    self._env_cu_export = dict()
    if self._cfg.get('export_to_cu'):
        with open('env.orig', 'r') as f:
            for line in f.readlines():
                if '=' in line:
                    k, v = line.split('=', 1)
                    key  = k.strip()
                    val  = v.strip()
                    if key in self._cfg['export_to_cu']:
                        self._env_cu_export[key] = val

    # the registry keeps track of units to watch
    self._registry      = dict()
    self._registry_lock = ru.RLock()

    self._to_cancel     = list()
    self._cancel_lock   = ru.RLock()

    self._cached_events = list()  # keep monitoring events for pid's which
                                  # are not yet known

    self.gtod = "%s/gtod" % self._pwd

    # create line buffered fifo's to communicate with the shell executor
    self._fifo_cmd_name = "%s/%s.cmd.pipe" % (self._tmp, self._uid)
    self._fifo_inf_name = "%s/%s.inf.pipe" % (self._tmp, self._uid)

    os.mkfifo(self._fifo_cmd_name)
    os.mkfifo(self._fifo_inf_name)

    self._fifo_cmd = open(self._fifo_cmd_name, 'w+', 1)
    self._fifo_inf = open(self._fifo_inf_name, 'r+', 1)

    # run a thread to watch the info fifo
    self._terminate = threading.Event()
    self._watcher   = threading.Thread(target=self._watch, name="Watcher")
    self._watcher.daemon = True
    self._watcher.start()

    # start the shell executor
    sh_exe = "%s/shell_spawner_fs.sh" % os.path.dirname(__file__)
    sh_cmd = "%s %s %s %s" % (sh_exe, self._pwd, self._tmp, self.uid)
    #         script base  work  sid

    self._log.debug('start shell executor [%s]', sh_cmd)
    self._sh = sp.Popen(sh_cmd, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
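# Minimal standalone sketch of the line-buffered FIFO handshake used above:
# one process writes commands into a command pipe, a child shell reports
# back through an info pipe.  The pipe names, the stand-in 'executor' and
# the echoed reply are illustrative assumptions; FIFO open/blocking
# semantics as sketched here are Linux-specific.
import os
import subprocess as sp
import tempfile

tmp      = tempfile.mkdtemp()
cmd_name = os.path.join(tmp, 'cmd.pipe')
inf_name = os.path.join(tmp, 'inf.pipe')

os.mkfifo(cmd_name)
os.mkfifo(inf_name)

# a stand-in 'executor': read one command line, report it back
child = sp.Popen(['/bin/sh', '-c',
                  'read line < %s; echo "got: $line" > %s'
                  % (cmd_name, inf_name)])

# opening read-write ('w+'/'r+') avoids blocking until the peer connects
fifo_cmd = open(cmd_name, 'w+', 1)   # line buffered
fifo_inf = open(inf_name, 'r+', 1)

fifo_cmd.write('RUN sleep 1\n')
print(fifo_inf.readline().strip())   # -> got: RUN sleep 1

child.wait()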
def __init__(self):

    self.logger   = rul.getLogger('saga', 'PTYShellFactory')
    self.registry = {}
    self.rlock    = ru.RLock('pty shell factory')
def __init__(self, campaign, resources, objective=None, planner='random',
             sid=None):

    self._campaign = {'campaign': campaign,
                      'state'   : st.NEW}

    if sid:
        self._sid = sid
    else:
        self._sid = ru.generate_id('rcm.session', mode=ru.ID_PRIVATE)

    self._uid = ru.generate_id('bookkeper.%(counter)04d',
                               mode=ru.ID_CUSTOM, ns=self._sid)

    self._resources         = resources
    self._checkpoints       = None
    self._plan              = None
    self._objective         = objective
    self._unavail_resources = []
    self._workflows_state   = dict()

    self._exec_state_lock = ru.RLock('workflows_state_lock')
    self._monitor_lock    = ru.RLock('monitor_list_lock')

    self._time = 0   # the time in the campaign's world

    self._workflows_to_monitor = list()
    self._est_end_times        = dict()
    self._env                  = Environment()

    self._enactor = SimulatedEnactor(env=self._env, sid=self._sid)
    self._enactor.register_state_cb(self.state_update_cb)

    # Create threads to execute the monitoring and work methods.
    # One flag for both threads may be enough to monitor and check.
    self._terminate_event   = mt.Event()  # thread event to terminate
    self._work_thread       = None        # will hold the work thread
    self._monitoring_thread = None        # will hold the monitoring thread
    self._cont              = False
    self._hold              = False

    path = os.getcwd() + '/' + self._sid
    self._logger = ru.Logger(name=self._uid, path=path, level='DEBUG')
    self._prof   = ru.Profiler(name=self._uid, path=path)

    num_oper = [workflow['num_oper']
                for workflow in self._campaign['campaign']]

    if planner.lower() == 'random':
        self._planner = RandomPlanner(campaign=self._campaign['campaign'],
                                      resources=self._resources,
                                      num_oper=num_oper, sid=self._sid)
    elif planner.lower() == 'heft':
        self._planner = HeftPlanner(campaign=self._campaign['campaign'],
                                    resources=self._resources,
                                    num_oper=num_oper, sid=self._sid)
    else:
        self._logger.warning('Planner %s is not implemented. Falling back '
                             'to a random planner', planner)
        self._planner = RandomPlanner(campaign=self._campaign['campaign'],
                                      resources=self._resources,
                                      num_oper=num_oper, sid=self._sid)
def initialize(self):

    # register unit input channels
    self.register_input(rps.AGENT_SCHEDULING_PENDING,
                        rpc.AGENT_SCHEDULING_QUEUE, self._schedule_units)

    # register unit output channels
    self.register_output(rps.AGENT_EXECUTING_PENDING,
                         rpc.AGENT_EXECUTING_QUEUE)

    # we need unschedule updates to learn about units for which to free
    # the allocated cores.  Those updates MUST be issued after execution,
    # ie. by the AgentExecutionComponent.
    self.register_subscriber(rpc.AGENT_UNSCHEDULE_PUBSUB,
                             self.unschedule_cb)

    # we don't want the unschedule above to compete with actual scheduling
    # attempts, so we move the re-scheduling of units from the wait pool
    # into a separate thread (ie. register a separate callback).  This is
    # triggered by the unschedule_cb.
    #
    # NOTE: we could use a local queue here.  Using a zmq bridge goes
    #       toward a distributed scheduler, and is also easier to
    #       implement right now, since `Component` provides the right
    #       mechanisms...
    self.register_publisher(rpc.AGENT_SCHEDULE_PUBSUB)
    self.register_subscriber(rpc.AGENT_SCHEDULE_PUBSUB, self.schedule_cb)

    # The scheduler needs the ResourceManager information which has been
    # collected during agent startup.  We dig it out of the config at
    # this point.
    #
    # NOTE: this information is insufficient for the torus scheduler!
    self._pid                 = self._cfg['pid']
    self._rm_info             = self._cfg['rm_info']
    self._rm_lm_info          = self._cfg['rm_info']['lm_info']
    self._rm_node_list        = self._cfg['rm_info']['node_list']
    self._rm_sockets_per_node = self._cfg['rm_info']['sockets_per_node']
    self._rm_cores_per_socket = self._cfg['rm_info']['cores_per_socket']
    self._rm_gpus_per_socket  = self._cfg['rm_info']['gpus_per_socket']
    self._rm_lfs_per_node     = self._cfg['rm_info']['lfs_per_node']

    if not self._rm_node_list:
        raise RuntimeError(
            "ResourceManager %s didn't _configure node_list."
            % self._rm_info['name'])

    if self._rm_cores_per_socket is None:
        raise RuntimeError(
            "ResourceManager %s didn't _configure cores_per_socket."
            % self._rm_info['name'])

    if self._rm_sockets_per_node is None:
        raise RuntimeError(
            "ResourceManager %s didn't _configure sockets_per_node."
            % self._rm_info['name'])

    if self._rm_gpus_per_socket is None:
        raise RuntimeError(
            "ResourceManager %s didn't _configure gpus_per_socket."
            % self._rm_info['name'])

    # create and initialize the wait pool
    self._wait_pool = list()      # pool of waiting units
    self._wait_lock = ru.RLock()  # lock on the above pool
    self._slot_lock = ru.RLock()  # lock slot allocation/deallocation

    # configure the scheduler instance
    self._configure()
    self._log.debug("slot status after init: %s", self.slot_status())
def __init__(self, cfg, session):
    '''
    This constructor MUST be called by inheriting classes, as it specifies
    the operation mode of the component: components can spawn a child
    process, or not.

    If a child will be spawned later, then the child process state can be
    initialized by overloading the `initialize()` method.

    Note that this policy should be strictly followed by all derived
    classes, as we will otherwise carry state over the process fork.  That
    can become nasty if the state included any form of locking (like, for
    profiling or logging).

    The symmetric teardown method is called `finalize()`.

    Constructors of inheriting components *may* call start() in their
    constructor.
    '''

    # NOTE: a fork will not duplicate any threads of the parent process --
    #       but it will duplicate any locks which are shared between the
    #       parent process and its threads -- and those locks might be in
    #       any state at this point.  As such, each child has to make sure
    #       to never, ever, use any of the inherited locks, but instead to
    #       create its own set of locks in self.initialize.

    self._cfg     = cfg
    self._uid     = cfg.uid
    self._session = session

    # we always need an UID
    assert(self._uid), 'Component needs a uid (%s)' % type(self)

    # state we carry over the fork
    self._debug  = cfg.get('debug')
    self._owner  = cfg.get('owner', self.uid)
    self._ctype  = "%s.%s" % (self.__class__.__module__,
                              self.__class__.__name__)
    self._number = cfg.get('number', 0)
    self._name   = cfg.get('name.%s' % self._number,
                           '%s.%s'   % (self._ctype, self._number))

    self._bridges    = list()  # communication bridges
    self._components = list()  # sub-components
    self._inputs     = dict()  # queues to get things from
    self._outputs    = dict()  # queues to send things to
    self._workers    = dict()  # methods to work on things
    self._publishers = dict()  # channels to send notifications to
    self._threads    = dict()  # subscriber and idler threads

    # guard threaded callback invocations
    self._cb_lock = ru.RLock('comp.cb_lock.%s' % self._name)

    self._subscribers = dict()  # ZMQ subscriber classes

    if self._owner == self.uid:
        self._owner = 'root'

    self._prof = self._session._get_profiler(name=self.uid)
    self._rep  = self._session._get_reporter(name=self.uid)
    self._log  = self._session._get_logger  (name=self.uid,
                                             level=self._debug)

  # self._prof.register_timing(name='component_lifetime',
  #                            scope='uid=%s' % self.uid,
  #                            start='component_start',
  #                            stop='component_stop')
  # self._prof.register_timing(name='entity_runtime',
  #                            scope='entity',
  #                            start='get',
  #                            stop=['put', 'drop'])

    self._prof.prof('init1', uid=self._uid, msg=self._prof.path)

    self._q      = None
    self._in     = None
    self._out    = None
    self._poll   = None
    self._ctx    = None
    self._thread = None
    self._term   = mt.Event()
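# Standalone sketch (hypothetical names, not the component's actual API) of
# the fork-and-locks policy the NOTE above warns about: the constructor
# holds no locks, and the child creates its own, pristine lock inside its
# post-fork initializer instead of reusing one inherited from the parent.
import multiprocessing as mp
import threading

class Component:

    def __init__(self):
        # pre-fork state only: no locks, no threads
        self._lock = None

    def initialize(self):
        # post-fork: the child creates its own lock
        self._lock = threading.RLock()

    def _child(self):
        self.initialize()
        with self._lock:
            print('child works with its own lock')

    def start(self):
        proc = mp.Process(target=self._child)
        proc.start()
        proc.join()

if __name__ == '__main__':
    Component().start()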
def __init__(self, dburl=None, uid=None, cfg=None, _primary=True):
    '''
    Creates a new session.  A new Session instance is created and stored
    in the database.

    **Arguments:**
        * **dburl** (`string`): The MongoDB URL.  If none is given, RP
          uses the environment variable RADICAL_PILOT_DBURL.  If that is
          not set, an error will be raised.

        * **cfg** (`str` or `dict`): a named or instantiated configuration
          to be used for the session.

        * **uid** (`string`): Create a session with this UID.  Session
          UIDs MUST be unique - otherwise they will lead to conflicts in
          the underlying database, resulting in undefined behaviours (or
          worse).

        * **_primary** (`bool`): only sessions created by the original
          application process (via `rp.Session()`) will connect to the DB.
          Secondary session instances are instantiated internally in
          processes spawned (directly or indirectly) by the initial
          session, for example in some of its components.  A secondary
          session will inherit the original session ID, but will not
          attempt to create a new DB collection - if such a DB connection
          is needed, the component needs to establish that on its own.
    '''

    # NOTE: `name` and `cfg` are overloaded: the user cannot point to
    #       a predefined config and amend it at the same time.  This might
    #       be ok for the session, but introduces a minor API
    #       inconsistency.
    name = 'default'
    if isinstance(cfg, str):
        name = cfg
        cfg  = None

    self._dbs     = None
    self._closed  = False
    self._primary = _primary

    self._pmgrs = dict()  # map IDs to pmgr instances
    self._umgrs = dict()  # map IDs to umgr instances
    self._cmgr  = None    # only primary sessions have a cmgr

    self._cfg   = ru.Config('radical.pilot.session',  name=name, cfg=cfg)
    self._rcfgs = ru.Config('radical.pilot.resource', name='*')

    if _primary:

        pwd = os.getcwd()

        if not self._cfg.sid:
            if uid:
                self._cfg.sid = uid
            else:
                self._cfg.sid = ru.generate_id('rp.session',
                                               mode=ru.ID_PRIVATE)
        if not self._cfg.base:
            self._cfg.base = pwd

        if not self._cfg.path:
            self._cfg.path = '%s/%s' % (self._cfg.base, self._cfg.sid)

        if not self._cfg.client_sandbox:
            self._cfg.client_sandbox = pwd

    else:
        for k in ['sid', 'base', 'path']:
            assert(k in self._cfg), 'non-primary session misses %s' % k

    # change RU defaults to point logfiles etc. to the session sandbox
    def_cfg             = ru.DefaultConfig()
    def_cfg.log_dir     = self._cfg.path
    def_cfg.report_dir  = self._cfg.path
    def_cfg.profile_dir = self._cfg.path

    self._uid = self._cfg.sid

    self._prof = self._get_profiler(name=self._uid)
    self._rep  = self._get_reporter(name=self._uid)
    self._log  = self._get_logger  (name=self._uid,
                                    level=self._cfg.get('debug'))

    from . import version_detail as rp_version_detail
    self._log.info('radical.pilot version: %s' % rp_version_detail)
    self._log.info('radical.saga version: %s'  % rs.version_detail)
    self._log.info('radical.utils version: %s' % ru.version_detail)

    self._prof.prof('session_start', uid=self._uid, msg=int(_primary))

    # now we have config and uid - initialize base class (saga session)
    rs.Session.__init__(self, uid=self._uid)

    # cache sandboxes etc.
    self._cache_lock = ru.RLock()
    self._cache      = {'resource_sandbox': dict(),
                        'session_sandbox' : dict(),
                        'pilot_sandbox'   : dict(),
                        'client_sandbox'  : self._cfg.client_sandbox,
                        'js_shells'       : dict(),
                        'fs_dirs'         : dict()}

    if _primary:
        self._initialize_primary(dburl)

    # at this point we have a DB connection, logger, etc., and are done
    self._prof.prof('session_ok', uid=self._uid, msg=int(_primary))
def __init__(self, session, cfg='default'):
    '''
    Creates a new PilotManager and attaches it to the session.

    **Arguments:**
        * session [:class:`rp.Session`]:
          The session instance to use.

        * cfg (`dict` or `string`):
          The configuration or name of configuration to use.

    **Returns:**
        * A new `PilotManager` object [:class:`rp.PilotManager`].
    '''

    assert(session.primary), 'pmgr needs primary session'

    self._pilots      = dict()
    self._pilots_lock = ru.RLock('pmgr.pilots_lock')
    self._callbacks   = dict()
    self._pcb_lock    = ru.RLock('pmgr.pcb_lock')
    self._terminate   = mt.Event()
    self._closed      = False
    self._rec_id      = 0  # used for session recording

    self._uid = ru.generate_id('pmgr.%(item_counter)04d',
                               ru.ID_CUSTOM, ns=session.uid)

    for m in rpc.PMGR_METRICS:
        self._callbacks[m] = dict()

    # NOTE: `name` and `cfg` are overloaded: the user cannot point to
    #       a predefined config and amend it at the same time.  This might
    #       be ok for the session, but introduces a minor API
    #       inconsistency.
    #
    name = None
    if isinstance(cfg, str):
        name = cfg
        cfg  = None

    cfg           = ru.Config('radical.pilot.pmgr', name=name, cfg=cfg)
    cfg.uid       = self._uid
    cfg.owner     = self._uid
    cfg.sid       = session.uid
    cfg.base      = session.base
    cfg.path      = session.path
    cfg.dburl     = session.dburl
    cfg.heartbeat = session.cfg.heartbeat

    rpu.Component.__init__(self, cfg, session=session)
    self.start()

    self._log.info('started pmgr %s', self._uid)
    self._rep.info('<<create pilot manager')

    # create pmgr bridges and components, use session cmgr for that
    self._cmgr = rpu.ComponentManager(self._cfg)
    self._cmgr.start_bridges()
    self._cmgr.start_components()

    # The output queue is used to forward submitted pilots to the
    # launching component.
    self.register_output(rps.PMGR_LAUNCHING_PENDING,
                         rpc.PMGR_LAUNCHING_QUEUE)

    # we also listen on the control pubsub, to learn about completed
    # staging directives
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._staging_ack_cb)
    self._active_sds = dict()
    self._sds_lock   = ru.Lock('pmgr_sds_lock')

    # register the state notification pull cb and hb pull cb
    # FIXME: we may want to have the frequency configurable
    # FIXME: this should be a tailing cursor in the update worker
    self.register_timed_cb(self._state_pull_cb,
                           timer=self._cfg['db_poll_sleeptime'])
    self.register_timed_cb(self._pilot_heartbeat_cb,
                           timer=self._cfg['db_poll_sleeptime'])

    # also listen to the state pubsub for pilot state changes
    self.register_subscriber(rpc.STATE_PUBSUB, self._state_sub_cb)

    # let session know we exist
    self._session._register_pmgr(self)

    self._prof.prof('setup_done', uid=self._uid)
    self._rep.ok('>>ok\n')
def __init__(self):

    self.logger   = ru.Logger('radical.saga.pty')
    self.registry = {}
    self.rlock    = ru.RLock('pty shell factory')
    self.cfg      = ru.Config('radical.saga', 'utils')['pty']
def __init__(self, session, cfg='default', scheduler=None):
    """
    Creates a new UnitManager and attaches it to the session.

    **Arguments:**
        * session [:class:`radical.pilot.Session`]:
          The session instance to use.

        * cfg (`dict` or `string`):
          The configuration or name of configuration to use.

        * scheduler (`string`):
          The name of the scheduler plug-in to use.

    **Returns:**
        * A new `UnitManager` object [:class:`radical.pilot.UnitManager`].
    """

    self._pilots      = dict()
    self._pilots_lock = ru.RLock('umgr.pilots_lock')
    self._units       = dict()
    self._units_lock  = ru.RLock('umgr.units_lock')
    self._callbacks   = dict()
    self._cb_lock     = ru.RLock('umgr.cb_lock')
    self._terminate   = mt.Event()
    self._closed      = False
    self._rec_id      = 0  # used for session recording

    self._uid = ru.generate_id('umgr.%(item_counter)04d',
                               ru.ID_CUSTOM, ns=session.uid)

    for m in rpc.UMGR_METRICS:
        self._callbacks[m] = dict()

    # NOTE: `name` and `cfg` are overloaded: the user cannot point to
    #       a predefined config and amend it at the same time.  This might
    #       be ok for the session, but introduces a minor API
    #       inconsistency.
    #
    name = None
    if isinstance(cfg, str):
        name = cfg
        cfg  = None

    cfg           = ru.Config('radical.pilot.umgr', name=name, cfg=cfg)
    cfg.uid       = self._uid
    cfg.owner     = self._uid
    cfg.sid       = session.uid
    cfg.base      = session.base
    cfg.path      = session.path
    cfg.dburl     = session.dburl
    cfg.heartbeat = session.cfg.heartbeat

    if scheduler:
        # overwrite the scheduler from the config file
        cfg.scheduler = scheduler

    rpu.Component.__init__(self, cfg, session=session)
    self.start()

    self._log.info('started umgr %s', self._uid)
    self._rep.info('<<create unit manager')

    # create umgr bridges and components, use session cmgr for that
    self._cmgr = rpu.ComponentManager(self._cfg)
    self._cmgr.start_bridges()
    self._cmgr.start_components()

    # The output queue is used to forward submitted units to the
    # scheduling component.
    self.register_output(rps.UMGR_SCHEDULING_PENDING,
                         rpc.UMGR_SCHEDULING_QUEUE)

    # the umgr will also collect units from the agent again, for output
    # staging and finalization
    if self._cfg.bridges.umgr_staging_output_queue:
        self._has_sout = True
        self.register_output(rps.UMGR_STAGING_OUTPUT_PENDING,
                             rpc.UMGR_STAGING_OUTPUT_QUEUE)
    else:
        self._has_sout = False

    # register the state notification pull cb
    # FIXME: this should be a tailing cursor in the update worker
    self.register_timed_cb(self._state_pull_cb,
                           timer=self._cfg['db_poll_sleeptime'])

    # register callback which pulls units back from agent
    # FIXME: this should be a tailing cursor in the update worker
    self.register_timed_cb(self._unit_pull_cb,
                           timer=self._cfg['db_poll_sleeptime'])

    # also listen to the state pubsub for unit state changes
    self.register_subscriber(rpc.STATE_PUBSUB, self._state_sub_cb)

    # let session know we exist
    self._session._register_umgr(self)

    self._prof.prof('setup_done', uid=self._uid)
    self._rep.ok('>>ok\n')
def initialize(self):

    self._pwd = os.getcwd()
    self.gtod = "%s/gtod" % self._pwd

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    addr_wrk = self._cfg['bridges']['funcs_req_queue']
    addr_res = self._cfg['bridges']['funcs_res_queue']

    self._log.debug('wrk in  addr: %s', addr_wrk['addr_in' ])
    self._log.debug('res out addr: %s', addr_res['addr_out'])

    self._funcs_req = rpu.Queue(self._session, 'funcs_req_queue',
                                rpu.QUEUE_INPUT, self._cfg,
                                addr_wrk['addr_in'])
    self._funcs_res = rpu.Queue(self._session, 'funcs_res_queue',
                                rpu.QUEUE_OUTPUT, self._cfg,
                                addr_res['addr_out'])

    self._cancel_lock   = ru.RLock()
    self._cus_to_cancel = list()
    self._cus_to_watch  = list()
    self._watch_queue   = queue.Queue()

    self._pid = self._cfg['pid']

    # run collector thread
    self._collector = mt.Thread(target=self._collect)
    self._collector.daemon = True
    self._collector.start()

    # we need to launch the executors on all nodes, and use the
    # agent_launcher for that
    self._launcher = rp.agent.LaunchMethod.create(
        name    = self._cfg.get('agent_launch_method'),
        cfg     = self._cfg,
        session = self._session)

    # now run the func launcher on all nodes
    ve  = os.environ.get('VIRTUAL_ENV', '')
    exe = ru.which('radical-pilot-agent-funcs')
    if not exe:
        exe = '%s/rp_install/bin/radical-pilot-agent-funcs' % self._pwd

    for idx, node in enumerate(self._cfg['rm_info']['node_list']):

        uid   = 'func_exec.%04d' % idx
        pwd   = '%s/%s' % (self._pwd, uid)
        funcs = {'uid'        : uid,
                 'description': {'executable'   : exe,
                                 'arguments'    : [pwd, ve],
                                 'cpu_processes': 1,
                                 'environment'  : []},
                 'slots'      : {'nodes': [{'name' : node[0],
                                            'uid'  : node[1],
                                            'cores': [[0]],
                                            'gpus' : []}]},
                 'cfg'        : {'addr_wrk': addr_wrk['addr_out'],
                                 'addr_res': addr_res['addr_in' ]}}

        self._spawn(self._launcher, funcs)