def initialize(self):

    self._umgr = self._cfg.owner

    self._early        = dict()      # early-bound units, pid-sorted
    self._pilots       = dict()      # dict of known pilots
    self._pilots_lock  = ru.RLock()  # lock on the above dict
    self._units        = dict()      # dict of scheduled unit IDs
    self._units_lock   = ru.RLock()  # lock on the above dict
    self._waiting      = dict()      # dict for units waiting on deps
    self._waiting_lock = ru.RLock()  # lock on the above dict

    # configure the scheduler instance
    self._configure()

    self.register_input(rps.UMGR_SCHEDULING_PENDING,
                        rpc.UMGR_SCHEDULING_QUEUE, self.work)

    self.register_output(rps.UMGR_STAGING_INPUT_PENDING,
                         rpc.UMGR_STAGING_INPUT_QUEUE)

    # Some schedulers care about states (of pilots and/or units), some
    # don't.  Either way, we subscribe here to state updates.
    self.register_subscriber(rpc.STATE_PUBSUB, self._base_state_cb)

    # Schedulers use the command channel to get information about pilots
    # being added or removed.
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._base_command_cb)

    # cache the local client sandbox to avoid repeated os calls
    self._client_sandbox = os.getcwd()
def __init__(self, env=None, sid=None):

    super(SimulatedEnactor, self).__init__(sid=sid)

    # List of all workflows that are executing and need to be monitored.
    # This list is shared between threads and requires a lock.
    self._to_monitor = list()

    self._prof.prof('enactor_setup', uid=self._uid)

    # locks to provide atomicity for the monitoring data structures
    self._monitoring_lock = ru.RLock('cm.monitor_lock')
    self._cb_lock         = ru.RLock('enactor.cb_lock')
    self._callbacks       = dict()

    # thread which executes the monitoring method, plus its
    # termination event
    self._monitoring_thread = None       # will hold the thread
    self._terminate_monitor = mt.Event() # thread event to terminate

    self._sim_env = env
    self._run     = False

    # simulation thread and its termination event
    self._terminate_simulation = mt.Event()
    self._simulation_thread    = mt.Thread(target=self._sim_run,
                                           name='sim-thread')
    self._simulation_thread.start()

    self._prof.prof('enactor_started', uid=self._uid)
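# Minimal standalone sketch of the event-driven worker-thread pattern used
# above: a monitoring thread polls until a `threading.Event` asks it to
# terminate.  Names here are illustrative, not the enactor's actual API.
import threading as mt
import time

terminate = mt.Event()

def monitor():
    while not terminate.is_set():
        # ... check monitored workflows here ...
        time.sleep(0.1)

t = mt.Thread(target=monitor, name='monitor-thread')
t.start()

terminate.set()   # request shutdown
t.join()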
def _configure(self):

    Continuous._configure(self)

    # This scheduler waits for state updates and considers a unit `done`
    # once it reaches a trigger state.  When a state update shows that a
    # unit reached that state, it is marked as 'done' in the respective
    # order of its namespace.
    #
    self._trigger_state = rps.UMGR_STAGING_OUTPUT_PENDING
    self.register_subscriber(rpc.STATE_PUBSUB, self._state_cb)

    # a namespace entry will look like this:
    #
    #   { 'current' : 0,        # BoT currently operated on - starts with 0
    #     0 :                   # sequential BoT numbering `n`
    #       {'size': 128,       # number of units to expect        `max`
    #        'uids': [...],     # ids of units to be scheduled
    #        'done': [...],     # ids of units in trigger state
    #       },
    #     ...
    #   }
    #
    # prepare an initial entry for each ns which ensures that BoT #0 is
    # runnable once it arrives.

    self._lock       = ru.RLock()  # lock on the ns
    self._units      = dict()      # unit registry (we use uids otherwise)
    self._unordered  = list()      # IDs of units which are not ordered
    self._ns         = dict()      # nothing has run, yet

    self._ns_init    = {'current': 0}
    self._order_init = {'size': 0,
                        'uids': list(),
                        'done': list()}
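# Standalone sketch of the namespace bookkeeping described above.  The dict
# layout mirrors the comment; `add_unit`, `unit_done` and the unit ids are
# hypothetical stand-ins, not the scheduler's actual API.
import copy

ns_init    = {'current': 0}
order_init = {'size': 0, 'uids': [], 'done': []}

ns = {}

def add_unit(ns_name, bot, uid, size):
    '''register a unit with its namespace and BoT number'''
    entry = ns.setdefault(ns_name, copy.deepcopy(ns_init))
    order = entry.setdefault(bot, copy.deepcopy(order_init))
    order['size'] = size
    order['uids'].append(uid)

def unit_done(ns_name, bot, uid):
    '''mark a unit done; advance `current` once the BoT is complete'''
    order = ns[ns_name][bot]
    order['done'].append(uid)
    if len(order['done']) == order['size']:
        ns[ns_name]['current'] += 1   # next BoT becomes runnable

add_unit('ns.0', 0, 'unit.0000', 1)
unit_done('ns.0', 0, 'unit.0000')
assert ns['ns.0']['current'] == 1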
def initialize(self):

    self._pwd = os.getcwd()

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    self._cancel_lock   = ru.RLock()
    self._cus_to_cancel = list()
    self._watch_queue   = queue.Queue()

    self._pid = self._cfg['pid']

    self.task_map      = {}
    self.task_map_lock = ru.Lock()

    # we need the LaunchMethod to construct commands
    assert(self._cfg['task_launch_method'] ==
           self._cfg['mpi_launch_method']  == "ORTE_LIB"), \
           "ORTE_LIB spawner only works with ORTE_LIB LaunchMethod."

    self._task_launcher = rp.agent.LaunchMethod.create(
        name="ORTE_LIB", cfg=self._cfg, session=self._session)

    self._orte_initialized = False
    self._cu_environment   = self._populate_cu_environment()

    self.gtod   = "%s/gtod" % self._pwd
    self.tmpdir = tempfile.gettempdir()
def test_locks():
    '''
    Test debug lock wrappers
    '''

    os.environ['RADICAL_DEBUG'] = 'True'

    l  = ru.Lock()
    rl = ru.RLock(name='bar')

    assert(not l.waits)
    assert(not rl.waits)

    with l:
        with rl:
            assert(not l.waits)
            assert(not rl.waits)

    assert(l.name  in ru.debug._debug_helper.locks)            # noqa
    assert(rl.name in ru.debug._debug_helper.rlocks)           # noqa

    ru.debug._debug_helper.unregister_lock(l.name)             # noqa
    ru.debug._debug_helper.unregister_rlock(rl.name)           # noqa

    assert(l.name  not in ru.debug._debug_helper.locks)        # noqa
    assert(rl.name not in ru.debug._debug_helper.rlocks)       # noqa
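# Sketch of the idea behind such debug lock wrappers -- not radical.utils'
# actual implementation.  A named lock registers itself in a global registry
# and counts threads currently blocked in acquisition, which is what the
# `waits` assertions above exercise.
import threading

_registry = {}

class DebugRLock:

    def __init__(self, name):
        self.name  = name
        self.waits = 0                  # threads blocked in acquire()
        self._lock = threading.RLock()
        _registry[name] = self

    def __enter__(self):
        self.waits += 1
        self._lock.acquire()
        self.waits -= 1
        return self

    def __exit__(self, *exc):
        self._lock.release()

with DebugRLock('bar') as rl:
    assert rl.waits == 0   # no other thread is waiting
assert 'bar' in _registry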
def __init__(self, adaptor_info, adaptor_options=None):

    # NOTE: a mutable default argument would be shared (and mutated) across
    #       instances, so we default to `None` and create a fresh list
    self._info    = adaptor_info
    self._opts    = adaptor_options or []
    self._name    = adaptor_info['name']
    self._schemas = adaptor_info['schemas']

    self._lock    = ru.RLock(self._name)
    self._logger  = ru.get_logger('radical.saga.api')

    has_enabled = False
    for option in self._opts:
        if option['name'] == 'enabled':
            has_enabled = True

    if not has_enabled:
        # *every* adaptor needs an 'enabled' option!
        self._opts.append({
            'category'     : self._name,
            'name'         : 'enabled',
            'type'         : bool,
            'default'      : True,
            'valid_options': [True, False],
            'documentation': "Enable / disable loading of the adaptor",
            'env_variable' : None
        })

    ruc.Configurable.__init__(self, 'saga')
    ruc.Configurable.config_options(self, self._name, self._opts)
def _configure(self):

    self._wait_pool = dict()      # dict of unscheduled units
    self._wait_lock = ru.RLock()  # lock on the above dict
    self._pids      = list()
    self._idx       = 0
def initialize(self):

    from .... import pilot as rp

    self._pwd = os.getcwd()

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher(rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    self._cancel_lock   = ru.RLock()
    self._cus_to_cancel = list()
    self._cus_to_watch  = list()
    self._watch_queue   = queue.Queue()

    self._pid = self._cfg['pid']

    # run watcher thread
    self._terminate = threading.Event()
    self._watcher   = threading.Thread(target=self._watch, name="Watcher")
    self._watcher.daemon = True
    self._watcher.start()

    # The AgentExecutingComponent needs the LaunchMethod to construct
    # commands.
    self._task_launcher = rp.agent.LaunchMethod.create(
        name=self._cfg['task_launch_method'],
        cfg=self._cfg,
        session=self._session)

    self._mpi_launcher = rp.agent.LaunchMethod.create(
        name=self._cfg['mpi_launch_method'],
        cfg=self._cfg,
        session=self._session)

    self._cu_environment = self._populate_cu_environment()

    self.gtod   = "%s/gtod" % self._pwd
    self.tmpdir = tempfile.gettempdir()

    # if we need to transplant any original env into the CU, we dig the
    # respective keys from the dump made by bootstrap_0.sh
    self._env_cu_export = dict()
    if self._cfg.get('export_to_cu'):
        with open('env.orig', 'r') as f:
            for line in f.readlines():
                if '=' in line:
                    k, v = line.split('=', 1)
                    key  = k.strip()
                    val  = v.strip()
                    if key in self._cfg['export_to_cu']:
                        self._env_cu_export[key] = val
def _configure(self):

    self._wait_pool = list()      # list of unscheduled units
    self._wait_lock = ru.RLock()  # lock on the above list
    self._pids      = list()
    self._idx       = 0

    self._log.debug('RoundRobin umgr scheduler configured')
def __init__ (self, command, cfg=None, logger=None) :
    """
    The class constructor, which runs (execvpe) command in a separately
    forked process.  The new process will inherit the environment of the
    application process.

    :type  command: string or list of strings
    :param command: The given command is what is run as a child, and
                    fed/drained via pty pipes.  If given as string, command
                    is split into an array of strings, using
                    :func:`shlex.split`.

    :type  logger:  :class:`radical.utils.logger.Logger` instance
    :param logger:  logger stream to send status messages to.
    """

    if cfg: self._cfg = cfg
    else  : self._cfg = ru.Config('radical.saga', 'utils')['pty']

    self.logger = logger
    if not self.logger :
        self.logger = ru.Logger('radical.saga.pty')
    self.logger.debug ("PTYProcess init %s" % self)

    if isinstance (command, basestring) :
        command = shlex.split (command)

    if not isinstance (command, list) :
        raise se.BadParameter ("PTYProcess expects string or list command")

    if len(command) < 1 :
        raise se.BadParameter ("PTYProcess expects non-empty command")

    self.rlock = ru.RLock ("pty process %s" % command)

    self.command = command  # list of strings to run
    self.cache   = ""       # data cache
    self.tail    = ""       # tail of the data cache, for error messages
    self.child   = None     # the process as created by subprocess.Popen
    self.ptyio   = None     # the process' io channel, from pty.fork()

    self.exit_code   = None  # child died with code   (may be revived)
    self.exit_signal = None  # child killed by signal (may be revived)

    self.recover_max      = 3  # TODO: make configure option.  This does not
    self.recover_attempts = 0  # apply for recovers triggered by gc_timeout!

    try :
        self.initialize ()
    except Exception as e :
        raise ptye.translate_exception (e, "pty or process creation failed")
def __init__(self, adaptor_info, adaptor_options=None):

    # NOTE: avoid a shared mutable default argument
    self._info     = adaptor_info
    self._opts     = adaptor_options or []
    self._name     = adaptor_info['name']

    self._lock     = ru.RLock(self._name)
    self._logger   = ru.get_logger('radical.enmd.{0}'.format(self._name))
    self._reporter = ru.LogReporter(
                         name='radical.enmd.{0}'.format(self._name))
def _initialize(self):
    '''
    initialization of component base class goes here
    '''

    # components can always publish logs, state updates and control
    # messages
    self.register_publisher(rpc.LOG_PUBSUB)
    self.register_publisher(rpc.STATE_PUBSUB)
    self.register_publisher(rpc.CONTROL_PUBSUB)

    # set controller callback to handle cancellation requests
    self._cancel_list = list()
    self._cancel_lock = ru.RLock('comp.cancel_lock.%s' % self._uid)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._cancel_monitor_cb)

    # call component level initialize
    self.initialize()

    self._prof.prof('component_init')
def initialize(self):

    # we keep a cache of SAGA dir handles
    self._fs_cache    = dict()
    self._js_cache    = dict()
    self._pilots      = dict()
    self._pilots_lock = ru.RLock()

    self.register_input(rps.UMGR_STAGING_INPUT_PENDING,
                        rpc.UMGR_STAGING_INPUT_QUEUE, self.work)

    # FIXME: this queue is inaccessible, needs routing via mongodb
    self.register_output(rps.AGENT_STAGING_INPUT_PENDING, None)

    # we subscribe to the command channel to learn about pilots being
    # added to this unit manager.
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._base_command_cb)
def _configure(self):

    Continuous._configure(self)

    # a 'bag' entry will look like this:
    #
    #   {'size': 128,    # number of units to expect
    #    'uids': [...]}  # ids of units to be scheduled
    #
    self._lock      = ru.RLock()  # lock on the bags
    self._units     = dict()      # unit registry (we use uids otherwise)
    self._unordered = list()      # IDs of units which are not colocated
    self._bags      = dict()      # nothing has run, yet

    self._bag_init  = {'size': 0,
                       'uids': list()}
def __init__(self, adaptor_info, adaptor_options=None, expand_env=True):

    # FIXME: engine is loading cfg already, here we load again...

    self._info    = adaptor_info
    self._name    = adaptor_info['name']
    self._schemas = adaptor_info['schemas']

    self._lock    = ru.RLock(self._name)
    self._logger  = ru.Logger('radical.saga.api')

    # we need to expand later, once we get the env from the remote resource
    self._cfg = ru.Config(module='radical.saga', name=self._name,
                          expand=expand_env)

    if 'enabled' not in self._cfg:
        self._cfg['enabled'] = True
def initialize(self):

    self._pwd = os.getcwd()

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher(rpc.AGENT_UNSCHEDULE_PUBSUB)

    self._terminate  = mt.Event()
    self._tasks_lock = ru.RLock()
    self._tasks      = list()
    self._delay      = 0.1

    self._watcher = mt.Thread(target=self._timed)
    self._watcher.daemon = True
    self._watcher.start()
def initialize(self):

    self._pwd = os.getcwd()

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    self._cancel_lock   = ru.RLock()
    self._cus_to_cancel = list()
    self._cus_to_watch  = list()
    self._watch_queue   = queue.Queue()

    self._pid = self._cfg['pid']

    # run watcher thread
    self._watcher = mt.Thread(target=self._watch)
  # self._watcher.daemon = True
    self._watcher.start()

    # The AgentExecutingComponent needs the LaunchMethod to construct
    # commands.
    self._task_launcher = rp.agent.LaunchMethod.create(
        name    = self._cfg.get('task_launch_method'),
        cfg     = self._cfg,
        session = self._session)

    self._mpi_launcher = rp.agent.LaunchMethod.create(
        name    = self._cfg.get('mpi_launch_method'),
        cfg     = self._cfg,
        session = self._session)

    self.gtod   = "%s/gtod" % self._pwd
    self.tmpdir = tempfile.gettempdir()
def initialize(self):

    from .... import pilot as rp

    self._pwd = os.getcwd()

    self.register_input(rps.EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher(rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    # Mimic what virtualenv's "deactivate" would do
    self._deactivate = "\n# deactivate pilot virtualenv\n"

    old_path  = os.environ.get('_OLD_VIRTUAL_PATH',       None)
    old_ppath = os.environ.get('_OLD_VIRTUAL_PYTHONPATH', None)
    old_home  = os.environ.get('_OLD_VIRTUAL_PYTHONHOME', None)
    old_ps1   = os.environ.get('_OLD_VIRTUAL_PS1',        None)

    if old_path : self._deactivate += 'export PATH="%s"\n'        % old_path
    if old_ppath: self._deactivate += 'export PYTHONPATH="%s"\n'  % old_ppath
    if old_home : self._deactivate += 'export PYTHON_HOME="%s"\n' % old_home
    if old_ps1  : self._deactivate += 'export PS1="%s"\n'         % old_ps1

    self._deactivate += 'unset VIRTUAL_ENV\n\n'

    # FIXME: we should not alter the environment of the running agent, but
    #        only make sure that the CU finds a pristine env.  That also
    #        holds for the unsetting below -- AM
    if old_path : os.environ['PATH']        = old_path
    if old_ppath: os.environ['PYTHONPATH']  = old_ppath
    if old_home : os.environ['PYTHON_HOME'] = old_home
    if old_ps1  : os.environ['PS1']         = old_ps1

    if 'VIRTUAL_ENV' in os.environ:
        del(os.environ['VIRTUAL_ENV'])

    # simplify shell startup / prompt detection
    os.environ['PS1'] = '$ '

    self._task_launcher = rp.agent.LaunchMethod.create(
        name=self._cfg['task_launch_method'],
        cfg=self._cfg,
        session=self._session)

    self._mpi_launcher = rp.agent.LaunchMethod.create(
        name=self._cfg['mpi_launch_method'],
        cfg=self._cfg,
        session=self._session)

    # TODO: test that this actually works
    # Remove the configured set of environment variables from the
    # environment that we pass to Popen: collect the prefixes once, then
    # scan the environment.
    env_removables = list()
    if self._mpi_launcher : env_removables += self._mpi_launcher.env_removables
    if self._task_launcher: env_removables += self._task_launcher.env_removables

    for e in list(os.environ.keys()):
        for r in env_removables:
            if e.startswith(r):
                os.environ.pop(e, None)

    # if we need to transplant any original env into the CU, we dig the
    # respective keys from the dump made by bootstrap_0.sh
    self._env_cu_export = dict()
    if self._cfg.get('export_to_cu'):
        with open('env.orig', 'r') as f:
            for line in f.readlines():
                if '=' in line:
                    k, v = line.split('=', 1)
                    key  = k.strip()
                    val  = v.strip()
                    if key in self._cfg['export_to_cu']:
                        self._env_cu_export[key] = val

    # the registry keeps track of units to watch, indexed by their shell
    # spawner process ID.  As the registry is shared between the spawner
    # and watcher thread, we use a lock while accessing it.
    self._registry      = dict()
    self._registry_lock = ru.RLock()

    self._cus_to_cancel = list()
    self._cancel_lock   = ru.RLock()

    self._cached_events = list()  # keep monitoring events for pid's which
                                  # are not yet known

    # get some threads going -- those will do all the work.
    import radical.saga.utils.pty_shell as sups

    self.launcher_shell = sups.PTYShell("fork://localhost/")
    self.monitor_shell  = sups.PTYShell("fork://localhost/")

    # run the spawner on the shells
  # tmp = tempfile.gettempdir()
    # Moving back to shared file system again, until it reaches maturity,
    # as this breaks launch methods with a hop, e.g. ssh.
    # FIXME: see #658

    self._pid = self._cfg['pid']
    self._spawner_tmp = "/%s/%s-%s" % (self._pwd, self._pid, self.uid)

    ret, out, _ = self.launcher_shell.run_sync(
        "/bin/sh %s/agent/executing/shell_spawner.sh %s"
        % (os.path.dirname(rp.__file__), self._spawner_tmp))

    if ret != 0:
        raise RuntimeError("launcher bootstrap failed: (%s)(%s)"
                          % (ret, out))

    ret, out, _ = self.monitor_shell.run_sync(
        "/bin/sh %s/agent/executing/shell_spawner.sh %s"
        % (os.path.dirname(rp.__file__), self._spawner_tmp))

    if ret != 0:
        raise RuntimeError("monitor bootstrap failed: (%s)(%s)"
                          % (ret, out))

    # run watcher thread
    self._terminate = threading.Event()
    self._watcher   = threading.Thread(target=self._watch, name="Watcher")
    self._watcher.daemon = True
    self._watcher.start()

    self.gtod = "%s/gtod" % self._pwd
def __init__(self):

    self.logger   = ru.Logger('radical.saga.pty')
    self.registry = {}
    self.rlock    = ru.RLock('pty shell factory')
def __init__(self, pmgr, descr):

    # 'static' members
    self._descr = descr.as_dict()

    # sanity checks on description
    for check in ['resource', 'cores', 'runtime']:
        if not self._descr.get(check):
            raise ValueError("ComputePilotDescription needs '%s'" % check)

    # initialize state
    self._pmgr    = pmgr
    self._session = self._pmgr.session
    self._prof    = self._session._prof
    self._uid     = ru.generate_id('pilot.%(item_counter)04d',
                                   ru.ID_CUSTOM, ns=self._session.uid)
    self._state   = rps.NEW
    self._log     = pmgr._log

    self._pilot_dict = dict()
    self._callbacks  = dict()
    self._cache      = dict()   # cache of SAGA dir handles
    self._cb_lock    = ru.RLock()

    # pilot failures can trigger app termination
    self._exit_on_error = self._descr.get('exit_on_error')

    for m in rpc.PMGR_METRICS:
        self._callbacks[m] = dict()

    # we always invoke the default state cb
    self._callbacks[rpc.PILOT_STATE][self._default_state_cb.__name__] = {
        'cb'     : self._default_state_cb,
        'cb_data': None}

    # `as_dict()` needs `pilot_dict` and other attributes.  Those should
    # all be available at this point (apart from the sandboxes), so we now
    # query for those sandboxes.
    self._pilot_jsurl      = ru.Url()
    self._pilot_jshop      = ru.Url()
    self._resource_sandbox = ru.Url()
    self._session_sandbox  = ru.Url()
    self._pilot_sandbox    = ru.Url()
    self._client_sandbox   = ru.Url()

    pilot = self.as_dict()

    self._pilot_jsurl, self._pilot_jshop \
                           = self._session._get_jsurl           (pilot)
    self._resource_sandbox = self._session._get_resource_sandbox(pilot)
    self._session_sandbox  = self._session._get_session_sandbox (pilot)
    self._pilot_sandbox    = self._session._get_pilot_sandbox   (pilot)
    self._client_sandbox   = self._session._get_client_sandbox()

    # we need to expand placeholders in the sandboxes
    # FIXME: this code is a duplication from the pilot launcher code
    expand = dict()
    for k, v in pilot['description'].items():
        if v is None:
            v = ''
        expand['pd.%s' % k] = v
        if isinstance(v, str):
            expand['pd.%s' % k.upper()] = v.upper()
            expand['pd.%s' % k.lower()] = v.lower()
        else:
            expand['pd.%s' % k.upper()] = v
            expand['pd.%s' % k.lower()] = v

    self._resource_sandbox.path = self._resource_sandbox.path % expand
    self._session_sandbox .path = self._session_sandbox .path % expand
    self._pilot_sandbox   .path = self._pilot_sandbox   .path % expand
def initialize(self):

    from .... import pilot as rp

    self._pwd = os.getcwd()
    self._tmp = self._pwd   # keep temporary files in $PWD for now (slow)

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    # Mimic what virtualenv's "deactivate" would do
    self._deactivate = "\n# deactivate pilot virtualenv\n"

    old_path  = os.environ.get('_OLD_VIRTUAL_PATH',       None)
    old_ppath = os.environ.get('_OLD_VIRTUAL_PYTHONPATH', None)
    old_home  = os.environ.get('_OLD_VIRTUAL_PYTHONHOME', None)
    old_ps1   = os.environ.get('_OLD_VIRTUAL_PS1',        None)

    if old_path : self._deactivate += 'export PATH="%s"\n'        % old_path
    if old_ppath: self._deactivate += 'export PYTHONPATH="%s"\n'  % old_ppath
    if old_home : self._deactivate += 'export PYTHON_HOME="%s"\n' % old_home
    if old_ps1  : self._deactivate += 'export PS1="%s"\n'         % old_ps1

    self._deactivate += 'unset VIRTUAL_ENV\n\n'

    # FIXME: we should not alter the environment of the running agent, but
    #        only make sure that the CU finds a pristine env.  That also
    #        holds for the unsetting below -- AM
    if old_path : os.environ['PATH']        = old_path
    if old_ppath: os.environ['PYTHONPATH']  = old_ppath
    if old_home : os.environ['PYTHON_HOME'] = old_home
    if old_ps1  : os.environ['PS1']         = old_ps1

    if 'VIRTUAL_ENV' in os.environ:
        del(os.environ['VIRTUAL_ENV'])

    self._task_launcher = None
    self._mpi_launcher  = None

    try:
        self._task_launcher = rp.agent.LaunchMethod.create(
            name    = self._cfg['task_launch_method'],
            cfg     = self._cfg,
            session = self._session)
    except Exception:
        self._log.warn('no task launcher found')

    try:
        self._mpi_launcher = rp.agent.LaunchMethod.create(
            name    = self._cfg['mpi_launch_method'],
            cfg     = self._cfg,
            session = self._session)
    except Exception:
        self._log.warn('no mpi launcher found')

    # TODO: test that this actually works
    # Remove the configured set of environment variables from the
    # environment that we pass to Popen: collect the prefixes once, then
    # scan the environment.
    env_removables = list()
    if self._mpi_launcher : env_removables += self._mpi_launcher.env_removables
    if self._task_launcher: env_removables += self._task_launcher.env_removables

    for e in list(os.environ.keys()):
        for r in env_removables:
            if e.startswith(r):
                os.environ.pop(e, None)

    # if we need to transplant any original env into the CU, we dig the
    # respective keys from the dump made by bootstrap_0.sh
    self._env_cu_export = dict()
    if self._cfg.get('export_to_cu'):
        with open('env.orig', 'r') as f:
            for line in f.readlines():
                if '=' in line:
                    k, v = line.split('=', 1)
                    key  = k.strip()
                    val  = v.strip()
                    if key in self._cfg['export_to_cu']:
                        self._env_cu_export[key] = val

    # the registry keeps track of units to watch
    self._registry      = dict()
    self._registry_lock = ru.RLock()

    self._to_cancel     = list()
    self._cancel_lock   = ru.RLock()

    self._cached_events = list()  # keep monitoring events for pid's which
                                  # are not yet known

    self.gtod = "%s/gtod" % self._pwd

    # create line buffered fifo's to communicate with the shell executor
    self._fifo_cmd_name = "%s/%s.cmd.pipe" % (self._tmp, self._uid)
    self._fifo_inf_name = "%s/%s.inf.pipe" % (self._tmp, self._uid)

    os.mkfifo(self._fifo_cmd_name)
    os.mkfifo(self._fifo_inf_name)

    self._fifo_cmd = open(self._fifo_cmd_name, 'w+', 1)
    self._fifo_inf = open(self._fifo_inf_name, 'r+', 1)

    # run a thread to watch the info fifo
    self._terminate = threading.Event()
    self._watcher   = threading.Thread(target=self._watch, name="Watcher")
    self._watcher.daemon = True
    self._watcher.start()

    # start the shell executor
    sh_exe = "%s/shell_spawner_fs.sh" % os.path.dirname(__file__)
    sh_cmd = "%s %s %s %s" % (sh_exe, self._pwd, self._tmp, self.uid)
    #         script base  work  sid

    self._log.debug('start shell executor [%s]', sh_cmd)
    self._sh = sp.Popen(sh_cmd, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
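# Minimal standalone sketch of the line-buffered FIFO handshake used above:
# one process writes commands into a command pipe, a child shell reports
# back through an info pipe.  The pipe names, the stand-in 'executor' and
# the echoed reply are illustrative assumptions; FIFO open/blocking
# semantics as sketched here are Linux-specific.
import os
import subprocess as sp
import tempfile

tmp      = tempfile.mkdtemp()
cmd_name = os.path.join(tmp, 'cmd.pipe')
inf_name = os.path.join(tmp, 'inf.pipe')

os.mkfifo(cmd_name)
os.mkfifo(inf_name)

# a stand-in 'executor': read one command line, report it back
child = sp.Popen(['/bin/sh', '-c',
                  'read line < %s; echo "got: $line" > %s'
                  % (cmd_name, inf_name)])

# opening read-write ('w+'/'r+') avoids blocking until the peer connects
fifo_cmd = open(cmd_name, 'w+', 1)   # line buffered
fifo_inf = open(inf_name, 'r+', 1)

fifo_cmd.write('RUN sleep 1\n')
print(fifo_inf.readline().strip())   # -> got: RUN sleep 1

child.wait()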
def __init__(self):

    self.logger   = rul.getLogger('saga', 'PTYShellFactory')
    self.registry = {}
    self.rlock    = ru.RLock('pty shell factory')
def __init__(self, campaign, resources, objective=None, planner='random',
             sid=None):

    self._campaign = {'campaign': campaign,
                      'state'   : st.NEW}

    if sid:
        self._sid = sid
    else:
        self._sid = ru.generate_id('rcm.session', mode=ru.ID_PRIVATE)

    self._uid = ru.generate_id('bookkeper.%(counter)04d',
                               mode=ru.ID_CUSTOM, ns=self._sid)

    self._resources         = resources
    self._checkpoints       = None
    self._plan              = None
    self._objective         = objective
    self._unavail_resources = []
    self._workflows_state   = dict()

    self._exec_state_lock = ru.RLock('workflows_state_lock')
    self._monitor_lock    = ru.RLock('monitor_list_lock')

    self._time = 0   # the time in the campaign's world

    self._workflows_to_monitor = list()
    self._est_end_times        = dict()
    self._env                  = Environment()

    self._enactor = SimulatedEnactor(env=self._env, sid=self._sid)
    self._enactor.register_state_cb(self.state_update_cb)

    # Create threads to execute the monitoring and work methods.
    # One flag for both threads may be enough to monitor and check.
    self._terminate_event   = mt.Event()  # thread event to terminate
    self._work_thread       = None        # will hold the work thread
    self._monitoring_thread = None        # will hold the monitoring thread
    self._cont              = False
    self._hold              = False

    path = os.getcwd() + '/' + self._sid
    self._logger = ru.Logger(name=self._uid, path=path, level='DEBUG')
    self._prof   = ru.Profiler(name=self._uid, path=path)

    num_oper = [workflow['num_oper']
                for workflow in self._campaign['campaign']]

    if planner.lower() == 'random':
        self._planner = RandomPlanner(campaign=self._campaign['campaign'],
                                      resources=self._resources,
                                      num_oper=num_oper, sid=self._sid)
    elif planner.lower() == 'heft':
        self._planner = HeftPlanner(campaign=self._campaign['campaign'],
                                    resources=self._resources,
                                    num_oper=num_oper, sid=self._sid)
    else:
        self._logger.warning('Planner %s is not implemented. Falling back '
                             'to a random planner', planner)
        self._planner = RandomPlanner(campaign=self._campaign['campaign'],
                                      resources=self._resources,
                                      num_oper=num_oper, sid=self._sid)
def initialize(self):

    # register unit input channels
    self.register_input(rps.AGENT_SCHEDULING_PENDING,
                        rpc.AGENT_SCHEDULING_QUEUE, self._schedule_units)

    # register unit output channels
    self.register_output(rps.AGENT_EXECUTING_PENDING,
                         rpc.AGENT_EXECUTING_QUEUE)

    # we need unschedule updates to learn about units for which to free
    # the allocated cores.  Those updates MUST be issued after execution,
    # ie. by the AgentExecutionComponent.
    self.register_subscriber(rpc.AGENT_UNSCHEDULE_PUBSUB,
                             self.unschedule_cb)

    # we don't want the unschedule above to compete with actual scheduling
    # attempts, so we move the re-scheduling of units from the wait pool
    # into a separate thread (ie. register a separate callback).  This is
    # triggered by the unschedule_cb.
    #
    # NOTE: we could use a local queue here.  Using a zmq bridge goes
    #       toward a distributed scheduler, and is also easier to
    #       implement right now, since `Component` provides the right
    #       mechanisms...
    self.register_publisher(rpc.AGENT_SCHEDULE_PUBSUB)
    self.register_subscriber(rpc.AGENT_SCHEDULE_PUBSUB, self.schedule_cb)

    # The scheduler needs the ResourceManager information which has been
    # collected during agent startup.  We dig it out of the config at
    # this point.
    #
    # NOTE: this information is insufficient for the torus scheduler!
    self._pid                 = self._cfg['pid']
    self._rm_info             = self._cfg['rm_info']
    self._rm_lm_info          = self._cfg['rm_info']['lm_info']
    self._rm_node_list        = self._cfg['rm_info']['node_list']
    self._rm_sockets_per_node = self._cfg['rm_info']['sockets_per_node']
    self._rm_cores_per_socket = self._cfg['rm_info']['cores_per_socket']
    self._rm_gpus_per_socket  = self._cfg['rm_info']['gpus_per_socket']
    self._rm_lfs_per_node     = self._cfg['rm_info']['lfs_per_node']

    if not self._rm_node_list:
        raise RuntimeError(
            "ResourceManager %s didn't _configure node_list."
            % self._rm_info['name'])

    if self._rm_cores_per_socket is None:
        raise RuntimeError(
            "ResourceManager %s didn't _configure cores_per_socket."
            % self._rm_info['name'])

    if self._rm_sockets_per_node is None:
        raise RuntimeError(
            "ResourceManager %s didn't _configure sockets_per_node."
            % self._rm_info['name'])

    if self._rm_gpus_per_socket is None:
        raise RuntimeError(
            "ResourceManager %s didn't _configure gpus_per_socket."
            % self._rm_info['name'])

    # create and initialize the wait pool
    self._wait_pool = list()      # pool of waiting units
    self._wait_lock = ru.RLock()  # lock on the above pool
    self._slot_lock = ru.RLock()  # lock slot allocation/deallocation

    # configure the scheduler instance
    self._configure()
    self._log.debug("slot status after init: %s", self.slot_status())
def __init__(self, cfg, session):
    '''
    This constructor MUST be called by inheriting classes, as it specifies
    the operation mode of the component: components can spawn a child
    process, or not.

    If a child will be spawned later, then the child process state can be
    initialized by overloading the `initialize()` method.

    Note that this policy should be strictly followed by all derived
    classes, as we will otherwise carry state over the process fork.  That
    can become nasty if the state included any form of locking (like, for
    profiling or logging).

    The symmetric teardown method is called `finalize()`.

    Constructors of inheriting components *may* call start() in their
    constructor.
    '''

    # NOTE: a fork will not duplicate any threads of the parent process --
    #       but it will duplicate any locks which are shared between the
    #       parent process and its threads -- and those locks might be in
    #       any state at this point.  As such, each child has to make sure
    #       to never, ever, use any of the inherited locks, but instead to
    #       create its own set of locks in self.initialize.

    self._cfg     = cfg
    self._uid     = cfg.uid
    self._session = session

    # we always need an UID
    assert(self._uid), 'Component needs a uid (%s)' % type(self)

    # state we carry over the fork
    self._debug  = cfg.get('debug')
    self._owner  = cfg.get('owner', self.uid)
    self._ctype  = "%s.%s" % (self.__class__.__module__,
                              self.__class__.__name__)
    self._number = cfg.get('number', 0)
    self._name   = cfg.get('name.%s' % self._number,
                           '%s.%s'   % (self._ctype, self._number))

    self._bridges    = list()  # communication bridges
    self._components = list()  # sub-components
    self._inputs     = dict()  # queues to get things from
    self._outputs    = dict()  # queues to send things to
    self._workers    = dict()  # methods to work on things
    self._publishers = dict()  # channels to send notifications to
    self._threads    = dict()  # subscriber and idler threads

    # guard threaded callback invocations
    self._cb_lock = ru.RLock('comp.cb_lock.%s' % self._name)

    self._subscribers = dict()  # ZMQ subscriber classes

    if self._owner == self.uid:
        self._owner = 'root'

    self._prof = self._session._get_profiler(name=self.uid)
    self._rep  = self._session._get_reporter(name=self.uid)
    self._log  = self._session._get_logger  (name=self.uid,
                                             level=self._debug)

  # self._prof.register_timing(name='component_lifetime',
  #                            scope='uid=%s' % self.uid,
  #                            start='component_start',
  #                            stop='component_stop')
  # self._prof.register_timing(name='entity_runtime',
  #                            scope='entity',
  #                            start='get',
  #                            stop=['put', 'drop'])

    self._prof.prof('init1', uid=self._uid, msg=self._prof.path)

    self._q      = None
    self._in     = None
    self._out    = None
    self._poll   = None
    self._ctx    = None
    self._thread = None
    self._term   = mt.Event()
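# Standalone sketch (hypothetical names, not the component's actual API) of
# the fork-and-locks policy the NOTE above warns about: the constructor
# holds no locks, and the child creates its own, pristine lock inside its
# post-fork initializer instead of reusing one inherited from the parent.
import multiprocessing as mp
import threading

class Component:

    def __init__(self):
        # pre-fork state only: no locks, no threads
        self._lock = None

    def initialize(self):
        # post-fork: the child creates its own lock
        self._lock = threading.RLock()

    def _child(self):
        self.initialize()
        with self._lock:
            print('child works with its own lock')

    def start(self):
        proc = mp.Process(target=self._child)
        proc.start()
        proc.join()

if __name__ == '__main__':
    Component().start()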
def __init__(self, dburl=None, uid=None, cfg=None, _primary=True):
    '''
    Creates a new session.  A new Session instance is created and stored
    in the database.

    **Arguments:**
        * **dburl** (`string`): The MongoDB URL.  If none is given, RP
          uses the environment variable RADICAL_PILOT_DBURL.  If that is
          not set, an error will be raised.

        * **cfg** (`str` or `dict`): a named or instantiated configuration
          to be used for the session.

        * **uid** (`string`): Create a session with this UID.  Session
          UIDs MUST be unique - otherwise they will lead to conflicts in
          the underlying database, resulting in undefined behaviours (or
          worse).

        * **_primary** (`bool`): only sessions created by the original
          application process (via `rp.Session()`) will connect to the DB.
          Secondary session instances are instantiated internally in
          processes spawned (directly or indirectly) by the initial
          session, for example in some of its components.  A secondary
          session will inherit the original session ID, but will not
          attempt to create a new DB collection - if such a DB connection
          is needed, the component needs to establish that on its own.
    '''

    # NOTE: `name` and `cfg` are overloaded: the user cannot point to
    #       a predefined config and amend it at the same time.  This might
    #       be ok for the session, but introduces a minor API
    #       inconsistency.
    name = 'default'
    if isinstance(cfg, str):
        name = cfg
        cfg  = None

    self._dbs     = None
    self._closed  = False
    self._primary = _primary

    self._pmgrs = dict()  # map IDs to pmgr instances
    self._umgrs = dict()  # map IDs to umgr instances
    self._cmgr  = None    # only primary sessions have a cmgr

    self._cfg   = ru.Config('radical.pilot.session',  name=name, cfg=cfg)
    self._rcfgs = ru.Config('radical.pilot.resource', name='*')

    if _primary:

        pwd = os.getcwd()

        if not self._cfg.sid:
            if uid:
                self._cfg.sid = uid
            else:
                self._cfg.sid = ru.generate_id('rp.session',
                                               mode=ru.ID_PRIVATE)
        if not self._cfg.base:
            self._cfg.base = pwd

        if not self._cfg.path:
            self._cfg.path = '%s/%s' % (self._cfg.base, self._cfg.sid)

        if not self._cfg.client_sandbox:
            self._cfg.client_sandbox = pwd

    else:
        for k in ['sid', 'base', 'path']:
            assert(k in self._cfg), 'non-primary session misses %s' % k

    # change RU defaults to point logfiles etc. to the session sandbox
    def_cfg             = ru.DefaultConfig()
    def_cfg.log_dir     = self._cfg.path
    def_cfg.report_dir  = self._cfg.path
    def_cfg.profile_dir = self._cfg.path

    self._uid = self._cfg.sid

    self._prof = self._get_profiler(name=self._uid)
    self._rep  = self._get_reporter(name=self._uid)
    self._log  = self._get_logger  (name=self._uid,
                                    level=self._cfg.get('debug'))

    from . import version_detail as rp_version_detail
    self._log.info('radical.pilot version: %s' % rp_version_detail)
    self._log.info('radical.saga version: %s'  % rs.version_detail)
    self._log.info('radical.utils version: %s' % ru.version_detail)

    self._prof.prof('session_start', uid=self._uid, msg=int(_primary))

    # now we have config and uid - initialize base class (saga session)
    rs.Session.__init__(self, uid=self._uid)

    # cache sandboxes etc.
    self._cache_lock = ru.RLock()
    self._cache      = {'resource_sandbox': dict(),
                        'session_sandbox' : dict(),
                        'pilot_sandbox'   : dict(),
                        'client_sandbox'  : self._cfg.client_sandbox,
                        'js_shells'       : dict(),
                        'fs_dirs'         : dict()}

    if _primary:
        self._initialize_primary(dburl)

    # at this point we have a DB connection, logger, etc., and are done
    self._prof.prof('session_ok', uid=self._uid, msg=int(_primary))
def __init__(self, session, cfg='default'):
    '''
    Creates a new PilotManager and attaches it to the session.

    **Arguments:**
        * session [:class:`rp.Session`]:
          The session instance to use.

        * cfg (`dict` or `string`):
          The configuration or name of configuration to use.

    **Returns:**
        * A new `PilotManager` object [:class:`rp.PilotManager`].
    '''

    assert(session.primary), 'pmgr needs primary session'

    self._pilots      = dict()
    self._pilots_lock = ru.RLock('pmgr.pilots_lock')
    self._callbacks   = dict()
    self._pcb_lock    = ru.RLock('pmgr.pcb_lock')
    self._terminate   = mt.Event()
    self._closed      = False
    self._rec_id      = 0  # used for session recording

    self._uid = ru.generate_id('pmgr.%(item_counter)04d',
                               ru.ID_CUSTOM, ns=session.uid)

    for m in rpc.PMGR_METRICS:
        self._callbacks[m] = dict()

    # NOTE: `name` and `cfg` are overloaded: the user cannot point to
    #       a predefined config and amend it at the same time.  This might
    #       be ok for the session, but introduces a minor API
    #       inconsistency.
    #
    name = None
    if isinstance(cfg, str):
        name = cfg
        cfg  = None

    cfg           = ru.Config('radical.pilot.pmgr', name=name, cfg=cfg)
    cfg.uid       = self._uid
    cfg.owner     = self._uid
    cfg.sid       = session.uid
    cfg.base      = session.base
    cfg.path      = session.path
    cfg.dburl     = session.dburl
    cfg.heartbeat = session.cfg.heartbeat

    rpu.Component.__init__(self, cfg, session=session)
    self.start()

    self._log.info('started pmgr %s', self._uid)
    self._rep.info('<<create pilot manager')

    # create pmgr bridges and components, use session cmgr for that
    self._cmgr = rpu.ComponentManager(self._cfg)
    self._cmgr.start_bridges()
    self._cmgr.start_components()

    # The output queue is used to forward submitted pilots to the
    # launching component.
    self.register_output(rps.PMGR_LAUNCHING_PENDING,
                         rpc.PMGR_LAUNCHING_QUEUE)

    # we also listen on the control pubsub, to learn about completed
    # staging directives
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._staging_ack_cb)
    self._active_sds = dict()
    self._sds_lock   = ru.Lock('pmgr_sds_lock')

    # register the state notification pull cb and hb pull cb
    # FIXME: we may want to have the frequency configurable
    # FIXME: this should be a tailing cursor in the update worker
    self.register_timed_cb(self._state_pull_cb,
                           timer=self._cfg['db_poll_sleeptime'])
    self.register_timed_cb(self._pilot_heartbeat_cb,
                           timer=self._cfg['db_poll_sleeptime'])

    # also listen to the state pubsub for pilot state changes
    self.register_subscriber(rpc.STATE_PUBSUB, self._state_sub_cb)

    # let session know we exist
    self._session._register_pmgr(self)

    self._prof.prof('setup_done', uid=self._uid)
    self._rep.ok('>>ok\n')
def __init__(self):

    self.logger   = ru.Logger('radical.saga.pty')
    self.registry = {}
    self.rlock    = ru.RLock('pty shell factory')
    self.cfg      = ru.Config('radical.saga', 'utils')['pty']
def __init__(self, session, cfg='default', scheduler=None):
    """
    Creates a new UnitManager and attaches it to the session.

    **Arguments:**
        * session [:class:`radical.pilot.Session`]:
          The session instance to use.

        * cfg (`dict` or `string`):
          The configuration or name of configuration to use.

        * scheduler (`string`):
          The name of the scheduler plug-in to use.

    **Returns:**
        * A new `UnitManager` object [:class:`radical.pilot.UnitManager`].
    """

    self._pilots      = dict()
    self._pilots_lock = ru.RLock('umgr.pilots_lock')
    self._units       = dict()
    self._units_lock  = ru.RLock('umgr.units_lock')
    self._callbacks   = dict()
    self._cb_lock     = ru.RLock('umgr.cb_lock')
    self._terminate   = mt.Event()
    self._closed      = False
    self._rec_id      = 0  # used for session recording

    self._uid = ru.generate_id('umgr.%(item_counter)04d',
                               ru.ID_CUSTOM, ns=session.uid)

    for m in rpc.UMGR_METRICS:
        self._callbacks[m] = dict()

    # NOTE: `name` and `cfg` are overloaded: the user cannot point to
    #       a predefined config and amend it at the same time.  This might
    #       be ok for the session, but introduces a minor API
    #       inconsistency.
    #
    name = None
    if isinstance(cfg, str):
        name = cfg
        cfg  = None

    cfg           = ru.Config('radical.pilot.umgr', name=name, cfg=cfg)
    cfg.uid       = self._uid
    cfg.owner     = self._uid
    cfg.sid       = session.uid
    cfg.base      = session.base
    cfg.path      = session.path
    cfg.dburl     = session.dburl
    cfg.heartbeat = session.cfg.heartbeat

    if scheduler:
        # overwrite the scheduler from the config file
        cfg.scheduler = scheduler

    rpu.Component.__init__(self, cfg, session=session)
    self.start()

    self._log.info('started umgr %s', self._uid)
    self._rep.info('<<create unit manager')

    # create umgr bridges and components, use session cmgr for that
    self._cmgr = rpu.ComponentManager(self._cfg)
    self._cmgr.start_bridges()
    self._cmgr.start_components()

    # The output queue is used to forward submitted units to the
    # scheduling component.
    self.register_output(rps.UMGR_SCHEDULING_PENDING,
                         rpc.UMGR_SCHEDULING_QUEUE)

    # the umgr will also collect units from the agent again, for output
    # staging and finalization
    if self._cfg.bridges.umgr_staging_output_queue:
        self._has_sout = True
        self.register_output(rps.UMGR_STAGING_OUTPUT_PENDING,
                             rpc.UMGR_STAGING_OUTPUT_QUEUE)
    else:
        self._has_sout = False

    # register the state notification pull cb
    # FIXME: this should be a tailing cursor in the update worker
    self.register_timed_cb(self._state_pull_cb,
                           timer=self._cfg['db_poll_sleeptime'])

    # register callback which pulls units back from agent
    # FIXME: this should be a tailing cursor in the update worker
    self.register_timed_cb(self._unit_pull_cb,
                           timer=self._cfg['db_poll_sleeptime'])

    # also listen to the state pubsub for unit state changes
    self.register_subscriber(rpc.STATE_PUBSUB, self._state_sub_cb)

    # let session know we exist
    self._session._register_umgr(self)

    self._prof.prof('setup_done', uid=self._uid)
    self._rep.ok('>>ok\n')
def initialize(self):

    self._pwd = os.getcwd()
    self.gtod = "%s/gtod" % self._pwd

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    addr_wrk = self._cfg['bridges']['funcs_req_queue']
    addr_res = self._cfg['bridges']['funcs_res_queue']

    self._log.debug('wrk in  addr: %s', addr_wrk['addr_in' ])
    self._log.debug('res out addr: %s', addr_res['addr_out'])

    self._funcs_req = rpu.Queue(self._session, 'funcs_req_queue',
                                rpu.QUEUE_INPUT, self._cfg,
                                addr_wrk['addr_in'])
    self._funcs_res = rpu.Queue(self._session, 'funcs_res_queue',
                                rpu.QUEUE_OUTPUT, self._cfg,
                                addr_res['addr_out'])

    self._cancel_lock   = ru.RLock()
    self._cus_to_cancel = list()
    self._cus_to_watch  = list()
    self._watch_queue   = queue.Queue()

    self._pid = self._cfg['pid']

    # run collector thread
    self._collector = mt.Thread(target=self._collect)
    self._collector.daemon = True
    self._collector.start()

    # we need to launch the executors on all nodes, and use the
    # agent_launcher for that
    self._launcher = rp.agent.LaunchMethod.create(
        name    = self._cfg.get('agent_launch_method'),
        cfg     = self._cfg,
        session = self._session)

    # now run the func launcher on all nodes
    ve  = os.environ.get('VIRTUAL_ENV', '')
    exe = ru.which('radical-pilot-agent-funcs')
    if not exe:
        exe = '%s/rp_install/bin/radical-pilot-agent-funcs' % self._pwd

    for idx, node in enumerate(self._cfg['rm_info']['node_list']):

        uid   = 'func_exec.%04d' % idx
        pwd   = '%s/%s' % (self._pwd, uid)
        funcs = {'uid'        : uid,
                 'description': {'executable'   : exe,
                                 'arguments'    : [pwd, ve],
                                 'cpu_processes': 1,
                                 'environment'  : []},
                 'slots'      : {'nodes': [{'name' : node[0],
                                            'uid'  : node[1],
                                            'cores': [[0]],
                                            'gpus' : []}]},
                 'cfg'        : {'addr_wrk': addr_wrk['addr_out'],
                                 'addr_res': addr_res['addr_in' ]}}

        self._spawn(self._launcher, funcs)