def merge_description (self, source) :
    """
    merge additional information into the unit description -- such as
    resource information, or application specific data
    """

    # print 'merging unit %s' % self.id
    # print '   with    %s' % source
    # print '   and     %s' % self.as_dict()

    # we only allow this in DESCRIBED or BOUND state
    if not self.state in [DESCRIBED, BOUND] :
        raise RuntimeError ('unit is not in DESCRIBED state (%s)'
                           % self.state)

    ud_dict = self.as_dict ()

    ru.dict_merge        (ud_dict, source, policy='overwrite')
    ru.dict_stringexpand (ud_dict)
    ru.dict_stringexpand (ud_dict, self.session.cfg)

    # print '-------------'
    # import pprint
    # pprint.pprint (ud_dict)
    # print '-------------'
    # pprint.pprint (self.session.cfg)
    # print '-------------'
    # exit()

    for (key, val) in ud_dict.iteritems () :
        try :
            self.set_attribute (key, val)
        except :
            pass
def stop(self):

    # During ResourceManager termination, we call any existing shutdown hooks
    # on the launch methods.  We only call LaunchMethod shutdown hooks *once*.
    launch_methods = set()
    launch_methods.add(self._cfg.get('mpi_launch_method'))
    launch_methods.add(self._cfg.get('task_launch_method'))
    launch_methods.add(self._cfg.get('agent_launch_method'))
    launch_methods.discard(None)

    for lm in launch_methods:
        try:
            ru.dict_merge(
                self.lm_info,
                rpa.LaunchMethod.rm_shutdown_hook(name=lm, cfg=self._cfg,
                                                  rm=self, lm_info=self.lm_info,
                                                  log=self._log,
                                                  profiler=self._prof))
        except Exception as e:
            self._log.exception(
                "ResourceManager shutdown hook failed: %s" % e)
            raise

        self._log.info("ResourceManager shutdown hook succeeded (%s)" % lm)
def _load_resource_configs(self):

    self.is_valid()

    self._prof.prof('config_parser_start', uid=self._uid)

    # Loading all "default" resource configurations
    module_path  = os.path.dirname(os.path.abspath(__file__))
    default_cfgs = "%s/configs/resource_*.json" % module_path
    config_files = glob.glob(default_cfgs)

    for config_file in config_files:

        try:
            self._log.info("Load resource configurations from %s" % config_file)
            rcs = ResourceConfig.from_file(config_file)
        except Exception as e:
            self._log.exception("skip config file %s: %s" % (config_file, e))
            raise RuntimeError('config error (%s) - abort' % e)

        for rc in rcs:
            self._log.info("Load resource configurations for %s" % rc)
            self._resource_configs[rc] = rcs[rc].as_dict()

    home         = os.environ.get('HOME', '')
    user_cfgs    = "%s/.radical/pilot/configs/resource_*.json" % home
    config_files = glob.glob(user_cfgs)

    for config_file in config_files:

        try:
            rcs = ResourceConfig.from_file(config_file)
        except Exception as e:
            self._log.exception("skip config file %s: %s" % (config_file, e))
            raise RuntimeError('config error (%s) - abort' % e)

        for rc in rcs:
            self._log.info("Load resource configurations for %s" % rc)

            if rc in self._resource_configs:
                # config exists -- merge user config into it
                ru.dict_merge(self._resource_configs[rc],
                              rcs[rc].as_dict(),
                              policy='overwrite')
            else:
                # new config -- add as is
                self._resource_configs[rc] = rcs[rc].as_dict()

    default_aliases = "%s/configs/resource_aliases.json" % module_path
    self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

    # check if we have aliases to merge
    usr_aliases = '%s/.radical/pilot/configs/resource_aliases.json' % home
    if os.path.isfile(usr_aliases):
        ru.dict_merge(self._resource_aliases,
                      ru.read_json_str(usr_aliases).get('aliases', {}),
                      policy='overwrite')

    self._prof.prof('config_parser_stop', uid=self._uid)
def __init__ (self, user_cfg=None, default=True) :

    # accept any number of user configs
    if not isinstance (user_cfg, list) :
        user_cfg = [user_cfg]

    # set saga apitype for clean inheritance (cpi to api mapping relies on
    # _apitype)
    self._apitype = 'saga.Session'

    resource_cfg = "%s/resources.json" % os.path.dirname (troy.__file__)
    config_dir   = "%s/.troy"          % os.environ.get ('HOME', '/etc/')
    config_env   = "%s"                % os.environ.get ('TROY_CONFIG', None)

    # we read our base config from $HOME/troy/* by default, but also accept
    # other locations if $TROY_CONFIG is set.  Items later in the list below
    # overwrite earlier ones.
    self.cfg = tu.get_config ([_config_skeleton,
                               resource_cfg,
                               config_dir,
                               config_env] + user_cfg)

    # make sure that the resource sections in the config have the minimal
    # set of entries
    for res_name in self.cfg['resources'] :
        ru.dict_merge (self.cfg['resources'][res_name],
                       _resource_config_skeleton,
                       policy='preserve',
                       logger=troy._logger)

    # we set the log level as indicated in the troy config or user
    # config, fallback being log level ERROR
    log_level = 'ERROR'
    log_level = self.cfg.get ('log_level', log_level)
    log_level = os.environ.get ('TROY_VERBOSE', log_level)
    troy._logger.setLevel (log_level)

    # now that config parsing is done, we can create the session ID
    session_id_stub = self.cfg.get ("session_id", 'session.')
    self.id = ru.generate_id (session_id_stub, mode=ru.ID_UNIQUE)
    troy._logger.info ("session id: %s" % self.id)

    # and initialize the inherited saga session
    tu.Timed.__init__ (self, 'troy.Session', self.id)
    self.timed_method ('saga.Session', ['init'],
                       saga.Session.__init__, [self, default])
def get_resource_config (self, resource) :

    # resources may be in fact URLs -- but resource configs use host
    # names as keys.  So we check if the URL is well formed and attempt
    # to extract the host.
    # FIXME: cache results, URL parsing is expensive
    try :
        resource_url = saga.Url (resource)

        # the url string 'india.futuregrid.org' is parsed as url path
        # element, not as URL host name.
        if resource_url.host :
            resource = resource_url.host

    except saga.SagaException as e :
        pass  # probably not a URL :P

    resource_cfg = self.get_config ('resources')

    # default to a copy of the resource config skeleton
    troy._logger.debug ('create resource config for %s' % resource)
    ret = dict (_resource_config_skeleton)

    # check if we have a match with one of the wildcards.
    for resource_key in resource_cfg.keys () :

        if '*' in resource_key :
            resource_pattern = re.compile (fnmatch.translate (resource_key))

            if resource_pattern.match (resource):
                troy._logger.debug ('merge resource pattern %s for %s'
                                   % (resource_key, resource))
                ru.dict_merge (ret, resource_cfg[resource_key],
                               policy='overwrite', logger=troy._logger)

    # check if we have an exact match for the resource name.  This supersedes
    # the wildcard entries.
    if resource in resource_cfg :
        troy._logger.debug ('merge resource config for %s' % resource)
        ru.dict_merge (ret, resource_cfg[resource],
                       policy='overwrite', logger=troy._logger)

    # make sure the hostname is in the config
    ret['hostname'] = resource

    return ret
def merge_description (self, source) :
    """
    merge additional information into the pilot description -- such as
    resource information, or application specific data
    """

    # we only allow this in DESCRIBED or BOUND state
    if not self.state in [DESCRIBED, BOUND] :
        raise RuntimeError ('pilot is not in DESCRIBED state (%s)'
                           % self.state)

    pd_dict = self.description.as_dict ()

    ru.dict_merge        (pd_dict, source, policy='overwrite')
    ru.dict_stringexpand (pd_dict)
    ru.dict_stringexpand (pd_dict, self.session.cfg)

    self.description = troy.PilotDescription (pd_dict)
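# ------------------------------------------------------------------------------
# The unit and pilot `merge_description` variants above share one pattern:
# overwrite-merge application data into a description dict, then expand
# '%(key)s' style placeholders, first from the dict itself and then from the
# session config.  The sketch below is a minimal, self-contained illustration
# of that pattern; the example keys and the exact expansion semantics of
# ru.dict_stringexpand are assumptions derived from the calls above, not a
# verbatim copy of the library behavior.
import radical.utils as ru

description = {'project' : 'TG-XYZ123',
               'sandbox' : '%(resource)s/run',   # placeholder, expanded below
               'resource': 'local.localhost'}

extra       = {'resource': 'xsede.stampede'}     # application-specific data
session_cfg = {'owner'   : 'demo_user'}          # stand-in for session.cfg

ru.dict_merge        (description, extra, policy='overwrite')
ru.dict_stringexpand (description)               # expand from the dict itself
ru.dict_stringexpand (description, session_cfg)  # expand from the session cfg

# expected under the assumed semantics: 'xsede.stampede/run'
print(description['sandbox'])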
def test_dict_merge () :

    dict_1 = {'key_shared' : 'val_shared_1',
              'key_orig_1' : 'val_orig_1'}
    dict_2 = {'key_shared' : 'val_shared_2',
              'key_orig_2' : 'val_orig_2'}

    # without a policy, merging a conflicting key must raise
    try :
        ru.dict_merge (dict_1, dict_2)
        assert (False), 'expected ValueError exception'
    except ValueError :
        pass
    except Exception as e :
        assert (False), 'expected ValueError exception, not %s' % e

    # 'preserve' keeps the original value for shared keys
    # (compare sorted key lists -- dict key order is not guaranteed)
    ru.dict_merge (dict_1, dict_2, policy='preserve')

    assert (sorted(dict_1.keys()) == ['key_orig_1', 'key_orig_2', 'key_shared'])
    assert (dict_1['key_shared'] == 'val_shared_1')
    assert (dict_1['key_orig_1'] == 'val_orig_1')
    assert (dict_1['key_orig_2'] == 'val_orig_2')

    # 'overwrite' replaces the original value for shared keys
    ru.dict_merge (dict_1, dict_2, policy='overwrite')

    assert (sorted(dict_1.keys()) == ['key_orig_1', 'key_orig_2', 'key_shared'])
    assert (dict_1['key_shared'] == 'val_shared_2')
    assert (dict_1['key_orig_1'] == 'val_orig_1')
    assert (dict_1['key_orig_2'] == 'val_orig_2')
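# ------------------------------------------------------------------------------
# The test above only covers flat dicts, while most call sites in this
# collection merge *nested* configuration dicts (resource configs, agent
# configs) and rely on ru.dict_merge recursing into sub-dicts and applying the
# policy per leaf key.  The sketch below illustrates that usage; the recursive
# behavior shown is an assumption inferred from those call sites, not from the
# flat-dict test.
import radical.utils as ru

system_cfg = {'resource': {'queue': 'normal', 'cores_per_node': 16}}
user_cfg   = {'resource': {'queue': 'debug'}}

ru.dict_merge(system_cfg, user_cfg, policy='overwrite')

# expected (assuming a recursive merge):
#   system_cfg == {'resource': {'queue': 'debug', 'cores_per_node': 16}}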
def stop(self):

    # During LRMS termination, we call any existing shutdown hooks on the
    # launch methods.  We only call LM shutdown hooks *once*.
    launch_methods = set()  # set keeps entries unique

    if 'mpi_launch_method' in self._cfg:
        launch_methods.add(self._cfg['mpi_launch_method'])

    launch_methods.add(self._cfg['task_launch_method'])
    launch_methods.add(self._cfg['agent_launch_method'])

    for lm in launch_methods:
        if lm:
            try:
                from .... import pilot as rp
                ru.dict_merge(self.lm_info,
                              rp.agent.LM.lrms_shutdown_hook(lm, self._cfg,
                                                             self, self.lm_info,
                                                             self._log))
            except Exception as e:
                self._log.exception("lrms shutdown hook failed")
                raise

            self._log.info("lrms shutdown hook succeeded (%s)" % lm)
def __init__(self, agent_name):

    assert(agent_name != 'agent_0'), 'expect subagent, not agent_0'
    print "startup agent %s" % agent_name

    # load config, create session and controller, init rpu.Worker
    agent_cfg = "%s/%s.cfg" % (os.getcwd(), agent_name)
    cfg       = ru.read_json_str(agent_cfg)

    self._uid         = agent_name
    self._pid         = cfg['pilot_id']
    self._sid         = cfg['session_id']
    self._final_cause = None

    # Create a session.
    #
    # This session will not connect to MongoDB, but will create any
    # communication channels and components/workers specified in the
    # config -- we merge that information into our own config.
    # We don't want the session to start components though, so remove them
    # from the config copy.
    session_cfg = copy.deepcopy(cfg)
    session_cfg['owner']      = self._uid
    session_cfg['components'] = dict()
    session = rp_Session(cfg=session_cfg, _connect=False, uid=self._sid)

    # we still want the bridge addresses known though, so make sure they are
    # merged into our own copy, along with any other additions done by the
    # session.
    ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
    pprint.pprint(cfg)

    if session.is_connected:
        raise RuntimeError('agent_n should not connect to mongodb')

    # at this point the session is up and working, and the session
    # controller should have brought up all communication bridges and the
    # agent components.  We are ready to roll!
    rpu.Worker.__init__(self, cfg, session)
def stop(self):

    # During LRMS termination, we call any existing shutdown hooks on the
    # launch methods.  We only call LM shutdown hooks *once*.
    launch_methods = set()  # set keeps entries unique

    if 'mpi_launch_method' in self._cfg:
        launch_methods.add(self._cfg['mpi_launch_method'])

    launch_methods.add(self._cfg['task_launch_method'])
    launch_methods.add(self._cfg['agent_launch_method'])

    for lm in launch_methods:
        if lm:
            try:
                from .... import pilot as rp
                ru.dict_merge(self.lm_info,
                              rp.agent.LM.lrms_shutdown_hook(lm, self._cfg,
                                                             self, self.lm_info,
                                                             self._log,
                                                             self._prof))
            except Exception as e:
                self._log.exception("lrms shutdown hook failed")
                raise

            self._log.info("lrms shutdown hook succeeded (%s)" % lm)
def stop(self):

    # During LRMS termination, we call any existing shutdown hooks on the
    # launch methods.  We only call LM shutdown hooks *once*.
    launch_methods = set()
    launch_methods.add(self._cfg.get('mpi_launch_method'))
    launch_methods.add(self._cfg.get('task_launch_method'))
    launch_methods.add(self._cfg.get('agent_launch_method'))
    launch_methods.discard(None)

    for lm in launch_methods:
        try:
            from .... import pilot as rp
            ru.dict_merge(self.lm_info,
                          rp.agent.LM.lrms_shutdown_hook(lm, self._cfg, self,
                                                         self.lm_info,
                                                         self._log, self._prof))
        except Exception as e:
            self._log.exception("lrms shutdown hook failed: %s" % e)
            raise

        self._log.info("lrms shutdown hook succeeded (%s)" % lm)
def _write_sa_configs(self):

    # we have all information needed by the subagents -- write the
    # sub-agent config files.

    # write deep-copies of the config for each sub-agent (except agent_0)
    for sa in self._cfg.get('agents', {}):

        assert(sa != 'agent_0'), 'expect subagent, not agent_0'

        # use our own config sans agents/components as a basis for
        # the sub-agent config.
        tmp_cfg = copy.deepcopy(self._cfg)
        tmp_cfg['agents']     = dict()
        tmp_cfg['components'] = dict()

        # merge sub_agent layout into the config
        ru.dict_merge(tmp_cfg, self._cfg['agents'][sa], ru.OVERWRITE)

        tmp_cfg['agent_name'] = sa
        tmp_cfg['owner']      = 'agent_0'

        ru.write_json(tmp_cfg, './%s.cfg' % sa)
def _get_config(self, cfg=None):
    '''
    derive a worker base configuration from the control pubsub configuration
    '''

    # FIXME: this uses insider knowledge on the config location and
    #        structure.  It would be better if agent.0 creates the worker
    #        base config from scratch on startup.
    pwd = os.getcwd()
    ru.dict_merge(cfg, ru.read_json('%s/../control_pubsub.json' % pwd))

    del (cfg['channel'])
    del (cfg['cmgr'])

    cfg['log_lvl'] = 'debug'
    cfg['kind']    = 'master'
    cfg['base']    = pwd
    cfg['uid']     = ru.generate_id('master.%(item_counter)06d',
                                    ru.ID_CUSTOM, ns=self._session.uid)

    return ru.Config(cfg=cfg)
def setUp(test_type, test_name):

    ret = list()

    for fin in glob.glob('tests/test_cases/unit.*.json'):

        tc     = ru.read_json(fin)
        unit   = tc['unit']
        setup  = tc['setup'].get(test_type, {})
        result = tc['results'].get(test_type, {}).get(test_name)
        resource_file     = tc['results'].get('resource_file', {}).get(test_name)
        resource_filename = tc['results'].get('resource_filename', {}).get(test_name)
        test   = ru.dict_merge(unit, setup, ru.PRESERVE)

        if result:
            if resource_file and resource_filename:
                ret.append([test, result, resource_file, resource_filename])
            else:
                ret.append([test, result])

    return ret
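# ------------------------------------------------------------------------------
# Note on the calling conventions seen in these snippets: some call sites pass
# the policy as a keyword string (policy='preserve'), others pass the
# module-level constants ru.PRESERVE / ru.OVERWRITE positionally, and the test
# setup above assigns the return value.  The sketch below assumes, as that
# assignment suggests, that dict_merge updates the first dict in place *and*
# returns it; the dict contents are made up for illustration.
import radical.utils as ru

unit  = {'cpu_processes': 1, 'executable': '/bin/date'}
setup = {'cpu_processes': 4, 'pre_exec'  : ['module load foo']}

test = ru.dict_merge(unit, setup, ru.PRESERVE)

assert test is unit                         # merged in place, target returned
assert test['cpu_processes'] == 1           # 'preserve' keeps the original value
assert test['pre_exec'] == ['module load foo']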
def __init__(self, agent_name):

    assert(agent_name == 'agent_0'), 'expect agent_0, not subagent'
    print 'startup agent %s' % agent_name

    # load config, create session, init rpu.Worker
    agent_cfg = '%s/%s.cfg' % (os.getcwd(), agent_name)
    cfg       = ru.read_json_str(agent_cfg)

    cfg['agent_name'] = agent_name

    self._uid         = agent_name
    self._pid         = cfg['pilot_id']
    self._sid         = cfg['session_id']
    self._runtime     = cfg['runtime']
    self._starttime   = time.time()
    self._final_cause = None
    self._lrms        = None

    # this better be on a shared FS!
    cfg['workdir'] = os.getcwd()

    # sanity check on config settings
    if 'cores'              not in cfg: raise ValueError('Missing number of cores')
    if 'lrms'               not in cfg: raise ValueError('Missing LRMS')
    if 'dburl'              not in cfg: raise ValueError('Missing DBURL')
    if 'pilot_id'           not in cfg: raise ValueError('Missing pilot id')
    if 'runtime'            not in cfg: raise ValueError('Missing or zero agent runtime')
    if 'scheduler'          not in cfg: raise ValueError('Missing agent scheduler')
    if 'session_id'         not in cfg: raise ValueError('Missing session id')
    if 'spawner'            not in cfg: raise ValueError('Missing agent spawner')
    if 'task_launch_method' not in cfg: raise ValueError('Missing unit launch method')

    # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold
    # the address of the tunnelized DB endpoint.  If it exists, we
    # overrule the agent config with it.
    hostport = os.environ.get('RADICAL_PILOT_DB_HOSTPORT')
    if hostport:
        dburl = ru.Url(cfg['dburl'])
        dburl.host, dburl.port = hostport.split(':')
        cfg['dburl'] = str(dburl)

    # Create a session.
    #
    # This session will connect to MongoDB, and will also create any
    # communication channels and components/workers specified in the
    # config -- we merge that information into our own config.
    # We don't want the session to start components though, so remove them
    # from the config copy.
    session_cfg = copy.deepcopy(cfg)
    session_cfg['components'] = dict()
    session = rp_Session(cfg=session_cfg, uid=self._sid)

    # we still want the bridge addresses known though, so make sure they are
    # merged into our own copy, along with any other additions done by the
    # session.
    ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
    pprint.pprint(cfg)

    if not session.is_connected:
        raise RuntimeError('agent_0 could not connect to mongodb')

    # at this point the session is up and connected, and it should have
    # brought up all communication bridges and the UpdateWorker.  We are
    # ready to rumble!
    rpu.Worker.__init__(self, cfg, session)

    # this is the earliest point to sync bootstrapper and agent profiles
    self._prof.prof('sync_rel', msg='agent_0 start', uid=self._pid)

    # Create LRMS which will give us the set of agent_nodes to use for
    # sub-agent startup.  Add the remaining LRMS information to the
    # config, for the benefit of the scheduler.
    self._lrms = rpa_rm.RM.create(name=self._cfg['lrms'],
                                  cfg=self._cfg,
                                  session=self._session)

    # add the resource manager information to our own config
    self._cfg['lrms_info'] = self._lrms.lrms_info
def _prepare_pilot(self, resource, rcfg, pilot): pid = pilot["uid"] ret = {'ft' : list(), 'jd' : None } # # ---------------------------------------------------------------------- # # the rcfg can contain keys with string expansion placeholders where # # values from the pilot description need filling in. A prominent # # example is `%(pd.project)s`, where the pilot description's `PROJECT` # # value needs to be filled in (here in lowercase). # expand = dict() # for k,v in pilot['description'].iteritems(): # if v is None: # v = '' # expand['pd.%s' % k] = v # if isinstance(v, basestring): # expand['pd.%s' % k.upper()] = v.upper() # expand['pd.%s' % k.lower()] = v.lower() # else: # expand['pd.%s' % k.upper()] = v # expand['pd.%s' % k.lower()] = v # # for k in rcfg: # if isinstance(rcfg[k], basestring): # orig = rcfg[k] # rcfg[k] = rcfg[k] % expand # expanded = rcfg[k] # if orig != expanded: # self._log.debug('RCFG:\n%s\n%s', orig, expanded) # ---------------------------------------------------------------------- # Database connection parameters sid = self._session.uid database_url = self._session.dburl # some default values are determined at runtime default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \ (resource, self._rp_version) # ---------------------------------------------------------------------- # pilot description and resource configuration number_cores = pilot['description']['cores'] number_gpus = pilot['description']['gpus'] runtime = pilot['description']['runtime'] queue = pilot['description']['queue'] project = pilot['description']['project'] cleanup = pilot['description']['cleanup'] memory = pilot['description']['memory'] candidate_hosts = pilot['description']['candidate_hosts'] # ---------------------------------------------------------------------- # get parameters from resource cfg, set defaults where needed agent_launch_method = rcfg.get('agent_launch_method') agent_dburl = rcfg.get('agent_mongodb_endpoint', database_url) agent_spawner = rcfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER) rc_agent_config = rcfg.get('agent_config', DEFAULT_AGENT_CONFIG) agent_scheduler = rcfg.get('agent_scheduler') tunnel_bind_device = rcfg.get('tunnel_bind_device') default_queue = rcfg.get('default_queue') forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint') lrms = rcfg.get('lrms') mpi_launch_method = rcfg.get('mpi_launch_method', '') pre_bootstrap_0 = rcfg.get('pre_bootstrap_0', []) pre_bootstrap_1 = rcfg.get('pre_bootstrap_1', []) python_interpreter = rcfg.get('python_interpreter') task_launch_method = rcfg.get('task_launch_method') rp_version = rcfg.get('rp_version', DEFAULT_RP_VERSION) virtenv_mode = rcfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE) virtenv = rcfg.get('virtenv', default_virtenv) cores_per_node = rcfg.get('cores_per_node', 0) gpus_per_node = rcfg.get('gpus_per_node', 0) lfs_path_per_node = rcfg.get('lfs_path_per_node', None) lfs_size_per_node = rcfg.get('lfs_size_per_node', 0) python_dist = rcfg.get('python_dist') virtenv_dist = rcfg.get('virtenv_dist', DEFAULT_VIRTENV_DIST) cu_tmp = rcfg.get('cu_tmp') spmd_variation = rcfg.get('spmd_variation') shared_filesystem = rcfg.get('shared_filesystem', True) stage_cacerts = rcfg.get('stage_cacerts', False) cu_pre_exec = rcfg.get('cu_pre_exec') cu_post_exec = rcfg.get('cu_post_exec') export_to_cu = rcfg.get('export_to_cu') mandatory_args = rcfg.get('mandatory_args', []) saga_jd_supplement = rcfg.get('saga_jd_supplement', {}) import pprint self._log.debug(cores_per_node) self._log.debug(pprint.pformat(rcfg)) # make sure that 
mandatory args are known for ma in mandatory_args: if pilot['description'].get(ma) is None: raise ValueError('attribute "%s" is required for "%s"' % (ma, resource)) # get pilot and global sandbox resource_sandbox = self._session._get_resource_sandbox (pilot).path session_sandbox = self._session._get_session_sandbox(pilot).path pilot_sandbox = self._session._get_pilot_sandbox (pilot).path pilot['resource_sandbox'] = str(self._session._get_resource_sandbox(pilot)) pilot['pilot_sandbox'] = str(self._session._get_pilot_sandbox(pilot)) pilot['client_sandbox'] = str(self._session._get_client_sandbox()) # Agent configuration that is not part of the public API. # The agent config can either be a config dict, or # a string pointing to a configuration name. If neither # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is # set. The last fallback is 'agent_default' agent_config = pilot['description'].get('_config') if not agent_config: agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG') if not agent_config: agent_config = rc_agent_config if isinstance(agent_config, dict): # use dict as is agent_cfg = agent_config elif isinstance(agent_config, basestring): try: # interpret as a config name agent_cfg_file = os.path.join(self._conf_dir, "agent_%s.json" % agent_config) self._log.info("Read agent config file: %s", agent_cfg_file) agent_cfg = ru.read_json(agent_cfg_file) # allow for user level overload user_cfg_file = '%s/.radical/pilot/config/%s' \ % (os.environ['HOME'], os.path.basename(agent_cfg_file)) if os.path.exists(user_cfg_file): self._log.info("merging user config: %s" % user_cfg_file) user_cfg = ru.read_json(user_cfg_file) ru.dict_merge (agent_cfg, user_cfg, policy='overwrite') except Exception as e: self._log.exception("Error reading agent config file: %s" % e) raise else: # we can't handle this type raise TypeError('agent config must be string (config name) or dict') # expand variables in virtenv string virtenv = virtenv % {'pilot_sandbox' : pilot_sandbox, 'session_sandbox' : session_sandbox, 'resource_sandbox': resource_sandbox} # Check for deprecated global_virtenv if 'global_virtenv' in rcfg: raise RuntimeError("'global_virtenv' is deprecated (%s)" % resource) # Create a host:port string for use by the bootstrap_0. 
db_url = rs.Url(agent_dburl) if db_url.port: db_hostport = "%s:%d" % (db_url.host, db_url.port) else: db_hostport = "%s:%d" % (db_url.host, 27017) # mongodb default # ---------------------------------------------------------------------- # the version of the agent is derived from # rp_version, which has the following format # and interpretation: # # case rp_version: # @<token>: # @tag/@branch/@commit: # no sdist staging # git clone $github_base radical.pilot.src # (cd radical.pilot.src && git checkout token) # pip install -t $VIRTENV/rp_install/ radical.pilot.src # rm -rf radical.pilot.src # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # release: # no sdist staging # pip install -t $VIRTENV/rp_install radical.pilot # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # local: # needs sdist staging # tar zxf $sdist.tgz # pip install -t $VIRTENV/rp_install $sdist/ # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # debug: # needs sdist staging # tar zxf $sdist.tgz # pip install -t $SANDBOX/rp_install $sdist/ # export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH # # installed: # no sdist staging # true # esac # # virtenv_mode # private : error if ve exists, otherwise create, then use # update : update if ve exists, otherwise create, then use # create : use if ve exists, otherwise create, then use # use : use if ve exists, otherwise error, then exit # recreate: delete if ve exists, otherwise create, then use # # examples : # [email protected] # virtenv@devel # virtenv@release # virtenv@installed # stage@local # stage@/tmp/my_agent.py # # Note that some combinations may be invalid, # specifically in the context of virtenv_mode. If, for # example, virtenv_mode is 'use', then the 'virtenv:tag' # will not make sense, as the virtenv is not updated. # In those cases, the virtenv_mode is honored, and # a warning is printed. # # Also, the 'stage' mode can only be combined with the # 'local' source, or with a path to the agent (relative # to root_dir, or absolute). # # A rp_version which does not adhere to the # above syntax is ignored, and the fallback stage@local # is used. if not rp_version.startswith('@') and \ not rp_version in ['installed', 'local', 'debug', 'release']: raise ValueError("invalid rp_version '%s'" % rp_version) if rp_version.startswith('@'): rp_version = rp_version[1:] # strip '@' # ---------------------------------------------------------------------- # sanity checks if not python_dist : raise RuntimeError("missing python distribution") if not virtenv_dist : raise RuntimeError("missing virtualenv distribution") if not agent_spawner : raise RuntimeError("missing agent spawner") if not agent_scheduler : raise RuntimeError("missing agent scheduler") if not lrms : raise RuntimeError("missing LRMS") if not agent_launch_method: raise RuntimeError("missing agentlaunch method") if not task_launch_method : raise RuntimeError("missing task launch method") # massage some values if not queue : queue = default_queue if cleanup and isinstance (cleanup, bool) : # l : log files # u : unit work dirs # v : virtualenv # e : everything (== pilot sandbox) if shared_filesystem: cleanup = 'luve' else: # we cannot clean the sandbox from within the agent, as the hop # staging would then fail, and we'd get nothing back. # FIXME: cleanup needs to be done by the pmgr.launcher, or # someone else, really, after fetching all logs and # profiles. 
cleanup = 'luv' # we never cleanup virtenvs which are not private if virtenv_mode is not 'private' : cleanup = cleanup.replace ('v', '') # add dists to staging files, if needed if rp_version in ['local', 'debug']: sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name] sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path] else: sdist_names = list() sdist_paths = list() # if cores_per_node is set (!= None), then we need to # allocation full nodes, and thus round up if cores_per_node: cores_per_node = int(cores_per_node) number_cores = int(cores_per_node * math.ceil(float(number_cores) / cores_per_node)) # if gpus_per_node is set (!= None), then we need to # allocation full nodes, and thus round up if gpus_per_node: gpus_per_node = int(gpus_per_node) number_gpus = int(gpus_per_node * math.ceil(float(number_gpus) / gpus_per_node)) # set mandatory args bootstrap_args = "" bootstrap_args += " -d '%s'" % ':'.join(sdist_names) bootstrap_args += " -p '%s'" % pid bootstrap_args += " -s '%s'" % sid bootstrap_args += " -m '%s'" % virtenv_mode bootstrap_args += " -r '%s'" % rp_version bootstrap_args += " -b '%s'" % python_dist bootstrap_args += " -g '%s'" % virtenv_dist bootstrap_args += " -v '%s'" % virtenv bootstrap_args += " -y '%d'" % runtime # set optional args if lrms == "CCM": bootstrap_args += " -c" if forward_tunnel_endpoint: bootstrap_args += " -f '%s'" % forward_tunnel_endpoint if forward_tunnel_endpoint: bootstrap_args += " -h '%s'" % db_hostport if python_interpreter: bootstrap_args += " -i '%s'" % python_interpreter if tunnel_bind_device: bootstrap_args += " -t '%s'" % tunnel_bind_device if cleanup: bootstrap_args += " -x '%s'" % cleanup for arg in pre_bootstrap_0: bootstrap_args += " -e '%s'" % arg for arg in pre_bootstrap_1: bootstrap_args += " -w '%s'" % arg agent_cfg['owner'] = 'agent_0' agent_cfg['cores'] = number_cores agent_cfg['gpus'] = number_gpus agent_cfg['lrms'] = lrms agent_cfg['spawner'] = agent_spawner agent_cfg['scheduler'] = agent_scheduler agent_cfg['runtime'] = runtime agent_cfg['dburl'] = str(database_url) agent_cfg['session_id'] = sid agent_cfg['pilot_id'] = pid agent_cfg['logdir'] = '.' agent_cfg['pilot_sandbox'] = pilot_sandbox agent_cfg['session_sandbox'] = session_sandbox agent_cfg['resource_sandbox'] = resource_sandbox agent_cfg['agent_launch_method']= agent_launch_method agent_cfg['task_launch_method'] = task_launch_method agent_cfg['mpi_launch_method'] = mpi_launch_method agent_cfg['cores_per_node'] = cores_per_node agent_cfg['gpus_per_node'] = gpus_per_node agent_cfg['lfs_path_per_node'] = lfs_path_per_node agent_cfg['lfs_size_per_node'] = lfs_size_per_node agent_cfg['cu_tmp'] = cu_tmp agent_cfg['export_to_cu'] = export_to_cu agent_cfg['cu_pre_exec'] = cu_pre_exec agent_cfg['cu_post_exec'] = cu_post_exec agent_cfg['resource_cfg'] = copy.deepcopy(rcfg) agent_cfg['debug'] = self._log.getEffectiveLevel() # we'll also push the agent config into MongoDB pilot['cfg'] = agent_cfg # ---------------------------------------------------------------------- # Write agent config dict to a json file in pilot sandbox. 
agent_cfg_name = 'agent_0.cfg' cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.') os.close(cfg_tmp_handle) # file exists now # Convert dict to json file self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file) self._log.debug(pprint.pformat(agent_cfg)) ru.write_json(agent_cfg, cfg_tmp_file) ret['ft'].append({'src' : cfg_tmp_file, 'tgt' : '%s/%s' % (pilot_sandbox, agent_cfg_name), 'rem' : True}) # purge the tmp file after packing # ---------------------------------------------------------------------- # we also touch the log and profile tarballs in the target pilot sandbox ret['ft'].append({'src' : '/dev/null', 'tgt' : '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid), 'rem' : False}) # don't remove /dev/null # only stage profiles if we profile if self._prof.enabled: ret['ft'].append({ 'src' : '/dev/null', 'tgt' : '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid), 'rem' : False}) # don't remove /dev/null # check if we have a sandbox cached for that resource. If so, we have # nothing to do. Otherwise we create the sandbox and stage the RP # stack etc. # NOTE: this will race when multiple pilot launcher instances are used! with self._cache_lock: if resource not in self._sandboxes: for sdist in sdist_paths: base = os.path.basename(sdist) ret['ft'].append({'src' : sdist, 'tgt' : '%s/%s' % (session_sandbox, base), 'rem' : False}) # Copy the bootstrap shell script. bootstrapper_path = os.path.abspath("%s/agent/%s" % (self._root_dir, BOOTSTRAPPER_0)) self._log.debug("use bootstrapper %s", bootstrapper_path) ret['ft'].append({'src' : bootstrapper_path, 'tgt' : '%s/%s' % (session_sandbox, BOOTSTRAPPER_0), 'rem' : False}) # Some machines cannot run pip due to outdated CA certs. # For those, we also stage an updated certificate bundle # TODO: use booleans all the way? if stage_cacerts: cc_name = 'cacert.pem.gz' cc_path = os.path.abspath("%s/agent/%s" % (self._root_dir, cc_name)) self._log.debug("use CAs %s", cc_path) ret['ft'].append({'src' : cc_path, 'tgt' : '%s/%s' % (session_sandbox, cc_name), 'rem' : False}) self._sandboxes[resource] = True # ---------------------------------------------------------------------- # Create SAGA Job description and submit the pilot job jd = rs.job.Description() if shared_filesystem: bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0) else: bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0) jd.name = pid jd.executable = "/bin/bash" jd.arguments = ['-l %s' % bootstrap_tgt, bootstrap_args] jd.working_directory = pilot_sandbox jd.project = project jd.output = "bootstrap_0.out" jd.error = "bootstrap_0.err" jd.total_cpu_count = number_cores jd.total_gpu_count = number_gpus jd.processes_per_host = cores_per_node jd.spmd_variation = spmd_variation jd.wall_time_limit = runtime jd.total_physical_memory = memory jd.queue = queue jd.candidate_hosts = candidate_hosts jd.environment = dict() # we set any saga_jd_supplement keys which are not already set above for key, val in saga_jd_supplement.iteritems(): if not jd[key]: self._log.debug('supplement %s: %s', key, val) jd[key] = val if 'RADICAL_PILOT_PROFILE' in os.environ : jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE' # for condor backends and the like which do not have shared FSs, we add # additional staging directives so that the backend system binds the # files from the session and pilot sandboxes to the pilot job. 
jd.file_transfer = list() if not shared_filesystem: jd.file_transfer.extend([ 'site:%s/%s > %s' % (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0), 'site:%s/%s > %s' % (pilot_sandbox, agent_cfg_name, agent_cfg_name), 'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid), 'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid) ]) if 'RADICAL_PILOT_PROFILE' in os.environ: jd.file_transfer.extend([ 'site:%s/%s.prof.tgz > %s.prof.tgz' % (pilot_sandbox, pid, pid), 'site:%s/%s.prof.tgz < %s.prof.tgz' % (pilot_sandbox, pid, pid) ]) for sdist in sdist_names: jd.file_transfer.extend([ 'site:%s/%s > %s' % (session_sandbox, sdist, sdist) ]) if stage_cacerts: jd.file_transfer.extend([ 'site:%s/%s > %s' % (session_sandbox, cc_name, cc_name) ]) self._log.debug("Bootstrap command line: %s %s", jd.executable, jd.arguments) ret['jd'] = jd return ret
def profile(command, *args, **kwargs):

    if callable(command):
        cmd_str = "%s %s %s" % (command.__name__, str(args), str(kwargs))
    else:
        cmd_str = command

    print "profile: %s" % cmd_str

    if '_RADICAL_SYNAPSE_EMULATED' in os.environ:
        cmd_str = os.environ.get('_RADICAL_SYNAPSE_EMULATEE', cmd_str)
        print 'using emulated command name: %s' % cmd_str

    info = {'cmd': cmd_str}

    # start stress, get it spinning for one min to get a confirmed load
    # measurement, then run our own load, then kill stress.
    if LOAD > 0:
        rsu.logger.info("creating system load %s" % LOAD)
        os.popen("killall -9 stress 2>&1 > /dev/null")
        os.popen('stress --cpu %s &' % LOAD)
        time.sleep(60)

    load_1 = float(os.popen(LOAD_CMD).read())
    start  = rsu.timestamp()

    os.environ['_RADICAL_SYNAPSE_PROFILED'] = 'TRUE'

    # run the profiled function/command in a separate process
    if callable(command):
        proc = mp.Process(target=command, args=args, kwargs=kwargs)
        proc.start()
    else:
        proc = sp.Popen(command.split(), stdout=sp.PIPE, stderr=sp.STDOUT)

    watch_mode = os.environ.get('RADICAL_SYNAPSE_WATCHMODE', 'full').lower()
    watchers   = list()

    if watch_mode == 'full':
        watchers.append(rsw.WatcherCPU(proc.pid))
        watchers.append(rsw.WatcherSto(proc.pid))
        watchers.append(rsw.WatcherMem(proc.pid))

    # watchmode 'basic'
    watchers.append(rsw.WatcherSys(proc.pid))

    if callable(command):
        proc.join()
        out = ""
        ret = None
    else:
        out = proc.communicate()[0]
        ret = proc.returncode

    stop = rsu.timestamp()

    info['time'] = dict()
    info['time']['start'] = rsu.time_zero()
    info['time']['real']  = stop - start

    for watcher in reversed(watchers):
        watcher.stop()
        watcher.join()
        ru.dict_merge(info, watcher.get_data())

    # allow watchers to finalize some stuff, now having data from other
    # watchers available
    for watcher in reversed(watchers):
        watcher.finalize(info)

    time_2 = rsu.timestamp()
    load_2 = float(os.popen(LOAD_CMD).read())
    info['cpu']['load'] = max(load_1, load_2)
    rsu.logger.info("system load %s: %s" % (LOAD, info['cpu']['load']))

    if LOAD > 0:
        rsu.logger.info("stopping system load")
        os.popen("killall -9 stress 2>&1 > /dev/null")
        rsu.logger.info("stopped system load")

    return info, ret, out
def __init__(self, url, session=None, logger=None, opts=None,
             posix=True, interactive=True):

    if logger: self.logger = logger
    else     : self.logger = ru.Logger('radical.saga.pty')

    if session: self.session = session
    else      : self.session = ss.Session(default=True)

    self.logger.debug("PTYShell init %s" % self)

    self.url         = url          # describes the shell to run
    self.posix       = posix        # /bin/sh compatible?
    self.interactive = interactive  # bash -i ?
    self.latency     = 0.0          # set by factory
    self.cp_slave    = None         # file copy channel
    self.initialized = False

    self.pty_id       = PTYShell._pty_id
    PTYShell._pty_id += 1

    self.cfg = ru.Config('radical.saga', 'utils')['pty']

    # opts passed on construction overwrite file config
    if opts:
        self.cfg = ru.dict_merge(self.cfg, opts, policy='overwrite')

    # get prompt pattern from config, or use default
    self.prompt    = self.cfg.get('prompt_pattern', DEFAULT_PROMPT)
    self.prompt_re = re.compile("^(.*?)%s" % self.prompt, re.DOTALL)
    self.logger.info("PTY prompt pattern: %s" % self.prompt)

    # we need a local dir for file staging caches.  At this point we use
    # $HOME, but should make this configurable (FIXME)
    self.base = os.environ['HOME'] + '/.radical/saga/adaptors/shell/'

    try:
        os.makedirs(self.base)
    except OSError as e:
        if e.errno == errno.EEXIST and os.path.isdir(self.base):
            pass
        else:
            raise rse.NoSuccess("could not create staging dir: %s" % e)

    self.factory   = supsf.PTYShellFactory()
    self.pty_info  = self.factory.initialize(self.url, self.session,
                                             self.prompt, self.logger,
                                             self.cfg, self.posix,
                                             interactive=self.interactive)
    self.pty_shell = self.factory.run_shell(self.pty_info)

    self._trace('init : %s' % self.pty_shell.command)
    self.initialize()
def _prepare_pilot(self, resource, rcfg, pilot): pid = pilot["uid"] ret = {'ft': list(), 'jd': None} # ------------------------------------------------------------------ # Database connection parameters sid = self._session.uid database_url = self._session.dburl # some default values are determined at runtime default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \ (resource, self._rp_version) # ------------------------------------------------------------------ # get parameters from resource cfg, set defaults where needed agent_launch_method = rcfg.get('agent_launch_method') agent_dburl = rcfg.get('agent_mongodb_endpoint', database_url) agent_spawner = rcfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER) rc_agent_config = rcfg.get('agent_config', DEFAULT_AGENT_CONFIG) agent_scheduler = rcfg.get('agent_scheduler') tunnel_bind_device = rcfg.get('tunnel_bind_device') default_queue = rcfg.get('default_queue') forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint') lrms = rcfg.get('lrms') mpi_launch_method = rcfg.get('mpi_launch_method', '') pre_bootstrap_1 = rcfg.get('pre_bootstrap_1', []) pre_bootstrap_2 = rcfg.get('pre_bootstrap_2', []) python_interpreter = rcfg.get('python_interpreter') task_launch_method = rcfg.get('task_launch_method') rp_version = rcfg.get('rp_version', DEFAULT_RP_VERSION) virtenv_mode = rcfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE) virtenv = rcfg.get('virtenv', default_virtenv) cores_per_node = rcfg.get('cores_per_node', 0) health_check = rcfg.get('health_check', True) python_dist = rcfg.get('python_dist') virtenv_dist = rcfg.get('virtenv_dist', DEFAULT_VIRTENV_DIST) cu_tmp = rcfg.get('cu_tmp') spmd_variation = rcfg.get('spmd_variation') shared_filesystem = rcfg.get('shared_filesystem', True) stage_cacerts = rcfg.get('stage_cacerts', False) cu_pre_exec = rcfg.get('cu_pre_exec') cu_post_exec = rcfg.get('cu_post_exec') export_to_cu = rcfg.get('export_to_cu') mandatory_args = rcfg.get('mandatory_args', []) # ------------------------------------------------------------------ # get parameters from the pilot description number_cores = pilot['description']['cores'] runtime = pilot['description']['runtime'] queue = pilot['description']['queue'] project = pilot['description']['project'] cleanup = pilot['description']['cleanup'] memory = pilot['description']['memory'] candidate_hosts = pilot['description']['candidate_hosts'] # make sure that mandatory args are known for ma in mandatory_args: if pilot['description'].get(ma) is None: raise ValueError('attribute "%s" is required for "%s"' \ % (ma, resource)) # get pilot and global sandbox resource_sandbox = self._session._get_resource_sandbox(pilot).path session_sandbox = self._session._get_session_sandbox(pilot).path pilot_sandbox = self._session._get_pilot_sandbox(pilot).path pilot['resource_sandbox'] = str( self._session._get_resource_sandbox(pilot)) pilot['pilot_sandbox'] = str(self._session._get_pilot_sandbox(pilot)) pilot['client_sandbox'] = str(self._session._get_client_sandbox()) # Agent configuration that is not part of the public API. # The agent config can either be a config dict, or # a string pointing to a configuration name. If neither # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is # set. 
The last fallback is 'agent_default' agent_config = pilot['description'].get('_config') if not agent_config: agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG') if not agent_config: agent_config = rc_agent_config if isinstance(agent_config, dict): # use dict as is agent_cfg = agent_config elif isinstance(agent_config, basestring): try: # interpret as a config name agent_cfg_file = os.path.join(self._conf_dir, "agent_%s.json" % agent_config) self._log.info("Read agent config file: %s", agent_cfg_file) agent_cfg = ru.read_json(agent_cfg_file) # allow for user level overload user_cfg_file = '%s/.radical/pilot/config/%s' \ % (os.environ['HOME'], os.path.basename(agent_cfg_file)) if os.path.exists(user_cfg_file): self._log.info("merging user config: %s" % user_cfg_file) user_cfg = ru.read_json(user_cfg_file) ru.dict_merge(agent_cfg, user_cfg, policy='overwrite') except Exception as e: self._log.exception("Error reading agent config file: %s" % e) raise else: # we can't handle this type raise TypeError( 'agent config must be string (config name) or dict') # expand variables in virtenv string virtenv = virtenv % { 'pilot_sandbox': pilot_sandbox, 'session_sandbox': session_sandbox, 'resource_sandbox': resource_sandbox } # Check for deprecated global_virtenv if 'global_virtenv' in rcfg: raise RuntimeError("'global_virtenv' is deprecated (%s)" % resource) # Create a host:port string for use by the bootstrap_1. db_url = rs.Url(agent_dburl) if db_url.port: db_hostport = "%s:%d" % (db_url.host, db_url.port) else: db_hostport = "%s:%d" % (db_url.host, 27017) # mongodb default # ------------------------------------------------------------------ # the version of the agent is derived from # rp_version, which has the following format # and interpretation: # # case rp_version: # @<token>: # @tag/@branch/@commit: # no sdist staging # git clone $github_base radical.pilot.src # (cd radical.pilot.src && git checkout token) # pip install -t $VIRTENV/rp_install/ radical.pilot.src # rm -rf radical.pilot.src # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # release: # no sdist staging # pip install -t $VIRTENV/rp_install radical.pilot # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # local: # needs sdist staging # tar zxf $sdist.tgz # pip install -t $VIRTENV/rp_install $sdist/ # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # debug: # needs sdist staging # tar zxf $sdist.tgz # pip install -t $SANDBOX/rp_install $sdist/ # export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH # # installed: # no sdist staging # true # esac # # virtenv_mode # private : error if ve exists, otherwise create, then use # update : update if ve exists, otherwise create, then use # create : use if ve exists, otherwise create, then use # use : use if ve exists, otherwise error, then exit # recreate: delete if ve exists, otherwise create, then use # # examples : # [email protected] # virtenv@devel # virtenv@release # virtenv@installed # stage@local # stage@/tmp/my_agent.py # # Note that some combinations may be invalid, # specifically in the context of virtenv_mode. If, for # example, virtenv_mode is 'use', then the 'virtenv:tag' # will not make sense, as the virtenv is not updated. # In those cases, the virtenv_mode is honored, and # a warning is printed. # # Also, the 'stage' mode can only be combined with the # 'local' source, or with a path to the agent (relative # to root_dir, or absolute). # # A rp_version which does not adhere to the # above syntax is ignored, and the fallback stage@local # is used. 
if not rp_version.startswith('@') and \ not rp_version in ['installed', 'local', 'debug', 'release']: raise ValueError("invalid rp_version '%s'" % rp_version) if rp_version.startswith('@'): rp_version = rp_version[1:] # strip '@' # ------------------------------------------------------------------ # sanity checks if not python_dist: raise RuntimeError("missing python distribution") if not virtenv_dist: raise RuntimeError("missing virtualenv distribution") if not agent_spawner: raise RuntimeError("missing agent spawner") if not agent_scheduler: raise RuntimeError("missing agent scheduler") if not lrms: raise RuntimeError("missing LRMS") if not agent_launch_method: raise RuntimeError("missing agentlaunch method") if not task_launch_method: raise RuntimeError("missing task launch method") # massage some values if not queue: queue = default_queue if cleanup and isinstance(cleanup, bool): # l : log files # u : unit work dirs # v : virtualenv # e : everything (== pilot sandbox) if shared_filesystem: cleanup = 'luve' else: # we cannot clean the sandbox from within the agent, as the hop # staging would then fail, and we'd get nothing back. # FIXME: cleanup needs to be done by the pmgr.launcher, or # someone else, really, after fetching all logs and # profiles. cleanup = 'luv' # we never cleanup virtenvs which are not private if virtenv_mode is not 'private': cleanup = cleanup.replace('v', '') # add dists to staging files, if needed if rp_version in ['local', 'debug']: sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name] sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path] else: sdist_names = list() sdist_paths = list() # if cores_per_node is set (!= None), then we need to # allocation full nodes, and thus round up if cores_per_node: cores_per_node = int(cores_per_node) number_cores = int(cores_per_node * math.ceil(float(number_cores) / cores_per_node)) # set mandatory args bootstrap_args = "" bootstrap_args += " -d '%s'" % ':'.join(sdist_names) bootstrap_args += " -p '%s'" % pid bootstrap_args += " -s '%s'" % sid bootstrap_args += " -m '%s'" % virtenv_mode bootstrap_args += " -r '%s'" % rp_version bootstrap_args += " -b '%s'" % python_dist bootstrap_args += " -g '%s'" % virtenv_dist bootstrap_args += " -v '%s'" % virtenv bootstrap_args += " -y '%d'" % runtime # set optional args if lrms == "CCM": bootstrap_args += " -c" if forward_tunnel_endpoint: bootstrap_args += " -f '%s'" % forward_tunnel_endpoint if forward_tunnel_endpoint: bootstrap_args += " -h '%s'" % db_hostport if python_interpreter: bootstrap_args += " -i '%s'" % python_interpreter if tunnel_bind_device: bootstrap_args += " -t '%s'" % tunnel_bind_device if cleanup: bootstrap_args += " -x '%s'" % cleanup for arg in pre_bootstrap_1: bootstrap_args += " -e '%s'" % arg for arg in pre_bootstrap_2: bootstrap_args += " -w '%s'" % arg agent_cfg['owner'] = 'agent_0' agent_cfg['cores'] = number_cores agent_cfg['lrms'] = lrms agent_cfg['spawner'] = agent_spawner agent_cfg['scheduler'] = agent_scheduler agent_cfg['runtime'] = runtime agent_cfg['dburl'] = str(database_url) agent_cfg['session_id'] = sid agent_cfg['pilot_id'] = pid agent_cfg['logdir'] = '.' 
agent_cfg['pilot_sandbox'] = pilot_sandbox agent_cfg['session_sandbox'] = session_sandbox agent_cfg['resource_sandbox'] = resource_sandbox agent_cfg['agent_launch_method'] = agent_launch_method agent_cfg['task_launch_method'] = task_launch_method agent_cfg['mpi_launch_method'] = mpi_launch_method agent_cfg['cores_per_node'] = cores_per_node agent_cfg['cu_tmp'] = cu_tmp agent_cfg['export_to_cu'] = export_to_cu agent_cfg['cu_pre_exec'] = cu_pre_exec agent_cfg['cu_post_exec'] = cu_post_exec agent_cfg['resource_cfg'] = copy.deepcopy(rcfg) agent_cfg['debug'] = self._log.getEffectiveLevel() # we'll also push the agent config into MongoDB pilot['cfg'] = agent_cfg # ------------------------------------------------------------------ # Write agent config dict to a json file in pilot sandbox. agent_cfg_name = 'agent_0.cfg' cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.') os.close(cfg_tmp_handle) # file exists now # Convert dict to json file self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file) self._log.debug(pprint.pformat(agent_cfg)) ru.write_json(agent_cfg, cfg_tmp_file) ret['ft'].append({ 'src': cfg_tmp_file, 'tgt': '%s/%s' % (pilot_sandbox, agent_cfg_name), 'rem': True }) # purge the tmp file after packing # ---------------------------------------------------------------------- # we also touch the log and profile tarballs in the target pilot sandbox ret['ft'].append({ 'src': '/dev/null', 'tgt': '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid), 'rem': False }) # don't remove /dev/null # only stage profiles if we profile if self._prof.enabled: ret['ft'].append({ 'src': '/dev/null', 'tgt': '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid), 'rem': False }) # don't remove /dev/null # check if we have a sandbox cached for that resource. If so, we have # nothing to do. Otherwise we create the sandbox and stage the RP # stack etc. # NOTE: this will race when multiple pilot launcher instances are used! with self._cache_lock: if not resource in self._sandboxes: for sdist in sdist_paths: base = os.path.basename(sdist) ret['ft'].append({ 'src': sdist, 'tgt': '%s/%s' % (session_sandbox, base), 'rem': False }) # Copy the bootstrap shell script. bootstrapper_path = os.path.abspath("%s/agent/%s" \ % (self._root_dir, BOOTSTRAPPER_0)) self._log.debug("use bootstrapper %s", bootstrapper_path) ret['ft'].append({ 'src': bootstrapper_path, 'tgt': '%s/%s' % (session_sandbox, BOOTSTRAPPER_0), 'rem': False }) # Some machines cannot run pip due to outdated CA certs. # For those, we also stage an updated certificate bundle # TODO: use booleans all the way? 
if stage_cacerts: cc_name = 'cacert.pem.gz' cc_path = os.path.abspath("%s/agent/%s" % (self._root_dir, cc_name)) self._log.debug("use CAs %s", cc_path) ret['ft'].append({ 'src': cc_path, 'tgt': '%s/%s' % (session_sandbox, cc_name), 'rem': False }) self._sandboxes[resource] = True # ------------------------------------------------------------------ # Create SAGA Job description and submit the pilot job jd = rs.job.Description() if shared_filesystem: bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0) else: bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0) jd.name = pid jd.executable = "/bin/bash" jd.arguments = ['-l %s' % bootstrap_tgt, bootstrap_args] jd.working_directory = pilot_sandbox jd.project = project jd.output = "bootstrap_1.out" jd.error = "bootstrap_1.err" jd.total_cpu_count = number_cores jd.processes_per_host = cores_per_node jd.spmd_variation = spmd_variation jd.wall_time_limit = runtime jd.total_physical_memory = memory jd.queue = queue jd.candidate_hosts = candidate_hosts jd.environment = dict() if 'RADICAL_PILOT_PROFILE' in os.environ: jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE' # for condor backends and the like which do not have shared FSs, we add # additional staging directives so that the backend system binds the # files from the session and pilot sandboxes to the pilot job. jd.file_transfer = list() if not shared_filesystem: jd.file_transfer.extend([ 'site:%s/%s > %s' % (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0), 'site:%s/%s > %s' % (pilot_sandbox, agent_cfg_name, agent_cfg_name), 'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid), 'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid) ]) if 'RADICAL_PILOT_PROFILE' in os.environ: jd.file_transfer.extend([ 'site:%s/%s.prof.tgz > %s.prof.tgz' % (pilot_sandbox, pid, pid), 'site:%s/%s.prof.tgz < %s.prof.tgz' % (pilot_sandbox, pid, pid) ]) for sdist in sdist_names: jd.file_transfer.extend( ['site:%s/%s > %s' % (session_sandbox, sdist, sdist)]) if stage_cacerts: jd.file_transfer.extend( ['site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)]) self._log.debug("Bootstrap command line: %s %s", jd.executable, jd.arguments) ret['jd'] = jd return ret
def __init__(self, cfg, logger):

    self.name            = type(self).__name__
    self._cfg            = cfg
    self._log            = logger
    self.requested_cores = self._cfg['cores']

    self._log.info("Configuring LRMS %s.", self.name)

    self.lm_info        = dict()
    self.lrms_info      = dict()
    self.slot_list      = list()
    self.node_list      = list()
    self.agent_nodes    = {}
    self.cores_per_node = None

    # The LRMS will possibly need to reserve nodes for the agent, according
    # to the agent layout.  We dig out the respective requirements from the
    # config right here.
    self._agent_reqs = []
    layout = self._cfg['agent_layout']

    # FIXME: this loop iterates over all agents *defined* in the layout, not
    #        over all agents which are to be actually executed, thus
    #        potentially reserving too many nodes.
    for worker in layout:
        target = layout[worker].get('target')

        # make sure that the target is either 'local', which we will ignore,
        # or 'node'.
        if target == 'local':
            pass  # ignore that one
        elif target == 'node':
            self._agent_reqs.append(worker)
        else:
            raise ValueError("ill-formatted agent target '%s'" % target)

    # We are good to get rolling, and to detect the runtime environment of
    # the local LRMS.
    self._configure()
    logger.info("Discovered execution environment: %s", self.node_list)

    # Make sure we got a valid nodelist and a valid setting for
    # cores_per_node
    if not self.node_list or self.cores_per_node < 1:
        raise RuntimeError('LRMS configuration invalid (%s)(%s)'
                          % (self.node_list, self.cores_per_node))

    # Check if the LRMS implementation reserved agent nodes.  If not, pick
    # the first couple of nodes from the nodelist as a fallback.
    if self._agent_reqs and not self.agent_nodes:
        self._log.info('Determine list of agent nodes generically.')
        for worker in self._agent_reqs:
            # Get a node from the end of the node list
            self.agent_nodes[worker] = self.node_list.pop()
            # If all nodes are taken by workers now, we can safely stop,
            # and let the raise below do its thing.
            if not self.node_list:
                break

    if self.agent_nodes:
        self._log.info('Reserved agent node(s): %s' % self.agent_nodes.values())
        self._log.info('Agent(s) running on node(s): %s' % self.agent_nodes.keys())
        self._log.info('Remaining work node(s): %s' % self.node_list)

    # Check if we can do any work
    if not self.node_list:
        raise RuntimeError('LRMS has no nodes left to run units')

    # After LRMS configuration, we call any existing config hooks on the
    # launch methods.  Those hooks may need to adjust the LRMS settings
    # (hello ORTE).  We only call LM hooks *once*.
    launch_methods = set()  # set keeps entries unique

    if 'mpi_launch_method' in self._cfg:
        launch_methods.add(self._cfg['mpi_launch_method'])

    launch_methods.add(self._cfg['task_launch_method'])
    launch_methods.add(self._cfg['agent_launch_method'])

    for lm in launch_methods:
        if lm:
            try:
                from .... import pilot as rp
                ru.dict_merge(self.lm_info,
                              rp.agent.LM.lrms_config_hook(lm, self._cfg, self,
                                                           self._log))
            except Exception as e:
                self._log.exception("lrms config hook failed")
                raise

            self._log.info("lrms config hook succeeded (%s)" % lm)

    # For now assume that all nodes have equal amount of cores
    cores_avail = (len(self.node_list) + len(self.agent_nodes)) * self.cores_per_node
    if 'RADICAL_PILOT_PROFILE' not in os.environ:
        if cores_avail < int(self.requested_cores):
            raise ValueError("Not enough cores available (%s) to satisfy allocation request (%s)."
                            % (str(cores_avail), str(self.requested_cores)))

    # NOTE: self.lrms_info is what scheduler and launch method can
    #       ultimately use, as it is included into the cfg passed to all
    #       components.
    #
    # four elements are well defined:
    #   lm_info:        the dict received via the LM's lrms_config_hook
    #   node_list:      a list of node names to be used for unit execution
    #   cores_per_node: as the name says
    #   agent_nodes:    list of node names reserved for agent execution
    #
    # That list may turn out to be insufficient for some schedulers.  Yarn
    # for example may need to communicate YARN service endpoints etc.  An
    # LRMS can thus expand this dict, but is then likely bound to a specific
    # scheduler which can interpret the additional information.
    self.lrms_info['name']           = self.name
    self.lrms_info['lm_info']        = self.lm_info
    self.lrms_info['node_list']      = self.node_list
    self.lrms_info['cores_per_node'] = self.cores_per_node
    self.lrms_info['agent_nodes']    = self.agent_nodes
def __init__(self, cfg, session): self.name = type(self).__name__ self._cfg = cfg self._session = session self._log = self._session._log self._prof = self._session._prof self.requested_cores = self._cfg['cores'] self.requested_gpus = self._cfg['gpus'] self._log.info("Configuring LRMS %s.", self.name) self.lm_info = dict() self.lrms_info = dict() self.node_list = list() self.agent_nodes = dict() self.cores_per_node = None self.gpus_per_node = None self.lfs_per_node = None self.mem_per_node = None # The LRMS will possibly need to reserve nodes for the agent, according # to the agent layout. We dig out the respective requirements from the # config right here. self._agent_reqs = [] agents = self._cfg.get('agents', {}) # FIXME: this loop iterates over all agents *defined* in the layout, not # over all agents which are to be actually executed, thus # potentially reserving too many nodes.a # NOTE: this code path is *within* the agent, so at least agent_0 # cannot possibly land on a different node. for agent in agents: target = agents[agent].get('target') # make sure that the target either 'local', which we will ignore, # or 'node'. if target == 'local': pass # ignore that one elif target == 'node': self._agent_reqs.append(agent) else : raise ValueError("ill-formatted agent target '%s'" % target) # We are good to get rolling, and to detect the runtime environment of # the local LRMS. self._configure() self._log.info("Discovered execution environment: %s", self.node_list) # Make sure we got a valid nodelist and a valid setting for # cores_per_node if not self.node_list or self.cores_per_node < 1: raise RuntimeError('LRMS configuration invalid (%s)(%s)' % (self.node_list, self.cores_per_node)) # Check if the LRMS implementation reserved agent nodes. If not, pick # the first couple of nodes from the nodelist as a fallback. if self._agent_reqs and not self.agent_nodes: self._log.info('Determine list of agent nodes generically.') for agent in self._agent_reqs: # Get a node from the end of the node list self.agent_nodes[agent] = self.node_list.pop() # If all nodes are taken by workers now, we can safely stop, # and let the raise below do its thing. if not self.node_list: break if self.agent_nodes: self._log.info('Reserved nodes: %s' % self.agent_nodes.values()) self._log.info('Agent nodes: %s' % self.agent_nodes.keys()) self._log.info('Worker nodes: %s' % self.node_list) # Check if we can do any work if not self.node_list: raise RuntimeError('LRMS has no nodes left to run units') # After LRMS configuration, we call any existing config hooks on the # launch methods. Those hooks may need to adjust the LRMS settings # (hello ORTE). We only call LM hooks *once* (thus the set) launch_methods = set() launch_methods.add(self._cfg.get('mpi_launch_method')) launch_methods.add(self._cfg.get('task_launch_method')) launch_methods.add(self._cfg.get('agent_launch_method')) launch_methods.discard(None) for lm in launch_methods: try: from .... 
import pilot as rp ru.dict_merge(self.lm_info, rp.agent.LM.lrms_config_hook(lm, self._cfg, self, self._log, self._prof)) except Exception as e: # FIXME don't catch/raise self._log.exception("lrms config hook failed: %s" % e) raise self._log.info("lrms config hook succeeded (%s)" % lm) # For now assume that all nodes have equal amount of cores and gpus cores_avail = (len(self.node_list) + len(self.agent_nodes)) * self.cores_per_node gpus_avail = (len(self.node_list) + len(self.agent_nodes)) * self.gpus_per_node # on debug runs, we allow more cpus/gpus to appear than physically exist if 'RADICAL_DEBUG' not in os.environ: if cores_avail < int(self.requested_cores): raise ValueError("Not enough cores available (%s < %s)." % (str(cores_avail), str(self.requested_cores))) if gpus_avail < int(self.requested_gpus): raise ValueError("Not enough gpus available (%s < %s)." % (str(gpus_avail), str(self.requested_gpus))) # NOTE: self.lrms_info is what scheduler and launch method can # ultimately use, as it is included into the cfg passed to all # components. # # five elements are well defined: # lm_info: the dict received via the LM's lrms_config_hook # node_list: a list of node names to be used for unit execution # cores_per_node: as the name says # gpus_per_node: as the name says # agent_nodes: list of node names reserved for agent execution # # That list may turn out to be insufficient for some schedulers. Yarn # for example may need to communicate YARN service endpoints etc. an # LRMS can thus expand this dict, but is then likely bound to a specific # scheduler which can interpret the additional information. self.lrms_info['name'] = self.name self.lrms_info['lm_info'] = self.lm_info self.lrms_info['node_list'] = self.node_list self.lrms_info['cores_per_node'] = self.cores_per_node self.lrms_info['gpus_per_node'] = self.gpus_per_node self.lrms_info['agent_nodes'] = self.agent_nodes self.lrms_info['lfs_per_node'] = self.lfs_per_node self.lrms_info['mem_per_node'] = self.mem_per_node
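Stripped of the LaunchMethod specifics, the hook handling above is an accumulation of per-LM result dicts into one lm_info dict via ru.dict_merge. A minimal sketch, with hypothetical hook payloads standing in for the return values of rp.agent.LM.lrms_config_hook():

import radical.utils as ru

# hypothetical hook results -- stand-ins for lrms_config_hook() return values
hook_results = [{'version_info': {'orte': '2.0.2'}},
                {'dvm_uri'     : 'some://dvm/contact/info'}]

lm_info = dict()
for result in hook_results:
    # each hook may contribute additional keys; with 'overwrite' a later
    # hook wins if two hooks happen to set the same key
    ru.dict_merge(lm_info, result, policy='overwrite')

print(sorted(lm_info.keys()))    # ['dvm_uri', 'version_info']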
def __init__(self, cfg, session): self.name = type(self).__name__ self._cfg = cfg self._session = session self._log = self._session._log self._prof = self._session._prof self.requested_cores = self._cfg['cores'] self.requested_gpus = self._cfg['gpus'] self._log.info("Configuring ResourceManager %s.", self.name) self.lm_info = dict() self.rm_info = dict() self.node_list = list() self.agent_nodes = dict() self.cores_per_node = 0 self.gpus_per_node = 0 self.lfs_per_node = 0 self.mem_per_node = 0 self.smt = int(os.environ.get('RADICAL_SAGA_SMT', 1)) # The ResourceManager will possibly need to reserve nodes for the agent, # according to the agent layout. We dig out the respective requirements # from the config right here. self._agent_reqs = [] agents = self._cfg.get('agents', {}) # FIXME: this loop iterates over all agents *defined* in the layout, not # over all agents which are to be actually executed, thus # potentially reserving too many nodes.a # NOTE: this code path is *within* the agent, so at least agent.0 # cannot possibly land on a different node. for agent in agents: target = agents[agent].get('target') # make sure that the target either 'local', which we will ignore, # or 'node'. if target == 'local': pass # ignore that one elif target == 'node': self._agent_reqs.append(agent) else: raise ValueError("ill-formatted agent target '%s'" % target) # We are good to get rolling, and to detect the runtime environment of # the local ResourceManager. self._configure() self._log.info("Discovered execution environment: %s", self.node_list) # Make sure we got a valid nodelist and a valid setting for # cores_per_node if not self.node_list or self.cores_per_node < 1: raise RuntimeError( 'ResourceManager configuration invalid (%s)(%s)' % (self.node_list, self.cores_per_node)) # Check if the ResourceManager implementation reserved agent nodes. # If not, pick the first couple of nodes from the nodelist as fallback. if self._agent_reqs and not self.agent_nodes: self._log.info('Determine list of agent nodes generically.') for agent in self._agent_reqs: # Get a node from the end of the node list self.agent_nodes[agent] = self.node_list.pop() # If all nodes are taken by workers now, we can safely stop, # and let the raise below do its thing. if not self.node_list: break if self.agent_nodes: self._log.info('Reserved nodes: %s' % list(self.agent_nodes.values())) self._log.info('Agent nodes: %s' % list(self.agent_nodes.keys())) self._log.info('Worker nodes: %s' % self.node_list) # Check if we can do any work if not self.node_list: raise RuntimeError( 'ResourceManager has no nodes left to run units') # After ResourceManager configuration, we call any existing config hooks # on the launch methods. Those hooks may need to adjust the # ResourceManager settings (hello ORTE). 
We only call LaunchMethod # hooks *once* (thus the set) launch_methods = set() launch_methods.add(self._cfg.get('mpi_launch_method')) launch_methods.add(self._cfg.get('task_launch_method')) launch_methods.add(self._cfg.get('agent_launch_method')) launch_methods.discard(None) for lm in launch_methods: try: ru.dict_merge( self.lm_info, rpa.LaunchMethod.rm_config_hook(name=lm, cfg=self._cfg, rm=self, log=self._log, profiler=self._prof)) except Exception as e: # FIXME don't catch/raise self._log.exception("ResourceManager config hook failed: %s" % e) raise self._log.info("ResourceManager config hook succeeded (%s)" % lm) # For now assume that all nodes have equal amount of cores and gpus cores_avail = (len(self.node_list) + len(self.agent_nodes)) * self.cores_per_node gpus_avail = (len(self.node_list) + len(self.agent_nodes)) * self.gpus_per_node # on debug runs, we allow more cpus/gpus to appear than physically exist if 'RADICAL_DEBUG' not in os.environ: if cores_avail < int(self.requested_cores): raise ValueError("Not enough cores available (%s < %s)." % (str(cores_avail), str(self.requested_cores))) if gpus_avail < int(self.requested_gpus): raise ValueError("Not enough gpus available (%s < %s)." % (str(gpus_avail), str(self.requested_gpus))) # NOTE: self.rm_info is what scheduler and launch method can # ultimately use, as it is included into the cfg passed to all # components. # # five elements are well defined: # lm_info: the dict received via the LM's rm_config_hook # node_list: a list of node names to be used for unit execution # cores_per_node: as the name says # gpus_per_node: as the name says # agent_nodes: list of node names reserved for agent execution # # That list may turn out to be insufficient for some schedulers. Yarn # for example may need to communicate YARN service endpoints etc. an # ResourceManager can thus expand this dict, but is then likely bound to # a specific scheduler which can interpret the additional information. self.rm_info['name'] = self.name self.rm_info['lm_info'] = self.lm_info self.rm_info['node_list'] = self.node_list self.rm_info['cores_per_node'] = self.cores_per_node self.rm_info['gpus_per_node'] = self.gpus_per_node self.rm_info['agent_nodes'] = self.agent_nodes self.rm_info['lfs_per_node'] = self.lfs_per_node self.rm_info['mem_per_node'] = self.mem_per_node
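The availability check above is plain arithmetic over the worker plus agent node count. A worked example with made-up numbers, which passes outside of RADICAL_DEBUG runs:

import os

node_list      = ['node_%03d' % i for i in range(4)]   # worker nodes
agent_nodes    = {'agent_1': 'node_004'}                # reserved for the agent
cores_per_node = 16
gpus_per_node  = 2

cores_avail = (len(node_list) + len(agent_nodes)) * cores_per_node   # 80
gpus_avail  = (len(node_list) + len(agent_nodes)) * gpus_per_node    # 10

requested_cores = 64
requested_gpus  = 4

# outside of debug runs, over-subscription is an error
if 'RADICAL_DEBUG' not in os.environ:
    if cores_avail < requested_cores:
        raise ValueError('Not enough cores available (%s < %s).'
                        % (cores_avail, requested_cores))
    if gpus_avail < requested_gpus:
        raise ValueError('Not enough gpus available (%s < %s).'
                        % (gpus_avail, requested_gpus))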
# "resource_cfg" : # { # "*.futuregrid.org" : # { # "username" : "merzky" # } # } # } USER_CONFIG_PATH = os.environ.get('HOME', '/tmp') + '/.my_app.cfg' # load the user config, and merge it with the default config user_config = ru.read_json_str(USER_CONFIG_PATH) # merge the user config into the app config, so that the user config keys are # applied where appropriate ru.dict_merge(app_config, user_config, policy='overwrite', wildcards=True) # lets see what we got pprint.pprint(app_config) # this should result in : # # { # 'log_level' : 0, # 'scheduler' : 'rp.SCHED_BACKFILLING', # 'resources' : ['india.furturegrid.org', 'sierra.futuregrid.org'], # 'resource_cfg': # { # '*.futuregrid.org': # { # 'username' : 'merzky'
def __init__(self, cfg, session): self.name = type(self).__name__ self._cfg = cfg self._session = session self._log = self._session._log self._prof = self._session._prof self.requested_cores = self._cfg['cores'] self._log.info("Configuring LRMS %s.", self.name) self.lm_info = dict() self.lrms_info = dict() self.slot_list = list() self.node_list = list() self.agent_nodes = dict() self.cores_per_node = None self.gpus_per_node = None self.lfs_per_node = None # The LRMS will possibly need to reserve nodes for the agent, according # to the agent layout. We dig out the respective requirements from the # config right here. self._agent_reqs = [] agents = self._cfg.get('agents', {}) # FIXME: this loop iterates over all agents *defined* in the layout, not # over all agents which are to be actually executed, thus # potentially reserving too many nodes.a # NOTE: this code path is *within* the agent, so at least agent_0 # cannot possibly land on a different node. for agent in agents: target = agents[agent].get('target') # make sure that the target either 'local', which we will ignore, # or 'node'. if target == 'local': pass # ignore that one elif target == 'node': self._agent_reqs.append(agent) else : raise ValueError("ill-formatted agent target '%s'" % target) # We are good to get rolling, and to detect the runtime environment of # the local LRMS. self._configure() self._log.info("Discovered execution environment: %s", self.node_list) # Make sure we got a valid nodelist and a valid setting for # cores_per_node if not self.node_list or self.cores_per_node < 1: raise RuntimeError('LRMS configuration invalid (%s)(%s)' % \ (self.node_list, self.cores_per_node)) # Check if the LRMS implementation reserved agent nodes. If not, pick # the first couple of nodes from the nodelist as a fallback. if self._agent_reqs and not self.agent_nodes: self._log.info('Determine list of agent nodes generically.') for agent in self._agent_reqs: # Get a node from the end of the node list self.agent_nodes[agent] = self.node_list.pop() # If all nodes are taken by workers now, we can safely stop, # and let the raise below do its thing. if not self.node_list: break if self.agent_nodes: self._log.info('Reserved agent node(s): %s' % self.agent_nodes.values()) self._log.info('Agent(s) running on node(s): %s' % self.agent_nodes.keys()) self._log.info('Remaining work node(s): %s' % self.node_list) # Check if we can do any work if not self.node_list: raise RuntimeError('LRMS has no nodes left to run units') # After LRMS configuration, we call any existing config hooks on the # launch methods. Those hooks may need to adjust the LRMS settings # (hello ORTE). We only call LM hooks *once* launch_methods = set() # set keeps entries unique if 'mpi_launch_method' in self._cfg: launch_methods.add(self._cfg['mpi_launch_method']) launch_methods.add(self._cfg['task_launch_method']) launch_methods.add(self._cfg['agent_launch_method']) for lm in launch_methods: if lm: try: from .... 
import pilot as rp ru.dict_merge(self.lm_info, rp.agent.LM.lrms_config_hook(lm, self._cfg, self, self._log, self._prof)) except Exception as e: self._log.exception("lrms config hook failed") raise self._log.info("lrms config hook succeeded (%s)" % lm) # For now assume that all nodes have equal amount of cores and gpus cores_avail = (len(self.node_list) + len(self.agent_nodes)) * self.cores_per_node gpus_avail = (len(self.node_list) + len(self.agent_nodes)) * self.gpus_per_node if 'RADICAL_PILOT_PROFILE' not in os.environ: if cores_avail < int(self.requested_cores): raise ValueError("Not enough cores available (%s) to satisfy allocation request (%s)." \ % (str(cores_avail), str(self.requested_cores))) # NOTE: self.lrms_info is what scheduler and launch method can # ultimately use, as it is included into the cfg passed to all # components. # # five elements are well defined: # lm_info: the dict received via the LM's lrms_config_hook # node_list: a list of node names to be used for unit execution # cores_per_node: as the name says # gpus_per_node: as the name says # agent_nodes: list of node names reserved for agent execution # # That list may turn out to be insufficient for some schedulers. Yarn # for example may need to communicate YARN service endpoints etc. an # LRMS can thus expand this dict, but is then likely bound to a specific # scheduler which can interpret the additional information. self.lrms_info['name'] = self.name self.lrms_info['lm_info'] = self.lm_info self.lrms_info['node_list'] = self.node_list self.lrms_info['cores_per_node'] = self.cores_per_node self.lrms_info['gpus_per_node'] = self.gpus_per_node self.lrms_info['agent_nodes'] = self.agent_nodes self.lrms_info['lfs_per_node'] = self.lfs_per_node
def profile (command, *args, **kwargs) : if callable (command): cmd_str = "%s %s %s" % (command.__name__, str (args), str(kwargs)) else: cmd_str = command print "profile: %s" % cmd_str if '_RADICAL_SYNAPSE_EMULATED' in os.environ: cmd_str = os.environ.get ('_RADICAL_SYNAPSE_EMULATEE', cmd_str) print 'using emulated command name: %s' % cmd_str info = {'cmd' : cmd_str} # start stress, get it spinning for one min to get a confirmed load # measurement, then run our own load, then kill stress. if LOAD > 0: rsu.logger.info ("creating system load %s" % LOAD) os.popen ("killall -9 stress 2>&1 > /dev/null") os.popen ('stress --cpu %s &' % LOAD) time.sleep (60) load_1 = float(os.popen (LOAD_CMD).read()) start = rsu.timestamp() os.environ['_RADICAL_SYNAPSE_PROFILED'] = 'TRUE' # run the profiled function/command in a separate process if callable (command): proc = mp.Process (target = command, args = args, kwargs = kwargs) proc.start () else: proc = sp.Popen (command.split(), stdout = sp.PIPE, stderr = sp.STDOUT) watch_mode = os.environ.get('RADICAL_SYNAPSE_WATCHMODE', 'full').lower() watchers = list() if watch_mode == 'full': watchers.append (rsw.WatcherCPU (proc.pid)) watchers.append (rsw.WatcherSto (proc.pid)) watchers.append (rsw.WatcherMem (proc.pid)) # watchmode 'basic' watchers.append (rsw.WatcherSys (proc.pid)) if callable (command): proc.join() out = "" ret = None else: out = proc.communicate()[0] ret = proc.returncode stop = rsu.timestamp() info['time'] = dict() info['time']['start'] = rsu.time_zero() info['time']['real'] = stop-start for watcher in reversed(watchers): watcher.stop () watcher.join () ru.dict_merge (info, watcher.get_data()) # allow watchers to finalize some stuff, now having data from other watchers # available for watcher in reversed(watchers): watcher.finalize(info) time_2 = rsu.timestamp() load_2 = float(os.popen (LOAD_CMD).read()) info['cpu']['load'] = max(load_1, load_2) rsu.logger.info ("system load %s: %s" % (LOAD, info['cpu']['load'])) if LOAD > 0: rsu.logger.info ("stopping system load") os.popen ("killall -9 stress 2>&1 > /dev/null") rsu.logger.info ("stopped system load") return info, ret, out
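Each watcher returns its own sub-dict ('cpu', 'mem', 'sto', 'sys'), so merging them into info mostly aggregates disjoint keys. A sketch with hypothetical payloads in place of watcher.get_data():

import radical.utils as ru

info = {'cmd': 'sleep 10', 'time': {'real': 10.02}}

# hypothetical payloads standing in for watcher.get_data()
watcher_data = [{'cpu': {'utilization': 0.97}},
                {'mem': {'peak'       : 1024 * 1024}},
                {'sto': {'read': 0, 'write': 4096}}]

for data in watcher_data:
    ru.dict_merge(info, data)

print(sorted(info.keys()))    # ['cmd', 'cpu', 'mem', 'sto', 'time']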
def __init__(self, cfg, session): # We temporarily do not call the base class constructor. The # constraint was not to change the base class at any point. # The constructor of the base class performs certain computations # that are specific to a node architecture, i.e., (i) requirement of # cores_per_node and gpus_per_node, (ii) no requirement for # sockets_per_node, and (iii) no validity checks on cores_per_socket, # gpus_per_socket, and sockets_per_node. It is, hence, incompatible # with the node architecture expected within this module. # We have three options: # 1) Change the child class, do not call the base class constructor # 2) Call the base class constructor, make the child class and its node # structure compatible with that expected in the base class. # 3) Change the base class --- Out of scope of this project # 3 is probably the correct approach, but long term. 2 is not a # good approach as we are striving to keep a child class compatible # with a base class (this should never be the case). # We go ahead with 1, process of elimination really, but with the # advantage that we have the code content that will be required when # we implement 3, the long term approach. # ResourceManager.__init__(self, cfg, session) self.name = type(self).__name__ self._cfg = cfg self._session = session self._log = self._session._log self._prof = self._session._prof self.requested_cores = self._cfg['cores'] self._log.info("Configuring ResourceManager %s.", self.name) self.lm_info = dict() self.rm_info = dict() self.slot_list = list() self.node_list = list() self.agent_nodes = dict() self.sockets_per_node = None self.cores_per_socket = None self.gpus_per_socket = None self.lfs_per_node = None self.mem_per_node = None self.smt = int(os.environ.get('RADICAL_SAGA_SMT', 1)) # The ResourceManager will possibly need to reserve nodes for the agent, according # to the agent layout. We dig out the respective requirements from the # config right here. self._agent_reqs = [] agents = self._cfg.get('agents', {}) # FIXME: this loop iterates over all agents *defined* in the layout, not # over all agents which are to be actually executed, thus # potentially reserving too many nodes.a # NOTE: this code path is *within* the agent, so at least agent.0 # cannot possibly land on a different node. for agent in agents: target = agents[agent].get('target') # make sure that the target either 'local', which we will ignore, # or 'node'. if target == 'local': pass # ignore that one elif target == 'node': self._agent_reqs.append(agent) else : raise ValueError("ill-formatted agent target '%s'" % target) # We are good to get rolling, and to detect the runtime environment of # the local ResourceManager. self._configure() self._log.info("Discovered execution environment: %s", self.node_list) # Make sure we got a valid nodelist and a valid setting for # cores_per_socket and sockets_per_node if not self.node_list or\ self.sockets_per_node < 1 or \ self.cores_per_socket < 1: raise RuntimeError('ResourceManager configuration invalid (%s)(%s)(%s)' % (self.node_list, self.sockets_per_node, self.cores_per_socket)) # Check if the ResourceManager implementation reserved agent nodes. If not, pick # the first couple of nodes from the nodelist as a fallback. 
if self._agent_reqs and not self.agent_nodes: self._log.info('Determine list of agent nodes generically.') for agent in self._agent_reqs: # Get a node from the end of the node list self.agent_nodes[agent] = self.node_list.pop() # If all nodes are taken by workers now, we can safely stop, # and let the raise below do its thing. if not self.node_list: break if self.agent_nodes: self._log.info('agents : %s' % list(self.agent_nodes.keys())) self._log.info('agent nodes : %s' % list(self.agent_nodes.values())) self._log.info('worker nodes: %s' % self.node_list) # Check if we can do any work if not self.node_list: raise RuntimeError('ResourceManager has no nodes left to run units') # After ResourceManager configuration, we call any existing config hooks on the # launch methods. Those hooks may need to adjust the ResourceManager settings # (hello ORTE). We only call LaunchMethod hooks *once* launch_methods = set() # set keeps entries unique if 'mpi_launch_method' in self._cfg: launch_methods.add(self._cfg['mpi_launch_method']) launch_methods.add(self._cfg['task_launch_method']) launch_methods.add(self._cfg['agent_launch_method']) for lm in launch_methods: if lm: try: from .... import pilot as rp ru.dict_merge(self.lm_info, rp.agent.LaunchMethod.rm_config_hook(lm, self._cfg, self, self._log, self._prof)) except: self._log.exception("rm config hook failed") raise self._log.info("rm config hook succeeded (%s)" % lm) # # For now assume that all nodes have equal amount of cores and gpus # cores_avail = (len(self.node_list) + len(self.agent_nodes)) \ # * self.cores_per_socket * self.sockets_per_node # gpus_avail = (len(self.node_list) + len(self.agent_nodes)) \ # * self.gpus_per_socket * self.sockets_per_node # NOTE: self.rm_info is what scheduler and launch method can # ultimately use, as it is included into the cfg passed to all # components. # # it defines # lm_info: dict received via the LM's rm_config_hook # node_list: list of node names to be used for unit execution # sockets_per_node: integer number of sockets on a node # cores_per_socket: integer number of cores per socket # gpus_per_socket: integer number of gpus per socket # agent_nodes: list of node names reserved for agent execution # lfs_per_node: dict consisting the path and size of lfs on each node # mem_per_node: number of MB per node # smt: threads per core (exposed as core in RP) # self.rm_info = { 'name' : self.name, 'lm_info' : self.lm_info, 'node_list' : self.node_list, 'sockets_per_node' : self.sockets_per_node, 'cores_per_socket' : self.cores_per_socket * self.smt, 'gpus_per_socket' : self.gpus_per_socket, 'cores_per_node' : self.sockets_per_node * self.cores_per_socket * self.smt, 'gpus_per_node' : self.sockets_per_node * self.gpus_per_socket, 'agent_nodes' : self.agent_nodes, 'lfs_per_node' : self.lfs_per_node, 'mem_per_node' : self.mem_per_node, 'smt' : self.smt }
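The derived per-node values in rm_info follow directly from the socket layout and the SMT setting. A worked example with assumed numbers (2 sockets, 21 cores and 3 GPUs per socket, SMT 4 -- roughly a Summit-like node, used only for illustration):

sockets_per_node = 2
cores_per_socket = 21
gpus_per_socket  = 3
smt              = 4    # threads per core, exposed as cores in RP

cores_per_node = sockets_per_node * cores_per_socket * smt
gpus_per_node  = sockets_per_node * gpus_per_socket

print(cores_per_node)    # 168
print(gpus_per_node)     # 6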
def __init__ (self, database_url=None, database_name="radicalpilot", uid=None, name=None): """Creates a new or reconnects to an exising session. If called without a uid, a new Session instance is created and stored in the database. If uid is set, an existing session is retrieved from the database. **Arguments:** * **database_url** (`string`): The MongoDB URL. If none is given, RP uses the environment variable RADICAL_PILOT_DBURL. If that is not set, an error will be raises. * **database_name** (`string`): An alternative database name (default: 'radicalpilot'). * **uid** (`string`): If uid is set, we try re-connect to an existing session instead of creating a new one. * **name** (`string`): An optional human readable name. **Returns:** * A new Session instance. **Raises:** * :class:`radical.pilot.DatabaseError` """ # init the base class inits saga.Session.__init__ (self) Object.__init__ (self) # before doing anything else, set up the debug helper for the lifetime # of the session. self._debug_helper = ru.DebugHelper () # Dictionaries holding all manager objects created during the session. self._pilot_manager_objects = list() self._unit_manager_objects = list() # Create a new process registry. All objects belonging to this # session will register their worker processes (if they have any) # in this registry. This makes it easier to shut down things in # a more coordinate fashion. self._process_registry = _ProcessRegistry() # The resource configuration dictionary associated with the session. self._resource_configs = {} self._database_url = database_url self._database_name = database_name if not self._database_url : self._database_url = os.getenv ("RADICAL_PILOT_DBURL", None) if not self._database_url : raise PilotException ("no database URL (set RADICAL_PILOT_DBURL)") logger.info("using database url %s" % self._database_url) # if the database url contains a path element, we interpret that as # database name (without the leading slash) tmp_url = ru.Url (self._database_url) if tmp_url.path and \ tmp_url.path[0] == '/' and \ len(tmp_url.path) > 1 : self._database_name = tmp_url.path[1:] logger.info("using database path %s" % self._database_name) else : logger.info("using database name %s" % self._database_name) # Loading all "default" resource configurations module_path = os.path.dirname(os.path.abspath(__file__)) default_cfgs = "%s/configs/*.json" % module_path config_files = glob.glob(default_cfgs) for config_file in config_files: try : rcs = ResourceConfig.from_file(config_file) except Exception as e : logger.error ("skip config file %s: %s" % (config_file, e)) continue for rc in rcs: logger.info("Loaded resource configurations for %s" % rc) self._resource_configs[rc] = rcs[rc].as_dict() user_cfgs = "%s/.radical/pilot/configs/*.json" % os.environ.get ('HOME') config_files = glob.glob(user_cfgs) for config_file in config_files: try : rcs = ResourceConfig.from_file(config_file) except Exception as e : logger.error ("skip config file %s: %s" % (config_file, e)) continue for rc in rcs: logger.info("Loaded resource configurations for %s" % rc) if rc in self._resource_configs : # config exists -- merge user config into it ru.dict_merge (self._resource_configs[rc], rcs[rc].as_dict(), policy='overwrite') else : # new config -- add as is self._resource_configs[rc] = rcs[rc].as_dict() default_aliases = "%s/configs/aliases.json" % module_path self._resource_aliases = ru.read_json_str (default_aliases)['aliases'] ########################## ## CREATE A NEW SESSION ## ########################## if uid is 
None: try: self._connected = None if name : self._name = name self._uid = name # self._uid = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM) else : self._uid = ru.generate_id ('rp.session', mode=ru.ID_PRIVATE) self._name = self._uid self._dbs, self._created, self._connection_info = \ dbSession.new(sid = self._uid, name = self._name, db_url = self._database_url, db_name = database_name) logger.info("New Session created%s." % str(self)) except Exception, ex: logger.exception ('session create failed') raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \ % (self._database_url, ex))
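The database-name handling in the constructor above is a small URL-path rule: a non-trivial path on the MongoDB URL overrides the database_name argument. A sketch of just that rule, using an example URL:

import radical.utils as ru

database_name = 'radicalpilot'    # the documented default
dburl         = ru.Url('mongodb://db.example.org:27017/my_experiments')

# a path element (minus the leading slash) is interpreted as database name
if dburl.path and dburl.path[0] == '/' and len(dburl.path) > 1:
    database_name = dburl.path[1:]

print(database_name)    # 'my_experiments'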
def run(self): """Starts the process when Process.start() is called. """ global JOB_CHECK_INTERVAL # make sure to catch sys.exit (which raises SystemExit) try: # Get directory where this module lives mod_dir = os.path.dirname(os.path.realpath(__file__)) # Try to connect to the database try: db = self._session.get_db() pilot_col = db["%s.p" % self._session.uid] logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id) except Exception as e: logger.exception("Connection error: %s" % e) return last_job_check = time.time() while not self._terminate.is_set(): # Periodically, we pull up all ComputePilots that are pending # execution or were last seen executing and check if the corresponding # SAGA job is still pending in the queue. If that is not the case, # we assume that the job has failed for some reasons and update # the state of the ComputePilot accordingly. if last_job_check + JOB_CHECK_INTERVAL < time.time(): last_job_check = time.time() self.check_pilot_states(pilot_col) if self._disabled.is_set(): # don't process any new pilot start requests. # NOTE: this is not clean, in principle there could be other # launchers alive which want to still start those # pending pilots. In practice we only ever use one # pmgr though, and its during its shutdown that we get # here... ts = time.time() compute_pilot = pilot_col.find_and_modify( query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH}, update={ "$set": {"state": CANCELED}, "$push": {"statehistory": {"state": CANCELED, "timestamp": ts}}, }, ) # run state checks more frequently. JOB_CHECK_INTERVAL = 3 time.sleep(1) continue # See if we can find a ComputePilot that is waiting to be launched. # If we find one, we use SAGA to create a job service, a job # description and a job that is then send to the local or remote # queueing system. If this succedes, we set the ComputePilot's # state to pending, otherwise to failed. compute_pilot = None ts = time.time() compute_pilot = pilot_col.find_and_modify( query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH}, update={ "$set": {"state": LAUNCHING}, "$push": {"statehistory": {"state": LAUNCHING, "timestamp": ts}}, }, ) if not compute_pilot: time.sleep(IDLE_TIMER) else: try: # ------------------------------------------------------ # # LAUNCH THE PILOT AGENT VIA SAGA # logentries = [] pilot_id = str(compute_pilot["_id"]) logger.info("Launching ComputePilot %s" % pilot_id) # ------------------------------------------------------ # Database connection parameters session_id = self._session.uid database_url = self._session.dburl # ------------------------------------------------------ # pilot description and resource configuration number_cores = compute_pilot["description"]["cores"] runtime = compute_pilot["description"]["runtime"] queue = compute_pilot["description"]["queue"] project = compute_pilot["description"]["project"] cleanup = compute_pilot["description"]["cleanup"] resource_key = compute_pilot["description"]["resource"] schema = compute_pilot["description"]["access_schema"] memory = compute_pilot["description"]["memory"] candidate_hosts = compute_pilot["description"]["candidate_hosts"] pilot_sandbox = compute_pilot["sandbox"] global_sandbox = compute_pilot["global_sandbox"] # we expand and exchange keys in the resource config, # depending on the selected schema so better use a deep # copy.. 
resource_cfg = self._session.get_resource_config(resource_key, schema) # import pprint # pprint.pprint (resource_cfg) # ------------------------------------------------------ # get parameters from cfg, set defaults where needed agent_launch_method = resource_cfg.get("agent_launch_method") agent_dburl = resource_cfg.get("agent_mongodb_endpoint", database_url) agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER) agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE) rc_agent_config = resource_cfg.get("agent_config", DEFAULT_AGENT_CONFIG) agent_scheduler = resource_cfg.get("agent_scheduler") tunnel_bind_device = resource_cfg.get("tunnel_bind_device") default_queue = resource_cfg.get("default_queue") forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint") js_endpoint = resource_cfg.get("job_manager_endpoint") lrms = resource_cfg.get("lrms") mpi_launch_method = resource_cfg.get("mpi_launch_method") pre_bootstrap_1 = resource_cfg.get("pre_bootstrap_1") pre_bootstrap_2 = resource_cfg.get("pre_bootstrap_2") python_interpreter = resource_cfg.get("python_interpreter") spmd_variation = resource_cfg.get("spmd_variation") task_launch_method = resource_cfg.get("task_launch_method") rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION) virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE) virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV) stage_cacerts = resource_cfg.get("stage_cacerts", "False") cores_per_node = resource_cfg.get("cores_per_node") shared_filesystem = resource_cfg.get("shared_filesystem", True) health_check = resource_cfg.get("health_check", True) python_dist = resource_cfg.get("python_dist") cu_pre_exec = resource_cfg.get("cu_pre_exec") cu_post_exec = resource_cfg.get("cu_post_exec") export_to_cu = resource_cfg.get("export_to_cu") # Agent configuration that is not part of the public API. # The agent config can either be a config dict, or # a string pointing to a configuration name. If neither # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is # set. The last fallback is 'agent_default' agent_config = compute_pilot["description"].get("_config") if not agent_config: agent_config = os.environ.get("RADICAL_PILOT_AGENT_CONFIG") if not agent_config: agent_config = rc_agent_config if isinstance(agent_config, dict): # nothing to do agent_cfg_dict = agent_config pass elif isinstance(agent_config, basestring): try: if os.path.exists(agent_config): # try to open as file name logger.info("Read agent config file: %s" % agent_config) agent_cfg_dict = ru.read_json(agent_config) else: # otherwise interpret as a config name module_path = os.path.dirname(os.path.abspath(__file__)) config_path = "%s/../configs/" % module_path agent_cfg_file = os.path.join(config_path, "agent_%s.json" % agent_config) logger.info("Read agent config file: %s" % agent_cfg_file) agent_cfg_dict = ru.read_json(agent_cfg_file) # no matter how we read the config file, we # allow for user level overload cfg_base = os.path.basename(agent_cfg_file) user_cfg = "%s/.radical/pilot/config/%s" % (os.environ["HOME"], cfg_base) if os.path.exists(user_cfg): logger.info("merging user config: %s" % user_cfg) user_cfg_dict = ru.read_json(user_cfg) ru.dict_merge(agent_cfg_dict, user_cfg_dict, policy="overwrite") except Exception as e: logger.exception("Error reading agent config file: %s" % e) raise else: # we can't handle this type raise TypeError("agent config must be string (filename) or dict") # TODO: use booleans all the way? 
if stage_cacerts.lower() == "true": stage_cacerts = True else: stage_cacerts = False # expand variables in virtenv string virtenv = virtenv % { "pilot_sandbox": saga.Url(pilot_sandbox).path, "global_sandbox": saga.Url(global_sandbox).path, } # Check for deprecated global_virtenv global_virtenv = resource_cfg.get("global_virtenv") if global_virtenv: logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'") virtenv = global_virtenv virtenv_mode = "use" # Create a host:port string for use by the bootstrap_1. db_url = saga.Url(agent_dburl) if db_url.port: db_hostport = "%s:%d" % (db_url.host, db_url.port) else: db_hostport = "%s:%d" % (db_url.host, 27017) # mongodb default # Open the remote sandbox # TODO: make conditional on shared_fs? sandbox_tgt = saga.filesystem.Directory( pilot_sandbox, session=self._session, flags=saga.filesystem.CREATE_PARENTS ) LOCAL_SCHEME = "file" # ------------------------------------------------------ # Copy the bootstrap shell script. # This also creates the sandbox. BOOTSTRAPPER_SCRIPT = "bootstrap_1.sh" bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, BOOTSTRAPPER_SCRIPT)) msg = "Using bootstrapper %s" % bootstrapper_path logentries.append(Logentry(msg, logger=logger.info)) bs_script_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, bootstrapper_path)) msg = "Copying bootstrapper '%s' to agent sandbox (%s)." % (bs_script_url, sandbox_tgt) logentries.append(Logentry(msg, logger=logger.debug)) if shared_filesystem: sandbox_tgt.copy(bs_script_url, BOOTSTRAPPER_SCRIPT) # ------------------------------------------------------ # the version of the agent is derived from # rp_version, which has the following format # and interpretation: # # case rp_version: # @<token>: # @tag/@branch/@commit: # no sdist staging # git clone $github_base radical.pilot.src # (cd radical.pilot.src && git checkout token) # pip install -t $VIRTENV/rp_install/ radical.pilot.src # rm -rf radical.pilot.src # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # release: # no sdist staging # pip install -t $VIRTENV/rp_install radical.pilot # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # local: # needs sdist staging # tar zxf $sdist.tgz # pip install -t $VIRTENV/rp_install $sdist/ # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # debug: # needs sdist staging # tar zxf $sdist.tgz # pip install -t $SANDBOX/rp_install $sdist/ # export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH # # installed: # no sdist staging # true # esac # # virtenv_mode # private : error if ve exists, otherwise create, then use # update : update if ve exists, otherwise create, then use # create : use if ve exists, otherwise create, then use # use : use if ve exists, otherwise error, then exit # recreate: delete if ve exists, otherwise create, then use # # examples : # [email protected] # virtenv@devel # virtenv@release # virtenv@installed # stage@local # stage@/tmp/my_agent.py # # Note that some combinations may be invalid, # specifically in the context of virtenv_mode. If, for # example, virtenv_mode is 'use', then the 'virtenv:tag' # will not make sense, as the virtenv is not updated. # In those cases, the virtenv_mode is honored, and # a warning is printed. # # Also, the 'stage' mode can only be combined with the # 'local' source, or with a path to the agent (relative # to mod_dir, or absolute). # # A rp_version which does not adhere to the # above syntax is ignored, and the fallback stage@local # is used. 
if not rp_version.startswith("@") and not rp_version in ["installed", "local", "debug"]: raise ValueError("invalid rp_version '%s'" % rp_version) stage_sdist = True if rp_version in ["installed", "release"]: stage_sdist = False if rp_version.startswith("@"): stage_sdist = False rp_version = rp_version[1:] # strip '@' # ------------------------------------------------------ # Copy the rp sdist if needed. We actually also stage # the sdists for radical.utils and radical.saga, so that # we have the complete stack to install... if stage_sdist: for sdist_path in [ru.sdist_path, saga.sdist_path, rp_sdist_path]: sdist_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, sdist_path)) msg = "Copying sdist '%s' to sandbox (%s)." % (sdist_url, pilot_sandbox) logentries.append(Logentry(msg, logger=logger.debug)) if shared_filesystem: sandbox_tgt.copy(sdist_url, os.path.basename(str(sdist_url))) # ------------------------------------------------------ # Some machines cannot run pip due to outdated CA certs. # For those, we also stage an updated certificate bundle if stage_cacerts: cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz")) cc_url = saga.Url("%s://localhost/%s" % (LOCAL_SCHEME, cc_path)) msg = "Copying CA certificate bundle '%s' to sandbox (%s)." % (cc_url, pilot_sandbox) logentries.append(Logentry(msg, logger=logger.debug)) if shared_filesystem: sandbox_tgt.copy(cc_url, os.path.basename(str(cc_url))) # ------------------------------------------------------ # sanity checks if not python_dist: raise RuntimeError("missing python distribution") if not agent_spawner: raise RuntimeError("missing agent spawner") if not agent_scheduler: raise RuntimeError("missing agent scheduler") if not lrms: raise RuntimeError("missing LRMS") if not agent_launch_method: raise RuntimeError("missing agentlaunch method") if not task_launch_method: raise RuntimeError("missing task launch method") # massage some values if not queue: queue = default_queue if cleanup and isinstance(cleanup, bool): cleanup = "luve" # l : log files # u : unit work dirs # v : virtualenv # e : everything (== pilot sandbox) # # we never cleanup virtenvs which are not private if virtenv_mode is not "private": cleanup = cleanup.replace("v", "") sdists = ":".join([ru.sdist_name, saga.sdist_name, rp_sdist_name]) # if cores_per_node is set (!= None), then we need to # allocation full nodes, and thus round up if cores_per_node: cores_per_node = int(cores_per_node) number_cores = int(cores_per_node * math.ceil(float(number_cores) / cores_per_node)) # set mandatory args bootstrap_args = "" bootstrap_args += " -d '%s'" % sdists bootstrap_args += " -m '%s'" % virtenv_mode bootstrap_args += " -p '%s'" % pilot_id bootstrap_args += " -r '%s'" % rp_version bootstrap_args += " -s '%s'" % session_id bootstrap_args += " -v '%s'" % virtenv bootstrap_args += " -b '%s'" % python_dist # set optional args if agent_type: bootstrap_args += " -a '%s'" % agent_type if lrms == "CCM": bootstrap_args += " -c" if pre_bootstrap_1: bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap_1) if pre_bootstrap_2: bootstrap_args += " -w '%s'" % "' -w '".join(pre_bootstrap_2) if forward_tunnel_endpoint: bootstrap_args += " -f '%s'" % forward_tunnel_endpoint if forward_tunnel_endpoint: bootstrap_args += " -h '%s'" % db_hostport if python_interpreter: bootstrap_args += " -i '%s'" % python_interpreter if tunnel_bind_device: bootstrap_args += " -t '%s'" % tunnel_bind_device if cleanup: bootstrap_args += " -x '%s'" % cleanup # set some agent configuration 
agent_cfg_dict["cores"] = number_cores agent_cfg_dict["resource_cfg"] = resource_cfg agent_cfg_dict["debug"] = os.environ.get( "RADICAL_PILOT_AGENT_VERBOSE", logger.getEffectiveLevel() ) agent_cfg_dict["mongodb_url"] = str(agent_dburl) agent_cfg_dict["lrms"] = lrms agent_cfg_dict["spawner"] = agent_spawner agent_cfg_dict["scheduler"] = agent_scheduler agent_cfg_dict["runtime"] = runtime agent_cfg_dict["pilot_id"] = pilot_id agent_cfg_dict["session_id"] = session_id agent_cfg_dict["agent_launch_method"] = agent_launch_method agent_cfg_dict["task_launch_method"] = task_launch_method agent_cfg_dict["export_to_cu"] = export_to_cu agent_cfg_dict["cu_pre_exec"] = cu_pre_exec agent_cfg_dict["cu_post_exec"] = cu_post_exec if mpi_launch_method: agent_cfg_dict["mpi_launch_method"] = mpi_launch_method if cores_per_node: agent_cfg_dict["cores_per_node"] = cores_per_node # ------------------------------------------------------ # Write agent config dict to a json file in pilot sandbox. cfg_tmp_dir = tempfile.mkdtemp(prefix="rp_agent_cfg_dir") agent_cfg_name = "agent_0.cfg" cfg_tmp_file = os.path.join(cfg_tmp_dir, agent_cfg_name) cfg_tmp_handle = os.open(cfg_tmp_file, os.O_WRONLY | os.O_CREAT) # Convert dict to json file msg = "Writing agent configuration to file '%s'." % cfg_tmp_file logentries.append(Logentry(msg, logger=logger.debug)) ru.write_json(agent_cfg_dict, cfg_tmp_file) cf_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, cfg_tmp_file)) msg = "Copying agent configuration file '%s' to sandbox (%s)." % (cf_url, pilot_sandbox) logentries.append(Logentry(msg, logger=logger.debug)) if shared_filesystem: sandbox_tgt.copy(cf_url, agent_cfg_name) # Close agent config file os.close(cfg_tmp_handle) # ------------------------------------------------------ # Done with all transfers to pilot sandbox, close handle sandbox_tgt.close() # ------------------------------------------------------ # now that the scripts are in place and configured, # we can launch the agent js_url = saga.Url(js_endpoint) logger.debug("saga.job.Service ('%s')" % js_url) if js_url in self._shared_worker_data["job_services"]: js = self._shared_worker_data["job_services"][js_url] else: js = saga.job.Service(js_url, session=self._session) self._shared_worker_data["job_services"][js_url] = js # ------------------------------------------------------ # Create SAGA Job description and submit the pilot job jd = saga.job.Description() jd.executable = "/bin/bash" jd.arguments = ["-l %s" % BOOTSTRAPPER_SCRIPT, bootstrap_args] jd.working_directory = saga.Url(pilot_sandbox).path jd.project = project jd.output = "bootstrap_1.out" jd.error = "bootstrap_1.err" jd.total_cpu_count = number_cores jd.processes_per_host = cores_per_node jd.wall_time_limit = runtime jd.total_physical_memory = memory jd.queue = queue jd.candidate_hosts = candidate_hosts jd.environment = dict() # TODO: not all files might be required, this also needs to be made conditional if not shared_filesystem: jd.file_transfer = [ #'%s > %s' % (bootstrapper_path, os.path.basename(bootstrapper_path)), "%s > %s" % ( bootstrapper_path, os.path.join(jd.working_directory, "input", os.path.basename(bootstrapper_path)), ), "%s > %s" % (cfg_tmp_file, os.path.join(jd.working_directory, "input", agent_cfg_name)), #'%s < %s' % ('agent.log', os.path.join(jd.working_directory, 'agent.log')), #'%s < %s' % (os.path.join(jd.working_directory, 'agent.log'), 'agent.log'), #'%s < %s' % ('agent.log', 'agent.log'), #'%s < %s' % (os.path.join(jd.working_directory, 'STDOUT'), 'unit.000000/STDOUT'), #'%s < 
%s' % (os.path.join(jd.working_directory, 'unit.000000/STDERR'), 'STDERR') #'%s < %s' % ('unit.000000/STDERR', 'unit.000000/STDERR') # TODO: This needs to go into a per pilot directory on the submit node "%s < %s" % ("pilot.0000.log.tgz", "pilot.0000.log.tgz"), ] if stage_sdist: jd.file_transfer.extend( [ #'%s > %s' % (rp_sdist_path, os.path.basename(rp_sdist_path)), "%s > %s" % ( rp_sdist_path, os.path.join( jd.working_directory, "input", os.path.basename(rp_sdist_path) ), ), #'%s > %s' % (saga.sdist_path, os.path.basename(saga.sdist_path)), "%s > %s" % ( saga.sdist_path, os.path.join( jd.working_directory, "input", os.path.basename(saga.sdist_path) ), ), #'%s > %s' % (ru.sdist_path, os.path.basename(ru.sdist_path)), "%s > %s" % ( ru.sdist_path, os.path.join( jd.working_directory, "input", os.path.basename(ru.sdist_path) ), ), ] ) if stage_cacerts: jd.file_transfer.append( "%s > %s" % (cc_path, os.path.join(jd.working_directory, "input", os.path.basename(cc_path))) ) if "RADICAL_PILOT_PROFILE" in os.environ: # TODO: This needs to go into a per pilot directory on the submit node jd.file_transfer.append("%s < %s" % ("pilot.0000.prof.tgz", "pilot.0000.prof.tgz")) # Set the SPMD variation only if required if spmd_variation: jd.spmd_variation = spmd_variation if "RADICAL_PILOT_PROFILE" in os.environ: jd.environment["RADICAL_PILOT_PROFILE"] = "TRUE" logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments)) msg = "Submitting SAGA job with description: %s" % str(jd.as_dict()) logentries.append(Logentry(msg, logger=logger.debug)) try: pilotjob = js.create_job(jd) except saga.BadParameter as e: raise ValueError("Pilot submission to %s failed: %s" % (resource_key, e)) pilotjob.run() # Clean up agent config file and dir after submission os.unlink(cfg_tmp_file) os.rmdir(cfg_tmp_dir) # do a quick error check if pilotjob.state == saga.FAILED: raise RuntimeError("SAGA Job state is FAILED.") saga_job_id = pilotjob.id self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url] msg = "SAGA job submitted with job id %s" % str(saga_job_id) logentries.append(Logentry(msg, logger=logger.debug)) # # ------------------------------------------------------ log_dicts = list() for le in logentries: log_dicts.append(le.as_dict()) # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful. ts = time.time() ret = pilot_col.update( {"_id": pilot_id, "state": LAUNCHING}, { "$set": { "state": PENDING_ACTIVE, "saga_job_id": saga_job_id, "health_check_enabled": health_check, "agent_config": agent_cfg_dict, }, "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}}, "$pushAll": {"log": log_dicts}, }, ) if ret["n"] == 0: # could not update, probably because the agent is # running already. Just update state history and # jobid then # FIXME: make sure of the agent state! ret = pilot_col.update( {"_id": pilot_id}, { "$set": {"saga_job_id": saga_job_id, "health_check_enabled": health_check}, "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}}, "$pushAll": {"log": log_dicts}, }, ) except Exception as e: # Update the Pilot's state 'FAILED'. out, err, log = self._get_pilot_logs(pilot_col, pilot_id) ts = time.time() # FIXME: we seem to be unable to bson/json handle saga # log messages containing an '#'. This shows up here. # Until we find a clean workaround, make log shorter and # rely on saga logging to reveal the problem. msg = "Pilot launching failed! 
(%s)" % e logentries.append(Logentry(msg)) log_dicts = list() log_messages = list() for le in logentries: log_dicts.append(le.as_dict()) log_messages.append(str(le.message)) pilot_col.update( {"_id": pilot_id, "state": {"$ne": FAILED}}, { "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log}, "$push": {"statehistory": {"state": FAILED, "timestamp": ts}}, "$pushAll": {"log": log_dicts}, }, ) logger.exception("\n".join(log_messages)) except SystemExit as e: logger.exception("pilot launcher thread caught system exit -- forcing application shutdown") import thread thread.interrupt_main()
def __init__(self, agent_name): assert(agent_name == 'agent_0'), 'expect agent_0, not subagent' print 'startup agent %s' % agent_name # load config, create session, init rpu.Worker agent_cfg = '%s/%s.cfg' % (os.getcwd(), agent_name) cfg = ru.read_json_str(agent_cfg) cfg['agent_name'] = agent_name self._uid = agent_name self._pid = cfg['pilot_id'] self._sid = cfg['session_id'] self._runtime = cfg['runtime'] self._starttime = time.time() self._final_cause = None self._lrms = None # this better be on a shared FS! cfg['workdir'] = os.getcwd() # sanity check on config settings if not 'cores' in cfg: raise ValueError('Missing number of cores') if not 'lrms' in cfg: raise ValueError('Missing LRMS') if not 'dburl' in cfg: raise ValueError('Missing DBURL') if not 'pilot_id' in cfg: raise ValueError('Missing pilot id') if not 'runtime' in cfg: raise ValueError('Missing or zero agent runtime') if not 'scheduler' in cfg: raise ValueError('Missing agent scheduler') if not 'session_id' in cfg: raise ValueError('Missing session id') if not 'spawner' in cfg: raise ValueError('Missing agent spawner') if not 'task_launch_method' in cfg: raise ValueError('Missing unit launch method') # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold # the address of the tunnelized DB endpoint. If it exists, we # overrule the agent config with it. hostport = os.environ.get('RADICAL_PILOT_DB_HOSTPORT') if hostport: dburl = ru.Url(cfg['dburl']) dburl.host, dburl.port = hostport.split(':') cfg['dburl'] = str(dburl) # Create a session. # # This session will connect to MongoDB, and will also create any # communication channels and components/workers specified in the # config -- we merge that information into our own config. # We don't want the session to start components though, so remove them # from the config copy. session_cfg = copy.deepcopy(cfg) session_cfg['components'] = dict() session = rp_Session(cfg=session_cfg, uid=self._sid) # we still want the bridge addresses known though, so make sure they are # merged into our own copy, along with any other additions done by the # session. ru.dict_merge(cfg, session._cfg, ru.PRESERVE) pprint.pprint(cfg) if not session.is_connected: raise RuntimeError('agent_0 could not connect to mongodb') # at this point the session is up and connected, and it should have # brought up all communication bridges and the UpdateWorker. We are # ready to rumble! rpu.Worker.__init__(self, cfg, session) # this is the earlier point to sync bootstrapper and agent # profiles self._prof.prof('sync_rel', msg='agent_0 start', uid=self._pid) # Create LRMS which will give us the set of agent_nodes to use for # sub-agent startup. Add the remaining LRMS information to the # config, for the benefit of the scheduler). self._lrms = rpa_rm.RM.create(name=self._cfg['lrms'], cfg=self._cfg, session=self._session) # add the resource manager information to our own config self._cfg['lrms_info'] = self._lrms.lrms_info
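Note the policy flip in the session-config merge above: the agent keeps its own settings and only pulls in keys the session added (bridge addresses etc.), so ru.PRESERVE is used instead of an overwrite merge. A minimal sketch with made-up values:

import radical.utils as ru

cfg         = {'pilot_id': 'pilot.0000', 'runtime': 60}
session_cfg = {'runtime' : 30,
               'bridges' : {'update_queue': 'tcp://10.0.0.1:5000'}}

# PRESERVE keeps existing values ('runtime' stays 60) and only adds new keys
ru.dict_merge(cfg, session_cfg, ru.PRESERVE)

print(cfg['runtime'])       # 60
print('bridges' in cfg)     # True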
def get_config (params) : """ This method attempts to obtain configuration settings from a variety of sources, depending on the parameter. it can point to an env var, or to a directory containing configuration files, or to a single configuration file, or to a list of any above, or it is a config dict already, or a list of such dicts. In all cases, the config is obtained from the respective source (which is assumed json formatted in the case of config files), and a single merged and expanded dict is returned. """ ret = dict() # always make params list for simpler code below if not isinstance(params, list) : params = [params] for param in params : if not param or None == param : # we silently accept None's, to save some # repetetetetive checks on the calling side continue elif isinstance (param, dict) : # simply merge it into the result ru.dict_merge (ret, param, policy='overwrite') elif isinstance (param, basestring) : # check if the string points to an env variable if param in os.environ : # assume that the value of the env var is what we really want param = os.environ[param] # is string, is not env, must be a dir or a file if os.path.isdir (param) : # config dir cfg_files = glob.glob ("%s/*" % param) # print 'is dir %s/*' % param # print cfg_files elif os.path.isfile (param) : # single config file cfg_files = [param] else : troy._logger.warning ("cannot handle config location %s" % param) cfg_files = list() print 'files: %s' % cfg_files # read and merge all config files for cfg_file in cfg_files : cfg_dict = dict() try : cfg_dict = ru.read_json (cfg_file) troy._logger.info ("reading config in %s" % cfg_file) except Exception as e : troy._logger.critical ("skipping config in %s (%s)" % (cfg_file, e)) raise # import pprint # print '================' # print cfg_file # pprint.pprint (cfg_dict) # print '================' ru.dict_merge (ret, cfg_dict, policy='overwrite') else : raise TypeError ("get_config parameter must be (list of) dict or " "string, not %s" % type(param)) # print '================================' # pprint.pprint (ret) # print '================================' # expand config(s) before returning ru.dict_stringexpand (ret) return ret
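Since get_config accepts dicts, file paths, directories and environment variable names in one list, with later entries overwriting earlier ones, call sites stay uniform. A usage sketch, assuming the function above is importable and the referenced locations exist:

import os

cfg = get_config([
    {'log_level': 'ERROR'},                  # inline defaults
    '%s/.troy' % os.environ.get('HOME'),     # a directory of json config files
    'TROY_CONFIG',                           # resolved via os.environ, if set
])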
def __init__(self, database_url=None, database_name="radicalpilot", uid=None, name=None): """Creates a new or reconnects to an exising session. If called without a uid, a new Session instance is created and stored in the database. If uid is set, an existing session is retrieved from the database. **Arguments:** * **database_url** (`string`): The MongoDB URL. If none is given, RP uses the environment variable RADICAL_PILOT_DBURL. If that is not set, an error will be raises. * **database_name** (`string`): An alternative database name (default: 'radicalpilot'). * **uid** (`string`): If uid is set, we try re-connect to an existing session instead of creating a new one. * **name** (`string`): An optional human readable name. **Returns:** * A new Session instance. **Raises:** * :class:`radical.pilot.DatabaseError` """ # init the base class inits saga.Session.__init__(self) Object.__init__(self) # before doing anything else, set up the debug helper for the lifetime # of the session. self._debug_helper = ru.DebugHelper() # Dictionaries holding all manager objects created during the session. self._pilot_manager_objects = list() self._unit_manager_objects = list() # Create a new process registry. All objects belonging to this # session will register their worker processes (if they have any) # in this registry. This makes it easier to shut down things in # a more coordinate fashion. self._process_registry = _ProcessRegistry() # The resource configuration dictionary associated with the session. self._resource_configs = {} self._database_url = database_url self._database_name = database_name if not self._database_url: self._database_url = os.getenv("RADICAL_PILOT_DBURL", None) if not self._database_url: raise PilotException("no database URL (set RADICAL_PILOT_DBURL)") logger.info("using database url %s" % self._database_url) # if the database url contains a path element, we interpret that as # database name (without the leading slash) tmp_url = ru.Url(self._database_url) if tmp_url.path and \ tmp_url.path[0] == '/' and \ len(tmp_url.path) > 1 : self._database_name = tmp_url.path[1:] logger.info("using database path %s" % self._database_name) else: logger.info("using database name %s" % self._database_name) # Loading all "default" resource configurations module_path = os.path.dirname(os.path.abspath(__file__)) default_cfgs = "%s/configs/*.json" % module_path config_files = glob.glob(default_cfgs) for config_file in config_files: try: rcs = ResourceConfig.from_file(config_file) except Exception as e: logger.error("skip config file %s: %s" % (config_file, e)) continue for rc in rcs: logger.info("Loaded resource configurations for %s" % rc) self._resource_configs[rc] = rcs[rc].as_dict() user_cfgs = "%s/.radical/pilot/configs/*.json" % os.environ.get('HOME') config_files = glob.glob(user_cfgs) for config_file in config_files: try: rcs = ResourceConfig.from_file(config_file) except Exception as e: logger.error("skip config file %s: %s" % (config_file, e)) continue for rc in rcs: logger.info("Loaded resource configurations for %s" % rc) if rc in self._resource_configs: # config exists -- merge user config into it ru.dict_merge(self._resource_configs[rc], rcs[rc].as_dict(), policy='overwrite') else: # new config -- add as is self._resource_configs[rc] = rcs[rc].as_dict() default_aliases = "%s/configs/aliases.json" % module_path self._resource_aliases = ru.read_json_str(default_aliases)['aliases'] ########################## ## CREATE A NEW SESSION ## ########################## if uid is None: try: 
self._connected = None if name: self._name = name self._uid = name # self._uid = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM) else: self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE) self._name = self._uid self._dbs, self._created, self._connection_info = \ dbSession.new(sid = self._uid, name = self._name, db_url = self._database_url, db_name = database_name) logger.info("New Session created%s." % str(self)) except Exception, ex: logger.exception('session create failed') raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \ % (self._database_url, ex))
class Session(saga.Session): """A Session encapsulates a RADICAL-Pilot instance and is the *root* object for all other RADICAL-Pilot objects. A Session holds :class:`radical.pilot.PilotManager` and :class:`radical.pilot.UnitManager` instances which in turn hold :class:`radical.pilot.Pilot` and :class:`radical.pilot.ComputeUnit` instances. Each Session has a unique identifier :data:`radical.pilot.Session.uid` that can be used to re-connect to a RADICAL-Pilot instance in the database. **Example**:: s1 = radical.pilot.Session(database_url=DBURL) s2 = radical.pilot.Session(database_url=DBURL, uid=s1.uid) # s1 and s2 are pointing to the same session assert s1.uid == s2.uid """ #--------------------------------------------------------------------------- # def __init__(self, database_url=None, database_name=None, name=None): """Creates a new session. If called without a uid, a new Session instance is created and stored in the database. If uid is set, an existing session is retrieved from the database. **Arguments:** * **database_url** (`string`): The MongoDB URL. If none is given, RP uses the environment variable RADICAL_PILOT_DBURL. If that is not set, an error will be raises. * **database_name** (`string`): An alternative database name (default: 'radicalpilot'). * **uid** (`string`): If uid is set, we try re-connect to an existing session instead of creating a new one. * **name** (`string`): An optional human readable name. **Returns:** * A new Session instance. **Raises:** * :class:`radical.pilot.DatabaseError` """ logger = ru.get_logger('radical.pilot') if database_name: logger.error( "The 'database_name' parameter is deprecated - please specify an URL path" ) else: database_name = 'radicalpilot' # init the base class inits saga.Session.__init__(self) self._dh = ru.DebugHelper() self._valid = True self._terminate = threading.Event() self._terminate.clear() # before doing anything else, set up the debug helper for the lifetime # of the session. self._debug_helper = ru.DebugHelper() # Dictionaries holding all manager objects created during the session. self._pilot_manager_objects = dict() self._unit_manager_objects = dict() # The resource configuration dictionary associated with the session. self._resource_configs = {} if not database_url: database_url = os.getenv("RADICAL_PILOT_DBURL", None) if not database_url: raise PilotException("no database URL (set RADICAL_PILOT_DBURL)") self._dburl = ru.Url(database_url) # if the database url contains a path element, we interpret that as # database name (without the leading slash) if not self._dburl.path or \ self._dburl.path[0] != '/' or \ len(self._dburl.path) <= 1 : logger.error( "incomplete URLs are deprecated -- missing database name!") self._dburl.path = database_name # defaults to 'radicalpilot' logger.info("using database %s" % self._dburl) # ---------------------------------------------------------------------- # create new session try: if name: self._name = name self._uid = name # self._uid = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM) else: self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE) self._name = self._uid logger.report.info('<<create session %s' % self._uid) self._dbs = dbSession(sid=self._uid, name=self._name, dburl=self._dburl) self._dburl = self._dbs._dburl logger.info("New Session created: %s." 
% str(self)) except Exception, ex: logger.exception('session create failed') raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \ % (self._dburl, ex)) # initialize profiling self.prof = Profiler('%s' % self._uid) self.prof.prof('start session', uid=self._uid) # Loading all "default" resource configurations module_path = os.path.dirname(os.path.abspath(__file__)) default_cfgs = "%s/configs/resource_*.json" % module_path config_files = glob.glob(default_cfgs) for config_file in config_files: try: logger.info("Load resource configurations from %s" % config_file) rcs = ResourceConfig.from_file(config_file) except Exception as e: logger.error("skip config file %s: %s" % (config_file, e)) continue for rc in rcs: logger.info("Load resource configurations for %s" % rc) self._resource_configs[rc] = rcs[rc].as_dict() user_cfgs = "%s/.radical/pilot/configs/resource_*.json" % os.environ.get( 'HOME') config_files = glob.glob(user_cfgs) for config_file in config_files: try: rcs = ResourceConfig.from_file(config_file) except Exception as e: logger.error("skip config file %s: %s" % (config_file, e)) continue for rc in rcs: logger.info("Load resource configurations for %s" % rc) if rc in self._resource_configs: # config exists -- merge user config into it ru.dict_merge(self._resource_configs[rc], rcs[rc].as_dict(), policy='overwrite') else: # new config -- add as is self._resource_configs[rc] = rcs[rc].as_dict() default_aliases = "%s/configs/resource_aliases.json" % module_path self._resource_aliases = ru.read_json_str(default_aliases)['aliases'] self.prof.prof('configs parsed', uid=self._uid) _rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION') if _rec: self._rec = "%s/%s" % (_rec, self._uid) os.system('mkdir -p %s' % self._rec) ru.write_json({'dburl': str(self._dburl)}, "%s/session.json" % self._rec) logger.info("recording session in %s" % self._rec) else: self._rec = None logger.report.ok('>>ok\n')
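The resource-config loading in this Session follows the same layering as the earlier variants: shipped defaults first, then user files merged over them per resource key. A self-contained sketch with plain dicts standing in for ResourceConfig objects (resource name and values are illustrative):

import radical.utils as ru

# shipped default for one resource
default_cfg = {'xsede.stampede2': {'default_queue' : 'normal',
                                   'cores_per_node': 68}}

# user override, as it would come from ~/.radical/pilot/configs/resource_*.json
user_cfg    = {'xsede.stampede2': {'default_queue' : 'development'}}

resource_configs = {}
for rc in default_cfg:
    resource_configs[rc] = dict(default_cfg[rc])

for rc in user_cfg:
    if rc in resource_configs:
        # config exists -- merge user config into it
        ru.dict_merge(resource_configs[rc], user_cfg[rc], policy='overwrite')
    else:
        # new config -- add as is
        resource_configs[rc] = user_cfg[rc]

print(resource_configs['xsede.stampede2']['default_queue'])     # 'development'
print(resource_configs['xsede.stampede2']['cores_per_node'])    # 68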
#     {
#         "*.futuregrid.org" :
#         {
#             "username" : "merzky"
#         }
#     }
# }

USER_CONFIG_PATH = os.environ.get ('HOME', '/tmp') + '/.my_app.cfg'

# load the user config, and merge it with the default config
user_config = ru.read_json_str (USER_CONFIG_PATH)

# merge the user config into the app config, so that the user config keys are
# applied where appropriate
ru.dict_merge (app_config, user_config, policy='overwrite', wildcards=True)

# lets see what we got
pprint.pprint (app_config)

# this should result in :
#
# {
#     'log_level'   : 0,
#     'scheduler'   : 'rp.SCHED_BACKFILLING',
#     'resources'   : ['india.furturegrid.org', 'sierra.futuregrid.org'],
#     'resource_cfg':
#     {
#         '*.futuregrid.org':