    def merge_description (self, source) :
        """
        merge additional information into the unit description -- such as
        resource information, or application specific data
        """

      # print 'merging  unit %s' % self.id
      # print '         with %s' % source
      # print '          and %s' % self.as_dict()

        # we only allow this in DESCRIBED or BOUND state
        if  not self.state in [DESCRIBED, BOUND] :
            raise RuntimeError ('unit is not in DESCRIBED or BOUND state (%s)' \
                             % self.state)

        ud_dict = self.as_dict ()

        ru.dict_merge        (ud_dict, source, policy='overwrite')
        ru.dict_stringexpand (ud_dict)
        ru.dict_stringexpand (ud_dict, self.session.cfg)

      # print '-------------'
      # import pprint
      # pprint.pprint (ud_dict)
      # print '-------------'
      # pprint.pprint (self.session.cfg)
      # print '-------------'
      # exit()

        for (key, val) in ud_dict.iteritems () :
            try :
                self.set_attribute (key, val)
            except :
                pass
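
# -----------------------------------------------------------------------------
# A minimal standalone sketch (illustration only, assuming radical.utils is
# importable as `ru`) of the two calls used in merge_description() above:
# dict_merge with policy='overwrite' replaces values for keys present in both
# dicts, and dict_stringexpand fills '%(key)s' style placeholders in string
# values from a source dict.  The placeholder syntax and the keys below are
# assumptions made for illustration.
import radical.utils as ru

desc = {'name'    : 'unit.0001',
        'sandbox' : '%(workdir)s/unit.0001',
        'cores'   : 1}

ru.dict_merge        (desc, {'cores': 4}, policy='overwrite')  # shared key overwritten
ru.dict_stringexpand (desc, {'workdir': '/tmp/run'})           # fill placeholder from source

assert desc['cores'] == 4
# desc['sandbox'] should now read '/tmp/run/unit.0001' if the placeholder expanded
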
Example #2
    def stop(self):

        # During ResourceManager termination, we call any existing shutdown hooks on the
        # launch methods.  We only call LaunchMethod shutdown hooks *once*
        launch_methods = set()
        launch_methods.add(self._cfg.get('mpi_launch_method'))
        launch_methods.add(self._cfg.get('task_launch_method'))
        launch_methods.add(self._cfg.get('agent_launch_method'))

        launch_methods.discard(None)

        for lm in launch_methods:
            try:
                ru.dict_merge(
                    self.lm_info,
                    rpa.LaunchMethod.rm_shutdown_hook(name=lm,
                                                      cfg=self._cfg,
                                                      rm=self,
                                                      lm_info=self.lm_info,
                                                      log=self._log,
                                                      profiler=self._prof))
            except Exception as e:
                self._log.exception(
                    "ResourceManager shutdown hook failed: %s" % e)
                raise

            self._log.info("ResourceManager shutdown hook succeeded (%s)" % lm)
Example #3
    def _load_resource_configs(self):

        self.is_valid()

        self._prof.prof('config_parser_start', uid=self._uid)

        # Loading all "default" resource configurations
        module_path  = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/resource_*.json" % module_path
        config_files = glob.glob(default_cfgs)

        for config_file in config_files:

            try:
                self._log.info("Load resource configurations from %s" % config_file)
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                self._log.exception("skip config file %s: %s" % (config_file, e))
                raise RuntimeError('config error (%s) - abort' % e)

            for rc in rcs:
                self._log.info("Load resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict() 

        home         = os.environ.get('HOME', '')
        user_cfgs    = "%s/.radical/pilot/configs/resource_*.json" % home
        config_files = glob.glob(user_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                self._log.exception("skip config file %s: %s" % (config_file, e))
                raise RuntimeError('config error (%s) - abort' % e)

            for rc in rcs:
                self._log.info("Load resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict() 

        default_aliases = "%s/configs/resource_aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        # check if we have aliases to merge
        usr_aliases = '%s/.radical/pilot/configs/resource_aliases.json' % home
        if os.path.isfile(usr_aliases):
            ru.dict_merge(self._resource_aliases,
                          ru.read_json_str(usr_aliases).get('aliases', {}),
                          policy='overwrite')

        self._prof.prof('config_parser_stop', uid=self._uid)
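
# -----------------------------------------------------------------------------
# A standalone sketch (illustration only, made-up keys) of the layering used
# above: module-level defaults are loaded first, then any per-user config is
# merged on top with policy='overwrite', so user values win for shared keys
# while untouched defaults survive.
import radical.utils as ru

default_cfg = {'queue': 'normal', 'cores_per_node': 16}
user_cfg    = {'queue': 'debug'}

ru.dict_merge(default_cfg, user_cfg, policy='overwrite')

assert default_cfg == {'queue': 'debug', 'cores_per_node': 16}
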
Example #4
    def __init__ (self, user_cfg=None, default=True) :

        # accept any number of user configs
        if  not isinstance (user_cfg, list) :
            user_cfg = [user_cfg]


        # set saga apitype for clean inheritance (cpi to api mapping relies on
        # _apitype)
        self._apitype = 'saga.Session'

        resource_cfg = "%s/resources.json" % os.path.dirname (troy.__file__)
        config_dir   = "%s/.troy"          % os.environ.get  ('HOME', '/etc/')
        config_env   = "%s"                % os.environ.get  ('TROY_CONFIG', None)

        # we read our base config from $HOME/troy/* by default, but also accept
        # other locations if $TROY_CONFIG is set.  Items later in the list below
        # overwrite earlier ones.
        self.cfg = tu.get_config ([_config_skeleton,
                                   resource_cfg    , 
                                   config_dir      ,
                                   config_env      ] + user_cfg)

        # make sure that the resource sections in the config have the minimal
        # set of entries
        for res_name in self.cfg['resources'] :
            ru.dict_merge (self.cfg['resources'][res_name], 
                           _resource_config_skeleton, 
                           policy='preserve', 
                           logger=troy._logger)


        # we set the log level as indicated in the troy config or user
        # config, fallback being log level ERROR
        log_level = 'ERROR'
        log_level = self.cfg.get   ('log_level',    log_level)
        log_level = os.environ.get ('TROY_VERBOSE', log_level)
        troy._logger.setLevel (log_level)


        # now that config parsing is done, we can create the session ID
        session_id_stub = self.cfg.get  ("session_id", 'session.')
        self.id         = ru.generate_id (session_id_stub, mode=ru.ID_UNIQUE)
        troy._logger.info ("session id: %s" % self.id)
        
        # and initialize the inherited saga session
        tu.Timed.__init__ (self, 'troy.Session', self.id)
        self.timed_method ('saga.Session', ['init'],  
                           saga.Session.__init__, [self, default])
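
# -----------------------------------------------------------------------------
# A standalone sketch (illustration only, made-up keys) of the skeleton merge
# above: with policy='preserve' the existing entries of a resource section are
# kept, and only the keys missing from it are filled in from the skeleton.
import radical.utils as ru

resource_entry    = {'queue'   : 'debug'}
resource_skeleton = {'queue'   : 'normal',
                     'walltime': 60}

ru.dict_merge(resource_entry, resource_skeleton, policy='preserve')

assert resource_entry == {'queue': 'debug', 'walltime': 60}
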
Example #5
    def get_resource_config (self, resource) :

        # resources may be in fact URLs -- but resource configs use host
        # names as keys.  So we check if the URL is well formed and attempt
        # to extract the host
        # FIXME: cache results, URL parsing is expensive
        try :
            resource_url = saga.Url (resource)

            # a plain string like 'india.futuregrid.org' is parsed as the
            # URL's path element, not as its host -- so only use the host if
            # one was actually found.
            if  resource_url.host :
                resource = resource_url.host

        except saga.SagaException as e :
            pass # probably not a URL :P


        resource_cfg = self.get_config ('resources')

        # default to a copy of the resource config skeleton
        troy._logger.debug ('create resource config for %s' % resource)
        ret = dict (_resource_config_skeleton)

        # check if have a match with one of the wildcards.
        for resource_key in resource_cfg.keys () :
            if  '*' in resource_key :
                resource_pattern = re.compile (fnmatch.translate (resource_key))
                if  resource_pattern.match (resource):
                    troy._logger.debug ('merge resource pattern %s for %s' \
                                     % (resource_key, resource))
                    ru.dict_merge (ret, resource_cfg[resource_key],
                                   policy='overwrite', 
                                   logger=troy._logger)

        # check if we have an exact match for the resource name.  This
        # supersedes the wildcard entries.

        if  resource in resource_cfg :
            troy._logger.debug ('merge resource config for %s' % resource)
            ru.dict_merge (ret, resource_cfg[resource],
                           policy='overwrite',
                           logger=troy._logger)

        # make sure the hostname is in the config
        ret['hostname'] = resource

        return ret
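
# -----------------------------------------------------------------------------
# A standalone sketch (illustration only, made-up entries) of the lookup order
# above: wildcard keys are matched via fnmatch/re and merged first, then an
# exact hostname entry is merged with policy='overwrite', so it supersedes the
# wildcard values.
import re
import fnmatch
import radical.utils as ru

resource_cfg = {'*.futuregrid.org'     : {'queue' : 'batch', 'schema': 'ssh'},
                'india.futuregrid.org' : {'queue' : 'long'}}
resource     = 'india.futuregrid.org'
ret          = dict()

for key in resource_cfg:
    if '*' in key and re.compile(fnmatch.translate(key)).match(resource):
        ru.dict_merge(ret, resource_cfg[key], policy='overwrite')

if resource in resource_cfg:
    ru.dict_merge(ret, resource_cfg[resource], policy='overwrite')

assert ret == {'queue': 'long', 'schema': 'ssh'}
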
Example #6
    def merge_description (self, source) :
        """
        merge additional information into the pilot description -- such as
        resource information, or application specific data
        """

        # we only allow this in DESCRIBED or BOUND state
        if  not self.state in [DESCRIBED, BOUND] :
            raise RuntimeError ('pilot is not in DESCRIBED or BOUND state (%s)' \
                             % self.state)

        pd_dict = self.description.as_dict ()
        ru.dict_merge        (pd_dict, source, policy='overwrite')
        ru.dict_stringexpand (pd_dict)
        ru.dict_stringexpand (pd_dict, self.session.cfg)

        self.description = troy.PilotDescription (pd_dict)
Example #7
def test_dict_merge () :

    dict_1 = {'key_shared' : 'val_shared_1', 
              'key_orig_1' : 'val_orig_1'}
    dict_2 = {'key_shared' : 'val_shared_2', 
              'key_orig_2' : 'val_orig_2'}

    try :
        ru.dict_merge (dict_1, dict_2)
        assert (False), 'expected ValueError exception'
    except ValueError :
        pass
    except Exception as e :
        assert (False), 'expected ValueError exception, not %s' % e


    ru.dict_merge (dict_1, dict_2, policy='preserve')

    assert (sorted(dict_1.keys()) == ['key_orig_1', 'key_orig_2', 'key_shared'])
    assert (dict_1['key_shared'] == 'val_shared_1')
    assert (dict_1['key_orig_1'] == 'val_orig_1')
    assert (dict_1['key_orig_2'] == 'val_orig_2')


    ru.dict_merge (dict_1, dict_2, policy='overwrite')

    assert (sorted(dict_1.keys()) == ['key_orig_1', 'key_orig_2', 'key_shared'])
    assert (dict_1['key_shared'] == 'val_shared_2')
    assert (dict_1['key_orig_1'] == 'val_orig_1')
    assert (dict_1['key_orig_2'] == 'val_orig_2')
Example #8
    def stop(self):

        # During LRMS termination, we call any existing shutdown hooks on the
        # launch methods.  We only call LM shutdown hooks *once*
        launch_methods = set() # set keeps entries unique
        if 'mpi_launch_method' in self._cfg:
            launch_methods.add(self._cfg['mpi_launch_method'])
        launch_methods.add(self._cfg['task_launch_method'])
        launch_methods.add(self._cfg['agent_launch_method'])

        for lm in launch_methods:
            if lm:
                try:
                    from .... import pilot as rp
                    ru.dict_merge(
                        self.lm_info,
                        rp.agent.LM.lrms_shutdown_hook(
                            lm, self._cfg, self, self.lm_info, self._log))
                except Exception as e:
                    self._log.exception("lrms shutdown hook failed")
                    raise

                self._log.info("lrms shutdown hook succeeded (%s)" % lm)
Example #9
    def __init__(self, agent_name):

        assert(agent_name != 'agent_0'), 'expect subagent, not agent_0'
        print "startup agent %s" % agent_name

        # load config, create session and controller, init rpu.Worker
        agent_cfg  = "%s/%s.cfg" % (os.getcwd(), agent_name)
        cfg        = ru.read_json_str(agent_cfg)

        self._uid         = agent_name
        self._pid         = cfg['pilot_id']
        self._sid         = cfg['session_id']
        self._final_cause = None

        # Create a session.  
        #
        # This session will not connect to MongoDB, but will create any
        # communication channels and components/workers specified in the 
        # config -- we merge that information into our own config.
        # We don't want the session to start components though, so remove them
        # from the config copy.
        session_cfg = copy.deepcopy(cfg)
        session_cfg['owner']      = self._uid
        session_cfg['components'] = dict()
        session = rp_Session(cfg=session_cfg, _connect=False, uid=self._sid)

        # we still want the bridge addresses known though, so make sure they are
        # merged into our own copy, along with any other additions done by the
        # session.
        ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
        pprint.pprint(cfg)

        if session.is_connected:
            raise RuntimeError('agent_n should not connect to mongodb')

        # at this point the session is up and working, and the session
        # controller should have brought up all communication bridges and the
        # agent components.  We are ready to roll!
        rpu.Worker.__init__(self, cfg, session)
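
# -----------------------------------------------------------------------------
# A standalone sketch (illustration only, made-up keys and addresses) of the
# merge above: ru.PRESERVE keeps every key the agent config already has and
# only adds what the session contributed, e.g. the bridge addresses.
import radical.utils as ru

cfg         = {'owner'   : 'agent.1'}
session_cfg = {'owner'   : 'session.0',
               'bridges' : {'control_pubsub': 'tcp://127.0.0.1:10001'}}

ru.dict_merge(cfg, session_cfg, ru.PRESERVE)

assert cfg['owner'] == 'agent.1'      # local value preserved
assert 'bridges' in cfg               # session addition pulled in
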
Example #10
    def stop(self):

        # During LRMS termination, we call any existing shutdown hooks on the
        # launch methods.  We only call LM shutdown hooks *once*
        launch_methods = set() # set keeps entries unique
        if 'mpi_launch_method' in self._cfg:
            launch_methods.add(self._cfg['mpi_launch_method'])
        launch_methods.add(self._cfg['task_launch_method'])
        launch_methods.add(self._cfg['agent_launch_method'])

        for lm in launch_methods:
            if lm:
                try:
                    from .... import pilot as rp
                    ru.dict_merge(
                        self.lm_info,
                        rp.agent.LM.lrms_shutdown_hook(
                            lm, self._cfg, self, self.lm_info, self._log,
                            self._prof))
                except Exception as e:
                    self._log.exception("lrms shutdown hook failed")
                    raise

                self._log.info("lrms shutdown hook succeeded (%s)" % lm)
Example #11
    def stop(self):

        # During LRMS termination, we call any existing shutdown hooks on the
        # launch methods.  We only call LM shutdown hooks *once*
        launch_methods = set()
        launch_methods.add(self._cfg.get('mpi_launch_method'))
        launch_methods.add(self._cfg.get('task_launch_method'))
        launch_methods.add(self._cfg.get('agent_launch_method'))

        launch_methods.discard(None)

        for lm in launch_methods:
            try:
                from .... import pilot as rp
                ru.dict_merge(
                    self.lm_info,
                    rp.agent.LM.lrms_shutdown_hook(
                        lm, self._cfg, self, self.lm_info, self._log,
                        self._prof))
            except Exception as e:
                self._log.exception("lrms shutdown hook failed: %s" % e)
                raise

            self._log.info("lrms shutdown hook succeeded (%s)" % lm)
Example #12
    def _write_sa_configs(self):

        # we have all information needed by the subagents -- write the
        # sub-agent config files.

        # write deep-copies of the config for each sub-agent (except agent_0)
        for sa in self._cfg.get('agents', {}):

            assert(sa != 'agent_0'), 'expect subagent, not agent_0'

            # use our own config sans agents/components as a basis for
            # the sub-agent config.
            tmp_cfg = copy.deepcopy(self._cfg)
            tmp_cfg['agents']     = dict()
            tmp_cfg['components'] = dict()

            # merge sub_agent layout into the config
            ru.dict_merge(tmp_cfg, self._cfg['agents'][sa], ru.OVERWRITE)

            tmp_cfg['agent_name'] = sa
            tmp_cfg['owner']      = 'agent_0'

            ru.write_json(tmp_cfg, './%s.cfg' % sa)
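
# -----------------------------------------------------------------------------
# A standalone sketch (illustration only, made-up layout) of the sub-agent
# config generation above: start from a deep copy of the main config, blank
# the parts a sub-agent must not inherit, then merge the per-agent layout on
# top with ru.OVERWRITE.
import copy
import radical.utils as ru

cfg = {'agents'     : {'agent_1': {'target'    : 'node',
                                   'components': {'exec_component': 1}}},
       'components' : {'update_worker': 1},
       'runtime'    : 60}

tmp_cfg = copy.deepcopy(cfg)
tmp_cfg['agents']     = dict()
tmp_cfg['components'] = dict()

ru.dict_merge(tmp_cfg, cfg['agents']['agent_1'], ru.OVERWRITE)

assert tmp_cfg['components'] == {'exec_component': 1}
assert tmp_cfg['runtime']    == 60
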
Example #13
    def _get_config(self, cfg=None):
        '''
        derive a worker base configuration from the control pubsub configuration
        '''

        # FIXME: this uses insider knowledge on the config location and
        #        structure.  It would be better if agent.0 creates the worker
        #        base config from scratch on startup.

        pwd = os.getcwd()
        ru.dict_merge(cfg, ru.read_json('%s/../control_pubsub.json' % pwd))

        del (cfg['channel'])
        del (cfg['cmgr'])

        cfg['log_lvl'] = 'debug'
        cfg['kind'] = 'master'
        cfg['base'] = pwd
        cfg['uid'] = ru.generate_id('master.%(item_counter)06d',
                                    ru.ID_CUSTOM,
                                    ns=self._session.uid)

        return ru.Config(cfg=cfg)
Example #14
    def _write_sa_configs(self):

        # we have all information needed by the subagents -- write the
        # sub-agent config files.

        # write deep-copies of the config for each sub-agent (except agent_0)
        for sa in self._cfg.get('agents', {}):

            assert(sa != 'agent_0'), 'expect subagent, not agent_0'

            # use our own config sans agents/components as a basis for
            # the sub-agent config.
            tmp_cfg = copy.deepcopy(self._cfg)
            tmp_cfg['agents']     = dict()
            tmp_cfg['components'] = dict()

            # merge sub_agent layout into the config
            ru.dict_merge(tmp_cfg, self._cfg['agents'][sa], ru.OVERWRITE)

            tmp_cfg['agent_name'] = sa
            tmp_cfg['owner']      = 'agent_0'

            ru.write_json(tmp_cfg, './%s.cfg' % sa)
Example #15
def setUp(test_type, test_name):

    ret = list()
    for fin in glob.glob('tests/test_cases/unit.*.json'):

        tc = ru.read_json(fin)
        unit = tc['unit']
        setup = tc['setup'].get(test_type, {})
        result = tc['results'].get(test_type, {}).get(test_name)
        resource_file = tc['results'].get('resource_file', {}).get(test_name)
        resource_filename = tc['results'].get('resource_filename',
                                              {}).get(test_name)
        test = ru.dict_merge(unit, setup, ru.PRESERVE)

        if result:
            if resource_file and resource_filename:
                ret.append([test, result, resource_file, resource_filename])
            else:
                ret.append([test, result])

    return ret
Example #16
    def __init__(self, agent_name):

        assert(agent_name == 'agent_0'), 'expect agent_0, not subagent'
        print 'startup agent %s' % agent_name

        # load config, create session, init rpu.Worker
        agent_cfg  = '%s/%s.cfg' % (os.getcwd(), agent_name)
        cfg        = ru.read_json_str(agent_cfg)

        cfg['agent_name'] = agent_name

        self._uid         = agent_name
        self._pid         = cfg['pilot_id']
        self._sid         = cfg['session_id']
        self._runtime     = cfg['runtime']
        self._starttime   = time.time()
        self._final_cause = None
        self._lrms        = None

        # this better be on a shared FS!
        cfg['workdir']    = os.getcwd()

        # sanity check on config settings
        if 'cores'               not in cfg: raise ValueError('Missing number of cores')
        if 'lrms'                not in cfg: raise ValueError('Missing LRMS')
        if 'dburl'               not in cfg: raise ValueError('Missing DBURL')
        if 'pilot_id'            not in cfg: raise ValueError('Missing pilot id')
        if 'runtime'             not in cfg: raise ValueError('Missing or zero agent runtime')
        if 'scheduler'           not in cfg: raise ValueError('Missing agent scheduler')
        if 'session_id'          not in cfg: raise ValueError('Missing session id')
        if 'spawner'             not in cfg: raise ValueError('Missing agent spawner')
        if 'task_launch_method'  not in cfg: raise ValueError('Missing unit launch method')

        # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold
        # the address of the tunnelized DB endpoint. If it exists, we
        # override the agent config with it.
        hostport = os.environ.get('RADICAL_PILOT_DB_HOSTPORT')
        if hostport:
            dburl = ru.Url(cfg['dburl'])
            dburl.host, dburl.port = hostport.split(':')
            cfg['dburl'] = str(dburl)

        # Create a session.
        #
        # This session will connect to MongoDB, and will also create any
        # communication channels and components/workers specified in the
        # config -- we merge that information into our own config.
        # We don't want the session to start components though, so remove them
        # from the config copy.        
        session_cfg = copy.deepcopy(cfg)
        session_cfg['components'] = dict()
        session = rp_Session(cfg=session_cfg, uid=self._sid)

        # we still want the bridge addresses known though, so make sure they are
        # merged into our own copy, along with any other additions done by the
        # session.
        ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
        pprint.pprint(cfg)

        if not session.is_connected:
            raise RuntimeError('agent_0 could not connect to mongodb')

        # at this point the session is up and connected, and it should have
        # brought up all communication bridges and the UpdateWorker.  We are
        # ready to rumble!
        rpu.Worker.__init__(self, cfg, session)

        # this is the earliest point to sync bootstrapper and agent profiles
        self._prof.prof('sync_rel', msg='agent_0 start', uid=self._pid)

        # Create LRMS which will give us the set of agent_nodes to use for
        # sub-agent startup.  Add the remaining LRMS information to the
        # config, for the benefit of the scheduler.
        self._lrms = rpa_rm.RM.create(name=self._cfg['lrms'], cfg=self._cfg,
                                      session=self._session)

        # add the resource manager information to our own config
        self._cfg['lrms_info'] = self._lrms.lrms_info
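
# -----------------------------------------------------------------------------
# A standalone sketch (illustration only, made-up URLs) of the tunnel override
# above: when RADICAL_PILOT_DB_HOSTPORT is set, its host:port replaces the
# host and port of the configured MongoDB URL, while scheme, path and other
# URL parts are kept.
import radical.utils as ru

dburl    = ru.Url('mongodb://db.example.org:27017/rp_session')
hostport = 'localhost:27777'                    # tunneled endpoint

dburl.host, dburl.port = hostport.split(':')

assert dburl.host == 'localhost'
# str(dburl) now points at the tunnel endpoint instead of db.example.org
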
Example #17
    def _prepare_pilot(self, resource, rcfg, pilot):

        pid = pilot["uid"]
        ret = {'ft' : list(),
               'jd' : None  }

      # # ----------------------------------------------------------------------
      # # the rcfg can contain keys with string expansion placeholders where
      # # values from the pilot description need filling in.  A prominent
      # # example is `%(pd.project)s`, where the pilot description's `PROJECT`
      # # value needs to be filled in (here in lowercase).
      # expand = dict()
      # for k,v in pilot['description'].iteritems():
      #     if v is None:
      #         v = ''
      #     expand['pd.%s' % k] = v
      #     if isinstance(v, basestring):
      #         expand['pd.%s' % k.upper()] = v.upper()
      #         expand['pd.%s' % k.lower()] = v.lower()
      #     else:
      #         expand['pd.%s' % k.upper()] = v
      #         expand['pd.%s' % k.lower()] = v
      #
      # for k in rcfg:
      #     if isinstance(rcfg[k], basestring):
      #         orig     = rcfg[k]
      #         rcfg[k]  = rcfg[k] % expand
      #         expanded = rcfg[k]
      #         if orig != expanded:
      #             self._log.debug('RCFG:\n%s\n%s', orig, expanded)

        # ----------------------------------------------------------------------
        # Database connection parameters
        sid           = self._session.uid
        database_url  = self._session.dburl

        # some default values are determined at runtime
        default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
                          (resource, self._rp_version)

        # ----------------------------------------------------------------------
        # pilot description and resource configuration
        number_cores    = pilot['description']['cores']
        number_gpus     = pilot['description']['gpus']
        runtime         = pilot['description']['runtime']
        queue           = pilot['description']['queue']
        project         = pilot['description']['project']
        cleanup         = pilot['description']['cleanup']
        memory          = pilot['description']['memory']
        candidate_hosts = pilot['description']['candidate_hosts']

        # ----------------------------------------------------------------------
        # get parameters from resource cfg, set defaults where needed
        agent_launch_method     = rcfg.get('agent_launch_method')
        agent_dburl             = rcfg.get('agent_mongodb_endpoint', database_url)
        agent_spawner           = rcfg.get('agent_spawner',       DEFAULT_AGENT_SPAWNER)
        rc_agent_config         = rcfg.get('agent_config',        DEFAULT_AGENT_CONFIG)
        agent_scheduler         = rcfg.get('agent_scheduler')
        tunnel_bind_device      = rcfg.get('tunnel_bind_device')
        default_queue           = rcfg.get('default_queue')
        forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint')
        lrms                    = rcfg.get('lrms')
        mpi_launch_method       = rcfg.get('mpi_launch_method', '')
        pre_bootstrap_0         = rcfg.get('pre_bootstrap_0', [])
        pre_bootstrap_1         = rcfg.get('pre_bootstrap_1', [])
        python_interpreter      = rcfg.get('python_interpreter')
        task_launch_method      = rcfg.get('task_launch_method')
        rp_version              = rcfg.get('rp_version',          DEFAULT_RP_VERSION)
        virtenv_mode            = rcfg.get('virtenv_mode',        DEFAULT_VIRTENV_MODE)
        virtenv                 = rcfg.get('virtenv',             default_virtenv)
        cores_per_node          = rcfg.get('cores_per_node', 0)
        gpus_per_node           = rcfg.get('gpus_per_node',  0)
        lfs_path_per_node       = rcfg.get('lfs_path_per_node', None)
        lfs_size_per_node       = rcfg.get('lfs_size_per_node',  0)
        python_dist             = rcfg.get('python_dist')
        virtenv_dist            = rcfg.get('virtenv_dist',        DEFAULT_VIRTENV_DIST)
        cu_tmp                  = rcfg.get('cu_tmp')
        spmd_variation          = rcfg.get('spmd_variation')
        shared_filesystem       = rcfg.get('shared_filesystem', True)
        stage_cacerts           = rcfg.get('stage_cacerts', False)
        cu_pre_exec             = rcfg.get('cu_pre_exec')
        cu_post_exec            = rcfg.get('cu_post_exec')
        export_to_cu            = rcfg.get('export_to_cu')
        mandatory_args          = rcfg.get('mandatory_args', [])
        saga_jd_supplement      = rcfg.get('saga_jd_supplement', {})

        import pprint
        self._log.debug(cores_per_node)
        self._log.debug(pprint.pformat(rcfg))

        # make sure that mandatory args are known
        for ma in mandatory_args:
            if pilot['description'].get(ma) is None:
                raise  ValueError('attribute "%s" is required for "%s"'
                                 % (ma, resource))

        # get pilot and global sandbox
        resource_sandbox = self._session._get_resource_sandbox (pilot).path
        session_sandbox  = self._session._get_session_sandbox(pilot).path
        pilot_sandbox    = self._session._get_pilot_sandbox  (pilot).path

        pilot['resource_sandbox'] = str(self._session._get_resource_sandbox(pilot))
        pilot['pilot_sandbox']    = str(self._session._get_pilot_sandbox(pilot))
        pilot['client_sandbox']   = str(self._session._get_client_sandbox())

        # Agent configuration that is not part of the public API.
        # The agent config can either be a config dict, or
        # a string pointing to a configuration name.  If neither
        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
        # set.  The last fallback is 'agent_default'
        agent_config = pilot['description'].get('_config')
        if not agent_config:
            agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG')
        if not agent_config:
            agent_config = rc_agent_config

        if isinstance(agent_config, dict):

            # use dict as is
            agent_cfg = agent_config

        elif isinstance(agent_config, basestring):
            try:
                # interpret as a config name
                agent_cfg_file = os.path.join(self._conf_dir, "agent_%s.json" % agent_config)

                self._log.info("Read agent config file: %s",  agent_cfg_file)
                agent_cfg = ru.read_json(agent_cfg_file)

                # allow for user level overload
                user_cfg_file = '%s/.radical/pilot/config/%s' \
                              % (os.environ['HOME'], os.path.basename(agent_cfg_file))

                if os.path.exists(user_cfg_file):
                    self._log.info("merging user config: %s" % user_cfg_file)
                    user_cfg = ru.read_json(user_cfg_file)
                    ru.dict_merge (agent_cfg, user_cfg, policy='overwrite')

            except Exception as e:
                self._log.exception("Error reading agent config file: %s" % e)
                raise

        else:
            # we can't handle this type
            raise TypeError('agent config must be string (config name) or dict')

        # expand variables in virtenv string
        virtenv = virtenv % {'pilot_sandbox'   : pilot_sandbox,
                             'session_sandbox' : session_sandbox,
                             'resource_sandbox': resource_sandbox}

        # Check for deprecated global_virtenv
        if 'global_virtenv' in rcfg:
            raise RuntimeError("'global_virtenv' is deprecated (%s)" % resource)

        # Create a host:port string for use by the bootstrap_0.
        db_url = rs.Url(agent_dburl)
        if db_url.port:
            db_hostport = "%s:%d" % (db_url.host, db_url.port)
        else:
            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

        # ----------------------------------------------------------------------
        # the version of the agent is derived from
        # rp_version, which has the following format
        # and interpretation:
        #
        # case rp_version:
        #   @<token>:
        #   @tag/@branch/@commit: # no sdist staging
        #       git clone $github_base radical.pilot.src
        #       (cd radical.pilot.src && git checkout token)
        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
        #       rm -rf radical.pilot.src
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   release: # no sdist staging
        #       pip install -t $VIRTENV/rp_install radical.pilot
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   local: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $VIRTENV/rp_install $sdist/
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   debug: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $SANDBOX/rp_install $sdist/
        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
        #
        #   installed: # no sdist staging
        #       true
        # esac
        #
        # virtenv_mode
        #   private : error  if ve exists, otherwise create, then use
        #   update  : update if ve exists, otherwise create, then use
        #   create  : use    if ve exists, otherwise create, then use
        #   use     : use    if ve exists, otherwise error,  then exit
        #   recreate: delete if ve exists, otherwise create, then use
        #      
        # examples   :
        #   [email protected]
        #   virtenv@devel
        #   virtenv@release
        #   virtenv@installed
        #   stage@local
        #   stage@/tmp/my_agent.py
        #
        # Note that some combinations may be invalid,
        # specifically in the context of virtenv_mode.  If, for
        # example, virtenv_mode is 'use', then the 'virtenv:tag'
        # will not make sense, as the virtenv is not updated.
        # In those cases, the virtenv_mode is honored, and
        # a warning is printed.
        #
        # Also, the 'stage' mode can only be combined with the
        # 'local' source, or with a path to the agent (relative
        # to root_dir, or absolute).
        #
        # A rp_version which does not adhere to the
        # above syntax is ignored, and the fallback stage@local
        # is used.

        if  not rp_version.startswith('@') and \
            not rp_version in ['installed', 'local', 'debug', 'release']:
            raise ValueError("invalid rp_version '%s'" % rp_version)

        if rp_version.startswith('@'):
            rp_version  = rp_version[1:]  # strip '@'


        # ----------------------------------------------------------------------
        # sanity checks
        if not python_dist        : raise RuntimeError("missing python distribution")
        if not virtenv_dist       : raise RuntimeError("missing virtualenv distribution")
        if not agent_spawner      : raise RuntimeError("missing agent spawner")
        if not agent_scheduler    : raise RuntimeError("missing agent scheduler")
        if not lrms               : raise RuntimeError("missing LRMS")
        if not agent_launch_method: raise RuntimeError("missing agent launch method")
        if not task_launch_method : raise RuntimeError("missing task launch method")

        # massage some values
        if not queue :
            queue = default_queue

        if  cleanup and isinstance (cleanup, bool) :
            #  l : log files
            #  u : unit work dirs
            #  v : virtualenv
            #  e : everything (== pilot sandbox)
            if shared_filesystem:
                cleanup = 'luve'
            else:
                # we cannot clean the sandbox from within the agent, as the hop
                # staging would then fail, and we'd get nothing back.
                # FIXME: cleanup needs to be done by the pmgr.launcher, or
                #        someone else, really, after fetching all logs and 
                #        profiles.
                cleanup = 'luv'

            # we never cleanup virtenvs which are not private
            if virtenv_mode != 'private' :
                cleanup = cleanup.replace ('v', '')

        # add dists to staging files, if needed
        if rp_version in ['local', 'debug']:
            sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name]
            sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path]
        else:
            sdist_names = list()
            sdist_paths = list()

        # if cores_per_node is set (!= None), then we need to
        # allocate full nodes, and thus round up
        if cores_per_node:
            cores_per_node = int(cores_per_node)
            number_cores   = int(cores_per_node
                           * math.ceil(float(number_cores) / cores_per_node))

        # if gpus_per_node is set (!= None), then we need to
        # allocate full nodes, and thus round up
        if gpus_per_node:
            gpus_per_node = int(gpus_per_node)
            number_gpus   = int(gpus_per_node
                           * math.ceil(float(number_gpus) / gpus_per_node))

        # set mandatory args
        bootstrap_args  = ""
        bootstrap_args += " -d '%s'" % ':'.join(sdist_names)
        bootstrap_args += " -p '%s'" % pid
        bootstrap_args += " -s '%s'" % sid
        bootstrap_args += " -m '%s'" % virtenv_mode
        bootstrap_args += " -r '%s'" % rp_version
        bootstrap_args += " -b '%s'" % python_dist
        bootstrap_args += " -g '%s'" % virtenv_dist
        bootstrap_args += " -v '%s'" % virtenv
        bootstrap_args += " -y '%d'" % runtime

        # set optional args
        if lrms == "CCM":           bootstrap_args += " -c"
        if forward_tunnel_endpoint: bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
        if forward_tunnel_endpoint: bootstrap_args += " -h '%s'" % db_hostport
        if python_interpreter:      bootstrap_args += " -i '%s'" % python_interpreter
        if tunnel_bind_device:      bootstrap_args += " -t '%s'" % tunnel_bind_device
        if cleanup:                 bootstrap_args += " -x '%s'" % cleanup

        for arg in pre_bootstrap_0:
            bootstrap_args += " -e '%s'" % arg
        for arg in pre_bootstrap_1:
            bootstrap_args += " -w '%s'" % arg

        agent_cfg['owner']              = 'agent_0'
        agent_cfg['cores']              = number_cores
        agent_cfg['gpus']               = number_gpus
        agent_cfg['lrms']               = lrms
        agent_cfg['spawner']            = agent_spawner
        agent_cfg['scheduler']          = agent_scheduler
        agent_cfg['runtime']            = runtime
        agent_cfg['dburl']              = str(database_url)
        agent_cfg['session_id']         = sid
        agent_cfg['pilot_id']           = pid
        agent_cfg['logdir']             = '.'
        agent_cfg['pilot_sandbox']      = pilot_sandbox
        agent_cfg['session_sandbox']    = session_sandbox
        agent_cfg['resource_sandbox']   = resource_sandbox
        agent_cfg['agent_launch_method']= agent_launch_method
        agent_cfg['task_launch_method'] = task_launch_method
        agent_cfg['mpi_launch_method']  = mpi_launch_method
        agent_cfg['cores_per_node']     = cores_per_node
        agent_cfg['gpus_per_node']      = gpus_per_node
        agent_cfg['lfs_path_per_node']  = lfs_path_per_node
        agent_cfg['lfs_size_per_node']  = lfs_size_per_node
        agent_cfg['cu_tmp']             = cu_tmp
        agent_cfg['export_to_cu']       = export_to_cu
        agent_cfg['cu_pre_exec']        = cu_pre_exec
        agent_cfg['cu_post_exec']       = cu_post_exec
        agent_cfg['resource_cfg']       = copy.deepcopy(rcfg)
        agent_cfg['debug']              = self._log.getEffectiveLevel()

        # we'll also push the agent config into MongoDB
        pilot['cfg'] = agent_cfg

        # ----------------------------------------------------------------------
        # Write agent config dict to a json file in pilot sandbox.

        agent_cfg_name = 'agent_0.cfg'
        cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
        os.close(cfg_tmp_handle)  # file exists now

        # Convert dict to json file
        self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file)
        self._log.debug(pprint.pformat(agent_cfg))
        ru.write_json(agent_cfg, cfg_tmp_file)

        ret['ft'].append({'src' : cfg_tmp_file, 
                          'tgt' : '%s/%s' % (pilot_sandbox, agent_cfg_name),
                          'rem' : True})  # purge the tmp file after packing

        # ----------------------------------------------------------------------
        # we also touch the log and profile tarballs in the target pilot sandbox
        ret['ft'].append({'src' : '/dev/null',
                          'tgt' : '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid),
                          'rem' : False})  # don't remove /dev/null
        # only stage profiles if we profile
        if self._prof.enabled:
            ret['ft'].append({
                          'src' : '/dev/null',
                          'tgt' : '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid),
                          'rem' : False})  # don't remove /dev/null

        # check if we have a sandbox cached for that resource.  If so, we have
        # nothing to do.  Otherwise we create the sandbox and stage the RP
        # stack etc.
        # NOTE: this will race when multiple pilot launcher instances are used!
        with self._cache_lock:

            if resource not in self._sandboxes:

                for sdist in sdist_paths:
                    base = os.path.basename(sdist)
                    ret['ft'].append({'src' : sdist, 
                                      'tgt' : '%s/%s' % (session_sandbox, base),
                                      'rem' : False})

                # Copy the bootstrap shell script.
                bootstrapper_path = os.path.abspath("%s/agent/%s"
                                  % (self._root_dir, BOOTSTRAPPER_0))
                self._log.debug("use bootstrapper %s", bootstrapper_path)

                ret['ft'].append({'src' : bootstrapper_path, 
                                  'tgt' : '%s/%s' % (session_sandbox, BOOTSTRAPPER_0),
                                  'rem' : False})

                # Some machines cannot run pip due to outdated CA certs.
                # For those, we also stage an updated certificate bundle
                # TODO: use booleans all the way?
                if stage_cacerts:

                    cc_name = 'cacert.pem.gz'
                    cc_path = os.path.abspath("%s/agent/%s" % (self._root_dir, cc_name))
                    self._log.debug("use CAs %s", cc_path)

                    ret['ft'].append({'src' : cc_path, 
                                      'tgt' : '%s/%s' % (session_sandbox, cc_name),
                                      'rem' : False})

                self._sandboxes[resource] = True


        # ----------------------------------------------------------------------
        # Create SAGA Job description and submit the pilot job

        jd = rs.job.Description()

        if shared_filesystem:
            bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0)
        else:
            bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0)

        jd.name                  = pid
        jd.executable            = "/bin/bash"
        jd.arguments             = ['-l %s' % bootstrap_tgt, bootstrap_args]
        jd.working_directory     = pilot_sandbox
        jd.project               = project
        jd.output                = "bootstrap_0.out"
        jd.error                 = "bootstrap_0.err"
        jd.total_cpu_count       = number_cores
        jd.total_gpu_count       = number_gpus
        jd.processes_per_host    = cores_per_node
        jd.spmd_variation        = spmd_variation
        jd.wall_time_limit       = runtime
        jd.total_physical_memory = memory
        jd.queue                 = queue
        jd.candidate_hosts       = candidate_hosts
        jd.environment           = dict()

        # we set any saga_jd_supplement keys which are not already set above
        for key, val in saga_jd_supplement.iteritems():
            if not jd[key]:
                self._log.debug('supplement %s: %s', key, val)
                jd[key] = val

        if 'RADICAL_PILOT_PROFILE' in os.environ :
            jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

        # for condor backends and the like which do not have shared FSs, we add
        # additional staging directives so that the backend system binds the
        # files from the session and pilot sandboxes to the pilot job.
        jd.file_transfer = list()
        if not shared_filesystem:

            jd.file_transfer.extend([
                'site:%s/%s > %s' % (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0),
                'site:%s/%s > %s' % (pilot_sandbox,   agent_cfg_name, agent_cfg_name),
                'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid),
                'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid)
            ])

            if 'RADICAL_PILOT_PROFILE' in os.environ:
                jd.file_transfer.extend([
                    'site:%s/%s.prof.tgz > %s.prof.tgz' % (pilot_sandbox, pid, pid),
                    'site:%s/%s.prof.tgz < %s.prof.tgz' % (pilot_sandbox, pid, pid)
                ])

            for sdist in sdist_names:
                jd.file_transfer.extend([
                    'site:%s/%s > %s' % (session_sandbox, sdist, sdist)
                ])

            if stage_cacerts:
                jd.file_transfer.extend([
                    'site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)
                ])

        self._log.debug("Bootstrap command line: %s %s", jd.executable, jd.arguments)

        ret['jd'] = jd
        return ret
Example #18
def profile(command, *args, **kwargs):

    if callable(command):
        cmd_str = "%s %s %s" % (command.__name__, str(args), str(kwargs))

    else:
        cmd_str = command

    print "profile: %s" % cmd_str

    if '_RADICAL_SYNAPSE_EMULATED' in os.environ:
        cmd_str = os.environ.get('_RADICAL_SYNAPSE_EMULATEE', cmd_str)
        print 'using emulated command name: %s' % cmd_str

    info = {'cmd': cmd_str}

    # start stress, get it spinning for one min to get a confirmed load
    # measurement, then run our own load, then kill stress.
    if LOAD > 0:
        rsu.logger.info("creating system load %s" % LOAD)
        os.popen("killall -9 stress 2>&1 > /dev/null")
        os.popen('stress --cpu %s &' % LOAD)
        time.sleep(60)

    load_1 = float(os.popen(LOAD_CMD).read())
    start = rsu.timestamp()

    os.environ['_RADICAL_SYNAPSE_PROFILED'] = 'TRUE'

    # run the profiled function/command in a separate process
    if callable(command):

        proc = mp.Process(target=command, args=args, kwargs=kwargs)
        proc.start()

    else:

        proc = sp.Popen(command.split(), stdout=sp.PIPE, stderr=sp.STDOUT)

    watch_mode = os.environ.get('RADICAL_SYNAPSE_WATCHMODE', 'full').lower()
    watchers = list()

    if watch_mode == 'full':
        watchers.append(rsw.WatcherCPU(proc.pid))
        watchers.append(rsw.WatcherSto(proc.pid))
        watchers.append(rsw.WatcherMem(proc.pid))

    # the system-level watcher runs in every watch mode, including 'basic'
    watchers.append(rsw.WatcherSys(proc.pid))

    if callable(command):

        proc.join()
        out = ""
        ret = None

    else:
        out = proc.communicate()[0]
        ret = proc.returncode

    stop = rsu.timestamp()

    info['time'] = dict()
    info['time']['start'] = rsu.time_zero()
    info['time']['real'] = stop - start

    for watcher in reversed(watchers):
        watcher.stop()
        watcher.join()
        ru.dict_merge(info, watcher.get_data())

    # allow watchers to finalize their data, now that the data from the other
    # watchers are available
    for watcher in reversed(watchers):
        watcher.finalize(info)

    time_2 = rsu.timestamp()
    load_2 = float(os.popen(LOAD_CMD).read())
    info['cpu']['load'] = max(load_1, load_2)
    rsu.logger.info("system load %s: %s" % (LOAD, info['cpu']['load']))

    if LOAD > 0:
        rsu.logger.info("stopping system load")
        os.popen("killall -9 stress 2>&1 > /dev/null")
        rsu.logger.info("stopped  system load")

    return info, ret, out
Example #19
    def __init__(self,
                 url,
                 session=None,
                 logger=None,
                 opts=None,
                 posix=True,
                 interactive=True):

        if logger: self.logger = logger
        else: self.logger = ru.Logger('radical.saga.pty')

        if session: self.session = session
        else: self.session = ss.Session(default=True)

        self.logger.debug("PTYShell init %s" % self)

        self.url = url  # describes the shell to run
        self.posix = posix  # /bin/sh compatible?
        self.interactive = interactive  # bash -i ?
        self.latency = 0.0  # set by factory
        self.cp_slave = None  # file copy channel

        self.initialized = False

        self.pty_id = PTYShell._pty_id
        PTYShell._pty_id += 1

        self.cfg = ru.Config('radical.saga', 'utils')['pty']

        # opts passed on construction overwrite file config
        if opts:
            self.cfg = ru.dict_merge(self.cfg, opts, policy='overwrite')

        # get prompt pattern from config, or use default
        self.prompt = self.cfg.get('prompt_pattern', DEFAULT_PROMPT)
        self.prompt_re = re.compile("^(.*?)%s" % self.prompt, re.DOTALL)
        self.logger.info("PTY prompt pattern: %s" % self.prompt)

        # we need a local dir for file staging caches.  At this point we use
        # $HOME, but should make this configurable (FIXME)
        self.base = os.environ['HOME'] + '/.radical/saga/adaptors/shell/'

        try:
            os.makedirs(self.base)

        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(self.base):
                pass
            else:
                raise rse.NoSuccess("could not create staging dir: %s" % e)

        self.factory = supsf.PTYShellFactory()
        self.pty_info = self.factory.initialize(self.url,
                                                self.session,
                                                self.prompt,
                                                self.logger,
                                                self.cfg,
                                                self.posix,
                                                interactive=self.interactive)
        self.pty_shell = self.factory.run_shell(self.pty_info)

        self._trace('init : %s' % self.pty_shell.command)

        self.initialize()
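
# -----------------------------------------------------------------------------
# A standalone sketch (illustration only, made-up values) of the opts merge
# above.  The usage throughout this listing suggests that dict_merge updates
# its first argument in place and also returns the merged dict, so assigning
# the return value as done above works either way.
import radical.utils as ru

cfg  = {'prompt_pattern': 'default-prompt'}
opts = {'prompt_pattern': 'custom-prompt', 'copy_mode': 'sftp'}

cfg = ru.dict_merge(cfg, opts, policy='overwrite')

assert cfg['prompt_pattern'] == 'custom-prompt'   # opt overwrites the file config
assert cfg['copy_mode']      == 'sftp'            # new opt is added
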
Example #20
    def _prepare_pilot(self, resource, rcfg, pilot):

        pid = pilot["uid"]
        ret = {'ft': list(), 'jd': None}

        # ------------------------------------------------------------------
        # Database connection parameters
        sid = self._session.uid
        database_url = self._session.dburl

        # some default values are determined at runtime
        default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
                          (resource, self._rp_version)

        # ------------------------------------------------------------------
        # get parameters from resource cfg, set defaults where needed
        agent_launch_method = rcfg.get('agent_launch_method')
        agent_dburl = rcfg.get('agent_mongodb_endpoint', database_url)
        agent_spawner = rcfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER)
        rc_agent_config = rcfg.get('agent_config', DEFAULT_AGENT_CONFIG)
        agent_scheduler = rcfg.get('agent_scheduler')
        tunnel_bind_device = rcfg.get('tunnel_bind_device')
        default_queue = rcfg.get('default_queue')
        forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint')
        lrms = rcfg.get('lrms')
        mpi_launch_method = rcfg.get('mpi_launch_method', '')
        pre_bootstrap_1 = rcfg.get('pre_bootstrap_1', [])
        pre_bootstrap_2 = rcfg.get('pre_bootstrap_2', [])
        python_interpreter = rcfg.get('python_interpreter')
        task_launch_method = rcfg.get('task_launch_method')
        rp_version = rcfg.get('rp_version', DEFAULT_RP_VERSION)
        virtenv_mode = rcfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE)
        virtenv = rcfg.get('virtenv', default_virtenv)
        cores_per_node = rcfg.get('cores_per_node', 0)
        health_check = rcfg.get('health_check', True)
        python_dist = rcfg.get('python_dist')
        virtenv_dist = rcfg.get('virtenv_dist', DEFAULT_VIRTENV_DIST)
        cu_tmp = rcfg.get('cu_tmp')
        spmd_variation = rcfg.get('spmd_variation')
        shared_filesystem = rcfg.get('shared_filesystem', True)
        stage_cacerts = rcfg.get('stage_cacerts', False)
        cu_pre_exec = rcfg.get('cu_pre_exec')
        cu_post_exec = rcfg.get('cu_post_exec')
        export_to_cu = rcfg.get('export_to_cu')
        mandatory_args = rcfg.get('mandatory_args', [])

        # ------------------------------------------------------------------
        # get parameters from the pilot description
        number_cores = pilot['description']['cores']
        runtime = pilot['description']['runtime']
        queue = pilot['description']['queue']
        project = pilot['description']['project']
        cleanup = pilot['description']['cleanup']
        memory = pilot['description']['memory']
        candidate_hosts = pilot['description']['candidate_hosts']

        # make sure that mandatory args are known
        for ma in mandatory_args:
            if pilot['description'].get(ma) is None:
                raise  ValueError('attribute "%s" is required for "%s"' \
                                 % (ma, resource))

        # get pilot and global sandbox
        resource_sandbox = self._session._get_resource_sandbox(pilot).path
        session_sandbox = self._session._get_session_sandbox(pilot).path
        pilot_sandbox = self._session._get_pilot_sandbox(pilot).path

        pilot['resource_sandbox'] = str(
            self._session._get_resource_sandbox(pilot))
        pilot['pilot_sandbox'] = str(self._session._get_pilot_sandbox(pilot))
        pilot['client_sandbox'] = str(self._session._get_client_sandbox())

        # Agent configuration that is not part of the public API.
        # The agent config can either be a config dict, or
        # a string pointing to a configuration name.  If neither
        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
        # set.  The last fallback is 'agent_default'
        agent_config = pilot['description'].get('_config')
        if not agent_config:
            agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG')
        if not agent_config:
            agent_config = rc_agent_config

        if isinstance(agent_config, dict):

            # use dict as is
            agent_cfg = agent_config

        elif isinstance(agent_config, basestring):
            try:
                # interpret as a config name
                agent_cfg_file = os.path.join(self._conf_dir,
                                              "agent_%s.json" % agent_config)

                self._log.info("Read agent config file: %s", agent_cfg_file)
                agent_cfg = ru.read_json(agent_cfg_file)

                # allow for user level overload
                user_cfg_file = '%s/.radical/pilot/config/%s' \
                              % (os.environ['HOME'], os.path.basename(agent_cfg_file))

                if os.path.exists(user_cfg_file):
                    self._log.info("merging user config: %s" % user_cfg_file)
                    user_cfg = ru.read_json(user_cfg_file)
                    ru.dict_merge(agent_cfg, user_cfg, policy='overwrite')

            except Exception as e:
                self._log.exception("Error reading agent config file: %s" % e)
                raise

        else:
            # we can't handle this type
            raise TypeError(
                'agent config must be string (config name) or dict')

        # expand variables in virtenv string
        virtenv = virtenv % {
            'pilot_sandbox': pilot_sandbox,
            'session_sandbox': session_sandbox,
            'resource_sandbox': resource_sandbox
        }

        # Check for deprecated global_virtenv
        if 'global_virtenv' in rcfg:
            raise RuntimeError("'global_virtenv' is deprecated (%s)" %
                               resource)

        # Create a host:port string for use by the bootstrap_1.
        db_url = rs.Url(agent_dburl)
        if db_url.port:
            db_hostport = "%s:%d" % (db_url.host, db_url.port)
        else:
            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

        # ------------------------------------------------------------------
        # the version of the agent is derived from
        # rp_version, which has the following format
        # and interpretation:
        #
        # case rp_version:
        #   @<token>:
        #   @tag/@branch/@commit: # no sdist staging
        #       git clone $github_base radical.pilot.src
        #       (cd radical.pilot.src && git checkout token)
        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
        #       rm -rf radical.pilot.src
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   release: # no sdist staging
        #       pip install -t $VIRTENV/rp_install radical.pilot
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   local: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $VIRTENV/rp_install $sdist/
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   debug: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $SANDBOX/rp_install $sdist/
        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
        #
        #   installed: # no sdist staging
        #       true
        # esac
        #
        # virtenv_mode
        #   private : error  if ve exists, otherwise create, then use
        #   update  : update if ve exists, otherwise create, then use
        #   create  : use    if ve exists, otherwise create, then use
        #   use     : use    if ve exists, otherwise error,  then exit
        #   recreate: delete if ve exists, otherwise create, then use
        #
        # examples   :
        #   [email protected]
        #   virtenv@devel
        #   virtenv@release
        #   virtenv@installed
        #   stage@local
        #   stage@/tmp/my_agent.py
        #
        # Note that some combinations may be invalid,
        # specifically in the context of virtenv_mode.  If, for
        # example, virtenv_mode is 'use', then the 'virtenv:tag'
        # will not make sense, as the virtenv is not updated.
        # In those cases, the virtenv_mode is honored, and
        # a warning is printed.
        #
        # Also, the 'stage' mode can only be combined with the
        # 'local' source, or with a path to the agent (relative
        # to root_dir, or absolute).
        #
        # A rp_version which does not adhere to the
        # above syntax is ignored, and the fallback stage@local
        # is used.

        if not rp_version.startswith('@') and \
           rp_version not in ['installed', 'local', 'debug', 'release']:
            raise ValueError("invalid rp_version '%s'" % rp_version)

        if rp_version.startswith('@'):
            rp_version = rp_version[1:]  # strip '@'

        # ------------------------------------------------------------------
        # sanity checks
        if not python_dist:
            raise RuntimeError("missing python distribution")
        if not virtenv_dist:
            raise RuntimeError("missing virtualenv distribution")
        if not agent_spawner:
            raise RuntimeError("missing agent spawner")
        if not agent_scheduler:
            raise RuntimeError("missing agent scheduler")
        if not lrms:
            raise RuntimeError("missing LRMS")
        if not agent_launch_method:
            raise RuntimeError("missing agent launch method")
        if not task_launch_method:
            raise RuntimeError("missing task launch method")

        # massage some values
        if not queue:
            queue = default_queue

        if cleanup and isinstance(cleanup, bool):
            #  l : log files
            #  u : unit work dirs
            #  v : virtualenv
            #  e : everything (== pilot sandbox)
            if shared_filesystem:
                cleanup = 'luve'
            else:
                # we cannot clean the sandbox from within the agent, as the hop
                # staging would then fail, and we'd get nothing back.
                # FIXME: cleanup needs to be done by the pmgr.launcher, or
                #        someone else, really, after fetching all logs and
                #        profiles.
                cleanup = 'luv'

            # we never cleanup virtenvs which are not private
            if virtenv_mode != 'private':
                cleanup = cleanup.replace('v', '')

        # add dists to staging files, if needed
        if rp_version in ['local', 'debug']:
            sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name]
            sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path]
        else:
            sdist_names = list()
            sdist_paths = list()

        # if cores_per_node is set (!= None), then we need to allocate full
        # nodes, and thus round up
        if cores_per_node:
            cores_per_node = int(cores_per_node)
            number_cores = int(cores_per_node *
                               math.ceil(float(number_cores) / cores_per_node))
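            # e.g., with cores_per_node = 16 and a request for 20 cores, the
            # line above yields int(16 * math.ceil(20.0 / 16)) = 32 cores,
            # i.e. two full nodes (numbers are illustrative only)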

        # set mandatory args
        bootstrap_args = ""
        bootstrap_args += " -d '%s'" % ':'.join(sdist_names)
        bootstrap_args += " -p '%s'" % pid
        bootstrap_args += " -s '%s'" % sid
        bootstrap_args += " -m '%s'" % virtenv_mode
        bootstrap_args += " -r '%s'" % rp_version
        bootstrap_args += " -b '%s'" % python_dist
        bootstrap_args += " -g '%s'" % virtenv_dist
        bootstrap_args += " -v '%s'" % virtenv
        bootstrap_args += " -y '%d'" % runtime

        # set optional args
        if lrms == "CCM": bootstrap_args += " -c"
        if forward_tunnel_endpoint:
            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
            bootstrap_args += " -h '%s'" % db_hostport
        if python_interpreter:
            bootstrap_args += " -i '%s'" % python_interpreter
        if tunnel_bind_device:
            bootstrap_args += " -t '%s'" % tunnel_bind_device
        if cleanup: bootstrap_args += " -x '%s'" % cleanup

        for arg in pre_bootstrap_1:
            bootstrap_args += " -e '%s'" % arg
        for arg in pre_bootstrap_2:
            bootstrap_args += " -w '%s'" % arg

        agent_cfg['owner'] = 'agent_0'
        agent_cfg['cores'] = number_cores
        agent_cfg['lrms'] = lrms
        agent_cfg['spawner'] = agent_spawner
        agent_cfg['scheduler'] = agent_scheduler
        agent_cfg['runtime'] = runtime
        agent_cfg['dburl'] = str(database_url)
        agent_cfg['session_id'] = sid
        agent_cfg['pilot_id'] = pid
        agent_cfg['logdir'] = '.'
        agent_cfg['pilot_sandbox'] = pilot_sandbox
        agent_cfg['session_sandbox'] = session_sandbox
        agent_cfg['resource_sandbox'] = resource_sandbox
        agent_cfg['agent_launch_method'] = agent_launch_method
        agent_cfg['task_launch_method'] = task_launch_method
        agent_cfg['mpi_launch_method'] = mpi_launch_method
        agent_cfg['cores_per_node'] = cores_per_node
        agent_cfg['cu_tmp'] = cu_tmp
        agent_cfg['export_to_cu'] = export_to_cu
        agent_cfg['cu_pre_exec'] = cu_pre_exec
        agent_cfg['cu_post_exec'] = cu_post_exec
        agent_cfg['resource_cfg'] = copy.deepcopy(rcfg)
        agent_cfg['debug'] = self._log.getEffectiveLevel()

        # we'll also push the agent config into MongoDB
        pilot['cfg'] = agent_cfg

        # ------------------------------------------------------------------
        # Write agent config dict to a json file in pilot sandbox.

        agent_cfg_name = 'agent_0.cfg'
        cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
        os.close(cfg_tmp_handle)  # file exists now

        # Convert dict to json file
        self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file)
        self._log.debug(pprint.pformat(agent_cfg))
        ru.write_json(agent_cfg, cfg_tmp_file)

        ret['ft'].append({
            'src': cfg_tmp_file,
            'tgt': '%s/%s' % (pilot_sandbox, agent_cfg_name),
            'rem': True
        })  # purge the tmp file after packing

        # ----------------------------------------------------------------------
        # we also touch the log and profile tarballs in the target pilot sandbox
        ret['ft'].append({
            'src': '/dev/null',
            'tgt': '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid),
            'rem': False
        })  # don't remove /dev/null
        # only stage profiles if we profile
        if self._prof.enabled:
            ret['ft'].append({
                'src': '/dev/null',
                'tgt': '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid),
                'rem': False
            })  # don't remove /dev/null

        # check if we have a sandbox cached for that resource.  If so, we have
        # nothing to do.  Otherwise we create the sandbox and stage the RP
        # stack etc.
        # NOTE: this will race when multiple pilot launcher instances are used!
        with self._cache_lock:

            if resource not in self._sandboxes:

                for sdist in sdist_paths:
                    base = os.path.basename(sdist)
                    ret['ft'].append({
                        'src': sdist,
                        'tgt': '%s/%s' % (session_sandbox, base),
                        'rem': False
                    })

                # Copy the bootstrap shell script.
                bootstrapper_path = os.path.abspath("%s/agent/%s" \
                        % (self._root_dir, BOOTSTRAPPER_0))
                self._log.debug("use bootstrapper %s", bootstrapper_path)

                ret['ft'].append({
                    'src': bootstrapper_path,
                    'tgt': '%s/%s' % (session_sandbox, BOOTSTRAPPER_0),
                    'rem': False
                })

                # Some machines cannot run pip due to outdated CA certs.
                # For those, we also stage an updated certificate bundle
                # TODO: use booleans all the way?
                if stage_cacerts:

                    cc_name = 'cacert.pem.gz'
                    cc_path = os.path.abspath("%s/agent/%s" %
                                              (self._root_dir, cc_name))
                    self._log.debug("use CAs %s", cc_path)

                    ret['ft'].append({
                        'src': cc_path,
                        'tgt': '%s/%s' % (session_sandbox, cc_name),
                        'rem': False
                    })

                self._sandboxes[resource] = True

        # ------------------------------------------------------------------
        # Create SAGA Job description and submit the pilot job

        jd = rs.job.Description()

        if shared_filesystem:
            bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0)
        else:
            bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0)

        jd.name = pid
        jd.executable = "/bin/bash"
        jd.arguments = ['-l %s' % bootstrap_tgt, bootstrap_args]
        jd.working_directory = pilot_sandbox
        jd.project = project
        jd.output = "bootstrap_1.out"
        jd.error = "bootstrap_1.err"
        jd.total_cpu_count = number_cores
        jd.processes_per_host = cores_per_node
        jd.spmd_variation = spmd_variation
        jd.wall_time_limit = runtime
        jd.total_physical_memory = memory
        jd.queue = queue
        jd.candidate_hosts = candidate_hosts
        jd.environment = dict()

        if 'RADICAL_PILOT_PROFILE' in os.environ:
            jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

        # for condor backends and the like which do not have shared FSs, we add
        # additional staging directives so that the backend system binds the
        # files from the session and pilot sandboxes to the pilot job.
        jd.file_transfer = list()
        if not shared_filesystem:

            jd.file_transfer.extend([
                'site:%s/%s > %s' % (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0),
                'site:%s/%s > %s' % (pilot_sandbox, agent_cfg_name, agent_cfg_name),
                'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid),
                'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid)
            ])

            if 'RADICAL_PILOT_PROFILE' in os.environ:
                jd.file_transfer.extend([
                    'site:%s/%s.prof.tgz > %s.prof.tgz' % (pilot_sandbox, pid, pid),
                    'site:%s/%s.prof.tgz < %s.prof.tgz' % (pilot_sandbox, pid, pid)
                ])

            for sdist in sdist_names:
                jd.file_transfer.extend(
                    ['site:%s/%s > %s' % (session_sandbox, sdist, sdist)])

            if stage_cacerts:
                jd.file_transfer.extend(
                    ['site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)])

        self._log.debug("Bootstrap command line: %s %s", jd.executable,
                        jd.arguments)

        ret['jd'] = jd
        return ret
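
# A minimal standalone sketch (not part of the method above) of the overlay
# pattern used when the agent config is assembled: a packaged config is read
# from file, and a user-level config, if present, is merged over it with
# ru.dict_merge(..., policy='overwrite') so that user settings win.  It assumes
# radical.utils is importable as ru; file names are illustrative only.
import os

import radical.utils as ru

base_cfg_file = './agent_default.json'    # hypothetical packaged config
user_cfg_file = os.path.expanduser('~/.radical/pilot/config/agent_default.json')

cfg = ru.read_json(base_cfg_file)

if os.path.exists(user_cfg_file):
    user_cfg = ru.read_json(user_cfg_file)
    ru.dict_merge(cfg, user_cfg, policy='overwrite')   # user values override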
예제 #21
    def __init__(self, cfg, logger):

        self.name            = type(self).__name__
        self._cfg            = cfg
        self._log            = logger
        self.requested_cores = self._cfg['cores']

        self._log.info("Configuring LRMS %s.", self.name)

        self.lm_info         = dict()
        self.lrms_info       = dict()
        self.slot_list       = list()
        self.node_list       = list()
        self.agent_nodes     = {}
        self.cores_per_node  = None

        # The LRMS will possibly need to reserve nodes for the agent, according to the
        # agent layout.  We dig out the respective requirements from the config
        # right here.
        self._agent_reqs = []
        layout = self._cfg['agent_layout']
        # FIXME: this loop iterates over all agents *defined* in the layout, not
        #        over all agents which are to be actually executed, thus
        #        potentially reserving too many nodes.
        for worker in layout:
            target = layout[worker].get('target')
            # make sure that the target is either 'local', which we will
            # ignore, or 'node'.
            if target == 'local':
                pass # ignore that one
            elif target == 'node':
                self._agent_reqs.append(worker)
            else :
                raise ValueError("ill-formatted agent target '%s'" % target)

        # We are good to get rolling, and to detect the runtime environment of
        # the local LRMS.
        self._configure()
        logger.info("Discovered execution environment: %s", self.node_list)

        # Make sure we got a valid nodelist and a valid setting for
        # cores_per_node
        if not self.node_list or self.cores_per_node < 1:
            raise RuntimeError('LRMS configuration invalid (%s)(%s)' % \
                    (self.node_list, self.cores_per_node))

        # Check if the LRMS implementation reserved agent nodes.  If not, pick
        # the first couple of nodes from the nodelist as a fallback.
        if self._agent_reqs and not self.agent_nodes:
            self._log.info('Determine list of agent nodes generically.')
            for worker in self._agent_reqs:
                # Get a node from the end of the node list
                self.agent_nodes[worker] = self.node_list.pop()
                # If all nodes are taken by workers now, we can safely stop,
                # and let the raise below do its thing.
                if not self.node_list:
                    break

        if self.agent_nodes:
            self._log.info('Reserved agent node(s): %s' % self.agent_nodes.values())
            self._log.info('Agent(s) running on node(s): %s' % self.agent_nodes.keys())
            self._log.info('Remaining work node(s): %s' % self.node_list)

        # Check if we can do any work
        if not self.node_list:
            raise RuntimeError('LRMS has no nodes left to run units')

        # After LRMS configuration, we call any existing config hooks on the
        # launch methods.  Those hooks may need to adjust the LRMS settings
        # (hello ORTE).  We only call LM hooks *once*
        launch_methods = set() # set keeps entries unique
        if 'mpi_launch_method' in self._cfg:
            launch_methods.add(self._cfg['mpi_launch_method'])
        launch_methods.add(self._cfg['task_launch_method'])
        launch_methods.add(self._cfg['agent_launch_method'])

        for lm in launch_methods:
            if lm:
                try:
                    from .... import pilot as rp
                    ru.dict_merge(self.lm_info,
                            rp.agent.LM.lrms_config_hook(lm, self._cfg, self, self._log))
                except Exception as e:
                    self._log.exception("lrms config hook failed")
                    raise

                self._log.info("lrms config hook succeeded (%s)" % lm)

        # For now assume that all nodes have equal amount of cores
        cores_avail = (len(self.node_list) + len(self.agent_nodes)) * self.cores_per_node
        if 'RADICAL_PILOT_PROFILE' not in os.environ:
            if cores_avail < int(self.requested_cores):
                raise ValueError("Not enough cores available (%s) to satisfy allocation request (%s)." \
                                % (str(cores_avail), str(self.requested_cores)))

        # NOTE: self.lrms_info is what scheduler and launch method can
        # ultimately use, as it is included into the cfg passed to all
        # components.
        #
        # four elements are well defined:
        #   lm_info:        the dict received via the LM's lrms_config_hook
        #   node_list:      a list of node names to be used for unit execution
        #   cores_per_node: as the name says
        #   agent_nodes:    list of node names reserved for agent execution
        #
        # That list may turn out to be insufficient for some schedulers.  Yarn
        # for example may need to communicate YARN service endpoints etc.  An
        # LRMS can thus expand this dict, but is then likely bound to a specific
        # scheduler which can interpret the additional information.
        self.lrms_info['name']           = self.name
        self.lrms_info['lm_info']        = self.lm_info
        self.lrms_info['node_list']      = self.node_list
        self.lrms_info['cores_per_node'] = self.cores_per_node
        self.lrms_info['agent_nodes']    = self.agent_nodes
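
# A minimal standalone sketch of how per-launch-method information accumulates
# in lm_info: each (hypothetical) config hook result is merged into the same
# dict via ru.dict_merge.  Assumes radical.utils is importable as ru; the hook
# return values below are invented for illustration.
import radical.utils as ru

lm_info = dict()

hook_results = [{'ssh_info' : {'port': 22}},
                {'orte_info': {'dvm_uri': 'file:///tmp/dvm.uri'}}]

for data in hook_results:
    ru.dict_merge(lm_info, data)

# lm_info now holds the union of both result dicts:
#   {'ssh_info': {'port': 22}, 'orte_info': {'dvm_uri': 'file:///tmp/dvm.uri'}}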
예제 #22
    def __init__(self, cfg, session):        

        self.name            = type(self).__name__
        self._cfg            = cfg
        self._session        = session
        self._log            = self._session._log
        self._prof           = self._session._prof
        self.requested_cores = self._cfg['cores']
        self.requested_gpus  = self._cfg['gpus']

        self._log.info("Configuring LRMS %s.", self.name)

        self.lm_info         = dict()
        self.lrms_info       = dict()
        self.node_list       = list()
        self.agent_nodes     = dict()
        self.cores_per_node  = None
        self.gpus_per_node   = None
        self.lfs_per_node    = None
        self.mem_per_node    = None

        # The LRMS will possibly need to reserve nodes for the agent, according
        # to the agent layout.  We dig out the respective requirements from the
        # config right here.
        self._agent_reqs = []
        agents = self._cfg.get('agents', {})

        # FIXME: this loop iterates over all agents *defined* in the layout, not
        #        over all agents which are to be actually executed, thus
        #        potentially reserving too many nodes.
        # NOTE:  this code path is *within* the agent, so at least agent_0
        #        cannot possibly land on a different node.
        for agent in agents:
            target = agents[agent].get('target')
            # make sure that the target is either 'local', which we will
            # ignore, or 'node'.
            if target == 'local':
                pass  # ignore that one
            elif target == 'node':
                self._agent_reqs.append(agent)
            else :
                raise ValueError("ill-formatted agent target '%s'" % target)

        # We are good to get rolling, and to detect the runtime environment of
        # the local LRMS.
        self._configure()
        self._log.info("Discovered execution environment: %s", self.node_list)

        # Make sure we got a valid nodelist and a valid setting for
        # cores_per_node
        if not self.node_list or self.cores_per_node < 1:
            raise RuntimeError('LRMS configuration invalid (%s)(%s)' %
                    (self.node_list, self.cores_per_node))

        # Check if the LRMS implementation reserved agent nodes.  If not, pick
        # the first couple of nodes from the nodelist as a fallback.
        if self._agent_reqs and not self.agent_nodes:
            self._log.info('Determine list of agent nodes generically.')
            for agent in self._agent_reqs:
                # Get a node from the end of the node list
                self.agent_nodes[agent] = self.node_list.pop()
                # If all nodes are taken by workers now, we can safely stop,
                # and let the raise below do its thing.
                if not self.node_list:
                    break

        if self.agent_nodes:
            self._log.info('Reserved nodes: %s' % self.agent_nodes.values())
            self._log.info('Agent    nodes: %s' % self.agent_nodes.keys())
            self._log.info('Worker   nodes: %s' % self.node_list)

        # Check if we can do any work
        if not self.node_list:
            raise RuntimeError('LRMS has no nodes left to run units')

        # After LRMS configuration, we call any existing config hooks on the
        # launch methods.  Those hooks may need to adjust the LRMS settings
        # (hello ORTE).  We only call LM hooks *once* (thus the set)
        launch_methods = set()
        launch_methods.add(self._cfg.get('mpi_launch_method'))
        launch_methods.add(self._cfg.get('task_launch_method'))
        launch_methods.add(self._cfg.get('agent_launch_method'))

        launch_methods.discard(None)

        for lm in launch_methods:
            try:
                from .... import pilot as rp
                ru.dict_merge(self.lm_info,
                        rp.agent.LM.lrms_config_hook(lm, self._cfg, self,
                            self._log, self._prof))
            except Exception as e:
                # FIXME don't catch/raise
                self._log.exception("lrms config hook failed: %s" % e)
                raise

            self._log.info("lrms config hook succeeded (%s)" % lm)

        # For now assume that all nodes have equal amount of cores and gpus
        cores_avail = (len(self.node_list) + len(self.agent_nodes)) * self.cores_per_node
        gpus_avail  = (len(self.node_list) + len(self.agent_nodes)) * self.gpus_per_node

        # on debug runs, we allow more cpus/gpus to appear than physically exist
        if 'RADICAL_DEBUG' not in os.environ:
            if cores_avail < int(self.requested_cores):
                raise ValueError("Not enough cores available (%s < %s)."
                                % (str(cores_avail), str(self.requested_cores)))

            if gpus_avail < int(self.requested_gpus):
                raise ValueError("Not enough gpus available (%s < %s)."
                                % (str(gpus_avail), str(self.requested_gpus)))


        # NOTE: self.lrms_info is what scheduler and launch method can
        # ultimately use, as it is included into the cfg passed to all
        # components.
        #
        # five elements are well defined:
        #   lm_info:        the dict received via the LM's lrms_config_hook
        #   node_list:      a list of node names to be used for unit execution
        #   cores_per_node: as the name says
        #   gpus_per_node:  as the name says
        #   agent_nodes:    list of node names reserved for agent execution
        #
        # That list may turn out to be insufficient for some schedulers.  Yarn
        # for example may need to communicate YARN service endpoints etc.  An
        # LRMS can thus expand this dict, but is then likely bound to a specific
        # scheduler which can interpret the additional information.
        self.lrms_info['name']           = self.name
        self.lrms_info['lm_info']        = self.lm_info
        self.lrms_info['node_list']      = self.node_list
        self.lrms_info['cores_per_node'] = self.cores_per_node
        self.lrms_info['gpus_per_node']  = self.gpus_per_node
        self.lrms_info['agent_nodes']    = self.agent_nodes
        self.lrms_info['lfs_per_node']   = self.lfs_per_node
        self.lrms_info['mem_per_node']   = self.mem_per_node
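
# A small worked example of the availability check above, with illustrative
# numbers only: four worker nodes plus one reserved agent node, 16 cores and
# 2 GPUs per node.
node_list      = ['node%d' % i for i in range(4)]
agent_nodes    = {'agent_1': 'node4'}
cores_per_node = 16
gpus_per_node  = 2

cores_avail = (len(node_list) + len(agent_nodes)) * cores_per_node
gpus_avail  = (len(node_list) + len(agent_nodes)) * gpus_per_node

assert cores_avail == 80   # a request for, say, 96 cores would fail the check
assert gpus_avail  == 10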
예제 #23
    def __init__(self, cfg, session):

        self.name = type(self).__name__
        self._cfg = cfg
        self._session = session
        self._log = self._session._log
        self._prof = self._session._prof
        self.requested_cores = self._cfg['cores']
        self.requested_gpus = self._cfg['gpus']

        self._log.info("Configuring ResourceManager %s.", self.name)

        self.lm_info = dict()
        self.rm_info = dict()
        self.node_list = list()
        self.agent_nodes = dict()
        self.cores_per_node = 0
        self.gpus_per_node = 0
        self.lfs_per_node = 0
        self.mem_per_node = 0
        self.smt = int(os.environ.get('RADICAL_SAGA_SMT', 1))

        # The ResourceManager will possibly need to reserve nodes for the agent,
        # according to the agent layout.  We dig out the respective requirements
        # from the config right here.
        self._agent_reqs = []
        agents = self._cfg.get('agents', {})

        # FIXME: this loop iterates over all agents *defined* in the layout, not
        #        over all agents which are to be actually executed, thus
        #        potentially reserving too many nodes.
        # NOTE:  this code path is *within* the agent, so at least agent.0
        #        cannot possibly land on a different node.
        for agent in agents:
            target = agents[agent].get('target')
            # make sure that the target is either 'local', which we will
            # ignore, or 'node'.
            if target == 'local':
                pass  # ignore that one
            elif target == 'node':
                self._agent_reqs.append(agent)
            else:
                raise ValueError("ill-formatted agent target '%s'" % target)

        # We are good to get rolling, and to detect the runtime environment of
        # the local ResourceManager.
        self._configure()
        self._log.info("Discovered execution environment: %s", self.node_list)

        # Make sure we got a valid nodelist and a valid setting for
        # cores_per_node
        if not self.node_list or self.cores_per_node < 1:
            raise RuntimeError(
                'ResourceManager configuration invalid (%s)(%s)' %
                (self.node_list, self.cores_per_node))

        # Check if the ResourceManager implementation reserved agent nodes.
        # If not, pick the first couple of nodes from the nodelist as fallback.
        if self._agent_reqs and not self.agent_nodes:
            self._log.info('Determine list of agent nodes generically.')
            for agent in self._agent_reqs:
                # Get a node from the end of the node list
                self.agent_nodes[agent] = self.node_list.pop()
                # If all nodes are taken by workers now, we can safely stop,
                # and let the raise below do its thing.
                if not self.node_list:
                    break

        if self.agent_nodes:
            self._log.info('Reserved nodes: %s' %
                           list(self.agent_nodes.values()))
            self._log.info('Agent    nodes: %s' %
                           list(self.agent_nodes.keys()))
            self._log.info('Worker   nodes: %s' % self.node_list)

        # Check if we can do any work
        if not self.node_list:
            raise RuntimeError(
                'ResourceManager has no nodes left to run units')

        # After ResourceManager configuration, we call any existing config hooks
        # on the launch methods.  Those hooks may need to adjust the
        # ResourceManager settings (hello ORTE).  We only call LaunchMethod
        # hooks *once* (thus the set)
        launch_methods = set()
        launch_methods.add(self._cfg.get('mpi_launch_method'))
        launch_methods.add(self._cfg.get('task_launch_method'))
        launch_methods.add(self._cfg.get('agent_launch_method'))

        launch_methods.discard(None)

        for lm in launch_methods:
            try:
                ru.dict_merge(
                    self.lm_info,
                    rpa.LaunchMethod.rm_config_hook(name=lm,
                                                    cfg=self._cfg,
                                                    rm=self,
                                                    log=self._log,
                                                    profiler=self._prof))
            except Exception as e:
                # FIXME don't catch/raise
                self._log.exception("ResourceManager config hook failed: %s" %
                                    e)
                raise

            self._log.info("ResourceManager config hook succeeded (%s)" % lm)

        # For now assume that all nodes have equal amount of cores and gpus
        cores_avail = (len(self.node_list) +
                       len(self.agent_nodes)) * self.cores_per_node
        gpus_avail = (len(self.node_list) +
                      len(self.agent_nodes)) * self.gpus_per_node

        # on debug runs, we allow more cpus/gpus to appear than physically exist
        if 'RADICAL_DEBUG' not in os.environ:
            if cores_avail < int(self.requested_cores):
                raise ValueError("Not enough cores available (%s < %s)." %
                                 (str(cores_avail), str(self.requested_cores)))

            if gpus_avail < int(self.requested_gpus):
                raise ValueError("Not enough gpus available (%s < %s)." %
                                 (str(gpus_avail), str(self.requested_gpus)))

        # NOTE: self.rm_info is what scheduler and launch method can
        # ultimately use, as it is included into the cfg passed to all
        # components.
        #
        # five elements are well defined:
        #   lm_info:        the dict received via the LM's rm_config_hook
        #   node_list:      a list of node names to be used for unit execution
        #   cores_per_node: as the name says
        #   gpus_per_node:  as the name says
        #   agent_nodes:    list of node names reserved for agent execution
        #
        # That list may turn out to be insufficient for some schedulers.  Yarn
        # for example may need to communicate YARN service endpoints etc.  A
        # ResourceManager can thus expand this dict, but is then likely bound to
        # a specific scheduler which can interpret the additional information.
        self.rm_info['name'] = self.name
        self.rm_info['lm_info'] = self.lm_info
        self.rm_info['node_list'] = self.node_list
        self.rm_info['cores_per_node'] = self.cores_per_node
        self.rm_info['gpus_per_node'] = self.gpus_per_node
        self.rm_info['agent_nodes'] = self.agent_nodes
        self.rm_info['lfs_per_node'] = self.lfs_per_node
        self.rm_info['mem_per_node'] = self.mem_per_node
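
# A small standalone sketch of the generic agent-node fallback above: nodes
# are popped from the end of the node list, and the remainder stays available
# for unit execution.  Node and agent names are illustrative only.
node_list   = ['node1', 'node2', 'node3', 'node4']
agent_reqs  = ['agent_1', 'agent_2']
agent_nodes = dict()

for agent in agent_reqs:
    agent_nodes[agent] = node_list.pop()
    if not node_list:
        break

# agent_nodes == {'agent_1': 'node4', 'agent_2': 'node3'}
# node_list   == ['node1', 'node2']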
예제 #24
#     "resource_cfg" :
#     {
#         "*.futuregrid.org" :
#         {
#             "username"      : "merzky"
#         }
#     }
# }
USER_CONFIG_PATH = os.environ.get('HOME', '/tmp') + '/.my_app.cfg'

# load the user config, and merge it with the default config
user_config = ru.read_json_str(USER_CONFIG_PATH)

# merge the user config into the app config, so that the user config keys are
# applied where appropriate
ru.dict_merge(app_config, user_config, policy='overwrite', wildcards=True)

# lets see what we got
pprint.pprint(app_config)

# this should result in :
#
# {
#     'log_level'   : 0,
#     'scheduler'   : 'rp.SCHED_BACKFILLING',
#     'resources'   : ['india.furturegrid.org', 'sierra.futuregrid.org'],
#     'resource_cfg':
#     {
#         '*.futuregrid.org':
#         {
#             'username'     : 'merzky'
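
# The expected output above is truncated.  A smaller, self-contained variant of
# the same merge (without wildcards) is sketched below; it assumes that with
# policy='overwrite' values from the second dict win on conflicting keys, as
# the comments above indicate.  Keys and values are illustrative only.
import pprint

import radical.utils as ru

app_cfg  = {'log_level': 0,
            'scheduler': 'rp.SCHED_BACKFILLING'}
user_cfg = {'log_level': 10}

ru.dict_merge(app_cfg, user_cfg, policy='overwrite')

pprint.pprint(app_cfg)
# expected: {'log_level': 10, 'scheduler': 'rp.SCHED_BACKFILLING'}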
예제 #25
    def __init__(self, cfg, session):        

        self.name            = type(self).__name__
        self._cfg            = cfg
        self._session        = session
        self._log            = self._session._log
        self._prof           = self._session._prof
        self.requested_cores = self._cfg['cores']

        self._log.info("Configuring LRMS %s.", self.name)

        self.lm_info         = dict()
        self.lrms_info       = dict()
        self.slot_list       = list()
        self.node_list       = list()
        self.agent_nodes     = dict()
        self.cores_per_node  = None
        self.gpus_per_node   = None
        self.lfs_per_node    = None

        # The LRMS will possibly need to reserve nodes for the agent, according
        # to the agent layout.  We dig out the respective requirements from the
        # config right here.
        self._agent_reqs = []
        agents = self._cfg.get('agents', {})

        # FIXME: this loop iterates over all agents *defined* in the layout, not
        #        over all agents which are to be actually executed, thus
        #        potentially reserving too many nodes.
        # NOTE:  this code path is *within* the agent, so at least agent_0
        #        cannot possibly land on a different node.
        for agent in agents:
            target = agents[agent].get('target')
            # make sure that the target is either 'local', which we will
            # ignore, or 'node'.
            if target == 'local':
                pass # ignore that one
            elif target == 'node':
                self._agent_reqs.append(agent)
            else :
                raise ValueError("ill-formatted agent target '%s'" % target)

        # We are good to get rolling, and to detect the runtime environment of
        # the local LRMS.
        self._configure()
        self._log.info("Discovered execution environment: %s", self.node_list)

        # Make sure we got a valid nodelist and a valid setting for
        # cores_per_node
        if not self.node_list or self.cores_per_node < 1:
            raise RuntimeError('LRMS configuration invalid (%s)(%s)' % \
                    (self.node_list, self.cores_per_node))

        # Check if the LRMS implementation reserved agent nodes.  If not, pick
        # the first couple of nodes from the nodelist as a fallback.
        if self._agent_reqs and not self.agent_nodes:
            self._log.info('Determine list of agent nodes generically.')
            for agent in self._agent_reqs:
                # Get a node from the end of the node list
                self.agent_nodes[agent] = self.node_list.pop()
                # If all nodes are taken by workers now, we can safely stop,
                # and let the raise below do its thing.
                if not self.node_list:
                    break

        if self.agent_nodes:
            self._log.info('Reserved agent node(s): %s' % self.agent_nodes.values())
            self._log.info('Agent(s) running on node(s): %s' % self.agent_nodes.keys())
            self._log.info('Remaining work node(s): %s' % self.node_list)

        # Check if we can do any work
        if not self.node_list:
            raise RuntimeError('LRMS has no nodes left to run units')

        # After LRMS configuration, we call any existing config hooks on the
        # launch methods.  Those hooks may need to adjust the LRMS settings
        # (hello ORTE).  We only call LM hooks *once*
        launch_methods = set() # set keeps entries unique
        if 'mpi_launch_method' in self._cfg:
            launch_methods.add(self._cfg['mpi_launch_method'])
        launch_methods.add(self._cfg['task_launch_method'])
        launch_methods.add(self._cfg['agent_launch_method'])

        for lm in launch_methods:
            if lm:
                try:
                    from .... import pilot as rp
                    ru.dict_merge(self.lm_info,
                            rp.agent.LM.lrms_config_hook(lm, self._cfg, self,
                                self._log, self._prof))
                except Exception as e:
                    self._log.exception("lrms config hook failed")
                    raise

                self._log.info("lrms config hook succeeded (%s)" % lm)

        # For now assume that all nodes have equal amount of cores and gpus
        cores_avail = (len(self.node_list) + len(self.agent_nodes)) * self.cores_per_node
        gpus_avail  = (len(self.node_list) + len(self.agent_nodes)) * self.gpus_per_node
        if 'RADICAL_PILOT_PROFILE' not in os.environ:
            if cores_avail < int(self.requested_cores):
                raise ValueError("Not enough cores available (%s) to satisfy allocation request (%s)." \
                                % (str(cores_avail), str(self.requested_cores)))

        # NOTE: self.lrms_info is what scheduler and launch method can
        # ultimately use, as it is included into the cfg passed to all
        # components.
        #
        # five elements are well defined:
        #   lm_info:        the dict received via the LM's lrms_config_hook
        #   node_list:      a list of node names to be used for unit execution
        #   cores_per_node: as the name says
        #   gpus_per_node:  as the name says
        #   agent_nodes:    list of node names reserved for agent execution
        #
        # That list may turn out to be insufficient for some schedulers.  Yarn
        # for example may need to communicate YARN service endpoints etc.  An
        # LRMS can thus expand this dict, but is then likely bound to a specific
        # scheduler which can interpret the additional information.
        self.lrms_info['name']           = self.name
        self.lrms_info['lm_info']        = self.lm_info
        self.lrms_info['node_list']      = self.node_list
        self.lrms_info['cores_per_node'] = self.cores_per_node
        self.lrms_info['gpus_per_node']  = self.gpus_per_node
        self.lrms_info['agent_nodes']    = self.agent_nodes
        self.lrms_info['lfs_per_node']   = self.lfs_per_node
예제 #26
def profile (command, *args, **kwargs) :

    if callable (command):
        cmd_str = "%s %s %s" % (command.__name__, str (args), str(kwargs))

    else:
        cmd_str = command


    print "profile: %s" % cmd_str

    if '_RADICAL_SYNAPSE_EMULATED' in os.environ:
        cmd_str = os.environ.get ('_RADICAL_SYNAPSE_EMULATEE', cmd_str)
        print 'using emulated command name: %s' % cmd_str

    info = {'cmd' : cmd_str}

    # start stress, get it spinning for one min to get a confirmed load
    # measurement, then run our own load, then kill stress.
    if  LOAD > 0:
        rsu.logger.info ("creating system load %s" % LOAD)
        os.popen ("killall -9 stress 2>&1 > /dev/null")
        os.popen ('stress --cpu %s &' % LOAD)
        time.sleep (60)

    load_1 = float(os.popen (LOAD_CMD).read())
    start  = rsu.timestamp()

    os.environ['_RADICAL_SYNAPSE_PROFILED'] = 'TRUE'

    # run the profiled function/command in a separate process
    if callable (command):

        proc = mp.Process (target = command,
                           args   = args,
                           kwargs = kwargs)
        proc.start ()

    else:

        proc = sp.Popen (command.split(),
                        stdout = sp.PIPE,
                        stderr = sp.STDOUT)

    watch_mode = os.environ.get('RADICAL_SYNAPSE_WATCHMODE', 'full').lower()
    watchers   = list()

    if watch_mode == 'full':
        watchers.append (rsw.WatcherCPU (proc.pid))
        watchers.append (rsw.WatcherSto (proc.pid))
        watchers.append (rsw.WatcherMem (proc.pid))

    # watchmode 'basic'
    watchers.append (rsw.WatcherSys (proc.pid))


    if callable (command):

        proc.join()
        out = ""
        ret = None

    else:
        out = proc.communicate()[0]
        ret = proc.returncode

    stop = rsu.timestamp()

    info['time'] = dict()
    info['time']['start'] = rsu.time_zero()
    info['time']['real']  = stop-start

    for watcher in reversed(watchers):
        watcher.stop ()
        watcher.join ()
        ru.dict_merge (info, watcher.get_data())

    # allow watchers to finalize some stuff, now having data from other watchers
    # available
    for watcher in reversed(watchers):
        watcher.finalize(info)

    time_2 = rsu.timestamp()
    load_2 = float(os.popen (LOAD_CMD).read())
    info['cpu']['load'] = max(load_1, load_2)
    rsu.logger.info ("system load %s: %s" % (LOAD, info['cpu']['load']))
   
    if  LOAD > 0:
        rsu.logger.info ("stopping system load")
        os.popen ("killall -9 stress 2>&1 > /dev/null")
        rsu.logger.info ("stopped  system load")

    return info, ret, out
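
# A minimal standalone sketch of how the watcher results end up in the info
# dict: each watcher's get_data() result is merged into the same dict with
# ru.dict_merge.  The watcher data below is invented for illustration; note
# that info['cpu']['load'] above can only be set once some watcher has
# contributed a 'cpu' sub-dict.
import radical.utils as ru

info = {'cmd': 'sleep 1', 'time': {'real': 1.02}}

watcher_data = [{'cpu': {'utilization': 0.97}},
                {'mem': {'peak': 42 * 1024 * 1024}}]

for data in watcher_data:
    ru.dict_merge(info, data)

# info now combines the command metadata with all watcher measurements:
#   {'cmd': ..., 'time': ..., 'cpu': {...}, 'mem': {...}}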
예제 #27
    def __init__(self, cfg, session):

        # We temporarily do not call the base class constructor. The
        # constraint was not to change the base class at any point.
        # The constructor of the base class performs certain computations
        # that are specific to a node architecture, i.e., (i) requirement of
        # cores_per_node and gpus_per_node, (ii) no requirement for
        # sockets_per_node, and (iii) no validity checks on cores_per_socket,
        # gpus_per_socket, and sockets_per_node. It is, hence, incompatible
        # with the node architecture expected within this module.

        # We have three options:
        # 1) Change the child class, do not call the base class constructor
        # 2) Call the base class constructor, make the child class and its node
        #    structure compatible with that expected in the base class.
        # 3) Change the base class --- Out of scope of this project

        # Option 3 is probably the correct approach, but only in the long
        # term.  Option 2 is not a good approach either, as it would bend the
        # child class to fit assumptions of the base class which do not hold
        # here.  We thus go with option 1, by process of elimination really,
        # with the advantage that the code written here will be needed anyway
        # once we implement option 3, the long-term approach.

        # ResourceManager.__init__(self, cfg, session)

        self.name            = type(self).__name__
        self._cfg            = cfg
        self._session        = session
        self._log            = self._session._log
        self._prof           = self._session._prof
        self.requested_cores = self._cfg['cores']

        self._log.info("Configuring ResourceManager %s.", self.name)

        self.lm_info            = dict()
        self.rm_info            = dict()
        self.slot_list          = list()
        self.node_list          = list()
        self.agent_nodes        = dict()
        self.sockets_per_node   = None
        self.cores_per_socket   = None
        self.gpus_per_socket    = None
        self.lfs_per_node       = None
        self.mem_per_node       = None
        self.smt                = int(os.environ.get('RADICAL_SAGA_SMT', 1))

        # The ResourceManager will possibly need to reserve nodes for the agent, according
        # to the agent layout.  We dig out the respective requirements from the
        # config right here.
        self._agent_reqs = []
        agents = self._cfg.get('agents', {})

        # FIXME: this loop iterates over all agents *defined* in the layout, not
        #        over all agents which are to be actually executed, thus
        #        potentially reserving too many nodes.a
        # NOTE:  this code path is *within* the agent, so at least agent.0
        #        cannot possibly land on a different node.
        for agent in agents:
            target = agents[agent].get('target')
            # make sure that the target is either 'local', which we will
            # ignore, or 'node'.
            if target == 'local':
                pass  # ignore that one
            elif target == 'node':
                self._agent_reqs.append(agent)
            else :
                raise ValueError("ill-formatted agent target '%s'" % target)

        # We are good to get rolling, and to detect the runtime environment of
        # the local ResourceManager.
        self._configure()
        self._log.info("Discovered execution environment: %s", self.node_list)

        # Make sure we got a valid nodelist and a valid setting for
        # cores_per_socket and sockets_per_node
        if not self.node_list        or \
           self.sockets_per_node < 1 or \
           self.cores_per_socket < 1:
            raise RuntimeError('ResourceManager configuration invalid (%s)(%s)(%s)' %
                    (self.node_list, self.sockets_per_node,
                     self.cores_per_socket))

        # Check if the ResourceManager implementation reserved agent nodes.  If not, pick
        # the first couple of nodes from the nodelist as a fallback.
        if self._agent_reqs and not self.agent_nodes:
            self._log.info('Determine list of agent nodes generically.')
            for agent in self._agent_reqs:
                # Get a node from the end of the node list
                self.agent_nodes[agent] = self.node_list.pop()
                # If all nodes are taken by workers now, we can safely stop,
                # and let the raise below do its thing.
                if not self.node_list:
                    break

        if self.agent_nodes:
            self._log.info('agents      : %s' % list(self.agent_nodes.keys()))
            self._log.info('agent nodes : %s' % list(self.agent_nodes.values()))
            self._log.info('worker nodes: %s' % self.node_list)

        # Check if we can do any work
        if not self.node_list:
            raise RuntimeError('ResourceManager has no nodes left to run units')

        # After ResourceManager configuration, we call any existing config hooks on the
        # launch methods.  Those hooks may need to adjust the ResourceManager settings
        # (hello ORTE).  We only call LaunchMethod hooks *once*
        launch_methods = set()  # set keeps entries unique
        if 'mpi_launch_method' in self._cfg:
            launch_methods.add(self._cfg['mpi_launch_method'])
        launch_methods.add(self._cfg['task_launch_method'])
        launch_methods.add(self._cfg['agent_launch_method'])

        for lm in launch_methods:
            if lm:
                try:
                    from .... import pilot as rp
                    ru.dict_merge(self.lm_info,
                            rp.agent.LaunchMethod.rm_config_hook(lm, self._cfg,
                                                   self, self._log, self._prof))

                except:
                    self._log.exception("rm config hook failed")
                    raise

                self._log.info("rm config hook succeeded (%s)" % lm)

      # # For now assume that all nodes have equal amount of cores and gpus
      # cores_avail = (len(self.node_list) + len(self.agent_nodes)) \
      #             * self.cores_per_socket * self.sockets_per_node
      # gpus_avail  = (len(self.node_list) + len(self.agent_nodes)) \
      #             * self.gpus_per_socket * self.sockets_per_node


        # NOTE: self.rm_info is what scheduler and launch method can
        #       ultimately use, as it is included into the cfg passed to all
        #       components.
        #
        # it defines
        #   lm_info:            dict received via the LM's rm_config_hook
        #   node_list:          list of node names to be used for unit execution
        #   sockets_per_node:   integer number of sockets on a node
        #   cores_per_socket:   integer number of cores per socket
        #   gpus_per_socket:    integer number of gpus per socket
        #   agent_nodes:        list of node names reserved for agent execution
        #   lfs_per_node:       dict containing the path and size of lfs on each node
        #   mem_per_node:       number of MB per node
        #   smt:                threads per core (exposed as core in RP)
        #

        self.rm_info = {
            'name'             : self.name,
            'lm_info'          : self.lm_info,
            'node_list'        : self.node_list,
            'sockets_per_node' : self.sockets_per_node,
            'cores_per_socket' : self.cores_per_socket * self.smt,
            'gpus_per_socket'  : self.gpus_per_socket,
            'cores_per_node'   : self.sockets_per_node * self.cores_per_socket * self.smt,
            'gpus_per_node'    : self.sockets_per_node * self.gpus_per_socket,
            'agent_nodes'      : self.agent_nodes,
            'lfs_per_node'     : self.lfs_per_node,
            'mem_per_node'     : self.mem_per_node,
            'smt'              : self.smt
        }
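
# A small worked example of the per-node derivation in rm_info above, with
# illustrative (roughly Summit-like) numbers: 2 sockets per node, 21 cores and
# 3 GPUs per socket, SMT level 4.
sockets_per_node = 2
cores_per_socket = 21
gpus_per_socket  = 3
smt              = 4

cores_per_node = sockets_per_node * cores_per_socket * smt   # hw threads, exposed as cores
gpus_per_node  = sockets_per_node * gpus_per_socket

assert cores_per_node == 168
assert gpus_per_node  == 6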
예제 #28
    def __init__ (self, database_url=None, database_name="radicalpilot",
                  uid=None, name=None):
        """Creates a new or reconnects to an exising session.

        If called without a uid, a new Session instance is created and 
        stored in the database. If uid is set, an existing session is 
        retrieved from the database. 

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name 
              (default: 'radicalpilot').

            * **uid** (`string`): If uid is set, we try to
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`

        """

        # init the base class inits
        saga.Session.__init__ (self)
        Object.__init__ (self)

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper ()

        # Dictionaries holding all manager objects created during the session.
        self._pilot_manager_objects = list()
        self._unit_manager_objects = list()

        # Create a new process registry. All objects belonging to this 
        # session will register their worker processes (if they have any)
        # in this registry. This makes it easier to shut down things in 
        # a more coordinated fashion.
        self._process_registry = _ProcessRegistry()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        self._database_url  = database_url
        self._database_name = database_name 

        if  not self._database_url :
            self._database_url = os.getenv ("RADICAL_PILOT_DBURL", None)

        if  not self._database_url :
            raise PilotException ("no database URL (set RADICAL_PILOT_DBURL)")  

        logger.info("using database url  %s" % self._database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        tmp_url = ru.Url (self._database_url)
        if  tmp_url.path            and \
            tmp_url.path[0]  == '/' and \
            len(tmp_url.path) >  1  :
            self._database_name = tmp_url.path[1:]
            logger.info("using database path %s" % self._database_name)
        else :
            logger.info("using database name %s" % self._database_name)

        # Loading all "default" resource configurations
        module_path   = os.path.dirname(os.path.abspath(__file__))
        default_cfgs  = "%s/configs/*.json" % module_path
        config_files  = glob.glob(default_cfgs)

        for config_file in config_files:

            try :
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e :
                logger.error ("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict() 

        user_cfgs     = "%s/.radical/pilot/configs/*.json" % os.environ.get ('HOME')
        config_files  = glob.glob(user_cfgs)

        for config_file in config_files:

            try :
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e :
                logger.error ("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)

                if  rc in self._resource_configs :
                    # config exists -- merge user config into it
                    ru.dict_merge (self._resource_configs[rc],
                                   rcs[rc].as_dict(),
                                   policy='overwrite')
                else :
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict() 

        default_aliases = "%s/configs/aliases.json" % module_path
        self._resource_aliases = ru.read_json_str (default_aliases)['aliases']

        ##########################
        ## CREATE A NEW SESSION ##
        ##########################
        if uid is None:
            try:
                self._connected  = None

                if name :
                    self._name = name
                    self._uid  = name
                  # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
                else :
                    self._uid  = ru.generate_id ('rp.session', mode=ru.ID_PRIVATE)
                    self._name = self._uid


                self._dbs, self._created, self._connection_info = \
                        dbSession.new(sid     = self._uid,
                                      name    = self._name,
                                      db_url  = self._database_url,
                                      db_name = database_name)

                logger.info("New Session created%s." % str(self))

            except Exception, ex:
                logger.exception ('session create failed')
                raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                                % (self._database_url, ex))  
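
# A minimal standalone sketch of the database-name handling above: if the
# MongoDB URL carries a path element, that path (minus the leading slash) is
# used as the database name.  The URL is illustrative only; ru.Url is used as
# in the code above.
import radical.utils as ru

url = ru.Url('mongodb://db.example.org:27017/my_session_db')

if url.path and url.path[0] == '/' and len(url.path) > 1:
    db_name = url.path[1:]
else:
    db_name = 'radicalpilot'

# db_name == 'my_session_db'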
예제 #29
    def run(self):
        """Starts the process when Process.start() is called.
        """

        global JOB_CHECK_INTERVAL

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                db = self._session.get_db()
                pilot_col = db["%s.p" % self._session.uid]
                logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._terminate.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reason and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                if self._disabled.is_set():
                    # don't process any new pilot start requests.
                    # NOTE: this is not clean, in principle there could be other
                    #       launchers alive which want to still start those
                    #       pending pilots.  In practice we only ever use one
                    #       pmgr though, and it's during its shutdown that we get
                    #       here...
                    ts = time.time()
                    compute_pilot = pilot_col.find_and_modify(
                        query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                        update={
                            "$set": {"state": CANCELED},
                            "$push": {"statehistory": {"state": CANCELED, "timestamp": ts}},
                        },
                    )

                    # run state checks more frequently.
                    JOB_CHECK_INTERVAL = 3
                    time.sleep(1)
                    continue

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then sent to the local or remote
                # queueing system. If this succeeds, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                ts = time.time()
                compute_pilot = pilot_col.find_and_modify(
                    query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                    update={
                        "$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING, "timestamp": ts}},
                    },
                )

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_id = self._session.uid
                        database_url = self._session.dburl

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot["description"]["cores"]
                        runtime = compute_pilot["description"]["runtime"]
                        queue = compute_pilot["description"]["queue"]
                        project = compute_pilot["description"]["project"]
                        cleanup = compute_pilot["description"]["cleanup"]
                        resource_key = compute_pilot["description"]["resource"]
                        schema = compute_pilot["description"]["access_schema"]
                        memory = compute_pilot["description"]["memory"]
                        candidate_hosts = compute_pilot["description"]["candidate_hosts"]
                        pilot_sandbox = compute_pilot["sandbox"]
                        global_sandbox = compute_pilot["global_sandbox"]

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_launch_method = resource_cfg.get("agent_launch_method")
                        agent_dburl = resource_cfg.get("agent_mongodb_endpoint", database_url)
                        agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                        rc_agent_config = resource_cfg.get("agent_config", DEFAULT_AGENT_CONFIG)
                        agent_scheduler = resource_cfg.get("agent_scheduler")
                        tunnel_bind_device = resource_cfg.get("tunnel_bind_device")
                        default_queue = resource_cfg.get("default_queue")
                        forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                        js_endpoint = resource_cfg.get("job_manager_endpoint")
                        lrms = resource_cfg.get("lrms")
                        mpi_launch_method = resource_cfg.get("mpi_launch_method")
                        pre_bootstrap_1 = resource_cfg.get("pre_bootstrap_1")
                        pre_bootstrap_2 = resource_cfg.get("pre_bootstrap_2")
                        python_interpreter = resource_cfg.get("python_interpreter")
                        spmd_variation = resource_cfg.get("spmd_variation")
                        task_launch_method = resource_cfg.get("task_launch_method")
                        rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get("stage_cacerts", "False")
                        cores_per_node = resource_cfg.get("cores_per_node")
                        shared_filesystem = resource_cfg.get("shared_filesystem", True)
                        health_check = resource_cfg.get("health_check", True)
                        python_dist = resource_cfg.get("python_dist")
                        cu_pre_exec = resource_cfg.get("cu_pre_exec")
                        cu_post_exec = resource_cfg.get("cu_post_exec")
                        export_to_cu = resource_cfg.get("export_to_cu")

                        # Agent configuration that is not part of the public API.
                        # The agent config can either be a config dict, or
                        # a string pointing to a configuration name.  If neither
                        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
                        # set.  The last fallback is 'agent_default'
                        agent_config = compute_pilot["description"].get("_config")
                        if not agent_config:
                            agent_config = os.environ.get("RADICAL_PILOT_AGENT_CONFIG")
                        if not agent_config:
                            agent_config = rc_agent_config

                        if isinstance(agent_config, dict):
                            # use the given dict as-is
                            agent_cfg_dict = agent_config

                        elif isinstance(agent_config, basestring):
                            try:
                                if os.path.exists(agent_config):
                                    # try to open as file name
                                    agent_cfg_file = agent_config
                                    logger.info("Read agent config file: %s" % agent_cfg_file)
                                    agent_cfg_dict = ru.read_json(agent_cfg_file)
                                else:
                                    # otherwise interpret as a config name
                                    module_path = os.path.dirname(os.path.abspath(__file__))
                                    config_path = "%s/../configs/" % module_path
                                    agent_cfg_file = os.path.join(config_path, "agent_%s.json" % agent_config)
                                    logger.info("Read agent config file: %s" % agent_cfg_file)
                                    agent_cfg_dict = ru.read_json(agent_cfg_file)
                                # no matter how we read the config file, we
                                # allow for user level overload
                                cfg_base = os.path.basename(agent_cfg_file)
                                user_cfg = "%s/.radical/pilot/config/%s" % (os.environ["HOME"], cfg_base)
                                if os.path.exists(user_cfg):
                                    logger.info("merging user config: %s" % user_cfg)
                                    user_cfg_dict = ru.read_json(user_cfg)
                                    ru.dict_merge(agent_cfg_dict, user_cfg_dict, policy="overwrite")
                            except Exception as e:
                                logger.exception("Error reading agent config file: %s" % e)
                                raise

                        else:
                            # we can't handle this type
                            raise TypeError("agent config must be string (filename) or dict")

                        # TODO: use booleans all the way?
                        # (the config may hold a bool or a string, so normalize)
                        if str(stage_cacerts).lower() == "true":
                            stage_cacerts = True
                        else:
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            "pilot_sandbox": saga.Url(pilot_sandbox).path,
                            "global_sandbox": saga.Url(global_sandbox).path,
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get("global_virtenv")
                        if global_virtenv:
                            logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                            virtenv = global_virtenv
                            virtenv_mode = "use"

                        # Create a host:port string for use by the bootstrap_1.
                        db_url = saga.Url(agent_dburl)
                        if db_url.port:
                            db_hostport = "%s:%d" % (db_url.host, db_url.port)
                        else:
                            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

                        # Open the remote sandbox
                        # TODO: make conditional on shared_fs?
                        sandbox_tgt = saga.filesystem.Directory(
                            pilot_sandbox, session=self._session, flags=saga.filesystem.CREATE_PARENTS
                        )

                        LOCAL_SCHEME = "file"

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.
                        # This also creates the sandbox.
                        BOOTSTRAPPER_SCRIPT = "bootstrap_1.sh"
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, BOOTSTRAPPER_SCRIPT))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, bootstrapper_path))

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." % (bs_script_url, sandbox_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        if shared_filesystem:
                            sandbox_tgt.copy(bs_script_url, BOOTSTRAPPER_SCRIPT)

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   virtenv@<tag>          (a specific tag, branch or commit)
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then a 'virtenv@<tag>'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        if not rp_version.startswith("@") and rp_version not in ["installed", "release", "local", "debug"]:
                            raise ValueError("invalid rp_version '%s'" % rp_version)

                        stage_sdist = True
                        if rp_version in ["installed", "release"]:
                            stage_sdist = False

                        if rp_version.startswith("@"):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for sdist_path in [ru.sdist_path, saga.sdist_path, rp_sdist_path]:

                                sdist_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, sdist_path))
                                msg = "Copying sdist '%s' to sandbox (%s)." % (sdist_url, pilot_sandbox)
                                logentries.append(Logentry(msg, logger=logger.debug))
                                if shared_filesystem:
                                    sandbox_tgt.copy(sdist_url, os.path.basename(str(sdist_url)))

                        # ------------------------------------------------------
                        # Some machines cannot run pip due to outdated CA certs.
                        # For those, we also stage an updated certificate bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz"))

                            cc_url = saga.Url("%s://localhost/%s" % (LOCAL_SCHEME, cc_path))
                            msg = "Copying CA certificate bundle '%s' to sandbox (%s)." % (cc_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))
                            if shared_filesystem:
                                sandbox_tgt.copy(cc_url, os.path.basename(str(cc_url)))

                        # ------------------------------------------------------
                        # sanity checks
                        if not python_dist:
                            raise RuntimeError("missing python distribution")
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms:
                            raise RuntimeError("missing LRMS")
                        if not agent_launch_method:
                            raise RuntimeError("missing agentlaunch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            # l : log files
                            # u : unit work dirs
                            # v : virtualenv
                            # e : everything (== pilot sandbox)
                            cleanup = "luve"

                            # we never clean up virtenvs which are not private
                            if virtenv_mode != "private":
                                cleanup = cleanup.replace("v", "")
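                            # e.g. cleanup=True on a resource with a shared
                            # (non-private) virtenv thus ends up as cleanup='lue'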

                        sdists = ":".join([ru.sdist_name, saga.sdist_name, rp_sdist_name])

                        # if cores_per_node is set (!= None), then we need to
                        # allocate full nodes, and thus round up
                        if cores_per_node:
                            cores_per_node = int(cores_per_node)
                            number_cores = int(cores_per_node * math.ceil(float(number_cores) / cores_per_node))
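                            # e.g. cores_per_node=16 and a request for 20 cores
                            # rounds up to 32 cores (two full nodes)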

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -d '%s'" % sdists
                        bootstrap_args += " -m '%s'" % virtenv_mode
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -r '%s'" % rp_version
                        bootstrap_args += " -s '%s'" % session_id
                        bootstrap_args += " -v '%s'" % virtenv
                        bootstrap_args += " -b '%s'" % python_dist

                        # set optional args
                        if agent_type:
                            bootstrap_args += " -a '%s'" % agent_type
                        if lrms == "CCM":
                            bootstrap_args += " -c"
                        if pre_bootstrap_1:
                            bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap_1)
                        if pre_bootstrap_2:
                            bootstrap_args += " -w '%s'" % "' -w '".join(pre_bootstrap_2)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                            bootstrap_args += " -h '%s'" % db_hostport
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if tunnel_bind_device:
                            bootstrap_args += " -t '%s'" % tunnel_bind_device
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # set some agent configuration
                        agent_cfg_dict["cores"] = number_cores
                        agent_cfg_dict["resource_cfg"] = resource_cfg
                        agent_cfg_dict["debug"] = os.environ.get(
                            "RADICAL_PILOT_AGENT_VERBOSE", logger.getEffectiveLevel()
                        )
                        agent_cfg_dict["mongodb_url"] = str(agent_dburl)
                        agent_cfg_dict["lrms"] = lrms
                        agent_cfg_dict["spawner"] = agent_spawner
                        agent_cfg_dict["scheduler"] = agent_scheduler
                        agent_cfg_dict["runtime"] = runtime
                        agent_cfg_dict["pilot_id"] = pilot_id
                        agent_cfg_dict["session_id"] = session_id
                        agent_cfg_dict["agent_launch_method"] = agent_launch_method
                        agent_cfg_dict["task_launch_method"] = task_launch_method
                        agent_cfg_dict["export_to_cu"] = export_to_cu
                        agent_cfg_dict["cu_pre_exec"] = cu_pre_exec
                        agent_cfg_dict["cu_post_exec"] = cu_post_exec
                        if mpi_launch_method:
                            agent_cfg_dict["mpi_launch_method"] = mpi_launch_method
                        if cores_per_node:
                            agent_cfg_dict["cores_per_node"] = cores_per_node

                        # ------------------------------------------------------
                        # Write agent config dict to a json file in pilot sandbox.

                        cfg_tmp_dir = tempfile.mkdtemp(prefix="rp_agent_cfg_dir")
                        agent_cfg_name = "agent_0.cfg"
                        cfg_tmp_file = os.path.join(cfg_tmp_dir, agent_cfg_name)
                        cfg_tmp_handle = os.open(cfg_tmp_file, os.O_WRONLY | os.O_CREAT)

                        # Convert dict to json file
                        msg = "Writing agent configuration to file '%s'." % cfg_tmp_file
                        logentries.append(Logentry(msg, logger=logger.debug))
                        ru.write_json(agent_cfg_dict, cfg_tmp_file)

                        cf_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, cfg_tmp_file))
                        msg = "Copying agent configuration file '%s' to sandbox (%s)." % (cf_url, pilot_sandbox)
                        logentries.append(Logentry(msg, logger=logger.debug))
                        if shared_filesystem:
                            sandbox_tgt.copy(cf_url, agent_cfg_name)

                        # Close agent config file
                        os.close(cfg_tmp_handle)

                        # ------------------------------------------------------
                        # Done with all transfers to pilot sandbox, close handle
                        sandbox_tgt.close()

                        # ------------------------------------------------------
                        # now that the scripts are in place and configured,
                        # we can launch the agent
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data["job_services"]:
                            js = self._shared_worker_data["job_services"][js_url]
                        else:
                            js = saga.job.Service(js_url, session=self._session)
                            self._shared_worker_data["job_services"][js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = ["-l %s" % BOOTSTRAPPER_SCRIPT, bootstrap_args]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "bootstrap_1.out"
                        jd.error = "bootstrap_1.err"
                        jd.total_cpu_count = number_cores
                        jd.processes_per_host = cores_per_node
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue
                        jd.candidate_hosts = candidate_hosts
                        jd.environment = dict()

                        # TODO: not all files might be required, this also needs to be made conditional
                        if not shared_filesystem:
                            jd.file_transfer = [
                                #'%s > %s' % (bootstrapper_path, os.path.basename(bootstrapper_path)),
                                "%s > %s"
                                % (
                                    bootstrapper_path,
                                    os.path.join(jd.working_directory, "input", os.path.basename(bootstrapper_path)),
                                ),
                                "%s > %s" % (cfg_tmp_file, os.path.join(jd.working_directory, "input", agent_cfg_name)),
                                #'%s < %s' % ('agent.log', os.path.join(jd.working_directory, 'agent.log')),
                                #'%s < %s' % (os.path.join(jd.working_directory, 'agent.log'), 'agent.log'),
                                #'%s < %s' % ('agent.log', 'agent.log'),
                                #'%s < %s' % (os.path.join(jd.working_directory, 'STDOUT'), 'unit.000000/STDOUT'),
                                #'%s < %s' % (os.path.join(jd.working_directory, 'unit.000000/STDERR'), 'STDERR')
                                #'%s < %s' % ('unit.000000/STDERR', 'unit.000000/STDERR')
                                # TODO: This needs to go into a per pilot directory on the submit node
                                "%s < %s" % ("pilot.0000.log.tgz", "pilot.0000.log.tgz"),
                            ]

                            if stage_sdist:
                                jd.file_transfer.extend(
                                    [
                                        #'%s > %s' % (rp_sdist_path, os.path.basename(rp_sdist_path)),
                                        "%s > %s"
                                        % (
                                            rp_sdist_path,
                                            os.path.join(
                                                jd.working_directory, "input", os.path.basename(rp_sdist_path)
                                            ),
                                        ),
                                        #'%s > %s' % (saga.sdist_path, os.path.basename(saga.sdist_path)),
                                        "%s > %s"
                                        % (
                                            saga.sdist_path,
                                            os.path.join(
                                                jd.working_directory, "input", os.path.basename(saga.sdist_path)
                                            ),
                                        ),
                                        #'%s > %s' % (ru.sdist_path, os.path.basename(ru.sdist_path)),
                                        "%s > %s"
                                        % (
                                            ru.sdist_path,
                                            os.path.join(
                                                jd.working_directory, "input", os.path.basename(ru.sdist_path)
                                            ),
                                        ),
                                    ]
                                )

                            if stage_cacerts:
                                jd.file_transfer.append(
                                    "%s > %s"
                                    % (cc_path, os.path.join(jd.working_directory, "input", os.path.basename(cc_path)))
                                )

                            if "RADICAL_PILOT_PROFILE" in os.environ:
                                # TODO: This needs to go into a per pilot directory on the submit node
                                jd.file_transfer.append("%s < %s" % ("pilot.0000.prof.tgz", "pilot.0000.prof.tgz"))

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if "RADICAL_PILOT_PROFILE" in os.environ:
                            jd.environment["RADICAL_PILOT_PROFILE"] = "TRUE"

                        logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        try:
                            pilotjob = js.create_job(jd)
                        except saga.BadParameter as e:
                            raise ValueError("Pilot submission to %s failed: %s" % (resource_key, e))
                        pilotjob.run()

                        # Clean up agent config file and dir after submission
                        os.unlink(cfg_tmp_file)
                        os.rmdir(cfg_tmp_dir)

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                        msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        ts = time.time()
                        ret = pilot_col.update(
                            {"_id": pilot_id, "state": LAUNCHING},
                            {
                                "$set": {
                                    "state": PENDING_ACTIVE,
                                    "saga_job_id": saga_job_id,
                                    "health_check_enabled": health_check,
                                    "agent_config": agent_cfg_dict,
                                },
                                "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )

                        if ret["n"] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update(
                                {"_id": pilot_id},
                                {
                                    "$set": {"saga_job_id": saga_job_id, "health_check_enabled": health_check},
                                    "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                    "$pushAll": {"log": log_dicts},
                                },
                            )

                    except Exception as e:
                        # Update the Pilot's state to 'FAILED'.
                        out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                        ts = time.time()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(str(le.message))

                        pilot_col.update(
                            {"_id": pilot_id, "state": {"$ne": FAILED}},
                            {
                                "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                                "$push": {"statehistory": {"state": FAILED, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )
                        logger.exception("\n".join(log_messages))

        except SystemExit as e:
            logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
            import thread

            thread.interrupt_main()
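
A minimal standalone sketch of the atomic "claim a pending pilot" pattern used in the loop above -- the collection name, URL and state strings are placeholders, and it mirrors the (legacy pymongo) find_and_modify call shown in the code:

    import time
    import pymongo

    PENDING_LAUNCH = 'PendingLaunch'    # placeholder, see radical.pilot.states
    LAUNCHING      = 'Launching'        # placeholder, see radical.pilot.states

    client    = pymongo.MongoClient('mongodb://localhost:27017/')
    pilot_col = client['radicalpilot']['rp.session.example.p']

    ts    = time.time()
    pilot = pilot_col.find_and_modify(
        query ={'pilotmanager': 'pmgr.0000', 'state': PENDING_LAUNCH},
        update={'$set' : {'state'       : LAUNCHING},
                '$push': {'statehistory': {'state': LAUNCHING, 'timestamp': ts}}})

    if pilot:
        # only one launcher can win this update, so the pilot document is now
        # exclusively owned by this worker and can be launched safely
        print 'claimed pilot %s' % pilot['_id']
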
예제 #30
    def __init__(self, agent_name):

        assert(agent_name == 'agent_0'), 'expect agent_0, not subagent'
        print 'startup agent %s' % agent_name

        # load config, create session, init rpu.Worker
        agent_cfg  = '%s/%s.cfg' % (os.getcwd(), agent_name)
        cfg        = ru.read_json_str(agent_cfg)

        cfg['agent_name'] = agent_name

        self._uid         = agent_name
        self._pid         = cfg['pilot_id']
        self._sid         = cfg['session_id']
        self._runtime     = cfg['runtime']
        self._starttime   = time.time()
        self._final_cause = None
        self._lrms        = None

        # this better be on a shared FS!
        cfg['workdir']    = os.getcwd()

        # sanity check on config settings
        if not 'cores'               in cfg: raise ValueError('Missing number of cores')
        if not 'lrms'                in cfg: raise ValueError('Missing LRMS')
        if not 'dburl'               in cfg: raise ValueError('Missing DBURL')
        if not 'pilot_id'            in cfg: raise ValueError('Missing pilot id')
        if not 'runtime'             in cfg: raise ValueError('Missing or zero agent runtime')
        if not 'scheduler'           in cfg: raise ValueError('Missing agent scheduler')
        if not 'session_id'          in cfg: raise ValueError('Missing session id')
        if not 'spawner'             in cfg: raise ValueError('Missing agent spawner')
        if not 'task_launch_method'  in cfg: raise ValueError('Missing unit launch method')

        # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold
        # the address of the tunnelized DB endpoint. If it exists, we
        # overrule the agent config with it.
        hostport = os.environ.get('RADICAL_PILOT_DB_HOSTPORT')
        if hostport:
            dburl = ru.Url(cfg['dburl'])
            dburl.host, dburl.port = hostport.split(':')
            cfg['dburl'] = str(dburl)
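            # e.g. (illustrative values) 'mongodb://db.host.net:27017/rp' with
            # RADICAL_PILOT_DB_HOSTPORT='localhost:10027' becomes
            # 'mongodb://localhost:10027/rp'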

        # Create a session.
        #
        # This session will connect to MongoDB, and will also create any
        # communication channels and components/workers specified in the
        # config -- we merge that information into our own config.
        # We don't want the session to start components though, so remove them
        # from the config copy.
        session_cfg = copy.deepcopy(cfg)
        session_cfg['components'] = dict()
        session = rp_Session(cfg=session_cfg, uid=self._sid)

        # we still want the bridge addresses known though, so make sure they are
        # merged into our own copy, along with any other additions done by the
        # session.
        ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
        pprint.pprint(cfg)

        if not session.is_connected:
            raise RuntimeError('agent_0 could not connect to mongodb')

        # at this point the session is up and connected, and it should have
        # brought up all communication bridges and the UpdateWorker.  We are
        # ready to rumble!
        rpu.Worker.__init__(self, cfg, session)

        # this is the earliest point to sync bootstrapper and agent profiles
        self._prof.prof('sync_rel', msg='agent_0 start', uid=self._pid)

        # Create LRMS which will give us the set of agent_nodes to use for
        # sub-agent startup.  Add the remaining LRMS information to the
        # config (for the benefit of the scheduler).
        self._lrms = rpa_rm.RM.create(name=self._cfg['lrms'], cfg=self._cfg,
                                      session=self._session)

        # add the resource manager information to our own config
        self._cfg['lrms_info'] = self._lrms.lrms_info
예제 #31
def get_config (params) :
    """
    This method attempts to obtain configuration settings from a variety of
    sources, depending on the parameter: it can point to an env variable, to
    a directory containing configuration files, to a single configuration
    file, or to a list of any of the above; it can also already be a config
    dict, or a list of such dicts.  In all cases, the config is obtained from
    the respective source (which is assumed to be JSON formatted in the case
    of config files), and a single merged and expanded dict is returned.
    """


    ret = dict()

    # always make params list for simpler code below
    if  not isinstance(params, list) :
        params = [params]


    for param in params :

        if  not param :

            # we silently accept None's, to save some
            # repetitive checks on the calling side
            continue


        elif isinstance (param, dict) :

            # simply merge it into the result
            ru.dict_merge (ret, param, policy='overwrite')


        elif isinstance (param, basestring) :
        
            # check if the string points to an env variable
            if  param in os.environ :
                # assume that the value of the env var is what we really want
                param = os.environ[param]

            # is string, is not env, must be a dir or a file
            if  os.path.isdir (param) :
                # config dir
                cfg_files = glob.glob ("%s/*" % param)
              # print 'is dir %s/*' % param
              # print cfg_files

            elif os.path.isfile (param) :
                # single config file
                cfg_files = [param]

            else :
                troy._logger.warning ("cannot handle config location %s" % param)
                cfg_files = list()

            troy._logger.debug ("config files: %s" % cfg_files)
            # read and merge all config files
            for cfg_file in cfg_files :
                cfg_dict = dict()
                try :
                    cfg_dict = ru.read_json (cfg_file)
                    troy._logger.info ("reading  config in %s" % cfg_file)
                except Exception as e :
                    troy._logger.critical ("failed to read config in %s (%s)" % (cfg_file, e))
                    raise

              # import pprint
              # print '================'
              # print cfg_file
              # pprint.pprint (cfg_dict)
              # print '================'

                ru.dict_merge (ret, cfg_dict, policy='overwrite')



        else :
            raise TypeError ("get_config parameter must be (list of) dict or "
                             "string, not %s" % type(param))

  # print '================================'
  # pprint.pprint (ret)
  # print '================================'

    # expand config(s) before returning
    ru.dict_stringexpand (ret)

    return ret
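
A hypothetical call illustrating the accepted parameter mix (the env variable name, paths and keys below are made up, and get_config is assumed to be importable from this module):

    import os

    os.environ['MY_APP_CONFIG'] = '/etc/my_app/config.json'    # env var pointing to a JSON file

    cfg = get_config (['MY_APP_CONFIG',                         # resolved via os.environ
                       '%s/.my_app/' % os.environ['HOME'],      # directory of JSON config files
                       {'log_level' : 'DEBUG'}])                # literal dict, merged last (overwrites)
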
예제 #32
    def __init__(self,
                 database_url=None,
                 database_name="radicalpilot",
                 uid=None,
                 name=None):
        """Creates a new or reconnects to an exising session.

        If called without a uid, a new Session instance is created and 
        stored in the database. If uid is set, an existing session is 
        retrieved from the database. 

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name 
              (default: 'radicalpilot').

            * **uid** (`string`): If uid is set, we try to
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`

        """

        # init the base class inits
        saga.Session.__init__(self)
        Object.__init__(self)

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        self._pilot_manager_objects = list()
        self._unit_manager_objects = list()

        # Create a new process registry. All objects belonging to this
        # session will register their worker processes (if they have any)
        # in this registry. This makes it easier to shut down things in
        # a more coordinate fashion.
        self._process_registry = _ProcessRegistry()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        self._database_url = database_url
        self._database_name = database_name

        if not self._database_url:
            self._database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not self._database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        logger.info("using database url  %s" % self._database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        tmp_url = ru.Url(self._database_url)
        if  tmp_url.path            and \
            tmp_url.path[0]  == '/' and \
            len(tmp_url.path) >  1  :
            self._database_name = tmp_url.path[1:]
            logger.info("using database path %s" % self._database_name)
        else:
            logger.info("using database name %s" % self._database_name)

        # Loading all "default" resource configurations
        module_path = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/*.json" % module_path
        config_files = glob.glob(default_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        user_cfgs = "%s/.radical/pilot/configs/*.json" % os.environ.get('HOME')
        config_files = glob.glob(user_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        ##########################
        ## CREATE A NEW SESSION ##
        ##########################
        if uid is None:
            try:
                self._connected = None

                if name:
                    self._name = name
                    self._uid = name
                # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
                else:
                    self._uid = ru.generate_id('rp.session',
                                               mode=ru.ID_PRIVATE)
                    self._name = self._uid


                self._dbs, self._created, self._connection_info = \
                        dbSession.new(sid     = self._uid,
                                      name    = self._name,
                                      db_url  = self._database_url,
                                      db_name = database_name)

                logger.info("New Session created%s." % str(self))

            except Exception as ex:
                logger.exception('session create failed')
                raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                                % (self._database_url, ex))
예제 #33
class Session(saga.Session):
    """A Session encapsulates a RADICAL-Pilot instance and is the *root* object
    for all other RADICAL-Pilot objects. 

    A Session holds :class:`radical.pilot.PilotManager` and :class:`radical.pilot.UnitManager`
    instances which in turn hold  :class:`radical.pilot.Pilot` and
    :class:`radical.pilot.ComputeUnit` instances.

    Each Session has a unique identifier :data:`radical.pilot.Session.uid` that can be
    used to re-connect to a RADICAL-Pilot instance in the database.

    **Example**::

        s1 = radical.pilot.Session(database_url=DBURL)
        s2 = radical.pilot.Session(database_url=DBURL, uid=s1.uid)

        # s1 and s2 are pointing to the same session
        assert s1.uid == s2.uid
    """

    #---------------------------------------------------------------------------
    #
    def __init__(self, database_url=None, database_name=None, name=None):
        """Creates a new session.

        If called without a uid, a new Session instance is created and 
        stored in the database. If uid is set, an existing session is 
        retrieved from the database. 

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name 
              (default: 'radicalpilot').

            * **uid** (`string`): If uid is set, we try to
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`

        """

        logger = ru.get_logger('radical.pilot')

        if database_name:
            logger.error(
                "The 'database_name' parameter is deprecated - please specify an URL path"
            )
        else:
            database_name = 'radicalpilot'

        # init the base class inits
        saga.Session.__init__(self)
        self._dh = ru.DebugHelper()
        self._valid = True
        self._terminate = threading.Event()
        self._terminate.clear()

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        self._pilot_manager_objects = dict()
        self._unit_manager_objects = dict()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        if not database_url:
            database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        self._dburl = ru.Url(database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        if  not self._dburl.path         or \
            self._dburl.path[0]   != '/' or \
            len(self._dburl.path) <=  1  :
            logger.error(
                "incomplete URLs are deprecated -- missing database name!")
            self._dburl.path = database_name  # defaults to 'radicalpilot'
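            # e.g. (illustrative URLs) 'mongodb://db.example.net:27017' has no
            # path and falls back to 'radicalpilot', while
            # 'mongodb://db.example.net:27017/my_db' uses 'my_db'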

        logger.info("using database %s" % self._dburl)

        # ----------------------------------------------------------------------
        # create new session
        try:
            if name:
                self._name = name
                self._uid = name
            # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
            else:
                self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
                self._name = self._uid

            logger.report.info('<<create session %s' % self._uid)

            self._dbs = dbSession(sid=self._uid,
                                  name=self._name,
                                  dburl=self._dburl)

            self._dburl = self._dbs._dburl

            logger.info("New Session created: %s." % str(self))

        except Exception as ex:
            logger.exception('session create failed')
            raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                            % (self._dburl, ex))

        # initialize profiling
        self.prof = Profiler('%s' % self._uid)
        self.prof.prof('start session', uid=self._uid)

        # Loading all "default" resource configurations
        module_path = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/resource_*.json" % module_path
        config_files = glob.glob(default_cfgs)

        for config_file in config_files:

            try:
                logger.info("Load resource configurations from %s" %
                            config_file)
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Load resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        user_cfgs = "%s/.radical/pilot/configs/resource_*.json" % os.environ.get(
            'HOME')
        config_files = glob.glob(user_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Load resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/resource_aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        self.prof.prof('configs parsed', uid=self._uid)

        _rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
        if _rec:
            self._rec = "%s/%s" % (_rec, self._uid)
            os.system('mkdir -p %s' % self._rec)
            ru.write_json({'dburl': str(self._dburl)},
                          "%s/session.json" % self._rec)
            logger.info("recording session in %s" % self._rec)
        else:
            self._rec = None

        logger.report.ok('>>ok\n')
#     {
#         "*.futuregrid.org" :
#         {
#             "username"      : "merzky"
#         }
#     }
# }
USER_CONFIG_PATH = os.environ.get ('HOME', '/tmp') + '/.my_app.cfg' 

# load the user config, and merge it with the default config
user_config = ru.read_json_str (USER_CONFIG_PATH)


# merge the user config into the app config, so that the user config keys are
# applied where appropriate
ru.dict_merge (app_config, user_config, policy='overwrite', wildcards=True)


# lets see what we got
pprint.pprint (app_config)


# this should result in :
#
# {
#     'log_level'   : 0,
#     'scheduler'   : 'rp.SCHED_BACKFILLING',
#     'resources'   : ['india.furturegrid.org', 'sierra.futuregrid.org'],
#     'resource_cfg': 
#     {
#         '*.futuregrid.org':