def write_sub_configs(cfg, bridges, nodeip, log):
    """
    create a sub_config for each sub-agent we intent to spawn
    """

    # get bridge addresses from our bridges, and append them to the config
    if not "bridge_addresses" in cfg:
        cfg["bridge_addresses"] = dict()

    for b in bridges:
        # to avoid confusion with component input and output, we call bridge
        # input a 'sink', and a bridge output a 'source' (from the component
        # perspective)
        sink = ru.Url(bridges[b]["in"])
        source = ru.Url(bridges[b]["out"])

        # we replace the ip address with what we got from LRMS (nodeip).  The
        # bridge should be listening on all interfaces, but we want to make sure
        # the sub-agents connect on an IP which is accessible to them
        sink.host = nodeip
        source.host = nodeip

        # keep the resulting URLs as strings, to be used as addresses
        cfg["bridge_addresses"][b] = dict()
        cfg["bridge_addresses"][b]["sink"] = str(sink)
        cfg["bridge_addresses"][b]["source"] = str(source)

    # write deep-copies of the config (with the corrected agent_name) for each
    # sub-agent (apart from agent_0, obviously)
    for sa in cfg.get("agent_layout"):
        if sa != "agent_0":
            sa_cfg = copy.deepcopy(cfg)
            sa_cfg["agent_name"] = sa
            ru.write_json(sa_cfg, "./%s.cfg" % sa)
Example No. 2
def get_session_docs (db, sid, cache=None, cachedir=None) :

    # session docs may have been cached in /tmp/rp_cache_<uid>/<sid>.json -- in
    # that case we pull them from there instead of the database, which is much
    # quicker.  Also, we cache any retrieved docs to that place for later use.
    # An optional cachedir parameter changes that default location for lookup
    # and storage.
    if  not cachedir :
        cachedir = _CACHE_BASEDIR

    if  not cache :
        cache = "%s/%s.json" % (cachedir, sid)

    try :
        if  os.path.isfile (cache) :
            return ru.read_json (cache)
    except Exception as e :
        # continue w/o cache
        sys.stderr.write ("warning: cannot read session cache at %s (%s)\n" % (cache, e))


    # cache not used or not found -- go to db
    json_data = dict()

    # convert bson to json, i.e. serialize the ObjectIDs into strings.
    json_data['session'] = bson2json (list(db["%s"    % sid].find ()))
    json_data['pmgr'   ] = bson2json (list(db["%s.pm" % sid].find ()))
    json_data['pilot'  ] = bson2json (list(db["%s.p"  % sid].find ()))
    json_data['umgr'   ] = bson2json (list(db["%s.um" % sid].find ()))
    json_data['unit'   ] = bson2json (list(db["%s.cu" % sid].find ()))

    if  len(json_data['session']) == 0 :
        raise ValueError ('no such session %s' % sid)

  # if  len(json_data['session']) > 1 :
  #     print 'more than one session document -- picking first one'

    # there can only be one session, not a list of one
    json_data['session'] = json_data['session'][0]

    # we want to add a list of handled units to each pilot doc
    for pilot in json_data['pilot'] :

        pilot['unit_ids'] = list()

        for unit in json_data['unit'] :

            if  unit['pilot'] == str(pilot['_id']) :
                pilot['unit_ids'].append (str(unit['_id']))

    # if we got here, we did not find a cached version -- thus add this dataset
    # to the cache
    try :
        os.system ('mkdir -p %s' % cachedir)
        ru.write_json (json_data, "%s/%s.json" % (cachedir, sid))
    except Exception :
        # we can live without cache, no problem...
        pass

    return json_data
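
bson2json is not shown in this listing; a hedged sketch of an equivalent
helper (hypothetical, not the library's actual implementation) stringifies
MongoDB ObjectIds so the documents become JSON-serializable:

def bson2json(docs):
    # hypothetical stand-in: stringify the '_id' ObjectId of each document
    for doc in docs:
        if '_id' in doc:
            doc['_id'] = str(doc['_id'])
    return docs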
Example No. 3
def write_workflow(workflow, uid):

    try:
        os.mkdir(uid)
    except OSError:
        pass  # dir already exists

    data = list()
    if os.path.isfile('%s/entk_workflow.json' % uid):
        data = ru.read_json('%s/entk_workflow.json' % uid)

    for pipe in workflow:

        p = dict()
        p['uid'] = pipe.uid
        p['name'] = pipe.name
        p['state_history'] = pipe.state_history
        p['stages'] = list()

        for stage in pipe.stages:

            s = dict()
            s['uid'] = stage.uid
            s['name'] = stage.name
            s['state_history'] = stage.state_history
            s['tasks'] = list()

            for task in stage.tasks:
                s['tasks'].append(task.to_dict())

            p['stages'].append(s)

        data.append(p)

    ru.write_json(data, '%s/entk_workflow.json' % uid)
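
ru.write_json and ru.read_json are thin JSON file helpers; the
read-append-write pattern above amounts to the following sketch (made-up
data and path):

import radical.utils as ru

ru.write_json([{'uid': 'pipe.0000'}], '/tmp/entk_workflow.json')
data = ru.read_json('/tmp/entk_workflow.json')
data.append({'uid': 'pipe.0001'})
ru.write_json(data, '/tmp/entk_workflow.json')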
Example No. 4
def get_session_docs(db, sid, cache=None, cachedir=None):

    # session docs may have been cached in /tmp/rp_cache_<uid>/<sid>.json -- in
    # that case we pull them from there instead of the database, which is much
    # quicker.  Also, we cache any retrieved docs to that place for later use.
    # An optional cachedir parameter changes that default location for lookup
    # and storage.
    if not cachedir:
        cachedir = _CACHE_BASEDIR

    if not cache:
        cache = "%s/%s.json" % (cachedir, sid)

    try:
        if os.path.isfile(cache):
            return ru.read_json(cache)
    except Exception as e:
        # continue w/o cache
        sys.stderr.write("warning: cannot read session cache at %s (%s)\n" %
                         (cache, e))

    # cache not used or not found -- go to db
    json_data = dict()

    # convert bson to json, i.e. serialize the ObjectIDs into strings.
    json_data['session'] = bson2json(list(db["%s" % sid].find()))
    json_data['pmgr'] = bson2json(list(db["%s.pm" % sid].find()))
    json_data['pilot'] = bson2json(list(db["%s.p" % sid].find()))
    json_data['umgr'] = bson2json(list(db["%s.um" % sid].find()))
    json_data['unit'] = bson2json(list(db["%s.cu" % sid].find()))

    if len(json_data['session']) == 0:
        raise ValueError('no such session %s' % sid)

  # if len(json_data['session']) > 1:
  #     print('more than one session document -- picking first one')

    # there can only be one session, not a list of one
    json_data['session'] = json_data['session'][0]

    # we want to add a list of handled units to each pilot doc
    for pilot in json_data['pilot']:

        pilot['unit_ids'] = list()

        for unit in json_data['unit']:

            if unit['pilot'] == str(pilot['_id']):
                pilot['unit_ids'].append(str(unit['_id']))

    # if we got here, we did not find a cached version -- thus add this dataset
    # to the cache
    try:
        os.system('mkdir -p %s' % cachedir)
        ru.write_json(json_data, "%s/%s.json" % (cachedir, sid))
    except Exception:
        # we can live without cache, no problem...
        pass

    return json_data
Example No. 5
def test_amgr_read_config():

    amgr = Amgr(hostname=host, port=port)

    assert amgr._reattempts == 3

    assert amgr._rmq_cleanup
    assert amgr._autoterminate

    assert not amgr._write_workflow
    assert not amgr._resubmit_failed

    assert amgr._rts              == 'radical.pilot'
    assert amgr._num_pending_qs   == 1
    assert amgr._num_completed_qs == 1
    assert amgr._rts_config       == {"sandbox_cleanup": False,
                                      "db_cleanup"     : False}

    d = {"hostname"       : "radical.two",
         "port"           : 25672,
         "username"       : user,
         "password"       : passwd,
         "reattempts"     : 5,
         "resubmit_failed": True,
         "autoterminate"  : False,
         "write_workflow" : True,
         "rts"            : "mock",
         "rts_config"     : {"sandbox_cleanup": True,
                             "db_cleanup"     : True},
         "pending_qs"     : 2,
         "completed_qs"   : 3,
         "rmq_cleanup"    : False}

    ru.write_json(d, './config.json')
    amgr._read_config(config_path='./',
                      hostname=None,
                      port=None,
                      username=None,
                      password=None,
                      reattempts=None,
                      resubmit_failed=None,
                      autoterminate=None,
                      write_workflow=None,
                      rts=None,
                      rmq_cleanup=None,
                      rts_config=None)

    assert amgr._hostname         == d['hostname']
    assert amgr._port             == d['port']
    assert amgr._reattempts       == d['reattempts']
    assert amgr._resubmit_failed  == d['resubmit_failed']
    assert amgr._autoterminate    == d['autoterminate']
    assert amgr._write_workflow   == d['write_workflow']
    assert amgr._rts              == d['rts']
    assert amgr._rts_config       == d['rts_config']
    assert amgr._num_pending_qs   == d['pending_qs']
    assert amgr._num_completed_qs == d['completed_qs']
    assert amgr._rmq_cleanup      == d['rmq_cleanup']

    os.remove('./config.json')
Example No. 6
    def _spawn(self, launcher, funcs):

        # NOTE: see documentation of funcs['sandbox'] semantics in the ComputeUnit
        #       class definition.
        sandbox = '%s/%s'     % (self._pwd, funcs['uid'])
        fname   = '%s/%s.sh'  % (sandbox,   funcs['uid'])
        cfgname = '%s/%s.cfg' % (sandbox,   funcs['uid'])
        descr   = funcs['description']

        rpu.rec_makedir(sandbox)
        ru.write_json(funcs.get('cfg'), cfgname)

        launch_cmd, hop_cmd = launcher.construct_command(funcs, fname)

        if hop_cmd : cmdline = hop_cmd
        else       : cmdline = fname

        with open(fname, "w") as fout:

            fout.write('#!/bin/sh\n\n')

            # Create string for environment variable setting
            fout.write('export RP_SESSION_ID="%s"\n' % self._cfg['sid'])
            fout.write('export RP_PILOT_ID="%s"\n'   % self._cfg['pid'])
            fout.write('export RP_AGENT_ID="%s"\n'   % self._cfg['aid'])
            fout.write('export RP_SPAWNER_ID="%s"\n' % self.uid)
            fout.write('export RP_FUNCS_ID="%s"\n'   % funcs['uid'])
            fout.write('export RP_GTOD="%s"\n'       % self.gtod)
            fout.write('export RP_TMP="%s"\n'        % self._cu_tmp)

            # also add any env vars requested in the unit description
            if descr.get('environment', []):
                for key,val in descr['environment'].items():
                    fout.write('export "%s=%s"\n' % (key, val))

            fout.write('\n%s\n\n' % launch_cmd)
            fout.write('RETVAL=$?\n')
            fout.write("exit $RETVAL\n")

        # done writing to launch script, get it ready for execution.
        st = os.stat(fname)
        os.chmod(fname, st.st_mode | stat.S_IEXEC)

        fout = open('%s/%s.out' % (sandbox, funcs['uid']), "w")
        ferr = open('%s/%s.err' % (sandbox, funcs['uid']), "w")

        self._prof.prof('exec_start', uid=funcs['uid'])
        funcs['proc'] = subprocess.Popen(args       = cmdline,
                                         executable = None,
                                         stdin      = None,
                                         stdout     = fout,
                                         stderr     = ferr,
                                         preexec_fn = os.setsid,
                                         close_fds  = True,
                                         shell      = True,
                                         cwd        = sandbox)

        self._prof.prof('exec_ok', uid=funcs['uid'])
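
For illustration, the launch script written above ends up roughly like the
following (a sketch with placeholder values, not verbatim output):

    #!/bin/sh

    export RP_SESSION_ID="rp.session.0000"
    export RP_PILOT_ID="pilot.0000"
    export RP_AGENT_ID="agent_0"
    export RP_SPAWNER_ID="agent_0.executing.0000"
    export RP_FUNCS_ID="funcs.0000"

    <launch_cmd>

    RETVAL=$?
    exit $RETVAL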
Example No. 7
    def _write_profile(self):

        base = os.path.dirname(self._profile_loc)
        fname, ext = os.path.basename(self._profile_loc).rsplit('.', 1)
        op_name = base + '/' + fname + '.%s.' % self._uid + ext

        ru.write_json(data=self._profile, filename=op_name)
        self._logger.info('Profiles from executor %s written to %s' %
                          (self._uid, op_name))
Example No. 8
def write_data(data, proc_path):

    if 'rp.session' in proc_path:
        proc_path = os.path.dirname(os.path.dirname(proc_path)) + '/' + os.path.basename(proc_path)
    if not os.path.isdir(os.path.dirname(proc_path)):
        os.makedirs(os.path.dirname(proc_path))
    ru.write_json(data, proc_path)

    return proc_path
Example No. 9
def fetch_json(sid,
               dburl=None,
               tgt=None,
               skip_existing=False,
               session=None,
               log=None):
    '''
    returns file name
    '''

    if not log and session:
        log = session._log
        rep = session._rep
    else:
        if not log:
            log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    try:
        os.makedirs(os.path.dirname(dst))
    except OSError:
        pass  # dir exists

    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:

        log.info("session already in %s", dst)

    else:

        if not dburl:
            dburl = os.environ.get('RADICAL_PILOT_DBURL')

        if not dburl:
            raise ValueError('RADICAL_PILOT_DBURL is not set')

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)

        json_docs = get_session_docs(db, sid)
        ru.write_json(json_docs, dst)

        log.info("session written to %s", dst)

        mongo.close()

    rep.ok("+ %s (json)\n" % sid)
    return dst
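
A usage sketch, assuming RADICAL_PILOT_DBURL points at a reachable MongoDB
instance and using a made-up session id:

import os
os.environ.setdefault('RADICAL_PILOT_DBURL', 'mongodb://localhost:27017/rp')
dst = fetch_json('rp.session.host.user.000000.0000', tgt='/tmp')
# dst == '/tmp/rp.session.host.user.000000.0000.json'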
Example No. 10
    def submit(self, descr, count, cores, gpus):
        '''
        submit n workers, and pass the queue info as a configuration file.
        Do *not* wait for them to come up
        '''

        # each worker gets the specified number of cores and gpus.  All
        # resources need to be located on the same node.
        descr['cpu_processes'] = 1
        descr['cpu_threads'] = cores
        descr['cpu_thread_type'] = 'POSIX'
        descr['gpu_processes'] = gpus

        tasks = list()
        for _ in range(count):

            # write config file for that worker
            cfg = copy.deepcopy(self._cfg)
            cfg['info'] = self._info
            uid = ru.generate_id('worker')
            sbox = '%s/%s' % (cfg['base'], uid)
            fname = '%s/%s.json' % (sbox, uid)

            cfg['kind'] = 'worker'
            cfg['uid'] = uid
            cfg['base'] = sbox
            cfg['cores'] = cores
            cfg['gpus'] = gpus

            ru.rec_makedir(sbox)
            ru.write_json(cfg, fname)

            # grab default settings via CUD construction
            descr_complete = ComputeUnitDescription(descr).as_dict()

            # create task dict
            task = dict()
            task['description'] = copy.deepcopy(descr_complete)
            task['state'] = rps.AGENT_STAGING_INPUT_PENDING
            task['type'] = 'unit'
            task['uid'] = uid
            task['unit_sandbox_path'] = sbox
            task['unit_sandbox'] = 'file://localhost/' + sbox
            task['pilot_sandbox'] = cfg.base
            task['session_sandbox'] = cfg.base + '/../'
            task['resource_sandbox'] = cfg.base + '/../../'

            task['description']['arguments'] += [fname]

            tasks.append(task)
            self._workers[uid] = task

            self._log.debug('submit %s', uid)

        # insert the task
        self.advance(tasks, publish=False, push=True)
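
For illustration, each per-worker config file written above has roughly this
shape (field values are made up):

    {
        "kind" : "worker",
        "uid"  : "worker.0000",
        "base" : "/pilot/sandbox/worker.0000",
        "cores": 4,
        "gpus" : 1,
        "info" : "..."
    }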
Example No. 11
    def _initialize_primary(self, dburl):

        self._rep.info ('<<new session: ')
        self._rep.plain('[%s]' % self._uid)

        # create db connection - need a dburl to connect to
        if not dburl: dburl = self._cfg.dburl
        if not dburl: dburl = self._cfg.default_dburl
        if not dburl: raise RuntimeError("no db URL (set RADICAL_PILOT_DBURL)")

        self._cfg.dburl = dburl

        self._rep.info ('<<database   : ')
        self._rep.plain('[%s]'    % dburl)
        self._log.info('dburl %s' % dburl)

        # create/connect database handle on primary sessions
        try:
            self._dbs = DBSession(sid=self.uid, dburl=dburl,
                                  cfg=self._cfg, log=self._log)

            py_version_detail = sys.version.replace("\n", " ")
            from . import version_detail as rp_version_detail

            self.inject_metadata({'radical_stack':
                                         {'rp': rp_version_detail,
                                          'rs': rs.version_detail,
                                          'ru': ru.version_detail,
                                          'py': py_version_detail}})
        except Exception:
            self._rep.error(">>err\n")
            self._log.exception('session create failed [%s]',  dburl)
            raise RuntimeError ('session create failed [%s]' % dburl)

        # primary sessions have a component manager which also manages
        # heartbeat.  'self._cmgr.close()` should be called during termination
        self._cmgr = rpu.ComponentManager(self._cfg)
        self._cmgr.start_bridges()
        self._cmgr.start_components()

        # expose the cmgr's heartbeat channel to anyone who wants to use it
        self._cfg.heartbeat = self._cmgr.cfg.heartbeat

        self._rec = False
        if self._cfg.record:

            # append session ID to recording path
            self._rec = "%s/%s" % (self._cfg.record, self._uid)

            # create recording path and record session
            os.system('mkdir -p %s' % self._rec)
            ru.write_json({'dburl': str(self.dburl)},
                          "%s/session.json" % self._rec)
            self._log.info("recording session in %s" % self._rec)

        self._rep.ok('>>ok\n')
Example No. 12
def write_data(data, proc_path, typ=None):

    if 'rp' == typ:
        proc_path = os.path.dirname(
            os.path.dirname(proc_path)) + '/' + os.path.basename(proc_path)
    if not os.path.isdir(os.path.dirname(proc_path)):
        os.makedirs(os.path.dirname(proc_path))
    ru.write_json(data, proc_path)

    return proc_path
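
For illustration, the typ='rp' branch drops the immediate parent directory
from the target path (paths are made up):

write_data({'k': 'v'}, '/base/rp.session.0000/pilot.0000/proc.json', typ='rp')
# writes to: /base/rp.session.0000/proc.json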
Example No. 13
def write_workflows(workflows, uid, fname=None, fwrite=True):

    import warnings
    warnings.simplefilter("once")
    warnings.warn(
        "The function write_workflows will be deprecated in favor " +
        "of the profiles. Please set RADICAL_ENTK_PROFILE=TRUE",
        DeprecationWarning)
    try:
        os.mkdir(uid)

    except OSError:
        pass

    if not fname:
        fname = 'entk_workflow.json'

    data = {'stack': ru.stack(), 'workflows': list()}

    for workflow in workflows:

        w = dict()
        w['pipes'] = list()

        for pipe in workflow:

            p = dict()
            p['uid'] = pipe.uid
            p['name'] = pipe.name
            p['state_history'] = pipe.state_history
            p['stages'] = list()

            for stage in pipe.stages:

                s = dict()
                s['uid'] = stage.uid
                s['name'] = stage.name
                s['state_history'] = stage.state_history
                s['tasks'] = list()

                for task in stage.tasks:
                    s['tasks'].append(task.to_dict())

                p['stages'].append(s)

            w['pipes'].append(p)

        data['workflows'].append(w)

    if fwrite:
        ru.write_json(data, '%s/%s' % (uid, fname))
        return 0

    return data
Example No. 14
    def test_amgr_read_config(self, mocked_init, mocked_PlainCredentials,
                              mocked_ConnectionParameters, d):

        amgr = Amgr(hostname='host',
                    port='port',
                    username='******',
                    password='******')

        d["rts"] = "mock"
        d["rts_config"] = {"sandbox_cleanup": True, "db_cleanup": True}

        ru.write_json(d, './config.json')
        amgr._read_config(config_path='./',
                          hostname=None,
                          port=None,
                          username=None,
                          password=None,
                          reattempts=None,
                          resubmit_failed=None,
                          autoterminate=None,
                          write_workflow=None,
                          rts=None,
                          rmq_cleanup=None,
                          rts_config=None)

        self.assertEqual(amgr._hostname, d['hostname'])
        self.assertEqual(amgr._port, d['port'])
        self.assertEqual(amgr._reattempts, d['reattempts'])
        self.assertEqual(amgr._resubmit_failed, d['resubmit_failed'])
        self.assertEqual(amgr._autoterminate, d['autoterminate'])
        self.assertEqual(amgr._write_workflow, d['write_workflow'])
        self.assertEqual(amgr._rts, d['rts'])
        self.assertEqual(amgr._rts_config, d['rts_config'])
        self.assertEqual(amgr._num_pending_qs, d['pending_qs'])
        self.assertEqual(amgr._num_completed_qs, d['completed_qs'])
        self.assertEqual(amgr._rmq_cleanup, d['rmq_cleanup'])

        d['rts'] = 'another'
        ru.write_json(d, './config.json')
        print(d)
        with self.assertRaises(ValueError):
            amgr._read_config(config_path='./',
                              hostname=None,
                              port=None,
                              username=None,
                              password=None,
                              reattempts=None,
                              resubmit_failed=None,
                              autoterminate=None,
                              write_workflow=None,
                              rts=None,
                              rmq_cleanup=None,
                              rts_config=None)
Example No. 15
def test_amgr_read_config():

    amgr = Amgr()

    assert amgr._mq_hostname == 'localhost'
    assert amgr._port == 5672
    assert amgr._reattempts == 3
    assert amgr._resubmit_failed == False
    assert amgr._autoterminate == True
    assert amgr._write_workflow == False
    assert amgr._rts == 'radical.pilot'
    assert amgr._num_pending_qs == 1
    assert amgr._num_completed_qs == 1
    assert amgr._rmq_cleanup == True
    assert amgr._rts_config == { "sandbox_cleanup": False, "db_cleanup": False}

    d = {"hostname": "radical.two",
         "port": 25672,
         "reattempts": 5,
         "resubmit_failed": True,
         "autoterminate": False,
         "write_workflow": True,
         "rts": "mock",
         "rts_config": { "sandbox_cleanup": True, "db_cleanup": True},
         "pending_qs": 2,
         "completed_qs": 3,
         "rmq_cleanup": False}

    ru.write_json(d, './config.json')
    amgr._read_config(config_path='./',
                      hostname=None,
                      port=None,
                      reattempts=None,
                      resubmit_failed=None,
                      autoterminate=None,
                      write_workflow=None,
                      rts=None,
                      rmq_cleanup=None,
                      rts_config=None)

    assert amgr._mq_hostname == d['hostname']
    assert amgr._port == d['port']
    assert amgr._reattempts == d['reattempts']
    assert amgr._resubmit_failed == d['resubmit_failed']
    assert amgr._autoterminate == d['autoterminate']
    assert amgr._write_workflow == d['write_workflow']
    assert amgr._rts == d['rts']
    assert amgr._rts_config == d['rts_config']
    assert amgr._num_pending_qs == d['pending_qs']
    assert amgr._num_completed_qs == d['completed_qs']
    assert amgr._rmq_cleanup == d['rmq_cleanup']

    os.remove('./config.json')
Example No. 16
def fetch_json(sid, dburl=None, tgt=None, skip_existing=False, session=None,
        log=None):
    '''
    returns file name
    '''

    if not log and session:
        log = session._log
        rep = session._rep
    else:
        if not log:
            log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    try:
        os.makedirs(os.path.dirname(dst))
    except OSError:
        pass # dir exists

    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:

        log.info("session already in %s", dst)

    else:

        if not dburl:
            dburl = os.environ.get('RADICAL_PILOT_DBURL')

        if not dburl:
            raise ValueError('RADICAL_PILOT_DBURL is not set')

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)

        json_docs = get_session_docs(db, sid)
        ru.write_json(json_docs, dst)

        log.info("session written to %s", dst)

        mongo.close()

    rep.ok("+ %s (json)\n" % sid)
    return dst
Example No. 17
def write_workflows(workflows, uid, fname=None, fwrite=True):

    try:
        os.mkdir(uid)

    except OSError:
        pass

    if not fname:
        fname = 'entk_workflow.json'

    data = {'stack'    : ru.stack(),
            'workflows': list()}


    for workflow in workflows:

        w = dict()
        w['pipes'] = list()

        for pipe in workflow:

            p = dict()
            p['uid']           = pipe.uid
            p['name']          = pipe.name
            p['state_history'] = pipe.state_history
            p['stages']        = list()

            for stage in pipe.stages:

                s = dict()
                s['uid']           = stage.uid
                s['name']          = stage.name
                s['state_history'] = stage.state_history
                s['tasks']         = list()

                for task in stage.tasks:
                    s['tasks'].append(task.to_dict())

                p['stages'].append(s)

            w['pipes'].append(p)

        data['workflows'].append(w)

    if fwrite:
        ru.write_json(data, '%s/%s' % (uid, fname))
        return 0

    return data
Example No. 18
def write_workflow(workflow, uid, workflow_fout='entk_workflow', fwrite=True):

    try:
        os.mkdir(uid)
    except OSError:
        pass

    data = list()
    if os.path.isfile('%s/%s.json' % (uid, workflow_fout)):
        data = ru.read_json('%s/%s.json' % (uid, workflow_fout))

    stack = ru.stack()
    data.append({'stack': stack})

    for pipe in workflow:

        p = dict()
        p['uid'] = pipe.uid
        p['name'] = pipe.name
        p['state_history'] = pipe.state_history
        p['stages'] = list()

        for stage in pipe.stages:

            s = dict()
            s['uid'] = stage.uid
            s['name'] = stage.name
            s['state_history'] = stage.state_history
            s['tasks'] = list()

            for task in stage.tasks:
                s['tasks'].append(task.to_dict())

            p['stages'].append(s)

        data.append(p)

    if fwrite:
        ru.write_json(data, '%s/%s.json' % (uid, workflow_fout))
        return 0

    return data
Example No. 20
def fetch_json(sid, dburl=None, tgt=None, skip_existing=False):

    '''
    returns file name
    '''

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:

        print "session already in %s" % dst

    else:

        if not dburl:
            dburl = os.environ.get('RADICAL_PILOT_DBURL')

        if not dburl:
            from radical.pilot.session import default_dburl
            logger.report.warn('using default dburl: %s' % default_dburl)
            dburl = default_dburl

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)

        json_docs = get_session_docs(db, sid)
        ru.write_json(json_docs, dst)

        print "session written to %s" % dst

        mongo.close()

    return dst
Example No. 21
def fetch_json(sid, dburl=None, tgt=None, skip_existing=False):
    '''
    returns file name
    '''

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:

        print "session already in %s" % dst

    else:

        if not dburl:
            dburl = os.environ.get('RADICAL_PILOT_DBURL')

        if not dburl:
            raise RuntimeError('Please set RADICAL_PILOT_DBURL')

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)

        json_docs = get_session_docs(db, sid)
        ru.write_json(json_docs, dst)

        print "session written to %s" % dst

        mongo.close()

    return dst
Example No. 22
    def _write_sa_configs(self):

        # we have all information needed by the subagents -- write the
        # sub-agent config files.

        # write deep-copies of the config for each sub-agent (except agent_0)
        for sa in self._cfg.get('agents', {}):

            assert(sa != 'agent_0'), 'expect subagent, not agent_0'

            # use our own config sans agents/components as a basis for
            # the sub-agent config.
            tmp_cfg = copy.deepcopy(self._cfg)
            tmp_cfg['agents']     = dict()
            tmp_cfg['components'] = dict()

            # merge sub_agent layout into the config
            ru.dict_merge(tmp_cfg, self._cfg['agents'][sa], ru.OVERWRITE)

            tmp_cfg['agent_name'] = sa
            tmp_cfg['owner']      = 'agent_0'

            ru.write_json(tmp_cfg, './%s.cfg' % sa)
Example No. 24
def store_profile (profile, tags=None, url=None, mode=None) :

    if not url:
        url = os.environ.get ('RADICAL_SYNAPSE_DBURL')

    if not url:
      # print "warning: need dburl to store profiles"
        return None

    if not mode:
        raise ValueError ("document needs mode (emulated | executed | profiled)")

    url = ru.Url (url)

    if not tags:
        tags  = dict()
        elems = filter (None, os.environ.get('RADICAL_SYNAPSE_TAGS', '').split(','))
        for elem in elems:
            if ':' in elem:
                key, val  = elem.split(':', 1)
                tags[key] = val
            else:
                tags[elem] = None


    command_idx = index_command (profile['cmd'], tags)
    print "index %s (%s) to %s" % (profile['cmd'], tags, command_idx)

    host = profile['sys'].get ('hostname')
    if not host:
        host = os.environ.get ('RADICAL_SYNAPSE_HOSTNAME', socket.gethostname())
        profile['sys']['hostname'] = host

    doc  = {'type'        : 'synapse_profile',
            'mode'        : mode,
            'command_idx' : command_idx,
            'command'     : profile['cmd'],
            'tags'        : tags,
            'profile'     : profile}



    if url.schema == 'mongodb':

        print ('store profile in db %s' % url)

        [dbhost, port, dbname, _, _, _, _] = ru.split_dburl (url)

        db_client  = pymongo.MongoClient (host=dbhost, port=port)
        database   = db_client[dbname]
        collection = database['profiles']

        collection.insert (doc)


    elif url.schema == 'file':

        path = url.path

        if not os.path.isdir (path):
            os.system ('mkdir -p "%s"' % path)

        name = command_idx.split()[0]
      # for key, val in tags.iteritems():
      #     if val != None: name += "_%s:%s" % (key, val)
      #     else          : name += "_%s"    % (key)
        for tag in sorted(tags.keys()):
            if tags[tag] != None: name += "_%s" % tags[tag]
            else                : name += "_%s" % tag

        idx  = 0
        while True:
            fname = "%s/synapse_profile_%s_%s_%s_%03d.json" % (path, name, host, mode[0:3], idx)
            if not os.path.exists (fname):
                break
            idx += 1

        print ('store profile in file %s' % fname)
        os.system ('mkdir -p "%s/"' % path)
        ru.write_json (doc, fname)
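
For illustration, the file:// branch above produces file names of the form
synapse_profile_<name>_<host>_<mode[:3]>_<idx>.json; a usage sketch (made-up
values, and both the profile dict and index_command are assumed to exist):

store_profile (profile, tags={'size': '1024'},
               url='file:///tmp/profiles', mode='profiled')
# writes e.g.: /tmp/profiles/synapse_profile_<cmd>_1024_myhost_pro_000.json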
Example No. 25
def write_session_description(amgr):

    desc = dict()

    desc['entities'] = dict()
    desc['entities']['pipeline'] = {
        'state_model': res._pipeline_state_values,
        'state_values': res._pipeline_state_inv,
        'event_model': dict(),
    }

    desc['entities']['stage'] = {
        'state_model': res._stage_state_values,
        'state_values': res._stage_state_inv,
        'event_model': dict(),
    }

    desc['entities']['task'] = {
        'state_model': res._task_state_values,
        'state_values': res._task_state_inv,
        'event_model': dict(),
    }

    desc['entities']['appmanager'] = {
        'state_model': None,
        'state_values': None,
        'event_model': dict(),
    }

    # Adding amgr to the tree
    tree = dict()
    tree[amgr._uid] = {'uid': amgr._uid,
                       'etype': 'appmanager',
                       'cfg': {},
                       'has': ['pipeline',
                               'wfprocessor',
                               'resource_manager',
                               'task_manager'],
                       'children': list()
                      }

    # Adding wfp to the tree
    wfp = amgr._wfp
    tree[amgr._uid]['children'].append(wfp._uid)
    tree[wfp._uid] = {'uid': wfp._uid,
                      'etype': 'wfprocessor',
                      'cfg': {},
                      'has': [],
                      'children': list()
                     }

    # Adding rmgr to the tree
    rmgr = amgr._resource_manager
    tree[amgr._uid]['children'].append(rmgr._uid)
    tree[rmgr._uid] = {'uid': rmgr._uid,
                       'etype': 'resource_manager',
                       'cfg': {},
                       'has': [],
                       'children': list()
                      }

    # Adding tmgr to the tree
    tmgr = amgr._task_manager
    tree[amgr._uid]['children'].append(tmgr._uid)
    tree[tmgr._uid] = {'uid': tmgr._uid,
                       'etype': 'task_manager',
                       'cfg': {},
                       'has': [],
                       'children': list()
                      }

    # Adding pipelines to the tree
    wf = amgr._workflow
    for pipe in wf:
        tree[amgr._uid]['children'].append(pipe.uid)
        tree[pipe.uid] = {'uid': pipe.uid,
                          'etype': 'pipeline',
                          'cfg': {},
                          'has': ['stage'],
                          'children': list()
                          }
        # Adding stages to the tree
        for stage in pipe.stages:
            tree[pipe.uid]['children'].append(stage.uid)
            tree[stage.uid] = {'uid': stage.uid,
                               'etype': 'stage',
                               'cfg': {},
                               'has': ['task'],
                               'children': list()
                               }
            # Adding tasks to the tree
            for task in stage.tasks:
                tree[stage.uid]['children'].append(task.uid)
                tree[task.uid] = {'uid': task.uid,
                                  'etype': 'task',
                                  'cfg': {},
                                  'has': [],
                                  'children': list()
                                  }

    desc['tree'] = tree
    desc['config'] = dict()

    ru.write_json(desc, '%s/radical.entk.%s.json' % (amgr.sid, amgr.sid))
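
For illustration, the resulting radical.entk.<sid>.json file has roughly
this shape (uids are made up):

    {
        "entities": {"pipeline": {...}, "stage": {...},
                     "task": {...}, "appmanager": {...}},
        "tree": {
            "appmanager.0000": {"uid": "appmanager.0000",
                                "etype": "appmanager",
                                "children": ["wfprocessor.0000", "..."]},
            "...": {}
        },
        "config": {}
    }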
Example No. 26
    def submit_pilots(self, descriptions):
        """
        Submits on or more :class:`radical.pilot.ComputePilot` instances to the
        pilot manager.

        **Arguments:**
            * **descriptions** [:class:`radical.pilot.ComputePilotDescription`
              or list of :class:`radical.pilot.ComputePilotDescription`]: The
              description of the compute pilot instance(s) to create.

        **Returns:**
              * A list of :class:`radical.pilot.ComputePilot` objects.
        """

        from .compute_pilot import ComputePilot

        self.is_valid()

        ret_list = True
        if not isinstance(descriptions, list):
            ret_list     = False
            descriptions = [descriptions]

        if len(descriptions) == 0:
            raise ValueError('cannot submit an empty list of pilot descriptions')


        self._log.report.info('<<submit %d pilot(s)\n\t' % len(descriptions))

        # create the pilot instance
        pilots     = list()
        pilot_docs = list()
        for pd in descriptions :

            if not pd.runtime:
                raise ValueError('pilot runtime must be defined')

            if pd.runtime <= 0:
                raise ValueError('pilot runtime must be positive')

            if not pd.cores:
                raise ValueError('pilot core size must be defined')

            if not pd.resource:
                raise ValueError('pilot target resource must be defined')

            pilot = ComputePilot(pmgr=self, descr=pd)
            pilots.append(pilot)
            pilot_doc = pilot.as_dict()
            pilot_docs.append(pilot_doc)

            # keep pilots around
            with self._pilots_lock:
                self._pilots[pilot.uid] = pilot

            if self._session._rec:
                ru.write_json(pd.as_dict(), "%s/%s.batch.%03d.json" \
                        % (self._session._rec, pilot.uid, self._rec_id))
            self._log.report.progress()

        # initial state advance to 'NEW'
        # FIXME: we should use update_pilot(), but that will not trigger an
        #        advance, since the state did not change.  We would then miss
        #        the profile entry for the advance to NEW.  So we here basically
        #        only trigger the profile entry for NEW.
        self.advance(pilot_docs, state=rps.NEW, publish=False, push=False)

        if self._session._rec:
            self._rec_id += 1

        # insert pilots into the database, as a bulk.
        self._session._dbs.insert_pilots(pilot_docs)

        # Only after the insert can we hand the pilots over to the next
        # components (ie. advance state).
        for pd in pilot_docs:
            pd['state'] = rps.PMGR_LAUNCHING_PENDING
            self._update_pilot(pd, advance=False)
        self.advance(pilot_docs, publish=True, push=True)

        self._log.report.ok('>>ok\n')

        if ret_list: return pilots
        else       : return pilots[0]
Example No. 27
    def submit_pilots(self, descriptions):
        """
        Submits on or more :class:`radical.pilot.ComputePilot` instances to the
        pilot manager.

        **Arguments:**
            * **descriptions** [:class:`radical.pilot.ComputePilotDescription`
              or list of :class:`radical.pilot.ComputePilotDescription`]: The
              description of the compute pilot instance(s) to create.

        **Returns:**
              * A list of :class:`radical.pilot.ComputePilot` objects.
        """

        from .compute_pilot import ComputePilot

        self.is_valid()

        ret_list = True
        if not isinstance(descriptions, list):
            ret_list     = False
            descriptions = [descriptions]

        if len(descriptions) == 0:
            raise ValueError('cannot submit an empty list of pilot descriptions')


        self._rep.info('<<submit %d pilot(s)\n\t' % len(descriptions))

        # create the pilot instance
        pilots     = list()
        pilot_docs = list()
        for pd in descriptions :

            if not pd.runtime:
                raise ValueError('pilot runtime must be defined')

            if pd.runtime <= 0:
                raise ValueError('pilot runtime must be positive')

            if not pd.cores:
                raise ValueError('pilot size must be defined')

            if not pd.resource:
                raise ValueError('pilot target resource must be defined')

            pilot = ComputePilot(pmgr=self, descr=pd)
            pilots.append(pilot)
            pilot_doc = pilot.as_dict()
            pilot_docs.append(pilot_doc)

            # keep pilots around
            with self._pilots_lock:
                self._pilots[pilot.uid] = pilot

            if self._session._rec:
                ru.write_json(pd.as_dict(), "%s/%s.batch.%03d.json" \
                        % (self._session._rec, pilot.uid, self._rec_id))

            if 'resource' in pd and 'cores' in pd:
                self._rep.plain('[%s:%s]\n\t' % (pd['resource'], pd['cores']))
            elif 'resource' in pd:
                self._rep.plain('[%s]\n\t' % pd['resource'])


        # initial state advance to 'NEW'
        # FIXME: we should use update_pilot(), but that will not trigger an
        #        advance, since the state did not change.  We would then miss
        #        the profile entry for the advance to NEW.  So we here basically
        #        only trigger the profile entry for NEW.
        self.advance(pilot_docs, state=rps.NEW, publish=False, push=False)

        if self._session._rec:
            self._rec_id += 1

        # insert pilots into the database, as a bulk.
        self._session._dbs.insert_pilots(pilot_docs)

        # Only after the insert can we hand the pilots over to the next
        # components (ie. advance state).
        for pd in pilot_docs:
            pd['state'] = rps.PMGR_LAUNCHING_PENDING
            self._update_pilot(pd, advance=False)
        self.advance(pilot_docs, publish=True, push=True)

        self._rep.ok('>>ok\n')

        if ret_list: return pilots
        else       : return pilots[0]
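
A usage sketch, assuming a live radical.pilot session imported as rp, with a
pilot manager pmgr (resource name and sizes are illustrative):

pd          = rp.ComputePilotDescription()
pd.resource = 'local.localhost'
pd.cores    = 4
pd.runtime  = 10                      # minutes
pilot       = pmgr.submit_pilots(pd)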
Example No. 28
    def run(self):
        """Starts the process when Process.start() is called.
        """

        global JOB_CHECK_INTERVAL

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                db = self._session.get_db()
                pilot_col = db["%s.p" % self._session.uid]
                logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._terminate.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reasons and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                if self._disabled.is_set():
                    # don't process any new pilot start requests.
                    # NOTE: this is not clean; in principle there could be
                    #       other launchers alive which still want to start
                    #       those pending pilots.  In practice we only ever
                    #       use one pmgr though, and it is during its shutdown
                    #       that we get here...
                    ts = time.time()
                    compute_pilot = pilot_col.find_and_modify(
                        query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                        update={
                            "$set": {"state": CANCELED},
                            "$push": {"statehistory": {"state": CANCELED, "timestamp": ts}},
                        },
                    )

                    # run state checks more frequently.
                    JOB_CHECK_INTERVAL = 3
                    time.sleep(1)
                    continue

                # See if we can find a ComputePilot that is waiting to be
                # launched.  If we find one, we use SAGA to create a job
                # service, a job description and a job that is then sent to
                # the local or remote queueing system.  If this succeeds, we
                # set the ComputePilot's state to pending, otherwise to failed.
                compute_pilot = None

                ts = time.time()
                compute_pilot = pilot_col.find_and_modify(
                    query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                    update={
                        "$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING, "timestamp": ts}},
                    },
                )

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_id = self._session.uid
                        database_url = self._session.dburl

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot["description"]["cores"]
                        runtime = compute_pilot["description"]["runtime"]
                        queue = compute_pilot["description"]["queue"]
                        project = compute_pilot["description"]["project"]
                        cleanup = compute_pilot["description"]["cleanup"]
                        resource_key = compute_pilot["description"]["resource"]
                        schema = compute_pilot["description"]["access_schema"]
                        memory = compute_pilot["description"]["memory"]
                        candidate_hosts = compute_pilot["description"]["candidate_hosts"]
                        pilot_sandbox = compute_pilot["sandbox"]
                        global_sandbox = compute_pilot["global_sandbox"]

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_launch_method = resource_cfg.get("agent_launch_method")
                        agent_dburl = resource_cfg.get("agent_mongodb_endpoint", database_url)
                        agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                        rc_agent_config = resource_cfg.get("agent_config", DEFAULT_AGENT_CONFIG)
                        agent_scheduler = resource_cfg.get("agent_scheduler")
                        tunnel_bind_device = resource_cfg.get("tunnel_bind_device")
                        default_queue = resource_cfg.get("default_queue")
                        forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                        js_endpoint = resource_cfg.get("job_manager_endpoint")
                        lrms = resource_cfg.get("lrms")
                        mpi_launch_method = resource_cfg.get("mpi_launch_method")
                        pre_bootstrap_1 = resource_cfg.get("pre_bootstrap_1")
                        pre_bootstrap_2 = resource_cfg.get("pre_bootstrap_2")
                        python_interpreter = resource_cfg.get("python_interpreter")
                        spmd_variation = resource_cfg.get("spmd_variation")
                        task_launch_method = resource_cfg.get("task_launch_method")
                        rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get("stage_cacerts", "False")
                        cores_per_node = resource_cfg.get("cores_per_node")
                        shared_filesystem = resource_cfg.get("shared_filesystem", True)
                        health_check = resource_cfg.get("health_check", True)
                        python_dist = resource_cfg.get("python_dist")
                        cu_pre_exec = resource_cfg.get("cu_pre_exec")
                        cu_post_exec = resource_cfg.get("cu_post_exec")
                        export_to_cu = resource_cfg.get("export_to_cu")

                        # Agent configuration that is not part of the public API.
                        # The agent config can either be a config dict, or
                        # a string pointing to a configuration name.  If neither
                        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
                        # set.  The last fallback is 'agent_default'
                        agent_config = compute_pilot["description"].get("_config")
                        if not agent_config:
                            agent_config = os.environ.get("RADICAL_PILOT_AGENT_CONFIG")
                        if not agent_config:
                            agent_config = rc_agent_config

                        if isinstance(agent_config, dict):
                            # nothing to do
                            agent_cfg_dict = agent_config
                            pass

                        elif isinstance(agent_config, basestring):
                            try:
                                if os.path.exists(agent_config):
                                    # try to open as file name
                                    logger.info("Read agent config file: %s" % agent_config)
                                    agent_cfg_dict = ru.read_json(agent_config)
                                else:
                                    # otherwise interpret as a config name
                                    module_path = os.path.dirname(os.path.abspath(__file__))
                                    config_path = "%s/../configs/" % module_path
                                    agent_cfg_file = os.path.join(config_path, "agent_%s.json" % agent_config)
                                    logger.info("Read agent config file: %s" % agent_cfg_file)
                                    agent_cfg_dict = ru.read_json(agent_cfg_file)
                                # no matter how we read the config file, we
                                # allow for user level overload
                                cfg_base = os.path.basename(agent_cfg_file)
                                user_cfg = "%s/.radical/pilot/config/%s" % (os.environ["HOME"], cfg_base)
                                if os.path.exists(user_cfg):
                                    logger.info("merging user config: %s" % user_cfg)
                                    user_cfg_dict = ru.read_json(user_cfg)
                                    ru.dict_merge(agent_cfg_dict, user_cfg_dict, policy="overwrite")
                            except Exception as e:
                                logger.exception("Error reading agent config file: %s" % e)
                                raise

                        else:
                            # we can't handle this type
                            raise TypeError("agent config must be string (filename) or dict")

                        # TODO: use booleans all the way?
                        if stage_cacerts.lower() == "true":
                            stage_cacerts = True
                        else:
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            "pilot_sandbox": saga.Url(pilot_sandbox).path,
                            "global_sandbox": saga.Url(global_sandbox).path,
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get("global_virtenv")
                        if global_virtenv:
                            logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                            virtenv = global_virtenv
                            virtenv_mode = "use"

                        # Create a host:port string for use by the bootstrap_1.
                        db_url = saga.Url(agent_dburl)
                        if db_url.port:
                            db_hostport = "%s:%d" % (db_url.host, db_url.port)
                        else:
                            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

                        # Open the remote sandbox
                        # TODO: make conditional on shared_fs?
                        sandbox_tgt = saga.filesystem.Directory(
                            pilot_sandbox, session=self._session, flags=saga.filesystem.CREATE_PARENTS
                        )

                        LOCAL_SCHEME = "file"

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.
                        # This also creates the sandbox.
                        BOOTSTRAPPER_SCRIPT = "bootstrap_1.sh"
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, BOOTSTRAPPER_SCRIPT))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, bootstrapper_path))

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." % (bs_script_url, sandbox_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        if shared_filesystem:
                            sandbox_tgt.copy(bs_script_url, BOOTSTRAPPER_SCRIPT)

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        if not rp_version.startswith("@") and not rp_version in ["installed", "local", "debug"]:
                            raise ValueError("invalid rp_version '%s'" % rp_version)

                        stage_sdist = True
                        if rp_version in ["installed", "release"]:
                            stage_sdist = False

                        if rp_version.startswith("@"):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'
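                        # e.g. rp_version '@devel' strips to 'devel' and is checked
                        # out from git as per the scheme above -- no sdist staging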

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for sdist_path in [ru.sdist_path, saga.sdist_path, rp_sdist_path]:

                                sdist_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, sdist_path))
                                msg = "Copying sdist '%s' to sandbox (%s)." % (sdist_url, pilot_sandbox)
                                logentries.append(Logentry(msg, logger=logger.debug))
                                if shared_filesystem:
                                    sandbox_tgt.copy(sdist_url, os.path.basename(str(sdist_url)))

                        # ------------------------------------------------------
                        # Some machines cannot run pip due to outdated CA certs.
                        # For those, we also stage an updated certificate bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz"))

                            cc_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, cc_path))
                            msg = "Copying CA certificate bundle '%s' to sandbox (%s)." % (cc_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))
                            if shared_filesystem:
                                sandbox_tgt.copy(cc_url, os.path.basename(str(cc_url)))

                        # ------------------------------------------------------
                        # sanity checks
                        if not python_dist:
                            raise RuntimeError("missing python distribution")
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms:
                            raise RuntimeError("missing LRMS")
                        if not agent_launch_method:
                            raise RuntimeError("missing agentlaunch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            cleanup = "luve"

                            # we never clean up virtenvs which are not private
                            if virtenv_mode != "private":
                                cleanup = cleanup.replace("v", "")

                        sdists = ":".join([ru.sdist_name, saga.sdist_name, rp_sdist_name])

                        # if cores_per_node is set (!= None), then we need to
                        # allocate full nodes, and thus round up
                        if cores_per_node:
                            cores_per_node = int(cores_per_node)
                            number_cores = int(cores_per_node * math.ceil(float(number_cores) / cores_per_node))
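                        # e.g. requesting 10 cores with cores_per_node = 8 rounds
                        # up to ceil(10/8) * 8 = 16 cores (two full nodes)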

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -d '%s'" % sdists
                        bootstrap_args += " -m '%s'" % virtenv_mode
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -r '%s'" % rp_version
                        bootstrap_args += " -s '%s'" % session_id
                        bootstrap_args += " -v '%s'" % virtenv
                        bootstrap_args += " -b '%s'" % python_dist

                        # set optional args
                        if agent_type:
                            bootstrap_args += " -a '%s'" % agent_type
                        if lrms == "CCM":
                            bootstrap_args += " -c"
                        if pre_bootstrap_1:
                            bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap_1)
                        if pre_bootstrap_2:
                            bootstrap_args += " -w '%s'" % "' -w '".join(pre_bootstrap_2)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                            bootstrap_args += " -h '%s'" % db_hostport
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if tunnel_bind_device:
                            bootstrap_args += " -t '%s'" % tunnel_bind_device
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # set some agent configuration
                        agent_cfg_dict["cores"] = number_cores
                        agent_cfg_dict["resource_cfg"] = resource_cfg
                        agent_cfg_dict["debug"] = os.environ.get(
                            "RADICAL_PILOT_AGENT_VERBOSE", logger.getEffectiveLevel()
                        )
                        agent_cfg_dict["mongodb_url"] = str(agent_dburl)
                        agent_cfg_dict["lrms"] = lrms
                        agent_cfg_dict["spawner"] = agent_spawner
                        agent_cfg_dict["scheduler"] = agent_scheduler
                        agent_cfg_dict["runtime"] = runtime
                        agent_cfg_dict["pilot_id"] = pilot_id
                        agent_cfg_dict["session_id"] = session_id
                        agent_cfg_dict["agent_launch_method"] = agent_launch_method
                        agent_cfg_dict["task_launch_method"] = task_launch_method
                        agent_cfg_dict["export_to_cu"] = export_to_cu
                        agent_cfg_dict["cu_pre_exec"] = cu_pre_exec
                        agent_cfg_dict["cu_post_exec"] = cu_post_exec
                        if mpi_launch_method:
                            agent_cfg_dict["mpi_launch_method"] = mpi_launch_method
                        if cores_per_node:
                            agent_cfg_dict["cores_per_node"] = cores_per_node

                        # ------------------------------------------------------
                        # Write agent config dict to a json file in pilot sandbox.

                        cfg_tmp_dir = tempfile.mkdtemp(prefix="rp_agent_cfg_dir")
                        agent_cfg_name = "agent_0.cfg"
                        cfg_tmp_file = os.path.join(cfg_tmp_dir, agent_cfg_name)
                        cfg_tmp_handle = os.open(cfg_tmp_file, os.O_WRONLY | os.O_CREAT)

                        # Convert dict to json file
                        msg = "Writing agent configuration to file '%s'." % cfg_tmp_file
                        logentries.append(Logentry(msg, logger=logger.debug))
                        ru.write_json(agent_cfg_dict, cfg_tmp_file)

                        cf_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, cfg_tmp_file))
                        msg = "Copying agent configuration file '%s' to sandbox (%s)." % (cf_url, pilot_sandbox)
                        logentries.append(Logentry(msg, logger=logger.debug))
                        if shared_filesystem:
                            sandbox_tgt.copy(cf_url, agent_cfg_name)

                        # Close agent config file
                        os.close(cfg_tmp_handle)

                        # ------------------------------------------------------
                        # Done with all transfers to pilot sandbox, close handle
                        sandbox_tgt.close()

                        # ------------------------------------------------------
                        # now that the scripts are in place and configured,
                        # we can launch the agent
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data["job_services"]:
                            js = self._shared_worker_data["job_services"][js_url]
                        else:
                            js = saga.job.Service(js_url, session=self._session)
                            self._shared_worker_data["job_services"][js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = ["-l %s" % BOOTSTRAPPER_SCRIPT, bootstrap_args]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "bootstrap_1.out"
                        jd.error = "bootstrap_1.err"
                        jd.total_cpu_count = number_cores
                        jd.processes_per_host = cores_per_node
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue
                        jd.candidate_hosts = candidate_hosts
                        jd.environment = dict()

                        # TODO: not all files might be required, this also needs to be made conditional
                        if not shared_filesystem:
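                            # in SAGA file_transfer directives, 'src > tgt' stages
                            # a local file into the job's sandbox before execution,
                            # while 'src < tgt' stages a remote file back out after
                            # the job completes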
                            jd.file_transfer = [
                                #'%s > %s' % (bootstrapper_path, os.path.basename(bootstrapper_path)),
                                "%s > %s"
                                % (
                                    bootstrapper_path,
                                    os.path.join(jd.working_directory, "input", os.path.basename(bootstrapper_path)),
                                ),
                                "%s > %s" % (cfg_tmp_file, os.path.join(jd.working_directory, "input", agent_cfg_name)),
                                #'%s < %s' % ('agent.log', os.path.join(jd.working_directory, 'agent.log')),
                                #'%s < %s' % (os.path.join(jd.working_directory, 'agent.log'), 'agent.log'),
                                #'%s < %s' % ('agent.log', 'agent.log'),
                                #'%s < %s' % (os.path.join(jd.working_directory, 'STDOUT'), 'unit.000000/STDOUT'),
                                #'%s < %s' % (os.path.join(jd.working_directory, 'unit.000000/STDERR'), 'STDERR')
                                #'%s < %s' % ('unit.000000/STDERR', 'unit.000000/STDERR')
                                # TODO: This needs to go into a per pilot directory on the submit node
                                "%s < %s" % ("pilot.0000.log.tgz", "pilot.0000.log.tgz"),
                            ]

                            if stage_sdist:
                                jd.file_transfer.extend(
                                    [
                                        #'%s > %s' % (rp_sdist_path, os.path.basename(rp_sdist_path)),
                                        "%s > %s"
                                        % (
                                            rp_sdist_path,
                                            os.path.join(
                                                jd.working_directory, "input", os.path.basename(rp_sdist_path)
                                            ),
                                        ),
                                        #'%s > %s' % (saga.sdist_path, os.path.basename(saga.sdist_path)),
                                        "%s > %s"
                                        % (
                                            saga.sdist_path,
                                            os.path.join(
                                                jd.working_directory, "input", os.path.basename(saga.sdist_path)
                                            ),
                                        ),
                                        #'%s > %s' % (ru.sdist_path, os.path.basename(ru.sdist_path)),
                                        "%s > %s"
                                        % (
                                            ru.sdist_path,
                                            os.path.join(
                                                jd.working_directory, "input", os.path.basename(ru.sdist_path)
                                            ),
                                        ),
                                    ]
                                )

                            if stage_cacerts:
                                jd.file_transfer.append(
                                    "%s > %s"
                                    % (cc_path, os.path.join(jd.working_directory, "input", os.path.basename(cc_path)))
                                )

                            if "RADICAL_PILOT_PROFILE" in os.environ:
                                # TODO: This needs to go into a per pilot directory on the submit node
                                jd.file_transfer.append("%s < %s" % ("pilot.0000.prof.tgz", "pilot.0000.prof.tgz"))

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if "RADICAL_PILOT_PROFILE" in os.environ:
                            jd.environment["RADICAL_PILOT_PROFILE"] = "TRUE"

                        logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        try:
                            pilotjob = js.create_job(jd)
                        except saga.BadParameter as e:
                            raise ValueError("Pilot submission to %s failed: %s" % (resource_key, e))
                        pilotjob.run()

                        # Clean up agent config file and dir after submission
                        os.unlink(cfg_tmp_file)
                        os.rmdir(cfg_tmp_dir)

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                        msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        ts = time.time()
                        ret = pilot_col.update(
                            {"_id": pilot_id, "state": LAUNCHING},
                            {
                                "$set": {
                                    "state": PENDING_ACTIVE,
                                    "saga_job_id": saga_job_id,
                                    "health_check_enabled": health_check,
                                    "agent_config": agent_cfg_dict,
                                },
                                "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )

                        if ret["n"] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update(
                                {"_id": pilot_id},
                                {
                                    "$set": {"saga_job_id": saga_job_id, "health_check_enabled": health_check},
                                    "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                    "$pushAll": {"log": log_dicts},
                                },
                            )

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                        ts = time.time()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(str(le.message))

                        pilot_col.update(
                            {"_id": pilot_id, "state": {"$ne": FAILED}},
                            {
                                "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                                "$push": {"statehistory": {"state": FAILED, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )
                        logger.exception("\n".join(log_messages))

        except SystemExit as e:
            logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
            import thread

            thread.interrupt_main()
Example No. 29
    def _prepare_pilot(self, resource, rcfg, pilot):

        pid = pilot["uid"]
        ret = {'ft' : list(),
               'jd' : None  }

      # # ----------------------------------------------------------------------
      # # the rcfg can contain keys with string expansion placeholders where
      # # values from the pilot description need filling in.  A prominent
      # # example is `%(pd.project)s`, where the pilot description's `PROJECT`
      # # value needs to be filled in (here in lowercase).
      # expand = dict()
      # for k,v in pilot['description'].iteritems():
      #     if v is None:
      #         v = ''
      #     expand['pd.%s' % k] = v
      #     if isinstance(v, basestring):
      #         expand['pd.%s' % k.upper()] = v.upper()
      #         expand['pd.%s' % k.lower()] = v.lower()
      #     else:
      #         expand['pd.%s' % k.upper()] = v
      #         expand['pd.%s' % k.lower()] = v
      #
      # for k in rcfg:
      #     if isinstance(rcfg[k], basestring):
      #         orig     = rcfg[k]
      #         rcfg[k]  = rcfg[k] % expand
      #         expanded = rcfg[k]
      #         if orig != expanded:
      #             self._log.debug('RCFG:\n%s\n%s', orig, expanded)

        # ----------------------------------------------------------------------
        # Database connection parameters
        sid           = self._session.uid
        database_url  = self._session.dburl

        # some default values are determined at runtime
        default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
                          (resource, self._rp_version)
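        # the double '%%' escapes the placeholder through this first string
        # expansion, so the literal '%(resource_sandbox)s' survives and is
        # only filled in by the virtenv expansion further below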

        # ----------------------------------------------------------------------
        # pilot description and resource configuration
        number_cores    = pilot['description']['cores']
        number_gpus     = pilot['description']['gpus']
        runtime         = pilot['description']['runtime']
        queue           = pilot['description']['queue']
        project         = pilot['description']['project']
        cleanup         = pilot['description']['cleanup']
        memory          = pilot['description']['memory']
        candidate_hosts = pilot['description']['candidate_hosts']

        # ----------------------------------------------------------------------
        # get parameters from resource cfg, set defaults where needed
        agent_launch_method     = rcfg.get('agent_launch_method')
        agent_dburl             = rcfg.get('agent_mongodb_endpoint', database_url)
        agent_spawner           = rcfg.get('agent_spawner',       DEFAULT_AGENT_SPAWNER)
        rc_agent_config         = rcfg.get('agent_config',        DEFAULT_AGENT_CONFIG)
        agent_scheduler         = rcfg.get('agent_scheduler')
        tunnel_bind_device      = rcfg.get('tunnel_bind_device')
        default_queue           = rcfg.get('default_queue')
        forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint')
        lrms                    = rcfg.get('lrms')
        mpi_launch_method       = rcfg.get('mpi_launch_method', '')
        pre_bootstrap_0         = rcfg.get('pre_bootstrap_0', [])
        pre_bootstrap_1         = rcfg.get('pre_bootstrap_1', [])
        python_interpreter      = rcfg.get('python_interpreter')
        task_launch_method      = rcfg.get('task_launch_method')
        rp_version              = rcfg.get('rp_version',          DEFAULT_RP_VERSION)
        virtenv_mode            = rcfg.get('virtenv_mode',        DEFAULT_VIRTENV_MODE)
        virtenv                 = rcfg.get('virtenv',             default_virtenv)
        cores_per_node          = rcfg.get('cores_per_node', 0)
        gpus_per_node           = rcfg.get('gpus_per_node',  0)
        lfs_path_per_node       = rcfg.get('lfs_path_per_node', None)
        lfs_size_per_node       = rcfg.get('lfs_size_per_node',  0)
        python_dist             = rcfg.get('python_dist')
        virtenv_dist            = rcfg.get('virtenv_dist',        DEFAULT_VIRTENV_DIST)
        cu_tmp                  = rcfg.get('cu_tmp')
        spmd_variation          = rcfg.get('spmd_variation')
        shared_filesystem       = rcfg.get('shared_filesystem', True)
        stage_cacerts           = rcfg.get('stage_cacerts', False)
        cu_pre_exec             = rcfg.get('cu_pre_exec')
        cu_post_exec            = rcfg.get('cu_post_exec')
        export_to_cu            = rcfg.get('export_to_cu')
        mandatory_args          = rcfg.get('mandatory_args', [])
        saga_jd_supplement      = rcfg.get('saga_jd_supplement', {})

        import pprint
        self._log.debug(cores_per_node)
        self._log.debug(pprint.pformat(rcfg))

        # make sure that mandatory args are known
        for ma in mandatory_args:
            if pilot['description'].get(ma) is None:
                raise  ValueError('attribute "%s" is required for "%s"'
                                 % (ma, resource))

        # get pilot and global sandbox
        resource_sandbox = self._session._get_resource_sandbox (pilot).path
        session_sandbox  = self._session._get_session_sandbox(pilot).path
        pilot_sandbox    = self._session._get_pilot_sandbox  (pilot).path

        pilot['resource_sandbox'] = str(self._session._get_resource_sandbox(pilot))
        pilot['pilot_sandbox']    = str(self._session._get_pilot_sandbox(pilot))
        pilot['client_sandbox']   = str(self._session._get_client_sandbox())

        # Agent configuration that is not part of the public API.
        # The agent config can either be a config dict, or
        # a string pointing to a configuration name.  If neither
        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
        # set.  The last fallback is 'agent_default'
        agent_config = pilot['description'].get('_config')
        if not agent_config:
            agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG')
        if not agent_config:
            agent_config = rc_agent_config

        if isinstance(agent_config, dict):

            # use dict as is
            agent_cfg = agent_config

        elif isinstance(agent_config, basestring):
            try:
                # interpret as a config name
                agent_cfg_file = os.path.join(self._conf_dir, "agent_%s.json" % agent_config)

                self._log.info("Read agent config file: %s",  agent_cfg_file)
                agent_cfg = ru.read_json(agent_cfg_file)

                # allow for user level overload
                user_cfg_file = '%s/.radical/pilot/config/%s' \
                              % (os.environ['HOME'], os.path.basename(agent_cfg_file))

                if os.path.exists(user_cfg_file):
                    self._log.info("merging user config: %s" % user_cfg_file)
                    user_cfg = ru.read_json(user_cfg_file)
                    ru.dict_merge (agent_cfg, user_cfg, policy='overwrite')

            except Exception as e:
                self._log.exception("Error reading agent config file: %s" % e)
                raise

        else:
            # we can't handle this type
            raise TypeError('agent config must be string (config name) or dict')

        # expand variables in virtenv string
        virtenv = virtenv % {'pilot_sandbox'   : pilot_sandbox,
                             'session_sandbox' : session_sandbox,
                             'resource_sandbox': resource_sandbox}

        # Check for deprecated global_virtenv
        if 'global_virtenv' in rcfg:
            raise RuntimeError("'global_virtenv' is deprecated (%s)" % resource)

        # Create a host:port string for use by the bootstrap_0.
        db_url = rs.Url(agent_dburl)
        if db_url.port:
            db_hostport = "%s:%d" % (db_url.host, db_url.port)
        else:
            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

        # ----------------------------------------------------------------------
        # the version of the agent is derived from
        # rp_version, which has the following format
        # and interpretation:
        #
        # case rp_version:
        #   @<token>:
        #   @tag/@branch/@commit: # no sdist staging
        #       git clone $github_base radical.pilot.src
        #       (cd radical.pilot.src && git checkout token)
        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
        #       rm -rf radical.pilot.src
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   release: # no sdist staging
        #       pip install -t $VIRTENV/rp_install radical.pilot
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   local: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $VIRTENV/rp_install $sdist/
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   debug: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $SANDBOX/rp_install $sdist/
        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
        #
        #   installed: # no sdist staging
        #       true
        # esac
        #
        # virtenv_mode
        #   private : error  if ve exists, otherwise create, then use
        #   update  : update if ve exists, otherwise create, then use
        #   create  : use    if ve exists, otherwise create, then use
        #   use     : use    if ve exists, otherwise error,  then exit
        #   recreate: delete if ve exists, otherwise create, then use
        #      
        # examples   :
        #   [email protected]
        #   virtenv@devel
        #   virtenv@release
        #   virtenv@installed
        #   stage@local
        #   stage@/tmp/my_agent.py
        #
        # Note that some combinations may be invalid,
        # specifically in the context of virtenv_mode.  If, for
        # example, virtenv_mode is 'use', then the 'virtenv:tag'
        # will not make sense, as the virtenv is not updated.
        # In those cases, the virtenv_mode is honored, and
        # a warning is printed.
        #
        # Also, the 'stage' mode can only be combined with the
        # 'local' source, or with a path to the agent (relative
        # to root_dir, or absolute).
        #
        # A rp_version which does not adhere to the
        # above syntax is ignored, and the fallback stage@local
        # is used.

        if  not rp_version.startswith('@') and \
            rp_version not in ['installed', 'local', 'debug', 'release']:
            raise ValueError("invalid rp_version '%s'" % rp_version)

        if rp_version.startswith('@'):
            rp_version  = rp_version[1:]  # strip '@'


        # ----------------------------------------------------------------------
        # sanity checks
        if not python_dist        : raise RuntimeError("missing python distribution")
        if not virtenv_dist       : raise RuntimeError("missing virtualenv distribution")
        if not agent_spawner      : raise RuntimeError("missing agent spawner")
        if not agent_scheduler    : raise RuntimeError("missing agent scheduler")
        if not lrms               : raise RuntimeError("missing LRMS")
        if not agent_launch_method: raise RuntimeError("missing agent launch method")
        if not task_launch_method : raise RuntimeError("missing task launch method")

        # massage some values
        if not queue :
            queue = default_queue

        if  cleanup and isinstance (cleanup, bool) :
            #  l : log files
            #  u : unit work dirs
            #  v : virtualenv
            #  e : everything (== pilot sandbox)
            if shared_filesystem:
                cleanup = 'luve'
            else:
                # we cannot clean the sandbox from within the agent, as the hop
                # staging would then fail, and we'd get nothing back.
                # FIXME: cleanup needs to be done by the pmgr.launcher, or
                #        someone else, really, after fetching all logs and 
                #        profiles.
                cleanup = 'luv'

            # we never cleanup virtenvs which are not private
            if virtenv_mode != 'private':
                cleanup = cleanup.replace ('v', '')

        # add dists to staging files, if needed
        if rp_version in ['local', 'debug']:
            sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name]
            sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path]
        else:
            sdist_names = list()
            sdist_paths = list()

        # if cores_per_node is set (!= None), then we need to
        # allocate full nodes, and thus round up
        if cores_per_node:
            cores_per_node = int(cores_per_node)
            number_cores   = int(cores_per_node
                           * math.ceil(float(number_cores) / cores_per_node))

        # if gpus_per_node is set (!= None), then we need to
        # allocate full nodes, and thus round up
        if gpus_per_node:
            gpus_per_node = int(gpus_per_node)
            number_gpus   = int(gpus_per_node
                           * math.ceil(float(number_gpus) / gpus_per_node))
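        # e.g. 10 cores on 8-core nodes round up to 16 cores (2 nodes), and
        # 3 gpus on 2-gpu nodes round up to 4 gpus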

        # set mandatory args
        bootstrap_args  = ""
        bootstrap_args += " -d '%s'" % ':'.join(sdist_names)
        bootstrap_args += " -p '%s'" % pid
        bootstrap_args += " -s '%s'" % sid
        bootstrap_args += " -m '%s'" % virtenv_mode
        bootstrap_args += " -r '%s'" % rp_version
        bootstrap_args += " -b '%s'" % python_dist
        bootstrap_args += " -g '%s'" % virtenv_dist
        bootstrap_args += " -v '%s'" % virtenv
        bootstrap_args += " -y '%d'" % runtime

        # set optional args
        if lrms == "CCM":           bootstrap_args += " -c"
        if forward_tunnel_endpoint: bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
        if forward_tunnel_endpoint: bootstrap_args += " -h '%s'" % db_hostport
        if python_interpreter:      bootstrap_args += " -i '%s'" % python_interpreter
        if tunnel_bind_device:      bootstrap_args += " -t '%s'" % tunnel_bind_device
        if cleanup:                 bootstrap_args += " -x '%s'" % cleanup

        for arg in pre_bootstrap_0:
            bootstrap_args += " -e '%s'" % arg
        for arg in pre_bootstrap_1:
            bootstrap_args += " -w '%s'" % arg

        agent_cfg['owner']              = 'agent_0'
        agent_cfg['cores']              = number_cores
        agent_cfg['gpus']               = number_gpus
        agent_cfg['lrms']               = lrms
        agent_cfg['spawner']            = agent_spawner
        agent_cfg['scheduler']          = agent_scheduler
        agent_cfg['runtime']            = runtime
        agent_cfg['dburl']              = str(database_url)
        agent_cfg['session_id']         = sid
        agent_cfg['pilot_id']           = pid
        agent_cfg['logdir']             = '.'
        agent_cfg['pilot_sandbox']      = pilot_sandbox
        agent_cfg['session_sandbox']    = session_sandbox
        agent_cfg['resource_sandbox']   = resource_sandbox
        agent_cfg['agent_launch_method']= agent_launch_method
        agent_cfg['task_launch_method'] = task_launch_method
        agent_cfg['mpi_launch_method']  = mpi_launch_method
        agent_cfg['cores_per_node']     = cores_per_node
        agent_cfg['gpus_per_node']      = gpus_per_node
        agent_cfg['lfs_path_per_node']  = lfs_path_per_node
        agent_cfg['lfs_size_per_node']  = lfs_size_per_node
        agent_cfg['cu_tmp']             = cu_tmp
        agent_cfg['export_to_cu']       = export_to_cu
        agent_cfg['cu_pre_exec']        = cu_pre_exec
        agent_cfg['cu_post_exec']       = cu_post_exec
        agent_cfg['resource_cfg']       = copy.deepcopy(rcfg)
        agent_cfg['debug']              = self._log.getEffectiveLevel()

        # we'll also push the agent config into MongoDB
        pilot['cfg'] = agent_cfg

        # ----------------------------------------------------------------------
        # Write agent config dict to a json file in pilot sandbox.

        agent_cfg_name = 'agent_0.cfg'
        cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
        os.close(cfg_tmp_handle)  # file exists now

        # Convert dict to json file
        self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file)
        self._log.debug(pprint.pformat(agent_cfg))
        ru.write_json(agent_cfg, cfg_tmp_file)

        ret['ft'].append({'src' : cfg_tmp_file, 
                          'tgt' : '%s/%s' % (pilot_sandbox, agent_cfg_name),
                          'rem' : True})  # purge the tmp file after packing
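        # each 'ft' entry is a staging descriptor: 'src' is a local path,
        # 'tgt' the target path in a sandbox, and 'rem' flags whether the
        # source is purged after packing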

        # ----------------------------------------------------------------------
        # we also touch the log and profile tarballs in the target pilot sandbox
        ret['ft'].append({'src' : '/dev/null',
                          'tgt' : '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid),
                          'rem' : False})  # don't remove /dev/null
        # only stage profiles if we profile
        if self._prof.enabled:
            ret['ft'].append({
                          'src' : '/dev/null',
                          'tgt' : '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid),
                          'rem' : False})  # don't remove /dev/null

        # check if we have a sandbox cached for that resource.  If so, we have
        # nothing to do.  Otherwise we create the sandbox and stage the RP
        # stack etc.
        # NOTE: this will race when multiple pilot launcher instances are used!
        with self._cache_lock:

            if resource not in self._sandboxes:

                for sdist in sdist_paths:
                    base = os.path.basename(sdist)
                    ret['ft'].append({'src' : sdist, 
                                      'tgt' : '%s/%s' % (session_sandbox, base),
                                      'rem' : False})

                # Copy the bootstrap shell script.
                bootstrapper_path = os.path.abspath("%s/agent/%s"
                                  % (self._root_dir, BOOTSTRAPPER_0))
                self._log.debug("use bootstrapper %s", bootstrapper_path)

                ret['ft'].append({'src' : bootstrapper_path, 
                                  'tgt' : '%s/%s' % (session_sandbox, BOOTSTRAPPER_0),
                                  'rem' : False})

                # Some machines cannot run pip due to outdated CA certs.
                # For those, we also stage an updated certificate bundle
                # TODO: use booleans all the way?
                if stage_cacerts:

                    cc_name = 'cacert.pem.gz'
                    cc_path = os.path.abspath("%s/agent/%s" % (self._root_dir, cc_name))
                    self._log.debug("use CAs %s", cc_path)

                    ret['ft'].append({'src' : cc_path, 
                                      'tgt' : '%s/%s' % (session_sandbox, cc_name),
                                      'rem' : False})

                self._sandboxes[resource] = True


        # ----------------------------------------------------------------------
        # Create SAGA Job description and submit the pilot job

        jd = rs.job.Description()

        if shared_filesystem:
            bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0)
        else:
            bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0)

        jd.name                  = pid
        jd.executable            = "/bin/bash"
        jd.arguments             = ['-l %s' % bootstrap_tgt, bootstrap_args]
        jd.working_directory     = pilot_sandbox
        jd.project               = project
        jd.output                = "bootstrap_0.out"
        jd.error                 = "bootstrap_0.err"
        jd.total_cpu_count       = number_cores
        jd.total_gpu_count       = number_gpus
        jd.processes_per_host    = cores_per_node
        jd.spmd_variation        = spmd_variation
        jd.wall_time_limit       = runtime
        jd.total_physical_memory = memory
        jd.queue                 = queue
        jd.candidate_hosts       = candidate_hosts
        jd.environment           = dict()

        # we set any saga_jd_supplement keys which are not already set above
        for key, val in saga_jd_supplement.iteritems():
            if not jd[key]:
                self._log.debug('supplement %s: %s', key, val)
                jd[key] = val
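        # attributes already populated above (queue, project, etc.) win over
        # the supplement; only job description keys still unset are filled in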

        if 'RADICAL_PILOT_PROFILE' in os.environ :
            jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

        # for condor backends and the like which do not have shared FSs, we add
        # additional staging directives so that the backend system binds the
        # files from the session and pilot sandboxes to the pilot job.
        jd.file_transfer = list()
        if not shared_filesystem:
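            # the 'site:' prefix marks paths on the execution site; '>' stages
            # a file in before the job starts, '<' stages it back out after
            # the job completes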

            jd.file_transfer.extend([
                'site:%s/%s > %s' % (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0),
                'site:%s/%s > %s' % (pilot_sandbox,   agent_cfg_name, agent_cfg_name),
                'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid),
                'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid)
            ])

            if 'RADICAL_PILOT_PROFILE' in os.environ:
                jd.file_transfer.extend([
                    'site:%s/%s.prof.tgz > %s.prof.tgz' % (pilot_sandbox, pid, pid),
                    'site:%s/%s.prof.tgz < %s.prof.tgz' % (pilot_sandbox, pid, pid)
                ])

            for sdist in sdist_names:
                jd.file_transfer.extend([
                    'site:%s/%s > %s' % (session_sandbox, sdist, sdist)
                ])

            if stage_cacerts:
                jd.file_transfer.extend([
                    'site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)
                ])

        self._log.debug("Bootstrap command line: %s %s", jd.executable, jd.arguments)

        ret['jd'] = jd
        return ret
Example No. 30
    def run(self):
        """Starts the process when Process.start() is called.
        """

        global JOB_CHECK_INTERVAL

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                db = self._session.get_db()
                pilot_col = db["%s.p" % self._session.uid]
                logger.debug(
                    "Connected to MongoDB. Serving requests for PilotManager %s."
                    % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._terminate.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reason and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                if self._disabled.is_set():
                    # don't process any new pilot start requests.
                    # NOTE: this is not clean, in principle there could be other
                    #       launchers alive which want to still start those
                    #       pending pilots.  In practice we only ever use one
                    #       pmgr though, and it is during its shutdown that we get
                    #       here...
                    ts = timestamp()
                    compute_pilot = pilot_col.find_and_modify(
                        query={
                            "pilotmanager": self.pilot_manager_id,
                            "state": PENDING_LAUNCH
                        },
                        update={
                            "$set": {
                                "state": CANCELED
                            },
                            "$push": {
                                "statehistory": {
                                    "state": CANCELED,
                                    "timestamp": ts
                                }
                            }
                        })

                    # run state checks more frequently.
                    JOB_CHECK_INTERVAL = 3
                    time.sleep(1)
                    continue

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description, and a job that is then sent to the local or remote
                # queueing system.  If this succeeds, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                ts = timestamp()
                compute_pilot = pilot_col.find_and_modify(
                    query={
                        "pilotmanager": self.pilot_manager_id,
                        "state": PENDING_LAUNCH
                    },
                    update={
                        "$set": {
                            "state": LAUNCHING
                        },
                        "$push": {
                            "statehistory": {
                                "state": LAUNCHING,
                                "timestamp": ts
                            }
                        }
                    })
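                # find_and_modify claims the pilot document atomically: only
                # one launcher can flip the state from PENDING_LAUNCH to
                # LAUNCHING, so a pilot is never launched twice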

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_id = self._session.uid
                        database_url = self._session.dburl

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot['description']['cores']
                        runtime = compute_pilot['description']['runtime']
                        queue = compute_pilot['description']['queue']
                        project = compute_pilot['description']['project']
                        cleanup = compute_pilot['description']['cleanup']
                        resource_key = compute_pilot['description']['resource']
                        schema = compute_pilot['description']['access_schema']
                        memory = compute_pilot['description']['memory']
                        pilot_sandbox = compute_pilot['sandbox']
                        global_sandbox = compute_pilot['global_sandbox']

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(
                            resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_launch_method = resource_cfg.get(
                            'agent_launch_method')
                        agent_dburl = resource_cfg.get(
                            'agent_mongodb_endpoint', database_url)
                        agent_spawner = resource_cfg.get(
                            'agent_spawner', DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get('agent_type',
                                                      DEFAULT_AGENT_TYPE)
                        rc_agent_config = resource_cfg.get(
                            'agent_config', DEFAULT_AGENT_CONFIG)
                        agent_scheduler = resource_cfg.get('agent_scheduler')
                        tunnel_bind_device = resource_cfg.get(
                            'tunnel_bind_device')
                        default_queue = resource_cfg.get('default_queue')
                        forward_tunnel_endpoint = resource_cfg.get(
                            'forward_tunnel_endpoint')
                        js_endpoint = resource_cfg.get('job_manager_endpoint')
                        lrms = resource_cfg.get('lrms')
                        mpi_launch_method = resource_cfg.get(
                            'mpi_launch_method')
                        pre_bootstrap_1 = resource_cfg.get('pre_bootstrap_1')
                        pre_bootstrap_2 = resource_cfg.get('pre_bootstrap_2')
                        python_interpreter = resource_cfg.get(
                            'python_interpreter')
                        spmd_variation = resource_cfg.get('spmd_variation')
                        task_launch_method = resource_cfg.get(
                            'task_launch_method')
                        rp_version = resource_cfg.get('rp_version',
                                                      DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get(
                            'virtenv_mode', DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get('virtenv', DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get(
                            'stage_cacerts', 'False')
                        cores_per_node = resource_cfg.get('cores_per_node')

                        # Agent configuration that is not part of the public API.
                        # The agent config can either be a config dict, or
                        # a string pointing to a configuration name.  If neither
                        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
                        # set.  The last fallback is 'agent_default'
                        agent_config = compute_pilot['description'].get(
                            '_config')
                        if not agent_config:
                            agent_config = os.environ.get(
                                'RADICAL_PILOT_AGENT_CONFIG')
                        if not agent_config:
                            agent_config = rc_agent_config

                        if isinstance(agent_config, dict):
                            # use the dict as is
                            agent_cfg_dict = agent_config

                        elif isinstance(agent_config, basestring):
                            try:
                                if os.path.exists(agent_config):
                                    # try to open as file name
                                    logger.info("Read agent config file: %s" %
                                                agent_config)
                                    agent_cfg_dict = ru.read_json(agent_config)
                                else:
                                    # otherwise interpret as a config name
                                    # FIXME: load in session just like resource
                                    #        configs, including user level overloads
                                    module_path = os.path.dirname(
                                        os.path.abspath(__file__))
                                    config_path = "%s/../configs/" % module_path
                                    agent_cfg_file = os.path.join(
                                        config_path,
                                        "agent_%s.json" % agent_config)
                                    logger.info("Read agent config file: %s" %
                                                agent_cfg_file)
                                    agent_cfg_dict = ru.read_json(
                                        agent_cfg_file)
                            except Exception as e:
                                logger.exception(
                                    "Error reading agent config file: %s" % e)
                                raise

                        else:
                            # we can't handle this type
                            raise TypeError(
                                'agent config must be string (filename) or dict'
                            )

                        # TODO: use booleans all the way?
                        if stage_cacerts.lower() == 'true':
                            stage_cacerts = True
                        else:
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            'pilot_sandbox': saga.Url(pilot_sandbox).path,
                            'global_sandbox': saga.Url(global_sandbox).path
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get('global_virtenv')
                        if global_virtenv:
                            logger.warn(
                                "'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'"
                            )
                            virtenv = global_virtenv
                            virtenv_mode = 'use'

                        # Create a host:port string for use by bootstrap_1.
                        db_url = saga.Url(agent_dburl)
                        if db_url.port:
                            db_hostport = "%s:%d" % (db_url.host, db_url.port)
                        else:
                            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

                        # Open the remote sandbox
                        sandbox_tgt = saga.filesystem.Directory(
                            pilot_sandbox,
                            session=self._session,
                            flags=saga.filesystem.CREATE_PARENTS)

                        BOOTSTRAPPER_SCRIPT = "bootstrap_1.sh"
                        LOCAL_SCHEME = 'file'

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox.  We always use "default_bootstrapper.sh".
                        # TODO: Is this still configurable and/or in the resource configs?
                        bootstrapper = "default_bootstrapper.sh"
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" \
                                % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url(
                            "%s://localhost%s" %
                            (LOCAL_SCHEME, bootstrapper_path))

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                                % (bs_script_url, sandbox_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        sandbox_tgt.copy(bs_script_url, BOOTSTRAPPER_SCRIPT)

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        if  not rp_version.startswith('@') and \
                            rp_version not in ['installed', 'local', 'debug',
                                               'release']:
                            raise ValueError("invalid rp_version '%s'" %
                                             rp_version)

                        stage_sdist = True
                        if rp_version in ['installed', 'release']:
                            stage_sdist = False

                        if rp_version.startswith('@'):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for sdist_path in [ru.sdist_path,
                                               saga.sdist_path,
                                               rp_sdist_path]:

                                sdist_url = saga.Url(
                                    "%s://localhost%s" %
                                    (LOCAL_SCHEME, sdist_path))
                                msg = "Copying sdist '%s' to sandbox (%s)." % (
                                    sdist_url, pilot_sandbox)
                                logentries.append(
                                    Logentry(msg, logger=logger.debug))
                                sandbox_tgt.copy(
                                    sdist_url,
                                    os.path.basename(str(sdist_url)))

                        # ------------------------------------------------------
                        # Some machines cannot run pip due to outdated CA certs.
                        # For those, we also stage an updated certificate bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" \
                                    % (mod_dir, 'cacert.pem.gz'))

                            cc_url = saga.Url("%s://localhost/%s" %
                                              (LOCAL_SCHEME, cc_path))
                            msg = "Copying CA certificate bundle '%s' to sandbox (%s)." % (
                                cc_url, pilot_sandbox)
                            logentries.append(
                                Logentry(msg, logger=logger.debug))
                            sandbox_tgt.copy(cc_url,
                                             os.path.basename(str(cc_url)))

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms: raise RuntimeError("missing LRMS")
                        if not agent_launch_method:
                            raise RuntimeError("missing agentlaunch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")

                        # massage some values
                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            cleanup = 'luve'

                            # we never clean up virtenvs which are not private
                            if virtenv_mode != 'private':
                                cleanup = cleanup.replace('v', '')

                        sdists = ':'.join(
                            [ru.sdist_name, saga.sdist_name, rp_sdist_name])
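                        # 'sdists' is the ':'-joined list of the three sdist
                        # tarball names staged above; it is passed to the
                        # bootstrapper via the '-d' flag below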

                        # if cores_per_node is set (!= None), then we need to
                        # allocate full nodes, and thus round up
                        if cores_per_node:
                            cores_per_node = int(cores_per_node)
                            number_cores = int(cores_per_node * math.ceil(
                                float(number_cores) / cores_per_node))
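                        # worked example: number_cores=10 with
                        # cores_per_node=8 rounds up to
                        # int(8 * ceil(10 / 8.0)) == 16, i.e. two full nodes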

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -d '%s'" % sdists
                        bootstrap_args += " -m '%s'" % virtenv_mode
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -r '%s'" % rp_version
                        bootstrap_args += " -s '%s'" % session_id
                        bootstrap_args += " -v '%s'" % virtenv

                        # set optional args
                        if agent_type:
                            bootstrap_args += " -a '%s'" % agent_type
                        if lrms == "CCM": bootstrap_args += " -c"
                        if pre_bootstrap_1:
                            bootstrap_args += " -e '%s'" % "' -e '".join(
                                pre_bootstrap_1)
                        if pre_bootstrap_2:
                            bootstrap_args += " -w '%s'" % "' -w '".join(
                                pre_bootstrap_2)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -h '%s'" % db_hostport
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if tunnel_bind_device:
                            bootstrap_args += " -t '%s'" % tunnel_bind_device
                        if cleanup: bootstrap_args += " -x '%s'" % cleanup

                        # set some agent configuration
                        agent_cfg_dict['cores'] = number_cores
                        agent_cfg_dict['debug'] = os.environ.get(
                            'RADICAL_PILOT_AGENT_VERBOSE',
                            logger.getEffectiveLevel())
                        agent_cfg_dict['mongodb_url'] = str(agent_dburl)
                        agent_cfg_dict['lrms'] = lrms
                        agent_cfg_dict['spawner'] = agent_spawner
                        agent_cfg_dict['scheduler'] = agent_scheduler
                        agent_cfg_dict['runtime'] = runtime
                        agent_cfg_dict['pilot_id'] = pilot_id
                        agent_cfg_dict['session_id'] = session_id
                        agent_cfg_dict['agent_launch_method'] = agent_launch_method
                        agent_cfg_dict['task_launch_method'] = task_launch_method
                        agent_cfg_dict['mpi_launch_method'] = mpi_launch_method
                        if cores_per_node:
                            agent_cfg_dict['cores_per_node'] = cores_per_node

                        # ------------------------------------------------------
                        # Write agent config dict to a json file in pilot sandbox.

                        cfg_tmp_handle, cf_tmp_file = tempfile.mkstemp(
                            suffix='.json', prefix='rp_agent_cfg_')

                        # Convert dict to json file
                        msg = "Writing agent configuration to file '%s'." % cf_tmp_file
                        logentries.append(Logentry(msg, logger=logger.debug))
                        ru.write_json(agent_cfg_dict, cf_tmp_file)

                        cf_url = saga.Url("%s://localhost%s" %
                                          (LOCAL_SCHEME, cf_tmp_file))
                        msg = "Copying agent configuration file '%s' to sandbox (%s)." % (
                            cf_url, pilot_sandbox)
                        logentries.append(Logentry(msg, logger=logger.debug))
                        sandbox_tgt.copy(cf_url, 'agent_0.cfg')

                        # close and remove temp file
                        os.close(cfg_tmp_handle)
                        os.unlink(cf_tmp_file)

                        # ------------------------------------------------------
                        # Done with all transfers to pilot sandbox, close handle
                        sandbox_tgt.close()

                        # ------------------------------------------------------
                        # now that the scripts are in place and configured,
                        # we can launch the agent
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data['job_services']:
                            js = self._shared_worker_data['job_services'][js_url]
                        else:
                            js = saga.job.Service(js_url, session=self._session)
                            self._shared_worker_data['job_services'][js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = [
                            "-l %s" % BOOTSTRAPPER_SCRIPT, bootstrap_args
                        ]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "bootstrap_1.out"
                        jd.error = "bootstrap_1.err"
                        jd.total_cpu_count = number_cores
                        jd.processes_per_host = cores_per_node
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue
                        jd.environment = dict()

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if 'RADICAL_PILOT_PROFILE' in os.environ:
                            jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

                        logger.debug("Bootstrap command line: %s %s" %
                                     (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(
                            jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        pilotjob = js.create_job(jd)
                        pilotjob.run()

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data['job_ids'][pilot_id] = [
                            saga_job_id, js_url
                        ]

                        msg = "SAGA job submitted with job id %s" % str(
                            saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        ts = timestamp()
                        ret = pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": LAUNCHING
                            }, {
                                "$set": {
                                    "state": PENDING_ACTIVE,
                                    "saga_job_id": saga_job_id,
                                    "agent_config": agent_cfg_dict
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                        if ret['n'] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update({"_id": pilot_id}, {
                                "$set": {
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(
                            pilot_col, pilot_id)
                        ts = timestamp()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(le.message)

                        pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": {
                                    "$ne": FAILED
                                }
                            }, {
                                "$set": {
                                    "state": FAILED,
                                    "stdout": out,
                                    "stderr": err,
                                    "logfile": log
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": FAILED,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })
                        logger.exception('\n'.join(log_messages))

        except SystemExit as e:
            logger.exception(
                "pilot launcher thread caught system exit -- forcing application shutdown"
            )
            import thread
            thread.interrupt_main()
Exemplo n.º 31
    def submit_units(self, descriptions):
        """
        Submits one or more :class:`radical.pilot.ComputeUnit` instances to the
        unit manager.

        **Arguments:**
            * **descriptions** [:class:`radical.pilot.ComputeUnitDescription`
              or list of :class:`radical.pilot.ComputeUnitDescription`]: The
              description of the compute unit instance(s) to create.

        **Returns:**
              * A list of :class:`radical.pilot.ComputeUnit` objects.
        """

        from .compute_unit import ComputeUnit

        self.is_valid()

        ret_list = True
        if not isinstance(descriptions, list):
            ret_list     = False
            descriptions = [descriptions]

        if len(descriptions) == 0:
            raise ValueError('cannot submit an empty list of unit descriptions')

        self._rep.info('<<submit %d unit(s)\n\t' % len(descriptions))

        # we return a list of compute units
        units = list()
        for ud in descriptions:

            if not ud.executable:
                raise ValueError('compute unit executable must be defined')

            unit = ComputeUnit(umgr=self, descr=ud)
            units.append(unit)

            # keep units around
            with self._units_lock:
                self._units[unit.uid] = unit

            if self._session._rec:
                ru.write_json(ud.as_dict(), "%s/%s.batch.%03d.json"
                        % (self._session._rec, unit.uid, self._rec_id))

            self._rep.progress()

        if self._session._rec:
            self._rec_id += 1

        # insert units into the database, as a bulk.
        unit_docs = [u.as_dict() for u in units]
        self._session._dbs.insert_units(unit_docs)

        # Only after the insert can we hand the units over to the next
        # components (ie. advance state).
        self.advance(unit_docs, rps.UMGR_SCHEDULING_PENDING, 
                     publish=True, push=True)
        self._rep.ok('>>ok\n')

        if ret_list: return units
        else       : return units[0]
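
    # A minimal usage sketch for the method above (hypothetical names;
    # assumes an initialized UnitManager instance `umgr` and
    # `import radical.pilot as rp`):
    #
    #     cud = rp.ComputeUnitDescription()
    #     cud.executable = '/bin/date'
    #     unit  = umgr.submit_units(cud)          # single description
    #     units = umgr.submit_units([cud] * 10)   # or a list of them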
Exemplo n.º 32
        bulk_size = 500
        bulk_id = '%s.%04d' % (session.uid, bulk)

        report.info('handle bulk %s (%d)\n\t' % (bulk_id, bulk_size))
        cuds = list()
        tasks = fetch_tasks(bulk_size=bulk_size)
        for task in tasks['data']:

            args = task['spec']['args'][0]
            prog = args['program']
            tid = task['id']
            fin = '%s/%s.in.json' % (sandbox, tid)
            fout = '%s/%s.out.json' % (sandbox, tid)

            ru.write_json(args, fin)

            cud = rp.ComputeUnitDescription()
            cud.executable = '/home/dgasmith/miniconda/envs/qcf/bin/qcengine'
            cud.arguments = [prog, fin]
            cud.name = tid
            cud.metadata = {'fout': fout}
            cud.input_staging = [fin]
            cud.output_staging = {
                'source': 'unit:///STDOUT',
                'target': '%s' % fout,
                'action': rp.TRANSFER
            }
            cud.gpu_processes = 0
            cud.cpu_processes = 1
            cud.cpu_threads = 1
Exemplo n.º 33
    def _prepare_pilot(self, resource, rcfg, pilot):

        pid = pilot["uid"]
        ret = {'ft': list(), 'jd': None}
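        # 'ft' collects file staging directives of the form
        # {'src': <local path>, 'tgt': <remote path>,
        #  'rem': <remove local file after staging?>}, and 'jd' will
        # receive the SAGA job description which launches the bootstrapper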

        # ------------------------------------------------------------------
        # Database connection parameters
        sid = self._session.uid
        database_url = self._session.dburl

        # some default values are determined at runtime
        default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
                          (resource, self._rp_version)

        # ------------------------------------------------------------------
        # get parameters from resource cfg, set defaults where needed
        agent_launch_method = rcfg.get('agent_launch_method')
        agent_dburl = rcfg.get('agent_mongodb_endpoint', database_url)
        agent_spawner = rcfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER)
        rc_agent_config = rcfg.get('agent_config', DEFAULT_AGENT_CONFIG)
        agent_scheduler = rcfg.get('agent_scheduler')
        tunnel_bind_device = rcfg.get('tunnel_bind_device')
        default_queue = rcfg.get('default_queue')
        forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint')
        lrms = rcfg.get('lrms')
        mpi_launch_method = rcfg.get('mpi_launch_method', '')
        pre_bootstrap_1 = rcfg.get('pre_bootstrap_1', [])
        pre_bootstrap_2 = rcfg.get('pre_bootstrap_2', [])
        python_interpreter = rcfg.get('python_interpreter')
        task_launch_method = rcfg.get('task_launch_method')
        rp_version = rcfg.get('rp_version', DEFAULT_RP_VERSION)
        virtenv_mode = rcfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE)
        virtenv = rcfg.get('virtenv', default_virtenv)
        cores_per_node = rcfg.get('cores_per_node', 0)
        health_check = rcfg.get('health_check', True)
        python_dist = rcfg.get('python_dist')
        virtenv_dist = rcfg.get('virtenv_dist', DEFAULT_VIRTENV_DIST)
        cu_tmp = rcfg.get('cu_tmp')
        spmd_variation = rcfg.get('spmd_variation')
        shared_filesystem = rcfg.get('shared_filesystem', True)
        stage_cacerts = rcfg.get('stage_cacerts', False)
        cu_pre_exec = rcfg.get('cu_pre_exec')
        cu_post_exec = rcfg.get('cu_post_exec')
        export_to_cu = rcfg.get('export_to_cu')
        mandatory_args = rcfg.get('mandatory_args', [])

        # ------------------------------------------------------------------
        # get parameters from the pilot description
        number_cores = pilot['description']['cores']
        runtime = pilot['description']['runtime']
        queue = pilot['description']['queue']
        project = pilot['description']['project']
        cleanup = pilot['description']['cleanup']
        memory = pilot['description']['memory']
        candidate_hosts = pilot['description']['candidate_hosts']

        # make sure that mandatory args are known
        for ma in mandatory_args:
            if pilot['description'].get(ma) is None:
                raise  ValueError('attribute "%s" is required for "%s"' \
                                 % (ma, resource))

        # get pilot and global sandbox
        resource_sandbox = self._session._get_resource_sandbox(pilot).path
        session_sandbox = self._session._get_session_sandbox(pilot).path
        pilot_sandbox = self._session._get_pilot_sandbox(pilot).path

        pilot['resource_sandbox'] = str(
            self._session._get_resource_sandbox(pilot))
        pilot['pilot_sandbox'] = str(self._session._get_pilot_sandbox(pilot))
        pilot['client_sandbox'] = str(self._session._get_client_sandbox())

        # Agent configuration that is not part of the public API.
        # The agent config can either be a config dict, or
        # a string pointing to a configuration name.  If neither
        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
        # set.  The last fallback is 'agent_default'
        agent_config = pilot['description'].get('_config')
        if not agent_config:
            agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG')
        if not agent_config:
            agent_config = rc_agent_config

        if isinstance(agent_config, dict):

            # use dict as is
            agent_cfg = agent_config

        elif isinstance(agent_config, basestring):
            try:
                # interpret as a config name
                agent_cfg_file = os.path.join(self._conf_dir,
                                              "agent_%s.json" % agent_config)

                self._log.info("Read agent config file: %s", agent_cfg_file)
                agent_cfg = ru.read_json(agent_cfg_file)

                # allow for user level overload
                user_cfg_file = '%s/.radical/pilot/config/%s' \
                              % (os.environ['HOME'], os.path.basename(agent_cfg_file))

                if os.path.exists(user_cfg_file):
                    self._log.info("merging user config: %s" % user_cfg_file)
                    user_cfg = ru.read_json(user_cfg_file)
                    ru.dict_merge(agent_cfg, user_cfg, policy='overwrite')

            except Exception as e:
                self._log.exception("Error reading agent config file: %s" % e)
                raise

        else:
            # we can't handle this type
            raise TypeError(
                'agent config must be string (config name) or dict')

        # expand variables in virtenv string
        virtenv = virtenv % {
            'pilot_sandbox': pilot_sandbox,
            'session_sandbox': session_sandbox,
            'resource_sandbox': resource_sandbox
        }

        # Check for deprecated global_virtenv
        if 'global_virtenv' in rcfg:
            raise RuntimeError("'global_virtenv' is deprecated (%s)" %
                               resource)

        # Create a host:port string for use by bootstrap_1.
        db_url = rs.Url(agent_dburl)
        if db_url.port:
            db_hostport = "%s:%d" % (db_url.host, db_url.port)
        else:
            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

        # ------------------------------------------------------------------
        # the version of the agent is derived from
        # rp_version, which has the following format
        # and interpretation:
        #
        # case rp_version:
        #   @<token>:
        #   @tag/@branch/@commit: # no sdist staging
        #       git clone $github_base radical.pilot.src
        #       (cd radical.pilot.src && git checkout token)
        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
        #       rm -rf radical.pilot.src
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   release: # no sdist staging
        #       pip install -t $VIRTENV/rp_install radical.pilot
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   local: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $VIRTENV/rp_install $sdist/
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   debug: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $SANDBOX/rp_install $sdist/
        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
        #
        #   installed: # no sdist staging
        #       true
        # esac
        #
        # virtenv_mode
        #   private : error  if ve exists, otherwise create, then use
        #   update  : update if ve exists, otherwise create, then use
        #   create  : use    if ve exists, otherwise create, then use
        #   use     : use    if ve exists, otherwise error,  then exit
        #   recreate: delete if ve exists, otherwise create, then use
        #
        # examples   :
        #   [email protected]
        #   virtenv@devel
        #   virtenv@release
        #   virtenv@installed
        #   stage@local
        #   stage@/tmp/my_agent.py
        #
        # Note that some combinations may be invalid,
        # specifically in the context of virtenv_mode.  If, for
        # example, virtenv_mode is 'use', then the 'virtenv:tag'
        # will not make sense, as the virtenv is not updated.
        # In those cases, the virtenv_mode is honored, and
        # a warning is printed.
        #
        # Also, the 'stage' mode can only be combined with the
        # 'local' source, or with a path to the agent (relative
        # to root_dir, or absolute).
        #
        # A rp_version which does not adhere to the
        # above syntax is ignored, and the fallback stage@local
        # is used.

        if  not rp_version.startswith('@') and \
            rp_version not in ['installed', 'local', 'debug', 'release']:
            raise ValueError("invalid rp_version '%s'" % rp_version)

        if rp_version.startswith('@'):
            rp_version = rp_version[1:]  # strip '@'

        # ------------------------------------------------------------------
        # sanity checks
        if not python_dist: raise RuntimeError("missing python distribution")
        if not virtenv_dist:
            raise RuntimeError("missing virtualenv distribution")
        if not agent_spawner: raise RuntimeError("missing agent spawner")
        if not agent_scheduler: raise RuntimeError("missing agent scheduler")
        if not lrms: raise RuntimeError("missing LRMS")
        if not agent_launch_method:
            raise RuntimeError("missing agentlaunch method")
        if not task_launch_method:
            raise RuntimeError("missing task launch method")

        # massage some values
        if not queue:
            queue = default_queue

        if cleanup and isinstance(cleanup, bool):
            #  l : log files
            #  u : unit work dirs
            #  v : virtualenv
            #  e : everything (== pilot sandbox)
            if shared_filesystem:
                cleanup = 'luve'
            else:
                # we cannot clean the sandbox from within the agent, as the hop
                # staging would then fail, and we'd get nothing back.
                # FIXME: cleanup needs to be done by the pmgr.launcher, or
                #        someone else, really, after fetching all logs and
                #        profiles.
                cleanup = 'luv'

            # we never clean up virtenvs which are not private
            if virtenv_mode != 'private':
                cleanup = cleanup.replace('v', '')

        # add dists to staging files, if needed
        if rp_version in ['local', 'debug']:
            sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name]
            sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path]
        else:
            sdist_names = list()
            sdist_paths = list()

        # if cores_per_node is set (!= None), then we need to
        # allocate full nodes, and thus round up
        if cores_per_node:
            cores_per_node = int(cores_per_node)
            number_cores = int(cores_per_node *
                               math.ceil(float(number_cores) / cores_per_node))

        # set mandatory args
        bootstrap_args = ""
        bootstrap_args += " -d '%s'" % ':'.join(sdist_names)
        bootstrap_args += " -p '%s'" % pid
        bootstrap_args += " -s '%s'" % sid
        bootstrap_args += " -m '%s'" % virtenv_mode
        bootstrap_args += " -r '%s'" % rp_version
        bootstrap_args += " -b '%s'" % python_dist
        bootstrap_args += " -g '%s'" % virtenv_dist
        bootstrap_args += " -v '%s'" % virtenv
        bootstrap_args += " -y '%d'" % runtime

        # set optional args
        if lrms == "CCM": bootstrap_args += " -c"
        if forward_tunnel_endpoint:
            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
        if forward_tunnel_endpoint: bootstrap_args += " -h '%s'" % db_hostport
        if python_interpreter:
            bootstrap_args += " -i '%s'" % python_interpreter
        if tunnel_bind_device:
            bootstrap_args += " -t '%s'" % tunnel_bind_device
        if cleanup: bootstrap_args += " -x '%s'" % cleanup

        for arg in pre_bootstrap_1:
            bootstrap_args += " -e '%s'" % arg
        for arg in pre_bootstrap_2:
            bootstrap_args += " -w '%s'" % arg

        agent_cfg['owner'] = 'agent_0'
        agent_cfg['cores'] = number_cores
        agent_cfg['lrms'] = lrms
        agent_cfg['spawner'] = agent_spawner
        agent_cfg['scheduler'] = agent_scheduler
        agent_cfg['runtime'] = runtime
        agent_cfg['dburl'] = str(database_url)
        agent_cfg['session_id'] = sid
        agent_cfg['pilot_id'] = pid
        agent_cfg['logdir'] = '.'
        agent_cfg['pilot_sandbox'] = pilot_sandbox
        agent_cfg['session_sandbox'] = session_sandbox
        agent_cfg['resource_sandbox'] = resource_sandbox
        agent_cfg['agent_launch_method'] = agent_launch_method
        agent_cfg['task_launch_method'] = task_launch_method
        agent_cfg['mpi_launch_method'] = mpi_launch_method
        agent_cfg['cores_per_node'] = cores_per_node
        agent_cfg['cu_tmp'] = cu_tmp
        agent_cfg['export_to_cu'] = export_to_cu
        agent_cfg['cu_pre_exec'] = cu_pre_exec
        agent_cfg['cu_post_exec'] = cu_post_exec
        agent_cfg['resource_cfg'] = copy.deepcopy(rcfg)
        agent_cfg['debug'] = self._log.getEffectiveLevel()

        # we'll also push the agent config into MongoDB
        pilot['cfg'] = agent_cfg

        # ------------------------------------------------------------------
        # Write agent config dict to a json file in pilot sandbox.

        agent_cfg_name = 'agent_0.cfg'
        cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
        os.close(cfg_tmp_handle)  # file exists now

        # Convert dict to json file
        self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file)
        self._log.debug(pprint.pformat(agent_cfg))
        ru.write_json(agent_cfg, cfg_tmp_file)

        ret['ft'].append({
            'src': cfg_tmp_file,
            'tgt': '%s/%s' % (pilot_sandbox, agent_cfg_name),
            'rem': True
        })  # purge the tmp file after packing

        # ----------------------------------------------------------------------
        # we also touch the log and profile tarballs in the target pilot sandbox
        ret['ft'].append({
            'src': '/dev/null',
            'tgt': '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid),
            'rem': False
        })  # don't remove /dev/null
        # only stage profiles if we profile
        if self._prof.enabled:
            ret['ft'].append({
                'src': '/dev/null',
                'tgt': '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid),
                'rem': False
            })  # don't remove /dev/null

        # check if we have a sandbox cached for that resource.  If so, we have
        # nothing to do.  Otherwise we create the sandbox and stage the RP
        # stack etc.
        # NOTE: this will race when multiple pilot launcher instances are used!
        with self._cache_lock:

            if resource not in self._sandboxes:

                for sdist in sdist_paths:
                    base = os.path.basename(sdist)
                    ret['ft'].append({
                        'src': sdist,
                        'tgt': '%s/%s' % (session_sandbox, base),
                        'rem': False
                    })

                # Copy the bootstrap shell script.
                bootstrapper_path = os.path.abspath("%s/agent/%s" \
                        % (self._root_dir, BOOTSTRAPPER_0))
                self._log.debug("use bootstrapper %s", bootstrapper_path)

                ret['ft'].append({
                    'src': bootstrapper_path,
                    'tgt': '%s/%s' % (session_sandbox, BOOTSTRAPPER_0),
                    'rem': False
                })

                # Some machines cannot run pip due to outdated CA certs.
                # For those, we also stage an updated certificate bundle
                # TODO: use booleans all the way?
                if stage_cacerts:

                    cc_name = 'cacert.pem.gz'
                    cc_path = os.path.abspath("%s/agent/%s" %
                                              (self._root_dir, cc_name))
                    self._log.debug("use CAs %s", cc_path)

                    ret['ft'].append({
                        'src': cc_path,
                        'tgt': '%s/%s' % (session_sandbox, cc_name),
                        'rem': False
                    })

                self._sandboxes[resource] = True

        # ------------------------------------------------------------------
        # Create SAGA Job description and submit the pilot job

        jd = rs.job.Description()

        if shared_filesystem:
            bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0)
        else:
            bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0)

        jd.name = pid
        jd.executable = "/bin/bash"
        jd.arguments = ['-l %s' % bootstrap_tgt, bootstrap_args]
        jd.working_directory = pilot_sandbox
        jd.project = project
        jd.output = "bootstrap_1.out"
        jd.error = "bootstrap_1.err"
        jd.total_cpu_count = number_cores
        jd.processes_per_host = cores_per_node
        jd.spmd_variation = spmd_variation
        jd.wall_time_limit = runtime
        jd.total_physical_memory = memory
        jd.queue = queue
        jd.candidate_hosts = candidate_hosts
        jd.environment = dict()

        if 'RADICAL_PILOT_PROFILE' in os.environ:
            jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

        # for condor backends and the like which do not have shared FSs, we add
        # additional staging directives so that the backend system binds the
        # files from the session and pilot sandboxes to the pilot job.
        jd.file_transfer = list()
        if not shared_filesystem:

            jd.file_transfer.extend([
                'site:%s/%s > %s' %
                (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0),
                'site:%s/%s > %s' %
                (pilot_sandbox, agent_cfg_name, agent_cfg_name),
                'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid),
                'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid)
            ])
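            # (in these staging directives, '>' stages the file into the
            #  job's working directory before execution, and '<' stages it
            #  back out after the job completes)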

            if 'RADICAL_PILOT_PROFILE' in os.environ:
                jd.file_transfer.extend([
                    'site:%s/%s.prof.tgz > %s.prof.tgz' %
                    (pilot_sandbox, pid, pid),
                    'site:%s/%s.prof.tgz < %s.prof.tgz' %
                    (pilot_sandbox, pid, pid)
                ])

            for sdist in sdist_names:
                jd.file_transfer.extend(
                    ['site:%s/%s > %s' % (session_sandbox, sdist, sdist)])

            if stage_cacerts:
                jd.file_transfer.extend(
                    ['site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)])

        self._log.debug("Bootstrap command line: %s %s", jd.executable,
                        jd.arguments)

        ret['jd'] = jd
        return ret
Exemplo n.º 34
import json
import os

import pytest

import radical.utils as ru

# (`_write_json` and `_write_raw` are assumed test helpers which write
#  their argument to a temporary file and return that file's name)


def test_read_json():
    '''
    Test json parser
    '''

    # --------------------------------------------------------------------------
    # default case
    data = {'test_1': 1,
            'test_2': 'one',
            'test_3': [1, 'one']}

    filename  = _write_json(json.dumps(data))
    data_copy = ru.read_json(filename)

    assert(data_copy)

    for key in data:
        assert(key in data_copy)
        assert(data[key] == data_copy[key])

    for key in data_copy:
        assert(key in data)
        assert(data[key] == data_copy[key])


    # ---------------------------------------------------------------------------
    # string read
    data_copy = ru.read_json_str(filename)
    assert(isinstance(data_copy['test_2'], str))


    # ---------------------------------------------------------------------------
    # arg switching
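    # (ru.write_json tolerates its two arguments in either order, so both
    #  calls below write the same file)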
    ru.write_json(filename, data_copy)
    ru.write_json(data_copy, filename)
    data_copy = ru.read_json_str(filename)
    assert(len(data_copy) == 3)

    os.unlink(filename)


    # --------------------------------------------------------------------------
    # manual parse
    data = '''{
                  "test_1": 1,
                  "test_2": "one",
                  "test_3": [1, "one"]
              }'''
    data_copy = ru.parse_json(data, filter_comments=False)
    assert(len(data_copy) == 3)
    assert(data_copy['test_2'] == 'one')


    # --------------------------------------------------------------------------
    # forced str conversion on manual parse
    data_copy = ru.parse_json_str(data)
    assert(len(data_copy) == 3)
    assert(isinstance(data_copy['test_2'], str))


    # ---------------------------------------------------------------------------
    # faulty json file
    filename = _write_raw(b'{"foo": [False]}')
    with pytest.raises(ValueError):
        ru.read_json(filename)
Exemplo n.º 35
    def submit_units(self, unit_descriptions):
        """Submits on or more :class:`radical.pilot.ComputeUnit` instances to the
        unit manager.

        **Arguments:**

            * **unit_descriptions** [:class:`radical.pilot.ComputeUnitDescription`
              or list of :class:`radical.pilot.ComputeUnitDescription`]: The
              description of the compute unit instance(s) to create.

        **Returns:**

              * A list of :class:`radical.pilot.ComputeUnit` objects.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """

        if not self._valid:
            raise RuntimeError("instance is already closed")

        return_list_type = True
        if not isinstance(unit_descriptions, list):
            return_list_type = False
            unit_descriptions = [unit_descriptions]

        if len(unit_descriptions) == 0:
            raise ValueError('cannot submit an empty list of unit descriptions')

        logger.report.info('<<submit %d unit(s)\n\t' % len(unit_descriptions))

        # we return a list of compute units
        ret = list()

        # the scheduler will return a dictionary of the form:
        #   {
        #     ud_1 : pilot_id_a,
        #     ud_2 : pilot_id_b
        #     ...
        #   }
        #
        # The scheduler may not be able to schedule some units - those will
        # have 'None' as pilot ID.

        units = list()
        for ud in unit_descriptions:

            u = ComputeUnit.create(unit_description=ud,
                                   unit_manager_obj=self,
                                   local_state=SCHEDULING)
            units.append(u)

            if self._session._rec:
                import radical.utils as ru
                ru.write_json(ud.as_dict(), "%s/%s.batch.%03d.json" \
                        % (self._session._rec, u.uid, self._rec_id))
            logger.report.progress()

        if self._session._rec:
            self._rec_id += 1

        self._worker.publish_compute_units(units=units)

        schedule = None
        try:
            schedule = self._scheduler.schedule(units=units)

        except Exception as e:
            logger.exception("Internal error - unit scheduler failed")
            raise

        self.handle_schedule(schedule)

        logger.report.ok('>>ok\n')

        if return_list_type:
            return units
        else:
            return units[0]
Exemplo n.º 36
def write_session_description(amgr):

    desc = dict()

    desc['entities'] = dict()
    desc['entities']['pipeline'] = {
        'state_model': res._pipeline_state_values,
        'state_values': res._pipeline_state_inv,
        'event_model': dict(),
    }

    desc['entities']['stage'] = {
        'state_model': res._stage_state_values,
        'state_values': res._stage_state_inv,
        'event_model': dict(),
    }

    desc['entities']['task'] = {
        'state_model': res._task_state_values,
        'state_values': res._task_state_inv,
        'event_model': dict(),
    }

    desc['entities']['appmanager'] = {
        'state_model': None,
        'state_values': None,
        'event_model': dict(),
    }

    # Adding amgr to the tree
    tree = dict()
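    # each node of this entity tree is keyed by uid and records the entity
    # type ('etype'), the entity types it may contain ('has'), and the uids
    # of its children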
    tree[amgr._uid] = {
        'uid': amgr._uid,
        'etype': 'appmanager',
        'cfg': {},
        'has': ['pipeline', 'wfprocessor', 'resource_manager', 'task_manager'],
        'children': list()
    }

    # Adding wfp to the tree
    wfp = amgr._wfp
    tree[amgr._uid]['children'].append(wfp._uid)
    tree[wfp._uid] = {
        'uid': wfp._uid,
        'etype': 'wfprocessor',
        'cfg': {},
        'has': [],
        'children': list()
    }

    # Adding rmgr to the tree
    rmgr = amgr._rmgr
    tree[amgr._uid]['children'].append(rmgr._uid)
    tree[rmgr._uid] = {
        'uid': rmgr._uid,
        'etype': 'resource_manager',
        'cfg': {},
        'has': [],
        'children': list()
    }

    # Adding tmgr to the tree
    tmgr = amgr._task_manager
    tree[amgr._uid]['children'].append(tmgr._uid)
    tree[tmgr._uid] = {
        'uid': tmgr._uid,
        'etype': 'task_manager',
        'cfg': {},
        'has': [],
        'children': list()
    }

    # Adding pipelines to the tree
    for wf in amgr._workflows:

        for pipe in wf:
            tree[amgr._uid]['children'].append(pipe._uid)
            tree[pipe._uid] = {
                'uid': pipe._uid,
                'etype': 'pipeline',
                'cfg': {},
                'has': ['stage'],
                'children': list()
            }
            # Adding stages to the tree
            for stage in pipe.stages:
                tree[pipe._uid]['children'].append(stage._uid)
                tree[stage._uid] = {
                    'uid': stage._uid,
                    'etype': 'stage',
                    'cfg': {},
                    'has': ['task'],
                    'children': list()
                }
                # Adding tasks to the tree
                for task in stage.tasks:
                    tree[stage._uid]['children'].append(task._uid)
                    tree[task._uid] = {
                        'uid': task._uid,
                        'etype': 'task',
                        'cfg': {},
                        'has': [],
                        'children': list()
                    }

    desc['tree'] = tree
    desc['config'] = dict()

    ru.write_json(desc, '%s/radical.entk.%s.json' % (amgr.sid, amgr.sid))
Exemplo n.º 37
def get_session_description(sid, src=None, dburl=None):
    """
    This will return a description which is usable for radical.analytics
    evaluation.  It informs about
      - set of stateful entities
      - state models of those entities
      - event models of those entities (maybe)
      - configuration of the application / module

    If `src` is given, it is interpreted as path to search for session
    information (json dump).  `src` defaults to `$PWD/$sid`.

    if `dburl` is given, its value is used to fetch session information from
    a database.  The dburl value defaults to `RADICAL_PILOT_DBURL`.
    """

    from radical.pilot import states as rps
    from .session      import fetch_json

    if not src:
        src = "%s/%s" % (os.getcwd(), sid)

    ftmp = fetch_json(sid=sid, dburl=dburl, tgt=src, skip_existing=True)
    json = ru.read_json(ftmp)


    # make sure we have uids
    def fix_json(json):
        def fix_uids(json):
            if isinstance(json, list):
                for elem in json:
                    fix_uids(elem)
            elif isinstance(json, dict):
                if 'unitmanager' in json and 'umgr' not in json:
                    json['umgr'] = json['unitmanager']
                if 'pilotmanager' in json and 'pmgr' not in json:
                    json['pmgr'] = json['pilotmanager']
                if '_id' in json and 'uid' not in json:
                    json['uid'] = json['_id']
                    if 'cfg' not in json:
                        json['cfg'] = dict()
                for k,v in json.iteritems():
                    fix_uids(v)
        fix_uids(json)
    fix_json(json)

    ru.write_json(json, '/tmp/t.json')

    assert(sid == json['session']['uid'])

    ret             = dict()
    ret['entities'] = dict()

    tree      = dict()
    tree[sid] = {'uid'      : sid,
                 'etype'    : 'session',
               # 'cfg'      : json['session']['cfg'],
                 'has'      : ['umgr', 'pmgr'],
                 'children' : list()
                }

    for pmgr in sorted(json['pmgr'], key=lambda k: k['uid']):
        uid = pmgr['uid']
        tree[sid]['children'].append(uid)
        tree[uid] = {'uid'      : uid,
                     'etype'    : 'pmgr',
                   # 'cfg'      : pmgr['cfg'],
                     'has'      : ['pilot'],
                     'children' : list()
                    }

    for umgr in sorted(json['umgr'], key=lambda k: k['uid']):
        uid = umgr['uid']
        tree[sid]['children'].append(uid)
        tree[uid] = {'uid'      : uid,
                     'etype'    : 'umgr',
               #     'cfg'      : umgr['cfg'],
                     'has'      : ['unit'],
                     'children' : list()
                    }

    for pilot in sorted(json['pilot'], key=lambda k: k['uid']):
        uid  = pilot['uid']
        pmgr = pilot['pmgr']
        tree[pmgr]['children'].append(uid)
        tree[uid] = {'uid'      : uid,
                     'etype'    : 'pilot',
               #     'cfg'      : pilot['cfg'],
                     'has'      : ['unit'],
                     'children' : list()
                    }

    for unit in sorted(json['unit'], key=lambda k: k['uid']):
        uid  = unit['uid']
        umgr = unit['umgr']
        pid  = unit['pilot']
        tree[umgr]['children'].append(uid)
        tree[pid ]['children'].append(uid)
        tree[uid] = {'uid'      : uid,
                     'etype'    : 'unit',
               #     'cfg'      : unit['description'],
                     'has'      : list(),
                     'children' : list()
                    }

    ret['tree'] = tree

    import pprint
    pprint.pprint(tree)

    ret['entities']['pilot'] = {
            'state_model'  : rps.pilot_state_by_value,
            'state_values' : rps.pilot_state_value,
            'event_model'  : dict(),
            }

    ret['entities']['unit'] = {
            'state_model'  : rps.unit_state_by_value,
            'state_values' : rps.unit_state_value,
            'event_model'  : dict(),
            }

    ret['entities']['session'] = {
            'state_model'  : None, # session has no states, only events
            'state_values' : None,
            'event_model'  : dict(),
            }

    ret['config'] = dict() # magic to get session config goes here

    return ret
Exemplo n.º 38
class Session(saga.Session):
    """A Session encapsulates a RADICAL-Pilot instance and is the *root* object
    for all other RADICAL-Pilot objects. 

    A Session holds :class:`radical.pilot.PilotManager` and :class:`radical.pilot.UnitManager`
    instances which in turn hold  :class:`radical.pilot.Pilot` and
    :class:`radical.pilot.ComputeUnit` instances.

    Each Session has a unique identifier :data:`radical.pilot.Session.uid` that can be
    used to re-connect to a RADICAL-Pilot instance in the database.

    **Example**::

        s1 = radical.pilot.Session(database_url=DBURL)
        s2 = radical.pilot.Session(database_url=DBURL, uid=s1.uid)

        # s1 and s2 are pointing to the same session
        assert s1.uid == s2.uid
    """

    #---------------------------------------------------------------------------
    #
    def __init__(self, database_url=None, database_name=None, name=None):
        """Creates a new session.

        If called without a uid, a new Session instance is created and 
        stored in the database. If uid is set, an existing session is 
        retrieved from the database. 

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name 
              (default: 'radicalpilot').

            * **uid** (`string`): If uid is set, we try to
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`

        """

        logger = ru.get_logger('radical.pilot')

        if database_name:
            logger.error(
                "The 'database_name' parameter is deprecated - please specify an URL path"
            )
        else:
            database_name = 'radicalpilot'

        # initialize the base class
        saga.Session.__init__(self)
        self._dh = ru.DebugHelper()
        self._valid = True
        self._terminate = threading.Event()
        self._terminate.clear()

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        self._pilot_manager_objects = dict()
        self._unit_manager_objects = dict()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        if not database_url:
            database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        self._dburl = ru.Url(database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
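        # (illustration: 'mongodb://example.org:27017/radicalpilot' would
        #  select the database 'radicalpilot' -- host and port are
        #  placeholders)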
        if  not self._dburl.path         or \
            self._dburl.path[0]   != '/' or \
            len(self._dburl.path) <=  1  :
            logger.error(
                "incomplete URLs are deprecated -- missing database name!")
            self._dburl.path = database_name  # defaults to 'radicalpilot'

        logger.info("using database %s" % self._dburl)

        # ----------------------------------------------------------------------
        # create new session
        try:
            if name:
                self._name = name
                self._uid = name
            # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
            else:
                self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
                self._name = self._uid

            logger.report.info('<<create session %s' % self._uid)

            self._dbs = dbSession(sid=self._uid,
                                  name=self._name,
                                  dburl=self._dburl)

            self._dburl = self._dbs._dburl

            logger.info("New Session created: %s." % str(self))

        except Exception as ex:
            logger.exception('session create failed')
            raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                            % (self._dburl, ex))

        # initialize profiling
        self.prof = Profiler('%s' % self._uid)
        self.prof.prof('start session', uid=self._uid)

        # Loading all "default" resource configurations
        module_path = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/resource_*.json" % module_path
        config_files = glob.glob(default_cfgs)

        for config_file in config_files:

            try:
                logger.info("Load resource configurations from %s" %
                            config_file)
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Load resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        user_cfgs = "%s/.radical/pilot/configs/resource_*.json" % os.environ.get(
            'HOME')
        config_files = glob.glob(user_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Load resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/resource_aliases.json" % module_path
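        # (for illustration, the aliases file is assumed to hold a top-level
        #  'aliases' dict which maps alias names to canonical resource names)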
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        self.prof.prof('configs parsed', uid=self._uid)

        _rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
        if _rec:
            self._rec = "%s/%s" % (_rec, self._uid)
            os.system('mkdir -p %s' % self._rec)
            ru.write_json({'dburl': str(self._dburl)},
                          "%s/session.json" % self._rec)
            logger.info("recording session in %s" % self._rec)
        else:
            self._rec = None

        logger.report.ok('>>ok\n')
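
A hedged usage sketch for this constructor -- the database URL below is a
placeholder and assumes a reachable MongoDB instance; it is not part of the
snippet above:

import os
import radical.pilot

os.environ.setdefault('RADICAL_PILOT_DBURL',
                      'mongodb://localhost:27017/radicalpilot')  # assumed URL

session = radical.pilot.Session(name='my_experiment')  # uid defaults to name
print(session.uid)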
Exemplo n.º 39
def generate_pipeline(cfg):

    cfg_file = cfg['run_cfg_file']  # resource and workload config
    run_file = cfg['run_file']  # runs for this campaign

    # setup S1 workload
    cfg = ru.Config(cfg=ru.read_json(cfg_file))
    runs = check_runs(cfg_file, run_file)

    if not runs:
        print('S1: nothing to run, exiting.')
        return

    # for each run in the campaign:
    # - create cfg with requested receptor and smiles
    # - create a number of masters as EnTK tasks and add them to a pipeline
    # - submit configured number of masters with that cfg

    # setup EnTK pipeline
    p = Pipeline()
    p.name = 'S1-RAPTOR'
    s = Stage()

    # create cfg
    subs = dict()
    rurl = cfg.fs_url + cfg.workload.results
    d = rs.filesystem.Directory(rurl)
    ls = [str(u).split('/')[-1] for u in d.list()]

    workload = cfg.workload

    for receptor, smiles, nodes, runtime in runs:

        print('%30s  %s' % (receptor, smiles))
        name = '%s_-_%s' % (receptor, smiles)
        tgt = '%s.%s.gz' % (name, workload.output)
        # rec  = False

        # if tgt in ls:
        #     if workload.recompute:
        #         rec += 1
        #         d.move(tgt, tgt + '.bak')
        #     else:
        #         print('skip      1 %s' % name)
        #         continue

        # if smiles in ls:
        #     if smiles not in subs:
        #         subs[smiles] = [str(u).split('/')[-1]  for u in d.list('%s/*' % smiles)]
        #     if tgt in subs[smiles]:
        #         if workload.recompute:
        #             rec += 2
        #             d.move('%s/%s'     % (smiles, tgt),
        #                     '%s/%s.bak' % (smiles, tgt))
        #         else:
        #             print('skip      2 %s' % name)
        #             continue

        ## if os.path.exists('results/%s.%s.gz' % (name, workload.output)):
        ##     print('skip      3 %s' % name)
        ##     continue

        #if rec: print('recompute %d %s' % (rec, name))
        #else  : print('compute   2 %s'  %       name)

        cpn = cfg.cpn
        gpn = cfg.gpn
        n_masters = cfg.n_masters

        cfg.workload.receptor = receptor
        cfg.workload.smiles = smiles
        cfg.workload.name = name
        cfg.nodes = nodes
        cfg.runtime = runtime
        cfg.n_workers = int(nodes / n_masters - 1)
        print('n_workers: %d' % cfg.n_workers)

        ru.write_json(cfg, 'configs/wf0.%s.cfg' % name)

        for i in range(n_masters):
            t = Task()

            t.pre_exec = [
                '. /gpfs/alpine/scratch/mturilli1/med110/radical.pilot.sandbox/s1.to/bin/activate'
            ]

            t.executable = "python3"
            t.arguments = ['wf0_master.py', i]
            t.cpu_threads = cpn
            t.upload_input_data = [
                'wf0_master.py', 'wf0_worker.py',
                'configs/wf0.%s.cfg > wf0.cfg' % name, 'read_ligand_dict.py'
            ]
            t.link_input_data = ['%s > input_dir' % workload.input_dir]
            t.download_output_data = [
                '%s.%s.gz > results/%s.%s.gz' %
                (name, workload.output, name, workload.output)
            ]
            # t.input_staging  = [{'source': 'wf0_master.py',
            #                         'target': 'wf0_master.py',
            #                         'action': rp.TRANSFER,
            #                         'flags' : rp.DEFAULT_FLAGS},
            #                         {'source': 'wf0_worker.py',
            #                         'target': 'wf0_worker.py',
            #                         'action': rp.TRANSFER,
            #                         'flags' : rp.DEFAULT_FLAGS},
            #                         {'source': 'configs/wf0.%s.cfg' % name,
            #                         'target': 'wf0.cfg',
            #                         'action': rp.TRANSFER,
            #                         'flags' : rp.DEFAULT_FLAGS},
            #                         {'source': workload.input_dir,
            #                         'target': 'input_dir',
            #                         'action': rp.LINK,
            #                         'flags' : rp.DEFAULT_FLAGS},
            #                         {'source': workload.impress_dir,
            #                         'target': 'impress_md',
            #                         'action': rp.LINK,
            #                         'flags' : rp.DEFAULT_FLAGS},
            #                         {'source': 'read_ligand_dict.py',
            #                         'target': 'read_ligand_dict.py',
            #                         'action': rp.TRANSFER,
            #                         'flags' : rp.DEFAULT_FLAGS},
            #                     ]
            # t.output_staging = [{'source': '%s.%s.gz'         % (name, workload.output),
            #                      'target': 'results/%s.%s.gz' % (name, workload.output),
            #                      'action': rp.TRANSFER,
            #                      'flags' : rp.DEFAULT_FLAGS}]
            s.add_tasks(t)

    p.add_stages(s)

    return p
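
A sketch of how such a pipeline might be submitted via EnTK; the resource
description values are placeholders, and `cfg` is assumed to carry the
`run_cfg_file` and `run_file` entries the function expects:

from radical.entk import AppManager

amgr = AppManager()                              # assumes default EnTK setup
amgr.resource_desc = {'resource': 'ornl.summit', # placeholder resource
                      'walltime': 60,
                      'cpus'    : 168}
amgr.workflow = [generate_pipeline(cfg)]
amgr.run()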
Exemplo n.º 40
    def __init__(self, dburl=None, uid=None, cfg=None, _connect=True):
        """
        Creates a new session.  A new Session instance is created and 
        stored in the database.

        **Arguments:**
            * **dburl** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **uid** (`string`): Create a session with this UID.  
              *Only use this when you know what you are doing!*

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`

        """

        if os.uname()[0] == 'Darwin':
            # on macOS we tend to run out of file descriptors quickly.  The
            # code below attempts to increase the limit of open files - but any
            # error is silently ignored, so this is a best-effort attempt with
            # no guarantee.  We leave responsibility for system limits with the
            # user.
            try:
                import resource
                limits = list(resource.getrlimit(resource.RLIMIT_NOFILE))
                limits[0] = 512
                resource.setrlimit(resource.RLIMIT_NOFILE, limits)
            except:
                pass

        self._dh = ru.DebugHelper()
        self._valid = True
        self._closed = False
        self._valid_iter = 0  # detect recursive calls of `is_valid()`

        # class state
        self._dbs = None
        self._uid = None
        self._dburl = None
        self._reconnected = False

        self._cache = dict()  # cache sandboxes etc.
        self._cache_lock = threading.RLock()

        self._cache['resource_sandbox'] = dict()
        self._cache['session_sandbox'] = dict()
        self._cache['pilot_sandbox'] = dict()

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        # NOTE: should this also include agents?
        self._pmgrs = dict()
        self._umgrs = dict()
        self._bridges = list()
        self._components = list()

        # FIXME: we work around some garbage collection issues we don't yet
        #        understand: instead of relying on the GC to eventually collect
        #        some stuff, we actively free those on `session.close()`, at
        #        least for the current process.  Usually, all resources get
        #        nicely collected on process termination - but not when we
        #        create many sessions (one after the other) in the same
        #        application instance (ie. the same process).  This workaround
        #        takes care of that use case.
        #        The clean solution would be to ensure clean termination
        #        sequence, something which I seem to be unable to implement...
        #        :/
        self._to_close = list()
        self._to_stop = list()
        self._to_destroy = list()

        # cache the client sandbox
        # FIXME: this needs to be overwritten if configured differently in the
        #        session config, as should be the case for any agent side
        #        session instance.
        self._client_sandbox = os.getcwd()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        # if a config is given, use its values:
        if cfg:
            self._cfg = copy.deepcopy(cfg)
        else:
            # otherwise we need a config
            self._cfg = ru.read_json("%s/configs/session_%s.json" \
                    % (os.path.dirname(__file__),
                       os.environ.get('RADICAL_PILOT_SESSION_CFG', 'default')))
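            # (illustration: RADICAL_PILOT_SESSION_CFG=debug would load
            #  'configs/session_debug.json' from the package directory)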

        # fall back to config data where possible
        # sanity check on parameters
        if not uid:
            uid = self._cfg.get('session_id')

        if uid:
            self._uid = uid
            self._reconnected = True
        else:
            # generate new uid, reset all other ID counters
            # FIXME: this will screw up counters for *concurrent* sessions,
            #        as the ID generation is managed in a process singleton.
            self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
            ru.reset_id_counters(prefix='rp.session', reset_all_others=True)

        if not self._cfg.get('session_id'): self._cfg['session_id'] = self._uid
        if not self._cfg.get('owner')     : self._cfg['owner']      = self._uid
        if not self._cfg.get('logdir')    : self._cfg['logdir']     = '%s/%s' \
                                          % (os.getcwd(), self._uid)

        self._logdir = self._cfg['logdir']
        self._prof = self._get_profiler(name=self._cfg['owner'])
        self._rep = self._get_reporter(name=self._cfg['owner'])
        self._log = self._get_logger(name=self._cfg['owner'],
                                     level=self._cfg.get('debug'))

        if _connect:
            # we need a dburl to connect to.

            if not dburl:
                dburl = os.environ.get("RADICAL_PILOT_DBURL")

            if not dburl:
                dburl = self._cfg.get('default_dburl')

            if not dburl:
                dburl = self._cfg.get('dburl')

            if not dburl:
                # we forgive missing dburl on reconnect, but not otherwise
                raise RuntimeError("no database URL (set RADICAL_PILOT_DBURL)")

        self._dburl = ru.Url(dburl)
        self._cfg['dburl'] = str(self._dburl)

        # now we have config and uid - initialize base class (saga session)
        rs.Session.__init__(self, uid=self._uid)

        # ----------------------------------------------------------------------
        # create new session
        if _connect:
            self._log.info("using database %s" % self._dburl)

            # if the database url contains a path element, we interpret that as
            # database name (without the leading slash)
            if  not self._dburl.path         or \
                self._dburl.path[0]   != '/' or \
                len(self._dburl.path) <=  1  :
                if not uid:
                    # we fake a reconnect if no DB is available -- but
                    # otherwise we really do need a db connection...
                    raise ValueError("incomplete DBURL '%s': no db name!" %
                                     self._dburl)

        if not self._reconnected:
            self._prof.prof('session_start', uid=self._uid)
            self._rep.info('<<new session: ')
            self._rep.plain('[%s]' % self._uid)
            self._rep.info('<<database   : ')
            self._rep.plain('[%s]' % self._dburl)

        self._load_resource_configs()

        self._rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
        if self._rec:
            # NOTE: Session recording cannot handle reconnected sessions, yet.
            #       We thus turn it off here with a warning
            if self._reconnected:
                self._log.warn("no session recording on reconnected session")

            else:
                # append session ID to recording path
                self._rec = "%s/%s" % (self._rec, self._uid)

                # create recording path and record session
                os.system('mkdir -p %s' % self._rec)
                ru.write_json({'dburl': str(self.dburl)},
                              "%s/session.json" % self._rec)
                self._log.info("recording session in %s" % self._rec)

        # create/connect database handle
        try:
            self._dbs = DBSession(sid=self.uid,
                                  dburl=str(self._dburl),
                                  cfg=self._cfg,
                                  logger=self._log,
                                  connect=_connect)

            # from here on we should be able to close the session again
            self._log.info("New Session created: %s." % self.uid)

        except Exception as ex:
            self._rep.error(">>err\n")
            self._log.exception('session create failed')
            raise RuntimeError("Couldn't create new session (database URL '%s' incorrect?): %s" \
                            % (dburl, ex))
Exemplo n.º 41
    def submit_units(self, unit_descriptions):
        """Submits on or more :class:`radical.pilot.ComputeUnit` instances to the
        unit manager.

        **Arguments:**

            * **unit_descriptions** [:class:`radical.pilot.ComputeUnitDescription`
              or list of :class:`radical.pilot.ComputeUnitDescription`]: The
              description of the compute unit instance(s) to create.

        **Returns:**

              * A list of :class:`radical.pilot.ComputeUnit` objects.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """

        if not self._valid:
            raise RuntimeError("instance is already closed")

        return_list_type = True
        if not isinstance(unit_descriptions, list):
            return_list_type  = False
            unit_descriptions = [unit_descriptions]

        if len(unit_descriptions) == 0:
            raise ValueError('cannot submit an empty list of unit descriptions')

        for ud in unit_descriptions:

            if float(ud.cores) != int(ud.cores):
                error_msg = "ComputeUnittDescription 'cores' must be integer."
                raise BadParameter(error_msg)

            if int(ud.cores) <= 0:
                error_msg = "ComputeUnittDescription 'cores' must be positive."
                raise BadParameter(error_msg)

        logger.report.info('<<submit %d unit(s)\n\t' % len(unit_descriptions))

        # we return a list of compute units
        ret = list()

        # the scheduler will return a dictionary of the form:
        #   {
        #     ud_1 : pilot_id_a,
        #     ud_2 : pilot_id_b
        #     ...
        #   }
        #
        # The scheduler may not be able to schedule some units - those will
        # have 'None' as pilot ID.
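        #
        # For illustration (hypothetical IDs), a returned schedule could thus
        # look like:
        #   { <ud_1>: 'pilot.0000',
        #     <ud_2>: 'pilot.0000',
        #     <ud_3>: None }          # not schedulable (yet)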

        units = list()
        for ud in unit_descriptions :

            u = ComputeUnit.create (unit_description=ud,
                                    unit_manager_obj=self, 
                                    local_state=SCHEDULING)
            units.append(u)

            if self._session._rec:
                import radical.utils as ru
                ru.write_json(ud.as_dict(), "%s/%s.batch.%03d.json" \
                        % (self._session._rec, u.uid, self._rec_id))
            logger.report.progress()
        if self._session._rec:
            self._rec_id += 1

        self._worker.publish_compute_units (units=units)

        schedule = None
        try:
            schedule = self._scheduler.schedule (units=units)
       
        except Exception as e:
            logger.exception ("Internal error - unit scheduler failed")
            raise 

        self.handle_schedule (schedule)

        logger.report.ok('>>ok\n')

        if  return_list_type :
            return units
        else :
            return units[0]
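
A hedged usage sketch for this method; the description values are placeholders
and `umgr` is assumed to be a live unit manager instance:

import radical.pilot as rp

ud            = rp.ComputeUnitDescription()
ud.executable = '/bin/date'                  # placeholder workload
ud.cores      = 1

units = umgr.submit_units([ud])              # assumed manager instance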
Exemplo n.º 42
def generate_pipeline(cfg):

    cfg_file = cfg['run_cfg_file']  # resource and workload config
    run_file = cfg['run_file']  # runs for this campaign

    # setup S1 workload
    cfg = ru.Config(cfg=ru.read_json(cfg_file))
    runs = check_runs(cfg_file, run_file)

    if not runs:
        print('S1: nothing to run, exiting.')
        return

    # for each run in the campaign:
    # - create cfg with requested receptor and smiles
    # - create a number of masters as EnTK tasks and add them to a pipeline
    # - submit configured number of masters with that cfg

    # setup EnTK pipeline
    p = Pipeline()
    p.name = 'S1.RAPTOR'
    s = Stage()

    # create cfg
    subs = dict()
    rurl = cfg.fs_url + cfg.workload.results
    d = rs.filesystem.Directory(rurl)
    ls = [str(u).split('/')[-1] for u in d.list()]

    workload = cfg.workload

    for receptor, smiles, n_workers, runtime in runs:

        print('%30s  %s' % (receptor, smiles))
        name = '%s_-_%s' % (receptor, smiles)
        tgt = '%s.%s.gz' % (name, workload.output)

        cpw = cfg.cpw
        gpw = cfg.gpw
        n_masters = cfg.n_masters

        cfg.workload.receptor = receptor
        cfg.workload.smiles = smiles
        cfg.workload.name = name
        cfg.runtime = runtime
        cfg.n_workers = n_workers
        print('n_workers: %d' % cfg.n_workers)

        ru.write_json(cfg, 'configs/wf0.%s.cfg' % name)

        for i in range(n_masters):
            t = Task()

            t.pre_exec = [
                '. /gpfs/alpine/scratch/mturilli1/med110/radical.pilot.sandbox/s1.to/bin/activate'
            ]

            t.executable = "python3"
            t.arguments = ['wf0_master.py', i]
            t.cpu_reqs = {
                'processes': 1,
                'threads_per_process': 4,
                'thread_type': None,
                'process_type': None
            }
            t.upload_input_data = [
                'wf0_master.py', 'wf0_worker.py',
                'configs/wf0.%s.cfg > wf0.cfg' % name, 'read_ligand_dict.py'
            ]
            t.link_input_data = ['%s > input_dir' % workload.input_dir]
            #t.download_output_data = ['%s.%s.gz > results/%s.%s.gz' %
            #    (name, workload.output, name, workload.output)]

            s.add_tasks(t)

    p.add_stages(s)

    return p
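
The per-run config written above is staged to each master as 'wf0.cfg'; a
master-side sketch for reading it back, using the same radical.utils calls as
the rest of this example:

import radical.utils as ru

cfg = ru.Config(cfg=ru.read_json('wf0.cfg'))   # staged via 'configs/wf0.%s.cfg > wf0.cfg'
print('workload %s: %d workers' % (cfg.workload.name, cfg.n_workers))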
Exemplo n.º 43
            if rec: print('recompute %d %s' % (rec, name))
            else: print('compute   2 %s' % name)

            cpn = cfg.cpn
            gpn = cfg.gpn
            n_masters = cfg.n_masters

            cfg.workload.receptor = receptor
            cfg.workload.smiles = smiles
            cfg.workload.name = name
            cfg.nodes = nodes
            cfg.runtime = runtime
            cfg.n_workers = int(nodes / n_masters - 1)
            print('n_workers: %d' % cfg.n_workers)

            ru.write_json(cfg, 'configs/wf0.%s.cfg' % name)

            pd = rp.ComputePilotDescription(cfg.pilot_descr)
            pd.cores = nodes * cpn
            pd.gpus = nodes * gpn
            pd.runtime = runtime

            pilot = pmgr.submit_pilots(pd)
            pid = pilot.uid

            umgr.add_pilots(pilot)

            tds = list()

            for i in range(n_masters):
                td = rp.ComputeUnitDescription(cfg.master_descr)
Exemplo n.º 44
    def submit_units(self, descriptions):
        """
        Submits one or more :class:`radical.pilot.ComputeUnit` instances to the
        unit manager.

        **Arguments:**
            * **descriptions** [:class:`radical.pilot.ComputeUnitDescription`
              or list of :class:`radical.pilot.ComputeUnitDescription`]: The
              description of the compute unit instance(s) to create.

        **Returns:**
              * A list of :class:`radical.pilot.ComputeUnit` objects.
        """

        from .compute_unit import ComputeUnit

        self.is_valid()

        ret_list = True
        if not isinstance(descriptions, list):
            ret_list     = False
            descriptions = [descriptions]

        if len(descriptions) == 0:
            raise ValueError('cannot submit an empty list of unit descriptions')

        self._rep.info('<<submit %d unit(s)\n\t' % len(descriptions))

        # we return a list of compute units
        units = list()
        for ud in descriptions:

            if not ud.executable:
                raise ValueError('compute unit executable must be defined')

            unit = ComputeUnit(umgr=self, descr=ud)
            units.append(unit)

            # keep units around
            with self._units_lock:
                self._units[unit.uid] = unit

            if self._session._rec:
                ru.write_json(ud.as_dict(), "%s/%s.batch.%03d.json"
                        % (self._session._rec, unit.uid, self._rec_id))

            self._rep.progress()

        if self._session._rec:
            self._rec_id += 1

        # insert units into the database, as a bulk.
        unit_docs = [u.as_dict() for u in units]
        self._session._dbs.insert_units(unit_docs)

        # Only after the insert can we hand the units over to the next
        # components (ie. advance state).
        self.advance(unit_docs, rps.UMGR_SCHEDULING_PENDING, 
                     publish=True, push=True)
        self._rep.ok('>>ok\n')

        if ret_list: return units
        else       : return units[0]
Exemplo n.º 45
def store_profile(profile, tags=None, url=None, mode=None):

    if not url:
        url = os.environ.get('RADICAL_SYNAPSE_DBURL')

    if not url:
        # print "warning: need dburl to store profiles"
        return None

    if not mode:
        raise ValueError("document needs mode (emulated | eecuted | profiled)")

    url = ru.Url(url)

    if not tags:
        tags = dict()
        elems = filter(None,
                       os.environ.get('RADICAL_SYNAPSE_TAGS', '').split(','))
        for elem in elems:
            if ':' in elem:
                key, val = elem.split(':', 1)
                tags[key] = val
            else:
                tags[elem] = None

    command_idx = index_command(profile['cmd'], tags)
    print "index %s (%s) to %s" % (profile['cmd'], tags, command_idx)

    host = profile['sys'].get('hostname')
    if not host:
        host = os.environ.get('RADICAL_SYNAPSE_HOSTNAME', socket.gethostname())
        profile['sys']['hostname'] = host

    doc = {
        'type': 'synapse_profile',
        'mode': mode,
        'command_idx': command_idx,
        'command': profile['cmd'],
        'tags': tags,
        'profile': profile
    }

    if url.schema == 'mongodb':

        print 'store profile in db %s' % url

        [dbhost, port, dbname, _, _, _, _] = ru.split_dburl(url)

        db_client = pymongo.MongoClient(host=dbhost, port=port)
        database = db_client[dbname]
        collection = database['profiles']

        collection.insert(doc)

    elif url.schema == 'file':

        path = url.path

        if not os.path.isdir(path):
            os.system('mkdir -p "%s"' % path)

        name = command_idx.split()[0]
        # for key, val in tags.iteritems():
        #     if val != None: name += "_%s:%s" % (key, val)
        #     else          : name += "_%s"    % (key)
        for tag in sorted(tags.keys()):
            if tags[tag] != None: name += "_%s" % tags[tag]
            else: name += "_%s" % tag

        idx = 0
        while True:
            fname = "%s/synapse_profile_%s_%s_%s_%03d.json" % (
                path, name, host, mode[0:3], idx)
            if not os.path.exists(fname):
                break
            idx += 1

        print 'store profile in file %s' % fname
        os.system('mkdir -p "%s/"' % path)
        ru.write_json(doc, fname)
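
A hedged invocation sketch for store_profile; the profile dict shape is
inferred from the lookups above, the paths and tags are placeholders, and
index_command is assumed to be importable alongside this function:

profile = {'cmd': 'sleep 10',                      # minimal inferred shape
           'sys': {'hostname': 'node01'}}

store_profile(profile,
              tags={'experiment': 'test'},
              url='file:///tmp/synapse_profiles',  # store as local json files
              mode='profiled')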