def setUpClass(self):
    """Load the sample and XSEDE resource configurations used by the tests.

    Reads ``sample_resources.json`` from the directory containing this file
    and ``resource_xsede.json`` from the pilot config directory, then stores
    each resource section as an attribute on ``self``.
    """
    # Directory of sample config files
    self.sample_resource_dir = os.path.dirname(os.path.realpath(__file__))

    # Load sample resource test config
    sample_path = os.path.join(self.sample_resource_dir,
                               "sample_resources.json")
    self.cfg_sample = ru.read_json(sample_path)
    for attr, key in (('cfg_sample_1', 'sample_resource_1'),
                      ('cfg_sample_2', 'sample_resource_2'),
                      ('cfg_sample_3', 'sample_resource_3')):
        setattr(self, attr, self.cfg_sample[key])

    # Directory of pilot resource config files (relative to the current
    # working directory)
    self.pilot_resource_dir = '../../src/radical/pilot/configs'

    # Load xsede pilot resource config
    xsede_path = os.path.join(self.pilot_resource_dir, 'resource_xsede.json')
    self.cfg_xsede = ru.read_json(xsede_path)
    for key in ('bridges',
                'comet_ssh', 'comet_orte', 'comet_ortelib', 'comet_spark',
                'supermic_ssh', 'supermic_orte', 'supermic_ortelib',
                'supermic_spark'):
        setattr(self, 'cfg_xsede_' + key, self.cfg_xsede[key])
def test():
    """Start one CompA and two CompB components in a session, then close it.

    The session config is read from ``session.json`` next to this file.  The
    session is closed in all cases once it has been created, even if starting
    the components fails.
    """
    s = None
    try:
        cfg = ru.read_json("%s/session.json" % os.path.dirname(__file__))
        # NOTE(review): `dh` is never used afterwards; presumably the helper
        # is instantiated for its side effects -- confirm before removing.
        dh = ru.DebugHelper()
        s = rp.Session(cfg=cfg)

        ca1 = CompA(s)
        cb1 = CompB(s)
        cb2 = CompB(s)

        ca1.start()
        cb1.start()
        cb2.start()

        # s._controller.add_things([ca1, cb1, cb2])

        # give the components some time to run
        time.sleep(3)

    finally:
        if s:
            # function-call form prints the same text on Python 2 and 3
            print('close')
            s.close()
def get_session_docs(db, sid, cache=None, cachedir=None):
    """Return all documents of session `sid` as one JSON-compatible dict.

    Session docs may have been cached in ``<cachedir>/<sid>.json`` -- in that
    case they are read from there instead of the database.  Docs retrieved
    from the database are written to that location for later use.

    Args:
        db: database handle; ``db[sid].find({'type': ...})`` is used to
            retrieve the documents.
        sid: session ID.
        cache: path of the cache file to look up; defaults to
            ``<cachedir>/<sid>.json``.
        cachedir: directory used for cache lookup and storage; defaults to
            ``_CACHE_BASEDIR``.

    Returns:
        dict with the keys 'session' (a single document), 'pmgr', 'pilot',
        'umgr' and 'unit' (lists of documents).  Each pilot document gains a
        'unit_ids' list with the uids of the units assigned to it.

    Raises:
        ValueError: if the database holds no session document for `sid`.
    """
    if not cachedir:
        cachedir = _CACHE_BASEDIR

    if not cache:
        cache = "%s/%s.json" % (cachedir, sid)

    try:
        if os.path.isfile(cache):
            # print 'using cache: %s' % cache
            return ru.read_json(cache)
    except Exception as e:
        # continue w/o cache
        sys.stderr.write("cannot read session cache at %s (%s)\n" % (cache, e))

    # cache not used or not found -- go to db
    json_data = dict()

    # convert bson to json, i.e. serialize the ObjectIDs into strings.
    for doc_type in ('session', 'pmgr', 'pilot', 'umgr', 'unit'):
        docs = list(db[sid].find({'type': doc_type}))
        json_data[doc_type] = bson2json(docs)

    if len(json_data['session']) == 0:
        raise ValueError('no session %s in db' % sid)

    # there can only be one session, not a list of one
    json_data['session'] = json_data['session'][0]

    # we want to add a list of handled units to each pilot doc
    for pilot in json_data['pilot']:
        pilot['unit_ids'] = list()
        for unit in json_data['unit']:
            if unit['pilot'] == pilot['uid']:
                pilot['unit_ids'].append(unit['uid'])

    # if we got here, we did not find a cached version -- thus add this
    # dataset to the cache
    try:
        # create the directory without going through a shell, so that paths
        # containing spaces or shell metacharacters are handled correctly
        if not os.path.isdir(cachedir):
            os.makedirs(cachedir)
        ru.write_json(json_data, "%s/%s.json" % (cachedir, sid))
    except Exception:
        # we can live without cache, no problem...
        pass

    return json_data
def _read_config(self, config_path, hostname, port, username, password,
                 reattempts, resubmit_failed, autoterminate, write_workflow,
                 rts, rmq_cleanup, rts_config):
    """Populate the instance settings from arguments and ``config.json``.

    ``config.json`` is read from `config_path` (default: the directory of
    this file).  For every setting, an argument that is not ``None`` takes
    precedence over the value from the config file.  The RabbitMQ connection
    parameters are derived from the resulting host, port and credentials.

    Raises:
        ValueError: if the resulting RTS is neither 'radical.pilot' nor
            'mock'.
    """
    if not config_path:
        config_path = os.path.dirname(os.path.abspath(__file__))

    config = ru.read_json(os.path.join(config_path, 'config.json'))

    # (attribute name, explicit argument, config key) -- order matters, it
    # defines the order in which attributes are set
    settings = (
        ('_hostname',        hostname,        'hostname'),
        ('_port',            port,            'port'),
        ('_username',        username,        'username'),
        ('_password',        password,        'password'),
        ('_reattempts',      reattempts,      'reattempts'),
        ('_resubmit_failed', resubmit_failed, 'resubmit_failed'),
        ('_autoterminate',   autoterminate,   'autoterminate'),
        ('_write_workflow',  write_workflow,  'write_workflow'),
        ('_rmq_cleanup',     rmq_cleanup,     'rmq_cleanup'),
        ('_rts_config',      rts_config,      'rts_config'),
        ('_rts',             rts,             'rts'),
    )

    for attr, given, key in settings:
        # the config entry is looked up even if an argument was given
        fallback = config[key]
        setattr(self, attr, fallback if given is None else given)

    credentials = pika.PlainCredentials(self._username, self._password)
    self._rmq_conn_params = pika.connection.ConnectionParameters(
        host=self._hostname,
        port=self._port,
        credentials=credentials)

    self._num_pending_qs = config['pending_qs']
    self._num_completed_qs = config['completed_qs']

    valid_rts = ['radical.pilot', 'mock']
    if self._rts not in valid_rts:
        raise ValueError('invalid RTS %s' % self._rts)
def __init__(self, cfg):
    """Set up the worker component and register it with the master.

    Args:
        cfg: either a string, which is read as a JSON file via
            ``ru.read_json``, or a value passed directly to ``ru.Config``.
    """
    # a string is treated as the path of a JSON config file
    raw_cfg = ru.read_json(cfg) if isinstance(cfg, str) else cfg
    cfg = ru.Config(cfg=raw_cfg)

    self._uid = cfg.uid
    self._term = mt.Event()
    self._info = ru.Config(cfg=cfg.get('info', {}))
    self._session = Session(cfg=cfg, _primary=False)

    rpu.Component.__init__(self, cfg, self._session)

    # connect to master
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb)
    self.register_publisher(rpc.CONTROL_PUBSUB)

    # announce this worker, including whatever `initialize()` reports
    info = self.initialize()
    registration = {'cmd': 'worker_register',
                    'arg': {'uid': self._uid,
                            'info': info}}
    self.publish(rpc.CONTROL_PUBSUB, registration)
def main():
    """Create a ScaleMSMaster, submit workers to it and run it to completion.

    If a command line argument is given, it is read as a JSON config file:
    the config is passed to the master, and the worker description, worker
    count and per-worker cores / gpus are taken from it.  Otherwise a single
    default worker with one core and no gpu is submitted.
    """
    # TODO: Test both with and without a provided config file.
    kwargs = {}
    if len(sys.argv) > 1:
        cfg = ru.Config(cfg=ru.read_json(sys.argv[1]))
        kwargs['cfg'] = cfg
        # plain assignments: trailing commas here would wrap each value in a
        # 1-tuple
        descr = cfg.worker_descr
        count = cfg.n_workers
        cores = cfg.cpn
        gpus = cfg.gpn
    else:
        descr = rp.TaskDescription({
            'uid': 'raptor.worker',
            'executable': 'scalems_rp_worker',
            'arguments': []
        })
        count = 1
        cores = 1
        gpus = 0

    master = ScaleMSMaster(**kwargs)

    master.submit(descr=descr, count=count, cores=cores, gpus=gpus)

    master.start()
    master.join()
    master.stop()
def get_session_docs(db, sid, cache=None, cachedir=None):
    """Return all documents of session `sid` as one JSON-compatible dict.

    Session docs may have been cached in ``<cachedir>/<sid>.json`` -- in that
    case they are read from there instead of the database.  Docs retrieved
    from the database are written to that location for later use.  The
    optional `cachedir` parameter changes the default location for both
    lookup and storage.

    Args:
        db: database handle; the collections ``<sid>``, ``<sid>.pm``,
            ``<sid>.p``, ``<sid>.um`` and ``<sid>.cu`` are read from it.
        sid: session ID.
        cache: path of the cache file to look up; defaults to
            ``<cachedir>/<sid>.json``.
        cachedir: cache directory; defaults to ``_CACHE_BASEDIR``.

    Returns:
        dict with the keys 'session' (a single document), 'pmgr', 'pilot',
        'umgr' and 'unit' (lists of documents).  Each pilot document gains a
        'unit_ids' list with the stringified ids of its units.

    Raises:
        ValueError: if the session collection is empty.
    """
    if not cachedir:
        cachedir = _CACHE_BASEDIR

    if not cache:
        cache = "%s/%s.json" % (cachedir, sid)

    try:
        if os.path.isfile(cache):
            return ru.read_json(cache)
    except Exception as e:
        # continue w/o cache
        sys.stderr.write("warning: cannot read session cache at %s (%s)\n"
                         % (cache, e))

    # cache not used or not found -- go to db
    json_data = dict()

    # convert bson to json, i.e. serialize the ObjectIDs into strings.
    json_data['session'] = bson2json(list(db["%s" % sid].find()))
    json_data['pmgr'] = bson2json(list(db["%s.pm" % sid].find()))
    json_data['pilot'] = bson2json(list(db["%s.p" % sid].find()))
    json_data['umgr'] = bson2json(list(db["%s.um" % sid].find()))
    json_data['unit'] = bson2json(list(db["%s.cu" % sid].find()))

    if len(json_data['session']) == 0:
        raise ValueError('no such session %s' % sid)

    # there can only be one session, not a list of one
    json_data['session'] = json_data['session'][0]

    # we want to add a list of handled units to each pilot doc
    for pilot in json_data['pilot']:
        pilot['unit_ids'] = list()
        pilot_id = str(pilot['_id'])
        for unit in json_data['unit']:
            if unit['pilot'] == pilot_id:
                pilot['unit_ids'].append(str(unit['_id']))

    # if we got here, we did not find a cached version -- thus add this
    # dataset to the cache.  Store it in the same directory the lookup used,
    # so that a custom `cachedir` is honored for storage as well.
    try:
        # no shell involved: paths with spaces or metacharacters are safe
        if not os.path.isdir(cachedir):
            os.makedirs(cachedir)
        ru.write_json(json_data, "%s/%s.json" % (cachedir, sid))
    except Exception:
        # we can live without cache, no problem...
        pass

    return json_data
def write_workflow(workflow, uid):
    """Append a description of `workflow` to ``<uid>/entk_workflow.json``.

    The directory `uid` is created if needed.  If the JSON file already
    exists, its content is loaded and the new pipeline entries are appended
    to it before the file is rewritten.

    Args:
        workflow: iterable of pipelines; each pipeline provides `uid`,
            `name`, `state_history` and `stages`, each stage provides `uid`,
            `name`, `state_history` and `tasks`, and each task `to_dict()`.
        uid: name of the target directory.
    """
    try:
        os.mkdir(uid)
    except OSError:
        # most commonly: the directory already exists.  Any real problem
        # surfaces when the file is written below.
        pass

    fname = '%s/entk_workflow.json' % uid

    data = list()
    if os.path.isfile(fname):
        data = ru.read_json(fname)

    for pipe in workflow:

        p = dict()
        p['uid'] = pipe.uid
        p['name'] = pipe.name
        p['state_history'] = pipe.state_history
        p['stages'] = list()

        for stage in pipe.stages:

            s = dict()
            s['uid'] = stage.uid
            s['name'] = stage.name
            s['state_history'] = stage.state_history
            s['tasks'] = [task.to_dict() for task in stage.tasks]

            p['stages'].append(s)

        data.append(p)

    ru.write_json(data, fname)
def test_get_session_description():
    """The stored session description must match the expected sample."""
    sample_dir = '%s/sample_data/profiler' % pwd
    session_id = 're.session.host.user.012345.1234'

    actual = get_session_description(sid=session_id, src=sample_dir)
    expected = ru.read_json('%s/expected_desc_get_session.json' % sample_dir)

    assert actual == expected
def _read_config(self, config_path, hostname, port, reattempts,
                 resubmit_failed, autoterminate, write_workflow, rts,
                 rmq_cleanup, rts_config):
    """Populate the instance settings from arguments and ``config.json``.

    ``config.json`` is read from `config_path` (default: the directory of
    this file).  Hostname, port and reattempts fall back to the config file
    when the argument is falsy; the remaining settings fall back only when
    the argument is ``None``.  `rts` falls back unless it is one of
    'radical.pilot' or 'mock'.
    """
    if not config_path:
        config_path = os.path.dirname(os.path.abspath(__file__))

    config = ru.read_json(os.path.join(config_path, 'config.json'))

    # falsy arguments are replaced by the configured values
    self._mq_hostname = hostname or str(config['hostname'])
    self._port = port or config['port']
    self._reattempts = reattempts or config['reattempts']

    # for these, only `None` triggers the fallback
    if resubmit_failed is None:
        resubmit_failed = config['resubmit_failed']
    self._resubmit_failed = resubmit_failed

    if autoterminate is None:
        autoterminate = config['autoterminate']
    self._autoterminate = autoterminate

    if write_workflow is None:
        write_workflow = config['write_workflow']
    self._write_workflow = write_workflow

    # anything but a known RTS name is replaced by the configured RTS
    if rts not in ['radical.pilot', 'mock']:
        rts = str(config['rts'])
    self._rts = rts

    if rmq_cleanup is None:
        rmq_cleanup = config['rmq_cleanup']
    self._rmq_cleanup = rmq_cleanup

    if rts_config is None:
        rts_config = config['rts_config']
    self._rts_config = rts_config

    self._num_pending_qs = config['pending_qs']
    self._num_completed_qs = config['completed_qs']
def test_get_session_description():
    """The stored session description must match the expected sample."""
    here = os.path.dirname(os.path.abspath(__file__))
    sample_dir = '%s/sample_data/profiler' % here
    session_id = 're.session.host.user.012345.1234'

    actual = get_session_description(sid=session_id, src=sample_dir)
    expected = ru.read_json('%s/expected_desc_get_session.json' % sample_dir)

    assert actual == expected
def test_get_session_description():
    """The stored session description must match ``expected_desc.json``."""
    here = os.path.dirname(os.path.abspath(__file__))
    sample_dir = '%s/sample_data/profiler' % here
    session_id = 're.session.vivek-HP-Pavilion-m6-Notebook-PC.vivek.017732.0002'

    actual = get_session_description(sid=session_id, src=sample_dir)
    expected = ru.read_json('%s/expected_desc.json' % sample_dir)

    assert actual == expected
def get_session_description(sid, src=None):
    """Read the description of session `sid` from ``<src>/<sid>.json``.

    Args:
        sid: session ID.
        src: directory holding the description; defaults to ``./<sid>/``.

    Raises:
        EnTKError: if `src` is not an existing directory.
    """
    src = src or './%s/' % sid

    if os.path.isdir(src):
        return ru.read_json('%s/%s.json' % (src, sid))

    raise EnTKError('No such directory %s' % src)
def test_write_session_description():
    """The written session description must match the expected sample."""
    amgr = AppManager(hostname=hostname, port=port,
                      username=username, password=password)
    amgr.resource_desc = dict(resource='xsede.stampede',
                              walltime=59,
                              cpus=128,
                              gpus=64,
                              project='xyz',
                              queue='high')

    amgr.workflow = [generate_pipeline(1), generate_pipeline(2)]

    # wire up the workflow processor and task manager by hand
    amgr._wfp = WFprocessor(sid=amgr.sid,
                            workflow=amgr._workflow,
                            pending_queue=amgr._pending_queue,
                            completed_queue=amgr._completed_queue,
                            resubmit_failed=amgr._resubmit_failed,
                            rmq_conn_params=amgr._rmq_conn_params)
    amgr._workflow = amgr._wfp.workflow

    amgr._task_manager = TaskManager(sid=amgr._sid,
                                     pending_queue=amgr._pending_queue,
                                     completed_queue=amgr._completed_queue,
                                     rmgr=amgr._rmgr,
                                     rmq_conn_params=amgr._rmq_conn_params)

    write_session_description(amgr)

    written = ru.read_json('%s/radical.entk.%s.json' % (amgr._sid, amgr._sid))

    # tasks are originally a set but are saved as a list in json; sorting
    # makes the comparison independent of the order without affecting
    # validity
    for name, node in written['tree'].items():
        if name.startswith("stage"):
            node['children'] = sorted(node['children'])

    sample_dir = '%s/sample_data' % pwd
    expected = ru.read_json('%s/expected_desc_write_session.json' % sample_dir)

    assert written == expected
def test_write_session_description():
    """The written session description must match the expected sample.

    The RabbitMQ endpoint is taken from the ``RMQ_HOSTNAME`` and ``RMQ_PORT``
    environment variables (defaults: ``localhost`` and ``5672``).
    """
    env = os.environ
    hostname = env.get('RMQ_HOSTNAME', 'localhost')
    port = int(env.get('RMQ_PORT', 5672))

    amgr = AppManager(hostname=hostname, port=port)
    amgr.resource_desc = dict(resource='xsede.stampede',
                              walltime=60,
                              cpus=128,
                              gpus=64,
                              project='xyz',
                              queue='high')

    amgr.workflow = [generate_pipeline(1), generate_pipeline(2)]

    # wire up the workflow processor and task manager by hand
    amgr._wfp = WFprocessor(sid=amgr._sid,
                            workflow=amgr._workflow,
                            pending_queue=amgr._pending_queue,
                            completed_queue=amgr._completed_queue,
                            mq_hostname=amgr._mq_hostname,
                            port=amgr._port,
                            resubmit_failed=amgr._resubmit_failed)
    amgr._wfp._initialize_workflow()
    amgr._workflow = amgr._wfp.workflow

    amgr._task_manager = TaskManager(sid=amgr._sid,
                                     pending_queue=amgr._pending_queue,
                                     completed_queue=amgr._completed_queue,
                                     mq_hostname=amgr._mq_hostname,
                                     rmgr=amgr._resource_manager,
                                     port=amgr._port)

    # os.mkdir(amgr._sid)

    write_session_description(amgr)

    written = ru.read_json('%s/radical.entk.%s.json' % (amgr._sid, amgr._sid))

    here = os.path.dirname(os.path.abspath(__file__))
    sample_dir = '%s/sample_data' % here
    expected = ru.read_json('%s/expected_desc_write_session.json' % sample_dir)

    assert written == expected
def setUp(self):
    """Collect ``[unit, results]`` pairs from the stage-in test case files.

    Every ``unit.*.json`` file under
    ``tests/test_agent_stagein/test_cases`` is read; cases whose 'results'
    entry is empty are left out.

    Returns:
        list of ``[unit, results]`` lists.
    """
    cases = []
    pattern = 'tests/test_agent_stagein/test_cases/unit.*.json'

    for path in glob.glob(pattern):
        case = ru.read_json(path)
        pair = [case['unit'], case['results']]

        # only keep cases which actually define expected results
        if pair[1]:
            cases.append(pair)

    return cases
def test_executor_run():
    """Run an Executor on a published schedule and check its profile.

    A schedule of ten task/core pairs is published to the executor's
    RabbitMQ queues (endpoint and queue names come from
    ``../config_test.yml``).  The executor runs in a daemon thread for up to
    five seconds, after which its profile is written and compared against
    the schedule.
    """
    fpath = os.path.dirname(os.path.abspath(__file__))
    cfg_path = '%s/../config_test.yml' % fpath

    schedule = list()
    for _ in range(10):
        task = Task(ops=100)
        core = Core(10)
        schedule.append({'task': task.to_dict(), 'core': core.to_dict()})

    with open(cfg_path) as fp:
        # safe_load: the config is plain data, and `yaml.load` without an
        # explicit Loader is rejected by current PyYAML releases
        cfg = yaml.safe_load(fp)

    rmq_cfg = cfg['rmq']
    conn = pika.BlockingConnection(
        pika.ConnectionParameters(host=rmq_cfg['host'],
                                  port=rmq_cfg['port']))
    chan = conn.channel()

    chan.basic_publish(exchange=rmq_cfg['executor']['exchange'],
                       routing_key=rmq_cfg['executor']['queues']['config'],
                       body=json.dumps({'engine_uid': 'test.0000'}))

    chan.basic_publish(exchange=rmq_cfg['executor']['exchange'],
                       routing_key=rmq_cfg['executor']['queues']['schedule'],
                       body=json.dumps(schedule))

    conn.close()

    executor = Executor(cfg_path=cfg_path)

    # run the executor in the background and give it five seconds
    runner = threading.Thread(target=func_for_test_executor_run,
                              args=(executor, ))
    runner.daemon = True
    runner.start()
    runner.join(timeout=5)

    executor._write_profile()
    prof_path = './profile.%s.json' % (executor._uid)
    assert os.path.isfile(prof_path)

    prof = ru.read_json(prof_path)
    assert 'test.0000' in prof
    assert len(prof['test.0000']) == 10

    for ind, x in enumerate(prof['test.0000']):
        assert x['task'] == schedule[ind]['task']['uid']
        assert x['core'] == schedule[ind]['core']['uid']
        assert x['end_time'] == 10
        assert x['exec_time'] == 10
        assert x['start_time'] == 0

    # remove the profiles written into the working directory
    for f in glob('profile.*'):
        os.remove(f)
def setUpClass(cls):
    """Initialize tests, just creates instance variables needed."""
    super(AcceptanceTests, cls).setUpClass()

    # placeholders, no values are assigned here
    for attr in ('resource', 'session', 'pmgr', 'umgr'):
        setattr(cls, attr, None)

    cls.n = 128  # number of units to run

    # the configuration lives next to this file
    config_dir = os.path.dirname(__file__)
    cls.config = ru.read_json('%s/config.json' % config_dir)

    cls.setUp()
def setUpClass(cls):
    """Prepare class-level fixtures: base dir, pid string, user, test cases.

    The user name is obtained via ``getpass.getuser()``; if that fails, a
    placeholder string is used instead.  Test cases are loaded from all JSON
    files matching ``TEST_CASES_PATH``.
    """
    cls._base_dir = ru.get_radical_base('utils')
    cls._pid_str = '%06d' % os.getpid()

    cls._user = None
    try:
        import getpass
        cls._user = getpass.getuser()
    except Exception:
        # user lookup is best effort; do not swallow KeyboardInterrupt or
        # SystemExit though
        cls._user = '******'

    cls._test_cases = []
    for f in glob.glob(TEST_CASES_PATH):
        cls._test_cases.extend(ru.read_json(f))
def register_input(self, states, input, worker=None):
    '''
    Connect the component to the input queue `input` and associate `worker`
    with the given `states`.

    `states` can be a single state or a list of states.  The input is
    registered under a name built from the component uid, the worker's
    `__name__` and the states; registering the same name twice raises a
    `ValueError`.  The queue address is read from the bridge's config file
    `<cfg.path>/<input>.cfg` (key `get`).

    Exactly one worker is kept per state: registering a worker for a state
    which already has one replaces the earlier worker (a warning is logged).
    One worker can be responsible for multiple states.

    NOTE: although `worker` defaults to `None`, the code reads
          `worker.__name__`, so a worker has to be passed.
    '''

    state_list = states if isinstance(states, list) else [states]

    name = '%s.%s.%s' % (self.uid, worker.__name__, '_'.join(state_list))

    if name in self._inputs:
        raise ValueError('input %s already registered' % name)

    # dig the addresses from the bridge's config file
    bridge_cfg = ru.read_json('%s/%s.cfg' % (self._cfg.path, input))
    getter = ru.zmq.Getter(input, url=bridge_cfg['get'], log=self._log)

    self._inputs[name] = {'queue': getter,
                          'states': state_list}

    self._log.debug('registered input %s', name)

    # we want exactly one worker associated with a state -- but a worker can
    # be responsible for multiple states
    for state in state_list:

        self._log.debug('%s register input %s: %s', self.uid, state, name)

        if state in self._workers:
            self._log.warn("%s replaces worker for %s (%s)"
                           % (self.uid, state, self._workers[state]))

        self._workers[state] = worker

        self._log.debug('registered worker %s [%s]', worker.__name__, state)
def get_session_description(sid, src=None):
    """Read ``<src>/<sid>/radical.entk.<sid>.json`` and return its content.

    Args:
        sid: session ID.
        src: directory containing the session directory; defaults to the
            current working directory.

    Raises:
        EnTKError: if `src` does not exist.
    """
    src = src or os.getcwd()

    if not os.path.exists(src):
        raise EnTKError('%s/%s does not exist' % (src, sid))

    # EnTK profiles are always on localhost
    return ru.read_json("%s/%s/radical.entk.%s.json" % (src, sid, sid))
def test_write_session_description():
    """The written session description must match the expected sample."""
    amgr = AppManager(hostname=hostname, port=port)
    amgr.resource_desc = dict(resource='xsede.stampede',
                              walltime=59,
                              cpus=128,
                              gpus=64,
                              project='xyz',
                              queue='high')

    amgr.workflow = [generate_pipeline(1), generate_pipeline(2)]

    # wire up the workflow processor and task manager by hand
    amgr._wfp = WFprocessor(sid=amgr.sid,
                            workflow=amgr._workflow,
                            pending_queue=amgr._pending_queue,
                            completed_queue=amgr._completed_queue,
                            resubmit_failed=amgr._resubmit_failed,
                            rmq_conn_params=amgr._rmq_conn_params)
    amgr._wfp.initialize_workflow()
    amgr._workflow = amgr._wfp.workflow

    amgr._task_manager = TaskManager(sid=amgr._sid,
                                     pending_queue=amgr._pending_queue,
                                     completed_queue=amgr._completed_queue,
                                     rmgr=amgr._rmgr,
                                     rmq_conn_params=amgr._rmq_conn_params)

    write_session_description(amgr)

    written = ru.read_json('%s/radical.entk.%s.json' % (amgr._sid, amgr._sid))

    sample_dir = '%s/sample_data' % pwd
    expected = ru.read_json('%s/expected_desc_write_session.json' % sample_dir)

    assert written == expected
def test_write_workflow():
    """`write_workflow` must store the stack info followed by all pipelines.

    The workflow is written to ``test/entk_workflow.json``.  The first entry
    of the file is expected to be the software stack, the remaining entries
    mirror the pipelines, stages and tasks of the workflow.  The ``test``
    directory is removed afterwards, whether or not the checks pass, so that
    a later run does not append to a stale file.
    """
    try:
        wf = [generate_pipeline(1), generate_pipeline(2)]

        amgr = AppManager(hostname=hostname, port=port)
        amgr.workflow = wf
        amgr._wfp = WFprocessor(sid=amgr._sid,
                                workflow=amgr._workflow,
                                pending_queue=amgr._pending_queue,
                                completed_queue=amgr._completed_queue,
                                mq_hostname=amgr._mq_hostname,
                                port=amgr._port,
                                resubmit_failed=amgr._resubmit_failed)
        amgr._wfp._initialize_workflow()
        wf = amgr._wfp.workflow

        write_workflow(wf, 'test')

        data = ru.read_json('test/entk_workflow.json')
        assert len(data) == len(wf) + 1

        # compare sorted key lists: a key view never equals a list on
        # Python 3, and the key order is not what is being tested
        stack = data.pop(0)
        assert sorted(stack.keys()) == ['stack']
        assert sorted(stack['stack'].keys()) == sorted(['sys', 'radical'])
        assert sorted(stack['stack']['sys'].keys()) == sorted(
            ["python", "pythonpath", "virtualenv"])
        assert sorted(stack['stack']['radical'].keys()) == sorted(
            ['saga', 'radical.pilot', 'radical.utils', 'radical.entk'])

        for p_cnt, p in enumerate(data):

            pipe = wf[p_cnt]
            assert p['uid'] == pipe.uid
            assert p['name'] == pipe.name
            assert p['state_history'] == pipe.state_history

            for s_cnt, s in enumerate(p['stages']):

                stage = pipe.stages[s_cnt]
                assert s['uid'] == stage.uid
                assert s['name'] == stage.name
                assert s['state_history'] == stage.state_history

                for t in stage.tasks:
                    assert t.to_dict() in s['tasks']

    finally:
        # the directory may not exist if the setup failed early; do not let
        # the cleanup mask the original error
        shutil.rmtree('test', ignore_errors=True)
def register_publisher(self, pubsub):
    '''
    Register the component as publisher on the given pubsub channel.

    The channel must not have been registered before.  The publisher address
    is read from the bridge's config file `<cfg.path>/<pubsub>.cfg` (key
    `pub`).
    '''

    assert pubsub not in self._publishers

    # dig the addresses from the bridge's config file
    bridge_cfg = ru.read_json('%s/%s.cfg' % (self._cfg.path, pubsub))

    self._publishers[pubsub] = ru.zmq.Publisher(pubsub,
                                                url=bridge_cfg['pub'],
                                                log=self._log)

    self._log.debug('registered publisher for %s', pubsub)
def _pilots_backfill(self, requests):
    '''
    Create pilot descriptions for backfill pilots.

    For each request, the policy file `policies/<policy>.json` (next to this
    module) provides `max_cores` and `max_walltime` (defaults: `MAX_CORES`,
    `MAX_WALLTIME`).  `get_backfill()` is queried with those limits for the
    request's partition, and one pilot description is created per returned
    chunk, using resource, project and queue from the request.

    NOTE: the 'backfill' key is removed from each request dict.

    Returns a list of `rp.ComputePilotDescription` instances.
    '''

    self._rep.info('\nrequesting backfilled pilots\n')

    descriptions = list()

    for request in requests:

        del request['backfill']

        policy_name = request['policy']
        partition_name = request['partition']

        here = os.path.dirname(__file__)
        policy = ru.read_json('%s/policies/%s.json' % (here, policy_name))

        max_cores = policy.get('max_cores', MAX_CORES)
        max_walltime = policy.get('max_walltime', MAX_WALLTIME)

        self._rep.info('\nrequesting backfill pilots\n')
        chunks = get_backfill(partition_name, max_cores, max_walltime)

        # the partition reported per chunk is not used
        for [_, cores, walltime] in chunks:

            pd = dict()
            pd['resource'] = request.get('resource', 'local.localhost')
            pd['project'] = request.get('project')
            pd['queue'] = request.get('queue')
            pd['cores'] = cores
            pd['runtime'] = walltime

            self._rep.ok('backfill @ %s [%5dcores * %4dmin] @ %10s(%10s)]\n'
                         % (pd['resource'], pd['cores'], pd['runtime'],
                            pd['queue'], pd['project']))

            # pprint.pprint(pd)

            descriptions.append(rp.ComputePilotDescription(pd))

    return descriptions
def write_workflow(workflow, uid, workflow_fout='entk_workflow', fwrite=True):
    """Describe `workflow` as a list of dicts and optionally store it as JSON.

    The target file is ``<uid>/<workflow_fout>.json``.  If it already exists,
    its content is loaded first; then an entry with the software stack
    (``ru.stack()``) and one entry per pipeline are appended.

    Args:
        workflow: iterable of pipelines; each pipeline provides `uid`,
            `name`, `state_history` and `stages`, each stage provides `uid`,
            `name`, `state_history` and `tasks`, and each task `to_dict()`.
        uid: name of the target directory (created if needed).
        workflow_fout: base name of the JSON file, without extension.
        fwrite: if true, write the data to the file; otherwise only return
            it.

    Returns:
        ``0`` if the data was written to file, otherwise the data list.
    """
    try:
        os.mkdir(uid)
    except OSError:
        # most commonly: the directory already exists.  Any real problem
        # surfaces when the file is written below.
        pass

    fname = '%s/%s.json' % (uid, workflow_fout)

    data = list()
    if os.path.isfile(fname):
        data = ru.read_json(fname)

    stack = ru.stack()
    data.append({'stack': stack})

    for pipe in workflow:

        p = dict()
        p['uid'] = pipe.uid
        p['name'] = pipe.name
        p['state_history'] = pipe.state_history
        p['stages'] = list()

        for stage in pipe.stages:

            s = dict()
            s['uid'] = stage.uid
            s['name'] = stage.name
            s['state_history'] = stage.state_history
            s['tasks'] = [task.to_dict() for task in stage.tasks]

            p['stages'].append(s)

        data.append(p)

    if fwrite:
        ru.write_json(data, fname)
        return 0

    return data
def _configure(self):
    """Connect to Flux, set up the executor queue and create the launcher.

    The Flux URI is taken from the resource manager info in the component
    config; the queue address is read from the bridge config file
    ``<cfg.path>/<queue name>.cfg`` (key ``put``).
    """
    import flux

    lm_info = self._cfg['rm_info']['lm_info']
    self._flux = flux.Flux(url=lm_info['flux_env']['FLUX_URI'])

    # don't advance tasks via the component's `advance()`, but push them
    # toward the executor *without state change* - state changes are
    # performed in retrospect by the executor, based on the scheduling and
    # execution events collected from Flux.
    queue_name = rpc.AGENT_EXECUTING_QUEUE
    bridge_cfg = ru.read_json('%s/%s.cfg' % (self._cfg.path, queue_name))
    self._q = ru.zmq.Putter(queue_name, bridge_cfg['put'])

    # create job spec via the flux LM
    self._lm = LaunchMethod.create(name='FLUX',
                                   cfg=self._cfg,
                                   session=self._session)
def push_tasks(bulk_id, unit):
    '''
    Once a bulk of tasks has been executed, push the resulting jsons back to
    the QCArchive service endpoint.  The results are read from the file named
    in each unit's `metadata['fout']`, which the executor needs to fetch back
    to localhost.  Units which are not in state `rp.DONE` are collected and
    returned in a separate bulk, using the `shutdown` operation.

    bulk_id: name of the bulk, used as `meta.name` of the payloads
    unit   : the executed units of the bulk (a list or tuple); a single unit
             is accepted as well and is handled as a bulk of one

    Nothing is sent if the bulk is empty.
    '''

    # The parameter is named `unit` for backward compatibility, but carries
    # the units of the bulk.
    units = unit if isinstance(unit, (list, tuple)) else [unit]

    data_ok = dict()
    data_nok = list()

    for u in units:

        if u.state == rp.DONE:
            fout = u.metadata['fout']  # FIXME: implies data staging
            result = ru.read_json(fout)
            data_ok[u.name] = (result, 'single', [])

        else:
            data_nok.append(u.name)

    if data_ok:
        payload = {"meta": {"name": bulk_id},
                   "data": data_ok}
        r = requests.post(address + "queue_manager", json=payload,
                          verify=False)
        print('%s ok: %s' % (bulk_id, r.json()))

    if data_nok:
        payload = {"meta": {"name": bulk_id,
                            "operation": 'shutdown'},
                   "data": data_nok}
        r = requests.put(address + "queue_manager", json=payload,
                         verify=False)
        print('%s nok: %s' % (bulk_id, r))
def _read_config(self, config_path, hostname, port, reattempts,
                 resubmit_failed, autoterminate, write_workflow, rts,
                 rmq_cleanup, rts_config):
    """Populate the instance settings from arguments and ``config.json``.

    ``config.json`` is read from `config_path` (default: the directory of
    this file).  Hostname, port and reattempts fall back to the config file
    when the argument is falsy; the remaining settings fall back only when
    the argument is ``None``.  `rts` falls back unless it is one of
    'radical.pilot' or 'mock'.  The port is converted to ``int``.
    """
    if not config_path:
        config_path = os.path.dirname(os.path.abspath(__file__))

    config = ru.read_json(os.path.join(config_path, 'config.json'))

    def _unless_none(value, key):
        # the config is only consulted if no explicit value was given
        return config[key] if value is None else value

    self._mq_hostname = hostname or str(config['hostname'])
    self._port = int(port or config['port'])
    self._reattempts = reattempts or config['reattempts']

    self._resubmit_failed = _unless_none(resubmit_failed, 'resubmit_failed')
    self._autoterminate = _unless_none(autoterminate, 'autoterminate')
    self._write_workflow = _unless_none(write_workflow, 'write_workflow')

    # anything but a known RTS name is replaced by the configured RTS
    known_rts = ['radical.pilot', 'mock']
    self._rts = rts if rts in known_rts else str(config['rts'])

    self._rmq_cleanup = _unless_none(rmq_cleanup, 'rmq_cleanup')
    self._rts_config = _unless_none(rts_config, 'rts_config')

    self._num_pending_qs = config['pending_qs']
    self._num_completed_qs = config['completed_qs']
def createWorkload(inputFile, nthreads):
    '''
    Create a workload from the JSON matrix stored in inputFile.

    Every row of the matrix describes one pipeline.  Each entry of a row
    describes one stage, i.e. a bag of tasks: the i-th stage of the j-th
    pipeline contains as many tasks as given by entry (j, i).  All tasks are
    created via CUDef.createTAUGromacsCU(nthreads).

    Returns the tuple (workload, totNumCUs), where workload is a list of
    pipelines, each pipeline a list of stages, each stage a list of tasks,
    and totNumCUs is the sum of all matrix entries.
    '''

    workload = []
    totNumCUs = 0

    for pipelineDesc in ru.read_json(inputFile):

        stages = []

        for bagSize in pipelineDesc:

            totNumCUs += bagSize

            # All CUs of a bag are created the same way.  To try /bin/date
            # instead, use CUDef.createDateCU() here.
            stages.append([CUDef.createTAUGromacsCU(nthreads)
                           for _ in range(bagSize)])

        workload.append(stages)

    return (workload, totNumCUs)
def setUp(test_type, test_name):
    """Collect the test cases defined for `test_type` / `test_name`.

    Every ``tests/test_cases/unit.*.json`` file is read.  The unit
    description is merged with the setup section for `test_type`
    (``ru.PRESERVE`` merge policy).  Cases without a result for
    `test_type` / `test_name` are left out.

    Returns:
        list of ``[test, result]`` entries, or
        ``[test, result, resource_file, resource_filename]`` where both
        resource entries are defined for `test_name`.
    """
    cases = []

    for path in glob.glob('tests/test_cases/unit.*.json'):

        case = ru.read_json(path)
        results = case['results']

        unit = case['unit']
        setup = case['setup'].get(test_type, {})
        expected = results.get(test_type, {}).get(test_name)
        res_file = results.get('resource_file', {}).get(test_name)
        res_name = results.get('resource_filename', {}).get(test_name)

        merged = ru.dict_merge(unit, setup, ru.PRESERVE)

        if not expected:
            continue

        entry = [merged, expected]
        if res_file and res_name:
            entry += [res_file, res_name]

        cases.append(entry)

    return cases
def setUpClass(cls):
    """Initialize tests, just creates instance variables needed."""
    super(AcceptanceTests, cls).setUpClass()

    # Placeholders for the resource, the session, the pilot manager and the
    # unit manager; nothing is created here.
    cls.resource = cls.session = cls.pmgr = cls.umgr = None

    # Read in configuration from the directory of this file
    config_dir = os.path.dirname(os.path.abspath(__file__))
    cls.config = ru.read_json('%s/config.json' % config_dir)

    # Number of Compute Units (CUs) to run
    cls.n = 128
def test_duration_method_with_data_from_run_with_execution_barriers():
    '''
    Check that the duration obtained from the analytics function equals the
    duration obtained from the utils function, and that it is smaller than
    the 'max - min' duration (of the FINAL and INITIAL states respectively).

    The data set holds profiles of a run in which not all units executed
    concurrently: there is an execution barrier, and thus a 'gap', between
    their executions.
    '''

    here = os.path.dirname(os.path.realpath(__file__))
    data_loc = '{0}/barrier_data'.format(here)

    # use the first json file found in the data directory
    first_json = glob.glob('{0}/*.json'.format(data_loc))[0]

    # the content is not used, but the file has to be readable
    _content = ru.read_json(first_json)

    # session ID: file name without the '.json' suffix
    sid = os.path.basename(first_json)[:-5]

    session = ra.Session(sid, 'radical.pilot', src='{0}/'.format(data_loc))

    assert get_duration_using_analytics(session) == \
           get_duration_using_utils(session)
    assert get_duration_using_analytics(session) < \
           get_duration_using_minmax(session)
def test_write_workflow():
    """`write_workflow` must store all pipelines, stages and tasks.

    The workflow is written to ``test/entk_workflow.json`` and the file
    content is compared against the workflow.  The ``test`` directory is
    removed afterwards, also if a check fails, so that a later run does not
    append to a stale file.
    """
    try:
        wf = [generate_pipeline(1), generate_pipeline(2)]

        amgr = AppManager(hostname=hostname, port=port)
        amgr.workflow = wf
        amgr._wfp = WFprocessor(sid=amgr._sid,
                                workflow=amgr._workflow,
                                pending_queue=amgr._pending_queue,
                                completed_queue=amgr._completed_queue,
                                mq_hostname=amgr._mq_hostname,
                                port=amgr._port,
                                resubmit_failed=amgr._resubmit_failed)
        amgr._wfp._initialize_workflow()
        wf = amgr._wfp.workflow

        write_workflow(wf, 'test')

        data = ru.read_json('test/entk_workflow.json')
        assert len(data) == len(wf)

        for p_cnt, p in enumerate(data):

            pipe = wf[p_cnt]
            assert p['uid'] == pipe.uid
            assert p['name'] == pipe.name
            assert p['state_history'] == pipe.state_history

            for s_cnt, s in enumerate(p['stages']):

                stage = pipe.stages[s_cnt]
                assert s['uid'] == stage.uid
                assert s['name'] == stage.name
                assert s['state_history'] == stage.state_history

                for t in stage.tasks:
                    assert t.to_dict() in s['tasks']

    finally:
        # the directory may not exist if the setup failed early; do not let
        # the cleanup mask the original error
        shutil.rmtree('test', ignore_errors=True)
def register_output(self, states, output=None):
    '''
    Associate the given states with an output queue.

    `states` can be a single state or a list of states.  Each state gets its
    own output entry: if `output` is given, a `ru.zmq.Putter` is created for
    it, with the address read from the bridge's config file
    `<cfg.path>/<output>.cfg` (key `put`).  Without `output`, the state is
    registered with `None` as output, which marks it as a final state.

    Registering a state again replaces the earlier entry (a warning is
    logged).
    '''

    state_list = states if isinstance(states, list) else [states]

    for state in state_list:

        self._log.debug('%s register output %s:%s', self.uid, state, output)

        # we want a *unique* output queue for each state.
        if state in self._outputs:
            self._log.warn("%s replaces output for %s : %s -> %s"
                           % (self.uid, state, self._outputs[state], output))

        if not output:
            # this indicates a final state
            self._outputs[state] = None
            continue

        # non-final state, ie. we want a queue to push to;
        # dig the addresses from the bridge's config file
        bridge_cfg = ru.read_json('%s/%s.cfg' % (self._cfg.path, output))
        self._outputs[state] = ru.zmq.Putter(output, url=bridge_cfg['put'])
def test_executor_write_profile():
    """`Executor._write_profile` must store the profile as JSON.

    A profile with ten randomly timed tasks is attached to the executor and
    written to ``<test dir>/test.<executor uid>.prof``; the file content is
    then compared with the profile.  The generated profile files are removed
    afterwards, also if a check fails.
    """
    fpath = os.path.dirname(os.path.abspath(__file__))
    executor = Executor(cfg_path='%s/../config_test.yml' % fpath)

    engine_uid = 'engine.0000'
    output = list()

    for x in range(10):

        task = Task()
        task.exec_core = 'core.%s' % x
        task.start_time = random()
        task.end_time = random()

        output.append({'task': task.uid,
                       'ops': task.ops,
                       'core': task.exec_core,
                       'start_time': task.start_time,
                       'end_time': task.end_time,
                       'exec_time': task.end_time - task.start_time})

    executor._profile[engine_uid] = output
    executor._profile_loc = '%s/test.prof' % fpath

    try:
        executor._write_profile()

        prof_path = '%s/test.%s.prof' % (fpath, executor._uid)
        assert os.path.isfile(prof_path)

        prof = ru.read_json(prof_path)
        assert engine_uid in prof
        assert output == prof[engine_uid]

    finally:
        # the profile is written next to this file, not into the current
        # working directory -- clean up where it was actually created
        for f in glob('%s/test.*.prof' % fpath):
            os.remove(f)
def run(self):
    """Starts the process when Process.start() is called.

    Main loop of the pilot launcher.  After connecting to the session's
    database, the loop runs until `self._terminate` is set and

      - calls `check_pilot_states()` every `JOB_CHECK_INTERVAL` seconds;
      - while `self._disabled` is set, moves pilots of this pilot manager
        from PENDING_LAUNCH to CANCELED instead of launching them;
      - otherwise picks one PENDING_LAUNCH pilot, moves it to LAUNCHING,
        stages the bootstrapper, agent config (and, if needed, sdists and
        CA certs), submits the bootstrapper as a SAGA job, and records
        PENDING_ACTIVE.  Any exception during launch marks the pilot as
        FAILED in the database.

    A `SystemExit` raised anywhere in the loop is logged and forwarded to
    the main thread via `thread.interrupt_main()`.
    """

    global JOB_CHECK_INTERVAL

    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database
        try:
            db = self._session.get_db()
            pilot_col = db["%s.p" % self._session.uid]
            logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)

        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._terminate.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing and check if the
            # corresponding SAGA job is still pending in the queue.  If that
            # is not the case, we assume that the job has failed for some
            # reasons and update the state of the ComputePilot accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            if self._disabled.is_set():
                # don't process any new pilot start requests.
                # NOTE: this is not clean, in principle there could be other
                #       launchers alive which want to still start those
                #       pending pilots.  In practice we only ever use one
                #       pmgr though, and its during its shutdown that we get
                #       here...
                ts = time.time()
                compute_pilot = pilot_col.find_and_modify(
                    query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                    update={
                        "$set": {"state": CANCELED},
                        "$push": {"statehistory": {"state": CANCELED, "timestamp": ts}},
                    },
                )

                # run state checks more frequently.
                JOB_CHECK_INTERVAL = 3
                time.sleep(1)
                continue

            # See if we can find a ComputePilot that is waiting to be
            # launched.  If we find one, we use SAGA to create a job service,
            # a job description and a job that is then send to the local or
            # remote queueing system.  If this succeeds, we set the
            # ComputePilot's state to pending, otherwise to failed.
            compute_pilot = None

            ts = time.time()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                update={
                    "$set": {"state": LAUNCHING},
                    "$push": {"statehistory": {"state": LAUNCHING, "timestamp": ts}},
                },
            )

            if not compute_pilot:
                time.sleep(IDLE_TIMER)

            else:
                try:
                    # ------------------------------------------------------
                    #
                    # LAUNCH THE PILOT AGENT VIA SAGA
                    #
                    logentries = []
                    pilot_id = str(compute_pilot["_id"])

                    logger.info("Launching ComputePilot %s" % pilot_id)

                    # ------------------------------------------------------
                    # Database connection parameters
                    session_id = self._session.uid
                    database_url = self._session.dburl

                    # ------------------------------------------------------
                    # pilot description and resource configuration
                    number_cores = compute_pilot["description"]["cores"]
                    runtime = compute_pilot["description"]["runtime"]
                    queue = compute_pilot["description"]["queue"]
                    project = compute_pilot["description"]["project"]
                    cleanup = compute_pilot["description"]["cleanup"]
                    resource_key = compute_pilot["description"]["resource"]
                    schema = compute_pilot["description"]["access_schema"]
                    memory = compute_pilot["description"]["memory"]
                    candidate_hosts = compute_pilot["description"]["candidate_hosts"]
                    pilot_sandbox = compute_pilot["sandbox"]
                    global_sandbox = compute_pilot["global_sandbox"]

                    # we expand and exchange keys in the resource config,
                    # depending on the selected schema so better use a deep
                    # copy..
                    resource_cfg = self._session.get_resource_config(resource_key, schema)

                    # import pprint
                    # pprint.pprint (resource_cfg)

                    # ------------------------------------------------------
                    # get parameters from cfg, set defaults where needed
                    agent_launch_method = resource_cfg.get("agent_launch_method")
                    agent_dburl = resource_cfg.get("agent_mongodb_endpoint", database_url)
                    agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                    agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                    rc_agent_config = resource_cfg.get("agent_config", DEFAULT_AGENT_CONFIG)
                    agent_scheduler = resource_cfg.get("agent_scheduler")
                    tunnel_bind_device = resource_cfg.get("tunnel_bind_device")
                    default_queue = resource_cfg.get("default_queue")
                    forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                    js_endpoint = resource_cfg.get("job_manager_endpoint")
                    lrms = resource_cfg.get("lrms")
                    mpi_launch_method = resource_cfg.get("mpi_launch_method")
                    pre_bootstrap_1 = resource_cfg.get("pre_bootstrap_1")
                    pre_bootstrap_2 = resource_cfg.get("pre_bootstrap_2")
                    python_interpreter = resource_cfg.get("python_interpreter")
                    spmd_variation = resource_cfg.get("spmd_variation")
                    task_launch_method = resource_cfg.get("task_launch_method")
                    rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                    virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                    virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                    stage_cacerts = resource_cfg.get("stage_cacerts", "False")
                    cores_per_node = resource_cfg.get("cores_per_node")
                    shared_filesystem = resource_cfg.get("shared_filesystem", True)
                    health_check = resource_cfg.get("health_check", True)
                    python_dist = resource_cfg.get("python_dist")
                    cu_pre_exec = resource_cfg.get("cu_pre_exec")
                    cu_post_exec = resource_cfg.get("cu_post_exec")
                    export_to_cu = resource_cfg.get("export_to_cu")

                    # Agent configuration that is not part of the public API.
                    # The agent config can either be a config dict, or
                    # a string pointing to a configuration name.  If neither
                    # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
                    # set.  The last fallback is 'agent_default'
                    agent_config = compute_pilot["description"].get("_config")
                    if not agent_config:
                        agent_config = os.environ.get("RADICAL_PILOT_AGENT_CONFIG")
                    if not agent_config:
                        agent_config = rc_agent_config

                    if isinstance(agent_config, dict):
                        # nothing to do
                        agent_cfg_dict = agent_config

                    elif isinstance(agent_config, basestring):
                        try:
                            if os.path.exists(agent_config):
                                # try to open as file name
                                # The user-level override lookup below needs
                                # the file name for this branch, too.  It
                                # used to be left unbound here, or stale from
                                # a previous loop iteration.
                                agent_cfg_file = agent_config
                                logger.info("Read agent config file: %s" % agent_config)
                                agent_cfg_dict = ru.read_json(agent_config)
                            else:
                                # otherwise interpret as a config name
                                module_path = os.path.dirname(os.path.abspath(__file__))
                                config_path = "%s/../configs/" % module_path
                                agent_cfg_file = os.path.join(config_path, "agent_%s.json" % agent_config)
                                logger.info("Read agent config file: %s" % agent_cfg_file)
                                agent_cfg_dict = ru.read_json(agent_cfg_file)

                            # no matter how we read the config file, we
                            # allow for user level overload
                            cfg_base = os.path.basename(agent_cfg_file)
                            user_cfg = "%s/.radical/pilot/config/%s" % (os.environ["HOME"], cfg_base)

                            if os.path.exists(user_cfg):
                                logger.info("merging user config: %s" % user_cfg)
                                user_cfg_dict = ru.read_json(user_cfg)
                                ru.dict_merge(agent_cfg_dict, user_cfg_dict, policy="overwrite")

                        except Exception as e:
                            logger.exception("Error reading agent config file: %s" % e)
                            raise

                    else:
                        # we can't handle this type
                        raise TypeError("agent config must be string (filename) or dict")

                    # TODO: use booleans all the way?
                    # The config may carry either the string "true"/"false"
                    # or a JSON boolean -- accept both instead of failing on
                    # `.lower()` for booleans.
                    if isinstance(stage_cacerts, basestring):
                        stage_cacerts = stage_cacerts.lower() == "true"
                    else:
                        stage_cacerts = bool(stage_cacerts)

                    # expand variables in virtenv string
                    virtenv = virtenv % {
                        "pilot_sandbox": saga.Url(pilot_sandbox).path,
                        "global_sandbox": saga.Url(global_sandbox).path,
                    }

                    # Check for deprecated global_virtenv
                    global_virtenv = resource_cfg.get("global_virtenv")
                    if global_virtenv:
                        logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                        virtenv = global_virtenv
                        virtenv_mode = "use"

                    # Create a host:port string for use by the bootstrap_1.
                    db_url = saga.Url(agent_dburl)
                    if db_url.port:
                        db_hostport = "%s:%d" % (db_url.host, db_url.port)
                    else:
                        db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

                    # Open the remote sandbox
                    # TODO: make conditional on shared_fs?
                    sandbox_tgt = saga.filesystem.Directory(
                        pilot_sandbox, session=self._session, flags=saga.filesystem.CREATE_PARENTS
                    )

                    LOCAL_SCHEME = "file"

                    # ------------------------------------------------------
                    # Copy the bootstrap shell script.
                    # This also creates the sandbox.
                    BOOTSTRAPPER_SCRIPT = "bootstrap_1.sh"
                    bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, BOOTSTRAPPER_SCRIPT))

                    msg = "Using bootstrapper %s" % bootstrapper_path
                    logentries.append(Logentry(msg, logger=logger.info))

                    bs_script_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, bootstrapper_path))

                    msg = "Copying bootstrapper '%s' to agent sandbox (%s)." % (bs_script_url, sandbox_tgt)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    if shared_filesystem:
                        sandbox_tgt.copy(bs_script_url, BOOTSTRAPPER_SCRIPT)

                    # ------------------------------------------------------
                    # the version of the agent is derived from
                    # rp_version, which has the following format
                    # and interpretation:
                    #
                    # case rp_version:
                    #   @<token>:
                    #   @tag/@branch/@commit: # no sdist staging
                    #       git clone $github_base radical.pilot.src
                    #       (cd radical.pilot.src && git checkout token)
                    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                    #       rm -rf radical.pilot.src
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   release: # no sdist staging
                    #       pip install -t $VIRTENV/rp_install radical.pilot
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   local: # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $VIRTENV/rp_install $sdist/
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   debug: # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $SANDBOX/rp_install $sdist/
                    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                    #
                    #   installed: # no sdist staging
                    #       true
                    # esac
                    #
                    # virtenv_mode
                    #   private : error  if ve exists, otherwise create, then use
                    #   update  : update if ve exists, otherwise create, then use
                    #   create  : use    if ve exists, otherwise create, then use
                    #   use     : use    if ve exists, otherwise error,  then exit
                    #   recreate: delete if ve exists, otherwise create, then use
                    #
                    # examples   :
                    #   virtenv@<tag>
                    #   virtenv@devel
                    #   virtenv@release
                    #   virtenv@installed
                    #   stage@local
                    #   stage@/tmp/my_agent.py
                    #
                    # Note that some combinations may be invalid,
                    # specifically in the context of virtenv_mode.  If, for
                    # example, virtenv_mode is 'use', then the 'virtenv:tag'
                    # will not make sense, as the virtenv is not updated.
                    # In those cases, the virtenv_mode is honored, and
                    # a warning is printed.
                    #
                    # Also, the 'stage' mode can only be combined with the
                    # 'local' source, or with a path to the agent (relative
                    # to mod_dir, or absolute).
                    #
                    # A rp_version which does not start with '@' and is not
                    # one of 'installed', 'local' or 'debug' is rejected
                    # with a ValueError.
                    # NOTE(review): 'release' is described above and handled
                    #               below, but is not accepted by this check
                    #               -- confirm whether that is intended.
                    if not rp_version.startswith("@") and not rp_version in ["installed", "local", "debug"]:
                        raise ValueError("invalid rp_version '%s'" % rp_version)

                    stage_sdist = True
                    if rp_version in ["installed", "release"]:
                        stage_sdist = False

                    if rp_version.startswith("@"):
                        stage_sdist = False
                        rp_version = rp_version[1:]  # strip '@'

                    # ------------------------------------------------------
                    # Copy the rp sdist if needed.  We actually also stage
                    # the sdists for radical.utils and radical.saga, so that
                    # we have the complete stack to install...
                    if stage_sdist:

                        for sdist_path in [ru.sdist_path, saga.sdist_path, rp_sdist_path]:

                            sdist_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, sdist_path))
                            msg = "Copying sdist '%s' to sandbox (%s)." % (sdist_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))
                            if shared_filesystem:
                                sandbox_tgt.copy(sdist_url, os.path.basename(str(sdist_url)))

                    # ------------------------------------------------------
                    # Some machines cannot run pip due to outdated CA certs.
                    # For those, we also stage an updated certificate bundle
                    if stage_cacerts:
                        cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz"))

                        cc_url = saga.Url("%s://localhost/%s" % (LOCAL_SCHEME, cc_path))
                        msg = "Copying CA certificate bundle '%s' to sandbox (%s)." % (cc_url, pilot_sandbox)
                        logentries.append(Logentry(msg, logger=logger.debug))
                        if shared_filesystem:
                            sandbox_tgt.copy(cc_url, os.path.basename(str(cc_url)))

                    # ------------------------------------------------------
                    # sanity checks
                    if not python_dist:
                        raise RuntimeError("missing python distribution")
                    if not agent_spawner:
                        raise RuntimeError("missing agent spawner")
                    if not agent_scheduler:
                        raise RuntimeError("missing agent scheduler")
                    if not lrms:
                        raise RuntimeError("missing LRMS")
                    if not agent_launch_method:
                        raise RuntimeError("missing agentlaunch method")
                    if not task_launch_method:
                        raise RuntimeError("missing task launch method")

                    # massage some values
                    if not queue:
                        queue = default_queue

                    if cleanup and isinstance(cleanup, bool):
                        cleanup = "luve"  # l : log files
                                          # u : unit work dirs
                                          # v : virtualenv
                                          # e : everything (== pilot sandbox)

                        # we never cleanup virtenvs which are not private.
                        # This must be a value comparison: an identity test
                        # (`is not`) against a string literal is effectively
                        # always true for config-loaded strings.
                        if virtenv_mode != "private":
                            cleanup = cleanup.replace("v", "")

                    sdists = ":".join([ru.sdist_name, saga.sdist_name, rp_sdist_name])

                    # if cores_per_node is set (!= None), then we need to
                    # allocation full nodes, and thus round up
                    if cores_per_node:
                        cores_per_node = int(cores_per_node)
                        number_cores = int(cores_per_node * math.ceil(float(number_cores) / cores_per_node))

                    # set mandatory args
                    bootstrap_args = ""
                    bootstrap_args += " -d '%s'" % sdists
                    bootstrap_args += " -m '%s'" % virtenv_mode
                    bootstrap_args += " -p '%s'" % pilot_id
                    bootstrap_args += " -r '%s'" % rp_version
                    bootstrap_args += " -s '%s'" % session_id
                    bootstrap_args += " -v '%s'" % virtenv
                    bootstrap_args += " -b '%s'" % python_dist

                    # set optional args
                    if agent_type:
                        bootstrap_args += " -a '%s'" % agent_type
                    if lrms == "CCM":
                        bootstrap_args += " -c"
                    if pre_bootstrap_1:
                        bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap_1)
                    if pre_bootstrap_2:
                        bootstrap_args += " -w '%s'" % "' -w '".join(pre_bootstrap_2)
                    if forward_tunnel_endpoint:
                        # the db host:port is only passed along with a tunnel
                        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        bootstrap_args += " -h '%s'" % db_hostport
                    if python_interpreter:
                        bootstrap_args += " -i '%s'" % python_interpreter
                    if tunnel_bind_device:
                        bootstrap_args += " -t '%s'" % tunnel_bind_device
                    if cleanup:
                        bootstrap_args += " -x '%s'" % cleanup

                    # set some agent configuration
                    agent_cfg_dict["cores"] = number_cores
                    agent_cfg_dict["resource_cfg"] = resource_cfg
                    agent_cfg_dict["debug"] = os.environ.get(
                        "RADICAL_PILOT_AGENT_VERBOSE", logger.getEffectiveLevel()
                    )
                    agent_cfg_dict["mongodb_url"] = str(agent_dburl)
                    agent_cfg_dict["lrms"] = lrms
                    agent_cfg_dict["spawner"] = agent_spawner
                    agent_cfg_dict["scheduler"] = agent_scheduler
                    agent_cfg_dict["runtime"] = runtime
                    agent_cfg_dict["pilot_id"] = pilot_id
                    agent_cfg_dict["session_id"] = session_id
                    agent_cfg_dict["agent_launch_method"] = agent_launch_method
                    agent_cfg_dict["task_launch_method"] = task_launch_method
                    agent_cfg_dict["export_to_cu"] = export_to_cu
                    agent_cfg_dict["cu_pre_exec"] = cu_pre_exec
                    agent_cfg_dict["cu_post_exec"] = cu_post_exec
                    if mpi_launch_method:
                        agent_cfg_dict["mpi_launch_method"] = mpi_launch_method
                    if cores_per_node:
                        agent_cfg_dict["cores_per_node"] = cores_per_node

                    # ------------------------------------------------------
                    # Write agent config dict to a json file in pilot sandbox.
                    cfg_tmp_dir = tempfile.mkdtemp(prefix="rp_agent_cfg_dir")
                    agent_cfg_name = "agent_0.cfg"
                    cfg_tmp_file = os.path.join(cfg_tmp_dir, agent_cfg_name)
                    cfg_tmp_handle = os.open(cfg_tmp_file, os.O_WRONLY | os.O_CREAT)

                    # Convert dict to json file
                    msg = "Writing agent configuration to file '%s'." % cfg_tmp_file
                    logentries.append(Logentry(msg, logger=logger.debug))
                    ru.write_json(agent_cfg_dict, cfg_tmp_file)

                    cf_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, cfg_tmp_file))
                    msg = "Copying agent configuration file '%s' to sandbox (%s)." % (cf_url, pilot_sandbox)
                    logentries.append(Logentry(msg, logger=logger.debug))
                    if shared_filesystem:
                        sandbox_tgt.copy(cf_url, agent_cfg_name)

                    # Close agent config file
                    os.close(cfg_tmp_handle)

                    # ------------------------------------------------------
                    # Done with all transfers to pilot sandbox, close handle
                    sandbox_tgt.close()

                    # ------------------------------------------------------
                    # now that the scripts are in place and configured,
                    # we can launch the agent
                    js_url = saga.Url(js_endpoint)
                    logger.debug("saga.job.Service ('%s')" % js_url)
                    if js_url in self._shared_worker_data["job_services"]:
                        js = self._shared_worker_data["job_services"][js_url]
                    else:
                        js = saga.job.Service(js_url, session=self._session)
                        self._shared_worker_data["job_services"][js_url] = js

                    # ------------------------------------------------------
                    # Create SAGA Job description and submit the pilot job
                    jd = saga.job.Description()

                    jd.executable = "/bin/bash"
                    jd.arguments = ["-l %s" % BOOTSTRAPPER_SCRIPT, bootstrap_args]
                    jd.working_directory = saga.Url(pilot_sandbox).path
                    jd.project = project
                    jd.output = "bootstrap_1.out"
                    jd.error = "bootstrap_1.err"
                    jd.total_cpu_count = number_cores
                    jd.processes_per_host = cores_per_node
                    jd.wall_time_limit = runtime
                    jd.total_physical_memory = memory
                    jd.queue = queue
                    jd.candidate_hosts = candidate_hosts
                    jd.environment = dict()

                    # TODO: not all files might be required, this also needs to be made conditional
                    if not shared_filesystem:
                        jd.file_transfer = [
                            #'%s > %s' % (bootstrapper_path, os.path.basename(bootstrapper_path)),
                            "%s > %s"
                            % (
                                bootstrapper_path,
                                os.path.join(jd.working_directory, "input", os.path.basename(bootstrapper_path)),
                            ),
                            "%s > %s" % (cfg_tmp_file, os.path.join(jd.working_directory, "input", agent_cfg_name)),
                            #'%s < %s' % ('agent.log', os.path.join(jd.working_directory, 'agent.log')),
                            #'%s < %s' % (os.path.join(jd.working_directory, 'agent.log'), 'agent.log'),
                            #'%s < %s' % ('agent.log', 'agent.log'),
                            #'%s < %s' % (os.path.join(jd.working_directory, 'STDOUT'), 'unit.000000/STDOUT'),
                            #'%s < %s' % (os.path.join(jd.working_directory, 'unit.000000/STDERR'), 'STDERR')
                            #'%s < %s' % ('unit.000000/STDERR', 'unit.000000/STDERR')
                            # TODO: This needs to go into a per pilot directory on the submit node
                            "%s < %s" % ("pilot.0000.log.tgz", "pilot.0000.log.tgz"),
                        ]

                        if stage_sdist:
                            jd.file_transfer.extend(
                                [
                                    #'%s > %s' % (rp_sdist_path, os.path.basename(rp_sdist_path)),
                                    "%s > %s"
                                    % (
                                        rp_sdist_path,
                                        os.path.join(
                                            jd.working_directory, "input", os.path.basename(rp_sdist_path)
                                        ),
                                    ),
                                    #'%s > %s' % (saga.sdist_path, os.path.basename(saga.sdist_path)),
                                    "%s > %s"
                                    % (
                                        saga.sdist_path,
                                        os.path.join(
                                            jd.working_directory, "input", os.path.basename(saga.sdist_path)
                                        ),
                                    ),
                                    #'%s > %s' % (ru.sdist_path, os.path.basename(ru.sdist_path)),
                                    "%s > %s"
                                    % (
                                        ru.sdist_path,
                                        os.path.join(
                                            jd.working_directory, "input", os.path.basename(ru.sdist_path)
                                        ),
                                    ),
                                ]
                            )

                        if stage_cacerts:
                            jd.file_transfer.append(
                                "%s > %s"
                                % (cc_path, os.path.join(jd.working_directory, "input", os.path.basename(cc_path)))
                            )

                        if "RADICAL_PILOT_PROFILE" in os.environ:
                            # TODO: This needs to go into a per pilot directory on the submit node
                            jd.file_transfer.append("%s < %s" % ("pilot.0000.prof.tgz", "pilot.0000.prof.tgz"))

                    # Set the SPMD variation only if required
                    if spmd_variation:
                        jd.spmd_variation = spmd_variation

                    if "RADICAL_PILOT_PROFILE" in os.environ:
                        jd.environment["RADICAL_PILOT_PROFILE"] = "TRUE"

                    logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                    msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                    logentries.append(Logentry(msg, logger=logger.debug))

                    try:
                        pilotjob = js.create_job(jd)
                    except saga.BadParameter as e:
                        raise ValueError("Pilot submission to %s failed: %s" % (resource_key, e))
                    pilotjob.run()

                    # Clean up agent config file and dir after submission
                    os.unlink(cfg_tmp_file)
                    os.rmdir(cfg_tmp_dir)

                    # do a quick error check
                    if pilotjob.state == saga.FAILED:
                        raise RuntimeError("SAGA Job state is FAILED.")

                    saga_job_id = pilotjob.id
                    self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                    msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    #
                    # ------------------------------------------------------

                    log_dicts = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())

                    # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                    ts = time.time()
                    ret = pilot_col.update(
                        {"_id": pilot_id, "state": LAUNCHING},
                        {
                            "$set": {
                                "state": PENDING_ACTIVE,
                                "saga_job_id": saga_job_id,
                                "health_check_enabled": health_check,
                                "agent_config": agent_cfg_dict,
                            },
                            "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                            "$pushAll": {"log": log_dicts},
                        },
                    )

                    if ret["n"] == 0:
                        # could not update, probably because the agent is
                        # running already.  Just update state history and
                        # jobid then
                        # FIXME: make sure of the agent state!
                        ret = pilot_col.update(
                            {"_id": pilot_id},
                            {
                                "$set": {"saga_job_id": saga_job_id, "health_check_enabled": health_check},
                                "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )

                except Exception as e:
                    # Update the Pilot's state 'FAILED'.
                    out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                    ts = time.time()

                    # FIXME: we seem to be unable to bson/json handle saga
                    # log messages containing an '#'.  This shows up here.
                    # Until we find a clean workaround, make log shorter and
                    # rely on saga logging to reveal the problem.
                    msg = "Pilot launching failed! (%s)" % e
                    logentries.append(Logentry(msg))

                    log_dicts = list()
                    log_messages = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())
                        log_messages.append(str(le.message))

                    pilot_col.update(
                        {"_id": pilot_id, "state": {"$ne": FAILED}},
                        {
                            "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                            "$push": {"statehistory": {"state": FAILED, "timestamp": ts}},
                            "$pushAll": {"log": log_dicts},
                        },
                    )
                    logger.exception("\n".join(log_messages))

    except SystemExit:
        logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()
if __name__ == '__main__':

    # exactly one argument is expected: the directory to inspect
    if len(sys.argv) != 2:
        print("\n\tusage: %s <dir>\n" % sys.argv[0])
        sys.exit(1)

    loc = sys.argv[1]

    # find json file in dir, and derive session id -- the directory must
    # hold exactly one json file
    json_files = glob.glob('%s/*.json' % loc)

    if not json_files:
        raise ValueError('%s contains no json file!' % loc)
    elif len(json_files) > 1:
        raise ValueError('%s contains more than one json file!' % loc)

    json_file = json_files[0]
    json = ru.read_json(json_file)

    # the session id is the file name without its '.json' extension
    sid = os.path.basename(json_file)[:-5]

    print('sid: %s' % sid)

    descr = rp.utils.get_session_description(sid=sid, src=loc)
    prof = rp.utils.get_session_profile (sid=sid, src=loc)

    session = ra.Session(prof, descr)

    # A formatting helper before starting...
    def ppheader(message):
        # frame the message between two 78-character dashed rules
        separator = '\n' + 78 * '-' + '\n'
        print(separator + message + separator)

    # and here we go. Once we filter our session object so to keep only the
random.shuffle(sequence) return sequence # ============================================================================= # EXPERIMENT # ============================================================================= if __name__ == '__main__': if len(sys.argv) < 2: print "\n\n\tusage: %s <config.json>\n\n" % sys.argv[0] sys.exit(-1) # read configuration file. cfg = ru.read_json(sys.argv[1]) # TODO: Rename aimes.emgr config keys. cfg["skeleton_template"] = cfg["skeleton"]["template"] cfg["pct_concurrency"] = cfg["strategy"]["pct_concurrency"] cfg["pct_resources"] = cfg["strategy"]["pct_resources"] cfg["recipients"] = cfg["log"]["email"]["recipients"] # TODO: Override with json skeleton config entries. cfg['skeleton_task_duration'] = { "max": cfg["skeleton"]["tasks"]["duration"]["max"], "min": cfg["skeleton"]["tasks"]["duration"]["min"]} # cfg['bundle_resources'] = {'hopper.nersc.gov' : 'pbs', # 'stampede.tacc.xsede.org' : 'slurm'} # 'gordon.sdsc.xsede.org' : 'pbs'}
def _prepare_pilot(self, resource, rcfg, pilot):
    """
    Assemble everything needed to launch one pilot on `resource`.

    The pilot description (`pilot['description']`) and the resource config
    (`rcfg`) are combined into:

      - the agent config, which is written to a temporary json file and
        also stored as `pilot['cfg']`;
      - the command line arguments for the bootstrapper;
      - the list of files to stage for the pilot;
      - the SAGA job description used to submit the pilot.

    **Arguments:**
        * resource: resource label.  Used for the default virtenv name, in
          error messages, and as key into the `self._sandboxes` cache.
        * rcfg:     resource configuration dict.
        * pilot:    pilot dict holding at least `uid` and `description`.
          It is updated in place: `resource_sandbox`, `pilot_sandbox`,
          `client_sandbox` and `cfg` are set.

    **Returns:**
        * dict with two entries: `ft` is a list of staging directives
          (dicts with `src`, `tgt` and `rem`), `jd` is the
          `rs.job.Description` for the pilot job.

    **Raises:**
        * ValueError:   a mandatory pilot description attribute is missing,
          or `rp_version` is invalid.
        * TypeError:    the agent config is neither a string nor a dict.
        * RuntimeError: `global_virtenv` is configured (deprecated), or a
          required resource config setting is missing.
        * Any exception raised while reading the agent config file is
          logged and re-raised.
    """

    pid = pilot["uid"]
    ret = {'ft' : list(),
           'jd' : None  }

  # # ----------------------------------------------------------------------
  # # the rcfg can contain keys with string expansion placeholders where
  # # values from the pilot description need filling in.  A prominent
  # # example is `%(pd.project)s`, where the pilot description's `PROJECT`
  # # value needs to be filled in (here in lowercase).
  # expand = dict()
  # for k,v in pilot['description'].iteritems():
  #     if v is None:
  #         v = ''
  #     expand['pd.%s' % k] = v
  #     if isinstance(v, basestring):
  #         expand['pd.%s' % k.upper()] = v.upper()
  #         expand['pd.%s' % k.lower()] = v.lower()
  #     else:
  #         expand['pd.%s' % k.upper()] = v
  #         expand['pd.%s' % k.lower()] = v
  #
  # for k in rcfg:
  #     if isinstance(rcfg[k], basestring):
  #         orig     = rcfg[k]
  #         rcfg[k]  = rcfg[k] % expand
  #         expanded = rcfg[k]
  #         if orig != expanded:
  #             self._log.debug('RCFG:\n%s\n%s', orig, expanded)

    # ----------------------------------------------------------------------
    # Database connection parameters
    sid          = self._session.uid
    database_url = self._session.dburl

    # some default values are determined at runtime.  The doubled '%%'
    # keeps the `%(resource_sandbox)s` placeholder for the later expansion.
    default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
                      (resource, self._rp_version)

    # ----------------------------------------------------------------------
    # pilot description and resource configuration
    descr           = pilot['description']
    number_cores    = descr['cores']
    number_gpus     = descr['gpus']
    runtime         = descr['runtime']
    queue           = descr['queue']
    project         = descr['project']
    cleanup         = descr['cleanup']
    memory          = descr['memory']
    candidate_hosts = descr['candidate_hosts']

    # ----------------------------------------------------------------------
    # get parameters from resource cfg, set defaults where needed
    agent_launch_method     = rcfg.get('agent_launch_method')
    agent_dburl             = rcfg.get('agent_mongodb_endpoint', database_url)
    agent_spawner           = rcfg.get('agent_spawner',       DEFAULT_AGENT_SPAWNER)
    rc_agent_config         = rcfg.get('agent_config',        DEFAULT_AGENT_CONFIG)
    agent_scheduler         = rcfg.get('agent_scheduler')
    tunnel_bind_device      = rcfg.get('tunnel_bind_device')
    default_queue           = rcfg.get('default_queue')
    forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint')
    lrms                    = rcfg.get('lrms')
    mpi_launch_method       = rcfg.get('mpi_launch_method', '')
    pre_bootstrap_0         = rcfg.get('pre_bootstrap_0', [])
    pre_bootstrap_1         = rcfg.get('pre_bootstrap_1', [])
    python_interpreter      = rcfg.get('python_interpreter')
    task_launch_method      = rcfg.get('task_launch_method')
    rp_version              = rcfg.get('rp_version',          DEFAULT_RP_VERSION)
    virtenv_mode            = rcfg.get('virtenv_mode',        DEFAULT_VIRTENV_MODE)
    virtenv                 = rcfg.get('virtenv',             default_virtenv)
    cores_per_node          = rcfg.get('cores_per_node', 0)
    gpus_per_node           = rcfg.get('gpus_per_node',  0)
    lfs_path_per_node       = rcfg.get('lfs_path_per_node', None)
    lfs_size_per_node       = rcfg.get('lfs_size_per_node', 0)
    python_dist             = rcfg.get('python_dist')
    virtenv_dist            = rcfg.get('virtenv_dist',        DEFAULT_VIRTENV_DIST)
    cu_tmp                  = rcfg.get('cu_tmp')
    spmd_variation          = rcfg.get('spmd_variation')
    shared_filesystem       = rcfg.get('shared_filesystem', True)
    stage_cacerts           = rcfg.get('stage_cacerts', False)
    cu_pre_exec             = rcfg.get('cu_pre_exec')
    cu_post_exec            = rcfg.get('cu_post_exec')
    export_to_cu            = rcfg.get('export_to_cu')
    mandatory_args          = rcfg.get('mandatory_args', [])
    saga_jd_supplement      = rcfg.get('saga_jd_supplement', {})

    import pprint
    self._log.debug(cores_per_node)
    self._log.debug(pprint.pformat(rcfg))

    # make sure that mandatory args are known
    for ma in mandatory_args:
        if pilot['description'].get(ma) is None:
            raise ValueError('attribute "%s" is required for "%s"'
                             % (ma, resource))

    # get pilot and global sandbox
    resource_sandbox = self._session._get_resource_sandbox (pilot).path
    session_sandbox  = self._session._get_session_sandbox(pilot).path
    pilot_sandbox    = self._session._get_pilot_sandbox   (pilot).path

    pilot['resource_sandbox'] = str(self._session._get_resource_sandbox(pilot))
    pilot['pilot_sandbox']    = str(self._session._get_pilot_sandbox(pilot))
    pilot['client_sandbox']   = str(self._session._get_client_sandbox())

    # Agent configuration that is not part of the public API.
    # The agent config can either be a config dict, or
    # a string pointing to a configuration name.  If neither
    # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
    # set.  The last fallback is the resource config's `agent_config`
    # (or `DEFAULT_AGENT_CONFIG`).
    agent_config = pilot['description'].get('_config')
    if not agent_config:
        agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG')
    if not agent_config:
        agent_config = rc_agent_config

    if isinstance(agent_config, dict):

        # use dict as is
        # NOTE(review): the dict is not copied, so the `agent_cfg[...]`
        #               assignments below also modify the dict the caller
        #               passed in -- confirm that this is intended.
        agent_cfg = agent_config

    elif isinstance(agent_config, basestring):
        try:
            # interpret as a config name
            agent_cfg_file = os.path.join(self._conf_dir,
                                          "agent_%s.json" % agent_config)

            self._log.info("Read agent config file: %s", agent_cfg_file)
            agent_cfg = ru.read_json(agent_cfg_file)

            # allow for user level overload
            user_cfg_file = '%s/.radical/pilot/config/%s' \
                          % (os.environ['HOME'],
                             os.path.basename(agent_cfg_file))

            if os.path.exists(user_cfg_file):
                self._log.info("merging user config: %s" % user_cfg_file)
                user_cfg = ru.read_json(user_cfg_file)
                ru.dict_merge (agent_cfg, user_cfg, policy='overwrite')

        except Exception as e:
            self._log.exception("Error reading agent config file: %s" % e)
            raise

    else:
        # we can't handle this type
        raise TypeError('agent config must be string (config name) or dict')

    # expand variables in virtenv string
    virtenv = virtenv % {'pilot_sandbox'   : pilot_sandbox,
                         'session_sandbox' : session_sandbox,
                         'resource_sandbox': resource_sandbox}

    # Check for deprecated global_virtenv
    if 'global_virtenv' in rcfg:
        raise RuntimeError("'global_virtenv' is deprecated (%s)" % resource)

    # Create a host:port string for use by the bootstrap_0.
    db_url = rs.Url(agent_dburl)
    if db_url.port:
        db_hostport = "%s:%d" % (db_url.host, db_url.port)
    else:
        db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

    # ----------------------------------------------------------------------
    # the version of the agent is derived from
    # rp_version, which has the following format
    # and interpretation:
    #
    # case rp_version:
    #   @<token>:
    #   @tag/@branch/@commit: # no sdist staging
    #       git clone $github_base radical.pilot.src
    #       (cd radical.pilot.src && git checkout token)
    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
    #       rm -rf radical.pilot.src
    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
    #
    #   release: # no sdist staging
    #       pip install -t $VIRTENV/rp_install radical.pilot
    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
    #
    #   local: # needs sdist staging
    #       tar zxf $sdist.tgz
    #       pip install -t $VIRTENV/rp_install $sdist/
    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
    #
    #   debug: # needs sdist staging
    #       tar zxf $sdist.tgz
    #       pip install -t $SANDBOX/rp_install $sdist/
    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
    #
    #   installed: # no sdist staging
    #       true
    # esac
    #
    # virtenv_mode
    #   private : error  if ve exists, otherwise create, then use
    #   update  : update if ve exists, otherwise create, then use
    #   create  : use    if ve exists, otherwise create, then use
    #   use     : use    if ve exists, otherwise error,  then exit
    #   recreate: delete if ve exists, otherwise create, then use
    #
    # examples   :
    #   virtenv@<tag>
    #   virtenv@devel
    #   virtenv@release
    #   virtenv@installed
    #   stage@local
    #   stage@/tmp/my_agent.py
    #
    # Note that some combinations may be invalid,
    # specifically in the context of virtenv_mode.  If, for
    # example, virtenv_mode is 'use', then the 'virtenv:tag'
    # will not make sense, as the virtenv is not updated.
    # In those cases, the virtenv_mode is honored, and
    # a warning is printed.
    #
    # Also, the 'stage' mode can only be combined with the
    # 'local' source, or with a path to the agent (relative
    # to root_dir, or absolute).
    #
    # A rp_version which is neither '@<token>' nor one of the
    # keywords checked below is rejected with a ValueError.
    if not rp_version.startswith('@') and \
       rp_version not in ['installed', 'local', 'debug', 'release']:
        raise ValueError("invalid rp_version '%s'" % rp_version)

    if rp_version.startswith('@'):
        rp_version = rp_version[1:]  # strip '@'

    # ----------------------------------------------------------------------
    # sanity checks: these settings have no usable default
    required = [(python_dist,         "missing python distribution"),
                (virtenv_dist,        "missing virtualenv distribution"),
                (agent_spawner,       "missing agent spawner"),
                (agent_scheduler,     "missing agent scheduler"),
                (lrms,                "missing LRMS"),
                (agent_launch_method, "missing agentlaunch method"),
                (task_launch_method,  "missing task launch method")]
    for value, message in required:
        if not value:
            raise RuntimeError(message)

    # massage some values
    if not queue:
        queue = default_queue

    if cleanup and isinstance(cleanup, bool):

        #  l : log files
        #  u : unit work dirs
        #  v : virtualenv
        #  e : everything (== pilot sandbox)
        if shared_filesystem:
            cleanup = 'luve'
        else:
            # we cannot clean the sandbox from within the agent, as the hop
            # staging would then fail, and we'd get nothing back.
            # FIXME: cleanup needs to be done by the pmgr.launcher, or
            #        someone else, really, after fetching all logs and
            #        profiles.
            cleanup = 'luv'

        # we never cleanup virtenvs which are not private.  This must be a
        # value comparison: an identity test (`is not`) against a string
        # literal is not reliable, and would drop the 'v' flag even for
        # private virtenvs.
        if virtenv_mode != 'private':
            cleanup = cleanup.replace('v', '')

    # add dists to staging files, if needed
    if rp_version in ['local', 'debug']:
        sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name]
        sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path]
    else:
        sdist_names = list()
        sdist_paths = list()

    # if cores_per_node is set (non-zero), then we need to
    # allocate full nodes, and thus round up
    if cores_per_node:
        cores_per_node = int(cores_per_node)
        number_cores   = int(cores_per_node
                       * math.ceil(float(number_cores) / cores_per_node))

    # if gpus_per_node is set (non-zero), then we need to
    # allocate full nodes, and thus round up
    if gpus_per_node:
        gpus_per_node = int(gpus_per_node)
        number_gpus   = int(gpus_per_node
                      * math.ceil(float(number_gpus) / gpus_per_node))

    # set mandatory args
    bootstrap_args  = ""
    bootstrap_args += " -d '%s'" % ':'.join(sdist_names)
    bootstrap_args += " -p '%s'" % pid
    bootstrap_args += " -s '%s'" % sid
    bootstrap_args += " -m '%s'" % virtenv_mode
    bootstrap_args += " -r '%s'" % rp_version
    bootstrap_args += " -b '%s'" % python_dist
    bootstrap_args += " -g '%s'" % virtenv_dist
    bootstrap_args += " -v '%s'" % virtenv
    bootstrap_args += " -y '%d'" % runtime

    # set optional args
    if lrms == "CCM":
        bootstrap_args += " -c"
    if forward_tunnel_endpoint:
        # the db host:port is only passed along with the tunnel endpoint
        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
        bootstrap_args += " -h '%s'" % db_hostport
    if python_interpreter:
        bootstrap_args += " -i '%s'" % python_interpreter
    if tunnel_bind_device:
        bootstrap_args += " -t '%s'" % tunnel_bind_device
    if cleanup:
        bootstrap_args += " -x '%s'" % cleanup

    for arg in pre_bootstrap_0:
        bootstrap_args += " -e '%s'" % arg
    for arg in pre_bootstrap_1:
        bootstrap_args += " -w '%s'" % arg

    agent_cfg['owner']               = 'agent_0'
    agent_cfg['cores']               = number_cores
    agent_cfg['gpus']                = number_gpus
    agent_cfg['lrms']                = lrms
    agent_cfg['spawner']             = agent_spawner
    agent_cfg['scheduler']           = agent_scheduler
    agent_cfg['runtime']             = runtime
    agent_cfg['dburl']               = str(database_url)
    agent_cfg['session_id']          = sid
    agent_cfg['pilot_id']            = pid
    agent_cfg['logdir']              = '.'
    agent_cfg['pilot_sandbox']       = pilot_sandbox
    agent_cfg['session_sandbox']     = session_sandbox
    agent_cfg['resource_sandbox']    = resource_sandbox
    agent_cfg['agent_launch_method'] = agent_launch_method
    agent_cfg['task_launch_method']  = task_launch_method
    agent_cfg['mpi_launch_method']   = mpi_launch_method
    agent_cfg['cores_per_node']      = cores_per_node
    agent_cfg['gpus_per_node']       = gpus_per_node
    agent_cfg['lfs_path_per_node']   = lfs_path_per_node
    agent_cfg['lfs_size_per_node']   = lfs_size_per_node
    agent_cfg['cu_tmp']              = cu_tmp
    agent_cfg['export_to_cu']        = export_to_cu
    agent_cfg['cu_pre_exec']         = cu_pre_exec
    agent_cfg['cu_post_exec']        = cu_post_exec
    agent_cfg['resource_cfg']        = copy.deepcopy(rcfg)
    agent_cfg['debug']               = self._log.getEffectiveLevel()

    # we'll also push the agent config into MongoDB
    pilot['cfg'] = agent_cfg

    # ----------------------------------------------------------------------
    # Write agent config dict to a json file in pilot sandbox.

    agent_cfg_name = 'agent_0.cfg'
    cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
    os.close(cfg_tmp_handle)  # file exists now

    # Convert dict to json file
    self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file)
    self._log.debug(pprint.pformat(agent_cfg))
    ru.write_json(agent_cfg, cfg_tmp_file)

    ret['ft'].append({'src' : cfg_tmp_file,
                      'tgt' : '%s/%s' % (pilot_sandbox, agent_cfg_name),
                      'rem' : True})  # purge the tmp file after packing

    # ----------------------------------------------------------------------
    # we also touch the log and profile tarballs in the target pilot sandbox
    ret['ft'].append({'src' : '/dev/null',
                      'tgt' : '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid),
                      'rem' : False})  # don't remove /dev/null

    # only stage profiles if we profile
    if self._prof.enabled:
        ret['ft'].append({
                  'src' : '/dev/null',
                  'tgt' : '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid),
                  'rem' : False})  # don't remove /dev/null

    # name of the CA certificate bundle.  It is defined up front because it
    # is needed for the job's file transfer directives below even when the
    # bundle itself was already staged for an earlier pilot.
    cc_name = 'cacert.pem.gz'

    # check if we have a sandbox cached for that resource.  If so, we have
    # nothing to do.  Otherwise we create the sandbox and stage the RP
    # stack etc.
    # NOTE: this will race when multiple pilot launcher instances are used!
    with self._cache_lock:

        if resource not in self._sandboxes:

            for sdist in sdist_paths:
                base = os.path.basename(sdist)
                ret['ft'].append({'src' : sdist,
                                  'tgt' : '%s/%s' % (session_sandbox, base),
                                  'rem' : False})

            # Copy the bootstrap shell script.
            bootstrapper_path = os.path.abspath("%s/agent/%s"
                              % (self._root_dir, BOOTSTRAPPER_0))
            self._log.debug("use bootstrapper %s", bootstrapper_path)

            ret['ft'].append({'src' : bootstrapper_path,
                              'tgt' : '%s/%s' % (session_sandbox,
                                                 BOOTSTRAPPER_0),
                              'rem' : False})

            # Some machines cannot run pip due to outdated CA certs.
            # For those, we also stage an updated certificate bundle
            # TODO: use booleans all the way?
            if stage_cacerts:

                cc_path = os.path.abspath("%s/agent/%s"
                        % (self._root_dir, cc_name))
                self._log.debug("use CAs %s", cc_path)

                ret['ft'].append({'src' : cc_path,
                                  'tgt' : '%s/%s' % (session_sandbox, cc_name),
                                  'rem' : False})

            self._sandboxes[resource] = True

    # ----------------------------------------------------------------------
    # Create SAGA Job description and submit the pilot job

    jd = rs.job.Description()

    if shared_filesystem:
        bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0)
    else:
        bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0)

    jd.name                  = pid
    jd.executable            = "/bin/bash"
    jd.arguments             = ['-l %s' % bootstrap_tgt, bootstrap_args]
    jd.working_directory     = pilot_sandbox
    jd.project               = project
    jd.output                = "bootstrap_0.out"
    jd.error                 = "bootstrap_0.err"
    jd.total_cpu_count       = number_cores
    jd.total_gpu_count       = number_gpus
    jd.processes_per_host    = cores_per_node
    jd.spmd_variation        = spmd_variation
    jd.wall_time_limit       = runtime
    jd.total_physical_memory = memory
    jd.queue                 = queue
    jd.candidate_hosts       = candidate_hosts
    jd.environment           = dict()

    # we set any saga_jd_supplement keys which are not already set above
    for key, val in saga_jd_supplement.iteritems():
        if not jd[key]:
            self._log.debug('supplement %s: %s', key, val)
            jd[key] = val

    if 'RADICAL_PILOT_PROFILE' in os.environ:
        jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

    # for condor backends and the like which do not have shared FSs, we add
    # additional staging directives so that the backend system binds the
    # files from the session and pilot sandboxes to the pilot job.
    jd.file_transfer = list()
    if not shared_filesystem:

        jd.file_transfer.extend([
            'site:%s/%s > %s' % (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0),
            'site:%s/%s > %s' % (pilot_sandbox,   agent_cfg_name, agent_cfg_name),
            'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid),
            'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid)
        ])

        if 'RADICAL_PILOT_PROFILE' in os.environ:
            jd.file_transfer.extend([
                'site:%s/%s.prof.tgz > %s.prof.tgz' % (pilot_sandbox, pid, pid),
                'site:%s/%s.prof.tgz < %s.prof.tgz' % (pilot_sandbox, pid, pid)
            ])

        for sdist in sdist_names:
            jd.file_transfer.extend([
                'site:%s/%s > %s' % (session_sandbox, sdist, sdist)
            ])

        if stage_cacerts:
            jd.file_transfer.extend([
                'site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)
            ])

    self._log.debug("Bootstrap command line: %s %s", jd.executable, jd.arguments)

    ret['jd'] = jd
    return ret
def __init__(self, session):
    """
    Create a new PilotManager and attach it to the given session.

    The manager configuration is read from `configs/pmgr_<name>.json` next
    to this module, where `<name>` is taken from the environment variable
    `RADICAL_PILOT_PMGR_CFG` (default: `default`).

    **Arguments:**
        * session [:class:`radical.pilot.Session`]:
          The session instance to use.

    **Returns:**
        * A new `PilotManager` object [:class:`radical.pilot.PilotManager`].
    """

    # plain bookkeeping state
    self._bridges    = dict()
    self._components = dict()
    self._pilots     = dict()
    self._closed     = False
    self._rec_id     = 0       # used for session recording

    # synchronization primitives
    self._pilots_lock = mt.RLock()
    self._pcb_lock    = mt.RLock()
    self._terminate   = mt.Event()

    # one (initially empty) callback registry per pilot manager metric
    self._callbacks = dict((metric, dict()) for metric in rpt.PMGR_METRICS)

    # locate and load the pilot manager configuration
    cfg_name = os.environ.get('RADICAL_PILOT_PMGR_CFG', 'default')
    cfg_file = "%s/configs/pmgr_%s.json" % (os.path.dirname(__file__),
                                            cfg_name)
    cfg = ru.read_json(cfg_file)

    assert cfg['db_poll_sleeptime'], 'db_poll_sleeptime not configured'

    # initialize the base class (with no intent to fork)
    self._uid    = ru.generate_id('pmgr')
    cfg['owner'] = self.uid
    rpu.Component.__init__(self, cfg, session)
    self.start(spawn=False)

    # only now we have a logger... :/
    self._rep.info('<<create pilot manager')

    # submitted pilots are forwarded to the launching component via the
    # output queue
    self.register_output(rps.PMGR_LAUNCHING_PENDING,
                         rpc.PMGR_LAUNCHING_QUEUE)

    # completed staging directives are announced on the control pubsub
    self.register_subscriber(rpc.CONTROL_PUBSUB, self._staging_ack_cb)

    self._active_sds = dict()
    self._sds_lock   = mt.Lock()

    # register the state notification pull cb
    # FIXME: we may want to have the frequency configurable
    # FIXME: this should be a tailing cursor in the update worker
    poll_interval = self._cfg['db_poll_sleeptime']
    self.register_timed_cb(self._state_pull_cb, timer=poll_interval)

    # pilot state changes are also announced on the state pubsub
    self.register_subscriber(rpc.STATE_PUBSUB, self._state_sub_cb)

    # let session know we exist
    self._session._register_pmgr(self)

    self._prof.prof('setup_done', uid=self._uid)
    self._rep.ok('>>ok\n')
elif scheduler == 'round_robin': scheduler = rp.SCHED_ROUND_ROBIN else : scheduler = rp.SCHED_ROUND_ROBIN if not n_cores : raise ValueError ("need number of cores") if not n_units : raise ValueError ("need number of units") if not runtime : raise ValueError ("need pilot runtime") if not resources: raise ValueError ("need target resource") if not load : raise ValueError ("need load config") if not agent_cfg: raise ValueError ("need agent config") if not queue : queue = None resources = resources.split(',') for resource in resources: if not resource in RESOURCES: raise ValueError ("unknown resource %s" % resource) cu_load = ru.read_json (load) n_cores = int(n_cores) n_units = int(n_units) runtime = int(runtime) sid = run_experiment (n_cores, n_units, resources, runtime, cu_load, agent_cfg, scheduler, queue) with open('last.sid', 'w') as f: f.write("%s\n" % sid)
if __name__ == "__main__": # we use a reporter class for nicer output report = ru.Reporter("Getting Started") # Create a new session. No need to try/except this: if session creation # fails, there is not much we can do anyways... session = rp.Session() # all other pilot code is now tried/excepted. If an exception is caught, we # can rely on the session object to exist and be valid, and we can thus tear # the whole RP stack down via a 'session.close()' call in the 'finally' # clause... try: report.info('read configs') resources = ru.read_json('%s/config.json', os.path.dirname(__file__)) report.ok('\\ok\n') report.header('submit pilots') # prepare some input files for the compute units os.system ('hostname > file1.dat') os.system ('date > file2.dat') # Add a Pilot Manager. Pilot managers manage one or more ComputePilots. pmgr = rp.PilotManager(session=session) # Define an [n]-core local pilot that runs for [x] minutes pdescs = list() for resource in sys.argv[1:]: pdesc = rp.ComputePilotDescription()
elif len(sys.argv) == 2: resource = sys.argv[1] else : resource = 'local.localhost' # Create a new session. No need to try/except this: if session creation # fails, there is not much we can do anyways... session = rp.Session() # all other pilot code is now tried/excepted. If an exception is caught, we # can rely on the session object to exist and be valid, and we can thus tear # the whole RP stack down via a 'session.close()' call in the 'finally' # clause... try: # read the config used for resource details report.info('read config') config = ru.read_json('%s/config.json' % os.path.dirname(os.path.abspath(__file__))) report.ok('>>ok\n') report.header('submit pilots') # Add a Pilot Manager. Pilot managers manage one or more ComputePilots. pmgr = rp.PilotManager(session=session) # Define an [n]-core local pilot that runs for [x] minutes # Here we use a dict to initialize the description object report.info('create pilot description') pd_init = { 'resource' : resource, 'cores' : 64, # pilot size 'runtime' : 15, # pilot runtime (min) 'exit_on_error' : True,
def get_session_description(sid, src=None, dburl=None):
    """
    Return a session description usable for radical.analytics evaluation.

    The description informs about

      - the set of stateful entities
      - the state models of those entities
      - the event models of those entities (currently empty)
      - the configuration of the application / module (currently empty)

    If `src` is given, it is interpreted as path to search for session
    information (json dump).  `src` defaults to `$PWD/$sid`.  If the dump
    `<src>/<sid>.json` does not exist, it is obtained via `fetch_json`.

    If `dburl` is given, it is passed on to `fetch_json` to fetch session
    information from a database.  The dburl value defaults to
    `RADICAL_PILOT_DBURL`.

    The returned dict has the keys `tree` (entity hierarchy, indexed by
    uid), `entities` (state and event models per entity type) and `config`.
    """

    from radical.pilot import states as rps
    from .session      import fetch_json

    if not src:
        src = "%s/%s" % (os.getcwd(), sid)

    # prefer an existing json dump over fetching a fresh one
    dump = '%s/%s.json' % (src, sid)
    if not os.path.isfile(dump):
        dump = fetch_json(sid=sid, dburl=dburl, tgt=src, skip_existing=True)
    docs = ru.read_json(dump)

    # make sure we have uids
    # FIXME v0.47: deprecate
    def fix_uids(node):
        # walk lists and dicts recursively and add the current key names
        # ('umgr', 'pmgr', 'uid', 'cfg') where only the old ones exist
        if isinstance(node, list):
            for elem in node:
                fix_uids(elem)
            return

        if not isinstance(node, dict):
            return

        if 'unitmanager' in node and 'umgr' not in node:
            node['umgr'] = node['unitmanager']
        if 'pilotmanager' in node and 'pmgr' not in node:
            node['pmgr'] = node['pilotmanager']
        if '_id' in node and 'uid' not in node:
            node['uid'] = node['_id']
            if 'cfg' not in node:
                node['cfg'] = dict()

        for val in node.values():
            fix_uids(val)

    fix_uids(docs)

    assert sid == docs['session']['uid'], 'sid inconsistent'

    def by_uid(entries):
        # all entity lists are processed in uid order
        return sorted(entries, key=lambda entry: entry['uid'])

    tree      = dict()
    tree[sid] = {'uid'      : sid,
                 'etype'    : 'session',
                 'cfg'      : docs['session']['cfg'],
                 'has'      : ['umgr', 'pmgr'],
                 'children' : list()}

    # pilot managers are children of the session
    for pmgr in by_uid(docs['pmgr']):
        pmgr_id = pmgr['uid']
        tree[sid]['children'].append(pmgr_id)
        tree[pmgr_id] = {'uid'      : pmgr_id,
                         'etype'    : 'pmgr',
                         'cfg'      : pmgr['cfg'],
                         'has'      : ['pilot'],
                         'children' : list()}

    # unit managers are children of the session, too
    for umgr in by_uid(docs['umgr']):
        umgr_id = umgr['uid']
        tree[sid]['children'].append(umgr_id)
        tree[umgr_id] = {'uid'      : umgr_id,
                         'etype'    : 'umgr',
                         'cfg'      : umgr['cfg'],
                         'has'      : ['unit'],
                         'children' : list()}
        # unit managers get an empty description
        tree[umgr_id]['description'] = dict()

    # pilots are children of their pilot manager; the resource details are
    # folded into the pilot config
    for pilot in by_uid(docs['pilot']):
        pilot_id = pilot['uid']
        owner    = pilot['pmgr']
        pilot['cfg']['resource_details'] = pilot['resource_details']
        tree[owner]['children'].append(pilot_id)
        tree[pilot_id] = {'uid'        : pilot_id,
                          'etype'      : 'pilot',
                          'cfg'        : pilot['cfg'],
                          'description': pilot['description'],
                          'has'        : ['unit'],
                          'children'   : list()}

    # units are children of both their unit manager and their pilot
    for unit in by_uid(docs['unit']):
        unit_id  = unit['uid']
        umgr_id  = unit['umgr']
        pilot_id = unit['pilot']
        tree[umgr_id ]['children'].append(unit_id)
        tree[pilot_id]['children'].append(unit_id)

        # the unit document itself serves as cfg; the description is moved
        # out of it to avoid a duplicate
        descr = unit.pop('description')
        tree[unit_id] = {'uid'         : unit_id,
                         'etype'       : 'unit',
                         'cfg'         : unit,
                         'description' : descr,
                         'has'         : list(),
                         'children'    : list()}

    entities = {
        'pilot'   : {'state_model'  : rps._pilot_state_values,
                     'state_values' : rps._pilot_state_inv_full,
                     'event_model'  : dict()},
        'unit'    : {'state_model'  : rps._unit_state_values,
                     'state_values' : rps._unit_state_inv_full,
                     'event_model'  : dict()},
        'session' : {'state_model'  : None,  # has no states
                     'state_values' : None,
                     'event_model'  : dict()},
    }

    return {'entities' : entities,
            'tree'     : tree,
            'config'   : dict()}  # session config goes here
def _read_json(filename):
    """
    Read the json file `filename` via `ru.read_json`, remove the file, and
    return the parsed data.

    The file is only removed after it was read successfully: if reading
    raises, the exception propagates and the file is left in place.
    """

    content = ru.read_json(filename)
    os.unlink(filename)

    return content
elif len(sys.argv) == 2: resource = sys.argv[1] else : resource = 'local.localhost' # Create a new session. No need to try/except this: if session creation # fails, there is not much we can do anyways... session = rp.Session() # all other pilot code is now tried/excepted. If an exception is caught, we # can rely on the session object to exist and be valid, and we can thus tear # the whole RP stack down via a 'session.close()' call in the 'finally' # clause... if True: # read the config used for resource details report.info('read config') config = ru.read_json('config.json') report.ok('>>ok\n') report.header('submit pilots') # Add a Pilot Manager. Pilot managers manage one or more ComputePilots. pmgr = rp.PilotManager(session=session) # Define an [n]-core local pilot that runs for [x] minutes # Here we use a dict to initialize the description object pd_init = { 'resource' : resource, 'cores' : 2, # pilot size 'runtime' : 300, # pilot runtime (min) 'project' : config[resource]['project'], 'queue' : config[resource]['queue'],
def emulate(command=None, samples=None, src=None):
    """
    Emulate a workload given as command, as stored profile, or as samples.

    At most one of the three arguments may be given:

      command: the profiles known for this command are looked up via
               `rsu.get_profiles(command, mode='pro')`, and the first one
               is used.
      src    : path of a json file holding a profile (read via
               `ru.read_json`).
      samples: list of samples, handed to `_emulator` as is.

    For `command` and `src`, the 'cpu', 'mem' and 'sto' sequences of the
    profile are converted into one list of `[type, time, values]` samples,
    which is sorted by time.

    If the environment variable `RADICAL_SYNAPSE_WATCHMODE` is unset, empty,
    or set to 'none' / 'noop' (case insensitive), the emulator run is only
    timed.  Otherwise it runs under `profile()`, and the cpu efficiency is
    added to the resulting info if the profile reports 'ops'.

    Returns the tuple `(info, ret, None)`.  If no argument is given at all,
    a warning is printed and the list `[0, None, None]` is returned.

    Raises ValueError if more than one of the arguments is given.
    """

    if (command and samples) or \
       (command and src    ) or \
       (samples and src    ):
        raise ValueError ("emulate needs *either* command, sample *or* src")

    if not command and not samples and not src:
        print("warning: emulate needs either command, sample or src")
        return [0, None, None]

    if command or src:

        if command:
            profs = rsu.get_profiles (command, mode='pro')
            # FIXME: average vals over all retrieved profiles
            prof  = profs[0]['profile']
        else:
            prof  = ru.read_json(src)

        # get time series to emulate (all types of operations are mixed)
        # FIXME: we should also sample walltime for _TIM.  As it is, mixing
        #        time and other samples will yield incorrect results due to
        #        mismatch in granularity.
        # FIXME: add network sample interpretation
        samples  = list()

      # samples += [[_TIM, x[0], {'real'       : x[1].get('real',       0)}]
      #                     for x in prof['time']]

        samples += [[_CPU, x[0], {'time'       : x[1].get('time',       0),
                                  'flops'      : x[1].get('ops',        0),
                                  'efficiency' : x[1].get('efficiency', 0)}]
                            for x in prof['cpu']['sequence']]

        samples += [[_MEM, x[0], {'size'       : x[1].get('size',       0)}]
                            for x in prof['mem']['sequence']]

        samples += [[_STO, x[0], {'src'        : x[1].get('src',     None),
                                  'rsize'      : x[1].get('read',       0),
                                  'tgt'        : x[1].get('tgt',     None),
                                  'wsize'      : x[1].get('write',      0)}]
                            for x in prof['sto']['sequence']]

        # *globally* sort samples by time
        samples = sorted (samples, key=lambda x: x[1])

    watchmode = os.environ.get ('RADICAL_SYNAPSE_WATCHMODE')

    # `lower` has to be *called*: the bound method itself is never an
    # element of the list, so 'none' / 'noop' would not disable profiling.
    if not watchmode or watchmode.lower() in ['none', 'noop']:

        # no profiling requested: just run the emulator and time it
        start = time.time()
        _emulator(samples)
        stop  = time.time()

        ret   = None
        info  = {'cmd'  : command,
                 'time' : {'start' : start,
                           'real'  : stop - start}}

    else:
        # let the profiler know that we run an emulation, so that the
        # profile is not stored as 'application run'.  Environment values
        # must be strings, but `command` is None when emulating from `src`
        # or `samples`.
        os.environ['_RADICAL_SYNAPSE_EMULATED'] = 'TRUE'
        os.environ['_RADICAL_SYNAPSE_EMULATEE'] = command or ''

        info, ret, _ = profile(_emulator, samples)

        cpu = info['cpu']
        if 'ops' in cpu:
            # force a float division: integer counters would otherwise be
            # truncated to 0 or 1 under Python 2
            cpu['efficiency'] = float(cpu['ops'])              \
                              / ( cpu['ops']                   \
                                + cpu['cycles_stalled_front']  \
                                + cpu['cycles_stalled_back']   \
                                )

    return (info, ret, None)
if scheduler == 'direct' : scheduler = rp.SCHED_DIRECT elif scheduler == 'backfilling': scheduler = rp.SCHED_BACKFILLING elif scheduler == 'round_robin': scheduler = rp.SCHED_ROUND_ROBIN else : scheduler = rp.SCHED_ROUND_ROBIN if not n_cores : raise ValueError ("need number of cores") if not n_units : raise ValueError ("need number of units") if not runtime : raise ValueError ("need pilot runtime") if not resources: raise ValueError ("need target resource") if not load : raise ValueError ("need load config") if not agent : raise ValueError ("need agent config") resources = resources.split(',') for resource in resources: if not resource in RESOURCES: raise ValueError ("unknown resource %s" % resource) cu_load = ru.read_json (load) agent_config = ru.read_json (agent) n_cores = int(n_cores) n_units = int(n_units) runtime = int(runtime) sid = run_experiment (n_cores, n_units, resources, runtime, cu_load, agent_config, scheduler, queue) print "session id: %s" % sid
def __init__(self, uid, cfg, dbs):
    """
    Store the given uid and `dbs` handle, and load the configuration from
    the json file named by `credential['config']`.
    """
    # NOTE(review): `cfg` is accepted but never used, and `credential` is
    #               not defined within this method.  Presumably it is a
    #               module level name, or this was meant to read
    #               `cfg['config']` -- confirm against the callers.
    self.uid = uid
    self._cfg = ru.read_json (credential['config'])
    self._dbs = dbs
if len(sys.argv) > 2: report.exit('Usage:\t%s [resource]\n\n' % sys.argv[0]) elif len(sys.argv) == 2: resource = sys.argv[1] else : resource = 'local.localhost' # Create a new session. No need to try/except this: if session creation # fails, there is not much we can do anyways... session = rp.Session() # all other pilot code is now tried/excepted. If an exception is caught, we # can rely on the session object to exist and be valid, and we can thus tear # the whole RP stack down via a 'session.close()' call in the 'finally' # clause... try: # read the config used for resource details report.info('read config') config = ru.read_json('%s/../config.json' % PWD) report.ok('>>ok\n') report.header('submit pilots') # Add a Pilot Manager. Pilot managers manage one or more ComputePilots. pmgr = rp.PilotManager(session=session) # Define an [n]-core local pilot that runs for [x] minutes # Here we use a dict to initialize the description object pd_init = { 'resource' : resource, 'runtime' : 15, # pilot runtime (min) 'exit_on_error' : True, 'project' : config[resource]['project'], 'queue' : config[resource]['queue'],
def get_session_description(sid, src=None, dburl=None):
    """
    This will return a description which is usable for radical.analytics
    evaluation.  It informs about

      - set of stateful entities
      - state models of those entities
      - event models of those entities (maybe)
      - configuration of the application / module

    If `src` is given, it is interpreted as path to search for session
    information (json dump).  `src` defaults to `$PWD/$sid`.

    If `dburl` is given, its value is used to fetch session information from
    a database.  The dburl value defaults to `RADICAL_PILOT_DBURL`.

    Returns a dict with three keys:

      - 'tree'    : maps each entity uid (session, pmgr, umgr, pilot, unit) to
                    a node dict with the keys 'uid', 'etype', 'has' (entity
                    types the node can contain) and 'children' (list of uids)
      - 'entities': state / event model information for 'pilot', 'unit' and
                    'session'
      - 'config'  : an (as of now empty) dict

    Raises an AssertionError if the session document which was fetched does
    not carry the requested `sid`, and a KeyError if a pilot or unit refers
    to a manager or pilot which is not part of the session dump.
    """

    from radical.pilot import states as rps
    from .session      import fetch_json

    if not src:
        src = "%s/%s" % (os.getcwd(), sid)

    ftmp = fetch_json(sid=sid, dburl=dburl, tgt=src, skip_existing=True)
    data = ru.read_json(ftmp)

    def fix_uids(doc):
        # Recursively walk the json structure and make sure that all
        # documents use the `uid`, `umgr` and `pmgr` keys, by copying them
        # from the `_id`, `unitmanager` and `pilotmanager` keys if needed.
        if isinstance(doc, list):
            for elem in doc:
                fix_uids(elem)

        elif isinstance(doc, dict):
            if 'unitmanager' in doc and 'umgr' not in doc:
                doc['umgr'] = doc['unitmanager']
            if 'pilotmanager' in doc and 'pmgr' not in doc:
                doc['pmgr'] = doc['pilotmanager']
            if '_id' in doc and 'uid' not in doc:
                doc['uid'] = doc['_id']
                # An empty `cfg` is only added to documents whose uid was
                # just derived from `_id`.  Adding it to *every* dict would
                # never terminate: the fresh empty dict is visited below.
                if 'cfg' not in doc:
                    doc['cfg'] = dict()

            # all keys are in place by now: the dict is not changed anymore
            # while we iterate over its values
            for value in doc.values():
                fix_uids(value)

    fix_uids(data)

    assert sid == data['session']['uid'], \
           'session dump does not match session id %s' % sid

    def by_uid(doc):
        return doc['uid']

    def node(uid, etype, has):
        # every tree node gets its own, fresh list of children
        return {'uid'      : uid,
                'etype'    : etype,
                'has'      : has,
                'children' : list()}

    tree      = dict()
    tree[sid] = node(sid, 'session', ['umgr', 'pmgr'])

    # pilot managers and unit managers are children of the session
    for pmgr in sorted(data['pmgr'], key=by_uid):
        uid = pmgr['uid']
        tree[sid]['children'].append(uid)
        tree[uid] = node(uid, 'pmgr', ['pilot'])

    for umgr in sorted(data['umgr'], key=by_uid):
        uid = umgr['uid']
        tree[sid]['children'].append(uid)
        tree[uid] = node(uid, 'umgr', ['unit'])

    # pilots are children of their pilot manager
    for pilot in sorted(data['pilot'], key=by_uid):
        uid      = pilot['uid']
        pmgr_uid = pilot['pmgr']
        tree[pmgr_uid]['children'].append(uid)
        tree[uid] = node(uid, 'pilot', ['unit'])

    # units are children of both their unit manager and their pilot
    for unit in sorted(data['unit'], key=by_uid):
        uid       = unit['uid']
        umgr_uid  = unit['umgr']
        pilot_uid = unit['pilot']
        tree[umgr_uid ]['children'].append(uid)
        tree[pilot_uid]['children'].append(uid)
        tree[uid] = node(uid, 'unit', list())

    ret = dict()
    ret['tree']     = tree
    ret['entities'] = dict()

    ret['entities']['pilot'] = {
            'state_model'  : rps.pilot_state_by_value,
            'state_values' : rps.pilot_state_value,
            'event_model'  : dict(),
    }

    ret['entities']['unit'] = {
            'state_model'  : rps.unit_state_by_value,
            'state_values' : rps.unit_state_value,
            'event_model'  : dict(),
    }

    ret['entities']['session'] = {
            'state_model'  : None,  # session has no states, only events
            'state_values' : None,
            'event_model'  : dict(),
    }

    ret['config'] = dict()  # magic to get session config goes here

    return ret
def __init__(self, session, scheduler=None):
    """
    Create a new UnitManager and attach it to the given session.

    **Arguments:**
        * session [:class:`radical.pilot.Session`]:
          The session instance this manager belongs to.
        * scheduler (`string`):
          Name of the scheduler plug-in to use.  If given, it takes
          precedence over the scheduler named in the umgr config file.

    **Returns:**
        * A new `UnitManager` object [:class:`radical.pilot.UnitManager`].
    """

    # containers for bridges, components, pilots and units
    self._bridges    = {}
    self._components = {}
    self._pilots     = {}
    self._units      = {}

    # locks guarding pilots, units and callbacks
    self._pilots_lock = threading.RLock()
    self._units_lock  = threading.RLock()
    self._cb_lock     = threading.RLock()

    # one (initially empty) callback registry per metric
    self._callbacks = {metric: {} for metric in rpt.UMGR_METRICS}

    self._terminate = threading.Event()
    self._closed    = False
    self._rec_id    = 0       # used for session recording

    # the config file is selected via `RADICAL_PILOT_UMGR_CFG`, and is
    # expected in the `configs/` dir next to this module
    cfg_name = os.environ.get('RADICAL_PILOT_UMGR_CFG', 'default')
    cfg_base = os.path.dirname(__file__)
    cfg      = ru.read_json("%s/configs/umgr_%s.json" % (cfg_base, cfg_name))

    # scheduler precedence: argument, then config file, then default
    cfg['scheduler'] = scheduler             \
                    or cfg.get('scheduler')  \
                    or rpus.SCHEDULER_DEFAULT

    assert cfg['db_poll_sleeptime'], 'db_poll_sleeptime not configured'

    # initialize the base class (with no intent to fork)
    self._uid    = ru.generate_id('umgr')
    cfg['owner'] = self.uid
    rpu.Component.__init__(self, cfg, session)
    self.start(spawn=False)

    # the logger is only used from here on, after the base class init
    self._log.info('started umgr %s', self._uid)
    self._rep.info('<<create unit manager')

    # The output queue is used to forward submitted units to the
    # scheduling component.
    self.register_output(rps.UMGR_SCHEDULING_PENDING,
                         rpc.UMGR_SCHEDULING_QUEUE)

    # the umgr will also collect units from the agent again, for output
    # staging and finalization
    self.register_output(rps.UMGR_STAGING_OUTPUT_PENDING,
                         rpc.UMGR_STAGING_OUTPUT_QUEUE)

    # register the timed callbacks: the first one pulls state
    # notifications, the second one pulls units back from the agent
    # FIXME: both should be a tailing cursor in the update worker
    for pull_cb in (self._state_pull_cb, self._unit_pull_cb):
        self.register_timed_cb(pull_cb,
                               timer=self._cfg['db_poll_sleeptime'])

    # also listen to the state pubsub for unit state changes
    self.register_subscriber(rpc.STATE_PUBSUB, self._state_sub_cb)

    # let session know we exist
    self._session._register_umgr(self)

    self._prof.prof('setup_done', uid=self._uid)
    self._rep.ok('>>ok\n')
def get_config(params):
    """
    This method attempts to obtain configuration settings from a variety of
    sources, depending on the parameter.  It can point to an env var, or to
    a directory containing configuration files, or to a single configuration
    file, or to a list of any above, or it is a config dict already, or
    a list of such dicts.  In all cases, the config is obtained from the
    respective source (which is assumed json formatted in the case of config
    files), and a single merged and expanded dict is returned.

    Sources are merged in the order given, later settings overwrite earlier
    ones.  The files of a config directory are merged in sorted name order,
    so that the result does not depend on the directory listing order.

    Empty parameters (`None`, empty strings, empty dicts) are silently
    ignored.  A string which names neither an existing file nor an existing
    directory results in a warning, and is otherwise ignored.

    Raises a TypeError for parameters which are neither dict nor string, and
    re-raises any exception which occurs while reading a config file.
    """

    ret = dict()

    # always make params a list, for simpler code below
    if not isinstance(params, list):
        params = [params]

    for param in params:

        if not param:
            # we silently accept None's (and other empty values), to save
            # some repetitive checks on the calling side
            continue

        if isinstance(param, dict):
            # simply merge it into the result
            ru.dict_merge(ret, param, policy='overwrite')
            continue

        if not isinstance(param, basestring):
            raise TypeError("get_config parameter must be (list of) dict or "
                            "string, not %s" % type(param))

        # check if the string points to an env variable
        if param in os.environ:
            # assume that the value of the env var is what we really want
            param = os.environ[param]

        # is string, is not env, must be a dir or a file
        if os.path.isdir(param):
            # config dir: glob returns files in arbitrary order -- sort
            # them, to obtain a reproducible merge result
            cfg_files = sorted(glob.glob("%s/*" % param))

        elif os.path.isfile(param):
            # single config file
            cfg_files = [param]

        else:
            troy._logger.warning("cannot handle config location %s" % param)
            cfg_files = list()

        # read and merge all config files
        for cfg_file in cfg_files:

            try:
                cfg_dict = ru.read_json(cfg_file)

            except Exception as e:
                # a broken config file is fatal: log and re-raise
                troy._logger.critical("cannot read config in %s (%s)"
                                      % (cfg_file, e))
                raise

            troy._logger.info("reading config in %s" % cfg_file)
            ru.dict_merge(ret, cfg_dict, policy='overwrite')

    # expand config(s) before returning
    ru.dict_stringexpand(ret)

    return ret