def __init__(self, command, logger=None):
    """
    Prepare the bookkeeping for a pty-backed child process and start it.

    The command is normalized to a list of strings, validated and stored,
    then ``self.initialize()`` is invoked.

    :type  command: string or list of strings
    :param command: The command to run as child process.  If given as
                    string, it is split into a list of strings using
                    :func:`shlex.split`.

    :type  logger:  :class:`radical.utils.logger.Logger` instance
    :param logger:  logger to send status messages to.  If no logger is
                    passed (or it evaluates to false), a
                    ``radical.saga.pty`` logger is created.

    :raises: ``se.BadParameter`` if `command` is neither a string nor a
             list, or if it is empty.  Any exception raised by
             ``self.initialize()`` is passed through
             ``ptye.translate_exception`` and the result is raised.
    """

    self.logger = logger or ru.Logger('radical.saga.pty')
    self.logger.debug("PTYProcess init %s" % self)

    # normalize: a command string becomes a list of strings
    if isinstance(command, basestring):
        command = shlex.split(command)

    # validate
    if not isinstance(command, list):
        raise se.BadParameter("PTYProcess expects string or list command")

    if not command:
        raise se.BadParameter("PTYProcess expects non-empty command")

    self.rlock = ru.RLock("pty process %s" % command)

    self.command = command       # list of strings to run
    self.cache = ""              # data cache
    self.tail = ""               # tail of the data cache, for error messages
    self.child = None            # handle of the child process
    self.ptyio = None            # the process' io channel
    self.exit_code = None        # child died with code (may be revived)
    self.exit_signal = None      # child killed by signal (may be revived)

    # TODO: make `recover_max` a configure option.  This limit does not
    #       apply for recovers triggered by gc_timeout!
    self.recover_max = 3
    self.recover_attempts = 0

    try:
        self.initialize()
    except Exception as e:
        raise ptye.translate_exception(e, "pty or process creation failed")
def test_construct_command(mocked_init, mocked_configure):
    """
    Compare the commands built by an `Srun` launch method against the
    expected results of the ('lm', 'srun') test cases.
    """

    launcher = Srun(name=None, cfg=None, session=None)
    launcher._log = ru.Logger('dummy')
    launcher._cfg = {}
    launcher.name = 'srun'
    launcher.launch_command = '/bin/srun'

    for unit, expected in setUp('lm', 'srun'):

        # cases whose expected result is the string "RuntimeError" are
        # skipped here -- they are not asserted to raise
        if expected == "RuntimeError":
            continue

        command, hop = launcher.construct_command(unit, None)
        assert [command, hop] == expected, unit['uid']
def __init__(self):
    """
    Set up the engine: create the adaptor registry, load the engine, pty
    and registry configurations, create the logger, and load the adaptors.
    """

    # the engine manages cpis from adaptors
    self._adaptor_registry = {}

    # get engine, pty and registry configs
    self._cfg = ru.Config('radical.saga.engine')
    self._pty_cfg = ru.Config('radical.saga.pty')
    self._registry = ru.Config('radical.saga.registry')

    # initialize the logging and log the version (this is a singleton!)
    log = ru.Logger('radical.saga')
    self._logger = log
    log.info('radical.saga version: %s' % version_detail)

    # load adaptors
    self._load_adaptors()
def test_unschedule_unit(self, mocked_init):
    """
    Unschedule a unit on a `Continuous` scheduler and check the node state.

    The unit description and slots are taken from the second entry of the
    configuration returned by ``self.setUp()``.  After
    ``unschedule_unit()``, the 'cores' and 'gpus' entries of the first node
    are both expected to equal ``[0]``.
    """

    component = Continuous(cfg=None, session=None)
    _, cfg = self.setUp()

    slots = cfg[1]['setup']['lm']['slots']

    unit = dict()
    unit['description'] = cfg[1]['unit']['description']
    unit['slots'] = slots

    component.nodes = slots['nodes']
    component._log = ru.Logger('dummy')

    component.unschedule_unit(unit)

    # Assert directly: wrapping these checks in a bare `except:` which
    # re-raises within `pytest.raises(AssertionError)` swallows every
    # assertion failure, so the test could never fail on a wrong result.
    self.assertEqual(component.nodes[0]['cores'], [0])
    self.assertEqual(component.nodes[0]['gpus'], [0])
def test_construct_command(mocked_init, mocked_configure, mocked_raise_on):
    """
    Run the ('lm', 'yarn') test cases against a `Yarn` launch method.

    A case whose expected result is the string "RuntimeError" must raise
    `RuntimeError`; any other case must yield `[command, hop]` equal to
    the expected result.
    """

    test_cases = setUp('lm', 'yarn')

    launcher = Yarn(cfg=None, session=None, name=None)
    launcher._log = ru.Logger('dummy')
    launcher.launch_command = 'yarn'
    launcher.name = "YARN"

    for unit, expected in test_cases:

        if expected != "RuntimeError":
            command, hop = launcher.construct_command(unit, None)
            assert [command, hop] == expected
            continue

        with pytest.raises(RuntimeError):
            command, hop = launcher.construct_command(unit, None)
def test_schedule_unit(self, mocked_init, mocked_configure,
                       mocked_find_resources):
    """
    Schedule a unit on a `Continuous` scheduler and check the slot it gets.

    The unit uid and description as well as the node list are taken from
    the second entry of the configuration returned by ``self.setUp()``.
    The resource manager attributes are set by hand, and the value
    returned by ``schedule_unit()`` is compared against `test_slot`.
    """

    _, cfg = self.setUp()
    component = Continuous(cfg=None, session=None)

    unit = dict()
    unit['uid'] = cfg[1]['unit']['uid']
    unit['description'] = cfg[1]['unit']['description']

    component.nodes = cfg[1]['setup']['lm']['slots']['nodes']
    component._rm_cores_per_node = 32
    component._rm_gpus_per_node = 2
    component._rm_lfs_per_node = {"size": 0, "path": "/dev/null"}
    component._rm_mem_per_node = 1024
    component._rm_lm_info = 'INFO'
    component._log = ru.Logger('dummy')
    component._node_offset = 0

    test_slot = {
        'cores_per_node': 32,
        'gpus_per_node': 2,
        'lfs_per_node': {'path': '/dev/null', 'size': 0},
        'lm_info': 'INFO',
        'mem_per_node': 1024,
        'nodes': [{
            'core_map': [[0]],
            'gpu_map': [[0]],
            'lfs': {'path': '/dev/null', 'size': 1234},
            'mem': 128,
            'name': 'a',
            'uid': 1,
        }],
    }

    # Assert directly: wrapping this check in a bare `except:` which
    # re-raises within `pytest.raises(AssertionError)` swallows every
    # assertion failure, so the test could never fail on a wrong slot.
    self.assertEqual(component.schedule_unit(unit), test_slot)
def test_construct_command(mocked_init, mocked_get_mpi_info, mocked_raise_on):
    """
    Compare the commands built by an `MPIRun` launch method against the
    expected results of the ('lm', 'mpirun') test cases.
    """

    test_cases = setUp('lm', 'mpirun')

    launcher = MPIRun(name=None, cfg=None, session=None)
    launcher.name = 'MPIRun'

    # `_configure()` runs before the remaining attributes are set
    launcher._configure()

    launcher._log = ru.Logger('dummy')
    launcher.mpi_flavor = None
    launcher.launch_command = 'mpirun'
    launcher._ccmrun = ''
    launcher._dplace = ''

    for unit, expected in test_cases:
        built = list(launcher.construct_command(unit, None))
        assert built == expected, unit['uid']
def __init__(self, adaptor_info, adaptor_options=None, expand_env=True):
    """
    Store the adaptor info, create lock and logger, and load the adaptor
    configuration.

    :param adaptor_info:    mapping which provides at least the keys
                            'name' and 'schemas'.
    :param adaptor_options: not used by this method.
    :param expand_env:      passed as `expand` to `ru.Config`.

    The 'enabled' config entry is set to `True` if the loaded
    configuration does not contain it.
    """

    # FIXME: engine is loading cfg already, here we load again...

    self._info = adaptor_info
    self._name = adaptor_info['name']
    self._schemas = adaptor_info['schemas']

    self._lock = mt.RLock()
    self._logger = ru.Logger('radical.saga.api')

    cfg = ru.Config(module='radical.saga.adaptors',
                    name=self._name,
                    expand=expand_env)
    self._cfg = cfg

    # adaptors are enabled unless the config says otherwise
    if 'enabled' not in cfg:
        cfg['enabled'] = True
def __init__(self, ensemble_size, exchange_size, window_size, md_cycles):
    """
    Store the sizing parameters, initialize the `re.AppManager` base, set
    the resource description, and create one replica per ensemble member.

    :param ensemble_size: number of replicas to create.
    :param exchange_size: stored as `self._ex_size`.
    :param window_size:   stored as `self._window_size`.
    :param md_cycles:     stored as `self._cycles`.
    """

    self._en_size = ensemble_size
    self._ex_size = exchange_size
    self._cycles = md_cycles
    self._window_size = window_size

    self._lock = mt.Lock()
    self._log = ru.Logger('radical.repex.exc')

    # append-mode handle which is kept on the instance; it is not closed
    # in this method
    self._dout = open('dump.log', 'a')

    re.AppManager.__init__(self, autoterminate=False, port=32769)

    self.resource_desc = {
        "resource": "xsede.bridges",
        "walltime": 60,
        "cpus": 28,
        "gpus_per_node": 0,
        "access_schema": "gsissh",
        "queue": "RM",
        "project": "mr560ip",
    }

    # alternative resource descriptions (disabled):
    #
    #   {"resource"      : "xsede.comet_ssh",
    #    "walltime"      : 30,
    #    "cpus"          : 24,
    #    "gpus_per_node" : 0,
    #    "access_schema" : "gsissh",
    #    "queue"         : "debug",
    #    "project"       : "rut129"}
    #
    #   {"resource" : 'local.localhost',
    #    "walltime" : 30,
    #    "cpus"     : 64}

    self._replicas = []
    self._waitlist = []

    # create the required number of replicas
    for rid in range(self._en_size):
        self._replicas.append(Replica(check_ex=self._check_exchange,
                                      check_res=self._check_resume,
                                      rid=rid))

    self._dump(msg='startup')
def __init__(self, uid=None):
    """
    Initialize API type, logger and object id.

    :param uid: id to use for this object.  If it evaluates to false, an
                id is generated via `ru.generate_id` from the API type in
                `ru.ID_SIMPLE` mode.

    `self._apitype` is only set (from `self._get_apitype()`) if the
    instance does not have that attribute yet.
    """

    # The attribute lookup behind `hasattr` may raise; treat any regular
    # error as "attribute not present".  `Exception` is caught instead of
    # using a bare `except:` so that `KeyboardInterrupt` and `SystemExit`
    # are not swallowed.
    try:
        has_apitype = hasattr(self, '_apitype')
    except Exception:
        has_apitype = False

    if not has_apitype:
        self._apitype = self._get_apitype()

    self._logger = ru.Logger('radical.saga')

    if uid:
        self._id = uid
    else:
        self._id = ru.generate_id(self._apitype, mode=ru.ID_SIMPLE)
def __init__(self, adaptor_info, adaptor_options=None, expand_env=True):
    """
    Store the adaptor info, create lock and logger, and load the adaptor
    configuration.

    :param adaptor_info:    mapping which provides at least the keys
                            'name' and 'schemas'.
    :param adaptor_options: not used by this method.
    :param expand_env:      passed as `expand` to `ru.Config`.

    The 'enabled' config entry is set to `True` if the loaded
    configuration does not contain it.
    """

    # FIXME: engine is loading cfg already, here we load again...

    self._info = adaptor_info
    self._name = adaptor_info['name']
    self._schemas = adaptor_info['schemas']

    # the lock is named after the adaptor
    self._lock = ru.RLock(self._name)
    self._logger = ru.Logger('radical.saga.api')

    # we need to expand later once we got env from the remote resource
    cfg = ru.Config(module='radical.saga',
                    name=self._name,
                    expand=expand_env)
    self._cfg = cfg

    # adaptors are enabled unless the config says otherwise
    if 'enabled' not in cfg:
        cfg['enabled'] = True
def __init__(self, check_ex, rid, sbox, cores, exe):
    """
    Store the replica settings, initialize the `re.Pipeline` base, name
    the pipeline and add the initial md stage.

    :param check_ex: callable which is called when checking for exchange.
    :param rid:      replica id, stored as `self._rid`.
    :param sbox:     stored as `self._sbox`.
    :param cores:    stored as `self._cores`.
    :param exe:      stored as `self._exe`.
    """

    # NOTE: this variant takes no `check_res` callback (the one which is
    #       called when the exchange is done), and does not store one.
    self._check_ex = check_ex

    self._rid = rid
    self._sbox = sbox
    self._cores = cores
    self._exe = exe

    # initial cycle
    self._cycle = 0

    self._log = ru.Logger('radical.repex.rep')

    # entk pipeline initialization
    re.Pipeline.__init__(self)
    self.name = 'p_%s' % self.rid

    # add an initial md stage
    self.add_md_stage()
def __init__(self, workload, properties):
    """
    Create a replica pipeline for a workload.

    :param workload:   stored as `self._workload`.
    :param properties: mapping of replica properties.  If it contains the
                       key 'rid', that value becomes the replica id;
                       otherwise an id is generated via `ru.generate_id`.
    """

    self._workload = workload

    # no exchange / resume callbacks are set at construction time
    self._check_ex = None
    self._check_res = None

    # use the given replica id, or generate one (only if none was given)
    self._rid = properties['rid'] if 'rid' in properties \
                else ru.generate_id('replica..%(counter)06d', ru.ID_CUSTOM)

    self._props = properties
    self._cycle = 0            # initial cycle
    self._ex_list = None       # list of replicas used in exchange step

    re.Pipeline.__init__(self)

    self.name = 'p_%s' % self.rid
    self._log = ru.Logger('radical.repex')
def __init__(self, check_ex, check_res, rid, sbox, cores, exe):
    """
    Store the replica settings, initialize the `re.Pipeline` base, name
    the pipeline and add the initial md stage.

    :param check_ex:  stored as `self._check_ex`.
    :param check_res: stored as `self._check_res`.
    :param rid:       replica id, stored as `self._rid`.
    :param sbox:      stored as `self._sbox`.
    :param cores:     stored as `self._cores`.
    :param exe:       stored as `self._exe`.
    """

    # callbacks
    self._check_ex = check_ex
    self._check_res = check_res

    # replica settings
    self._rid = rid
    self._sbox = sbox
    self._cores = cores
    self._exe = exe

    # replica state
    self._cycle = 0            # initial cycle
    self._ex_list = None       # list of replicas used in exchange step

    re.Pipeline.__init__(self)

    self.name = 'p_%s' % self.rid
    self._log = ru.Logger('radical.repex.rep')

    # add an initial md stage
    self.add_md_stage()
def test_plan4(mocked_init, mocked_raise_on):
    """
    Plan a campaign of one workflow over three resources with a
    `HeftPlanner`, and compare the result with the expected plan.
    """

    resources = [{'id': 1, 'performance': 523},
                 {'id': 2, 'performance': 487},
                 {'id': 3, 'performance': 96}]

    planner = HeftPlanner(None, None, None)
    planner._campaign = ['W1']
    planner._resources = resources
    planner._num_oper = [53649]
    planner._logger = ru.Logger('dummy')

    actual_plan = [('W1', {'id': 1, 'performance': 523}, 0,
                    102.5793499043977)]
    est_plan = planner.plan()

    assert est_plan == actual_plan
def __init__(self, size, exchange_size, window_size, min_cycles, min_temp,
             max_temp, timesteps, basename, executable, cores):
    """
    Store the exchange settings, initialize the `re.AppManager` base, set
    the resource description, and create `size` replicas.

    Each constructor argument is stored on the instance under the same
    name with a leading underscore.  `cores` and `executable` are also
    passed on to every `Replica` which is created.
    """

    self._size = size
    self._exchange_size = exchange_size
    self._window_size = window_size
    self._min_cycles = min_cycles
    self._min_temp = min_temp
    self._max_temp = max_temp
    self._timesteps = timesteps
    self._basename = basename
    self._executable = executable
    self._cores = cores

    self._log = ru.Logger('radical.repex.exc')

    # initialize the entk app manager
    re.AppManager.__init__(self, autoterminate=False, port=RMQ_PORT)

    self.resource_desc = {"resource": 'local.localhost',
                          "walltime": 30,
                          "cpus": 4}

    # this is ugly
    self._sbox = '$Pipeline_untarPipe_Stage_untarStg_Task_untarTsk'

    self._cnt = 0                  # count exchanges
    self._replicas = []
    self._waitlist = []
    self._exchange_list = []       # sublist of self._waitlist that
                                   # performs an exchange

    # create the required number of replicas
    for rid in range(self._size):
        self._replicas.append(Replica(check_ex=self._check_exchange,
                                      check_res=self._check_resume,
                                      rid=rid,
                                      sbox=self._sbox,
                                      cores=self._cores,
                                      exe=self._executable))
def test_construct_command(mocked_init, mocked_configure, mocked_raise_on):
    """
    Run the ('lm', 'rsh') test cases against an `RSH` launch method.

    Cases with the expected result "ValueError" must raise `ValueError`
    (the command is constructed with hop `None`); cases with the expected
    result "RuntimeError" must raise `RuntimeError` (hop `1`); all other
    cases must yield `[command, hop]` equal to the expected result
    (hop `1`).
    """

    test_cases = setUp('lm', 'rsh')

    launcher = RSH(name=None, cfg=None, session=None)
    launcher._log = ru.Logger('dummy')
    launcher.name = 'RSH'
    launcher.mpi_flavor = None
    launcher.launch_command = 'rsh'

    # (expected result label, exception type, hop argument)
    failing = (("ValueError", ValueError, None),
               ("RuntimeError", RuntimeError, 1))

    for unit, expected in test_cases:

        for label, exc_type, hop_arg in failing:
            if expected == label:
                with pytest.raises(exc_type):
                    command, hop = launcher.construct_command(unit, hop_arg)
                break

        else:
            # no failure expected: compare the constructed command
            command, hop = launcher.construct_command(unit, 1)
            assert [command, hop] == expected
def test_handle_cuda(self, mocked_init):
    """
    Run the 'handle_cuda' test cases against `_handle_cuda()`.

    For each (setup, unit, result) triple the setup is used as component
    config.  A result of 'ValueError' means the call must raise
    `ValueError`; otherwise the unit's 'CUDA_VISIBLE_DEVICES' environment
    entry must equal the result after the call.
    """

    cases = self.setUp()['handle_cuda']
    setups = cases['setup']
    units = cases['unit']
    results = cases['results']

    component = AgentSchedulingComponent()
    component._log = ru.Logger('dummy')

    for setup, unit, expected in zip(setups, units, results):

        component._cfg = setup

        if expected != 'ValueError':
            component._handle_cuda(unit)
            env = unit['description']['environment']
            self.assertEqual(env['CUDA_VISIBLE_DEVICES'], expected)
            continue

        with pytest.raises(ValueError):
            component._handle_cuda(unit)
def translate_exception(e, msg=None):
    """
    In many cases, we should be able to roughly infer the exception cause
    from the error message -- this is centrally done in this method.  If
    possible, it will return a new exception with a more concise error
    message and appropriate exception type.

    :param e:   the exception to translate.  Only `se.NoSuccess` instances
                are translated; any other exception is returned unchanged.
    :param msg: optional context which is appended to the exception's plain
                message as "<message> (<msg>)".
    :return:    the translated exception, or `e` itself if no rule matches.

    The rules are matched against the lower-cased message, in order, and
    the first match wins.
    """

    # we do not touch non-saga exceptions
    if not isinstance(e, se.SagaException):
        return e

    # this seems to have a specific cause already, leave it alone
    if not isinstance(e, se.NoSuccess):
        return e

    ru.Logger('radical.saga.pty').debug(traceback.format_exc())

    cmsg = e._plain_message
    if msg:
        cmsg = "%s (%s)" % (cmsg, msg)

    lmsg = cmsg.lower()

    # (needle, exception type, message template)
    #
    # Needles are matched against the *lower-cased* message and thus must
    # be lower case themselves -- a mixed-case needle can never match.
    rules = (
        ('could not resolve hostname',  se.BadParameter,
         "%s"),
        ('connection timed out',        se.BadParameter,
         "%s"),
        ('connection refused',          se.BadParameter,
         "%s"),
        ('auth',                        se.AuthorizationFailed,
         "%s"),
        ('pass',                        se.AuthenticationFailed,
         "%s"),
        ('denied',                      se.PermissionDenied,
         "%s"),
        ('man-in-the-middle',           se.AuthenticationFailed,
         "ssh key mismatch: %s"),
        ('ssh_exchange_identification', se.AuthenticationFailed,
         "too many login attempts: %s"),
        ('shared connection',           se.NoSuccess,
         "Insufficient system resources: %s"),
        ('pty allocation',              se.NoSuccess,
         "Insufficient system resources: %s"),
        ('connection to master closed', se.NoSuccess,
         "Connection closed by system: %s"),
    )

    for needle, exc_type, template in rules:
        if needle in lmsg:
            return exc_type(template % cmsg)

    return e
def test_configure(mocked_init, mocked_init_continuous, mocked_subscriber):
    '''
    Test 1 check configuration setup

    Populates a `ContinuousOrdered` component by hand and calls
    `_configure()` on it.  The test makes no assertions of its own.
    '''

    component = ContinuousOrdered(cfg=None, session=None)

    component._trigger_state = rps.UMGR_STAGING_OUTPUT_PENDING
    component._lock = mt.RLock()
    component._cfg = {}
    component._ru_terminating = True
    component._uid = None
    component._log = ru.Logger('dummy')

    # bookkeeping containers
    component._units = {}
    component._unordered = []
    component._ns = {}
    component._ns_init = {'current': 0}
    component._order_init = {'size': 0, 'uids': [], 'done': []}

    component._configure()
def __init__(self):
    '''
    initialize the service endpoint:

      - create logger, reporter and profiler
      - set up accounts
      - report the header line with the `rn` version
    '''

    ident = 'radical.nge.service'

    self._log = ru.Logger(ident)
    self._rep = ru.Reporter(ident)
    self._prof = ru.Profiler(ident)

    # NOTE(review): account names and their second `_Account` argument
    #               (presumably a password -- confirm) are hard-coded in
    #               the source.  Consider loading them from configuration.
    credentials = (('andre',  'erdna'),
                   ('matteo', 'eottam'),
                   ('daniel', 'leinad'),
                   ('guest',  'guest'))

    self._accounts = {user: _Account(user, secret)
                      for user, secret in credentials}

    self._rep.header('--- NGE (%s) ---' % rn.version)
def __init__(self, ensemble_size, exchange_size, window_size, md_cycles,
             min_temp, max_temp, timesteps, basename, executable, cores):
    """
    Store the exchange settings, initialize the `re.AppManager` base, set
    the resource description, and create one replica per ensemble member.

    :param ensemble_size: number of replicas to create.
    :param exchange_size: stored as `self._ex_size`.
    :param window_size:   stored as `self._window_size`.
    :param md_cycles:     stored as `self._cycles`.
    :param min_temp:      stored as `self._min_temp`.
    :param max_temp:      stored as `self._max_temp`.
    :param timesteps:     stored as `self._timesteps`.
    :param basename:      stored as `self._basename`.
    :param executable:    stored, and passed to each replica as `exe`.
    :param cores:         stored, and passed to each replica as `cores`.
    """

    self._en_size = ensemble_size
    self._ex_size = exchange_size
    self._window_size = window_size
    self._cycles = md_cycles
    self._min_temp = min_temp
    self._max_temp = max_temp
    self._timesteps = timesteps
    self._basename = basename
    self._executable = executable
    self._cores = cores

    self._lock = mt.Lock()
    self._log = ru.Logger('radical.repex.exc')

    # append-mode handle which is kept on the instance; it is not closed
    # in this method
    self._dout = open('dump.log', 'a')

    re.AppManager.__init__(self, autoterminate=False, port=5672)

    self.resource_desc = {"resource": 'local.localhost',
                          "walltime": 30,
                          "cpus": 32}

    self._sbox = '$Pipeline_untarPipe_Stage_untarStg_Task_untarTsk'

    self._cnt = 0              # count exchanges
    self._replicas = []
    self._waitlist = []

    # create the required number of replicas
    for rid in range(self._en_size):
        self._replicas.append(Replica(check_ex=self._check_exchange,
                                      check_res=self._check_resume,
                                      rid=rid,
                                      sbox=self._sbox,
                                      cores=self._cores,
                                      exe=self._executable))

    self._dump(msg='startup')
def test_construct_command(mocked_init, mocked_configure):
    """
    Run the ('lm', 'prte') test cases against a `PRTE` launch method.

    A case whose expected result is the string "RuntimeError" must raise
    `RuntimeError`; any other case must yield `[command, hop]` equal to
    the expected result.
    """

    test_cases = setUp('lm', 'prte')

    launcher = PRTE(name=None, cfg=None, session=None)
    launcher.name = 'prte'
    launcher._verbose = None
    launcher._log = ru.Logger('dummy')
    launcher.launch_command = 'prun'

    for unit, expected in test_cases:

        must_raise = (expected == "RuntimeError")

        if must_raise:
            with pytest.raises(RuntimeError):
                command, hop = launcher.construct_command(unit, None)

        if not must_raise:
            command, hop = launcher.construct_command(unit, None)
            assert [command, hop] == expected, unit['uid']
def test_orte_nompi_construct(self, mocked_init, mocked_raise_on):
    """
    Build an `ORTE` launch command for `self._cu` and compare it with the
    expected `orterun` command line, which exports those of
    LD_LIBRARY_PATH, PATH and PYTHONPATH that are set in the environment.
    """

    launch_method = ORTE(cfg={'Testing'}, session=self._session)
    launch_method.launch_command = 'orterun'
    launch_method._log = ru.Logger('dummy')

    orte_cmd, _ = launch_method.construct_command(self._cu,
                                                  launch_script_hop=1)

    # NOTE(review): the environment is changed only *after* the command
    #               was constructed, and it is not restored afterwards --
    #               confirm that this is intended.
    os.environ['LD_LIBRARY_PATH'] = ''
    os.environ['PYTHONPATH'] = ''

    exported = []
    for var in ('LD_LIBRARY_PATH', 'PATH', 'PYTHONPATH'):
        if var in os.environ:
            exported.append('-x "%s"' % var)
    env_string = ' '.join(exported)

    expected = 'orterun --hnp "test" -np 1 --bind-to none -host 1 %s test_exe' \
               % env_string

    self.assertTrue(orte_cmd == expected)
def __init__(self, cfg_path):
    """
    Create uid and logger, reset all settings to `None`, then load and
    parse the configuration file and set up the messaging system.

    :param cfg_path: path of the configuration file.  It is opened for
                     reading and handed to `load()`.
    """

    self._uid = ru.generate_id('wlms', mode=ru.ID_UNIQUE)
    self._logger = ru.Logger('radical.wlms.%s' % self._uid)

    # all of these start out unset
    for attr in ('_ts_criteria', '_rs_criteria', '_sb_criteria',
                 '_tb_criteria', '_host', '_port', '_workload',
                 '_resource', '_early_binding'):
        setattr(self, attr, None)

    # NOTE(review): `load` is imported outside of this block.  If it is
    #               `yaml.load`, consider `yaml.safe_load` -- confirm.
    with open(cfg_path, 'r') as stream:
        cfg = load(stream)

    self._parse_cfg(cfg)
    self._setup_msg_sys()
def __init__(self, sid=None):
    """
    Create the workflow bookkeeping, the enactor uid, logger and profiler.

    :param sid: optional session id.  It is used as namespace (`ns`) for
                the generated uid, and as name of the sub-directory of
                the current working directory into which logger and
                profiler write.  If `sid` is `None`, the current working
                directory itself is used.
    """

    # A list of workflow IDs
    # NOTE(review): the attribute name is spelled `_worflows`; it is kept
    #               since code outside of this block may refer to it.
    self._worflows = list()

    # This will be a hash table of workflows.  The table will include the
    # following:
    #
    #   'workflowsID': {'state'     : The state of the workflow based on
    #                                 the WFM,
    #                   'endpoint'  : Process ID or object to WMF for the
    #                                 specific workflow,
    #                   'start_time': Epoch of when the workflow is
    #                                 submitted to the WMF,
    #                   'end_time'  : Epoch of when the workflow finished.}
    self._execution_status = dict()

    self._uid = ru.generate_id('enactor.%(counter)04d',
                               mode=ru.ID_CUSTOM, ns=sid)

    # `sid` defaults to `None`, which cannot be concatenated to a string:
    # fall back to the current working directory in that case.
    path = os.getcwd()
    if sid is not None:
        path = path + '/' + sid

    name = self._uid

    self._logger = ru.Logger(name=name, path=path, level='DEBUG')
    self._prof = ru.Profiler(name=name, path=path)
def __init__(self, url, log=None, rep=None, prof=None):
    """
    Set up logger, reporter and profiler, parse the service URL, and log
    in if the URL carries credentials.

    :param url:  service URL.  If it contains both username and password,
                 `self.login()` is called with them.
    :param log:  logger to use; defaults to a new 'radical.nge' logger.
    :param rep:  reporter to use; defaults to a new 'radical.nge'
                 reporter.
    :param prof: profiler to use; defaults to a new 'radical.nge'
                 profiler.
    """

    self._log = log if log else ru.Logger('radical.nge')

    # use the reporter which was passed (not the logger)
    self._rep = rep if rep else ru.Reporter('radical.nge')

    self._prof = prof if prof else ru.Profiler('radical.nge')

    self._cookies = list()
    self._url = ru.Url(url)

    # the query base is the URL as string, without trailing slashes
    self._qbase = ru.Url(url)
  # self._qbase.username = None
  # self._qbase.password = None
    self._qbase = str(self._qbase).rstrip('/')

    if self._url.username and self._url.password:
        self.login(self._url.username, self._url.password)
def test_try_allocation(self, mocked_init, mocked_schedule_unit,
                        mocked_handle_cuda, mocked_change_slot_states):
    """
    Call `_try_allocation()` for a unit on a hand-populated
    `AgentSchedulingComponent`, and compare the unit's 'slots' entry with
    the expected slot description.
    """

    component = AgentSchedulingComponent()
    component._log = ru.Logger('dummy')

    # the mock yields `None` on the first call, a slot dict on the second
    component._allocate_slot = mock.Mock(
        side_effect=[None, {'slot': 'test_slot'}])

    component._prof = mock.Mock()
    component._prof.prof = mock.Mock(return_value=True)

    component._wait_pool = []
    component._wait_lock = threading.RLock()
    component._slot_lock = threading.RLock()

    unit = {'description': {'note': 'this is a unit'},
            'uid': 'test'}

    component._try_allocation(unit=unit)

    expected_slots = {
        "cores_per_node": 16,
        "lfs_per_node": {"size": 0, "path": "/dev/null"},
        "nodes": [{
            "lfs": {"path": "/dev/null", "size": 0},
            "core_map": [[0]],
            "name": "a",
            "gpu_map": None,
            "uid": 1,
            "mem": None,
        }],
        "lm_info": "INFO",
        "gpus_per_node": 6,
    }

    self.assertEqual(unit['slots'], expected_slots)
def master(obj, obj_type, new_state):
    """
    Drive a state-sync round trip over RabbitMQ.

    Two queues are declared, and `func` is started in a thread with the
    first queue, a logger and a profiler.  The first queue is then polled
    until a message arrives; the message must carry `new_state` as object
    state.  An 'ack' with the same correlation id is published to the
    second queue, and the message is acknowledged.  Finally both queues
    are deleted, the connection is closed and the thread is joined.

    Host and port of the broker are read from the environment variables
    RMQ_HOSTNAME (default: 'localhost') and RMQ_PORT (default: 5672).
    """

    host = os.environ.get('RMQ_HOSTNAME', 'localhost')
    port = int(os.environ.get('RMQ_PORT', 5672))

    params = pika.ConnectionParameters(host=host, port=port)
    mq_connection = pika.BlockingConnection(params)
    mq_channel = mq_connection.channel()

    queue1 = 'test-1-2-3'     # Expected queue name structure 'X-A-B-C'
    queue2 = 'test-3-2-1'     # Expected queue name structure 'X-C-B-A'

    for queue in (queue1, queue2):
        mq_channel.queue_declare(queue=queue)

    logger = ru.Logger('radical.entk.test')
    profiler = ru.Profiler('radical.entk.test')

    worker = Thread(target=func,
                    args=(obj, obj_type, new_state, queue1, logger,
                          profiler))
    worker.start()

    # poll the first queue until a message body arrives
    while True:

        method_frame, props, body = mq_channel.basic_get(queue=queue1)

        if not body:
            continue

        msg = json.loads(body)
        assert msg['object']['state'] == new_state

        reply_props = pika.BasicProperties(
            correlation_id=props.correlation_id)
        mq_channel.basic_publish(exchange='',
                                 routing_key=queue2,
                                 properties=reply_props,
                                 body='ack')
        mq_channel.basic_ack(delivery_tag=method_frame.delivery_tag)
        break

    # clean up
    mq_channel.queue_delete(queue=queue1)
    mq_channel.queue_delete(queue=queue2)
    mq_connection.close()

    worker.join()
def test_construct_command(mocked_init, mocked_configure, mocked_raise_on):
    """
    Run the ('lm', 'ibrun') test cases against an `IBRun` launch method.

    Cases with the expected result 'RuntimeError' or 'AssertionError' must
    raise the respective exception; all other cases must yield
    `[command, hop]` equal to the expected result.
    """

    launcher = IBRun(name=None, cfg=None, session=None)
    launcher._log = ru.Logger('dummy')
    launcher._cfg = {'cores_per_node': 0}
    launcher._node_list = [['node1'], ['node2']]
    launcher.name = 'IBRun'
    launcher.launch_command = 'ibrun'

    test_cases = setUp('lm', 'ibrun')

    # (expected result label, exception type), checked in this order
    failing = (('RuntimeError', RuntimeError),
               ('AssertionError', AssertionError))

    for unit, expected in test_cases:

        exc_type = None
        for label, candidate in failing:
            if expected == label:
                exc_type = candidate
                break

        if exc_type is None:
            command, hop = launcher.construct_command(unit, None)
            assert [command, hop] == expected, unit['uid']

        else:
            with pytest.raises(exc_type):
                command, hop = launcher.construct_command(unit, None)